Add: dev_tools.downtime_notice_crawler

2026-06-25 22:35:29 +08:00 · 2026-06-25 22:25:00 +08:00
parent 1b7772c372
commit b7bc1e7c06
1 changed files with 301 additions and 0 deletions
--- a/dev_tools/downtime_notice_crawler.py
+++ b/dev_tools/downtime_notice_crawler.py
@@ -0,0 +1,301 @@
+"""
+碧蓝航线停服维护公告爬取工具
+
+从B站碧蓝航线官方专栏合集抓取最新维护公告，输出停服时间段。
+
+默认项
+  --mid             233114659（碧蓝航线B站官方号UID）
+  --collection-name 《碧蓝航线》维护公告
+  --cache           公告缓存文件，默认 maintenance/_cache.jsonl，位于脚本同级目录。
+  --state           合集状态文件，默认 maintenance/state.json，位于脚本同级目录。
+
+命令
+  fetch  爬取：拉取专栏合集数据，写入缓存。
+  parse  解析：从缓存读取并解析公告，向 stdout 输出纯文本停服时间段，如：2026-06-20 14:00~18:00
+               解析失败输出空行，并返回退出码 2。
+  run    完整流程测试：拉取最新公告并解析停服维护信息，不依赖缓存。
+
+示例
+  .\\toolkit\\python.exe dev_tools/downtime_notice_crawler.py fetch --force
+  .\\toolkit\\python.exe dev_tools/downtime_notice_crawler.py parse
+  .\\toolkit\\python.exe dev_tools/downtime_notice_crawler.py run --verbose
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import re
+import sys
+from dataclasses import dataclass
+from datetime import datetime, timezone, timedelta
+from pathlib import Path
+from typing import Optional
+
+import requests
+
+# Directory that contains this script; the default cache/state paths hang off it.
+SCRIPT_DIR = Path(__file__).resolve().parent
+# Default --mid value (the module docstring describes it as the official
+# Azur Lane Bilibili account UID).
+DEFAULT_MID = 233114659
+# Default --cache path: one JSON record per line.
+DEFAULT_CACHE = SCRIPT_DIR / 'maintenance' / '_cache.jsonl'
+# Default --state path: collection counters saved by the last successful fetch.
+DEFAULT_STATE = SCRIPT_DIR / 'maintenance' / 'state.json'
+# Default --collection-name; matched by equality against the collection names
+# returned by the lists API.
+DEFAULT_COLLECTION_NAME = '《碧蓝航线》维护公告'
+
+# Endpoint listing the article collections of an account.
+ARTICLE_LISTS_API = 'https://api.bilibili.com/x/article/up/lists'
+# Endpoint returning the articles of one collection.
+ARTICLE_COLLECTION_API = 'https://api.bilibili.com/x/article/list/web/articles'
+# Fixed UTC+8 offset used for every timestamp conversion in this module.
+CN_TZ = timezone(timedelta(hours=8))
+
+# Optional run of whitespace, explicitly including the ideographic space.
+WS = r'[\s　]*'
+# Body pattern: a fixed lead-in phrase, then "<month>月<day>日 <HH>:<MM>",
+# a range separator, and "<HH>:<MM>".
+# Groups: 1=month, 2=day, 3=start hour, 4=start minute, 5=end hour, 6=end minute.
+WINDOW_RE = re.compile(
+    r'司令部将于' + WS +
+    r'(\d{1,2})月(\d{1,2})日' + WS +
+    r'(\d{1,2})[:：](\d{2})' + WS +
+    r'[~～至到—\-]+' + WS +
+    r'(\d{1,2})[:：](\d{2})',
+)
+# Body pattern for the stated duration; group 1 is the number of hours.
+DURATION_RE = re.compile(r'为期' + WS + r'(\d+)' + WS + r'个?小时')
+# Title pattern "<month>月<day>日 <HH>:<MM>", used as a fallback start time.
+# Groups: 1=month, 2=day, 3=hour, 4=minute.
+TITLE_TIME_RE = re.compile(r'(\d{1,2})月(\d{1,2})日' + WS + r'(\d{1,2})[:：](\d{2})')
+
+# Module logger with a fixed name (not derived from __name__).
+logger = logging.getLogger('bilibili_maintenance')
+
+
+class FetchError(RuntimeError):
+    """Raised when the API reports a failure or the wanted collection is not found."""
+
+
+@dataclass
+class MaintenanceInfo:
+    """Maintenance window extracted from one announcement."""
+
+    # Text of the form 'YYYY-MM-DD HH:MM~HH:MM' (start date, start time, end time).
+    window: str
+    # Duration in whole hours, either stated in the announcement or derived
+    # from the start and end times.
+    duration_hours: int
+
+
+@dataclass
+class CollectionMeta:
+    """Identity and change counters of one article collection, as returned by the lists API."""
+
+    # Value of the entry's 'id' field.
+    collection_id: int
+    # Value of the entry's 'name' field.
+    collection_name: str
+    # Value of the entry's 'articles_count' field; compared against saved state to detect changes.
+    articles_count: int
+    # Value of the entry's 'update_time' field; compared against saved state to detect changes.
+    # NOTE(review): presumably a Unix timestamp - confirm against the API.
+    update_time: int
+
+
+def configure_stdio() -> None:
+    """Re-encode stdout as UTF-8 when the stream offers ``reconfigure``.
+
+    Streams without a ``reconfigure`` attribute are left untouched.
+    """
+    try:
+        reconfigure = sys.stdout.reconfigure
+    except AttributeError:
+        return
+    reconfigure(encoding='utf-8')
+
+
+def session_for(mid: int) -> requests.Session:
+    """Create a ``requests`` session with browser-like headers.
+
+    Args:
+        mid: Account UID; it is embedded in the ``Referer`` header.
+
+    Returns:
+        A new session whose ``User-Agent`` and ``Referer`` headers are set.
+    """
+    user_agent = (
+        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
+        '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
+    )
+    session = requests.Session()
+    session.headers['User-Agent'] = user_agent
+    session.headers['Referer'] = f'https://space.bilibili.com/{mid}/article'
+    return session
+
+
+def get_json(session: requests.Session, url: str, params: dict, timeout: float) -> dict:
+    """GET ``url`` and return the decoded JSON payload of a successful API call.
+
+    Args:
+        session: Session used to issue the request.
+        url: Endpoint to query.
+        params: Query-string parameters.
+        timeout: Timeout passed through to ``session.get``.
+
+    Returns:
+        The decoded payload, a dict whose ``code`` field equals 0.
+
+    Raises:
+        requests.RequestException: Transport failure or an HTTP error status.
+        FetchError: The body is not JSON, is not a JSON object, or carries a
+            ``code`` other than 0.
+    """
+    response = session.get(url, params=params, timeout=timeout)
+    response.raise_for_status()
+    # Callers only handle FetchError / RequestException, so every "unexpected
+    # body" condition is converted to FetchError instead of leaking a bare
+    # ValueError, TypeError or KeyError.
+    try:
+        payload = response.json()
+    except ValueError as exc:
+        raise FetchError(f'response is not valid JSON: {exc}') from exc
+    if not isinstance(payload, dict):
+        raise FetchError(f'unexpected response payload: {type(payload).__name__}')
+    if payload.get('code') != 0:
+        raise FetchError(payload.get('message') or payload.get('msg') or 'unknown error')
+    return payload
+
+
+def find_collection(session: requests.Session, mid: int, name: str, timeout: float) -> tuple[CollectionMeta, dict]:
+    """Look up an article collection of account ``mid`` by its exact name.
+
+    Args:
+        session: Session used for the request.
+        mid: Account UID whose collection lists are queried.
+        name: Collection name, compared with ``==`` against each entry's ``name``.
+        timeout: Timeout passed through to the request.
+
+    Returns:
+        ``(meta, payload)``: metadata of the first matching collection and the
+        raw payload of the lists API.
+
+    Raises:
+        FetchError: No collection matches, or the API call reports a failure.
+        requests.RequestException: Propagated from the HTTP request.
+    """
+    payload = get_json(session, ARTICLE_LISTS_API, {'mid': str(mid), 'sort': 0}, timeout)
+    # 'data' or 'lists' may be missing or null in the response; treat that as
+    # "no collections" so the caller receives the FetchError below rather than
+    # a TypeError/KeyError it does not handle.
+    collections = (payload.get('data') or {}).get('lists') or []
+    for item in collections:
+        if item['name'] == name:
+            meta = CollectionMeta(
+                collection_id=int(item['id']),
+                collection_name=item['name'],
+                articles_count=int(item['articles_count']),
+                update_time=int(item['update_time']),
+            )
+            return meta, payload
+    raise FetchError(f'collection not found: mid={mid} name={name!r}')
+
+
+def normalize_article(article: dict, meta: CollectionMeta) -> dict:
+    """Convert one API article entry into the flat record stored in the cache.
+
+    Args:
+        article: Article entry; its ``id``, ``publish_time``, ``title`` and
+            ``summary`` fields are read.
+        meta: Collection the article belongs to.
+
+    Returns:
+        A dict with the keys ``pub_ts``, ``pub_time``, ``title``, ``text``,
+        ``url``, ``collection_id`` and ``collection_name``.
+    """
+    article_id = str(article['id'])
+    published_ts = int(article['publish_time'])
+    # ISO-8601 rendering of the publication time in the fixed UTC+8 zone.
+    published_iso = datetime.fromtimestamp(published_ts, tz=CN_TZ).isoformat()
+    record = {}
+    record['pub_ts'] = published_ts
+    record['pub_time'] = published_iso
+    record['title'] = article['title']
+    record['text'] = article['summary']
+    record['url'] = f'https://www.bilibili.com/read/cv{article_id}'
+    record['collection_id'] = meta.collection_id
+    record['collection_name'] = meta.collection_name
+    return record
+
+
+def load_state(path: Path) -> dict:
+    """Read the saved collection state, or return ``{}`` when the file does not exist."""
+    if path.exists():
+        raw = path.read_text(encoding='utf-8')
+        return json.loads(raw)
+    return {}
+
+
+def load_jsonl(path: Path) -> list[dict]:
+    """Read a JSON-lines file into a list of records.
+
+    Args:
+        path: File holding one JSON document per line.
+
+    Returns:
+        The decoded records in file order; ``[]`` when the file does not exist.
+        Blank lines are skipped.
+
+    Raises:
+        json.JSONDecodeError: A non-blank line is not valid JSON.
+    """
+    if not path.exists():
+        return []
+    content = path.read_text(encoding='utf-8')
+    # Split on '\n' only. str.splitlines() also breaks on U+0085, U+2028 and
+    # U+2029, which json.dumps(ensure_ascii=False) leaves unescaped inside
+    # string values, so a record containing one would be cut in half.
+    return [json.loads(line) for line in content.split('\n') if line.strip()]
+
+
+def persist_fetch(cache_path: Path, state_path: Path, records: list[dict], state: dict) -> None:
+    """Write the fetched records and the collection state to disk.
+
+    Args:
+        cache_path: Destination of the JSON-lines cache (one record per line).
+        state_path: Destination of the pretty-printed JSON state.
+        records: Records to store in the cache.
+        state: State mapping to store.
+
+    Both files are first written to a sibling ``*.tmp`` file and only then
+    moved over their destination, so a failed write does not clobber an
+    existing file.
+    """
+    # Create both parent directories: --state may point somewhere other than
+    # the cache directory.
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    state_path.parent.mkdir(parents=True, exist_ok=True)
+    cache_content = ''.join(json.dumps(record, ensure_ascii=False) + '\n' for record in records)
+    state_content = json.dumps(state, ensure_ascii=False, indent=2)
+    cache_tmp = cache_path.with_suffix(cache_path.suffix + '.tmp')
+    state_tmp = state_path.with_suffix(state_path.suffix + '.tmp')
+    # Write both temporary files before replacing either destination.
+    cache_tmp.write_text(cache_content, encoding='utf-8')
+    state_tmp.write_text(state_content, encoding='utf-8')
+    cache_tmp.replace(cache_path)
+    state_tmp.replace(state_path)
+
+
+def clean_text(text: str) -> str:
+    """Drop image placeholders, turn ideographic spaces into ASCII spaces, and trim the ends."""
+    without_placeholders = text.replace('[图片]', '')
+    normalized = without_placeholders.replace('\u3000', ' ')
+    return normalized.strip()
+
+
+def infer_year(published: datetime, month: int, day: int) -> int:
+    """Guess the calendar year of a month/day mentioned in an announcement.
+
+    The date is first placed in the publication year. If that lands more than
+    60 days before the publication date, the following year is returned; if it
+    lands more than 300 days after, the previous year is returned.
+
+    Args:
+        published: Publication time of the announcement.
+        month: Month mentioned in the announcement.
+        day: Day of month mentioned in the announcement.
+
+    Returns:
+        The inferred year.
+
+    Raises:
+        ValueError: ``month``/``day`` do not form a calendar date (other than
+            Feb 29, which is handled as described below).
+    """
+    year = published.year
+    try:
+        target = datetime(year, month, day).date()
+    except ValueError:
+        # Feb 29 does not exist when the publication year is not a leap year,
+        # yet the announcement may refer to the next (leap) year. Use Mar 1 as
+        # a stand-in so the distance check can still choose the year. Any other
+        # invalid date keeps raising.
+        if (month, day) != (2, 29):
+            raise
+        target = datetime(year, 3, 1).date()
+    delta = (target - published.date()).days
+    if delta < -60:
+        return year + 1
+    if delta > 300:
+        return year - 1
+    return year
+
+
+def _build_window_dt(published: datetime, sm: int, sd: int, sh: int, smin: int, eh: int, emin: int) -> tuple[datetime, datetime]:
+    """Build timezone-aware start and end datetimes of a maintenance window.
+
+    Args:
+        published: Publication time, used to infer the year.
+        sm: Start month.
+        sd: Start day of month.
+        sh: Start hour.
+        smin: Start minute.
+        eh: End hour.
+        emin: End minute.
+
+    Returns:
+        ``(start, end)`` in the module's fixed UTC+8 zone. ``end`` falls on the
+        start date, or on the following day when the end time is earlier than
+        the start time.
+
+    Raises:
+        ValueError: A component is out of range for ``datetime``.
+    """
+    year = infer_year(published, sm, sd)
+    start = datetime(year, sm, sd, sh, smin, tzinfo=CN_TZ)
+    end = datetime(year, sm, sd, eh, emin, tzinfo=CN_TZ)
+    if end < start:
+        # An end time earlier than the start time means the window runs past
+        # midnight; without the rollover the derived duration would be negative.
+        end += timedelta(days=1)
+    return start, end
+
+
+def _hours_between(start: datetime, end: datetime) -> int:
+    """Return the span from ``start`` to ``end`` rounded to whole hours with ``round``."""
+    elapsed = end - start
+    return round(elapsed.total_seconds() / 3600)
+
+
+def parse_maintenance(text: str, pub_ts: int, title: str) -> Optional[MaintenanceInfo]:
+    """Extract the maintenance window from one announcement.
+
+    The body is searched for an explicit "start ~ end" window first. If none is
+    found but the body states a duration and the title carries a start time,
+    the window is synthesized from those two.
+
+    Args:
+        text: Announcement body text.
+        pub_ts: Publication time as a Unix timestamp, used to infer the year.
+        title: Announcement title, used only by the fallback.
+
+    Returns:
+        The parsed window, or ``None`` when nothing usable is found or the
+        matched digits do not form a valid date/time.
+    """
+    published = datetime.fromtimestamp(pub_ts, tz=CN_TZ)
+    body = clean_text(text)
+    window_match = WINDOW_RE.search(body)
+    duration_match = DURATION_RE.search(body)
+
+    # The patterns accept any one- or two-digit numbers (month 13, hour 25,
+    # ...), so building datetimes can raise. That is a parse failure, not a
+    # crash: report it and return None like every other failure.
+    try:
+        if window_match:
+            sm, sd, sh, smin, eh, emin = (int(part) for part in window_match.groups())
+            start, end = _build_window_dt(published, sm, sd, sh, smin, eh, emin)
+            # A duration stated in the text wins over the computed one.
+            hours = int(duration_match.group(1)) if duration_match else _hours_between(start, end)
+            return MaintenanceInfo(window=f'{start:%Y-%m-%d} {start:%H:%M}~{end:%H:%M}', duration_hours=hours)
+
+        title_match = TITLE_TIME_RE.search(title) if duration_match else None
+        if title_match:
+            sm, sd, sh, smin = (int(part) for part in title_match.groups())
+            hours = int(duration_match.group(1))
+            # Only the start is needed here; the end comes from the stated duration.
+            start, _ = _build_window_dt(published, sm, sd, sh, smin, sh, smin)
+            end = start + timedelta(hours=hours)
+            logger.warning('WINDOW_RE not matched; synthesized window from TITLE_TIME_RE and DURATION_RE (%02d-%02d %02d:%02d, duration %d h).', sm, sd, sh, smin, hours)
+            return MaintenanceInfo(window=f'{start:%Y-%m-%d} {start:%H:%M}~{end:%H:%M}', duration_hours=hours)
+    except (ValueError, OverflowError) as exc:
+        logger.warning('Invalid date/time in maintenance announcement: %s', exc)
+        return None
+
+    logger.warning('Failed to parse maintenance window from article text.')
+    return None
+
+
+def latest_record(records: list[dict]) -> dict:
+    """Return the record with the largest ``pub_ts`` (the first one on ties)."""
+    def published_at(record: dict) -> int:
+        return record['pub_ts']
+
+    return max(records, key=published_at)
+
+
+def parse_records(records: list[dict]) -> Optional[MaintenanceInfo]:
+    """Parse the most recently published record; return ``None`` for an empty list."""
+    if records:
+        newest = latest_record(records)
+        return parse_maintenance(newest['text'], newest['pub_ts'], newest['title'])
+    return None
+
+
+def fetch_articles(cache_path: Path, state_path: Path, mid: int, collection_name: str, timeout: float, force: bool, verbose: bool) -> tuple[list[dict], bool]:
+    """Fetch the articles of a collection, reusing the cache when nothing changed.
+
+    Args:
+        cache_path: JSON-lines cache file.
+        state_path: JSON file holding the counters of the last fetch.
+        mid: Account UID that owns the collection.
+        collection_name: Exact name of the collection.
+        timeout: Timeout passed through to each request.
+        force: Refetch even when the saved state matches.
+        verbose: Also log the raw lists-API payload at INFO level.
+
+    Returns:
+        ``(records, refreshed)``. ``refreshed`` is ``False`` when the cached
+        records were returned and ``True`` when they were downloaded and saved.
+
+    Raises:
+        FetchError: The API reports a failure, the collection is missing, or
+            the response carries no article list.
+        requests.RequestException: Propagated from the HTTP requests.
+    """
+    state = load_state(state_path)
+    session = session_for(mid)
+    meta, lists_payload = find_collection(session, mid, collection_name, timeout)
+
+    if verbose:
+        logger.info('%s', json.dumps(lists_payload, ensure_ascii=False, indent=2))
+
+    unchanged = state.get('articles_count') == meta.articles_count and state.get('update_time') == meta.update_time
+    # Trust the saved state only while the cache it describes is still on disk.
+    # Otherwise a deleted cache would yield an empty result on every run until
+    # the collection itself changes.
+    if not force and unchanged and cache_path.exists():
+        logger.info('skip: count=%s update_time=%s', meta.articles_count, meta.update_time)
+        return load_jsonl(cache_path), False
+
+    payload = get_json(session, ARTICLE_COLLECTION_API, {'id': str(meta.collection_id)}, timeout)
+    articles = (payload.get('data') or {}).get('articles')
+    if articles is None:
+        # Fail before touching the files so an existing cache stays intact.
+        raise FetchError(f'no articles in response for collection id={meta.collection_id}')
+    records = [normalize_article(article, meta) for article in articles]
+    new_state = {
+        'last_run_at': datetime.now(tz=CN_TZ).isoformat(timespec='seconds'),
+        'collection_id': meta.collection_id,
+        'articles_count': meta.articles_count,
+        'update_time': meta.update_time,
+    }
+    persist_fetch(cache_path, state_path, records, new_state)
+    logger.info('updated: count=%s update_time=%s records=%s', meta.articles_count, meta.update_time, len(records))
+    return records, True
+
+
+def build_parser() -> argparse.ArgumentParser:
+    """Build the command-line parser.
+
+    Returns:
+        A parser with one positional ``command`` (``fetch``, ``parse`` or
+        ``run``) and the options ``--cache``, ``--state``, ``--mid``,
+        ``--collection-name``, ``--timeout``, ``--force`` and ``--verbose``.
+    """
+    # Take the program name from the file itself so the usage line cannot drift
+    # from the real script name.
+    parser = argparse.ArgumentParser(prog=Path(__file__).name, description='碧蓝航线停服维护公告爬取工具')
+    parser.add_argument(
+        'command',
+        choices=['fetch', 'parse', 'run'],
+        help='fetch: 拉取最新停服维护公告 | parse: 基于缓存解析停服时间 | run: 拉取并解析最新公告',
+    )
+    parser.add_argument('--cache', type=Path, default=DEFAULT_CACHE, help=f'公告缓存文件路径（默认 {DEFAULT_CACHE}）')
+    parser.add_argument('--state', type=Path, default=DEFAULT_STATE, help=f'合集状态文件路径（默认 {DEFAULT_STATE}）')
+    parser.add_argument('--mid', type=int, default=DEFAULT_MID, help=f'B站UP主UID（默认 {DEFAULT_MID}）')
+    parser.add_argument(
+        '--collection-name',
+        default=DEFAULT_COLLECTION_NAME,
+        help=f'专栏合集名称，须与B站合集页标题一致（默认 {DEFAULT_COLLECTION_NAME!r}）',
+    )
+    parser.add_argument('--timeout', type=float, default=30.0, help='网络请求超时秒数（默认 30）')
+    parser.add_argument('--force', action='store_true', help='强制重新拉取并刷新缓存数据')
+    parser.add_argument('--verbose', action='store_true', help='启用调试日志，输出 API 原始响应')
+    return parser
+
+
+def cmd_fetch(args: argparse.Namespace) -> int:
+    """Run the ``fetch`` command.
+
+    Returns:
+        0 on success, 1 when fetching fails.
+    """
+    try:
+        fetch_articles(
+            cache_path=args.cache,
+            state_path=args.state,
+            mid=args.mid,
+            collection_name=args.collection_name,
+            timeout=args.timeout,
+            force=args.force,
+            verbose=args.verbose,
+        )
+    except (FetchError, requests.RequestException) as error:
+        logger.error('fetch failed: %s', error)
+        return 1
+    else:
+        return 0
+
+
+def cmd_parse(args: argparse.Namespace) -> int:
+    """Run the ``parse`` command on the cached records.
+
+    Prints the maintenance window to stdout. When no window can be obtained,
+    prints an empty line instead.
+
+    Returns:
+        0 when a window was printed, 2 otherwise.
+    """
+    try:
+        info = parse_records(load_jsonl(args.cache))
+    except (OSError, ValueError, KeyError, TypeError) as exc:
+        # An unreadable, corrupt or malformed cache is a parse failure like any
+        # other: keep the "empty line + exit code 2" contract instead of
+        # ending in a traceback. json.JSONDecodeError is a ValueError.
+        logger.error('cache unusable: %s', exc)
+        info = None
+    if info:
+        print(info.window)
+        return 0
+    print()
+    return 2
+
+
+def cmd_run(args: argparse.Namespace) -> int:
+    """Run the ``run`` command: force a fetch, then parse the fetched records.
+
+    Prints the maintenance window to stdout, or an empty line when parsing
+    yields nothing.
+
+    Returns:
+        0 when a window was printed, 1 when fetching fails, 2 when parsing
+        yields nothing.
+    """
+    try:
+        fetched = fetch_articles(
+            args.cache,
+            args.state,
+            args.mid,
+            args.collection_name,
+            args.timeout,
+            force=True,
+            verbose=args.verbose,
+        )
+    except (FetchError, requests.RequestException) as error:
+        logger.error('fetch failed: %s', error)
+        return 1
+    records = fetched[0]
+    info = parse_records(records)
+    if not info:
+        print()
+        return 2
+    print(info.window)
+    return 0
+
+
+def main() -> int:
+    """Parse the command line and run the selected command.
+
+    Returns:
+        The exit code of the command, or 1 for a command without a handler.
+    """
+    configure_stdio()
+    logging.basicConfig(level=logging.WARNING, format='%(levelname)s: %(message)s')
+    args = build_parser().parse_args()
+    if args.verbose:
+        # --verbose lowers the root logger threshold to INFO.
+        logging.getLogger().setLevel(logging.INFO)
+    handlers = {
+        'fetch': cmd_fetch,
+        'parse': cmd_parse,
+        'run': cmd_run,
+    }
+    handler = handlers.get(args.command)
+    if handler is None:
+        return 1
+    return handler(args)
+
+
+# Script entry point: the return value of main() becomes the process exit code.
+if __name__ == '__main__':
+    raise SystemExit(main())