From b7bc1e7c064a9d2d1095564bdcd7e5dc87eadde6 Mon Sep 17 00:00:00 2001 From: positnuec <93694981+positnuec@users.noreply.github.com> Date: Thu, 25 Jun 2026 22:25:00 +0800 Subject: [PATCH] Add: dev_tools.downtime_notice_crawler --- dev_tools/downtime_notice_crawler.py | 301 +++++++++++++++++++++++++++ 1 file changed, 301 insertions(+) create mode 100644 dev_tools/downtime_notice_crawler.py diff --git a/dev_tools/downtime_notice_crawler.py b/dev_tools/downtime_notice_crawler.py new file mode 100644 index 000000000..ca9dfbd54 --- /dev/null +++ b/dev_tools/downtime_notice_crawler.py @@ -0,0 +1,301 @@ +""" +碧蓝航线停服维护公告爬取工具 + +从B站碧蓝航线官方专栏合集抓取最新维护公告,输出停服时间段。 + +默认项 + --mid 233114659(碧蓝航线B站官方号UID) + --collection-name 《碧蓝航线》维护公告 + --cache 公告缓存文件,默认 maintenance/_cache.jsonl,位于脚本同级目录。 + --state 合集状态文件,默认 maintenance/state.json,位于脚本同级目录。 + +命令 + fetch 爬取:拉取专栏合集数据,写入缓存。 + parse 解析:从缓存读取并解析公告,向 stdout 输出纯文本停服时间段,如:2026-06-20 14:00~18:00 + 解析失败输出空行,并返回退出码 2。 + run 完整流程测试:拉取最新公告并解析停服维护信息,不依赖缓存。 + +示例 + .\\toolkit\\python.exe dev_tools/bilibili_maintenance.py fetch --force + .\\toolkit\\python.exe dev_tools/bilibili_maintenance.py parse + .\\toolkit\\python.exe dev_tools/bilibili_maintenance.py run --verbose +""" +from __future__ import annotations + +import argparse +import json +import logging +import re +import sys +from dataclasses import dataclass +from datetime import datetime, timezone, timedelta +from pathlib import Path +from typing import Optional + +import requests + +SCRIPT_DIR = Path(__file__).resolve().parent +DEFAULT_MID = 233114659 +DEFAULT_CACHE = SCRIPT_DIR / 'maintenance' / '_cache.jsonl' +DEFAULT_STATE = SCRIPT_DIR / 'maintenance' / 'state.json' +DEFAULT_COLLECTION_NAME = '《碧蓝航线》维护公告' + +ARTICLE_LISTS_API = 'https://api.bilibili.com/x/article/up/lists' +ARTICLE_COLLECTION_API = 'https://api.bilibili.com/x/article/list/web/articles' +CN_TZ = timezone(timedelta(hours=8)) + +WS = r'[\s ]*' +WINDOW_RE = re.compile( + r'司令部将于' + WS + + r'(\d{1,2})月(\d{1,2})日' + WS + + r'(\d{1,2})[::](\d{2})' + WS + + r'[~~至到—\-]+' + WS + + r'(\d{1,2})[::](\d{2})', +) +DURATION_RE = re.compile(r'为期' + WS + r'(\d+)' + WS + r'个?小时') +TITLE_TIME_RE = re.compile(r'(\d{1,2})月(\d{1,2})日' + WS + r'(\d{1,2})[::](\d{2})') + +logger = logging.getLogger('bilibili_maintenance') + + +class FetchError(RuntimeError): + pass + + +@dataclass +class MaintenanceInfo: + window: str + duration_hours: int + + +@dataclass +class CollectionMeta: + collection_id: int + collection_name: str + articles_count: int + update_time: int + + +def configure_stdio() -> None: + if hasattr(sys.stdout, 'reconfigure'): + sys.stdout.reconfigure(encoding='utf-8') + + +def session_for(mid: int) -> requests.Session: + s = requests.Session() + s.headers.update({ + 'User-Agent': ( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36' + ), + 'Referer': f'https://space.bilibili.com/{mid}/article', + }) + return s + + +def get_json(session: requests.Session, url: str, params: dict, timeout: float) -> dict: + response = session.get(url, params=params, timeout=timeout) + response.raise_for_status() + payload = response.json() + if payload['code'] != 0: + raise FetchError(payload.get('message') or payload.get('msg') or 'unknown error') + return payload + + +def find_collection(session: requests.Session, mid: int, name: str, timeout: float) -> tuple[CollectionMeta, dict]: + payload = get_json(session, ARTICLE_LISTS_API, {'mid': str(mid), 'sort': 0}, timeout) + for item in payload['data']['lists']: + if item['name'] == name: + return CollectionMeta(collection_id=int(item['id']), collection_name=item['name'], articles_count=int(item['articles_count']), update_time=int(item['update_time'])), payload + raise FetchError(f'collection not found: mid={mid} name={name!r}') + + +def normalize_article(article: dict, meta: CollectionMeta) -> dict: + cvid = str(article['id']) + pub_ts = int(article['publish_time']) + return { + 'pub_ts': pub_ts, + 'pub_time': datetime.fromtimestamp(pub_ts, tz=CN_TZ).isoformat(), + 'title': article['title'], + 'text': article['summary'], + 'url': f'https://www.bilibili.com/read/cv{cvid}', + 'collection_id': meta.collection_id, + 'collection_name': meta.collection_name, + } + + +def load_state(path: Path) -> dict: + if not path.exists(): + return {} + return json.loads(path.read_text(encoding='utf-8')) + + +def load_jsonl(path: Path) -> list[dict]: + if not path.exists(): + return [] + return [json.loads(line) for line in path.read_text(encoding='utf-8').splitlines() if line.strip()] + + +def persist_fetch(cache_path: Path, state_path: Path, records: list[dict], state: dict) -> None: + cache_path.parent.mkdir(parents=True, exist_ok=True) + cache_content = ''.join(json.dumps(record, ensure_ascii=False) + '\n' for record in records) + state_content = json.dumps(state, ensure_ascii=False, indent=2) + cache_tmp = cache_path.with_suffix(cache_path.suffix + '.tmp') + state_tmp = state_path.with_suffix(state_path.suffix + '.tmp') + cache_tmp.write_text(cache_content, encoding='utf-8') + state_tmp.write_text(state_content, encoding='utf-8') + cache_tmp.replace(cache_path) + state_tmp.replace(state_path) + + +def clean_text(text: str) -> str: + return text.replace('[图片]', '').replace('\u3000', ' ').strip() + + +def infer_year(published: datetime, month: int, day: int) -> int: + year = published.year + delta = (datetime(year, month, day, tzinfo=CN_TZ).date() - published.date()).days + if delta < -60: + year += 1 + elif delta > 300: + year -= 1 + return year + + +def _build_window_dt(published: datetime, sm: int, sd: int, sh: int, smin: int, eh: int, emin: int) -> tuple[datetime, datetime]: + year = infer_year(published, sm, sd) + start = datetime(year, sm, sd, sh, smin, tzinfo=CN_TZ) + end = datetime(year, sm, sd, eh, emin, tzinfo=CN_TZ) + return start, end + + +def _hours_between(start: datetime, end: datetime) -> int: + return round((end - start).total_seconds() / 3600) + + +def parse_maintenance(text: str, pub_ts: int, title: str) -> Optional[MaintenanceInfo]: + published = datetime.fromtimestamp(pub_ts, tz=CN_TZ) + body = clean_text(text) + window_match = WINDOW_RE.search(body) + duration_match = DURATION_RE.search(body) + + if window_match: + sm = int(window_match.group(1)) + sd = int(window_match.group(2)) + sh = int(window_match.group(3)) + smin = int(window_match.group(4)) + eh = int(window_match.group(5)) + emin = int(window_match.group(6)) + start, end = _build_window_dt(published, sm, sd, sh, smin, eh, emin) + hours = int(duration_match.group(1)) if duration_match else _hours_between(start, end) + return MaintenanceInfo(window=f'{start:%Y-%m-%d} {start:%H:%M}~{end:%H:%M}', duration_hours=hours) + + if duration_match: + title_match = TITLE_TIME_RE.search(title) + if title_match: + sm = int(title_match.group(1)) + sd = int(title_match.group(2)) + sh = int(title_match.group(3)) + smin = int(title_match.group(4)) + hours = int(duration_match.group(1)) + start, _ = _build_window_dt(published, sm, sd, sh, smin, sh, smin) + end = start + timedelta(hours=hours) + logger.warning('WINDOW_RE not matched; synthesized window from TITLE_TIME_RE and DURATION_RE (%02d-%02d %02d:%02d, duration %d h).', sm, sd, sh, smin, hours) + return MaintenanceInfo(window=f'{start:%Y-%m-%d} {start:%H:%M}~{end:%H:%M}', duration_hours=hours) + + logger.warning('Failed to parse maintenance window from article text.') + return None + + +def latest_record(records: list[dict]) -> dict: + return max(records, key=lambda r: r['pub_ts']) + + +def parse_records(records: list[dict]) -> Optional[MaintenanceInfo]: + if not records: + return None + record = latest_record(records) + return parse_maintenance(record['text'], record['pub_ts'], record['title']) + + +def fetch_articles(cache_path: Path, state_path: Path, mid: int, collection_name: str, timeout: float, force: bool, verbose: bool) -> tuple[list[dict], bool]: + state = load_state(state_path) + session = session_for(mid) + meta, lists_payload = find_collection(session, mid, collection_name, timeout) + + if verbose: + logger.info('%s', json.dumps(lists_payload, ensure_ascii=False, indent=2)) + + if not force and state.get('articles_count') == meta.articles_count and state.get('update_time') == meta.update_time: + logger.info('skip: count=%s update_time=%s', meta.articles_count, meta.update_time) + return load_jsonl(cache_path), False + + payload = get_json(session, ARTICLE_COLLECTION_API, {'id': str(meta.collection_id)}, timeout) + records = [normalize_article(article, meta) for article in payload['data']['articles']] + persist_fetch(cache_path, state_path, records, {'last_run_at': datetime.now(tz=CN_TZ).isoformat(timespec='seconds'), 'collection_id': meta.collection_id, 'articles_count': meta.articles_count, 'update_time': meta.update_time}) + logger.info('updated: count=%s update_time=%s records=%s', meta.articles_count, meta.update_time, len(records)) + return records, True + + +def build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(prog='bilibili_maintenance.py', description='碧蓝航线停服维护公告爬取工具') + parser.add_argument('command', choices=['fetch', 'parse', 'run'], help='fetch: 拉取最新停服维护公告 | parse: 基于缓存解析停服时间 | run: 拉取并解析最新公告') + parser.add_argument('--cache', type=Path, default=DEFAULT_CACHE, help=f'公告缓存文件路径(默认 {DEFAULT_CACHE})') + parser.add_argument('--state', type=Path, default=DEFAULT_STATE, help=f'合集状态文件路径(默认 {DEFAULT_STATE})') + parser.add_argument('--mid', type=int, default=DEFAULT_MID, help=f'B站UP主UID(默认 {DEFAULT_MID})') + parser.add_argument('--collection-name', default=DEFAULT_COLLECTION_NAME, help=f'专栏合集名称,须与B站合集页标题一致(默认 {DEFAULT_COLLECTION_NAME!r})') + parser.add_argument('--timeout', type=float, default=30.0, help='网络请求超时秒数(默认 30)') + parser.add_argument('--force', action='store_true', help='强制重新拉取并刷新缓存数据') + parser.add_argument('--verbose', action='store_true', help='启用调试日志,输出 API 原始响应') + return parser + + +def cmd_fetch(args: argparse.Namespace) -> int: + try: + fetch_articles(args.cache, args.state, args.mid, args.collection_name, args.timeout, args.force, args.verbose) + except (FetchError, requests.RequestException) as exc: + logger.error('fetch failed: %s', exc) + return 1 + return 0 + + +def cmd_parse(args: argparse.Namespace) -> int: + info = parse_records(load_jsonl(args.cache)) + if info: + print(info.window) + return 0 + print() + return 2 + + +def cmd_run(args: argparse.Namespace) -> int: + try: + records, _ = fetch_articles(args.cache, args.state, args.mid, args.collection_name, args.timeout, force=True, verbose=args.verbose) + except (FetchError, requests.RequestException) as exc: + logger.error('fetch failed: %s', exc) + return 1 + info = parse_records(records) + if info: + print(info.window) + return 0 + print() + return 2 + + +def main() -> int: + configure_stdio() + logging.basicConfig(level=logging.WARNING, format='%(levelname)s: %(message)s') + args = build_parser().parse_args() + if args.verbose: + logging.getLogger().setLevel(logging.INFO) + if args.command == 'fetch': + return cmd_fetch(args) + if args.command == 'parse': + return cmd_parse(args) + if args.command == 'run': + return cmd_run(args) + return 1 + + +if __name__ == '__main__': + raise SystemExit(main())