1
0
mirror of https://github.com/sui-feng-cb/AzurLaneAutoScript1.git synced 2026-06-25 22:35:29 +08:00

Add: dev_tools.downtime_notice_crawler

This commit is contained in:
positnuec
2026-06-25 22:25:00 +08:00
parent 1b7772c372
commit b7bc1e7c06

View File

@@ -0,0 +1,301 @@
"""
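Azur Lane maintenance (server downtime) announcement crawler.

Fetches the latest maintenance announcement from the official Azur Lane
article collection on Bilibili and outputs the downtime window.

Defaults
    --mid              233114659 (UID of the official Azur Lane Bilibili account)
    --collection-name  《碧蓝航线》维护公告
    --cache            Announcement cache file; defaults to
                       maintenance/_cache.jsonl next to this script.
    --state            Collection state file; defaults to
                       maintenance/state.json next to this script.

Commands
    fetch   Crawl: pull the article collection data and write it to the cache.
    parse   Parse: read the announcements from the cache and print the downtime
            window to stdout as plain text (e.g. 2026-06-20 14:00~18:00).
            On parse failure, print an empty line and exit with code 2.
    run     End-to-end test: pull the latest announcements and parse the
            maintenance information without relying on the cache.

Examples
    .\\toolkit\\python.exe dev_tools/bilibili_maintenance.py fetch --force
    .\\toolkit\\python.exe dev_tools/bilibili_maintenance.py parse
    .\\toolkit\\python.exe dev_tools/bilibili_maintenance.py run --verbose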
"""
from __future__ import annotations
import argparse
import json
import logging
import re
import sys
from dataclasses import dataclass
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import Optional
import requests
# Directory containing this script; the default cache/state files live below it.
SCRIPT_DIR = Path(__file__).resolve().parent

# Defaults for the command-line options.
DEFAULT_MID = 233114659
DEFAULT_CACHE = SCRIPT_DIR / 'maintenance' / '_cache.jsonl'
DEFAULT_STATE = SCRIPT_DIR / 'maintenance' / 'state.json'
DEFAULT_COLLECTION_NAME = '《碧蓝航线》维护公告'

# Bilibili endpoints: the first is queried for an account's article
# collections, the second for the articles of one collection.
ARTICLE_LISTS_API = 'https://api.bilibili.com/x/article/up/lists'
ARTICLE_COLLECTION_API = 'https://api.bilibili.com/x/article/list/web/articles'

# Announcement dates/times are interpreted as UTC+8.
CN_TZ = timezone(timedelta(hours=8))

# Optional whitespace between the tokens of a pattern.
WS = r'[\s ]*'
# Hour/minute separator: ASCII colon or full-width colon (U+FF1A).
_COLON = r'[:\uFF1A]'
# Range separator between start and end time: ASCII tilde, full-width tilde
# (U+FF5E), wave dash (U+301C), "至", "到", em dash, en dash (U+2013) or hyphen.
_RANGE_SEP = r'[~\uFF5E\u301C至到—\u2013\-]+'

# "司令部将于 M月D日 HH:MM ~ HH:MM" -> groups: month, day, start hour,
# start minute, end hour, end minute.
WINDOW_RE = re.compile(
    r'司令部将于' + WS
    + r'(\d{1,2})月(\d{1,2})日' + WS
    + r'(\d{1,2})' + _COLON + r'(\d{2})' + WS
    + _RANGE_SEP + WS
    + r'(\d{1,2})' + _COLON + r'(\d{2})',
)
# "为期 N 个小时" / "为期N小时" -> group 1: duration in hours.
DURATION_RE = re.compile(r'为期' + WS + r'(\d+)' + WS + r'个?小时')
# "M月D日 HH:MM" -> groups: month, day, hour, minute.
TITLE_TIME_RE = re.compile(
    r'(\d{1,2})月(\d{1,2})日' + WS + r'(\d{1,2})' + _COLON + r'(\d{2})'
)

logger = logging.getLogger('bilibili_maintenance')
class FetchError(RuntimeError):
    """Raised when the API reports a failure or the requested collection is not found."""
@dataclass
class MaintenanceInfo:
    """Downtime parsed from a single maintenance announcement."""

    # Downtime window rendered as 'YYYY-MM-DD HH:MM~HH:MM'.
    window: str
    # Length of the downtime in whole hours.
    duration_hours: int
@dataclass
class CollectionMeta:
    """Identity and change markers of one article collection."""

    # Collection identifier, sent back to the API to list its articles.
    collection_id: int
    # Collection title; it is matched against the requested name.
    collection_name: str
    # Article count reported by the API; used to detect changes between runs.
    articles_count: int
    # Update marker reported by the API (presumably a Unix timestamp; the
    # code only compares it for equality).
    update_time: int
def configure_stdio() -> None:
    """Switch ``sys.stdout`` to UTF-8 when the stream can be reconfigured."""
    reconfigure = getattr(sys.stdout, 'reconfigure', None)
    if reconfigure is None:
        # Replacement streams may not offer reconfigure(); leave them untouched.
        return
    reconfigure(encoding='utf-8')
def session_for(mid: int) -> requests.Session:
    """Create an HTTP session that presents itself as a desktop browser.

    Args:
        mid: Bilibili account UID; used to build the ``Referer`` header.

    Returns:
        A new ``requests.Session`` with ``User-Agent`` and ``Referer`` set.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'
    )
    referer = f'https://space.bilibili.com/{mid}/article'

    session = requests.Session()
    session.headers['User-Agent'] = user_agent
    session.headers['Referer'] = referer
    return session
def get_json(session: requests.Session, url: str, params: dict, timeout: float) -> dict:
    """GET ``url`` and return the decoded JSON envelope.

    Args:
        session: Session used to issue the request.
        url: Endpoint to query.
        params: Query-string parameters.
        timeout: Request timeout in seconds.

    Returns:
        The decoded response object, whose ``code`` field is ``0``.

    Raises:
        requests.RequestException: On transport errors or a non-2xx status.
        FetchError: If the body is not a JSON object, or its ``code`` field is
            missing or non-zero.
    """
    response = session.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    try:
        payload = response.json()
    except ValueError as exc:
        # Depending on the requests version a decode failure may be a plain
        # ValueError, which callers do not catch; surface it as FetchError.
        raise FetchError(f'invalid JSON response from {url}: {exc}') from exc
    if not isinstance(payload, dict):
        raise FetchError(
            f'unexpected JSON response from {url}: {type(payload).__name__}'
        )
    # A missing 'code' is treated as a failure instead of raising KeyError.
    if payload.get('code') != 0:
        raise FetchError(payload.get('message') or payload.get('msg') or 'unknown error')
    return payload
def find_collection(session: requests.Session, mid: int, name: str, timeout: float) -> tuple[CollectionMeta, dict]:
    """Look up an account's article collection by its exact name.

    Args:
        session: Session used to issue the request.
        mid: Bilibili account UID whose collections are listed.
        name: Collection name to match exactly.
        timeout: Request timeout in seconds.

    Returns:
        ``(meta, payload)``: the matched collection's metadata and the raw
        response of the collection-list request.

    Raises:
        FetchError: If no collection has that name, or the matching entry
            lacks a usable ``id`` / ``articles_count`` / ``update_time``.
        requests.RequestException: On transport errors.
    """
    payload = get_json(session, ARTICLE_LISTS_API, {'mid': str(mid), 'sort': 0}, timeout)
    # 'data' or 'lists' may be null/absent; treat that as "no collections".
    data = payload.get('data') or {}
    for item in data.get('lists') or []:
        if item.get('name') != name:
            continue
        try:
            meta = CollectionMeta(
                collection_id=int(item['id']),
                collection_name=item['name'],
                articles_count=int(item['articles_count']),
                update_time=int(item['update_time']),
            )
        except (KeyError, TypeError, ValueError) as exc:
            raise FetchError(f'malformed collection entry for {name!r}: {exc!r}') from exc
        return meta, payload
    raise FetchError(f'collection not found: mid={mid} name={name!r}')
def normalize_article(article: dict, meta: CollectionMeta) -> dict:
    """Convert one API article entry into the flat record stored in the cache.

    Args:
        article: Article entry; ``id``, ``publish_time``, ``title`` and
            ``summary`` are read from it.
        meta: Metadata of the collection the article belongs to.

    Returns:
        A dict with keys ``pub_ts``, ``pub_time`` (ISO 8601 in UTC+8),
        ``title``, ``text``, ``url``, ``collection_id``, ``collection_name``.
    """
    cvid = str(article['id'])
    published_ts = int(article['publish_time'])
    published = datetime.fromtimestamp(published_ts, tz=CN_TZ)
    return dict(
        pub_ts=published_ts,
        pub_time=published.isoformat(),
        title=article['title'],
        text=article['summary'],
        url=f'https://www.bilibili.com/read/cv{cvid}',
        collection_id=meta.collection_id,
        collection_name=meta.collection_name,
    )
def load_state(path: Path) -> dict:
    """Load the persisted collection state.

    Args:
        path: Location of the JSON state file.

    Returns:
        The stored state dict. An empty dict is returned when the file is
        missing, is not valid UTF-8 JSON, or does not hold a JSON object, so
        that the caller falls back to a fresh fetch instead of crashing.
    """
    # Same logger the rest of this module uses.
    log = logging.getLogger('bilibili_maintenance')
    try:
        state = json.loads(path.read_text(encoding='utf-8'))
    except (FileNotFoundError, NotADirectoryError):
        return {}
    except ValueError as exc:
        # Covers json.JSONDecodeError and UnicodeDecodeError.
        log.warning('ignoring unreadable state file %s: %s', path, exc)
        return {}
    if not isinstance(state, dict):
        log.warning('ignoring state file %s: expected a JSON object', path)
        return {}
    return state
def load_jsonl(path: Path) -> list[dict]:
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        One decoded object per non-blank line, in file order; an empty list
        when the file does not exist.
    """
    if not path.exists():
        return []
    records = []
    for raw_line in path.read_text(encoding='utf-8').splitlines():
        if not raw_line.strip():
            # Skip blank and whitespace-only lines.
            continue
        records.append(json.loads(raw_line))
    return records
def persist_fetch(cache_path: Path, state_path: Path, records: list[dict], state: dict) -> None:
    """Write the fetched records and the collection state to disk.

    Each file is first written to a sibling ``<name>.tmp`` file and then moved
    over its target with ``Path.replace``. Both temporary files are written
    before either target is replaced.

    Args:
        cache_path: Target JSON Lines file, one record per line.
        state_path: Target JSON state file.
        records: Article records to store in the cache.
        state: State mapping to store.
    """
    # Serialize up front so a serialization error leaves the disk untouched.
    cache_content = ''.join(
        json.dumps(record, ensure_ascii=False) + '\n' for record in records
    )
    state_content = json.dumps(state, ensure_ascii=False, indent=2)

    staged = []
    for target, content in ((cache_path, cache_content), (state_path, state_content)):
        # The state file may live in a different directory than the cache,
        # so make sure each target's parent exists.
        target.parent.mkdir(parents=True, exist_ok=True)
        tmp = target.with_name(target.name + '.tmp')
        tmp.write_text(content, encoding='utf-8')
        staged.append((tmp, target))

    for tmp, target in staged:
        tmp.replace(target)
def clean_text(text: str) -> str:
    """Strip image placeholders and normalize ideographic spaces.

    Removes every ``[图片]`` marker, turns U+3000 into a regular space and
    trims leading/trailing whitespace.
    """
    substitutions = (('[图片]', ''), ('\u3000', ' '))
    cleaned = text
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)
    return cleaned.strip()
def infer_year(published: datetime, month: int, day: int) -> int:
    """Guess the year of a month/day that was written without one.

    The date is first placed in the publication year. If that lands more than
    60 days before publication the following year is used; if it lands more
    than 300 days after publication the previous year is used.

    Args:
        published: Publication time of the announcement.
        month: Month of the mentioned date.
        day: Day of the mentioned date.

    Returns:
        The inferred calendar year.
    """
    base_year = published.year
    same_year_date = datetime(base_year, month, day, tzinfo=CN_TZ).date()
    offset_days = (same_year_date - published.date()).days
    if offset_days < -60:
        return base_year + 1
    if offset_days > 300:
        return base_year - 1
    return base_year
def _build_window_dt(published: datetime, sm: int, sd: int, sh: int, smin: int, eh: int, emin: int) -> tuple[datetime, datetime]:
    """Build timezone-aware start/end datetimes for a maintenance window.

    Args:
        published: Publication time of the announcement, used to infer the year.
        sm: Start month.
        sd: Start day.
        sh: Start hour.
        smin: Start minute.
        eh: End hour.
        emin: End minute.

    Returns:
        ``(start, end)`` in UTC+8. The end is on the start date unless its
        clock time is earlier than the start's, in which case the window is
        taken to cross midnight and the end falls on the following day.

    Raises:
        ValueError: If the values do not form a valid date or time.
    """
    year = infer_year(published, sm, sd)
    start = datetime(year, sm, sd, sh, smin, tzinfo=CN_TZ)
    end = start.replace(hour=eh, minute=emin)
    if end < start:
        # e.g. 23:00~02:00: the end time belongs to the next calendar day.
        end += timedelta(days=1)
    return start, end
def _hours_between(start: datetime, end: datetime) -> int:
    """Return the span from ``start`` to ``end`` rounded to whole hours."""
    elapsed_seconds = (end - start).total_seconds()
    elapsed_hours = elapsed_seconds / 3600
    return round(elapsed_hours)
def parse_maintenance(text: str, pub_ts: int, title: str) -> Optional[MaintenanceInfo]:
    """Extract the downtime window from a maintenance announcement.

    The body is searched for an explicit window (``WINDOW_RE``). If none is
    found but a duration (``DURATION_RE``) is present, the start time is taken
    from the title (``TITLE_TIME_RE``) and the end is start + duration.

    Args:
        text: Announcement body; ``None`` is treated as empty.
        pub_ts: Publication time as a Unix timestamp, used to infer the year.
        title: Announcement title; ``None`` is treated as empty.

    Returns:
        The parsed window and duration, or ``None`` when nothing could be
        parsed or the matched numbers are not a valid date/time.
    """
    published = datetime.fromtimestamp(pub_ts, tz=CN_TZ)
    body = clean_text(text or '')
    window_match = WINDOW_RE.search(body)
    duration_match = DURATION_RE.search(body)
    try:
        if window_match:
            sm, sd, sh, smin, eh, emin = map(int, window_match.groups())
            start, end = _build_window_dt(published, sm, sd, sh, smin, eh, emin)
            # An explicitly stated duration wins over the computed one.
            hours = int(duration_match.group(1)) if duration_match else _hours_between(start, end)
            return MaintenanceInfo(window=f'{start:%Y-%m-%d} {start:%H:%M}~{end:%H:%M}', duration_hours=hours)
        if duration_match:
            title_match = TITLE_TIME_RE.search(title or '')
            if title_match:
                sm, sd, sh, smin = map(int, title_match.groups())
                hours = int(duration_match.group(1))
                # Only the start is needed; the end is derived from the duration.
                start, _ = _build_window_dt(published, sm, sd, sh, smin, sh, smin)
                end = start + timedelta(hours=hours)
                logger.warning('WINDOW_RE not matched; synthesized window from TITLE_TIME_RE and DURATION_RE (%02d-%02d %02d:%02d, duration %d h).', sm, sd, sh, smin, hours)
                return MaintenanceInfo(window=f'{start:%Y-%m-%d} {start:%H:%M}~{end:%H:%M}', duration_hours=hours)
    except (ValueError, OverflowError) as exc:
        # The patterns accept any 1-2 digit numbers (e.g. month 13, hour 25);
        # report those as a parse failure rather than crashing.
        logger.warning('Matched maintenance time is not a valid date/time: %s', exc)
        return None
    logger.warning('Failed to parse maintenance window from article text.')
    return None
def latest_record(records: list[dict]) -> dict:
    """Return the record with the greatest ``pub_ts``.

    When several records share the greatest value, the first of them wins.

    Raises:
        ValueError: If ``records`` is empty.
    """
    def published_at(record):
        return record['pub_ts']

    return max(records, key=published_at)
def parse_records(records: list[dict]) -> Optional[MaintenanceInfo]:
    """Parse the downtime window of the most recently published record.

    Args:
        records: Cached article records with ``text``, ``pub_ts`` and ``title``.

    Returns:
        The parsed maintenance info, or ``None`` when there are no records or
        the newest one cannot be parsed.
    """
    if records:
        newest = latest_record(records)
        return parse_maintenance(newest['text'], newest['pub_ts'], newest['title'])
    return None
def fetch_articles(cache_path: Path, state_path: Path, mid: int, collection_name: str, timeout: float, force: bool, verbose: bool) -> tuple[list[dict], bool]:
    """Fetch the collection's articles, reusing the cache when nothing changed.

    The download is skipped only when ``force`` is false, the stored state
    matches the collection's current id, article count and update time, and
    the cache file exists.

    Args:
        cache_path: JSON Lines cache file for the article records.
        state_path: JSON file holding the state of the previous fetch.
        mid: Bilibili account UID.
        collection_name: Exact name of the article collection.
        timeout: Request timeout in seconds.
        force: Always download, even if the state says nothing changed.
        verbose: Also log the raw collection-list response.

    Returns:
        ``(records, updated)``: the article records, and whether they were
        freshly downloaded (``True``) or read from the cache (``False``).

    Raises:
        FetchError: If the API reports a failure or the collection is missing.
        requests.RequestException: On transport errors.
    """
    state = load_state(state_path)
    # Close the session (and its pooled connections) when done.
    with session_for(mid) as session:
        meta, lists_payload = find_collection(session, mid, collection_name, timeout)
        if verbose:
            logger.info('%s', json.dumps(lists_payload, ensure_ascii=False, indent=2))
        unchanged = (
            state.get('collection_id') == meta.collection_id
            and state.get('articles_count') == meta.articles_count
            and state.get('update_time') == meta.update_time
        )
        # Without the cache file there is nothing to reuse, so an up-to-date
        # state alone must not suppress the download.
        if not force and unchanged and cache_path.exists():
            logger.info('skip: count=%s update_time=%s', meta.articles_count, meta.update_time)
            return load_jsonl(cache_path), False
        payload = get_json(session, ARTICLE_COLLECTION_API, {'id': str(meta.collection_id)}, timeout)
    # 'data' or 'articles' may be null/absent for an empty collection.
    articles = (payload.get('data') or {}).get('articles') or []
    records = [normalize_article(article, meta) for article in articles]
    persist_fetch(cache_path, state_path, records, {'last_run_at': datetime.now(tz=CN_TZ).isoformat(timespec='seconds'), 'collection_id': meta.collection_id, 'articles_count': meta.articles_count, 'update_time': meta.update_time})
    logger.info('updated: count=%s update_time=%s records=%s', meta.articles_count, meta.update_time, len(records))
    return records, True
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser.

    Defaults are rendered in the help text through argparse's ``%(default)s``
    specifier rather than being interpolated into the help string, because
    argparse %-formats help strings and a literal ``%`` in a default (for
    example in the install path) would otherwise break ``--help``.

    Returns:
        A parser with one positional ``command`` (``fetch``, ``parse`` or
        ``run``) and the options ``--cache``, ``--state``, ``--mid``,
        ``--collection-name``, ``--timeout``, ``--force`` and ``--verbose``.
    """
    parser = argparse.ArgumentParser(prog='bilibili_maintenance.py', description='碧蓝航线停服维护公告爬取工具')
    parser.add_argument('command', choices=['fetch', 'parse', 'run'], help='fetch: 拉取最新停服维护公告 | parse: 基于缓存解析停服时间 | run: 拉取并解析最新公告')
    parser.add_argument('--cache', type=Path, default=DEFAULT_CACHE, help='公告缓存文件路径(默认 %(default)s)')
    parser.add_argument('--state', type=Path, default=DEFAULT_STATE, help='合集状态文件路径(默认 %(default)s)')
    parser.add_argument('--mid', type=int, default=DEFAULT_MID, help='B站UP主UID(默认 %(default)s)')
    parser.add_argument('--collection-name', default=DEFAULT_COLLECTION_NAME, help='专栏合集名称,须与B站合集页标题一致(默认 %(default)r)')
    parser.add_argument('--timeout', type=float, default=30.0, help='网络请求超时秒数(默认 30)')
    parser.add_argument('--force', action='store_true', help='强制重新拉取并刷新缓存数据')
    parser.add_argument('--verbose', action='store_true', help='启用调试日志,输出 API 原始响应')
    return parser
def cmd_fetch(args: argparse.Namespace) -> int:
    """Run the ``fetch`` command.

    Args:
        args: Parsed command-line arguments.

    Returns:
        ``0`` on success, ``1`` when fetching failed (the error is logged).
    """
    try:
        fetch_articles(
            cache_path=args.cache,
            state_path=args.state,
            mid=args.mid,
            collection_name=args.collection_name,
            timeout=args.timeout,
            force=args.force,
            verbose=args.verbose,
        )
    except (FetchError, requests.RequestException) as exc:
        logger.error('fetch failed: %s', exc)
        return 1
    else:
        return 0
def cmd_parse(args: argparse.Namespace) -> int:
    """Run the ``parse`` command on the cached records.

    Prints the downtime window to stdout, or an empty line when the cache
    yields no parsable announcement.

    Args:
        args: Parsed command-line arguments; only ``cache`` is read.

    Returns:
        ``0`` when a window was printed, ``2`` otherwise.
    """
    cached_records = load_jsonl(args.cache)
    info = parse_records(cached_records)
    if not info:
        # Still emit a line on failure: the output is one line either way.
        print()
        return 2
    print(info.window)
    return 0
def cmd_run(args: argparse.Namespace) -> int:
    """Run the ``run`` command: always fetch, then parse the fresh records.

    Args:
        args: Parsed command-line arguments.

    Returns:
        ``0`` when a window was printed, ``1`` when fetching failed, ``2``
        when the fetched records could not be parsed (an empty line is
        printed in that case).
    """
    try:
        # force=True: never reuse the cache, regardless of --force.
        records, _updated = fetch_articles(
            args.cache,
            args.state,
            args.mid,
            args.collection_name,
            args.timeout,
            force=True,
            verbose=args.verbose,
        )
    except (FetchError, requests.RequestException) as exc:
        logger.error('fetch failed: %s', exc)
        return 1

    info = parse_records(records)
    if not info:
        print()
        return 2
    print(info.window)
    return 0
def main() -> int:
    """Parse the command line and dispatch to the selected command.

    Returns:
        The exit code of the command handler; ``1`` if the command is unknown.
    """
    configure_stdio()
    logging.basicConfig(level=logging.WARNING, format='%(levelname)s: %(message)s')
    args = build_parser().parse_args()
    if args.verbose:
        # --verbose lowers the root logger threshold from WARNING to INFO.
        logging.getLogger().setLevel(logging.INFO)

    handlers = {
        'fetch': cmd_fetch,
        'parse': cmd_parse,
        'run': cmd_run,
    }
    handler = handlers.get(args.command)
    if handler is None:
        return 1
    return handler(args)


if __name__ == '__main__':
    sys.exit(main())