From 23e7ad0b93c76e72413c3cf84c0ffdf359884287 Mon Sep 17 00:00:00 2001 From: BoliviaYu Date: Fri, 27 Mar 2026 14:02:17 +0800 Subject: [PATCH] =?UTF-8?q?260327-=E7=AA=97=E5=8F=A3=E8=BF=AD=E4=BB=A3?= =?UTF-8?q?=E9=80=BB=E8=BE=91=E4=BF=AE=E5=A4=8D=EF=BC=8C=E4=B8=8D=E6=98=AF?= =?UTF-8?q?=E6=AF=8F=E5=A4=A9=E5=8F=AA=E8=B7=91=E4=B8=80=E6=9D=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main.py | 17 ++++++++++++----- run_daily_incremental.sh | 21 +++++++++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/main.py b/main.py index 42dfdb9..f4a77c1 100644 --- a/main.py +++ b/main.py @@ -403,7 +403,8 @@ async def scrape_one_source( f"[{source_key}] 窗口增量模式 start={start_dt} end={end_dt} (UTC), " f"message_id > {last_id}" ) - iterator = client.iter_messages(entity, min_id=last_id, reverse=True) + # reverse=False: 从新到旧遍历,结合时间窗口可快速终止,避免扫全量历史。 + iterator = client.iter_messages(entity, min_id=last_id, reverse=False) else: last_id = store.get_last_message_id(source_key) logger.info(f"[{source_key}] 增量模式,从 message_id > {last_id} 开始") @@ -411,6 +412,10 @@ async def scrape_one_source( async for message in iterator: scanned += 1 + msg_id = int(message.id) + if msg_id > max_seen_id: + max_seen_id = msg_id + message_dt = message.date.astimezone(timezone.utc) if window_mode: @@ -419,16 +424,12 @@ async def scrape_one_source( if start_dt and message_dt < start_dt: break - msg_id = int(message.id) msg_date = message_dt.strftime("%Y-%m-%d %H:%M:%S") content = build_message_content(message) if store.save_message(source_key, chat_id, msg_id, content, msg_date): inserted += 1 - if msg_id > max_seen_id: - max_seen_id = msg_id - if scanned % 200 == 0: logger.info(f"[{source_key}] 进度: 扫描 {scanned} 条, 新增 {inserted} 条") @@ -441,6 +442,12 @@ async def scrape_one_source( if max_seen_id > old_last: store.set_last_message_id(source_key, max_seen_id) + if window_mode and scanned <= 1: + logger.warning( + f"[{source_key}] 本次仅扫描 {scanned} 条消息,请检查源最近是否活跃," + "或确认 time_window 配置与系统时钟是否正确。" + ) + logger.info(f"[{source_key}] 完成: 扫描 {scanned} 条, 新增 {inserted} 条") diff --git a/run_daily_incremental.sh b/run_daily_incremental.sh index 7054c02..c500174 100755 --- a/run_daily_incremental.sh +++ b/run_daily_incremental.sh @@ -25,6 +25,27 @@ if [[ ! -x "$PY_BIN" ]]; then exit 1 fi +# Dependency preflight (avoid silent runtime failures after cron upgrades) +if ! "$PY_BIN" - <<'PY' >> "$LOG_DIR/daily_job.log" 2>&1 +import importlib +missing = [] +for name in ("telethon", "pymysql", "cryptography"): + try: + importlib.import_module(name) + except Exception: + missing.append(name) + +if missing: + print(f"missing python packages: {', '.join(missing)}") + raise SystemExit(1) + +print("python dependency preflight passed") +PY +then + echo "[$(date '+%Y-%m-%d %H:%M:%S')] dependency preflight failed, exit" >> "$LOG_DIR/daily_job.log" + exit 1 +fi + # Auto-advance time window to a rolling daily range. "$PY_BIN" - <<'PY' import json