完成所有来源数据清洗和表格导入

This commit is contained in:
BoliviaYu
2026-03-05 23:55:18 +08:00
parent 5efb8fc9ed
commit 70fce8ebab
9 changed files with 1887 additions and 255 deletions

94
main.py
View File

@@ -84,7 +84,9 @@ def parse_datetime(raw: str, *, is_end: bool = False) -> datetime:
return dt
def load_runtime_config() -> tuple[list[str], datetime | None, datetime | None, dict, dict]:
def load_runtime_config() -> tuple[
list[str], datetime | None, datetime | None, dict, dict, dict
]:
if not os.path.exists(CONFIG_FILE):
raise FileNotFoundError(f"未找到配置文件: {CONFIG_FILE}")
@@ -142,7 +144,36 @@ def load_runtime_config() -> tuple[list[str], datetime | None, datetime | None,
if not mysql_final["password"]:
raise ValueError("配置错误: mysql.password 不能为空")
return sources, start_dt, end_dt, throttle_cfg, mysql_final
backfill = cfg.get("backfill", {})
if not isinstance(backfill, dict):
raise ValueError("配置错误: backfill 必须是对象")
backfill_enabled = bool(backfill.get("enabled", False))
backfill_start_raw = str(backfill.get("start", "") or "").strip()
backfill_end_raw = str(backfill.get("end", "") or "").strip()
backfill_sources = backfill.get("sources", [])
if backfill_sources and not isinstance(backfill_sources, list):
raise ValueError("配置错误: backfill.sources 必须是数组")
backfill_sources = [str(s).strip() for s in backfill_sources if str(s).strip()]
if backfill_enabled:
bf_start = parse_datetime(backfill_start_raw, is_end=False) if backfill_start_raw else None
bf_end = parse_datetime(backfill_end_raw, is_end=True) if backfill_end_raw else None
if bf_start and bf_end and bf_start > bf_end:
raise ValueError("配置错误: backfill.start 不能晚于 backfill.end")
else:
bf_start = None
bf_end = None
backfill_cfg = {
"enabled": backfill_enabled,
"start_dt": bf_start,
"end_dt": bf_end,
"sources": backfill_sources,
"ignore_sync_state": bool(backfill.get("ignore_sync_state", True)),
}
return sources, start_dt, end_dt, throttle_cfg, mysql_final, backfill_cfg
# =======================
@@ -337,6 +368,7 @@ async def scrape_one_source(
raw_source: str,
start_dt: datetime | None,
end_dt: datetime | None,
ignore_sync_state: bool,
throttle_cfg: dict,
):
try:
@@ -358,9 +390,17 @@ async def scrape_one_source(
use_throttle = bool(throttle_cfg.get("enabled", True))
per_message_delay = float(throttle_cfg.get("per_message_delay_sec", 0.0))
if window_mode:
if window_mode and ignore_sync_state:
logger.info(f"[{source_key}] 时间窗口模式 start={start_dt} end={end_dt} (UTC)")
iterator = client.iter_messages(entity, limit=INITIAL_BACKFILL_LIMIT)
elif window_mode:
# 用于日常窗口抓取,仍可依赖 sync_state 避免重复扫过大历史。
last_id = store.get_last_message_id(source_key)
logger.info(
f"[{source_key}] 窗口增量模式 start={start_dt} end={end_dt} (UTC), "
f"message_id > {last_id}"
)
iterator = client.iter_messages(entity, min_id=last_id, reverse=True)
else:
last_id = store.get_last_message_id(source_key)
logger.info(f"[{source_key}] 增量模式,从 message_id > {last_id} 开始")
@@ -392,7 +432,8 @@ async def scrape_one_source(
if use_throttle and per_message_delay > 0:
await asyncio.sleep(per_message_delay)
if not window_mode and max_seen_id > 0:
should_update_sync = (not window_mode) or (window_mode and not ignore_sync_state)
if should_update_sync and max_seen_id > 0:
old_last = store.get_last_message_id(source_key)
if max_seen_id > old_last:
store.set_last_message_id(source_key, max_seen_id)
@@ -404,6 +445,7 @@ async def run_scraper(
sources: list[str],
start_dt: datetime | None,
end_dt: datetime | None,
ignore_sync_state: bool,
throttle_cfg: dict,
store: MySQLStore,
):
@@ -420,7 +462,15 @@ async def run_scraper(
between_sources_delay = float(throttle_cfg.get("between_sources_delay_sec", 0.0))
for idx, source in enumerate(sources):
await scrape_one_source(client, store, source, start_dt, end_dt, throttle_cfg)
await scrape_one_source(
client,
store,
source,
start_dt,
end_dt,
ignore_sync_state,
throttle_cfg,
)
if use_throttle and between_sources_delay > 0 and idx < len(sources) - 1:
logger.info(f"源切换等待 {between_sources_delay:.2f}s 以降低风控")
@@ -433,7 +483,28 @@ async def run_scraper(
# 主程序入口
# =======================
def main():
sources, start_dt, end_dt, throttle_cfg, mysql_cfg = load_runtime_config()
(
sources,
start_dt,
end_dt,
throttle_cfg,
mysql_cfg,
backfill_cfg,
) = load_runtime_config()
if backfill_cfg["enabled"]:
if backfill_cfg["sources"]:
sources = backfill_cfg["sources"]
start_dt = backfill_cfg["start_dt"]
end_dt = backfill_cfg["end_dt"]
ignore_sync_state = bool(backfill_cfg["ignore_sync_state"])
logger.info(
"回补模式启用: "
f"sources={sources}, start={start_dt}, end={end_dt}, "
f"ignore_sync_state={ignore_sync_state}"
)
else:
ignore_sync_state = False
logger.info("程序启动")
logger.info(f"本次数据源: {sources}")
@@ -450,7 +521,16 @@ def main():
store.connect()
try:
store.init_db()
asyncio.run(run_scraper(sources, start_dt, end_dt, throttle_cfg, store))
asyncio.run(
run_scraper(
sources,
start_dt,
end_dt,
ignore_sync_state,
throttle_cfg,
store,
)
)
finally:
store.close()