完成所有来源数据清洗和表格导入
This commit is contained in:
94
main.py
94
main.py
@@ -84,7 +84,9 @@ def parse_datetime(raw: str, *, is_end: bool = False) -> datetime:
|
||||
return dt
|
||||
|
||||
|
||||
def load_runtime_config() -> tuple[list[str], datetime | None, datetime | None, dict, dict]:
|
||||
def load_runtime_config() -> tuple[
|
||||
list[str], datetime | None, datetime | None, dict, dict, dict
|
||||
]:
|
||||
if not os.path.exists(CONFIG_FILE):
|
||||
raise FileNotFoundError(f"未找到配置文件: {CONFIG_FILE}")
|
||||
|
||||
@@ -142,7 +144,36 @@ def load_runtime_config() -> tuple[list[str], datetime | None, datetime | None,
|
||||
if not mysql_final["password"]:
|
||||
raise ValueError("配置错误: mysql.password 不能为空")
|
||||
|
||||
return sources, start_dt, end_dt, throttle_cfg, mysql_final
|
||||
backfill = cfg.get("backfill", {})
|
||||
if not isinstance(backfill, dict):
|
||||
raise ValueError("配置错误: backfill 必须是对象")
|
||||
|
||||
backfill_enabled = bool(backfill.get("enabled", False))
|
||||
backfill_start_raw = str(backfill.get("start", "") or "").strip()
|
||||
backfill_end_raw = str(backfill.get("end", "") or "").strip()
|
||||
backfill_sources = backfill.get("sources", [])
|
||||
if backfill_sources and not isinstance(backfill_sources, list):
|
||||
raise ValueError("配置错误: backfill.sources 必须是数组")
|
||||
backfill_sources = [str(s).strip() for s in backfill_sources if str(s).strip()]
|
||||
|
||||
if backfill_enabled:
|
||||
bf_start = parse_datetime(backfill_start_raw, is_end=False) if backfill_start_raw else None
|
||||
bf_end = parse_datetime(backfill_end_raw, is_end=True) if backfill_end_raw else None
|
||||
if bf_start and bf_end and bf_start > bf_end:
|
||||
raise ValueError("配置错误: backfill.start 不能晚于 backfill.end")
|
||||
else:
|
||||
bf_start = None
|
||||
bf_end = None
|
||||
|
||||
backfill_cfg = {
|
||||
"enabled": backfill_enabled,
|
||||
"start_dt": bf_start,
|
||||
"end_dt": bf_end,
|
||||
"sources": backfill_sources,
|
||||
"ignore_sync_state": bool(backfill.get("ignore_sync_state", True)),
|
||||
}
|
||||
|
||||
return sources, start_dt, end_dt, throttle_cfg, mysql_final, backfill_cfg
|
||||
|
||||
|
||||
# =======================
|
||||
@@ -337,6 +368,7 @@ async def scrape_one_source(
|
||||
raw_source: str,
|
||||
start_dt: datetime | None,
|
||||
end_dt: datetime | None,
|
||||
ignore_sync_state: bool,
|
||||
throttle_cfg: dict,
|
||||
):
|
||||
try:
|
||||
@@ -358,9 +390,17 @@ async def scrape_one_source(
|
||||
use_throttle = bool(throttle_cfg.get("enabled", True))
|
||||
per_message_delay = float(throttle_cfg.get("per_message_delay_sec", 0.0))
|
||||
|
||||
if window_mode:
|
||||
if window_mode and ignore_sync_state:
|
||||
logger.info(f"[{source_key}] 时间窗口模式 start={start_dt} end={end_dt} (UTC)")
|
||||
iterator = client.iter_messages(entity, limit=INITIAL_BACKFILL_LIMIT)
|
||||
elif window_mode:
|
||||
# 用于日常窗口抓取,仍可依赖 sync_state 避免重复扫过大历史。
|
||||
last_id = store.get_last_message_id(source_key)
|
||||
logger.info(
|
||||
f"[{source_key}] 窗口增量模式 start={start_dt} end={end_dt} (UTC), "
|
||||
f"message_id > {last_id}"
|
||||
)
|
||||
iterator = client.iter_messages(entity, min_id=last_id, reverse=True)
|
||||
else:
|
||||
last_id = store.get_last_message_id(source_key)
|
||||
logger.info(f"[{source_key}] 增量模式,从 message_id > {last_id} 开始")
|
||||
@@ -392,7 +432,8 @@ async def scrape_one_source(
|
||||
if use_throttle and per_message_delay > 0:
|
||||
await asyncio.sleep(per_message_delay)
|
||||
|
||||
if not window_mode and max_seen_id > 0:
|
||||
should_update_sync = (not window_mode) or (window_mode and not ignore_sync_state)
|
||||
if should_update_sync and max_seen_id > 0:
|
||||
old_last = store.get_last_message_id(source_key)
|
||||
if max_seen_id > old_last:
|
||||
store.set_last_message_id(source_key, max_seen_id)
|
||||
@@ -404,6 +445,7 @@ async def run_scraper(
|
||||
sources: list[str],
|
||||
start_dt: datetime | None,
|
||||
end_dt: datetime | None,
|
||||
ignore_sync_state: bool,
|
||||
throttle_cfg: dict,
|
||||
store: MySQLStore,
|
||||
):
|
||||
@@ -420,7 +462,15 @@ async def run_scraper(
|
||||
between_sources_delay = float(throttle_cfg.get("between_sources_delay_sec", 0.0))
|
||||
|
||||
for idx, source in enumerate(sources):
|
||||
await scrape_one_source(client, store, source, start_dt, end_dt, throttle_cfg)
|
||||
await scrape_one_source(
|
||||
client,
|
||||
store,
|
||||
source,
|
||||
start_dt,
|
||||
end_dt,
|
||||
ignore_sync_state,
|
||||
throttle_cfg,
|
||||
)
|
||||
|
||||
if use_throttle and between_sources_delay > 0 and idx < len(sources) - 1:
|
||||
logger.info(f"源切换等待 {between_sources_delay:.2f}s 以降低风控")
|
||||
@@ -433,7 +483,28 @@ async def run_scraper(
|
||||
# 主程序入口
|
||||
# =======================
|
||||
def main():
|
||||
sources, start_dt, end_dt, throttle_cfg, mysql_cfg = load_runtime_config()
|
||||
(
|
||||
sources,
|
||||
start_dt,
|
||||
end_dt,
|
||||
throttle_cfg,
|
||||
mysql_cfg,
|
||||
backfill_cfg,
|
||||
) = load_runtime_config()
|
||||
|
||||
if backfill_cfg["enabled"]:
|
||||
if backfill_cfg["sources"]:
|
||||
sources = backfill_cfg["sources"]
|
||||
start_dt = backfill_cfg["start_dt"]
|
||||
end_dt = backfill_cfg["end_dt"]
|
||||
ignore_sync_state = bool(backfill_cfg["ignore_sync_state"])
|
||||
logger.info(
|
||||
"回补模式启用: "
|
||||
f"sources={sources}, start={start_dt}, end={end_dt}, "
|
||||
f"ignore_sync_state={ignore_sync_state}"
|
||||
)
|
||||
else:
|
||||
ignore_sync_state = False
|
||||
|
||||
logger.info("程序启动")
|
||||
logger.info(f"本次数据源: {sources}")
|
||||
@@ -450,7 +521,16 @@ def main():
|
||||
store.connect()
|
||||
try:
|
||||
store.init_db()
|
||||
asyncio.run(run_scraper(sources, start_dt, end_dt, throttle_cfg, store))
|
||||
asyncio.run(
|
||||
run_scraper(
|
||||
sources,
|
||||
start_dt,
|
||||
end_dt,
|
||||
ignore_sync_state,
|
||||
throttle_cfg,
|
||||
store,
|
||||
)
|
||||
)
|
||||
finally:
|
||||
store.close()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user