完成所有来源数据清洗和表格导入
This commit is contained in:
528
sync_to_cloud_mysql.py
Normal file
528
sync_to_cloud_mysql.py
Normal file
@@ -0,0 +1,528 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Sync local MySQL data to cloud MySQL daily (incremental + upsert).
|
||||
|
||||
Synced tables:
|
||||
- messages (incremental by local id)
|
||||
- sync_state (full upsert, small table)
|
||||
- structured_jobs (incremental by local id + recent updates by cleaned_at)
|
||||
- clean_state (full upsert, small table)
|
||||
- internship_jobs_raw (incremental by local id, if table exists)
|
||||
|
||||
State is stored on cloud DB in table `cloud_sync_state`.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import pymysql
|
||||
|
||||
# Path to the JSON config file holding the `mysql` (local) and `mysql_cloud` sections.
CONFIG_FILE = "config.json"
# Key identifying this pipeline's cursor row in the cloud `cloud_sync_state` table.
PIPELINE_NAME = "local_to_cloud_mysql_v1"
# Rows fetched per local SELECT round-trip during incremental sync.
BATCH_SIZE = 1000
|
||||
|
||||
|
||||
def qid(name: str) -> str:
    """Return *name* quoted as a MySQL identifier, doubling embedded backticks."""
    escaped = name.replace("`", "``")
    return "`" + escaped + "`"
|
||||
|
||||
|
||||
def setup_logger() -> logging.Logger:
    """Configure and return the module logger (console + logs/ file), once.

    Subsequent calls return the already-configured logger without adding
    duplicate handlers.
    """
    os.makedirs("logs", exist_ok=True)
    log = logging.getLogger("sync_to_cloud_mysql")
    log.setLevel(logging.INFO)
    if log.handlers:
        # Already configured (e.g. repeated import/call); don't stack handlers.
        return log

    formatter = logging.Formatter(
        "[%(asctime)s] [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    # Console first, then the persistent log file — same order as handler setup.
    for handler in (
        logging.StreamHandler(),
        logging.FileHandler("logs/sync_to_cloud_mysql.log", encoding="utf-8"),
    ):
        handler.setFormatter(formatter)
        log.addHandler(handler)
    return log
|
||||
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def load_config() -> tuple[dict, dict]:
    """Load and normalize (local, cloud) MySQL settings from config.json.

    Each field resolves as: config value, else the matching MYSQL_LOCAL_* /
    MYSQL_CLOUD_* env var, else a hard default. Raises FileNotFoundError when
    the config file is absent and ValueError on malformed sections or a
    missing/placeholder password.
    """
    if not os.path.exists(CONFIG_FILE):
        raise FileNotFoundError(f"未找到配置文件: {CONFIG_FILE}")

    with open(CONFIG_FILE, "r", encoding="utf-8") as f:
        cfg = json.load(f)

    local_cfg = cfg.get("mysql", {})
    cloud_cfg = cfg.get("mysql_cloud", {})

    if not isinstance(local_cfg, dict) or not isinstance(cloud_cfg, dict):
        raise ValueError("配置错误: mysql / mysql_cloud 必须是对象")

    def norm(db: dict, env_prefix: str, defaults: dict) -> dict:
        def pick(key: str, fallback):
            # Config value wins, then the env var, then the fallback.
            return db.get(key) or os.getenv(f"{env_prefix}_{key.upper()}", fallback)

        out = {
            "host": pick("host", defaults["host"]),
            "port": int(pick("port", defaults["port"])),
            "user": pick("user", defaults["user"]),
            "password": pick("password", ""),
            "database": pick("database", defaults["database"]),
            "charset": pick("charset", "utf8mb4"),
        }
        if not out["password"] or out["password"] == "CHANGE_ME":
            raise ValueError(f"配置错误: {env_prefix}.password 不能为空")
        return out

    defaults = {"host": "127.0.0.1", "port": "3306", "user": "jobs_user", "database": "jobs"}
    local = norm(local_cfg, "MYSQL_LOCAL", defaults)
    cloud = norm(cloud_cfg, "MYSQL_CLOUD", defaults)
    return local, cloud
|
||||
|
||||
|
||||
def connect_mysql(cfg: dict):
    """Open an autocommit PyMySQL connection (DictCursor) from a normalized config dict."""
    options = {
        "host": cfg["host"],
        "port": cfg["port"],
        "user": cfg["user"],
        "password": cfg["password"],
        "database": cfg["database"],
        "charset": cfg["charset"],
        # Autocommit keeps every upsert durable without explicit transactions.
        "autocommit": True,
        # DictCursor: rows come back as column-name dicts, which sync code relies on.
        "cursorclass": pymysql.cursors.DictCursor,
    }
    return pymysql.connect(**options)
|
||||
|
||||
|
||||
def ensure_cloud_tables(cloud_conn):
    """Create the per-pipeline cursor table `cloud_sync_state` on the cloud DB if missing."""
    ddl = """
        CREATE TABLE IF NOT EXISTS cloud_sync_state (
            pipeline_name VARCHAR(128) PRIMARY KEY,
            last_messages_id BIGINT NOT NULL DEFAULT 0,
            last_structured_jobs_id BIGINT NOT NULL DEFAULT 0,
            last_internship_id BIGINT NOT NULL DEFAULT 0,
            last_structured_sync_at DATETIME NULL,
            updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
                ON UPDATE CURRENT_TIMESTAMP
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
    """
    with cloud_conn.cursor() as cur:
        cur.execute(ddl)
|
||||
|
||||
|
||||
def ensure_destination_tables(local_conn, cloud_conn):
    """Mirror the local data tables onto the cloud DB, creating missing ones
    from the local `SHOW CREATE TABLE` DDL.

    Required tables must exist locally (RuntimeError otherwise); optional
    tables are skipped with a log line when absent. Tables already present on
    the cloud side are left untouched.
    """
    required = ["messages", "sync_state", "structured_jobs", "clean_state"]
    optional = ["internship_jobs_raw"]

    for table in required + optional:
        if not table_exists(local_conn, table):
            if table in required:
                raise RuntimeError(f"本地缺少必要表: {table}")
            logger.info(f"skip create cloud table {table}: local table not exists")
            continue

        if table_exists(cloud_conn, table):
            continue

        with local_conn.cursor() as cur:
            # Fix: quote via qid() (escapes embedded backticks), consistent
            # with identifier quoting everywhere else in this module.
            cur.execute(f"SHOW CREATE TABLE {qid(table)}")
            row = cur.fetchone()
            ddl = row.get("Create Table") if isinstance(row, dict) else None
            if not ddl:
                raise RuntimeError(f"无法读取本地表结构: {table}")

        with cloud_conn.cursor() as cur:
            # Replay the local CREATE TABLE verbatim on the cloud side.
            cur.execute(ddl)
        logger.info(f"created cloud table from local ddl: {table}")
|
||||
|
||||
|
||||
def ensure_cloud_column(local_conn, cloud_conn, table: str, column: str):
    """Add *column* to the cloud copy of *table* when the local table has it
    and the cloud table does not.

    Builds a minimal ``ALTER TABLE ... ADD COLUMN`` from the local
    ``SHOW COLUMNS`` row (type, nullability, default, extra). No-op when
    either side lacks the table, the local column is absent, or the cloud
    column already exists.
    """
    if not table_exists(local_conn, table) or not table_exists(cloud_conn, table):
        return

    with local_conn.cursor() as cur:
        cur.execute(f"SHOW COLUMNS FROM {qid(table)} LIKE %s", (column,))
        local_col = cur.fetchone()
    if not local_col:
        return

    with cloud_conn.cursor() as cur:
        cur.execute(f"SHOW COLUMNS FROM {qid(table)} LIKE %s", (column,))
        cloud_col = cur.fetchone()
    if cloud_col:
        # Cloud already has the column — nothing to do.
        return

    # Build minimal compatible ADD COLUMN from local definition.
    col_type = local_col.get("Type")
    nullable = local_col.get("Null", "YES") == "YES"
    default_val = local_col.get("Default")
    extra = local_col.get("Extra") or ""

    parts = [f"ALTER TABLE {qid(table)} ADD COLUMN {qid(column)} {col_type}"]
    parts.append("NULL" if nullable else "NOT NULL")
    if default_val is not None:
        if isinstance(default_val, str):
            # Escape single quotes by doubling (MySQL string-literal rule).
            # NOTE(review): SHOW COLUMNS also reports expression defaults such
            # as CURRENT_TIMESTAMP as plain strings; those would be wrongly
            # quoted here — confirm only literal defaults occur for the
            # columns this is used on.
            escaped = default_val.replace("'", "''")
            parts.append(f"DEFAULT '{escaped}'")
        else:
            parts.append(f"DEFAULT {default_val}")
    if extra:
        # Appended verbatim (e.g. "on update CURRENT_TIMESTAMP").
        parts.append(extra)

    sql = " ".join(parts)
    with cloud_conn.cursor() as cur:
        cur.execute(sql)
    logger.info(f"added missing cloud column: {table}.{column}")
|
||||
|
||||
|
||||
def get_cloud_state(cloud_conn) -> dict:
    """Load this pipeline's sync cursors from cloud_sync_state.

    Returns a dict of zeroed cursors on first run (no state row yet).
    """
    query = """
            SELECT last_messages_id, last_structured_jobs_id, last_internship_id, last_structured_sync_at
            FROM cloud_sync_state
            WHERE pipeline_name=%s
            """
    with cloud_conn.cursor() as cur:
        cur.execute(query, (PIPELINE_NAME,))
        row = cur.fetchone()

    if row:
        return row
    # No row yet: start every cursor from scratch.
    return {
        "last_messages_id": 0,
        "last_structured_jobs_id": 0,
        "last_internship_id": 0,
        "last_structured_sync_at": None,
    }
|
||||
|
||||
|
||||
def set_cloud_state(cloud_conn, state: dict):
    """Persist the pipeline's sync cursors (insert the row or update in place)."""
    params = (
        PIPELINE_NAME,
        state["last_messages_id"],
        state["last_structured_jobs_id"],
        state["last_internship_id"],
        state["last_structured_sync_at"],
    )
    upsert = """
            INSERT INTO cloud_sync_state (
                pipeline_name, last_messages_id, last_structured_jobs_id,
                last_internship_id, last_structured_sync_at, updated_at
            ) VALUES (%s, %s, %s, %s, %s, NOW())
            ON DUPLICATE KEY UPDATE
                last_messages_id=VALUES(last_messages_id),
                last_structured_jobs_id=VALUES(last_structured_jobs_id),
                last_internship_id=VALUES(last_internship_id),
                last_structured_sync_at=VALUES(last_structured_sync_at),
                updated_at=NOW()
            """
    with cloud_conn.cursor() as cur:
        cur.execute(upsert, params)
|
||||
|
||||
|
||||
def table_exists(conn, table: str) -> bool:
    """Return True when *table* exists in the connection's current database."""
    with conn.cursor() as cur:
        cur.execute("SHOW TABLES LIKE %s", (table,))
        found = cur.fetchone()
    return found is not None
|
||||
|
||||
|
||||
def sync_messages(local_conn, cloud_conn, last_id: int) -> int:
    """Incrementally upsert local `messages` rows with id > last_id into the cloud DB.

    Returns the highest local id seen (the new sync cursor).
    """
    logger.info(f"sync messages from id > {last_id}")

    select_sql = """
                SELECT id, source, chat_id, message_id, content, date, created_at
                FROM messages
                WHERE id > %s
                ORDER BY id ASC
                LIMIT %s
                """
    upsert_sql = """
                    INSERT INTO messages (source, chat_id, message_id, content, date, created_at)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON DUPLICATE KEY UPDATE
                        chat_id=VALUES(chat_id),
                        content=VALUES(content),
                        date=VALUES(date)
                    """

    max_id, total = last_id, 0
    while True:
        with local_conn.cursor() as cur:
            cur.execute(select_sql, (max_id, BATCH_SIZE))
            batch = cur.fetchall()

        if not batch:
            break

        with cloud_conn.cursor() as cur:
            for row in batch:
                cur.execute(
                    upsert_sql,
                    (
                        row["source"],
                        row["chat_id"],
                        row["message_id"],
                        row["content"],
                        row["date"],
                        row["created_at"],
                    ),
                )
                # Advance the cursor past every row we pushed.
                max_id = max(max_id, int(row["id"]))
                total += 1

        logger.info(f"messages synced batch={len(batch)}, total={total}, max_id={max_id}")

    return max_id
|
||||
|
||||
|
||||
def sync_small_table_full(local_conn, cloud_conn, table: str):
    """Fully upsert a small local table into the cloud DB (all rows, every run).

    The primary key is hard-coded for the known state tables (`sync_state` →
    source, `clean_state` → pipeline_name) and falls back to the first column
    otherwise. Skips with a log line when the local table or its rows are
    absent.
    """
    if not table_exists(local_conn, table):
        logger.info(f"skip {table}: local table not exists")
        return

    with local_conn.cursor() as cur:
        # Fix: quote the identifier via qid() — the original interpolated the
        # bare name, inconsistent with the qid() quoting used just below.
        cur.execute(f"SELECT * FROM {qid(table)}")
        rows = cur.fetchall()

    if not rows:
        logger.info(f"{table}: no rows")
        return

    cols = list(rows[0].keys())
    col_sql = ", ".join(qid(c) for c in cols)
    val_sql = ", ".join(["%s"] * len(cols))

    # primary key handling for known small tables
    if table == "sync_state":
        pk = "source"
    elif table == "clean_state":
        pk = "pipeline_name"
    else:
        pk = cols[0]

    update_cols = [c for c in cols if c != pk]
    if update_cols:
        update_sql = ", ".join(f"{qid(c)}=VALUES({qid(c)})" for c in update_cols)
    else:
        # Degenerate single-column table: a self-assignment keeps the
        # ON DUPLICATE KEY UPDATE clause syntactically valid (no-op update).
        update_sql = f"{qid(pk)}={qid(pk)}"

    with cloud_conn.cursor() as cur:
        for r in rows:
            cur.execute(
                f"""
                INSERT INTO {qid(table)} ({col_sql}) VALUES ({val_sql})
                ON DUPLICATE KEY UPDATE {update_sql}
                """,
                tuple(r[c] for c in cols),
            )

    logger.info(f"{table}: synced rows={len(rows)}")
|
||||
|
||||
|
||||
def sync_structured_jobs(local_conn, cloud_conn, last_id: int, last_sync_at) -> tuple[int, str]:
    """Sync structured_jobs in two passes and return the new cursor pair.

    Pass 1 copies new rows (local id > last_id). Pass 2 re-upserts rows whose
    cleaned_at is after last_sync_at, catching in-place updates to rows that
    were already synced. Returns (new max local id, current UTC timestamp
    string to store as the next last_structured_sync_at).
    """
    logger.info(
        f"sync structured_jobs from id > {last_id}, last_sync_at={last_sync_at}"
    )

    max_id = last_id
    total = 0
    # Ids already upserted in pass 1 so pass 2 can skip them.
    touched_ids = set()

    # 1) incremental new rows by id
    while True:
        with local_conn.cursor() as cur:
            cur.execute(
                """
                SELECT * FROM structured_jobs
                WHERE id > %s
                ORDER BY id ASC
                LIMIT %s
                """,
                (max_id, BATCH_SIZE),
            )
            rows = cur.fetchall()

        if not rows:
            break

        _upsert_structured_rows(cloud_conn, rows)
        for r in rows:
            rid = int(r["id"])
            max_id = max(max_id, rid)
            touched_ids.add(rid)
        total += len(rows)
        logger.info(f"structured_jobs incremental batch={len(rows)}, total={total}, max_id={max_id}")

    # 2) update window by cleaned_at (catch updated existing rows)
    if last_sync_at is None:
        # First run: treat every row as updated since the epoch.
        last_sync_at = "1970-01-01 00:00:00"
    if isinstance(last_sync_at, datetime):
        last_sync_at = last_sync_at.strftime("%Y-%m-%d %H:%M:%S")

    while True:
        with local_conn.cursor() as cur:
            cur.execute(
                """
                SELECT * FROM structured_jobs
                WHERE cleaned_at > %s
                ORDER BY cleaned_at ASC, id ASC
                LIMIT %s
                """,
                (last_sync_at, BATCH_SIZE),
            )
            rows = cur.fetchall()

        if not rows:
            break

        # avoid repeated heavy updates in loop: only keep rows not already touched in incremental loop
        delta = [r for r in rows if int(r["id"]) not in touched_ids]
        if delta:
            _upsert_structured_rows(cloud_conn, delta)
            total += len(delta)
            logger.info(f"structured_jobs update-window batch={len(delta)}, total={total}")

        # move cursor forward to avoid pagination loops on timestamp boundary
        # NOTE(review): advancing past the last row's cleaned_at with a strict
        # `>` filter can skip rows sharing that exact timestamp across batch
        # boundaries — confirm cleaned_at resolution makes this acceptable.
        last_row_cleaned_at = rows[-1]["cleaned_at"]
        if isinstance(last_row_cleaned_at, datetime):
            last_sync_at = last_row_cleaned_at.strftime("%Y-%m-%d %H:%M:%S")
        elif last_row_cleaned_at:
            last_sync_at = str(last_row_cleaned_at)

        if len(rows) < BATCH_SIZE:
            break

    now_utc = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    return max_id, now_utc
|
||||
|
||||
|
||||
def _upsert_structured_rows(cloud_conn, rows: list[dict]):
    """Upsert a batch of structured_jobs rows into the cloud table.

    The local auto-increment id is dropped (cloud keeps its own); on duplicate
    key, every non-key column is refreshed and cleaned_at is bumped to now.
    """
    if not rows:
        return

    # id is local technical key, do not sync into cloud table (cloud has own auto id)
    cols = [c for c in rows[0].keys() if c != "id"]

    col_sql = ", ".join(qid(c) for c in cols)
    placeholders = ", ".join(["%s"] * len(cols))
    assignments = [
        f"{qid(c)}=VALUES({qid(c)})" for c in cols if c not in ("source", "message_id")
    ]
    assignments.append(f"{qid('cleaned_at')}=CURRENT_TIMESTAMP")
    update_sql = ", ".join(assignments)

    sql = f"""
            INSERT INTO {qid('structured_jobs')} ({col_sql})
            VALUES ({placeholders})
            ON DUPLICATE KEY UPDATE {update_sql}
            """
    with cloud_conn.cursor() as cur:
        for row in rows:
            cur.execute(sql, tuple(row[c] for c in cols))
|
||||
|
||||
|
||||
def sync_internship_raw(local_conn, cloud_conn, last_id: int) -> int:
    """Incrementally upsert internship_jobs_raw rows with id > last_id to the cloud DB.

    Silently skipped when the local table does not exist. Returns the highest
    local id seen (the new sync cursor).
    """
    if not table_exists(local_conn, "internship_jobs_raw"):
        logger.info("skip internship_jobs_raw: local table not exists")
        return last_id

    max_id, total = last_id, 0
    while True:
        with local_conn.cursor() as cur:
            cur.execute(
                """
                SELECT * FROM internship_jobs_raw
                WHERE id > %s
                ORDER BY id ASC
                LIMIT %s
                """,
                (max_id, BATCH_SIZE),
            )
            batch = cur.fetchall()

        if not batch:
            break

        # Local auto-increment id stays local; cloud keeps its own.
        cols = [c for c in batch[0].keys() if c != "id"]
        col_sql = ", ".join(qid(c) for c in cols)
        placeholders = ", ".join(["%s"] * len(cols))
        assignments = [f"{qid(c)}=VALUES({qid(c)})" for c in cols if c != "fingerprint"]
        assignments.append(f"{qid('imported_at')}=CURRENT_TIMESTAMP")
        upsert_sql = f"""
                INSERT INTO {qid('internship_jobs_raw')} ({col_sql})
                VALUES ({placeholders})
                ON DUPLICATE KEY UPDATE {", ".join(assignments)}
                """

        with cloud_conn.cursor() as cur:
            for row in batch:
                cur.execute(upsert_sql, tuple(row[c] for c in cols))
                max_id = max(max_id, int(row["id"]))
                total += 1

        logger.info(f"internship_jobs_raw batch={len(batch)}, total={total}, max_id={max_id}")

    return max_id
|
||||
|
||||
|
||||
def main():
    """Run one full sync pass: ensure cloud schema, then push every table incrementally."""
    local_cfg, cloud_cfg = load_config()
    logger.info(
        "sync start: "
        f"local={local_cfg['host']}:{local_cfg['port']}/{local_cfg['database']} -> "
        f"cloud={cloud_cfg['host']}:{cloud_cfg['port']}/{cloud_cfg['database']}"
    )

    local_conn = connect_mysql(local_cfg)
    cloud_conn = connect_mysql(cloud_cfg)

    try:
        # Schema first: state table, mirrored data tables, then column backfill.
        ensure_cloud_tables(cloud_conn)
        ensure_destination_tables(local_conn, cloud_conn)
        ensure_cloud_column(local_conn, cloud_conn, "internship_jobs_raw", "updated_at_utc")
        state = get_cloud_state(cloud_conn)
        logger.info(f"state before sync: {state}")

        state["last_messages_id"] = sync_messages(
            local_conn, cloud_conn, int(state["last_messages_id"])
        )

        for small_table in ("sync_state", "clean_state"):
            sync_small_table_full(local_conn, cloud_conn, small_table)

        new_sj_id, new_sj_sync_at = sync_structured_jobs(
            local_conn,
            cloud_conn,
            int(state["last_structured_jobs_id"]),
            state["last_structured_sync_at"],
        )
        state["last_structured_jobs_id"] = new_sj_id
        state["last_structured_sync_at"] = new_sj_sync_at

        state["last_internship_id"] = sync_internship_raw(
            local_conn, cloud_conn, int(state["last_internship_id"])
        )

        # Persist cursors only after every table synced successfully.
        set_cloud_state(cloud_conn, state)
        logger.info(f"state after sync: {state}")
        logger.info("sync done")
    except Exception:
        logger.exception("sync failed")
        raise
    finally:
        local_conn.close()
        cloud_conn.close()
|
||||
|
||||
|
||||
# Script entry point: run one full local -> cloud sync pass.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user