2026-03-05 23:55:18 +08:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
Sync local MySQL data to cloud MySQL daily (incremental + upsert).
|
|
|
|
|
|
|
|
|
|
Synced tables:
|
|
|
|
|
- messages (incremental by local id)
|
|
|
|
|
- sync_state (full upsert, small table)
|
|
|
|
|
- structured_jobs (incremental by local id + recent updates by cleaned_at)
|
|
|
|
|
- clean_state (full upsert, small table)
|
|
|
|
|
- internship_jobs_raw (incremental by local id, if table exists)
|
|
|
|
|
|
|
|
|
|
State is stored on cloud DB in table `cloud_sync_state`.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
import logging
|
|
|
|
|
import os
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
|
|
|
|
import pymysql
|
|
|
|
|
|
|
|
|
|
CONFIG_FILE = "config.json"
|
|
|
|
|
PIPELINE_NAME = "local_to_cloud_mysql_v1"
|
|
|
|
|
BATCH_SIZE = 1000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def qid(name: str) -> str:
    """Return *name* as a backtick-quoted MySQL identifier.

    Embedded backticks are doubled per MySQL identifier-quoting rules.
    """
    escaped = name.replace("`", "``")
    return "`" + escaped + "`"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def setup_logger() -> logging.Logger:
    """Create (or return the already-configured) logger for this script.

    Output goes both to stderr and to logs/sync_to_cloud_mysql.log.
    Safe to call repeatedly: handlers are attached only once.
    """
    os.makedirs("logs", exist_ok=True)

    log = logging.getLogger("sync_to_cloud_mysql")
    log.setLevel(logging.INFO)
    # Already configured on a previous call — don't stack duplicate handlers.
    if log.handlers:
        return log

    formatter = logging.Formatter(
        "[%(asctime)s] [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    for handler in (
        logging.StreamHandler(),
        logging.FileHandler("logs/sync_to_cloud_mysql.log", encoding="utf-8"),
    ):
        handler.setFormatter(formatter)
        log.addHandler(handler)
    return log
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level logger shared by every sync helper below.
logger = setup_logger()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_config() -> tuple[dict, dict]:
    """Load CONFIG_FILE and return normalized (local, cloud) MySQL settings.

    Each connection value is resolved as: config value, else environment
    variable ({PREFIX}_{KEY}), else built-in default.

    Raises:
        FileNotFoundError: config file is missing.
        ValueError: malformed sections or an empty/placeholder password.
    """
    if not os.path.exists(CONFIG_FILE):
        raise FileNotFoundError(f"未找到配置文件: {CONFIG_FILE}")

    with open(CONFIG_FILE, "r", encoding="utf-8") as fp:
        raw = json.load(fp)

    local_section = raw.get("mysql", {})
    cloud_section = raw.get("mysql_cloud", {})
    if not (isinstance(local_section, dict) and isinstance(cloud_section, dict)):
        raise ValueError("配置错误: mysql / mysql_cloud 必须是对象")

    def normalize(section: dict, env_prefix: str, defaults: dict) -> dict:
        # Resolution order: config value -> env var -> fallback default.
        def pick(key: str, fallback: str) -> str:
            return section.get(key) or os.getenv(f"{env_prefix}_{key.upper()}", fallback)

        settings = {
            "host": pick("host", defaults["host"]),
            "port": int(pick("port", defaults["port"])),
            "user": pick("user", defaults["user"]),
            "password": pick("password", ""),
            "database": pick("database", defaults["database"]),
            "charset": pick("charset", "utf8mb4"),
        }
        if not settings["password"] or settings["password"] == "CHANGE_ME":
            raise ValueError(f"配置错误: {env_prefix}.password 不能为空")
        return settings

    shared_defaults = {"host": "127.0.0.1", "port": "3306", "user": "jobs_user", "database": "jobs"}
    local = normalize(local_section, "MYSQL_LOCAL", shared_defaults)
    cloud = normalize(cloud_section, "MYSQL_CLOUD", shared_defaults)
    return local, cloud
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def connect_mysql(cfg: dict):
    """Open an autocommit pymysql connection from a normalized config dict.

    The session time zone is pinned to UTC so DATETIME values compare
    consistently between the local and cloud servers. Cursors return rows
    as dicts (DictCursor) — the sync helpers rely on that.
    """
    params = dict(
        host=cfg["host"],
        port=cfg["port"],
        user=cfg["user"],
        password=cfg["password"],
        database=cfg["database"],
        charset=cfg["charset"],
        autocommit=True,
        cursorclass=pymysql.cursors.DictCursor,
    )
    connection = pymysql.connect(**params)
    with connection.cursor() as cursor:
        cursor.execute("SET time_zone = '+00:00'")
    return connection
|
2026-03-05 23:55:18 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_cloud_tables(cloud_conn):
    """Create the per-pipeline sync bookkeeping table on the cloud DB if missing."""
    ddl = """
        CREATE TABLE IF NOT EXISTS cloud_sync_state (
            pipeline_name VARCHAR(128) PRIMARY KEY,
            last_messages_id BIGINT NOT NULL DEFAULT 0,
            last_structured_jobs_id BIGINT NOT NULL DEFAULT 0,
            last_internship_id BIGINT NOT NULL DEFAULT 0,
            last_structured_sync_at DATETIME NULL,
            updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
                ON UPDATE CURRENT_TIMESTAMP
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
    """
    with cloud_conn.cursor() as cur:
        cur.execute(ddl)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_destination_tables(local_conn, cloud_conn):
    """Make sure every synced table exists on the cloud DB.

    Required tables must exist locally (RuntimeError otherwise); optional
    tables are skipped when absent locally. A missing cloud table is created
    from the local `SHOW CREATE TABLE` DDL so both schemas stay identical.
    """
    required = ["messages", "sync_state", "structured_jobs", "clean_state"]
    optional = ["internship_jobs_raw"]

    for table in required + optional:
        if not table_exists(local_conn, table):
            if table in required:
                raise RuntimeError(f"本地缺少必要表: {table}")
            logger.info(f"skip create cloud table {table}: local table not exists")
            continue

        # Already present on the cloud — nothing to do.
        if table_exists(cloud_conn, table):
            continue

        with local_conn.cursor() as cur:
            # Use qid() for identifier quoting, consistent with the other
            # helpers in this file (the raw f"`{table}`" form did not escape
            # embedded backticks).
            cur.execute(f"SHOW CREATE TABLE {qid(table)}")
            row = cur.fetchone()
        ddl = row.get("Create Table") if isinstance(row, dict) else None
        if not ddl:
            raise RuntimeError(f"无法读取本地表结构: {table}")

        with cloud_conn.cursor() as cur:
            cur.execute(ddl)
        logger.info(f"created cloud table from local ddl: {table}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_cloud_column(local_conn, cloud_conn, table: str, column: str):
    """Add *column* to the cloud copy of *table* when the local table has it.

    No-op when either table is missing, when the local table lacks the
    column, or when the cloud table already has it. The ADD COLUMN DDL is
    rebuilt minimally from the local `SHOW COLUMNS` row (type, nullability,
    default, extra).
    """
    if not table_exists(local_conn, table) or not table_exists(cloud_conn, table):
        return

    with local_conn.cursor() as cur:
        cur.execute(f"SHOW COLUMNS FROM {qid(table)} LIKE %s", (column,))
        local_col = cur.fetchone()
    if not local_col:
        return

    with cloud_conn.cursor() as cur:
        cur.execute(f"SHOW COLUMNS FROM {qid(table)} LIKE %s", (column,))
        cloud_col = cur.fetchone()
    if cloud_col:
        return

    # Build minimal compatible ADD COLUMN from local definition.
    col_type = local_col.get("Type")
    nullable = local_col.get("Null", "YES") == "YES"
    default_val = local_col.get("Default")
    extra = local_col.get("Extra") or ""

    parts = [f"ALTER TABLE {qid(table)} ADD COLUMN {qid(column)} {col_type}"]
    parts.append("NULL" if nullable else "NOT NULL")
    if default_val is not None:
        if isinstance(default_val, str) and default_val.upper().startswith("CURRENT_TIMESTAMP"):
            # SHOW COLUMNS reports expression defaults (e.g. a DATETIME
            # column with DEFAULT CURRENT_TIMESTAMP) as the bare keyword;
            # quoting it as a string literal would make the ALTER fail.
            parts.append(f"DEFAULT {default_val}")
        elif isinstance(default_val, str):
            escaped = default_val.replace("'", "''")
            parts.append(f"DEFAULT '{escaped}'")
        else:
            parts.append(f"DEFAULT {default_val}")
    if extra:
        parts.append(extra)

    sql = " ".join(parts)
    with cloud_conn.cursor() as cur:
        cur.execute(sql)
    logger.info(f"added missing cloud column: {table}.{column}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_cloud_state(cloud_conn) -> dict:
    """Read this pipeline's sync cursors from the cloud, defaulting to zero.

    On the very first run (no state row yet) all id cursors start at 0 and
    the structured-jobs timestamp cursor at None.
    """
    query = """
        SELECT last_messages_id, last_structured_jobs_id, last_internship_id, last_structured_sync_at
        FROM cloud_sync_state
        WHERE pipeline_name=%s
    """
    with cloud_conn.cursor() as cur:
        cur.execute(query, (PIPELINE_NAME,))
        row = cur.fetchone()

    # No row means this is a fresh pipeline: sync everything from scratch.
    return row or {
        "last_messages_id": 0,
        "last_structured_jobs_id": 0,
        "last_internship_id": 0,
        "last_structured_sync_at": None,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def set_cloud_state(cloud_conn, state: dict):
    """Persist this pipeline's sync cursors into cloud_sync_state (upsert)."""
    sql = """
        INSERT INTO cloud_sync_state (
            pipeline_name, last_messages_id, last_structured_jobs_id,
            last_internship_id, last_structured_sync_at, updated_at
        ) VALUES (%s, %s, %s, %s, %s, NOW())
        ON DUPLICATE KEY UPDATE
            last_messages_id=VALUES(last_messages_id),
            last_structured_jobs_id=VALUES(last_structured_jobs_id),
            last_internship_id=VALUES(last_internship_id),
            last_structured_sync_at=VALUES(last_structured_sync_at),
            updated_at=NOW()
    """
    params = (
        PIPELINE_NAME,
        state["last_messages_id"],
        state["last_structured_jobs_id"],
        state["last_internship_id"],
        state["last_structured_sync_at"],
    )
    with cloud_conn.cursor() as cur:
        cur.execute(sql, params)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def table_exists(conn, table: str) -> bool:
    """Return True when *table* exists in the connection's current database."""
    with conn.cursor() as cur:
        cur.execute("SHOW TABLES LIKE %s", (table,))
        found = cur.fetchone()
    return found is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sync_messages(local_conn, cloud_conn, last_id: int) -> int:
    """Incrementally copy new `messages` rows to the cloud.

    Pages through local rows with id > last_id in BATCH_SIZE chunks and
    upserts each one into the cloud table. The local auto-increment id is
    intentionally NOT inserted into the cloud (the cloud keeps its own ids).

    Returns the highest local id seen, to persist as the next run's cursor.
    """
    logger.info(f"sync messages from id > {last_id}")
    max_id = last_id
    total = 0
    while True:
        # Forward pagination on the id cursor; an empty page ends the loop.
        with local_conn.cursor() as cur:
            cur.execute(
                """
                SELECT id, source, chat_id, message_id, content, date, created_at
                FROM messages
                WHERE id > %s
                ORDER BY id ASC
                LIMIT %s
                """,
                (max_id, BATCH_SIZE),
            )
            rows = cur.fetchall()

        if not rows:
            break

        with cloud_conn.cursor() as cur:
            for r in rows:
                # NOTE(review): the upsert relies on a cloud unique key —
                # presumably over (source, message_id); verify against the
                # cloud schema, otherwise re-runs would insert duplicates.
                cur.execute(
                    """
                    INSERT INTO messages (source, chat_id, message_id, content, date, created_at)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON DUPLICATE KEY UPDATE
                        chat_id=VALUES(chat_id),
                        content=VALUES(content),
                        date=VALUES(date)
                    """,
                    (
                        r["source"],
                        r["chat_id"],
                        r["message_id"],
                        r["content"],
                        r["date"],
                        r["created_at"],
                    ),
                )
                # Advance the cursor only over rows actually written.
                max_id = max(max_id, int(r["id"]))
                total += 1

        logger.info(f"messages synced batch={len(rows)}, total={total}, max_id={max_id}")

    return max_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sync_small_table_full(local_conn, cloud_conn, table: str):
    """Fully mirror a small table to the cloud via row-by-row upsert.

    Intended only for tiny state tables — every local row is loaded into
    memory. Skips silently when the local table does not exist or is empty.
    """
    if not table_exists(local_conn, table):
        logger.info(f"skip {table}: local table not exists")
        return

    with local_conn.cursor() as cur:
        # qid() for consistent identifier quoting (the bare f"{table}"
        # interpolation was the only unquoted identifier in this file).
        cur.execute(f"SELECT * FROM {qid(table)}")
        rows = cur.fetchall()

    if not rows:
        logger.info(f"{table}: no rows")
        return

    cols = list(rows[0].keys())
    col_sql = ", ".join(qid(c) for c in cols)
    val_sql = ", ".join(["%s"] * len(cols))

    # primary key handling for known small tables
    if table == "sync_state":
        pk = "source"
    elif table == "clean_state":
        pk = "pipeline_name"
    else:
        # Fallback assumption: first column is the primary key.
        pk = cols[0]

    # Refresh every non-key column on conflict.
    update_cols = [c for c in cols if c != pk]
    update_sql = ", ".join([f"{qid(c)}=VALUES({qid(c)})" for c in update_cols])

    with cloud_conn.cursor() as cur:
        for r in rows:
            cur.execute(
                f"""
                INSERT INTO {qid(table)} ({col_sql}) VALUES ({val_sql})
                ON DUPLICATE KEY UPDATE {update_sql}
                """,
                tuple(r[c] for c in cols),
            )

    logger.info(f"{table}: synced rows={len(rows)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sync_structured_jobs(local_conn, cloud_conn, last_id: int, last_sync_at) -> tuple[int, str]:
    """Sync structured_jobs in two passes and return the new cursors.

    Pass 1 upserts brand-new local rows (id > last_id). Pass 2 re-upserts
    rows whose cleaned_at moved past last_sync_at, catching in-place
    updates to rows synced on earlier runs. Rows already written by pass 1
    are skipped in pass 2 within the same run.

    Returns (new max local id, current UTC time string) to persist as the
    next run's cursors.
    """
    logger.info(
        f"sync structured_jobs from id > {last_id}, last_sync_at={last_sync_at}"
    )

    max_id = last_id
    total = 0
    # ids upserted in pass 1, so pass 2 does not redo them this run.
    touched_ids = set()

    # 1) incremental new rows by id
    while True:
        with local_conn.cursor() as cur:
            cur.execute(
                """
                SELECT * FROM structured_jobs
                WHERE id > %s
                ORDER BY id ASC
                LIMIT %s
                """,
                (max_id, BATCH_SIZE),
            )
            rows = cur.fetchall()

        if not rows:
            break

        _upsert_structured_rows(cloud_conn, rows)
        for r in rows:
            rid = int(r["id"])
            max_id = max(max_id, rid)
            touched_ids.add(rid)
        total += len(rows)
        logger.info(f"structured_jobs incremental batch={len(rows)}, total={total}, max_id={max_id}")

    # 2) update window by cleaned_at (catch updated existing rows)
    if last_sync_at is None:
        # First run: open the window from the epoch.
        last_sync_at = "1970-01-01 00:00:00"
    if isinstance(last_sync_at, datetime):
        last_sync_at = last_sync_at.strftime("%Y-%m-%d %H:%M:%S")

    while True:
        with local_conn.cursor() as cur:
            cur.execute(
                """
                SELECT * FROM structured_jobs
                WHERE cleaned_at > %s
                ORDER BY cleaned_at ASC, id ASC
                LIMIT %s
                """,
                (last_sync_at, BATCH_SIZE),
            )
            rows = cur.fetchall()

        if not rows:
            break

        # avoid repeated heavy updates in loop: only keep rows not already touched in incremental loop
        delta = [r for r in rows if int(r["id"]) not in touched_ids]
        if delta:
            _upsert_structured_rows(cloud_conn, delta)
            total += len(delta)
            logger.info(f"structured_jobs update-window batch={len(delta)}, total={total}")

        # move cursor forward to avoid pagination loops on timestamp boundary
        # NOTE(review): advancing with strict `>` on a second-resolution
        # timestamp can skip rows sharing the last row's cleaned_at across a
        # batch boundary; and if a full batch has NULL cleaned_at the cursor
        # never advances — presumably cleaned_at is NOT NULL; verify schema.
        last_row_cleaned_at = rows[-1]["cleaned_at"]
        if isinstance(last_row_cleaned_at, datetime):
            last_sync_at = last_row_cleaned_at.strftime("%Y-%m-%d %H:%M:%S")
        elif last_row_cleaned_at:
            last_sync_at = str(last_row_cleaned_at)

        # A short page means the update window is drained.
        if len(rows) < BATCH_SIZE:
            break

    now_utc = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    return max_id, now_utc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _upsert_structured_rows(cloud_conn, rows: list[dict]):
    """Upsert a batch of local structured_jobs rows into the cloud table.

    The local auto-increment `id` is dropped (the cloud keeps its own ids);
    on a duplicate-key hit every non-key column is refreshed and cleaned_at
    is stamped with the cloud server's clock.
    """
    if not rows:
        return

    # id is local technical key, do not sync into cloud table (cloud has own auto id)
    columns = [c for c in rows[0].keys() if c != "id"]

    column_list = ", ".join(qid(c) for c in columns)
    placeholders = ", ".join(["%s"] * len(columns))
    assignments = [
        f"{qid(c)}=VALUES({qid(c)})" for c in columns if c not in ("source", "message_id")
    ]
    assignments.append(f"{qid('cleaned_at')}=CURRENT_TIMESTAMP")
    update_clause = ", ".join(assignments)

    statement = f"""
        INSERT INTO {qid('structured_jobs')} ({column_list})
        VALUES ({placeholders})
        ON DUPLICATE KEY UPDATE {update_clause}
    """

    with cloud_conn.cursor() as cur:
        for row in rows:
            cur.execute(statement, tuple(row[c] for c in columns))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sync_internship_raw(local_conn, cloud_conn, last_id: int) -> int:
    """Incrementally copy internship_jobs_raw rows to the cloud, if present.

    The table is optional: when it does not exist locally the cursor is
    returned unchanged. Otherwise pages through rows with id > last_id in
    BATCH_SIZE chunks and upserts them (local id dropped; fingerprint kept
    as the conflict key; imported_at restamped on update).

    Returns the highest local id seen, to persist as the next run's cursor.
    """
    if not table_exists(local_conn, "internship_jobs_raw"):
        logger.info("skip internship_jobs_raw: local table not exists")
        return last_id

    max_id = last_id
    total = 0
    while True:
        with local_conn.cursor() as cur:
            cur.execute(
                """
                SELECT * FROM internship_jobs_raw
                WHERE id > %s
                ORDER BY id ASC
                LIMIT %s
                """,
                (max_id, BATCH_SIZE),
            )
            rows = cur.fetchall()

        if not rows:
            break

        # Column list is derived from the first row, so the statement adapts
        # to whatever schema the local table currently has.
        cols = [c for c in rows[0].keys() if c != "id"]
        col_sql = ", ".join(qid(c) for c in cols)
        val_sql = ", ".join(["%s"] * len(cols))
        # fingerprint is the dedup key — never overwritten on conflict.
        update_cols = [c for c in cols if c != "fingerprint"]
        update_sql = ", ".join(
            [f"{qid(c)}=VALUES({qid(c)})" for c in update_cols]
            + [f"{qid('imported_at')}=CURRENT_TIMESTAMP"]
        )

        with cloud_conn.cursor() as cur:
            for r in rows:
                cur.execute(
                    f"""
                    INSERT INTO {qid('internship_jobs_raw')} ({col_sql})
                    VALUES ({val_sql})
                    ON DUPLICATE KEY UPDATE {update_sql}
                    """,
                    tuple(r[c] for c in cols),
                )
                max_id = max(max_id, int(r["id"]))
                total += 1

        logger.info(f"internship_jobs_raw batch={len(rows)}, total={total}, max_id={max_id}")

    return max_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run one full local -> cloud sync pass.

    Loads config, opens both connections, ensures cloud schema, then syncs
    each table in order and persists the updated cursors. On any failure the
    exception is logged and re-raised; cursors are only saved after all
    tables synced, so a failed run simply re-syncs from the previous cursors
    (the upserts make that idempotent).
    """
    local_cfg, cloud_cfg = load_config()
    logger.info(
        "sync start: "
        f"local={local_cfg['host']}:{local_cfg['port']}/{local_cfg['database']} -> "
        f"cloud={cloud_cfg['host']}:{cloud_cfg['port']}/{cloud_cfg['database']}"
    )

    local_conn = connect_mysql(local_cfg)
    cloud_conn = connect_mysql(cloud_cfg)

    try:
        # Schema first: state table, mirrored tables, then any column that
        # was added locally after the cloud table was created.
        ensure_cloud_tables(cloud_conn)
        ensure_destination_tables(local_conn, cloud_conn)
        ensure_cloud_column(local_conn, cloud_conn, "internship_jobs_raw", "updated_at_utc")
        state = get_cloud_state(cloud_conn)
        logger.info(f"state before sync: {state}")

        state["last_messages_id"] = sync_messages(
            local_conn, cloud_conn, int(state["last_messages_id"])
        )

        sync_small_table_full(local_conn, cloud_conn, "sync_state")
        sync_small_table_full(local_conn, cloud_conn, "clean_state")

        last_structured_id, last_structured_sync_at = sync_structured_jobs(
            local_conn,
            cloud_conn,
            int(state["last_structured_jobs_id"]),
            state["last_structured_sync_at"],
        )
        state["last_structured_jobs_id"] = last_structured_id
        state["last_structured_sync_at"] = last_structured_sync_at

        state["last_internship_id"] = sync_internship_raw(
            local_conn, cloud_conn, int(state["last_internship_id"])
        )

        # Persist cursors only after every table synced successfully.
        set_cloud_state(cloud_conn, state)
        logger.info(f"state after sync: {state}")
        logger.info("sync done")
    except Exception:
        logger.exception("sync failed")
        raise
    finally:
        local_conn.close()
        cloud_conn.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run one full local -> cloud sync pass.
if __name__ == "__main__":
    main()
|