2026-03-05 23:55:18 +08:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
|
|
|
|
|
"""
|
|
|
|
|
Sync local MySQL data to cloud MySQL daily (incremental + upsert).
|
|
|
|
|
|
|
|
|
|
Synced tables:
|
|
|
|
|
- messages (incremental by local id)
|
|
|
|
|
- sync_state (full upsert, small table)
|
|
|
|
|
- structured_jobs (incremental by local id + recent updates by cleaned_at)
|
|
|
|
|
- clean_state (full upsert, small table)
|
|
|
|
|
- internship_jobs_raw (incremental by local id, if table exists)
|
|
|
|
|
|
|
|
|
|
State is stored on cloud DB in table `cloud_sync_state`.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
import logging
|
|
|
|
|
import os
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
|
|
|
|
|
import pymysql
|
|
|
|
|
|
|
|
|
|
CONFIG_FILE = "config.json"
|
|
|
|
|
PIPELINE_NAME = "local_to_cloud_mysql_v1"
|
|
|
|
|
BATCH_SIZE = 1000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def qid(name: str) -> str:
    """Return *name* as a backtick-quoted MySQL identifier.

    Embedded backticks are doubled per MySQL identifier-quoting rules.
    """
    escaped = name.replace("`", "``")
    return "`" + escaped + "`"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def setup_logger() -> logging.Logger:
    """Create (or return the already-configured) logger for this script.

    Output goes both to stderr and to logs/sync_to_cloud_mysql.log.
    Safe to call repeatedly: handlers are attached only once.
    """
    os.makedirs("logs", exist_ok=True)

    log = logging.getLogger("sync_to_cloud_mysql")
    log.setLevel(logging.INFO)
    # Already configured on a previous call — don't stack duplicate handlers.
    if log.handlers:
        return log

    formatter = logging.Formatter(
        "[%(asctime)s] [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    for handler in (
        logging.StreamHandler(),
        logging.FileHandler("logs/sync_to_cloud_mysql.log", encoding="utf-8"),
    ):
        handler.setFormatter(formatter)
        log.addHandler(handler)
    return log
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level logger shared by every sync helper below.
logger = setup_logger()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_config() -> tuple[dict, dict]:
    """Load CONFIG_FILE and return normalized (local, cloud) MySQL settings.

    Each connection value is resolved as: config value, else environment
    variable ({PREFIX}_{KEY}), else built-in default.

    Raises:
        FileNotFoundError: config file is missing.
        ValueError: malformed sections or an empty/placeholder password.
    """
    if not os.path.exists(CONFIG_FILE):
        raise FileNotFoundError(f"未找到配置文件: {CONFIG_FILE}")

    with open(CONFIG_FILE, "r", encoding="utf-8") as fp:
        raw = json.load(fp)

    local_section = raw.get("mysql", {})
    cloud_section = raw.get("mysql_cloud", {})
    if not (isinstance(local_section, dict) and isinstance(cloud_section, dict)):
        raise ValueError("配置错误: mysql / mysql_cloud 必须是对象")

    def normalize(section: dict, env_prefix: str, defaults: dict) -> dict:
        # Resolution order: config value -> env var -> fallback default.
        def pick(key: str, fallback: str) -> str:
            return section.get(key) or os.getenv(f"{env_prefix}_{key.upper()}", fallback)

        settings = {
            "host": pick("host", defaults["host"]),
            "port": int(pick("port", defaults["port"])),
            "user": pick("user", defaults["user"]),
            "password": pick("password", ""),
            "database": pick("database", defaults["database"]),
            "charset": pick("charset", "utf8mb4"),
        }
        if not settings["password"] or settings["password"] == "CHANGE_ME":
            raise ValueError(f"配置错误: {env_prefix}.password 不能为空")
        return settings

    shared_defaults = {"host": "127.0.0.1", "port": "3306", "user": "jobs_user", "database": "jobs"}
    local = normalize(local_section, "MYSQL_LOCAL", shared_defaults)
    cloud = normalize(cloud_section, "MYSQL_CLOUD", shared_defaults)
    return local, cloud
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def connect_mysql(cfg: dict):
    """Open an autocommit pymysql connection from a normalized config dict.

    The session time zone is pinned to UTC so DATETIME values compare
    consistently between the local and cloud servers. Cursors return rows
    as dicts (DictCursor) — the sync helpers rely on that.
    """
    params = dict(
        host=cfg["host"],
        port=cfg["port"],
        user=cfg["user"],
        password=cfg["password"],
        database=cfg["database"],
        charset=cfg["charset"],
        autocommit=True,
        cursorclass=pymysql.cursors.DictCursor,
    )
    connection = pymysql.connect(**params)
    with connection.cursor() as cursor:
        cursor.execute("SET time_zone = '+00:00'")
    return connection
|
2026-03-05 23:55:18 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_cloud_tables(cloud_conn):
    """Create the per-pipeline sync bookkeeping table on the cloud DB if missing."""
    ddl = """
        CREATE TABLE IF NOT EXISTS cloud_sync_state (
            pipeline_name VARCHAR(128) PRIMARY KEY,
            last_messages_id BIGINT NOT NULL DEFAULT 0,
            last_structured_jobs_id BIGINT NOT NULL DEFAULT 0,
            last_internship_id BIGINT NOT NULL DEFAULT 0,
            last_structured_sync_at DATETIME NULL,
            updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
                ON UPDATE CURRENT_TIMESTAMP
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
    """
    with cloud_conn.cursor() as cur:
        cur.execute(ddl)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_destination_tables(local_conn, cloud_conn):
    """Make sure every synced table exists on the cloud DB.

    Required tables must exist locally (RuntimeError otherwise); optional
    tables are skipped when absent locally. A missing cloud table is created
    from the local `SHOW CREATE TABLE` DDL so both schemas stay identical.
    """
    required = ["messages", "sync_state", "structured_jobs", "clean_state"]
    optional = ["internship_jobs_raw"]

    for table in required + optional:
        if not table_exists(local_conn, table):
            if table in required:
                raise RuntimeError(f"本地缺少必要表: {table}")
            logger.info(f"skip create cloud table {table}: local table not exists")
            continue

        # Already present on the cloud — nothing to do.
        if table_exists(cloud_conn, table):
            continue

        with local_conn.cursor() as cur:
            # Use qid() for identifier quoting, consistent with the other
            # helpers in this file (the raw f"`{table}`" form did not escape
            # embedded backticks).
            cur.execute(f"SHOW CREATE TABLE {qid(table)}")
            row = cur.fetchone()
        ddl = row.get("Create Table") if isinstance(row, dict) else None
        if not ddl:
            raise RuntimeError(f"无法读取本地表结构: {table}")

        with cloud_conn.cursor() as cur:
            cur.execute(ddl)
        logger.info(f"created cloud table from local ddl: {table}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_cloud_column(local_conn, cloud_conn, table: str, column: str):
    """Add *column* to the cloud copy of *table* when the local table has it.

    No-op when either table is missing, when the local table lacks the
    column, or when the cloud table already has it. The ADD COLUMN DDL is
    rebuilt minimally from the local `SHOW COLUMNS` row (type, nullability,
    default, extra).
    """
    if not table_exists(local_conn, table) or not table_exists(cloud_conn, table):
        return

    with local_conn.cursor() as cur:
        cur.execute(f"SHOW COLUMNS FROM {qid(table)} LIKE %s", (column,))
        local_col = cur.fetchone()
    if not local_col:
        return

    with cloud_conn.cursor() as cur:
        cur.execute(f"SHOW COLUMNS FROM {qid(table)} LIKE %s", (column,))
        cloud_col = cur.fetchone()
    if cloud_col:
        return

    # Build minimal compatible ADD COLUMN from local definition.
    col_type = local_col.get("Type")
    nullable = local_col.get("Null", "YES") == "YES"
    default_val = local_col.get("Default")
    extra = local_col.get("Extra") or ""

    parts = [f"ALTER TABLE {qid(table)} ADD COLUMN {qid(column)} {col_type}"]
    parts.append("NULL" if nullable else "NOT NULL")
    if default_val is not None:
        if isinstance(default_val, str) and default_val.upper().startswith("CURRENT_TIMESTAMP"):
            # SHOW COLUMNS reports expression defaults (e.g. a DATETIME
            # column with DEFAULT CURRENT_TIMESTAMP) as the bare keyword;
            # quoting it as a string literal would make the ALTER fail.
            parts.append(f"DEFAULT {default_val}")
        elif isinstance(default_val, str):
            escaped = default_val.replace("'", "''")
            parts.append(f"DEFAULT '{escaped}'")
        else:
            parts.append(f"DEFAULT {default_val}")
    if extra:
        parts.append(extra)

    sql = " ".join(parts)
    with cloud_conn.cursor() as cur:
        cur.execute(sql)
    logger.info(f"added missing cloud column: {table}.{column}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_cloud_state(cloud_conn) -> dict:
    """Read this pipeline's sync cursors from the cloud, defaulting to zero.

    On the very first run (no state row yet) all id cursors start at 0 and
    the structured-jobs timestamp cursor at None.
    """
    query = """
        SELECT last_messages_id, last_structured_jobs_id, last_internship_id, last_structured_sync_at
        FROM cloud_sync_state
        WHERE pipeline_name=%s
    """
    with cloud_conn.cursor() as cur:
        cur.execute(query, (PIPELINE_NAME,))
        row = cur.fetchone()

    # No row means this is a fresh pipeline: sync everything from scratch.
    return row or {
        "last_messages_id": 0,
        "last_structured_jobs_id": 0,
        "last_internship_id": 0,
        "last_structured_sync_at": None,
    }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def set_cloud_state(cloud_conn, state: dict):
    """Persist this pipeline's sync cursors into cloud_sync_state (upsert)."""
    sql = """
        INSERT INTO cloud_sync_state (
            pipeline_name, last_messages_id, last_structured_jobs_id,
            last_internship_id, last_structured_sync_at, updated_at
        ) VALUES (%s, %s, %s, %s, %s, NOW())
        ON DUPLICATE KEY UPDATE
            last_messages_id=VALUES(last_messages_id),
            last_structured_jobs_id=VALUES(last_structured_jobs_id),
            last_internship_id=VALUES(last_internship_id),
            last_structured_sync_at=VALUES(last_structured_sync_at),
            updated_at=NOW()
    """
    params = (
        PIPELINE_NAME,
        state["last_messages_id"],
        state["last_structured_jobs_id"],
        state["last_internship_id"],
        state["last_structured_sync_at"],
    )
    with cloud_conn.cursor() as cur:
        cur.execute(sql, params)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def table_exists(conn, table: str) -> bool:
    """Return True when *table* exists in the connection's current database."""
    with conn.cursor() as cur:
        cur.execute("SHOW TABLES LIKE %s", (table,))
        found = cur.fetchone()
    return found is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sync_messages(local_conn, cloud_conn, last_id: int) -> int:
    """Incrementally copy new `messages` rows to the cloud.

    Pages through local rows with id > last_id in BATCH_SIZE chunks and
    upserts each one into the cloud table. The local auto-increment id is
    intentionally NOT inserted into the cloud (the cloud keeps its own ids).

    Returns the highest local id seen, to persist as the next run's cursor.
    """
    logger.info(f"sync messages from id > {last_id}")
    max_id = last_id
    total = 0
    while True:
        # Forward pagination on the id cursor; an empty page ends the loop.
        with local_conn.cursor() as cur:
            cur.execute(
                """
                SELECT id, source, chat_id, message_id, content, date, created_at
                FROM messages
                WHERE id > %s
                ORDER BY id ASC
                LIMIT %s
                """,
                (max_id, BATCH_SIZE),
            )
            rows = cur.fetchall()

        if not rows:
            break

        with cloud_conn.cursor() as cur:
            for r in rows:
                # NOTE(review): the upsert relies on a cloud unique key —
                # presumably over (source, message_id); verify against the
                # cloud schema, otherwise re-runs would insert duplicates.
                cur.execute(
                    """
                    INSERT INTO messages (source, chat_id, message_id, content, date, created_at)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON DUPLICATE KEY UPDATE
                        chat_id=VALUES(chat_id),
                        content=VALUES(content),
                        date=VALUES(date)
                    """,
                    (
                        r["source"],
                        r["chat_id"],
                        r["message_id"],
                        r["content"],
                        r["date"],
                        r["created_at"],
                    ),
                )
                # Advance the cursor only over rows actually written.
                max_id = max(max_id, int(r["id"]))
                total += 1

        logger.info(f"messages synced batch={len(rows)}, total={total}, max_id={max_id}")

    return max_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sync_small_table_full(local_conn, cloud_conn, table: str):
    """Fully mirror a small table to the cloud via row-by-row upsert.

    Intended only for tiny state tables — every local row is loaded into
    memory. Skips silently when the local table does not exist or is empty.
    """
    if not table_exists(local_conn, table):
        logger.info(f"skip {table}: local table not exists")
        return

    with local_conn.cursor() as cur:
        # qid() for consistent identifier quoting (the bare f"{table}"
        # interpolation was the only unquoted identifier in this file).
        cur.execute(f"SELECT * FROM {qid(table)}")
        rows = cur.fetchall()

    if not rows:
        logger.info(f"{table}: no rows")
        return

    cols = list(rows[0].keys())
    col_sql = ", ".join(qid(c) for c in cols)
    val_sql = ", ".join(["%s"] * len(cols))

    # primary key handling for known small tables
    if table == "sync_state":
        pk = "source"
    elif table == "clean_state":
        pk = "pipeline_name"
    else:
        # Fallback assumption: first column is the primary key.
        pk = cols[0]

    # Refresh every non-key column on conflict.
    update_cols = [c for c in cols if c != pk]
    update_sql = ", ".join([f"{qid(c)}=VALUES({qid(c)})" for c in update_cols])

    with cloud_conn.cursor() as cur:
        for r in rows:
            cur.execute(
                f"""
                INSERT INTO {qid(table)} ({col_sql}) VALUES ({val_sql})
                ON DUPLICATE KEY UPDATE {update_sql}
                """,
                tuple(r[c] for c in cols),
            )

    logger.info(f"{table}: synced rows={len(rows)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sync_structured_jobs(local_conn, cloud_conn, last_id: int, last_sync_at) -> tuple[int, str]:
    """Sync structured_jobs in two passes and return the new cursors.

    Pass 1 upserts brand-new local rows (id > last_id). Pass 2 re-upserts
    rows whose cleaned_at moved past last_sync_at, catching in-place
    updates to rows synced on earlier runs. Rows already written by pass 1
    are skipped in pass 2 within the same run.

    Returns (new max local id, current UTC time string) to persist as the
    next run's cursors.
    """
    logger.info(
        f"sync structured_jobs from id > {last_id}, last_sync_at={last_sync_at}"
    )

    max_id = last_id
    total = 0
    # ids upserted in pass 1, so pass 2 does not redo them this run.
    touched_ids = set()

    # 1) incremental new rows by id
    while True:
        with local_conn.cursor() as cur:
            cur.execute(
                """
                SELECT * FROM structured_jobs
                WHERE id > %s
                ORDER BY id ASC
                LIMIT %s
                """,
                (max_id, BATCH_SIZE),
            )
            rows = cur.fetchall()

        if not rows:
            break

        _upsert_structured_rows(cloud_conn, rows)
        for r in rows:
            rid = int(r["id"])
            max_id = max(max_id, rid)
            touched_ids.add(rid)
        total += len(rows)
        logger.info(f"structured_jobs incremental batch={len(rows)}, total={total}, max_id={max_id}")

    # 2) update window by cleaned_at (catch updated existing rows)
    if last_sync_at is None:
        # First run: open the window from the epoch.
        last_sync_at = "1970-01-01 00:00:00"
    if isinstance(last_sync_at, datetime):
        last_sync_at = last_sync_at.strftime("%Y-%m-%d %H:%M:%S")

    while True:
        with local_conn.cursor() as cur:
            cur.execute(
                """
                SELECT * FROM structured_jobs
                WHERE cleaned_at > %s
                ORDER BY cleaned_at ASC, id ASC
                LIMIT %s
                """,
                (last_sync_at, BATCH_SIZE),
            )
            rows = cur.fetchall()

        if not rows:
            break

        # avoid repeated heavy updates in loop: only keep rows not already touched in incremental loop
        delta = [r for r in rows if int(r["id"]) not in touched_ids]
        if delta:
            _upsert_structured_rows(cloud_conn, delta)
            total += len(delta)
            logger.info(f"structured_jobs update-window batch={len(delta)}, total={total}")

        # move cursor forward to avoid pagination loops on timestamp boundary
        # NOTE(review): advancing with strict `>` on a second-resolution
        # timestamp can skip rows sharing the last row's cleaned_at across a
        # batch boundary; and if a full batch has NULL cleaned_at the cursor
        # never advances — presumably cleaned_at is NOT NULL; verify schema.
        last_row_cleaned_at = rows[-1]["cleaned_at"]
        if isinstance(last_row_cleaned_at, datetime):
            last_sync_at = last_row_cleaned_at.strftime("%Y-%m-%d %H:%M:%S")
        elif last_row_cleaned_at:
            last_sync_at = str(last_row_cleaned_at)

        # A short page means the update window is drained.
        if len(rows) < BATCH_SIZE:
            break

    now_utc = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
    return max_id, now_utc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _upsert_structured_rows(cloud_conn, rows: list[dict]):
    """Upsert a batch of local structured_jobs rows into the cloud table.

    The local auto-increment `id` is dropped (the cloud keeps its own ids);
    on a duplicate-key hit every non-key column is refreshed and cleaned_at
    is stamped with the cloud server's clock.
    """
    if not rows:
        return

    # id is local technical key, do not sync into cloud table (cloud has own auto id)
    columns = [c for c in rows[0].keys() if c != "id"]

    column_list = ", ".join(qid(c) for c in columns)
    placeholders = ", ".join(["%s"] * len(columns))
    assignments = [
        f"{qid(c)}=VALUES({qid(c)})" for c in columns if c not in ("source", "message_id")
    ]
    assignments.append(f"{qid('cleaned_at')}=CURRENT_TIMESTAMP")
    update_clause = ", ".join(assignments)

    statement = f"""
        INSERT INTO {qid('structured_jobs')} ({column_list})
        VALUES ({placeholders})
        ON DUPLICATE KEY UPDATE {update_clause}
    """

    with cloud_conn.cursor() as cur:
        for row in rows:
            cur.execute(statement, tuple(row[c] for c in columns))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def sync_internship_raw(local_conn, cloud_conn, last_id: int) -> int:
    """Incrementally copy internship_jobs_raw rows to the cloud, if present.

    The table is optional: when it does not exist locally the cursor is
    returned unchanged. Otherwise pages through rows with id > last_id in
    BATCH_SIZE chunks and upserts them (local id dropped; fingerprint kept
    as the conflict key; imported_at restamped on update).

    Returns the highest local id seen, to persist as the next run's cursor.
    """
    if not table_exists(local_conn, "internship_jobs_raw"):
        logger.info("skip internship_jobs_raw: local table not exists")
        return last_id

    max_id = last_id
    total = 0
    while True:
        with local_conn.cursor() as cur:
            cur.execute(
                """
                SELECT * FROM internship_jobs_raw
                WHERE id > %s
                ORDER BY id ASC
                LIMIT %s
                """,
                (max_id, BATCH_SIZE),
            )
            rows = cur.fetchall()

        if not rows:
            break

        # Column list is derived from the first row, so the statement adapts
        # to whatever schema the local table currently has.
        cols = [c for c in rows[0].keys() if c != "id"]
        col_sql = ", ".join(qid(c) for c in cols)
        val_sql = ", ".join(["%s"] * len(cols))
        # fingerprint is the dedup key — never overwritten on conflict.
        update_cols = [c for c in cols if c != "fingerprint"]
        update_sql = ", ".join(
            [f"{qid(c)}=VALUES({qid(c)})" for c in update_cols]
            + [f"{qid('imported_at')}=CURRENT_TIMESTAMP"]
        )

        with cloud_conn.cursor() as cur:
            for r in rows:
                cur.execute(
                    f"""
                    INSERT INTO {qid('internship_jobs_raw')} ({col_sql})
                    VALUES ({val_sql})
                    ON DUPLICATE KEY UPDATE {update_sql}
                    """,
                    tuple(r[c] for c in cols),
                )
                max_id = max(max_id, int(r["id"]))
                total += 1

        logger.info(f"internship_jobs_raw batch={len(rows)}, total={total}, max_id={max_id}")

    return max_id
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run one full local -> cloud sync pass.

    Loads config, opens both connections, ensures cloud schema, then syncs
    each table in order and persists the updated cursors. On any failure the
    exception is logged and re-raised; cursors are only saved after all
    tables synced, so a failed run simply re-syncs from the previous cursors
    (the upserts make that idempotent).
    """
    local_cfg, cloud_cfg = load_config()
    logger.info(
        "sync start: "
        f"local={local_cfg['host']}:{local_cfg['port']}/{local_cfg['database']} -> "
        f"cloud={cloud_cfg['host']}:{cloud_cfg['port']}/{cloud_cfg['database']}"
    )

    local_conn = connect_mysql(local_cfg)
    cloud_conn = connect_mysql(cloud_cfg)

    try:
        # Schema first: state table, mirrored tables, then any column that
        # was added locally after the cloud table was created.
        ensure_cloud_tables(cloud_conn)
        ensure_destination_tables(local_conn, cloud_conn)
        ensure_cloud_column(local_conn, cloud_conn, "internship_jobs_raw", "updated_at_utc")
        state = get_cloud_state(cloud_conn)
        logger.info(f"state before sync: {state}")

        state["last_messages_id"] = sync_messages(
            local_conn, cloud_conn, int(state["last_messages_id"])
        )

        sync_small_table_full(local_conn, cloud_conn, "sync_state")
        sync_small_table_full(local_conn, cloud_conn, "clean_state")

        last_structured_id, last_structured_sync_at = sync_structured_jobs(
            local_conn,
            cloud_conn,
            int(state["last_structured_jobs_id"]),
            state["last_structured_sync_at"],
        )
        state["last_structured_jobs_id"] = last_structured_id
        state["last_structured_sync_at"] = last_structured_sync_at

        state["last_internship_id"] = sync_internship_raw(
            local_conn, cloud_conn, int(state["last_internship_id"])
        )

        # Persist cursors only after every table synced successfully.
        set_cloud_state(cloud_conn, state)
        logger.info(f"state after sync: {state}")
        logger.info("sync done")
    except Exception:
        logger.exception("sync failed")
        raise
    finally:
        local_conn.close()
        cloud_conn.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: run one full local -> cloud sync pass.
if __name__ == "__main__":
    main()
|