From 4f038242997977baaf63b5fd5b55cdd7ef0c863b Mon Sep 17 00:00:00 2001 From: BoliviaYu Date: Sat, 7 Mar 2026 15:02:11 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E8=B4=A7=E5=B8=81=E6=95=B0?= =?UTF-8?q?=E5=AD=97=E9=94=99=E8=AF=AF=E5=92=8C=E6=B2=A1=E6=9C=89=E8=B4=A7?= =?UTF-8?q?=E5=B8=81=E7=A7=8D=E7=B1=BB=E9=97=AE=E9=A2=98=EF=BC=8C=E4=BF=AE?= =?UTF-8?q?=E5=A4=8DUTC=E5=92=8C=E6=9C=AC=E5=9C=B0=E6=97=B6=E9=97=B4?= =?UTF-8?q?=E6=B7=B7=E7=94=A8=EF=BC=8C=E6=94=B9=E7=94=A8UTC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 10 +++++ clean_to_structured.py | 88 ++++++++++++++++++++++++++++++++++-------- import_excel_jobs.py | 5 ++- main.py | 3 ++ sync_to_cloud_mysql.py | 5 ++- 5 files changed, 92 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 908d7f9..df74aff 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,12 @@ cp config.example.json config.json - `mysql`: 本地 MySQL 连接 - `mysql_cloud`: 云端 MySQL 连接(用于同步) +时间规范: + +- 项目统一使用 UTC 存储所有 `DATETIME` +- 各脚本连接 MySQL 后会执行 `SET time_zone = '+00:00'` +- `NOW()` / `CURRENT_TIMESTAMP` 产生的时间也按 UTC 写入 + ## 4. 运行方式 ### 4.1 手动执行 @@ -287,6 +293,10 @@ uv run import_excel_jobs.py --file /path/to/jobs.xlsx --sheet Sheet1 --source @e - 检查 `messages` 是否有新数据。 - 检查 `clean_state.last_message_row_id` 是否已到最新。 +6. 历史数据有 UTC+8 和 UTC 混用怎么办 +- 新版脚本已统一写入 UTC。 +- 历史数据需一次性迁移后再对齐分析口径(建议先备份再修复)。 + ## 10. 协作规范建议 - 新增来源规则时,优先增加 source 专用 parser,避免影响已有来源。 diff --git a/clean_to_structured.py b/clean_to_structured.py index 0be6ddf..24fd034 100644 --- a/clean_to_structured.py +++ b/clean_to_structured.py @@ -119,7 +119,7 @@ def load_mysql_config() -> dict: def connect_mysql(cfg: dict): - return pymysql.connect( + conn = pymysql.connect( host=cfg["host"], port=cfg["port"], user=cfg["user"], @@ -128,6 +128,9 @@ def connect_mysql(cfg: dict): charset=cfg["charset"], autocommit=True, ) + with conn.cursor() as cur: + cur.execute("SET time_zone = '+00:00'") + return conn def init_target_db(conn): @@ -221,16 +224,62 @@ def clean_company_name(s: str | None) -> str | None: return s or None -def parse_salary(raw: str | None) -> tuple[str | None, int | None, int | None, str | None]: +def infer_salary_currency(text: str) -> str | None: + low = text.lower() + if any(k in low for k in ["usd", "us$", "dollar"]) or "$" in text: + return "USD" + if any(k in text for k in ["¥", "¥", "人民币"]) or "cny" in low: + return "CNY" + if "元" in text or "万" in text: + return "CNY" + if "k" in low and any(k in low for k in ["month", "year", "day"]): + return "USD" + if "eur" in low or "€" in text: + return "EUR" + if "hkd" in low or "hk$" in low: + return "HKD" + if "sgd" in low or "s$" in low: + return "SGD" + + # No explicit marker: infer by language. + if re.search(r"[\u4e00-\u9fff]", text): + return "CNY" + return "USD" + + +def parse_salary( + raw: str | None, +) -> tuple[str | None, str | None, int | None, int | None, str | None]: if not raw: - return None, None, None, None + return None, None, None, None, None text = clean_md_text(raw) lower = text.lower() + currency = infer_salary_currency(text) - nums = re.findall(r"\d+(?:\.\d+)?", text.replace(",", "")) - salary_min = int(float(nums[0])) if len(nums) >= 1 else None - salary_max = int(float(nums[1])) if len(nums) >= 2 else None + num_tokens = re.findall(r"(\d+(?:\.\d+)?)\s*([kKwW万]?)", text.replace(",", "")) + salary_min = None + salary_max = None + if num_tokens: + vals: list[tuple[float, str]] = [(float(n), u) for n, u in num_tokens] + if len(vals) >= 2: + u1 = vals[0][1] + u2 = vals[1][1] + if not u1 and u2: + vals[0] = (vals[0][0], u2) + if not u2 and u1: + vals[1] = (vals[1][0], u1) + + def scaled(v: float, unit: str) -> int: + m = 1 + if unit in ("k", "K"): + m = 1000 + elif unit in ("w", "W", "万"): + m = 10000 + return int(v * m) + + salary_min = scaled(vals[0][0], vals[0][1]) if len(vals) >= 1 else None + salary_max = scaled(vals[1][0], vals[1][1]) if len(vals) >= 2 else None period = None if "month" in lower or "每月" in text or "月" in text: @@ -240,7 +289,7 @@ def parse_salary(raw: str | None) -> tuple[str | None, int | None, int | None, s elif "day" in lower or "日" in text: period = "day" - return text, salary_min, salary_max, period + return text, currency, salary_min, salary_max, period def strip_meta_lines(content: str) -> str: @@ -570,8 +619,9 @@ def parse_dejob_official( salary_line = ln break - salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line) - salary_currency = "USD" if salary_raw and "$" in salary_raw else None + salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary( + salary_line + ) responsibilities = extract_list_section(body_text, "岗位职责") requirements = extract_list_section(body_text, "岗位要求") @@ -643,8 +693,9 @@ def parse_generic( salary_line = ln break - salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line) - salary_currency = "USD" if salary_raw and "$" in salary_raw else None + salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary( + salary_line + ) job_type = "招聘" if ("招聘" in body_text or "recruit" in body_text.lower()) else None ( @@ -770,8 +821,9 @@ def parse_dejob_global( if "薪酬" in ln or "salary" in ln.lower(): salary_line = ln break - salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line) - salary_currency = "USD" if salary_raw and "$" in salary_raw else None + salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary( + salary_line + ) responsibilities = extract_list_section(body_text, "岗位职责") or extract_list_section( body_text, "Responsibilities" @@ -849,8 +901,9 @@ def parse_remote_cn( if ln.startswith("摘要:"): summary_line = ln break - salary_raw, salary_min, salary_max, salary_period = parse_salary(summary_line) - salary_currency = "USD" if salary_raw and "$" in salary_raw else None + salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary( + summary_line + ) urls = extract_urls(body_text) apply_email = extract_apply_email(body_text) @@ -935,8 +988,9 @@ def parse_cryptojobslist_source( if any(k in ln.lower() for k in ("salary", "$", "usd")): salary_line = ln break - salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line) - salary_currency = "USD" if salary_raw and "$" in salary_raw else None + salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary( + salary_line + ) apply_email = extract_apply_email(body_text) apply_tg = extract_apply_telegram(body_text) diff --git a/import_excel_jobs.py b/import_excel_jobs.py index d28bc6f..bf6f892 100644 --- a/import_excel_jobs.py +++ b/import_excel_jobs.py @@ -89,7 +89,7 @@ def load_mysql_config() -> dict: def connect_mysql(cfg: dict): - return pymysql.connect( + conn = pymysql.connect( host=cfg["host"], port=cfg["port"], user=cfg["user"], @@ -98,6 +98,9 @@ def connect_mysql(cfg: dict): charset=cfg["charset"], autocommit=True, ) + with conn.cursor() as cur: + cur.execute("SET time_zone = '+00:00'") + return conn def init_tables(conn): diff --git a/main.py b/main.py index 2b14b01..42dfdb9 100644 --- a/main.py +++ b/main.py @@ -194,6 +194,9 @@ class MySQLStore: charset=self.cfg["charset"], autocommit=True, ) + # Force session timestamps to UTC for NOW()/CURRENT_TIMESTAMP consistency. + with self.conn.cursor() as cursor: + cursor.execute("SET time_zone = '+00:00'") def close(self): if self.conn: diff --git a/sync_to_cloud_mysql.py b/sync_to_cloud_mysql.py index 9b294d8..24798e3 100644 --- a/sync_to_cloud_mysql.py +++ b/sync_to_cloud_mysql.py @@ -84,7 +84,7 @@ def load_config() -> tuple[dict, dict]: def connect_mysql(cfg: dict): - return pymysql.connect( + conn = pymysql.connect( host=cfg["host"], port=cfg["port"], user=cfg["user"], @@ -94,6 +94,9 @@ def connect_mysql(cfg: dict): autocommit=True, cursorclass=pymysql.cursors.DictCursor, ) + with conn.cursor() as cur: + cur.execute("SET time_zone = '+00:00'") + return conn def ensure_cloud_tables(cloud_conn):