修复货币数字错误和没有货币种类问题,修复UTC和本地时间混用,改用UTC

This commit is contained in:
BoliviaYu
2026-03-07 15:02:11 +08:00
parent 0323a1d940
commit 4f03824299
5 changed files with 92 additions and 19 deletions

View File

@@ -48,6 +48,12 @@ cp config.example.json config.json
- `mysql`: 本地 MySQL 连接
- `mysql_cloud`: 云端 MySQL 连接(用于同步)
时间规范:
- 项目统一使用 UTC 存储所有 `DATETIME`
- 各脚本连接 MySQL 后会执行 `SET time_zone = '+00:00'`
- `NOW()` / `CURRENT_TIMESTAMP` 产生的时间也按 UTC 写入
## 4. 运行方式
### 4.1 手动执行
@@ -287,6 +293,10 @@ uv run import_excel_jobs.py --file /path/to/jobs.xlsx --sheet Sheet1 --source @e
- 检查 `messages` 是否有新数据。
- 检查 `clean_state.last_message_row_id` 是否已到最新。
6. 历史数据有 UTC+8 和 UTC 混用怎么办?
- 新版脚本已统一写入 UTC。
- 历史数据需一次性迁移后再对齐分析口径(建议先备份再修复)。
## 10. 协作规范建议
- 新增来源规则时,优先增加 source 专用 parser,避免影响已有来源。

View File

@@ -119,7 +119,7 @@ def load_mysql_config() -> dict:
def connect_mysql(cfg: dict):
return pymysql.connect(
conn = pymysql.connect(
host=cfg["host"],
port=cfg["port"],
user=cfg["user"],
@@ -128,6 +128,9 @@ def connect_mysql(cfg: dict):
charset=cfg["charset"],
autocommit=True,
)
with conn.cursor() as cur:
cur.execute("SET time_zone = '+00:00'")
return conn
def init_target_db(conn):
@@ -221,16 +224,62 @@ def clean_company_name(s: str | None) -> str | None:
return s or None
def parse_salary(raw: str | None) -> tuple[str | None, int | None, int | None, str | None]:
def infer_salary_currency(text: str) -> str | None:
low = text.lower()
if any(k in low for k in ["usd", "us$", "dollar"]) or "$" in text:
return "USD"
if any(k in text for k in ["¥", "", "人民币"]) or "cny" in low:
return "CNY"
if "" in text or "" in text:
return "CNY"
if "k" in low and any(k in low for k in ["month", "year", "day"]):
return "USD"
if "eur" in low or "" in text:
return "EUR"
if "hkd" in low or "hk$" in low:
return "HKD"
if "sgd" in low or "s$" in low:
return "SGD"
# No explicit marker: infer by language.
if re.search(r"[\u4e00-\u9fff]", text):
return "CNY"
return "USD"
def parse_salary(
raw: str | None,
) -> tuple[str | None, str | None, int | None, int | None, str | None]:
if not raw:
return None, None, None, None
return None, None, None, None, None
text = clean_md_text(raw)
lower = text.lower()
currency = infer_salary_currency(text)
nums = re.findall(r"\d+(?:\.\d+)?", text.replace(",", ""))
salary_min = int(float(nums[0])) if len(nums) >= 1 else None
salary_max = int(float(nums[1])) if len(nums) >= 2 else None
num_tokens = re.findall(r"(\d+(?:\.\d+)?)\s*([kKwW万]?)", text.replace(",", ""))
salary_min = None
salary_max = None
if num_tokens:
vals: list[tuple[float, str]] = [(float(n), u) for n, u in num_tokens]
if len(vals) >= 2:
u1 = vals[0][1]
u2 = vals[1][1]
if not u1 and u2:
vals[0] = (vals[0][0], u2)
if not u2 and u1:
vals[1] = (vals[1][0], u1)
def scaled(v: float, unit: str) -> int:
    """Expand a number and its magnitude suffix into a whole amount.

    Args:
        v: Numeric part of a salary token.
        unit: Suffix captured next to the number: "k"/"K" for thousands,
            "w"/"W"/"万" for ten-thousands, or "" when there was no suffix.

    Returns:
        ``v`` multiplied by the factor of ``unit``, truncated to an int.
    """
    if unit in ("k", "K"):
        factor = 1000
    elif unit in ("w", "W", "万"):
        factor = 10000
    else:
        # No suffix (empty string) or an unknown one: the number is already
        # a plain amount and must not be scaled.
        factor = 1
    # Round away binary floating-point noise before truncating, so that
    # e.g. 1.005 * 1000 (stored as 1004.9999999999999) yields 1005, not 1004.
    return int(round(v * factor, 6))
salary_min = scaled(vals[0][0], vals[0][1]) if len(vals) >= 1 else None
salary_max = scaled(vals[1][0], vals[1][1]) if len(vals) >= 2 else None
period = None
if "month" in lower or "每月" in text or "" in text:
@@ -240,7 +289,7 @@ def parse_salary(raw: str | None) -> tuple[str | None, int | None, int | None, s
elif "day" in lower or "" in text:
period = "day"
return text, salary_min, salary_max, period
return text, currency, salary_min, salary_max, period
def strip_meta_lines(content: str) -> str:
@@ -570,8 +619,9 @@ def parse_dejob_official(
salary_line = ln
break
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
salary_line
)
responsibilities = extract_list_section(body_text, "岗位职责")
requirements = extract_list_section(body_text, "岗位要求")
@@ -643,8 +693,9 @@ def parse_generic(
salary_line = ln
break
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
salary_line
)
job_type = "招聘" if ("招聘" in body_text or "recruit" in body_text.lower()) else None
(
@@ -770,8 +821,9 @@ def parse_dejob_global(
if "薪酬" in ln or "salary" in ln.lower():
salary_line = ln
break
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
salary_line
)
responsibilities = extract_list_section(body_text, "岗位职责") or extract_list_section(
body_text, "Responsibilities"
@@ -849,8 +901,9 @@ def parse_remote_cn(
if ln.startswith("摘要:"):
summary_line = ln
break
salary_raw, salary_min, salary_max, salary_period = parse_salary(summary_line)
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
summary_line
)
urls = extract_urls(body_text)
apply_email = extract_apply_email(body_text)
@@ -935,8 +988,9 @@ def parse_cryptojobslist_source(
if any(k in ln.lower() for k in ("salary", "$", "usd")):
salary_line = ln
break
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
salary_line
)
apply_email = extract_apply_email(body_text)
apply_tg = extract_apply_telegram(body_text)

View File

@@ -89,7 +89,7 @@ def load_mysql_config() -> dict:
def connect_mysql(cfg: dict):
return pymysql.connect(
conn = pymysql.connect(
host=cfg["host"],
port=cfg["port"],
user=cfg["user"],
@@ -98,6 +98,9 @@ def connect_mysql(cfg: dict):
charset=cfg["charset"],
autocommit=True,
)
with conn.cursor() as cur:
cur.execute("SET time_zone = '+00:00'")
return conn
def init_tables(conn):

View File

@@ -194,6 +194,9 @@ class MySQLStore:
charset=self.cfg["charset"],
autocommit=True,
)
# Force session timestamps to UTC for NOW()/CURRENT_TIMESTAMP consistency.
with self.conn.cursor() as cursor:
cursor.execute("SET time_zone = '+00:00'")
def close(self):
if self.conn:

View File

@@ -84,7 +84,7 @@ def load_config() -> tuple[dict, dict]:
def connect_mysql(cfg: dict):
return pymysql.connect(
conn = pymysql.connect(
host=cfg["host"],
port=cfg["port"],
user=cfg["user"],
@@ -94,6 +94,9 @@ def connect_mysql(cfg: dict):
autocommit=True,
cursorclass=pymysql.cursors.DictCursor,
)
with conn.cursor() as cur:
cur.execute("SET time_zone = '+00:00'")
return conn
def ensure_cloud_tables(cloud_conn):