修复货币数字错误和没有货币种类问题,修复UTC和本地时间混用,改用UTC
This commit is contained in:
10
README.md
10
README.md
@@ -48,6 +48,12 @@ cp config.example.json config.json
|
||||
- `mysql`: 本地 MySQL 连接
|
||||
- `mysql_cloud`: 云端 MySQL 连接(用于同步)
|
||||
|
||||
时间规范:
|
||||
|
||||
- 项目统一使用 UTC 存储所有 `DATETIME`
|
||||
- 各脚本连接 MySQL 后会执行 `SET time_zone = '+00:00'`
|
||||
- `NOW()` / `CURRENT_TIMESTAMP` 产生的时间也按 UTC 写入
|
||||
|
||||
## 4. 运行方式
|
||||
|
||||
### 4.1 手动执行
|
||||
@@ -287,6 +293,10 @@ uv run import_excel_jobs.py --file /path/to/jobs.xlsx --sheet Sheet1 --source @e
|
||||
- 检查 `messages` 是否有新数据。
|
||||
- 检查 `clean_state.last_message_row_id` 是否已到最新。
|
||||
|
||||
6. 历史数据有 UTC+8 和 UTC 混用怎么办
|
||||
- 新版脚本已统一写入 UTC。
|
||||
- 历史数据需一次性迁移后再对齐分析口径(建议先备份再修复)。
|
||||
|
||||
## 10. 协作规范建议
|
||||
|
||||
- 新增来源规则时,优先增加 source 专用 parser,避免影响已有来源。
|
||||
|
||||
@@ -119,7 +119,7 @@ def load_mysql_config() -> dict:
|
||||
|
||||
|
||||
def connect_mysql(cfg: dict):
|
||||
return pymysql.connect(
|
||||
conn = pymysql.connect(
|
||||
host=cfg["host"],
|
||||
port=cfg["port"],
|
||||
user=cfg["user"],
|
||||
@@ -128,6 +128,9 @@ def connect_mysql(cfg: dict):
|
||||
charset=cfg["charset"],
|
||||
autocommit=True,
|
||||
)
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SET time_zone = '+00:00'")
|
||||
return conn
|
||||
|
||||
|
||||
def init_target_db(conn):
|
||||
@@ -221,16 +224,62 @@ def clean_company_name(s: str | None) -> str | None:
|
||||
return s or None
|
||||
|
||||
|
||||
def parse_salary(raw: str | None) -> tuple[str | None, int | None, int | None, str | None]:
|
||||
def infer_salary_currency(text: str) -> str | None:
|
||||
low = text.lower()
|
||||
if any(k in low for k in ["usd", "us$", "dollar"]) or "$" in text:
|
||||
return "USD"
|
||||
if any(k in text for k in ["¥", "¥", "人民币"]) or "cny" in low:
|
||||
return "CNY"
|
||||
if "元" in text or "万" in text:
|
||||
return "CNY"
|
||||
if "k" in low and any(k in low for k in ["month", "year", "day"]):
|
||||
return "USD"
|
||||
if "eur" in low or "€" in text:
|
||||
return "EUR"
|
||||
if "hkd" in low or "hk$" in low:
|
||||
return "HKD"
|
||||
if "sgd" in low or "s$" in low:
|
||||
return "SGD"
|
||||
|
||||
# No explicit marker: infer by language.
|
||||
if re.search(r"[\u4e00-\u9fff]", text):
|
||||
return "CNY"
|
||||
return "USD"
|
||||
|
||||
|
||||
def parse_salary(
|
||||
raw: str | None,
|
||||
) -> tuple[str | None, str | None, int | None, int | None, str | None]:
|
||||
if not raw:
|
||||
return None, None, None, None
|
||||
return None, None, None, None, None
|
||||
|
||||
text = clean_md_text(raw)
|
||||
lower = text.lower()
|
||||
currency = infer_salary_currency(text)
|
||||
|
||||
nums = re.findall(r"\d+(?:\.\d+)?", text.replace(",", ""))
|
||||
salary_min = int(float(nums[0])) if len(nums) >= 1 else None
|
||||
salary_max = int(float(nums[1])) if len(nums) >= 2 else None
|
||||
num_tokens = re.findall(r"(\d+(?:\.\d+)?)\s*([kKwW万]?)", text.replace(",", ""))
|
||||
salary_min = None
|
||||
salary_max = None
|
||||
if num_tokens:
|
||||
vals: list[tuple[float, str]] = [(float(n), u) for n, u in num_tokens]
|
||||
if len(vals) >= 2:
|
||||
u1 = vals[0][1]
|
||||
u2 = vals[1][1]
|
||||
if not u1 and u2:
|
||||
vals[0] = (vals[0][0], u2)
|
||||
if not u2 and u1:
|
||||
vals[1] = (vals[1][0], u1)
|
||||
|
||||
def scaled(v: float, unit: str) -> int:
|
||||
m = 1
|
||||
if unit in ("k", "K"):
|
||||
m = 1000
|
||||
elif unit in ("w", "W", "万"):
|
||||
m = 10000
|
||||
return int(v * m)
|
||||
|
||||
salary_min = scaled(vals[0][0], vals[0][1]) if len(vals) >= 1 else None
|
||||
salary_max = scaled(vals[1][0], vals[1][1]) if len(vals) >= 2 else None
|
||||
|
||||
period = None
|
||||
if "month" in lower or "每月" in text or "月" in text:
|
||||
@@ -240,7 +289,7 @@ def parse_salary(raw: str | None) -> tuple[str | None, int | None, int | None, s
|
||||
elif "day" in lower or "日" in text:
|
||||
period = "day"
|
||||
|
||||
return text, salary_min, salary_max, period
|
||||
return text, currency, salary_min, salary_max, period
|
||||
|
||||
|
||||
def strip_meta_lines(content: str) -> str:
|
||||
@@ -570,8 +619,9 @@ def parse_dejob_official(
|
||||
salary_line = ln
|
||||
break
|
||||
|
||||
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
|
||||
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
|
||||
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
|
||||
salary_line
|
||||
)
|
||||
|
||||
responsibilities = extract_list_section(body_text, "岗位职责")
|
||||
requirements = extract_list_section(body_text, "岗位要求")
|
||||
@@ -643,8 +693,9 @@ def parse_generic(
|
||||
salary_line = ln
|
||||
break
|
||||
|
||||
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
|
||||
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
|
||||
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
|
||||
salary_line
|
||||
)
|
||||
|
||||
job_type = "招聘" if ("招聘" in body_text or "recruit" in body_text.lower()) else None
|
||||
(
|
||||
@@ -770,8 +821,9 @@ def parse_dejob_global(
|
||||
if "薪酬" in ln or "salary" in ln.lower():
|
||||
salary_line = ln
|
||||
break
|
||||
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
|
||||
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
|
||||
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
|
||||
salary_line
|
||||
)
|
||||
|
||||
responsibilities = extract_list_section(body_text, "岗位职责") or extract_list_section(
|
||||
body_text, "Responsibilities"
|
||||
@@ -849,8 +901,9 @@ def parse_remote_cn(
|
||||
if ln.startswith("摘要:"):
|
||||
summary_line = ln
|
||||
break
|
||||
salary_raw, salary_min, salary_max, salary_period = parse_salary(summary_line)
|
||||
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
|
||||
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
|
||||
summary_line
|
||||
)
|
||||
|
||||
urls = extract_urls(body_text)
|
||||
apply_email = extract_apply_email(body_text)
|
||||
@@ -935,8 +988,9 @@ def parse_cryptojobslist_source(
|
||||
if any(k in ln.lower() for k in ("salary", "$", "usd")):
|
||||
salary_line = ln
|
||||
break
|
||||
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
|
||||
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
|
||||
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
|
||||
salary_line
|
||||
)
|
||||
|
||||
apply_email = extract_apply_email(body_text)
|
||||
apply_tg = extract_apply_telegram(body_text)
|
||||
|
||||
@@ -89,7 +89,7 @@ def load_mysql_config() -> dict:
|
||||
|
||||
|
||||
def connect_mysql(cfg: dict):
|
||||
return pymysql.connect(
|
||||
conn = pymysql.connect(
|
||||
host=cfg["host"],
|
||||
port=cfg["port"],
|
||||
user=cfg["user"],
|
||||
@@ -98,6 +98,9 @@ def connect_mysql(cfg: dict):
|
||||
charset=cfg["charset"],
|
||||
autocommit=True,
|
||||
)
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SET time_zone = '+00:00'")
|
||||
return conn
|
||||
|
||||
|
||||
def init_tables(conn):
|
||||
|
||||
3
main.py
3
main.py
@@ -194,6 +194,9 @@ class MySQLStore:
|
||||
charset=self.cfg["charset"],
|
||||
autocommit=True,
|
||||
)
|
||||
# Force session timestamps to UTC for NOW()/CURRENT_TIMESTAMP consistency.
|
||||
with self.conn.cursor() as cursor:
|
||||
cursor.execute("SET time_zone = '+00:00'")
|
||||
|
||||
def close(self):
|
||||
if self.conn:
|
||||
|
||||
@@ -84,7 +84,7 @@ def load_config() -> tuple[dict, dict]:
|
||||
|
||||
|
||||
def connect_mysql(cfg: dict):
|
||||
return pymysql.connect(
|
||||
conn = pymysql.connect(
|
||||
host=cfg["host"],
|
||||
port=cfg["port"],
|
||||
user=cfg["user"],
|
||||
@@ -94,6 +94,9 @@ def connect_mysql(cfg: dict):
|
||||
autocommit=True,
|
||||
cursorclass=pymysql.cursors.DictCursor,
|
||||
)
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SET time_zone = '+00:00'")
|
||||
return conn
|
||||
|
||||
|
||||
def ensure_cloud_tables(cloud_conn):
|
||||
|
||||
Reference in New Issue
Block a user