修复货币数字错误和没有货币种类问题,修复UTC和本地时间混用,改用UTC

This commit is contained in:
BoliviaYu
2026-03-07 15:02:11 +08:00
parent 0323a1d940
commit 4f03824299
5 changed files with 92 additions and 19 deletions

View File

@@ -119,7 +119,7 @@ def load_mysql_config() -> dict:
def connect_mysql(cfg: dict):
return pymysql.connect(
conn = pymysql.connect(
host=cfg["host"],
port=cfg["port"],
user=cfg["user"],
@@ -128,6 +128,9 @@ def connect_mysql(cfg: dict):
charset=cfg["charset"],
autocommit=True,
)
with conn.cursor() as cur:
cur.execute("SET time_zone = '+00:00'")
return conn
def init_target_db(conn):
@@ -221,16 +224,62 @@ def clean_company_name(s: str | None) -> str | None:
return s or None
def parse_salary(raw: str | None) -> tuple[str | None, int | None, int | None, str | None]:
def infer_salary_currency(text: str) -> str | None:
low = text.lower()
if any(k in low for k in ["usd", "us$", "dollar"]) or "$" in text:
return "USD"
if any(k in text for k in ["¥", "", "人民币"]) or "cny" in low:
return "CNY"
if "" in text or "" in text:
return "CNY"
if "k" in low and any(k in low for k in ["month", "year", "day"]):
return "USD"
if "eur" in low or "" in text:
return "EUR"
if "hkd" in low or "hk$" in low:
return "HKD"
if "sgd" in low or "s$" in low:
return "SGD"
# No explicit marker: infer by language.
if re.search(r"[\u4e00-\u9fff]", text):
return "CNY"
return "USD"
def parse_salary(
raw: str | None,
) -> tuple[str | None, str | None, int | None, int | None, str | None]:
if not raw:
return None, None, None, None
return None, None, None, None, None
text = clean_md_text(raw)
lower = text.lower()
currency = infer_salary_currency(text)
nums = re.findall(r"\d+(?:\.\d+)?", text.replace(",", ""))
salary_min = int(float(nums[0])) if len(nums) >= 1 else None
salary_max = int(float(nums[1])) if len(nums) >= 2 else None
num_tokens = re.findall(r"(\d+(?:\.\d+)?)\s*([kKwW万]?)", text.replace(",", ""))
salary_min = None
salary_max = None
if num_tokens:
vals: list[tuple[float, str]] = [(float(n), u) for n, u in num_tokens]
if len(vals) >= 2:
u1 = vals[0][1]
u2 = vals[1][1]
if not u1 and u2:
vals[0] = (vals[0][0], u2)
if not u2 and u1:
vals[1] = (vals[1][0], u1)
def scaled(v: float, unit: str) -> int:
    """Convert a parsed amount and its unit suffix to an absolute integer.

    Args:
        v: Numeric part of the token, e.g. ``1.5`` for ``"1.5万"``.
        unit: Suffix captured next to the number: ``k``/``K`` (thousand),
            ``w``/``W``/``万`` (ten thousand), or ``""`` when the number has
            no suffix.

    Returns:
        ``v`` multiplied by the unit's factor, truncated to ``int``.
    """
    # An empty unit must map to 1: the third ten-thousand marker was an empty
    # string, which scaled every unsuffixed amount by 10000.
    multipliers = {"k": 1000, "w": 10000, "万": 10000}
    return int(v * multipliers.get(unit.lower(), 1))
salary_min = scaled(vals[0][0], vals[0][1]) if len(vals) >= 1 else None
salary_max = scaled(vals[1][0], vals[1][1]) if len(vals) >= 2 else None
period = None
if "month" in lower or "每月" in text or "" in text:
@@ -240,7 +289,7 @@ def parse_salary(raw: str | None) -> tuple[str | None, int | None, int | None, s
elif "day" in lower or "" in text:
period = "day"
return text, salary_min, salary_max, period
return text, currency, salary_min, salary_max, period
def strip_meta_lines(content: str) -> str:
@@ -570,8 +619,9 @@ def parse_dejob_official(
salary_line = ln
break
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
salary_line
)
responsibilities = extract_list_section(body_text, "岗位职责")
requirements = extract_list_section(body_text, "岗位要求")
@@ -643,8 +693,9 @@ def parse_generic(
salary_line = ln
break
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
salary_line
)
job_type = "招聘" if ("招聘" in body_text or "recruit" in body_text.lower()) else None
(
@@ -770,8 +821,9 @@ def parse_dejob_global(
if "薪酬" in ln or "salary" in ln.lower():
salary_line = ln
break
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
salary_line
)
responsibilities = extract_list_section(body_text, "岗位职责") or extract_list_section(
body_text, "Responsibilities"
@@ -849,8 +901,9 @@ def parse_remote_cn(
if ln.startswith("摘要:"):
summary_line = ln
break
salary_raw, salary_min, salary_max, salary_period = parse_salary(summary_line)
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
summary_line
)
urls = extract_urls(body_text)
apply_email = extract_apply_email(body_text)
@@ -935,8 +988,9 @@ def parse_cryptojobslist_source(
if any(k in ln.lower() for k in ("salary", "$", "usd")):
salary_line = ln
break
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
salary_line
)
apply_email = extract_apply_email(body_text)
apply_tg = extract_apply_telegram(body_text)