修复货币数字错误和没有货币种类问题,修复UTC和本地时间混用,改用UTC
This commit is contained in:
@@ -119,7 +119,7 @@ def load_mysql_config() -> dict:
|
||||
|
||||
|
||||
def connect_mysql(cfg: dict):
|
||||
return pymysql.connect(
|
||||
conn = pymysql.connect(
|
||||
host=cfg["host"],
|
||||
port=cfg["port"],
|
||||
user=cfg["user"],
|
||||
@@ -128,6 +128,9 @@ def connect_mysql(cfg: dict):
|
||||
charset=cfg["charset"],
|
||||
autocommit=True,
|
||||
)
|
||||
with conn.cursor() as cur:
|
||||
cur.execute("SET time_zone = '+00:00'")
|
||||
return conn
|
||||
|
||||
|
||||
def init_target_db(conn):
|
||||
@@ -221,16 +224,62 @@ def clean_company_name(s: str | None) -> str | None:
|
||||
return s or None
|
||||
|
||||
|
||||
def parse_salary(raw: str | None) -> tuple[str | None, int | None, int | None, str | None]:
|
||||
def infer_salary_currency(text: str) -> str | None:
|
||||
low = text.lower()
|
||||
if any(k in low for k in ["usd", "us$", "dollar"]) or "$" in text:
|
||||
return "USD"
|
||||
if any(k in text for k in ["¥", "¥", "人民币"]) or "cny" in low:
|
||||
return "CNY"
|
||||
if "元" in text or "万" in text:
|
||||
return "CNY"
|
||||
if "k" in low and any(k in low for k in ["month", "year", "day"]):
|
||||
return "USD"
|
||||
if "eur" in low or "€" in text:
|
||||
return "EUR"
|
||||
if "hkd" in low or "hk$" in low:
|
||||
return "HKD"
|
||||
if "sgd" in low or "s$" in low:
|
||||
return "SGD"
|
||||
|
||||
# No explicit marker: infer by language.
|
||||
if re.search(r"[\u4e00-\u9fff]", text):
|
||||
return "CNY"
|
||||
return "USD"
|
||||
|
||||
|
||||
def parse_salary(
|
||||
raw: str | None,
|
||||
) -> tuple[str | None, str | None, int | None, int | None, str | None]:
|
||||
if not raw:
|
||||
return None, None, None, None
|
||||
return None, None, None, None, None
|
||||
|
||||
text = clean_md_text(raw)
|
||||
lower = text.lower()
|
||||
currency = infer_salary_currency(text)
|
||||
|
||||
nums = re.findall(r"\d+(?:\.\d+)?", text.replace(",", ""))
|
||||
salary_min = int(float(nums[0])) if len(nums) >= 1 else None
|
||||
salary_max = int(float(nums[1])) if len(nums) >= 2 else None
|
||||
num_tokens = re.findall(r"(\d+(?:\.\d+)?)\s*([kKwW万]?)", text.replace(",", ""))
|
||||
salary_min = None
|
||||
salary_max = None
|
||||
if num_tokens:
|
||||
vals: list[tuple[float, str]] = [(float(n), u) for n, u in num_tokens]
|
||||
if len(vals) >= 2:
|
||||
u1 = vals[0][1]
|
||||
u2 = vals[1][1]
|
||||
if not u1 and u2:
|
||||
vals[0] = (vals[0][0], u2)
|
||||
if not u2 and u1:
|
||||
vals[1] = (vals[1][0], u1)
|
||||
|
||||
def scaled(v: float, unit: str) -> int:
|
||||
m = 1
|
||||
if unit in ("k", "K"):
|
||||
m = 1000
|
||||
elif unit in ("w", "W", "万"):
|
||||
m = 10000
|
||||
return int(v * m)
|
||||
|
||||
salary_min = scaled(vals[0][0], vals[0][1]) if len(vals) >= 1 else None
|
||||
salary_max = scaled(vals[1][0], vals[1][1]) if len(vals) >= 2 else None
|
||||
|
||||
period = None
|
||||
if "month" in lower or "每月" in text or "月" in text:
|
||||
@@ -240,7 +289,7 @@ def parse_salary(raw: str | None) -> tuple[str | None, int | None, int | None, s
|
||||
elif "day" in lower or "日" in text:
|
||||
period = "day"
|
||||
|
||||
return text, salary_min, salary_max, period
|
||||
return text, currency, salary_min, salary_max, period
|
||||
|
||||
|
||||
def strip_meta_lines(content: str) -> str:
|
||||
@@ -570,8 +619,9 @@ def parse_dejob_official(
|
||||
salary_line = ln
|
||||
break
|
||||
|
||||
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
|
||||
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
|
||||
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
|
||||
salary_line
|
||||
)
|
||||
|
||||
responsibilities = extract_list_section(body_text, "岗位职责")
|
||||
requirements = extract_list_section(body_text, "岗位要求")
|
||||
@@ -643,8 +693,9 @@ def parse_generic(
|
||||
salary_line = ln
|
||||
break
|
||||
|
||||
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
|
||||
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
|
||||
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
|
||||
salary_line
|
||||
)
|
||||
|
||||
job_type = "招聘" if ("招聘" in body_text or "recruit" in body_text.lower()) else None
|
||||
(
|
||||
@@ -770,8 +821,9 @@ def parse_dejob_global(
|
||||
if "薪酬" in ln or "salary" in ln.lower():
|
||||
salary_line = ln
|
||||
break
|
||||
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
|
||||
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
|
||||
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
|
||||
salary_line
|
||||
)
|
||||
|
||||
responsibilities = extract_list_section(body_text, "岗位职责") or extract_list_section(
|
||||
body_text, "Responsibilities"
|
||||
@@ -849,8 +901,9 @@ def parse_remote_cn(
|
||||
if ln.startswith("摘要:"):
|
||||
summary_line = ln
|
||||
break
|
||||
salary_raw, salary_min, salary_max, salary_period = parse_salary(summary_line)
|
||||
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
|
||||
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
|
||||
summary_line
|
||||
)
|
||||
|
||||
urls = extract_urls(body_text)
|
||||
apply_email = extract_apply_email(body_text)
|
||||
@@ -935,8 +988,9 @@ def parse_cryptojobslist_source(
|
||||
if any(k in ln.lower() for k in ("salary", "$", "usd")):
|
||||
salary_line = ln
|
||||
break
|
||||
salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
|
||||
salary_currency = "USD" if salary_raw and "$" in salary_raw else None
|
||||
salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
|
||||
salary_line
|
||||
)
|
||||
|
||||
apply_email = extract_apply_email(body_text)
|
||||
apply_tg = extract_apply_telegram(body_text)
|
||||
|
||||
Reference in New Issue
Block a user