Files
tg_crawl/clean_to_structured.py

1259 lines
40 KiB
Python
Raw Normal View History

2026-02-26 20:00:06 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Source-aware structured cleaning (MySQL).
Input table: messages
Output table: structured_jobs
"""
import json
import logging
import os
import re
from dataclasses import dataclass
import pymysql
# Path of the JSON configuration file holding the MySQL credentials.
CONFIG_FILE = "config.json"
# Checkpoint key stored in clean_state so reruns resume after the last row.
PIPELINE_NAME = "structured_cleaner_v1"
# Bare http(s) URL, terminated by whitespace or a closing parenthesis.
URL_RE = re.compile(r"https?://[^\s)]+", re.IGNORECASE)
# Conventional email-address pattern.
EMAIL_RE = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
# Telegram @handle (4+ chars), not preceded by a word char (skips emails).
TG_RE = re.compile(r"(?<!\w)@[a-zA-Z0-9_]{4,}")
# Plain hashtag; allows CJK characters, middle dot, dash and underscore.
HASHTAG_RE = re.compile(r"#([A-Za-z0-9_\-\u4e00-\u9fff·]+)")
# Hashtag possibly wrapped in markdown bold, e.g. #**Web3**.
MD_TAG_RE = re.compile(r"#(?:\*\*)?([A-Za-z0-9_\-\u4e00-\u9fff·]+)(?:\*\*)?")
# Known section headers of DeJob-style posts; any of these ends the
# previous section when scanning line by line (see extract_section).
SECTION_KEYS = (
    "简介",  # introduction
    "合作方式",  # cooperation / employment terms
    "待招岗位",  # open positions
    "薪酬福利",  # salary & benefits
    "岗位职责",  # responsibilities
    "岗位要求",  # requirements
    "申请方式",  # how to apply
    "岗位来源",  # job source
)
@dataclass
class StructuredJob:
    """One normalized job posting extracted from a crawled Telegram message.

    Field layout mirrors the structured_jobs table; list-valued fields are
    serialized into the corresponding *_json columns by upsert_structured.
    """

    # --- provenance ---
    source: str  # channel handle the message came from, e.g. "@DeJob_official"
    source_channel: str | None  # channel family label ("DeJob", "remote_cn", ...)
    parser_name: str  # which parser produced this record
    parser_version: str
    chat_id: int | None
    message_id: int
    message_date: str
    # --- classification / company ---
    job_type: str | None  # "招聘" for recruitment posts, else None
    company_name: str | None
    industry_tags: list[str]
    company_intro: str | None
    company_url: str | None
    # --- employment ---
    work_mode: str  # "remote" / "onsite" / "hybrid" / "unknown"
    job_nature: str  # "full_time" / "part_time" / "contract" / "intern" / "freelance" / "unknown"
    job_location_text: str | None
    job_location_tags: list[str] | None  # None kept distinct from [] in the DB
    employment_type_raw: str | None  # cleaned original cooperation line
    position_name: str | None
    position_tags: list[str]
    # --- salary (parsed by parse_salary) ---
    salary_raw: str | None
    salary_currency: str | None
    salary_min: int | None
    salary_max: int | None
    salary_period: str | None  # "month" / "year" / "day"
    # --- details / contact ---
    responsibilities: list[str]
    requirements: list[str]
    apply_email: str | None
    apply_telegram: str | None
    job_source_url: str | None
    body_text: str  # cleaned text ("empty_message" when blank)
    raw_content: str  # original message content, unmodified
def setup_logger() -> logging.Logger:
    """Create (once) the console + file logger used by this pipeline."""
    os.makedirs("logs", exist_ok=True)
    log = logging.getLogger("clean_to_structured")
    log.setLevel(logging.INFO)
    # Guard against duplicate handlers if the module is imported repeatedly.
    if not log.handlers:
        formatter = logging.Formatter(
            "[%(asctime)s] [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
        )
        for handler in (
            logging.StreamHandler(),
            logging.FileHandler("logs/clean_to_structured.log", encoding="utf-8"),
        ):
            handler.setFormatter(formatter)
            log.addHandler(handler)
    return log


logger = setup_logger()
def load_mysql_config() -> dict:
    """Read MySQL connection settings from CONFIG_FILE.

    Each key falls back to an environment variable, then to a hard default.
    Raises FileNotFoundError when the file is missing and ValueError when the
    mysql section is malformed or the password is empty.
    """
    if not os.path.exists(CONFIG_FILE):
        raise FileNotFoundError(f"未找到配置文件: {CONFIG_FILE}")
    with open(CONFIG_FILE, "r", encoding="utf-8") as fp:
        raw_cfg = json.load(fp)
    section = raw_cfg.get("mysql", {})
    if not isinstance(section, dict):
        raise ValueError("配置错误: mysql 必须是对象")

    def pick(key: str, env: str, default: str) -> str:
        # Config value wins; falsy values fall through to the environment.
        return section.get(key) or os.getenv(env, default)

    settings = {
        "host": pick("host", "MYSQL_HOST", "127.0.0.1"),
        "port": int(pick("port", "MYSQL_PORT", "3306")),
        "user": pick("user", "MYSQL_USER", "jobs_user"),
        "password": pick("password", "MYSQL_PASSWORD", ""),
        "database": pick("database", "MYSQL_DATABASE", "jobs"),
        "charset": pick("charset", "MYSQL_CHARSET", "utf8mb4"),
    }
    if not settings["password"]:
        raise ValueError("配置错误: mysql.password 不能为空")
    return settings
def connect_mysql(cfg: dict):
    """Open an autocommit pymysql connection and pin the session to UTC."""
    params = dict(
        host=cfg["host"],
        port=cfg["port"],
        user=cfg["user"],
        password=cfg["password"],
        database=cfg["database"],
        charset=cfg["charset"],
        autocommit=True,
    )
    conn = pymysql.connect(**params)
    with conn.cursor() as cur:
        # Keep message_date comparisons consistent regardless of server TZ.
        cur.execute("SET time_zone = '+00:00'")
    return conn
def init_target_db(conn):
    """Create the output table and checkpoint table if they do not exist.

    structured_jobs holds one row per (source, message_id); clean_state holds
    a single checkpoint row per pipeline (see PIPELINE_NAME).
    """
    with conn.cursor() as cur:
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS structured_jobs (
                id BIGINT PRIMARY KEY AUTO_INCREMENT,
                source VARCHAR(255) NOT NULL,
                source_channel VARCHAR(255) NULL,
                parser_name VARCHAR(64) NOT NULL,
                parser_version VARCHAR(32) NOT NULL,
                chat_id BIGINT NULL,
                message_id BIGINT NOT NULL,
                message_date DATETIME NOT NULL,
                job_type VARCHAR(64) NULL,
                company_name VARCHAR(255) NULL,
                industry_tags_json JSON NOT NULL,
                company_intro LONGTEXT NULL,
                company_url TEXT NULL,
                work_mode VARCHAR(32) NOT NULL,
                job_nature VARCHAR(32) NOT NULL,
                job_location_text VARCHAR(255) NULL,
                job_location_tags_json JSON NULL,
                employment_type_raw TEXT NULL,
                position_name VARCHAR(255) NULL,
                position_tags_json JSON NOT NULL,
                salary_raw TEXT NULL,
                salary_currency VARCHAR(16) NULL,
                salary_min BIGINT NULL,
                salary_max BIGINT NULL,
                salary_period VARCHAR(16) NULL,
                responsibilities_json JSON NOT NULL,
                requirements_json JSON NOT NULL,
                apply_email VARCHAR(255) NULL,
                apply_telegram VARCHAR(255) NULL,
                job_source_url TEXT NULL,
                body_text LONGTEXT NOT NULL,
                raw_content LONGTEXT NOT NULL,
                cleaned_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
                    ON UPDATE CURRENT_TIMESTAMP,
                UNIQUE KEY uk_source_message (source, message_id),
                KEY idx_structured_source_date (source, message_date)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
            """
        )
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS clean_state (
                pipeline_name VARCHAR(128) PRIMARY KEY,
                last_message_row_id BIGINT NOT NULL DEFAULT 0,
                updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
                    ON UPDATE CURRENT_TIMESTAMP
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
            """
        )
def dedupe(values: list[str]) -> list[str]:
    """Drop falsy entries and duplicates while keeping first-seen order."""
    # dict preserves insertion order, so its keys are the deduped sequence.
    unique: dict[str, None] = {}
    for value in values:
        if value:
            unique.setdefault(value, None)
    return list(unique)
def clean_md_text(s: str) -> str:
    """Strip markdown emphasis characters and collapse whitespace runs."""
    for pattern in (r"\*+", r"~+"):
        s = re.sub(pattern, "", s)
    s = s.replace("`", "").strip()
    return re.sub(r"\s+", " ", s).strip()
def normalize_md_line(s: str) -> str:
    """Remove inline markdown markers from one line and normalize spacing."""
    for marker in ("**", "`", "~~"):
        s = s.replace(marker, "")
    # Full-width (ideographic) space -> normal space.
    s = s.replace("\u3000", " ")
    return re.sub(r"\s+", " ", s).strip()
def clean_company_name(s: str | None) -> str | None:
    """Normalize a company-name fragment; None when nothing usable remains."""
    if not s:
        return None
    cleaned = clean_md_text(s).strip(" -|:#")
    return cleaned or None
def infer_salary_currency(text: str) -> str | None:
low = text.lower()
if any(k in low for k in ["usd", "us$", "dollar"]) or "$" in text:
return "USD"
if any(k in text for k in ["¥", "", "人民币"]) or "cny" in low:
return "CNY"
if "" in text or "" in text:
return "CNY"
if "k" in low and any(k in low for k in ["month", "year", "day"]):
return "USD"
if "eur" in low or "" in text:
return "EUR"
if "hkd" in low or "hk$" in low:
return "HKD"
if "sgd" in low or "s$" in low:
return "SGD"
# No explicit marker: infer by language.
if re.search(r"[\u4e00-\u9fff]", text):
return "CNY"
return "USD"
def parse_salary(
    raw: str | None,
) -> tuple[str | None, str | None, int | None, int | None, str | None]:
    """Parse a salary line into (raw_text, currency, min, max, period).

    Only the first two numeric tokens are used as the range. Units k/K
    (x1000) and w/W/万 (x10000) are honoured; when only one bound carries a
    unit it is propagated to the other ("20-30k" -> 20000-30000).

    NOTE(review): the 万/月/年/日 literals were mojibake-damaged in the
    pasted source (empty strings — which scaled unit-less numbers by 10000
    and made every line parse as period="month"); reconstructed here,
    confirm against the original source.
    """
    if not raw:
        return None, None, None, None, None
    text = clean_md_text(raw)
    lower = text.lower()
    currency = infer_salary_currency(text)
    num_tokens = re.findall(r"(\d+(?:\.\d+)?)\s*([kKwW万]?)", text.replace(",", ""))
    salary_min = None
    salary_max = None
    if num_tokens:
        vals: list[tuple[float, str]] = [(float(n), u) for n, u in num_tokens]
        if len(vals) >= 2:
            u1 = vals[0][1]
            u2 = vals[1][1]
            # Propagate a unit across an unadorned range bound.
            if not u1 and u2:
                vals[0] = (vals[0][0], u2)
            if not u2 and u1:
                vals[1] = (vals[1][0], u1)

        def scaled(v: float, unit: str) -> int:
            m = 1
            if unit in ("k", "K"):
                m = 1000
            elif unit in ("w", "W", "万"):
                m = 10000
            return int(v * m)

        salary_min = scaled(vals[0][0], vals[0][1])
        salary_max = scaled(vals[1][0], vals[1][1]) if len(vals) >= 2 else None
    period = None
    if "month" in lower or "每月" in text or "月" in text:
        period = "month"
    elif "year" in lower or "年" in text:
        period = "year"
    elif "day" in lower or "日" in text:
        period = "day"
    return text, currency, salary_min, salary_max, period
2026-02-26 20:00:06 +08:00
def strip_meta_lines(content: str) -> str:
    """Drop crawler metadata lines, keeping only the human-readable body."""
    meta_prefixes = (
        "[MEDIA_TYPE] ",
        "[ACTION_TYPE] ",
        "[MEDIA_JSON] ",
        "[ACTION_JSON] ",
        "phones=",
        "[MEDIA_TEXT] ",
    )
    kept = [
        line.rstrip()
        for line in (content or "").splitlines()
        if not line.startswith(meta_prefixes)
    ]
    return "\n".join(kept).strip()
def preprocess_body_text(body_text: str) -> str:
    """Flatten markdown: inline links become "text url", emphasis markers are
    removed, and runs of spaces/tabs collapse within each line."""
    flattened = re.sub(r"\[([^\]]+)\]\((https?://[^)\s]+)\)", r"\1 \2", body_text)
    for marker in ("**", "__", "~~", "`"):
        flattened = flattened.replace(marker, "")
    squeezed = (
        re.sub(r"[ \t]+", " ", line).strip() for line in flattened.splitlines()
    )
    return "\n".join(squeezed).strip()
def extract_section(body_text: str, section_name: str) -> str | None:
    """Lines following the first line containing *section_name*, up to (not
    including) the next line mentioning any known section key."""
    lines = body_text.splitlines()
    begin = next(
        (idx + 1 for idx, line in enumerate(lines) if section_name in line), None
    )
    if begin is None:
        return None
    block: list[str] = []
    for line in lines[begin:]:
        if any(key in line for key in SECTION_KEYS):
            break
        block.append(line)
    joined = "\n".join(block).strip()
    return joined or None
def extract_first_url(text: str | None) -> str | None:
    """First http(s) URL in *text*, or None."""
    if not text:
        return None
    match = URL_RE.search(text)
    return match.group(0) if match else None
def extract_job_source_url(body_text: str) -> str | None:
    """URL on the first 岗位来源 (job-source) line that actually has one."""
    for line in body_text.splitlines():
        if "岗位来源" not in line:
            continue
        match = URL_RE.search(line)
        if match:
            return match.group(0)
    return None
def extract_company_name_dejob(body_text: str) -> str | None:
    """Company name from the first 🏡 line (DeJob layout), hashtags stripped."""
    for line in body_text.splitlines():
        if "🏡" not in line:
            continue
        name = clean_md_text(line).replace("🏡", "").strip()
        # Anything from the first '#' onward is an industry hashtag, not the name.
        name = name.split("#", 1)[0].strip()
        return clean_company_name(name)
    return None
def extract_tags_after_key(line: str, key: str) -> list[str]:
    """Markdown hashtags appearing after *key* on *line*, cleaned and deduped."""
    if key not in line:
        return []
    tail = normalize_md_line(line.split(key, 1)[1])
    cleaned = (
        clean_md_text(tag).replace("·", " ").strip()
        for tag in MD_TAG_RE.findall(tail)
    )
    return dedupe([tag for tag in cleaned if tag])
def extract_list_section(body_text: str, key: str) -> list[str]:
    """Section *key* as a list of items with numeric bullets stripped."""
    section = extract_section(body_text, key)
    if not section:
        return []
    items: list[str] = []
    for line in section.splitlines():
        item = clean_md_text(line)
        # Strip "1️⃣" / "1." / "1、" style leading bullets.
        item = re.sub(r"^\d+️⃣?\s*", "", item)
        item = re.sub(r"^\d+[\.、]\s*", "", item)
        if item:
            items.append(item)
    return items
def extract_position_name_dejob(body_text: str) -> str | None:
    """First tag on a 待招岗位 (open positions) line, if any line has tags."""
    for line in body_text.splitlines():
        if "待招岗位" not in line:
            continue
        tags = extract_tags_after_key(line, "待招岗位")
        if tags:
            return tags[0]
    return None
def extract_apply_email(body_text: str) -> str | None:
    """First email address found anywhere in the text, or None."""
    match = EMAIL_RE.search(body_text)
    return match.group(0) if match else None
def extract_apply_telegram(body_text: str) -> str | None:
    """Telegram handle: prefer one on a line mentioning "Telegram", otherwise
    fall back to the first @handle anywhere in the text."""
    for line in body_text.splitlines():
        if "Telegram" in line:
            found = TG_RE.search(line)
            if found:
                return found.group(0)
    fallback = TG_RE.search(body_text)
    return fallback.group(0) if fallback else None
def extract_urls(body_text: str) -> list[str]:
    """All distinct http(s) URLs in *body_text*, in first-seen order."""
    return dedupe(URL_RE.findall(body_text))
def extract_first_url_by_keyword(body_text: str, keywords: list[str]) -> str | None:
    """First URL whose text contains any of *keywords* (case-insensitive)."""
    lowered = [kw.lower() for kw in keywords]
    for url in extract_urls(body_text):
        haystack = url.lower()
        if any(kw in haystack for kw in lowered):
            return url
    return None
def extract_first_nonempty_line(body_text: str) -> str | None:
    """First line that is non-empty after markdown cleanup, or None."""
    for line in body_text.splitlines():
        cleaned = clean_md_text(line)
        if cleaned:
            return cleaned
    return None
def normalize_possible_url(raw: str) -> str | None:
    """Coerce a free-text token into an https URL when it plausibly is one."""
    token = clean_md_text(raw or "").strip("()[]<>.,;\"' ")
    if not token:
        return None
    low = token.lower()
    if low.startswith(("http://", "https://")):
        return token
    if low.startswith("www."):
        return "https://" + token
    if " " not in token:
        # Domain-with-path fallback, e.g. company.com/apply
        if "." in token and "/" in token:
            return "https://" + token
        # Bare domain, e.g. company.io
        if re.fullmatch(r"[A-Za-z0-9.-]+\.[A-Za-z]{2,}", token):
            return "https://" + token
    return None
def extract_apply_link(body_text: str) -> str | None:
    """Best-effort application link.

    Priority 1: a URL (or URL-like token) on a line mentioning apply/申请/投递.
    Priority 2: the first URL anywhere that looks like an apply/job page.

    NOTE(review): one branch originally split on a mojibake-damaged empty
    string; '"" in ln' is always True and str.split("") raises
    ValueError("empty separator"), so every colon-less apply line crashed.
    The separator is reconstructed as the full-width colon '：' — confirm
    against the original source.
    """
    for ln in body_text.splitlines():
        low = ln.lower()
        if "apply" not in low and "申请" not in ln and "投递" not in ln:
            continue
        # A direct URL on the line wins.
        line_urls = URL_RE.findall(ln)
        if line_urls:
            return line_urls[0]
        # Otherwise parse the right-hand side after ':' / '：' / '-'.
        if ":" in ln:
            rhs = ln.split(":", 1)[1]
        elif "：" in ln:
            rhs = ln.split("：", 1)[1]
        elif "-" in ln:
            rhs = ln.split("-", 1)[1]
        else:
            rhs = ln
        for token in re.split(r"\s+", rhs.strip()):
            u = normalize_possible_url(token)
            if u:
                return u
    # Priority 2: first URL that looks like an apply page.
    for u in extract_urls(body_text):
        lu = u.lower()
        if "apply" in lu or "job" in lu or "careers" in lu:
            return u
    return None
2026-02-26 20:00:06 +08:00
def infer_employment_fields(
    tags: list[str], raw_line: str | None
) -> tuple[str, str, str | None, list[str] | None, str | None]:
    """Classify cooperation tags into employment fields.

    Returns (work_mode, job_nature, location_text, location_tags, raw_line
    cleaned). Tags matching neither a work-mode nor a job-nature keyword are
    treated as locations.
    """
    # Keyword sets for remote vs on-site work (Chinese + English variants).
    mode_remote = {"远程", "remote", "居家", "在家办公", "home office", "wfh"}
    mode_onsite = {"实地", "现场", "线下", "onsite", "on-site", "坐班", "到岗"}
    # Raw tag -> canonical job-nature value.
    nature_map = {
        "全职": "full_time",
        "兼职": "part_time",
        "实习": "intern",
        "合同": "contract",
        "contract": "contract",
        "自由职业": "freelance",
        "freelance": "freelance",
    }
    # When several natures appear, the earliest entry in this list wins.
    nature_priority = ["full_time", "part_time", "contract", "intern", "freelance"]
    normalized = []
    for t in tags:
        n = clean_md_text(t).replace("·", " ").strip()
        if n:
            normalized.append(n)
    normalized = dedupe(normalized)
    has_remote = False
    has_onsite = False
    natures_found = []
    locations = []
    for tag in normalized:
        low = tag.lower()
        if low in mode_remote or tag in mode_remote:
            has_remote = True
            continue
        if low in mode_onsite or tag in mode_onsite:
            has_onsite = True
            continue
        mapped = nature_map.get(tag) or nature_map.get(low)
        if mapped:
            natures_found.append(mapped)
            continue
        # Unrecognized tag: assume it names a location.
        locations.append(tag)
    # Both remote and on-site tags present -> hybrid.
    if has_remote and has_onsite:
        work_mode = "hybrid"
    elif has_remote:
        work_mode = "remote"
    elif has_onsite:
        work_mode = "onsite"
    else:
        work_mode = "unknown"
    job_nature = "unknown"
    for cand in nature_priority:
        if cand in natures_found:
            job_nature = cand
            break
    location_tags_raw = dedupe(locations)
    # The first leftover tag doubles as the display location.
    location_text = location_tags_raw[0] if location_tags_raw else None
    location_tags: list[str] | None = location_tags_raw if location_tags_raw else None
    raw = clean_md_text(raw_line) if raw_line else None
    return work_mode, job_nature, location_text, location_tags, raw
def parse_dejob_official(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a @DeJob_official post (Chinese DeJob layout with emoji markers)."""
    # A post is a recruitment ad when it mentions 招聘 or "Recruitment".
    job_type = "招聘" if ("招聘" in body_text or "Recruitment" in body_text) else None
    company_name = extract_company_name_dejob(body_text)
    # Industry tags live as hashtags on the 🏡 (company) line.
    industry_tags = []
    for ln in body_text.splitlines():
        if "🏡" in ln:
            norm_ln = normalize_md_line(ln)
            industry_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            industry_tags = dedupe([t for t in industry_tags if t])
            break
    # 合作方式 (cooperation) hashtags encode work mode / nature / location.
    cooperation_tags = []
    cooperation_line = None
    for ln in body_text.splitlines():
        if "合作方式" in ln:
            cooperation_line = ln
            norm_ln = normalize_md_line(ln)
            cooperation_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            cooperation_tags = dedupe([t for t in cooperation_tags if t])
            break
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(cooperation_tags, cooperation_line)
    # 待招岗位 (open positions) hashtags; first tag becomes position_name.
    position_tags = []
    for ln in body_text.splitlines():
        if "待招岗位" in ln:
            norm_ln = normalize_md_line(ln)
            position_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            position_tags = dedupe([t for t in position_tags if t])
            break
    position_name = extract_position_name_dejob(body_text)
    # 简介 (introduction) section: URL -> company_url, rest -> company_intro.
    intro_sec = extract_section(body_text, "简介")
    company_url = extract_first_url(intro_sec) or extract_first_url(body_text)
    company_intro = None
    if intro_sec:
        intro_lines = [ln for ln in intro_sec.splitlines() if not URL_RE.search(ln)]
        company_intro = clean_md_text("\n".join(intro_lines)) or None
    # First line mentioning 薪酬/Salary is treated as the salary line.
    salary_line = None
    for ln in body_text.splitlines():
        if "薪酬" in ln or "Salary" in ln or "salary" in ln:
            salary_line = ln
            break
    salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
        salary_line
    )
    responsibilities = extract_list_section(body_text, "岗位职责")
    requirements = extract_list_section(body_text, "岗位要求")
    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)
    job_source_url = extract_job_source_url(body_text)
    return StructuredJob(
        source=source,
        source_channel="DeJob",
        parser_name="dejob_official",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=company_name,
        industry_tags=industry_tags,
        company_intro=company_intro,
        company_url=company_url,
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=position_name,
        position_tags=position_tags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=responsibilities,
        requirements=requirements,
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
def parse_generic(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Fallback parser for channels without a dedicated layout.

    Hashtags double as industry and position tags; the first non-empty line
    becomes a pseudo position name (truncated to 120 characters).
    """
    hashtags = [h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)]
    hashtags = dedupe([h for h in hashtags if h])
    urls = URL_RE.findall(body_text)
    emails = EMAIL_RE.findall(body_text)
    tgs = TG_RE.findall(body_text)
    # First non-empty cleaned line is used as the title.
    title = None
    for ln in body_text.splitlines():
        t = clean_md_text(ln)
        if t:
            title = t[:120]
            break
    # First line that smells like a salary ("salary", 薪资, 薪酬, "k/", "$").
    salary_line = None
    for ln in body_text.splitlines():
        if any(k in ln.lower() for k in ("salary", "薪资", "薪酬", "k/", "$")):
            salary_line = ln
            break
    salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
        salary_line
    )
    job_type = "招聘" if ("招聘" in body_text or "recruit" in body_text.lower()) else None
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(hashtags, None)
    return StructuredJob(
        source=source,
        source_channel=None,
        parser_name="generic",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=None,
        industry_tags=hashtags,
        company_intro=None,
        company_url=urls[0] if urls else None,
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=title,
        position_tags=hashtags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=[],
        requirements=[],
        apply_email=emails[0] if emails else None,
        apply_telegram=tgs[0] if tgs else None,
        job_source_url=None,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
def parse_dejob_global(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a @DeJob_Global_group post (bilingual DeJob layout).

    Like parse_dejob_official but with English fallbacks for section names
    and looser line matching for the cooperation/position markers.
    """
    job_type = "招聘" if ("招聘" in body_text or "recruit" in body_text.lower()) else None
    company_name = extract_company_name_dejob(body_text)
    # Industry tags: hashtags on the 🏡 line, falling back to all hashtags.
    industry_tags = []
    for ln in body_text.splitlines():
        if "🏡" in ln:
            norm_ln = normalize_md_line(ln)
            industry_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            industry_tags = dedupe([t for t in industry_tags if t])
            break
    if not industry_tags:
        industry_tags = [h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)]
        industry_tags = dedupe([h for h in industry_tags if h])
    # Cooperation line: 合作方式 or English fulltime/parttime/remote keywords.
    cooperation_tags = []
    cooperation_line = None
    for ln in body_text.splitlines():
        low = ln.lower()
        if "合作方式" in ln or "fulltime" in low or "parttime" in low or "remote" in low:
            cooperation_line = ln
            norm_ln = normalize_md_line(ln)
            cooperation_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            cooperation_tags = dedupe([t for t in cooperation_tags if t])
            break
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(cooperation_tags, cooperation_line)
    # Position tags: 待招岗位 or 📚 line; fall back to industry tags.
    position_tags = []
    for ln in body_text.splitlines():
        if "待招岗位" in ln or "📚" in ln:
            norm_ln = normalize_md_line(ln)
            position_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            position_tags = dedupe([t for t in position_tags if t])
            break
    if not position_tags:
        position_tags = industry_tags
    position_name = position_tags[0] if position_tags else extract_first_nonempty_line(body_text)
    intro_sec = extract_section(body_text, "Introduction") or extract_section(body_text, "简介")
    urls = extract_urls(body_text)
    # company_url: prefer a non-dejob.top/jobDetail URL when one exists
    # (the jobDetail link is the posting itself, not the company site).
    company_url = extract_first_url_by_keyword(body_text, ["dejob.top/jobDetail"])
    if company_url and urls:
        for u in urls:
            if "dejob.top/jobDetail" not in u:
                company_url = u
                break
    if not company_url:
        company_url = extract_first_url(intro_sec) or (urls[0] if urls else None)
    company_intro = None
    if intro_sec:
        intro_lines = [ln for ln in intro_sec.splitlines() if not URL_RE.search(ln)]
        company_intro = clean_md_text("\n".join(intro_lines)) or None
    salary_line = None
    for ln in body_text.splitlines():
        if "薪酬" in ln or "salary" in ln.lower():
            salary_line = ln
            break
    salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
        salary_line
    )
    responsibilities = extract_list_section(body_text, "岗位职责") or extract_list_section(
        body_text, "Responsibilities"
    )
    requirements = extract_list_section(body_text, "岗位要求") or extract_list_section(
        body_text, "Requirements"
    )
    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)
    # job_source_url: prefer the dejob.top/jobDetail link, then any URL.
    job_source_url = extract_first_url_by_keyword(body_text, ["dejob.top/jobDetail"])
    if not job_source_url:
        urls = extract_urls(body_text)
        job_source_url = urls[0] if urls else None
    return StructuredJob(
        source=source,
        source_channel="DeJob",
        parser_name="dejob_global",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=company_name,
        industry_tags=industry_tags,
        company_intro=company_intro,
        company_url=company_url,
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=position_name,
        position_tags=position_tags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=responsibilities,
        requirements=requirements,
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url or company_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
def parse_remote_cn(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a @remote_cn post (title line + 摘要 summary + hashtags)."""
    lines = [clean_md_text(ln) for ln in body_text.splitlines() if clean_md_text(ln)]
    # First non-empty line is the posting title.
    title = lines[0] if lines else None
    hashtags = [h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)]
    hashtags = dedupe([h for h in hashtags if h])
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(hashtags, None)
    # 摘要 (summary) line doubles as the salary source and company intro.
    summary_line = None
    for ln in lines:
        if ln.startswith("摘要:"):
            summary_line = ln
            break
    salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
        summary_line
    )
    urls = extract_urls(body_text)
    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)
    # remote_cn often places the detail link right below the title line,
    # so scan only the first few raw lines for it.
    top_url = None
    raw_lines = [ln.strip() for ln in body_text.splitlines() if ln.strip()]
    for ln in raw_lines[:6]:
        found = URL_RE.findall(ln)
        if found:
            top_url = found[0]
            break
    job_source_url = (
        top_url
        or extract_first_url_by_keyword(body_text, ["remote-info.cn/jobs/"])
        or (urls[0] if urls else None)
    )
    job_type = "招聘" if ("招聘" in body_text or "job" in body_text.lower()) else None
    return StructuredJob(
        source=source,
        source_channel="remote_cn",
        parser_name="remote_cn",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=None,
        industry_tags=hashtags,
        company_intro=summary_line.replace("摘要:", "", 1).strip() if summary_line else None,
        company_url=job_source_url or (urls[0] if urls else None),
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=title,
        position_tags=hashtags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=[],
        requirements=[],
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
def parse_cryptojobslist_source(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a @cryptojobslist post (English title + apply link layout)."""
    lines = [clean_md_text(ln) for ln in body_text.splitlines() if clean_md_text(ln)]
    # First non-empty line is the posting title.
    title = lines[0] if lines else None
    urls = extract_urls(body_text)
    hashtags = [h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)]
    hashtags = dedupe([h for h in hashtags if h])
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(hashtags, None)
    # First line that smells like a salary ("salary", "$", "usd").
    salary_line = None
    for ln in lines:
        if any(k in ln.lower() for k in ("salary", "$", "usd")):
            salary_line = ln
            break
    salary_raw, salary_currency, salary_min, salary_max, salary_period = parse_salary(
        salary_line
    )
    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)
    apply_link = extract_apply_link(body_text)
    # Prefer the explicit apply link, then a cryptojobslist.com URL, then any URL.
    job_source_url = (
        apply_link
        or extract_first_url_by_keyword(body_text, ["cryptojobslist.com"])
        or (urls[0] if urls else None)
    )
    job_type = "招聘" if ("job" in body_text.lower() or "hiring" in body_text.lower()) else None
    return StructuredJob(
        source=source,
        source_channel="cryptojobslist",
        parser_name="cryptojobslist",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=None,
        industry_tags=hashtags,
        company_intro=None,
        company_url=job_source_url or (urls[0] if urls else None),
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=title,
        position_tags=hashtags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=[],
        requirements=[],
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
2026-02-26 20:00:06 +08:00
def route_parse(row: tuple) -> StructuredJob:
    """Dispatch a raw messages row to the parser registered for its source.

    *row* is (source, chat_id, message_id, content, message_date); unknown
    sources fall back to parse_generic.
    """
    source, chat_id, message_id, content, message_date = row
    raw_content = content or ""
    body_text = preprocess_body_text(strip_meta_lines(raw_content))
    dispatch = {
        "@DeJob_official": parse_dejob_official,
        "@DeJob_Global_group": parse_dejob_global,
        "@remote_cn": parse_remote_cn,
        "@cryptojobslist": parse_cryptojobslist_source,
    }
    parser = dispatch.get(source, parse_generic)
    return parser(source, chat_id, message_id, message_date, body_text, raw_content)
def upsert_structured(conn, item: StructuredJob):
    """Insert or refresh one row in structured_jobs.

    Relies on UNIQUE KEY uk_source_message (source, message_id): re-running
    the pipeline updates the existing row instead of duplicating it. List
    fields are serialized to the *_json columns; job_location_tags keeps
    NULL distinct from an empty list.
    """
    # NOTE(review): VALUES() inside ON DUPLICATE KEY UPDATE is deprecated as
    # of MySQL 8.0.20 (row-alias syntax replaces it) — still works, but worth
    # confirming against the target server version.
    with conn.cursor() as cur:
        cur.execute(
            """
            INSERT INTO structured_jobs (
                source, source_channel, parser_name, parser_version, chat_id, message_id,
                message_date, job_type, company_name, industry_tags_json, company_intro,
                company_url, work_mode, job_nature, job_location_text, job_location_tags_json, employment_type_raw,
                position_name, position_tags_json,
                salary_raw, salary_currency, salary_min, salary_max, salary_period,
                responsibilities_json, requirements_json, apply_email, apply_telegram,
                job_source_url, body_text, raw_content
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON DUPLICATE KEY UPDATE
                source_channel=VALUES(source_channel),
                parser_name=VALUES(parser_name),
                parser_version=VALUES(parser_version),
                chat_id=VALUES(chat_id),
                message_date=VALUES(message_date),
                job_type=VALUES(job_type),
                company_name=VALUES(company_name),
                industry_tags_json=VALUES(industry_tags_json),
                company_intro=VALUES(company_intro),
                company_url=VALUES(company_url),
                work_mode=VALUES(work_mode),
                job_nature=VALUES(job_nature),
                job_location_text=VALUES(job_location_text),
                job_location_tags_json=VALUES(job_location_tags_json),
                employment_type_raw=VALUES(employment_type_raw),
                position_name=VALUES(position_name),
                position_tags_json=VALUES(position_tags_json),
                salary_raw=VALUES(salary_raw),
                salary_currency=VALUES(salary_currency),
                salary_min=VALUES(salary_min),
                salary_max=VALUES(salary_max),
                salary_period=VALUES(salary_period),
                responsibilities_json=VALUES(responsibilities_json),
                requirements_json=VALUES(requirements_json),
                apply_email=VALUES(apply_email),
                apply_telegram=VALUES(apply_telegram),
                job_source_url=VALUES(job_source_url),
                body_text=VALUES(body_text),
                raw_content=VALUES(raw_content),
                cleaned_at=CURRENT_TIMESTAMP
            """,
            (
                item.source,
                item.source_channel,
                item.parser_name,
                item.parser_version,
                item.chat_id,
                item.message_id,
                item.message_date,
                item.job_type,
                item.company_name,
                json.dumps(item.industry_tags, ensure_ascii=False),
                item.company_intro,
                item.company_url,
                item.work_mode,
                item.job_nature,
                item.job_location_text,
                # Preserve NULL vs empty-list distinction for location tags.
                json.dumps(item.job_location_tags, ensure_ascii=False)
                if item.job_location_tags is not None
                else None,
                item.employment_type_raw,
                item.position_name,
                json.dumps(item.position_tags, ensure_ascii=False),
                item.salary_raw,
                item.salary_currency,
                item.salary_min,
                item.salary_max,
                item.salary_period,
                json.dumps(item.responsibilities, ensure_ascii=False),
                json.dumps(item.requirements, ensure_ascii=False),
                item.apply_email,
                item.apply_telegram,
                item.job_source_url,
                item.body_text,
                item.raw_content,
            ),
        )
def is_recruitment_job(item: StructuredJob) -> bool:
    """True when the parsed message was classified as a recruitment post."""
    return "招聘" == item.job_type
def has_usable_job_link(item: StructuredJob) -> bool:
    """True when the item carries a non-blank job_source_url."""
    link = item.job_source_url or ""
    return bool(link.strip())
2026-02-26 20:00:06 +08:00
def get_last_processed_row_id(conn, pipeline_name: str) -> int:
    """Checkpoint value for *pipeline_name*; 0 when no row exists yet."""
    query = (
        "SELECT COALESCE(last_message_row_id, 0) "
        "FROM clean_state WHERE pipeline_name=%s"
    )
    with conn.cursor() as cur:
        cur.execute(query, (pipeline_name,))
        row = cur.fetchone()
    return int(row[0]) if row else 0
def set_last_processed_row_id(conn, pipeline_name: str, row_id: int):
    """Upsert the incremental-cleaning checkpoint for *pipeline_name*."""
    sql = """
        INSERT INTO clean_state (pipeline_name, last_message_row_id, updated_at)
        VALUES (%s, %s, NOW())
        ON DUPLICATE KEY UPDATE
            last_message_row_id=VALUES(last_message_row_id),
            updated_at=NOW()
    """
    with conn.cursor() as cur:
        cur.execute(sql, (pipeline_name, row_id))
def main():
    """Run one incremental cleaning pass: messages -> structured_jobs.

    Reads rows newer than the stored checkpoint, routes each through the
    source-specific parser, and upserts recruitment posts that carry a
    usable job link. The checkpoint advances past skipped rows too, so they
    are never re-read.
    """
    mysql_cfg = load_mysql_config()
    conn = connect_mysql(mysql_cfg)
    try:
        init_target_db(conn)
        last_row_id = get_last_processed_row_id(conn, PIPELINE_NAME)
        logger.info(f"增量清洗起点 messages.id > {last_row_id}")
        with conn.cursor() as src_cur:
            src_cur.execute(
                """
                SELECT id, source, chat_id, message_id, content, date
                FROM messages
                WHERE id > %s
                ORDER BY id ASC
                """,
                (last_row_id,),
            )
            rows = src_cur.fetchall()
        processed = 0
        inserted = 0
        skipped_non_recruit = 0
        skipped_no_link = 0
        by_parser = {}
        max_row_id = last_row_id
        for row in rows:
            row_id, source, chat_id, message_id, content, message_date = row
            item = route_parse((source, chat_id, message_id, content, message_date))
            processed += 1
            by_parser[item.parser_name] = by_parser.get(item.parser_name, 0) + 1
            # Advance the checkpoint even over rows we end up skipping.
            if row_id > max_row_id:
                max_row_id = row_id
            if not is_recruitment_job(item):
                skipped_non_recruit += 1
                continue
            if not has_usable_job_link(item):
                skipped_no_link += 1
                continue
            upsert_structured(conn, item)
            inserted += 1
            # Periodic progress log every 500 processed rows.
            if processed % 500 == 0:
                logger.info(
                    f"[clean] processed={processed}, inserted={inserted}, "
                    f"skipped_non_recruit={skipped_non_recruit}, skipped_no_link={skipped_no_link}"
                )
        if max_row_id > last_row_id:
            set_last_processed_row_id(conn, PIPELINE_NAME, max_row_id)
            logger.info(f"更新检查点 last_message_row_id={max_row_id}")
        with conn.cursor() as cur:
            cur.execute("SELECT count(*) FROM structured_jobs")
            total = cur.fetchone()[0]
        logger.info(
            "[done] "
            f"structured_jobs={total}, inserted={inserted}, skipped_non_recruit={skipped_non_recruit}, "
            f"skipped_no_link={skipped_no_link}, "
            f"target=mysql.structured_jobs, parsers={by_parser}"
        )
        if processed == 0:
            logger.info("无新增消息,清洗完成")
    except Exception:
        logger.exception("清洗任务失败")
        raise
    finally:
        conn.close()
if __name__ == "__main__":
main()