first init

2026-02-26 20:00:06 +08:00
commit 5efb8fc9ed
7 changed files with 1707 additions and 0 deletions
--- a/clean_to_structured.py
+++ b/clean_to_structured.py
@@ -0,0 +1,812 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+Source-aware structured cleaning (MySQL).
+
+Input table:  messages
+Output table: structured_jobs
+"""
+
+import json
+import logging
+import os
+import re
+from dataclasses import dataclass
+
+import pymysql
+
+CONFIG_FILE = "config.json"  # JSON config path, opened relative to the working directory
+PIPELINE_NAME = "structured_cleaner_v1"  # key of this pipeline's checkpoint row in clean_state
+# Precompiled patterns shared by the extract_* helpers and the parsers below.
+URL_RE = re.compile(r"https?://[^\s)]+", re.IGNORECASE)
+EMAIL_RE = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
+TG_RE = re.compile(r"(?<!\w)@[a-zA-Z0-9_]{4,}")  # @handle not preceded by a word character
+HASHTAG_RE = re.compile(r"#([A-Za-z0-9_\-\u4e00-\u9fff·]+)")
+MD_TAG_RE = re.compile(r"#(?:\*\*)?([A-Za-z0-9_\-\u4e00-\u9fff·]+)(?:\*\*)?")  # "**" may wrap the tag
+# Heading keywords: extract_section() stops collecting at any line containing one of them.
+SECTION_KEYS = (
+    "简介",
+    "合作方式",
+    "待招岗位",
+    "薪酬福利",
+    "岗位职责",
+    "岗位要求",
+    "申请方式",
+    "岗位来源",
+)
+
+
+@dataclass
+class StructuredJob:  # one parsed posting; upsert_structured() writes it to structured_jobs
+    source: str  # messages.source of the originating row
+    source_channel: str | None  # "DeJob" from parse_dejob_official, None from parse_generic
+    parser_name: str  # "dejob_official" or "generic"
+    parser_version: str
+    chat_id: int | None
+    message_id: int  # unique together with source (uk_source_message)
+    message_date: str  # taken from messages.date; stored in a DATETIME column
+    job_type: str | None  # "招聘" when the parser recognised a recruitment post, else None
+    company_name: str | None
+    industry_tags: list[str]
+    company_intro: str | None
+    company_url: str | None
+    work_mode: str  # "remote", "onsite", "hybrid" or "unknown"
+    job_nature: str  # "full_time", "part_time", "contract", "intern", "freelance" or "unknown"
+    job_location_text: str | None  # first entry of job_location_tags, if any
+    job_location_tags: list[str] | None  # None (not []) when no location tag was found
+    employment_type_raw: str | None
+    position_name: str | None
+    position_tags: list[str]
+    salary_raw: str | None
+    salary_currency: str | None  # "USD" when salary_raw contains "$", else None
+    salary_min: int | None  # first number in the salary line
+    salary_max: int | None  # second number in the salary line
+    salary_period: str | None  # "month", "year", "day" or None
+    responsibilities: list[str]
+    requirements: list[str]
+    apply_email: str | None
+    apply_telegram: str | None  # includes the leading "@"
+    job_source_url: str | None
+    body_text: str  # preprocessed text; "empty_message" when that text is empty
+    raw_content: str  # messages.content as read ("" if it was NULL)
+
+
+def setup_logger() -> logging.Logger:
+    """Return the "clean_to_structured" logger, attaching console and file handlers once."""
+    os.makedirs("logs", exist_ok=True)
+    log = logging.getLogger("clean_to_structured")
+    log.setLevel(logging.INFO)
+    if not log.handlers:  # handlers left by an earlier call are reused as they are
+        formatter = logging.Formatter(
+            "[%(asctime)s] [%(levelname)s] %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+        )
+        handlers = (
+            logging.StreamHandler(),
+            logging.FileHandler("logs/clean_to_structured.log", encoding="utf-8"),
+        )
+        for handler in handlers:
+            handler.setFormatter(formatter)
+            log.addHandler(handler)
+    return log
+
+
+logger = setup_logger()  # module-wide logger; configured once, when the module is loaded
+
+
+def load_mysql_config() -> dict:
+    """Load MySQL connection settings from CONFIG_FILE, with MYSQL_* env-var fallbacks."""
+    if not os.path.exists(CONFIG_FILE):
+        raise FileNotFoundError(f"未找到配置文件: {CONFIG_FILE}")
+    with open(CONFIG_FILE, "r", encoding="utf-8") as handle:
+        section = json.load(handle).get("mysql", {})
+    if not isinstance(section, dict):
+        raise ValueError("配置错误: mysql 必须是对象")
+    # (setting, environment variable, built-in default); a falsy file value defers to the env.
+    fields = (
+        ("host", "MYSQL_HOST", "127.0.0.1"),
+        ("port", "MYSQL_PORT", "3306"),
+        ("user", "MYSQL_USER", "jobs_user"),
+        ("password", "MYSQL_PASSWORD", ""),
+        ("database", "MYSQL_DATABASE", "jobs"),
+        ("charset", "MYSQL_CHARSET", "utf8mb4"),
+    )
+    settings = {key: section.get(key) or os.getenv(env, default) for key, env, default in fields}
+    settings["port"] = int(settings["port"])
+    if not settings["password"]:
+        raise ValueError("配置错误: mysql.password 不能为空")
+    return settings
+
+
+def connect_mysql(cfg: dict):
+    """Open a pymysql connection with autocommit enabled.
+
+    cfg must provide host, port, user, password, database and charset;
+    a missing key raises KeyError before any connection attempt.
+    """
+    wanted = ("host", "port", "user", "password", "database", "charset")
+    params = {name: cfg[name] for name in wanted}
+    # autocommit=True: nothing in this module calls conn.commit() explicitly.
+    return pymysql.connect(autocommit=True, **params)
+
+
+def init_target_db(conn):
+    """Create the structured_jobs and clean_state tables if they do not exist yet."""
+    # Both statements are CREATE TABLE IF NOT EXISTS, so this is run on every start.
+    jobs_ddl = """
+            CREATE TABLE IF NOT EXISTS structured_jobs (
+                id BIGINT PRIMARY KEY AUTO_INCREMENT,
+                source VARCHAR(255) NOT NULL,
+                source_channel VARCHAR(255) NULL,
+                parser_name VARCHAR(64) NOT NULL,
+                parser_version VARCHAR(32) NOT NULL,
+                chat_id BIGINT NULL,
+                message_id BIGINT NOT NULL,
+                message_date DATETIME NOT NULL,
+                job_type VARCHAR(64) NULL,
+                company_name VARCHAR(255) NULL,
+                industry_tags_json JSON NOT NULL,
+                company_intro LONGTEXT NULL,
+                company_url TEXT NULL,
+                work_mode VARCHAR(32) NOT NULL,
+                job_nature VARCHAR(32) NOT NULL,
+                job_location_text VARCHAR(255) NULL,
+                job_location_tags_json JSON NULL,
+                employment_type_raw TEXT NULL,
+                position_name VARCHAR(255) NULL,
+                position_tags_json JSON NOT NULL,
+                salary_raw TEXT NULL,
+                salary_currency VARCHAR(16) NULL,
+                salary_min BIGINT NULL,
+                salary_max BIGINT NULL,
+                salary_period VARCHAR(16) NULL,
+                responsibilities_json JSON NOT NULL,
+                requirements_json JSON NOT NULL,
+                apply_email VARCHAR(255) NULL,
+                apply_telegram VARCHAR(255) NULL,
+                job_source_url TEXT NULL,
+                body_text LONGTEXT NOT NULL,
+                raw_content LONGTEXT NOT NULL,
+                cleaned_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
+                    ON UPDATE CURRENT_TIMESTAMP,
+                UNIQUE KEY uk_source_message (source, message_id),
+                KEY idx_structured_source_date (source, message_date)
+            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
+            """
+    state_ddl = """
+            CREATE TABLE IF NOT EXISTS clean_state (
+                pipeline_name VARCHAR(128) PRIMARY KEY,
+                last_message_row_id BIGINT NOT NULL DEFAULT 0,
+                updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
+                    ON UPDATE CURRENT_TIMESTAMP
+            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
+            """
+    with conn.cursor() as cur:
+        cur.execute(jobs_ddl)
+        cur.execute(state_ddl)
+
+
+def dedupe(values: list[str]) -> list[str]:
+    """Drop falsy entries and repeats from values, keeping first-seen order.
+
+    Items must be hashable; a new list is returned and values is not modified.
+    """
+    # dict keys are unique and remember insertion order, so one pass suffices.
+    first_seen: dict[str, None] = {}
+    for value in values:
+        if value:
+            first_seen.setdefault(value, None)
+    return list(first_seen)
+
+
+def clean_md_text(s: str) -> str:
+    """Strip *, ` and ~~ markers and collapse whitespace; a lone ~ ("3000~5000") is kept."""
+    s = re.sub(r"\*+|`", "", s)
+    s = re.sub(r"~{2,}", "", s)  # strikethrough runs only: a single ~ separates ranges
+    return re.sub(r"\s+", " ", s).strip()
+
+
+def normalize_md_line(s: str) -> str:
+    """Delete "**", "`" and "~~" (in that order), turn U+3000 into a space, squeeze whitespace."""
+    for marker, filler in (("**", ""), ("`", ""), ("~~", ""), ("\u3000", " ")):
+        s = s.replace(marker, filler)
+    return re.sub(r"\s+", " ", s).strip()
+
+
+def clean_company_name(s: str | None) -> str | None:
+    """Clean Markdown from a company name and trim edge separators; None if nothing is left."""
+    # Trimmed from both ends: space, "-", "|", ":", the full-width colon and "#".
+    edge_chars = " -|:：#"
+    trimmed = clean_md_text(s).strip(edge_chars) if s else ""
+    return trimmed or None
+
+
+def parse_salary(raw: str | None) -> tuple[str | None, int | None, int | None, str | None]:
+    """Parse a salary line into (cleaned text, min, max, period).
+
+    min and max are the first two numbers on the line (fractions truncated); period is
+    "month", "year" or "day" (tested in that order) or None. A falsy raw gives four Nones.
+    """
+    if not raw:
+        return None, None, None, None
+
+    text = clean_md_text(raw)
+    lower = text.lower()
+    # Scan raw, not text: clean_md_text strips marker characters, and when one of them
+    # separates two numbers the digits would fuse into one (e.g. "1000*12" read as 100012).
+    nums = re.findall(r"\d+(?:\.\d+)?", raw.replace(",", ""))
+    salary_min = int(float(nums[0])) if nums else None
+    salary_max = int(float(nums[1])) if len(nums) > 1 else None
+
+    period_keys = (("month", ("month", "月")), ("year", ("year", "年")), ("day", ("day", "日")))
+    period = next((name for name, keys in period_keys if any(k in lower for k in keys)), None)
+    return text, salary_min, salary_max, period
+
+
+def strip_meta_lines(content: str) -> str:
+    """Remove bracketed metadata lines from content and return the rest, stripped.
+
+    A line is dropped when it starts with one of the prefixes below (the trailing
+    space is part of the bracketed ones). Kept lines lose trailing whitespace.
+    None or "" yields "".
+    """
+    meta_prefixes = (
+        "[MEDIA_TYPE] ",
+        "[ACTION_TYPE] ",
+        "[MEDIA_JSON] ",
+        "[ACTION_JSON] ",
+        "phones=",
+        "[MEDIA_TEXT] ",
+    )
+    kept = [ln.rstrip() for ln in (content or "").splitlines() if not ln.startswith(meta_prefixes)]
+    return "\n".join(kept).strip()
+
+
+def preprocess_body_text(body_text: str) -> str:
+    """Rewrite [label](url) as "label url", drop ** __ ~~ ` markers, tidy spaces on each line."""
+    flat = re.sub(r"\[([^\]]+)\]\((https?://[^)\s]+)\)", r"\1 \2", body_text)
+    flat = flat.replace("**", "").replace("__", "").replace("~~", "").replace("`", "")
+    return "\n".join(re.sub(r"[ \t]+", " ", row).strip() for row in flat.splitlines()).strip()
+
+
+def extract_section(body_text: str, section_name: str) -> str | None:
+    """Return the text between the section_name heading and the next heading.
+
+    The heading is the first line containing section_name; collection stops at the
+    next line containing any SECTION_KEYS entry. None if absent or empty.
+    """
+    rows = iter(body_text.splitlines())
+    # any() consumes rows up to and including the heading line, then stops.
+    if not any(section_name in row for row in rows):
+        return None
+
+    chunk = []
+    for row in rows:
+        if any(key in row for key in SECTION_KEYS):
+            break
+        chunk.append(row)
+    section = "\n".join(chunk).strip()
+    return section or None
+
+
+def extract_first_url(text: str | None) -> str | None:
+    """Return the first URL_RE match in text, or None when text is falsy or has no URL."""
+    # URL_RE has no capture groups, so group(0) is the whole URL.
+    hit = URL_RE.search(text) if text else None
+    return hit.group(0) if hit else None
+
+
+def extract_job_source_url(body_text: str) -> str | None:
+    """Return the URL given on a "岗位来源" line, or None.
+
+    Only lines containing that marker are searched; the first URL found wins.
+    """
+    hits = (URL_RE.search(row) for row in body_text.splitlines() if "岗位来源" in row)
+    return next((hit.group(0) for hit in hits if hit), None)
+
+
+def extract_company_name_dejob(body_text: str) -> str | None:
+    """Return the cleaned company name from the first line containing "🏡", or None."""
+    marked = next((row for row in body_text.splitlines() if "🏡" in row), None)
+    if marked is None:
+        return None
+    # Drop the emoji, then keep only what precedes the first "#"; whatever follows
+    # it on the line is not part of the name.
+    name = clean_md_text(marked).replace("🏡", "").strip()
+    return clean_company_name(name.partition("#")[0].strip())
+
+
+def extract_tags_after_key(line: str, key: str) -> list[str]:
+    """Return the deduplicated #tags found after the first occurrence of key in line."""
+    _, found, tail = line.partition(key)
+    hits = MD_TAG_RE.findall(normalize_md_line(tail)) if found else []
+    # dedupe() also discards tags that cleaned down to an empty string.
+    return dedupe([clean_md_text(t).replace("·", " ").strip() for t in hits])
+
+
+def extract_list_section(body_text: str, key: str) -> list[str]:
+    """Return the non-empty lines of section key with their list numbering removed.
+
+    Two prefixes are stripped in turn: digits followed by keycap-emoji marks (as in
+    1️⃣), then digits followed by "." or "、". An absent or empty section gives [].
+    """
+    section = extract_section(body_text, key) or ""
+    bullets = []
+    for row in section.splitlines():
+        row = re.sub(r"^\d+️⃣?\s*", "", clean_md_text(row))
+        bullets.append(re.sub(r"^\d+[\.、]\s*", "", row))
+    return [b for b in bullets if b]
+
+
+def extract_position_name_dejob(body_text: str) -> str | None:
+    """Return the first position tag listed after "待招岗位", or None.
+
+    Rows are tried in order; rows without the marker yield no tags and are passed over.
+    """
+    per_row = (extract_tags_after_key(row, "待招岗位") for row in body_text.splitlines())
+    return next((tags[0] for tags in per_row if tags), None)
+
+
+def extract_apply_email(body_text: str) -> str | None:
+    """Return the first e-mail address EMAIL_RE finds in body_text, or None."""
+    return next(iter(EMAIL_RE.findall(body_text)), None)
+
+
+def extract_apply_telegram(body_text: str) -> str | None:
+    """Return an @handle, preferring the first one on a line mentioning "Telegram".
+
+    Falls back to the first handle anywhere in body_text; None when there is none.
+    """
+    labelled = (TG_RE.search(row) for row in body_text.splitlines() if "Telegram" in row)
+    hit = next((m for m in labelled if m), None) or TG_RE.search(body_text)
+    return hit.group(0) if hit else None
+
+
+def infer_employment_fields(
+    tags: list[str], raw_line: str | None
+) -> tuple[str, str, str | None, list[str] | None, str | None]:
+    """Sort tags into work mode, job nature and location tags.
+
+    Every tag is cleaned, then looked up both as written and lower-cased: first among
+    the remote words, then the on-site words, then the nature table. A tag that matches
+    none of them is treated as a location.
+
+    Returns a 5-tuple:
+        work_mode      "remote", "onsite", "hybrid" (both kinds present) or "unknown"
+        job_nature     the best-ranked nature that was seen, or "unknown"
+        location_text  the first location tag, or None
+        location_tags  every location tag in input order, or None if there are none
+        raw            raw_line after clean_md_text, or None when raw_line is falsy
+    """
+    remote_words = {"远程", "remote", "居家", "在家办公", "home office", "wfh"}
+    onsite_words = {"实地", "现场", "线下", "onsite", "on-site", "坐班", "到岗"}
+    nature_by_word = {
+        "全职": "full_time",
+        "兼职": "part_time",
+        "实习": "intern",
+        "合同": "contract",
+        "contract": "contract",
+        "自由职业": "freelance",
+        "freelance": "freelance",
+    }
+    # Ranking used when several natures are tagged: the earliest entry wins.
+    nature_rank = ("full_time", "part_time", "contract", "intern", "freelance")
+    # Lookup from (remote seen, on-site seen) to the reported work mode.
+    mode_by_flags = {
+        (True, True): "hybrid",
+        (True, False): "remote",
+        (False, True): "onsite",
+        (False, False): "unknown",
+    }
+
+    # dedupe() also drops tags that cleaned down to an empty string.
+    cleaned = dedupe([clean_md_text(t).replace("·", " ").strip() for t in tags])
+
+    seen_remote = False
+    seen_onsite = False
+    natures = set()
+    places = []
+    for tag in cleaned:
+        folded = tag.lower()
+        if tag in remote_words or folded in remote_words:
+            seen_remote = True
+        elif tag in onsite_words or folded in onsite_words:
+            seen_onsite = True
+        else:
+            nature = nature_by_word.get(tag) or nature_by_word.get(folded)
+            if nature:
+                natures.add(nature)
+            else:
+                places.append(tag)
+
+    work_mode = mode_by_flags[(seen_remote, seen_onsite)]
+    job_nature = next((n for n in nature_rank if n in natures), "unknown")
+
+    # An empty location list is reported as None, not [].
+    location_tags = dedupe(places) or None
+    location_text = location_tags[0] if location_tags else None
+    raw = clean_md_text(raw_line) if raw_line else None
+    return work_mode, job_nature, location_text, location_tags, raw
+
+
+def parse_dejob_official(
+    source: str,
+    chat_id: int | None,
+    message_id: int,
+    message_date: str,
+    body_text: str,
+    raw_content: str,
+) -> StructuredJob:
+    """Parse a posting that follows the DeJob channel layout into a StructuredJob.
+
+    Fields are located by marker text, not by position:
+      - the first "🏡" line gives the company name and the industry hashtags;
+      - the first "合作方式" line gives the tags from which infer_employment_fields()
+        derives work mode, job nature and locations;
+      - the first "待招岗位" line gives the position hashtags;
+      - the "简介", "岗位职责" and "岗位要求" sections give the intro and the two lists;
+      - the first line containing 薪酬, Salary or salary is passed to parse_salary().
+
+    source, chat_id, message_id, message_date and raw_content are copied to the result
+    unchanged. body_text is the text that is parsed; it is stored as "empty_message"
+    when empty. job_type is "招聘" only if the text contains "招聘" or "Recruitment".
+    """
+    rows = body_text.splitlines()
+
+    def first_row_with(*markers: str) -> str | None:
+        """Return the first row containing any of markers, or None."""
+        return next((row for row in rows if any(m in row for m in markers)), None)
+
+    def hashtags_on(row: str | None) -> list[str]:
+        """Return the cleaned, deduplicated #tags of row ([] when row is None)."""
+        if row is None:
+            return []
+        found = MD_TAG_RE.findall(normalize_md_line(row))
+        # dedupe() also discards tags that cleaned down to an empty string.
+        return dedupe([clean_md_text(tag).replace("·", " ") for tag in found])
+
+    job_type = "招聘" if ("招聘" in body_text or "Recruitment" in body_text) else None
+    # Header line: company name followed by industry hashtags.
+    company_name = extract_company_name_dejob(body_text)
+    industry_tags = hashtags_on(first_row_with("🏡"))
+
+    # Cooperation line: its tags decide work mode, job nature and locations.
+    cooperation_line = first_row_with("合作方式")
+    (
+        work_mode,
+        job_nature,
+        job_location_text,
+        job_location_tags,
+        employment_type_raw,
+    ) = infer_employment_fields(hashtags_on(cooperation_line), cooperation_line)
+
+    # Position line: all of its hashtags, plus the first tag after the marker as the name.
+    position_tags = hashtags_on(first_row_with("待招岗位"))
+    position_name = extract_position_name_dejob(body_text)
+
+    # Company URL: prefer one inside the intro section, else the first URL anywhere.
+    intro_sec = extract_section(body_text, "简介")
+    company_url = extract_first_url(intro_sec) or extract_first_url(body_text)
+    company_intro = None
+    if intro_sec:
+        # Lines that carry a URL are left out of the intro text.
+        prose = "\n".join(ln for ln in intro_sec.splitlines() if not URL_RE.search(ln))
+        company_intro = clean_md_text(prose) or None
+
+    salary_line = first_row_with("薪酬", "Salary", "salary")
+    salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
+    # "$" is the only currency hint recognised; everything else stays None.
+    salary_currency = "USD" if salary_raw and "$" in salary_raw else None
+
+    responsibilities = extract_list_section(body_text, "岗位职责")
+    requirements = extract_list_section(body_text, "岗位要求")
+
+    apply_email = extract_apply_email(body_text)
+    apply_tg = extract_apply_telegram(body_text)
+    job_source_url = extract_job_source_url(body_text)
+
+    return StructuredJob(
+        source=source,
+        source_channel="DeJob",
+        parser_name="dejob_official",
+        parser_version="v1",
+        chat_id=chat_id,
+        message_id=message_id,
+        message_date=message_date,
+        job_type=job_type,
+        company_name=company_name,
+        industry_tags=industry_tags,
+        company_intro=company_intro,
+        company_url=company_url,
+        work_mode=work_mode,
+        job_nature=job_nature,
+        job_location_text=job_location_text,
+        job_location_tags=job_location_tags,
+        employment_type_raw=employment_type_raw,
+        position_name=position_name,
+        position_tags=position_tags,
+        salary_raw=salary_raw,
+        salary_currency=salary_currency,
+        salary_min=salary_min,
+        salary_max=salary_max,
+        salary_period=salary_period,
+        responsibilities=responsibilities,
+        requirements=requirements,
+        apply_email=apply_email,
+        apply_telegram=apply_tg,
+        job_source_url=job_source_url,
+        body_text=body_text or "empty_message",
+        raw_content=raw_content,
+    )
+
+
+def parse_generic(
+    source: str,
+    chat_id: int | None,
+    message_id: int,
+    message_date: str,
+    body_text: str,
+    raw_content: str,
+) -> StructuredJob:
+    """Fallback parser for sources without a dedicated template.
+
+    The hashtags serve three purposes: they are stored as both industry_tags and
+    position_tags, and they feed infer_employment_fields(). position_name is the
+    first non-empty line after Markdown cleanup, cut to 120 characters. Company
+    name and intro, job_source_url, responsibilities and requirements stay empty.
+    """
+    rows = body_text.splitlines()
+    hashtags = dedupe([h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)])
+
+    def first_match(pattern) -> str | None:
+        """Return the first match of pattern anywhere in body_text, or None."""
+        hit = pattern.search(body_text)
+        return hit.group(0) if hit else None
+
+    # map() is lazy, so lines after the first non-empty one are never cleaned.
+    title = next((t[:120] for t in map(clean_md_text, rows) if t), None)
+
+    salary_hints = ("salary", "薪资", "薪酬", "k/", "$")
+    salary_line = next(
+        (row for row in rows if any(hint in row.lower() for hint in salary_hints)), None
+    )
+    salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
+
+    (
+        work_mode,
+        job_nature,
+        job_location_text,
+        job_location_tags,
+        employment_type_raw,
+    ) = infer_employment_fields(hashtags, None)
+
+    return StructuredJob(
+        source=source,
+        source_channel=None,
+        parser_name="generic",
+        parser_version="v1",
+        chat_id=chat_id,
+        message_id=message_id,
+        message_date=message_date,
+        job_type="招聘" if ("招聘" in body_text or "recruit" in body_text.lower()) else None,
+        company_name=None,
+        industry_tags=hashtags,
+        company_intro=None,
+        company_url=first_match(URL_RE),
+        work_mode=work_mode,
+        job_nature=job_nature,
+        job_location_text=job_location_text,
+        job_location_tags=job_location_tags,
+        employment_type_raw=employment_type_raw,
+        position_name=title,
+        position_tags=hashtags,
+        salary_raw=salary_raw,
+        salary_currency="USD" if salary_raw and "$" in salary_raw else None,
+        salary_min=salary_min,
+        salary_max=salary_max,
+        salary_period=salary_period,
+        responsibilities=[],
+        requirements=[],
+        apply_email=first_match(EMAIL_RE),
+        apply_telegram=first_match(TG_RE),
+        job_source_url=None,
+        body_text=body_text or "empty_message",
+        raw_content=raw_content,
+    )
+
+
+def route_parse(row: tuple) -> StructuredJob:
+    """Pick the parser for a (source, chat_id, message_id, content, message_date) row.
+
+    "@DeJob_official" goes to parse_dejob_official; every other source goes to
+    parse_generic. A falsy content (None or "") is treated as "".
+    """
+    source, chat_id, message_id, content, message_date = row
+    raw = content or ""
+    body = preprocess_body_text(strip_meta_lines(raw))
+    parser = parse_dejob_official if source == "@DeJob_official" else parse_generic
+    return parser(source, chat_id, message_id, message_date, body, raw)
+
+
+def upsert_structured(conn, item: StructuredJob):
+    """Insert item into structured_jobs; on a (source, message_id) clash overwrite that row."""
+    def to_json(value):
+        return json.dumps(value, ensure_ascii=False)
+    sql = """
+            INSERT INTO structured_jobs (
+                source, source_channel, parser_name, parser_version, chat_id, message_id,
+                message_date, job_type, company_name, industry_tags_json, company_intro,
+                company_url, work_mode, job_nature, job_location_text, job_location_tags_json, employment_type_raw,
+                position_name, position_tags_json,
+                salary_raw, salary_currency, salary_min, salary_max, salary_period,
+                responsibilities_json, requirements_json, apply_email, apply_telegram,
+                job_source_url, body_text, raw_content
+            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
+            ON DUPLICATE KEY UPDATE
+                source_channel=VALUES(source_channel),
+                parser_name=VALUES(parser_name),
+                parser_version=VALUES(parser_version),
+                chat_id=VALUES(chat_id),
+                message_date=VALUES(message_date),
+                job_type=VALUES(job_type),
+                company_name=VALUES(company_name),
+                industry_tags_json=VALUES(industry_tags_json),
+                company_intro=VALUES(company_intro),
+                company_url=VALUES(company_url),
+                work_mode=VALUES(work_mode),
+                job_nature=VALUES(job_nature),
+                job_location_text=VALUES(job_location_text),
+                job_location_tags_json=VALUES(job_location_tags_json),
+                employment_type_raw=VALUES(employment_type_raw),
+                position_name=VALUES(position_name),
+                position_tags_json=VALUES(position_tags_json),
+                salary_raw=VALUES(salary_raw),
+                salary_currency=VALUES(salary_currency),
+                salary_min=VALUES(salary_min),
+                salary_max=VALUES(salary_max),
+                salary_period=VALUES(salary_period),
+                responsibilities_json=VALUES(responsibilities_json),
+                requirements_json=VALUES(requirements_json),
+                apply_email=VALUES(apply_email),
+                apply_telegram=VALUES(apply_telegram),
+                job_source_url=VALUES(job_source_url),
+                body_text=VALUES(body_text),
+                raw_content=VALUES(raw_content),
+                cleaned_at=CURRENT_TIMESTAMP
+            """
+    params = (
+        item.source,
+        item.source_channel,
+        item.parser_name,
+        item.parser_version,
+        item.chat_id,
+        item.message_id,
+        item.message_date,
+        item.job_type,
+        item.company_name,
+        to_json(item.industry_tags),
+        item.company_intro,
+        item.company_url,
+        item.work_mode,
+        item.job_nature,
+        item.job_location_text,
+        to_json(item.job_location_tags) if item.job_location_tags is not None else None,
+        item.employment_type_raw,
+        item.position_name,
+        to_json(item.position_tags),
+        item.salary_raw,
+        item.salary_currency,
+        item.salary_min,
+        item.salary_max,
+        item.salary_period,
+        to_json(item.responsibilities),
+        to_json(item.requirements),
+        item.apply_email,
+        item.apply_telegram,
+        item.job_source_url,
+        item.body_text,
+        item.raw_content,
+    )
+    with conn.cursor() as cur:
+        cur.execute(sql, params)
+
+
+def is_recruitment_job(item: StructuredJob) -> bool:  # True for posts main() should store
+    return item.job_type == "招聘"  # the parsers set job_type to "招聘" or None
+
+
+def get_last_processed_row_id(conn, pipeline_name: str) -> int:
+    """Return the stored messages.id checkpoint for pipeline_name, or 0 when none exists."""
+    query = "SELECT COALESCE(last_message_row_id, 0) FROM clean_state WHERE pipeline_name=%s"
+    with conn.cursor() as cur:
+        cur.execute(query, (pipeline_name,))
+        found = cur.fetchone()
+    # fetchone() gives None when the pipeline has no clean_state row yet.
+    return int(found[0]) if found else 0
+
+
+def set_last_processed_row_id(conn, pipeline_name: str, row_id: int):
+    """Save row_id as the checkpoint of pipeline_name, inserting or overwriting its row."""
+    # pipeline_name is the primary key of clean_state, so a second save updates in place.
+    statement = """
+            INSERT INTO clean_state (pipeline_name, last_message_row_id, updated_at)
+            VALUES (%s, %s, NOW())
+            ON DUPLICATE KEY UPDATE
+                last_message_row_id=VALUES(last_message_row_id),
+                updated_at=NOW()
+            """
+    with conn.cursor() as cur:
+        cur.execute(statement, (pipeline_name, row_id))
+
+
+def main():
+    """Parse messages newer than the saved checkpoint into structured_jobs.
+
+    Only rows the parser labels as recruitment are upserted; the checkpoint is
+    advanced once, after the whole batch. Any failure is logged and re-raised.
+    """
+    conn = connect_mysql(load_mysql_config())
+    try:
+        init_target_db(conn)
+        last_row_id = get_last_processed_row_id(conn, PIPELINE_NAME)
+        logger.info(f"增量清洗起点 messages.id > {last_row_id}")
+
+        with conn.cursor() as src_cur:
+            src_cur.execute(
+                """
+                SELECT id, source, chat_id, message_id, content, date
+                FROM messages
+                WHERE id > %s
+                ORDER BY id ASC
+                """,
+                (last_row_id,),
+            )
+            rows = src_cur.fetchall()
+
+        processed = inserted = skipped_non_recruit = 0
+        by_parser = {}
+        max_row_id = last_row_id
+
+        for row_id, source, chat_id, message_id, content, message_date in rows:
+            item = route_parse((source, chat_id, message_id, content, message_date))
+            processed += 1
+            by_parser[item.parser_name] = by_parser.get(item.parser_name, 0) + 1
+            max_row_id = max(max_row_id, row_id)
+
+            if is_recruitment_job(item):
+                upsert_structured(conn, item)
+                inserted += 1
+            else:
+                skipped_non_recruit += 1
+
+            # Progress is reported on every 500th row whether or not that row was kept,
+            # so this check must stay outside the recruitment branch above.
+            if processed % 500 == 0:
+                logger.info(
+                    f"[clean] processed={processed}, inserted={inserted}, skipped_non_recruit={skipped_non_recruit}"
+                )
+
+        if max_row_id > last_row_id:
+            set_last_processed_row_id(conn, PIPELINE_NAME, max_row_id)
+            logger.info(f"更新检查点 last_message_row_id={max_row_id}")
+
+        with conn.cursor() as cur:
+            cur.execute("SELECT count(*) FROM structured_jobs")
+            total = cur.fetchone()[0]
+
+        logger.info(
+            "[done] "
+            f"structured_jobs={total}, inserted={inserted}, skipped_non_recruit={skipped_non_recruit}, "
+            f"target=mysql.structured_jobs, parsers={by_parser}"
+        )
+        if processed == 0:
+            logger.info("无新增消息，清洗完成")
+    except Exception:
+        logger.exception("清洗任务失败")
+        raise
+    finally:
+        conn.close()
+
+
+if __name__ == "__main__":  # run the pipeline only when executed as a script
+    main()