完成所有来源数据清洗和表格导入

This commit is contained in:
BoliviaYu
2026-03-05 23:55:18 +08:00
parent 5efb8fc9ed
commit 70fce8ebab
9 changed files with 1887 additions and 255 deletions

View File

@@ -362,6 +362,81 @@ def extract_apply_telegram(body_text: str) -> str | None:
return handles[0] if handles else None
def extract_urls(body_text: str) -> list[str]:
    """Return every URL found in *body_text*, order-preserving and de-duplicated."""
    found = URL_RE.findall(body_text)
    return dedupe(found)
def extract_first_url_by_keyword(body_text: str, keywords: list[str]) -> str | None:
    """Return the first URL in *body_text* containing any keyword (case-insensitive)."""
    lowered_keys = [k.lower() for k in keywords]
    for candidate in extract_urls(body_text):
        lowered = candidate.lower()
        if any(key in lowered for key in lowered_keys):
            return candidate
    return None
def extract_first_nonempty_line(body_text: str) -> str | None:
    """Return the first line that is non-empty after markdown cleanup, else None."""
    cleaned = (clean_md_text(line) for line in body_text.splitlines())
    return next((text for text in cleaned if text), None)
def normalize_possible_url(raw: str) -> str | None:
    """Coerce a free-text token into an absolute URL, or return None.

    Accepts tokens that already carry an http(s) scheme, start with "www.",
    or merely look domain-shaped (e.g. company.com/apply); the latter two
    are prefixed with https://.
    """
    token = clean_md_text(raw or "")
    if token:
        # Shave common wrapping punctuation before inspecting the token.
        token = token.strip("()[]<>.,;\"' ")
    if not token:
        return None
    lowered = token.lower()
    if lowered.startswith(("http://", "https://")):
        return token
    if lowered.startswith("www."):
        return "https://" + token
    # simple domain-style fallback, e.g. company.com/apply
    spaceless = " " not in token
    if spaceless and "." in token and "/" in token:
        return "https://" + token
    if spaceless and re.fullmatch(r"[A-Za-z0-9.-]+\.[A-Za-z]{2,}", token):
        return "https://" + token
    return None
def extract_apply_link(body_text: str) -> str | None:
    """Best-effort extraction of an application link from a message body.

    Priority 1: lines mentioning applying ("apply" / "申请" / "投递") — a
    direct URL on the line wins; otherwise the text after a ':' / '：' / '-'
    separator is tokenized and normalized into a URL.
    Priority 2: any URL in the body whose text looks like an apply page.
    Returns None when nothing plausible is found.
    """
    # Priority 1: explicit apply-like lines.
    for ln in body_text.splitlines():
        low = ln.lower()
        if "apply" not in low and "申请" not in ln and "投递" not in ln:
            continue
        # direct URL in line
        line_urls = URL_RE.findall(ln)
        if line_urls:
            return line_urls[0]
        # try parse right side after ':' / '：' / '-'
        # BUG FIX: the second separator had degraded to an empty string
        # ('elif "" in ln: rhs = ln.split("", 1)[1]'); "" is a substring of
        # every line and str.split("") raises ValueError, so any apply line
        # without a URL or ASCII colon crashed. Restore the fullwidth colon.
        if ":" in ln:
            rhs = ln.split(":", 1)[1]
        elif "：" in ln:
            rhs = ln.split("：", 1)[1]
        elif "-" in ln:
            rhs = ln.split("-", 1)[1]
        else:
            rhs = ln
        for token in re.split(r"\s+", rhs.strip()):
            u = normalize_possible_url(token)
            if u:
                return u
    # Priority 2: first URL that looks like an apply page.
    for u in extract_urls(body_text):
        lu = u.lower()
        if "apply" in lu or "job" in lu or "careers" in lu:
            return u
    return None
def infer_employment_fields(
tags: list[str], raw_line: str | None
) -> tuple[str, str, str | None, list[str] | None, str | None]:
@@ -615,6 +690,300 @@ def parse_generic(
)
def parse_dejob_global(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a DeJob Global group message into a StructuredJob.

    Walks the body line by line, keying sections off the channel's emoji /
    Chinese markers (🏡 industry line, 合作方式 cooperation terms,
    待招岗位/📚 open positions, 薪酬 salary) and falling back to hashtags or
    the first URL when a marker line is absent.
    """
    # "招聘" means "recruiting"; either marker classifies the message as a job post.
    job_type = "招聘" if ("招聘" in body_text or "recruit" in body_text.lower()) else None
    company_name = extract_company_name_dejob(body_text)
    # Industry tags: prefer the 🏡-marked line; fall back to all hashtags.
    # NOTE(review): MD_TAG_RE presumably matches markdown-styled tag tokens —
    # confirm against its definition elsewhere in this file.
    industry_tags = []
    for ln in body_text.splitlines():
        if "🏡" in ln:
            norm_ln = normalize_md_line(ln)
            industry_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            industry_tags = dedupe([t for t in industry_tags if t])
            break
    if not industry_tags:
        industry_tags = [h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)]
        industry_tags = dedupe([h for h in industry_tags if h])
    # Cooperation/employment tags from the 合作方式 line or fulltime/parttime/remote keywords.
    cooperation_tags = []
    cooperation_line = None
    for ln in body_text.splitlines():
        low = ln.lower()
        if "合作方式" in ln or "fulltime" in low or "parttime" in low or "remote" in low:
            cooperation_line = ln
            norm_ln = normalize_md_line(ln)
            cooperation_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            cooperation_tags = dedupe([t for t in cooperation_tags if t])
            break
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(cooperation_tags, cooperation_line)
    # Position tags from the 待招岗位 / 📚 line; reuse industry tags when missing.
    position_tags = []
    for ln in body_text.splitlines():
        if "待招岗位" in ln or "📚" in ln:
            norm_ln = normalize_md_line(ln)
            position_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            position_tags = dedupe([t for t in position_tags if t])
            break
    if not position_tags:
        position_tags = industry_tags
    position_name = position_tags[0] if position_tags else extract_first_nonempty_line(body_text)
    intro_sec = extract_section(body_text, "Introduction") or extract_section(body_text, "简介")
    urls = extract_urls(body_text)
    # Company URL: the dejob.top/jobDetail link is the job's own detail page,
    # so prefer the first URL that is NOT that link as the company site.
    company_url = extract_first_url_by_keyword(body_text, ["dejob.top/jobDetail"])
    if company_url and urls:
        for u in urls:
            if "dejob.top/jobDetail" not in u:
                company_url = u
                break
    if not company_url:
        company_url = extract_first_url(intro_sec) or (urls[0] if urls else None)
    # Company intro: intro-section text with URL-bearing lines removed.
    company_intro = None
    if intro_sec:
        intro_lines = [ln for ln in intro_sec.splitlines() if not URL_RE.search(ln)]
        company_intro = clean_md_text("\n".join(intro_lines)) or None
    # Salary from the 薪酬 (salary) line; "$" in the raw text implies USD.
    salary_line = None
    for ln in body_text.splitlines():
        if "薪酬" in ln or "salary" in ln.lower():
            salary_line = ln
            break
    salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
    salary_currency = "USD" if salary_raw and "$" in salary_raw else None
    responsibilities = extract_list_section(body_text, "岗位职责") or extract_list_section(
        body_text, "Responsibilities"
    )
    requirements = extract_list_section(body_text, "岗位要求") or extract_list_section(
        body_text, "Requirements"
    )
    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)
    # Canonical job link is the dejob.top detail page when present, else any URL.
    job_source_url = extract_first_url_by_keyword(body_text, ["dejob.top/jobDetail"])
    if not job_source_url:
        urls = extract_urls(body_text)
        job_source_url = urls[0] if urls else None
    return StructuredJob(
        source=source,
        source_channel="DeJob",
        parser_name="dejob_global",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=company_name,
        industry_tags=industry_tags,
        company_intro=company_intro,
        company_url=company_url,
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=position_name,
        position_tags=position_tags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=responsibilities,
        requirements=requirements,
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url or company_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
def parse_remote_cn(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a remote_cn channel message into a StructuredJob record."""
    cleaned_lines = [
        text for text in (clean_md_text(raw) for raw in body_text.splitlines()) if text
    ]
    title = cleaned_lines[0] if cleaned_lines else None

    tags = dedupe(
        [t for t in (h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)) if t]
    )
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(tags, None)

    # The 摘要 (summary) line doubles as the intro and the salary source.
    summary_line = next((ln for ln in cleaned_lines if ln.startswith("摘要:")), None)
    salary_raw, salary_min, salary_max, salary_period = parse_salary(summary_line)
    salary_currency = "USD" if salary_raw and "$" in salary_raw else None

    urls = extract_urls(body_text)
    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)

    # remote_cn often places the detail link right below the title line.
    top_url = None
    nonblank = [raw.strip() for raw in body_text.splitlines() if raw.strip()]
    for candidate_line in nonblank[:6]:
        matches = URL_RE.findall(candidate_line)
        if matches:
            top_url = matches[0]
            break

    job_source_url = (
        top_url
        or extract_first_url_by_keyword(body_text, ["remote-info.cn/jobs/"])
        or (urls[0] if urls else None)
    )
    job_type = "招聘" if ("招聘" in body_text or "job" in body_text.lower()) else None
    return StructuredJob(
        source=source,
        source_channel="remote_cn",
        parser_name="remote_cn",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=None,
        industry_tags=tags,
        company_intro=summary_line.replace("摘要:", "", 1).strip() if summary_line else None,
        company_url=job_source_url or (urls[0] if urls else None),
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=title,
        position_tags=tags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=[],
        requirements=[],
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
def parse_cryptojobslist_source(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a cryptojobslist channel message into a StructuredJob record."""
    cleaned_lines = [
        text for text in (clean_md_text(raw) for raw in body_text.splitlines()) if text
    ]
    title = cleaned_lines[0] if cleaned_lines else None
    urls = extract_urls(body_text)

    tags = dedupe(
        [t for t in (h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)) if t]
    )
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(tags, None)

    # First line mentioning salary-like markers feeds the salary parser.
    salary_line = next(
        (ln for ln in cleaned_lines if any(k in ln.lower() for k in ("salary", "$", "usd"))),
        None,
    )
    salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
    salary_currency = "USD" if salary_raw and "$" in salary_raw else None

    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)
    apply_link = extract_apply_link(body_text)
    job_source_url = (
        apply_link
        or extract_first_url_by_keyword(body_text, ["cryptojobslist.com"])
        or (urls[0] if urls else None)
    )
    lower_body = body_text.lower()
    job_type = "招聘" if ("job" in lower_body or "hiring" in lower_body) else None
    return StructuredJob(
        source=source,
        source_channel="cryptojobslist",
        parser_name="cryptojobslist",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=None,
        industry_tags=tags,
        company_intro=None,
        company_url=job_source_url or (urls[0] if urls else None),
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=title,
        position_tags=tags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=[],
        requirements=[],
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
def route_parse(row: tuple) -> StructuredJob:
source, chat_id, message_id, content, message_date = row
raw_content = content or ""
@@ -624,6 +993,18 @@ def route_parse(row: tuple) -> StructuredJob:
return parse_dejob_official(
source, chat_id, message_id, message_date, body_text, raw_content
)
if source == "@DeJob_Global_group":
return parse_dejob_global(
source, chat_id, message_id, message_date, body_text, raw_content
)
if source == "@remote_cn":
return parse_remote_cn(
source, chat_id, message_id, message_date, body_text, raw_content
)
if source == "@cryptojobslist":
return parse_cryptojobslist_source(
source, chat_id, message_id, message_date, body_text, raw_content
)
return parse_generic(source, chat_id, message_id, message_date, body_text, raw_content)
@@ -715,6 +1096,10 @@ def is_recruitment_job(item: StructuredJob) -> bool:
return item.job_type == "招聘"
def has_usable_job_link(item: StructuredJob) -> bool:
    """True when the job carries a non-blank source URL (None counts as blank)."""
    link = item.job_source_url or ""
    return link.strip() != ""
def get_last_processed_row_id(conn, pipeline_name: str) -> int:
with conn.cursor() as cur:
cur.execute(
@@ -763,6 +1148,7 @@ def main():
processed = 0
inserted = 0
skipped_non_recruit = 0
skipped_no_link = 0
by_parser = {}
max_row_id = last_row_id
@@ -778,12 +1164,17 @@ def main():
skipped_non_recruit += 1
continue
if not has_usable_job_link(item):
skipped_no_link += 1
continue
upsert_structured(conn, item)
inserted += 1
if processed % 500 == 0:
logger.info(
f"[clean] processed={processed}, inserted={inserted}, skipped_non_recruit={skipped_non_recruit}"
f"[clean] processed={processed}, inserted={inserted}, "
f"skipped_non_recruit={skipped_non_recruit}, skipped_no_link={skipped_no_link}"
)
if max_row_id > last_row_id:
@@ -797,6 +1188,7 @@ def main():
logger.info(
"[done] "
f"structured_jobs={total}, inserted={inserted}, skipped_non_recruit={skipped_non_recruit}, "
f"skipped_no_link={skipped_no_link}, "
f"target=mysql.structured_jobs, parsers={by_parser}"
)
if processed == 0: