完成所有来源数据清洗和表格导入

This commit is contained in:
BoliviaYu
2026-03-05 23:55:18 +08:00
parent 5efb8fc9ed
commit 70fce8ebab
9 changed files with 1887 additions and 255 deletions

View File

@@ -362,6 +362,81 @@ def extract_apply_telegram(body_text: str) -> str | None:
return handles[0] if handles else None
def extract_urls(body_text: str) -> list[str]:
    """Return every URL found in *body_text*, order-preserving and de-duplicated."""
    found = URL_RE.findall(body_text)
    return dedupe(found)
def extract_first_url_by_keyword(body_text: str, keywords: list[str]) -> str | None:
    """Return the first URL in *body_text* containing any keyword (case-insensitive)."""
    lowered_keys = [k.lower() for k in keywords]
    for candidate in extract_urls(body_text):
        lowered = candidate.lower()
        if any(key in lowered for key in lowered_keys):
            return candidate
    return None
def extract_first_nonempty_line(body_text: str) -> str | None:
    """Return the first line that is non-empty after markdown cleanup, else None."""
    cleaned = (clean_md_text(line) for line in body_text.splitlines())
    return next((text for text in cleaned if text), None)
def normalize_possible_url(raw: str) -> str | None:
    """Coerce a free-text token into an absolute URL, or return None.

    Accepts tokens that already carry an http(s) scheme, start with "www.",
    or merely look domain-shaped (e.g. company.com/apply); the latter two
    are prefixed with https://.
    """
    token = clean_md_text(raw or "")
    if token:
        # Shave common wrapping punctuation before inspecting the token.
        token = token.strip("()[]<>.,;\"' ")
    if not token:
        return None
    lowered = token.lower()
    if lowered.startswith(("http://", "https://")):
        return token
    if lowered.startswith("www."):
        return "https://" + token
    # simple domain-style fallback, e.g. company.com/apply
    spaceless = " " not in token
    if spaceless and "." in token and "/" in token:
        return "https://" + token
    if spaceless and re.fullmatch(r"[A-Za-z0-9.-]+\.[A-Za-z]{2,}", token):
        return "https://" + token
    return None
def extract_apply_link(body_text: str) -> str | None:
    """Best-effort extraction of an application link from a message body.

    Priority 1: lines mentioning applying ("apply" / "申请" / "投递") — a
    direct URL on the line wins; otherwise the text after a ':' / '：' / '-'
    separator is tokenized and normalized into a URL.
    Priority 2: any URL in the body whose text looks like an apply page.
    Returns None when nothing plausible is found.
    """
    # Priority 1: explicit apply-like lines.
    for ln in body_text.splitlines():
        low = ln.lower()
        if "apply" not in low and "申请" not in ln and "投递" not in ln:
            continue
        # direct URL in line
        line_urls = URL_RE.findall(ln)
        if line_urls:
            return line_urls[0]
        # try parse right side after ':' / '：' / '-'
        # BUG FIX: the second separator had degraded to an empty string
        # ('elif "" in ln: rhs = ln.split("", 1)[1]'); "" is a substring of
        # every line and str.split("") raises ValueError, so any apply line
        # without a URL or ASCII colon crashed. Restore the fullwidth colon.
        if ":" in ln:
            rhs = ln.split(":", 1)[1]
        elif "：" in ln:
            rhs = ln.split("：", 1)[1]
        elif "-" in ln:
            rhs = ln.split("-", 1)[1]
        else:
            rhs = ln
        for token in re.split(r"\s+", rhs.strip()):
            u = normalize_possible_url(token)
            if u:
                return u
    # Priority 2: first URL that looks like an apply page.
    for u in extract_urls(body_text):
        lu = u.lower()
        if "apply" in lu or "job" in lu or "careers" in lu:
            return u
    return None
def infer_employment_fields(
tags: list[str], raw_line: str | None
) -> tuple[str, str, str | None, list[str] | None, str | None]:
@@ -615,6 +690,300 @@ def parse_generic(
)
def parse_dejob_global(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a DeJob Global group message into a StructuredJob.

    Walks the body line by line, keying sections off the channel's emoji /
    Chinese markers (🏡 industry line, 合作方式 cooperation terms,
    待招岗位/📚 open positions, 薪酬 salary) and falling back to hashtags or
    the first URL when a marker line is absent.
    """
    # "招聘" means "recruiting"; either marker classifies the message as a job post.
    job_type = "招聘" if ("招聘" in body_text or "recruit" in body_text.lower()) else None
    company_name = extract_company_name_dejob(body_text)
    # Industry tags: prefer the 🏡-marked line; fall back to all hashtags.
    # NOTE(review): MD_TAG_RE presumably matches markdown-styled tag tokens —
    # confirm against its definition elsewhere in this file.
    industry_tags = []
    for ln in body_text.splitlines():
        if "🏡" in ln:
            norm_ln = normalize_md_line(ln)
            industry_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            industry_tags = dedupe([t for t in industry_tags if t])
            break
    if not industry_tags:
        industry_tags = [h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)]
        industry_tags = dedupe([h for h in industry_tags if h])
    # Cooperation/employment tags from the 合作方式 line or fulltime/parttime/remote keywords.
    cooperation_tags = []
    cooperation_line = None
    for ln in body_text.splitlines():
        low = ln.lower()
        if "合作方式" in ln or "fulltime" in low or "parttime" in low or "remote" in low:
            cooperation_line = ln
            norm_ln = normalize_md_line(ln)
            cooperation_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            cooperation_tags = dedupe([t for t in cooperation_tags if t])
            break
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(cooperation_tags, cooperation_line)
    # Position tags from the 待招岗位 / 📚 line; reuse industry tags when missing.
    position_tags = []
    for ln in body_text.splitlines():
        if "待招岗位" in ln or "📚" in ln:
            norm_ln = normalize_md_line(ln)
            position_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            position_tags = dedupe([t for t in position_tags if t])
            break
    if not position_tags:
        position_tags = industry_tags
    position_name = position_tags[0] if position_tags else extract_first_nonempty_line(body_text)
    intro_sec = extract_section(body_text, "Introduction") or extract_section(body_text, "简介")
    urls = extract_urls(body_text)
    # Company URL: the dejob.top/jobDetail link is the job's own detail page,
    # so prefer the first URL that is NOT that link as the company site.
    company_url = extract_first_url_by_keyword(body_text, ["dejob.top/jobDetail"])
    if company_url and urls:
        for u in urls:
            if "dejob.top/jobDetail" not in u:
                company_url = u
                break
    if not company_url:
        company_url = extract_first_url(intro_sec) or (urls[0] if urls else None)
    # Company intro: intro-section text with URL-bearing lines removed.
    company_intro = None
    if intro_sec:
        intro_lines = [ln for ln in intro_sec.splitlines() if not URL_RE.search(ln)]
        company_intro = clean_md_text("\n".join(intro_lines)) or None
    # Salary from the 薪酬 (salary) line; "$" in the raw text implies USD.
    salary_line = None
    for ln in body_text.splitlines():
        if "薪酬" in ln or "salary" in ln.lower():
            salary_line = ln
            break
    salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
    salary_currency = "USD" if salary_raw and "$" in salary_raw else None
    responsibilities = extract_list_section(body_text, "岗位职责") or extract_list_section(
        body_text, "Responsibilities"
    )
    requirements = extract_list_section(body_text, "岗位要求") or extract_list_section(
        body_text, "Requirements"
    )
    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)
    # Canonical job link is the dejob.top detail page when present, else any URL.
    job_source_url = extract_first_url_by_keyword(body_text, ["dejob.top/jobDetail"])
    if not job_source_url:
        urls = extract_urls(body_text)
        job_source_url = urls[0] if urls else None
    return StructuredJob(
        source=source,
        source_channel="DeJob",
        parser_name="dejob_global",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=company_name,
        industry_tags=industry_tags,
        company_intro=company_intro,
        company_url=company_url,
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=position_name,
        position_tags=position_tags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=responsibilities,
        requirements=requirements,
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url or company_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
def parse_remote_cn(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a remote_cn channel message into a StructuredJob record."""
    cleaned_lines = [
        text for text in (clean_md_text(raw) for raw in body_text.splitlines()) if text
    ]
    title = cleaned_lines[0] if cleaned_lines else None

    tags = dedupe(
        [t for t in (h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)) if t]
    )
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(tags, None)

    # The 摘要 (summary) line doubles as the intro and the salary source.
    summary_line = next((ln for ln in cleaned_lines if ln.startswith("摘要:")), None)
    salary_raw, salary_min, salary_max, salary_period = parse_salary(summary_line)
    salary_currency = "USD" if salary_raw and "$" in salary_raw else None

    urls = extract_urls(body_text)
    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)

    # remote_cn often places the detail link right below the title line.
    top_url = None
    nonblank = [raw.strip() for raw in body_text.splitlines() if raw.strip()]
    for candidate_line in nonblank[:6]:
        matches = URL_RE.findall(candidate_line)
        if matches:
            top_url = matches[0]
            break

    job_source_url = (
        top_url
        or extract_first_url_by_keyword(body_text, ["remote-info.cn/jobs/"])
        or (urls[0] if urls else None)
    )
    job_type = "招聘" if ("招聘" in body_text or "job" in body_text.lower()) else None
    return StructuredJob(
        source=source,
        source_channel="remote_cn",
        parser_name="remote_cn",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=None,
        industry_tags=tags,
        company_intro=summary_line.replace("摘要:", "", 1).strip() if summary_line else None,
        company_url=job_source_url or (urls[0] if urls else None),
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=title,
        position_tags=tags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=[],
        requirements=[],
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
def parse_cryptojobslist_source(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a cryptojobslist channel message into a StructuredJob record."""
    cleaned_lines = [
        text for text in (clean_md_text(raw) for raw in body_text.splitlines()) if text
    ]
    title = cleaned_lines[0] if cleaned_lines else None
    urls = extract_urls(body_text)

    tags = dedupe(
        [t for t in (h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)) if t]
    )
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(tags, None)

    # First line mentioning salary-like markers feeds the salary parser.
    salary_line = next(
        (ln for ln in cleaned_lines if any(k in ln.lower() for k in ("salary", "$", "usd"))),
        None,
    )
    salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
    salary_currency = "USD" if salary_raw and "$" in salary_raw else None

    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)
    apply_link = extract_apply_link(body_text)
    job_source_url = (
        apply_link
        or extract_first_url_by_keyword(body_text, ["cryptojobslist.com"])
        or (urls[0] if urls else None)
    )
    lower_body = body_text.lower()
    job_type = "招聘" if ("job" in lower_body or "hiring" in lower_body) else None
    return StructuredJob(
        source=source,
        source_channel="cryptojobslist",
        parser_name="cryptojobslist",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=None,
        industry_tags=tags,
        company_intro=None,
        company_url=job_source_url or (urls[0] if urls else None),
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=title,
        position_tags=tags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=[],
        requirements=[],
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
def route_parse(row: tuple) -> StructuredJob:
source, chat_id, message_id, content, message_date = row
raw_content = content or ""
@@ -624,6 +993,18 @@ def route_parse(row: tuple) -> StructuredJob:
return parse_dejob_official(
source, chat_id, message_id, message_date, body_text, raw_content
)
if source == "@DeJob_Global_group":
return parse_dejob_global(
source, chat_id, message_id, message_date, body_text, raw_content
)
if source == "@remote_cn":
return parse_remote_cn(
source, chat_id, message_id, message_date, body_text, raw_content
)
if source == "@cryptojobslist":
return parse_cryptojobslist_source(
source, chat_id, message_id, message_date, body_text, raw_content
)
return parse_generic(source, chat_id, message_id, message_date, body_text, raw_content)
@@ -715,6 +1096,10 @@ def is_recruitment_job(item: StructuredJob) -> bool:
return item.job_type == "招聘"
def has_usable_job_link(item: StructuredJob) -> bool:
    """True when the job carries a non-blank source URL (None counts as blank)."""
    link = item.job_source_url or ""
    return link.strip() != ""
def get_last_processed_row_id(conn, pipeline_name: str) -> int:
with conn.cursor() as cur:
cur.execute(
@@ -763,6 +1148,7 @@ def main():
processed = 0
inserted = 0
skipped_non_recruit = 0
skipped_no_link = 0
by_parser = {}
max_row_id = last_row_id
@@ -778,12 +1164,17 @@ def main():
skipped_non_recruit += 1
continue
if not has_usable_job_link(item):
skipped_no_link += 1
continue
upsert_structured(conn, item)
inserted += 1
if processed % 500 == 0:
logger.info(
f"[clean] processed={processed}, inserted={inserted}, skipped_non_recruit={skipped_non_recruit}"
f"[clean] processed={processed}, inserted={inserted}, "
f"skipped_non_recruit={skipped_non_recruit}, skipped_no_link={skipped_no_link}"
)
if max_row_id > last_row_id:
@@ -797,6 +1188,7 @@ def main():
logger.info(
"[done] "
f"structured_jobs={total}, inserted={inserted}, skipped_non_recruit={skipped_non_recruit}, "
f"skipped_no_link={skipped_no_link}, "
f"target=mysql.structured_jobs, parsers={by_parser}"
)
if processed == 0: