完成所有来源数据清洗和表格导入
This commit is contained in:
@@ -362,6 +362,81 @@ def extract_apply_telegram(body_text: str) -> str | None:
|
||||
return handles[0] if handles else None
|
||||
|
||||
|
||||
def extract_urls(body_text: str) -> list[str]:
    """Return every URL found in *body_text*, de-duplicated in first-seen order."""
    matches = URL_RE.findall(body_text)
    return dedupe(matches)
|
||||
|
||||
|
||||
def extract_first_url_by_keyword(body_text: str, keywords: list[str]) -> str | None:
    """Return the first URL in *body_text* whose text contains any of *keywords*.

    Matching is case-insensitive; returns None when no URL matches.
    """
    wanted = [k.lower() for k in keywords]
    for candidate in extract_urls(body_text):
        lowered = candidate.lower()
        if any(w in lowered for w in wanted):
            return candidate
    return None
|
||||
|
||||
|
||||
def extract_first_nonempty_line(body_text: str) -> str | None:
    """Return the first line that is non-empty after markdown cleanup, else None."""
    cleaned = (clean_md_text(line) for line in body_text.splitlines())
    return next((text for text in cleaned if text), None)
|
||||
|
||||
|
||||
def normalize_possible_url(raw: str) -> str | None:
    """Best-effort normalization of a free-form token into an absolute URL.

    Strips markdown artifacts and surrounding punctuation, then:
    - keeps http(s) URLs as-is,
    - prefixes "www." tokens and domain-looking tokens with "https://".
    Returns None when the token does not look like a URL at all.
    """
    token = clean_md_text(raw or "")
    if not token:
        return None
    # Shed wrapping punctuation such as "(https://x.com)," or "<url>".
    token = token.strip("()[]<>.,;\"' ")
    if not token:
        return None
    lowered = token.lower()
    if lowered.startswith(("http://", "https://")):
        return token
    if lowered.startswith("www."):
        return "https://" + token
    # simple domain-style fallback, e.g. company.com/apply
    single_word = " " not in token
    if single_word and "." in token and "/" in token:
        return "https://" + token
    # bare domain fallback, e.g. company.io
    if single_word and re.fullmatch(r"[A-Za-z0-9.-]+\.[A-Za-z]{2,}", token):
        return "https://" + token
    return None
|
||||
|
||||
|
||||
def extract_apply_link(body_text: str) -> str | None:
    """Extract the most likely application link from a job-posting body.

    Priority 1: URLs (or URL-like tokens) on lines mentioning apply/申请/投递.
    Priority 2: the first URL anywhere that looks like an apply/job/careers page.
    Returns None when nothing plausible is found.
    """
    # Priority 1: explicit apply-like lines.
    for ln in body_text.splitlines():
        low = ln.lower()
        if "apply" not in low and "申请" not in ln and "投递" not in ln:
            continue

        # direct URL in line
        line_urls = URL_RE.findall(ln)
        if line_urls:
            return line_urls[0]

        # try parse right side after ':' / '：' / '-'
        if ":" in ln:
            rhs = ln.split(":", 1)[1]
        elif "：" in ln:
            # BUG FIX: this branch previously repeated `":" in ln` and was
            # unreachable; CJK postings use the full-width colon "：" here.
            rhs = ln.split("：", 1)[1]
        elif "-" in ln:
            rhs = ln.split("-", 1)[1]
        else:
            rhs = ln

        # Whitespace-split the remainder and take the first URL-like token.
        for token in re.split(r"\s+", rhs.strip()):
            u = normalize_possible_url(token)
            if u:
                return u

    # Priority 2: first URL that looks like an apply page.
    for u in extract_urls(body_text):
        lu = u.lower()
        if "apply" in lu or "job" in lu or "careers" in lu:
            return u
    return None
|
||||
|
||||
|
||||
def infer_employment_fields(
|
||||
tags: list[str], raw_line: str | None
|
||||
) -> tuple[str, str, str | None, list[str] | None, str | None]:
|
||||
@@ -615,6 +690,300 @@ def parse_generic(
|
||||
)
|
||||
|
||||
|
||||
def parse_dejob_global(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a DeJob Global group message into a StructuredJob.

    DeJob posts use emoji-prefixed lines (🏡 industry, 📚 positions) with
    markdown-style tags, plus labelled CJK sections (合作方式, 待招岗位, 薪酬,
    岗位职责, 岗位要求) and dejob.top/jobDetail links.
    """
    # Recruitment flag: present when the body mentions 招聘 or "recruit".
    job_type = "招聘" if ("招聘" in body_text or "recruit" in body_text.lower()) else None

    company_name = extract_company_name_dejob(body_text)

    # Industry tags: prefer markdown tags on the 🏡 line; fall back to hashtags.
    industry_tags = []
    for ln in body_text.splitlines():
        if "🏡" in ln:
            norm_ln = normalize_md_line(ln)
            industry_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            industry_tags = dedupe([t for t in industry_tags if t])
            break
    if not industry_tags:
        industry_tags = [h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)]
        industry_tags = dedupe([h for h in industry_tags if h])

    # Cooperation mode: first line mentioning 合作方式 / fulltime / parttime / remote.
    cooperation_tags = []
    cooperation_line = None
    for ln in body_text.splitlines():
        low = ln.lower()
        if "合作方式" in ln or "fulltime" in low or "parttime" in low or "remote" in low:
            cooperation_line = ln
            norm_ln = normalize_md_line(ln)
            cooperation_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            cooperation_tags = dedupe([t for t in cooperation_tags if t])
            break
    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(cooperation_tags, cooperation_line)

    # Position tags from the 待招岗位 / 📚 line; default to industry tags.
    position_tags = []
    for ln in body_text.splitlines():
        if "待招岗位" in ln or "📚" in ln:
            norm_ln = normalize_md_line(ln)
            position_tags = [
                clean_md_text(t).replace("·", " ") for t in MD_TAG_RE.findall(norm_ln)
            ]
            position_tags = dedupe([t for t in position_tags if t])
            break
    if not position_tags:
        position_tags = industry_tags

    position_name = position_tags[0] if position_tags else extract_first_nonempty_line(body_text)

    intro_sec = extract_section(body_text, "Introduction") or extract_section(body_text, "简介")
    urls = extract_urls(body_text)
    # Company URL: start from the jobDetail link, then prefer the first URL
    # that is NOT a dejob.top/jobDetail link (assumed to be the company site).
    company_url = extract_first_url_by_keyword(body_text, ["dejob.top/jobDetail"])
    if company_url and urls:
        for u in urls:
            if "dejob.top/jobDetail" not in u:
                company_url = u
                break
    if not company_url:
        # NOTE(review): intro_sec may be None here — presumably
        # extract_first_url tolerates None; confirm against its definition.
        company_url = extract_first_url(intro_sec) or (urls[0] if urls else None)

    # Company intro: introduction section with URL-bearing lines removed.
    company_intro = None
    if intro_sec:
        intro_lines = [ln for ln in intro_sec.splitlines() if not URL_RE.search(ln)]
        company_intro = clean_md_text("\n".join(intro_lines)) or None

    # Salary: first line mentioning 薪酬 or "salary" (None when absent).
    salary_line = None
    for ln in body_text.splitlines():
        if "薪酬" in ln or "salary" in ln.lower():
            salary_line = ln
            break
    salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
    # Currency is only inferred from a literal "$" — other currencies stay None.
    salary_currency = "USD" if salary_raw and "$" in salary_raw else None

    responsibilities = extract_list_section(body_text, "岗位职责") or extract_list_section(
        body_text, "Responsibilities"
    )
    requirements = extract_list_section(body_text, "岗位要求") or extract_list_section(
        body_text, "Requirements"
    )

    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)
    # Source URL: the canonical jobDetail link wins; otherwise the first URL.
    job_source_url = extract_first_url_by_keyword(body_text, ["dejob.top/jobDetail"])
    if not job_source_url:
        urls = extract_urls(body_text)
        job_source_url = urls[0] if urls else None

    return StructuredJob(
        source=source,
        source_channel="DeJob",
        parser_name="dejob_global",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=company_name,
        industry_tags=industry_tags,
        company_intro=company_intro,
        company_url=company_url,
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=position_name,
        position_tags=position_tags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=responsibilities,
        requirements=requirements,
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url or company_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
|
||||
|
||||
|
||||
def parse_remote_cn(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a @remote_cn channel message into a StructuredJob.

    remote_cn posts are a title line, an optional detail link near the top,
    a 摘要: summary line, and hashtags carrying location/employment hints.
    """
    # Cleaned, non-empty lines; the first one doubles as the position title.
    lines = [clean_md_text(ln) for ln in body_text.splitlines() if clean_md_text(ln)]
    title = lines[0] if lines else None

    hashtags = [h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)]
    hashtags = dedupe([h for h in hashtags if h])

    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(hashtags, None)

    # Summary line ("摘要:...") is reused both as company intro and as the
    # salary source.
    summary_line = None
    for ln in lines:
        if ln.startswith("摘要:"):
            summary_line = ln
            break
    salary_raw, salary_min, salary_max, salary_period = parse_salary(summary_line)
    # Currency is only inferred from a literal "$" — other currencies stay None.
    salary_currency = "USD" if salary_raw and "$" in salary_raw else None

    urls = extract_urls(body_text)
    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)

    # remote_cn often places the detail link right below the title line.
    top_url = None
    raw_lines = [ln.strip() for ln in body_text.splitlines() if ln.strip()]
    # Only scan the first few lines so footer links don't win.
    for ln in raw_lines[:6]:
        found = URL_RE.findall(ln)
        if found:
            top_url = found[0]
            break

    # Preference order: top-of-post link, then a remote-info.cn job link,
    # then any URL at all.
    job_source_url = (
        top_url
        or extract_first_url_by_keyword(body_text, ["remote-info.cn/jobs/"])
        or (urls[0] if urls else None)
    )

    job_type = "招聘" if ("招聘" in body_text or "job" in body_text.lower()) else None

    return StructuredJob(
        source=source,
        source_channel="remote_cn",
        parser_name="remote_cn",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=None,
        industry_tags=hashtags,
        company_intro=summary_line.replace("摘要:", "", 1).strip() if summary_line else None,
        company_url=job_source_url or (urls[0] if urls else None),
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=title,
        position_tags=hashtags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=[],
        requirements=[],
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
|
||||
|
||||
|
||||
def parse_cryptojobslist_source(
    source: str,
    chat_id: int | None,
    message_id: int,
    message_date: str,
    body_text: str,
    raw_content: str,
) -> StructuredJob:
    """Parse a @cryptojobslist channel message into a StructuredJob.

    These posts are English: a title line, hashtags, an optional salary line,
    and an apply link (often to cryptojobslist.com).
    """
    # Cleaned, non-empty lines; the first one doubles as the position title.
    lines = [clean_md_text(ln) for ln in body_text.splitlines() if clean_md_text(ln)]
    title = lines[0] if lines else None
    urls = extract_urls(body_text)
    hashtags = [h.replace("·", " ").strip() for h in HASHTAG_RE.findall(body_text)]
    hashtags = dedupe([h for h in hashtags if h])

    (
        work_mode,
        job_nature,
        job_location_text,
        job_location_tags,
        employment_type_raw,
    ) = infer_employment_fields(hashtags, None)

    # Salary: first line containing "salary", "$" or "usd" (case-insensitive).
    salary_line = None
    for ln in lines:
        if any(k in ln.lower() for k in ("salary", "$", "usd")):
            salary_line = ln
            break
    salary_raw, salary_min, salary_max, salary_period = parse_salary(salary_line)
    # Currency is only inferred from a literal "$" — other currencies stay None.
    salary_currency = "USD" if salary_raw and "$" in salary_raw else None

    apply_email = extract_apply_email(body_text)
    apply_tg = extract_apply_telegram(body_text)
    apply_link = extract_apply_link(body_text)
    # Preference order: explicit apply link, then a cryptojobslist.com link,
    # then any URL at all.
    job_source_url = (
        apply_link
        or extract_first_url_by_keyword(body_text, ["cryptojobslist.com"])
        or (urls[0] if urls else None)
    )

    job_type = "招聘" if ("job" in body_text.lower() or "hiring" in body_text.lower()) else None

    return StructuredJob(
        source=source,
        source_channel="cryptojobslist",
        parser_name="cryptojobslist",
        parser_version="v1",
        chat_id=chat_id,
        message_id=message_id,
        message_date=message_date,
        job_type=job_type,
        company_name=None,
        industry_tags=hashtags,
        company_intro=None,
        company_url=job_source_url or (urls[0] if urls else None),
        work_mode=work_mode,
        job_nature=job_nature,
        job_location_text=job_location_text,
        job_location_tags=job_location_tags,
        employment_type_raw=employment_type_raw,
        position_name=title,
        position_tags=hashtags,
        salary_raw=salary_raw,
        salary_currency=salary_currency,
        salary_min=salary_min,
        salary_max=salary_max,
        salary_period=salary_period,
        responsibilities=[],
        requirements=[],
        apply_email=apply_email,
        apply_telegram=apply_tg,
        job_source_url=job_source_url,
        body_text=body_text or "empty_message",
        raw_content=raw_content,
    )
|
||||
|
||||
|
||||
def route_parse(row: tuple) -> StructuredJob:
|
||||
source, chat_id, message_id, content, message_date = row
|
||||
raw_content = content or ""
|
||||
@@ -624,6 +993,18 @@ def route_parse(row: tuple) -> StructuredJob:
|
||||
return parse_dejob_official(
|
||||
source, chat_id, message_id, message_date, body_text, raw_content
|
||||
)
|
||||
if source == "@DeJob_Global_group":
|
||||
return parse_dejob_global(
|
||||
source, chat_id, message_id, message_date, body_text, raw_content
|
||||
)
|
||||
if source == "@remote_cn":
|
||||
return parse_remote_cn(
|
||||
source, chat_id, message_id, message_date, body_text, raw_content
|
||||
)
|
||||
if source == "@cryptojobslist":
|
||||
return parse_cryptojobslist_source(
|
||||
source, chat_id, message_id, message_date, body_text, raw_content
|
||||
)
|
||||
|
||||
return parse_generic(source, chat_id, message_id, message_date, body_text, raw_content)
|
||||
|
||||
@@ -715,6 +1096,10 @@ def is_recruitment_job(item: StructuredJob) -> bool:
|
||||
return item.job_type == "招聘"
|
||||
|
||||
|
||||
def has_usable_job_link(item: StructuredJob) -> bool:
    """Return True when the job carries a non-blank source URL."""
    url = item.job_source_url or ""
    return url.strip() != ""
|
||||
|
||||
|
||||
def get_last_processed_row_id(conn, pipeline_name: str) -> int:
|
||||
with conn.cursor() as cur:
|
||||
cur.execute(
|
||||
@@ -763,6 +1148,7 @@ def main():
|
||||
processed = 0
|
||||
inserted = 0
|
||||
skipped_non_recruit = 0
|
||||
skipped_no_link = 0
|
||||
by_parser = {}
|
||||
max_row_id = last_row_id
|
||||
|
||||
@@ -778,12 +1164,17 @@ def main():
|
||||
skipped_non_recruit += 1
|
||||
continue
|
||||
|
||||
if not has_usable_job_link(item):
|
||||
skipped_no_link += 1
|
||||
continue
|
||||
|
||||
upsert_structured(conn, item)
|
||||
inserted += 1
|
||||
|
||||
if processed % 500 == 0:
|
||||
logger.info(
|
||||
f"[clean] processed={processed}, inserted={inserted}, skipped_non_recruit={skipped_non_recruit}"
|
||||
f"[clean] processed={processed}, inserted={inserted}, "
|
||||
f"skipped_non_recruit={skipped_non_recruit}, skipped_no_link={skipped_no_link}"
|
||||
)
|
||||
|
||||
if max_row_id > last_row_id:
|
||||
@@ -797,6 +1188,7 @@ def main():
|
||||
logger.info(
|
||||
"[done] "
|
||||
f"structured_jobs={total}, inserted={inserted}, skipped_non_recruit={skipped_non_recruit}, "
|
||||
f"skipped_no_link={skipped_no_link}, "
|
||||
f"target=mysql.structured_jobs, parsers={by_parser}"
|
||||
)
|
||||
if processed == 0:
|
||||
|
||||
Reference in New Issue
Block a user