完成所有来源数据清洗和表格导入

This commit is contained in:
BoliviaYu
2026-03-05 23:55:18 +08:00
parent 5efb8fc9ed
commit 70fce8ebab
9 changed files with 1887 additions and 255 deletions

View File

@@ -5,6 +5,7 @@ PROJECT_DIR="/home/liam/code/python/jobs_robots"
LOG_DIR="$PROJECT_DIR/logs"
LOCK_FILE="$PROJECT_DIR/.daily_job.lock"
TS="$(date '+%Y-%m-%d %H:%M:%S')"
PY_BIN="$PROJECT_DIR/.venv/bin/python"
mkdir -p "$LOG_DIR"
@@ -19,8 +20,13 @@ cd "$PROJECT_DIR"
echo "[$TS] daily job start" >> "$LOG_DIR/daily_job.log"
# Abort early when the interpreter at $PY_BIN is missing or not executable,
# recording the reason in the daily job log before exiting non-zero.
[[ -x "$PY_BIN" ]] || {
  echo "[$TS] python not found: $PY_BIN" >> "$LOG_DIR/daily_job.log"
  exit 1
}
# Auto-advance time window to a rolling daily range.
.venv/bin/python - <<'PY'
"$PY_BIN" - <<'PY'
import json
from datetime import datetime, timezone, timedelta
@@ -50,9 +56,25 @@ print(
PY
# 1) Crawl TG incremental
uv run main.py >> "$LOG_DIR/daily_job.log" 2>&1
"$PY_BIN" main.py >> "$LOG_DIR/daily_job.log" 2>&1
# 2) Clean dejob_official and others into structured table
uv run clean_to_structured.py >> "$LOG_DIR/daily_job.log" 2>&1
"$PY_BIN" clean_to_structured.py >> "$LOG_DIR/daily_job.log" 2>&1
# 3) Sync local MySQL to cloud MySQL (only when mysql_cloud is configured).
# The inline Python probe reads config.json from the current working directory
# and exits 0 only when the mysql_cloud section has non-empty host, user,
# database and password values and the password is not the "CHANGE_ME"
# placeholder. Any non-zero exit (including the probe itself crashing) takes
# the skip branch below.
if "$PY_BIN" - <<'PY'
import json

# The body of the with-statement must be indented, otherwise the probe fails
# to parse and the sync step is always skipped.
with open("config.json", "r", encoding="utf-8") as f:
    cfg = json.load(f)

# A missing or null mysql_cloud section is treated as an empty mapping.
cloud = cfg.get("mysql_cloud") or {}
ok = bool(cloud.get("host") and cloud.get("user") and cloud.get("database"))
ok = ok and bool(cloud.get("password")) and cloud.get("password") != "CHANGE_ME"
# Report the verdict through the exit status, which the shell conditional tests.
raise SystemExit(0 if ok else 1)
PY
then
  "$PY_BIN" sync_to_cloud_mysql.py >> "$LOG_DIR/daily_job.log" 2>&1
else
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] skip cloud sync: mysql_cloud not configured" >> "$LOG_DIR/daily_job.log"
fi
echo "[$(date '+%Y-%m-%d %H:%M:%S')] daily job done" >> "$LOG_DIR/daily_job.log"