Files
tg_crawl/run_daily_incremental.sh

81 lines
2.3 KiB
Bash
Raw Permalink Normal View History

2026-02-26 20:00:06 +08:00
#!/usr/bin/env bash
# Daily job driver: strict mode plus the fixed paths used by every later step.
set -euo pipefail

# Timestamp captured once at startup and reused by the early log lines.
TS="$(date +'%Y-%m-%d %H:%M:%S')"

# Everything below is derived from the project checkout directory.
PROJECT_DIR="/home/liam/code/python/jobs_robots"
PY_BIN="${PROJECT_DIR}/.venv/bin/python"
LOG_DIR="${PROJECT_DIR}/logs"
LOCK_FILE="${PROJECT_DIR}/.daily_job.lock"
2026-02-26 20:00:06 +08:00
# The log directory must exist before anything appends to the job log.
mkdir -p "${LOG_DIR}"

# Take a non-blocking lock on fd 9 so overlapping runs do not pile up; if the
# lock is already held, note it in the log and exit successfully.
exec 9>"${LOCK_FILE}"
flock -n 9 || {
  echo "[$TS] another job is running, exit" >> "${LOG_DIR}/daily_job.log"
  exit 0
}

# Later steps use paths relative to the project root (config.json, main.py).
cd "${PROJECT_DIR}"
echo "[$TS] daily job start" >> "${LOG_DIR}/daily_job.log"

# Bail out early if the virtualenv interpreter is missing or not executable.
[[ -x "${PY_BIN}" ]] || {
  echo "[$TS] python not found: $PY_BIN" >> "${LOG_DIR}/daily_job.log"
  exit 1
}
2026-02-26 20:00:06 +08:00
# Auto-advance time window to a rolling daily range.
# When time_window.enabled is truthy in config.json, rewrite time_window.start
# and time_window.end so the window ends on today's UTC date and spans
# `daily_window_days` days (at least 1). Output and tracebacks are appended to
# the job log, like the other Python steps.
"$PY_BIN" - >> "$LOG_DIR/daily_job.log" 2>&1 <<'PY'
import json
from datetime import datetime, timedelta, timezone

cfg_path = 'config.json'
with open(cfg_path, 'r', encoding='utf-8') as f:
    cfg = json.load(f)

window = cfg.setdefault('time_window', {})
if window.get('enabled', False):
    # Missing, null or 0 falls back to 1; values below 1 are clamped to 1.
    days = max(1, int(cfg.get('daily_window_days', 1) or 1))
    end_dt = datetime.now(timezone.utc).date()
    start_dt = end_dt - timedelta(days=days - 1)
    window['start'] = start_dt.strftime('%Y-%m-%d')
    window['end'] = end_dt.strftime('%Y-%m-%d')

    # NOTE(review): the write and the print are placed under the `enabled`
    # check so config.json is only rewritten when the window is advanced —
    # confirm this nesting matches the intended behavior.
    with open(cfg_path, 'w', encoding='utf-8') as f:
        json.dump(cfg, f, ensure_ascii=False, indent=2)
        f.write('\n')

    print(
        "updated time_window: "
        f"start={window.get('start')} end={window.get('end')} "
        f"daily_window_days={cfg.get('daily_window_days', 1)}"
    )
PY
# Step 1: TG incremental crawl; stdout and stderr are appended to the job log.
"${PY_BIN}" main.py 1>>"${LOG_DIR}/daily_job.log" 2>&1
2026-02-26 20:00:06 +08:00
# Step 2: clean dejob_official and others into the structured table; stdout
# and stderr are appended to the job log.
"${PY_BIN}" clean_to_structured.py 1>>"${LOG_DIR}/daily_job.log" 2>&1
# 3) Sync local MySQL to cloud MySQL (only when mysql_cloud is configured)
# The inline probe exits 0 only when config.json has a mysql_cloud section with
# non-empty host, user, database and password, and the password is not the
# "CHANGE_ME" placeholder. Any other outcome (including a probe crash) takes
# the skip branch; probe errors are appended to the job log so a crash is not
# silently mistaken for "not configured".
if "$PY_BIN" - 2>> "$LOG_DIR/daily_job.log" <<'PY'
import json

with open("config.json", "r", encoding="utf-8") as f:
    cfg = json.load(f)

cloud = cfg.get("mysql_cloud") or {}
required = ("host", "user", "database", "password")
ok = all(cloud.get(key) for key in required)
ok = ok and cloud.get("password") != "CHANGE_ME"
raise SystemExit(0 if ok else 1)
PY
then
  "$PY_BIN" sync_to_cloud_mysql.py >> "$LOG_DIR/daily_job.log" 2>&1
else
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] skip cloud sync: mysql_cloud not configured" >> "$LOG_DIR/daily_job.log"
fi
2026-02-26 20:00:06 +08:00
# Completion marker; uses a fresh timestamp instead of the startup $TS.
printf '[%s] daily job done\n' "$(date '+%Y-%m-%d %H:%M:%S')" >> "${LOG_DIR}/daily_job.log"