# Source: tg_crawl/run_daily_incremental.sh (59 lines, 1.6 KiB, Bash)
# Captured from the repository web file view ("Raw Normal View History")
# on 2026-02-26 20:00:06 +08:00. Header converted to comments so the
# script remains valid shell.
#!/usr/bin/env bash
# Daily incremental crawl driver: advances the config time window, then
# runs the crawl and cleanup steps. All output appends to daily_job.log.
set -euo pipefail

readonly PROJECT_DIR="/home/liam/code/python/jobs_robots"
readonly LOG_DIR="$PROJECT_DIR/logs"
readonly LOCK_FILE="$PROJECT_DIR/.daily_job.lock"
TS="$(date '+%Y-%m-%d %H:%M:%S')"

mkdir -p "$LOG_DIR"

# Prevent overlap if previous run is still active: hold an exclusive,
# non-blocking flock on fd 9 for the lifetime of the script.
exec 9>"$LOCK_FILE"
if ! flock -n 9; then
  echo "[$TS] another job is running, exit" >> "$LOG_DIR/daily_job.log"
  exit 0
fi

cd "$PROJECT_DIR"
echo "[$TS] daily job start" >> "$LOG_DIR/daily_job.log"
# Auto-advance time window to a rolling daily range.
# NOTE(review): the Python heredoc lost its indentation in the pasted
# source; structure reconstructed here. The config rewrite and the log
# line are assumed to belong inside the `enabled` branch (i.e. only
# touch config.json when the window is active) — confirm against git
# history of the original file.
.venv/bin/python - <<'PY'
"""Rewrite config.json's time_window to a rolling range ending today (UTC)."""
import json
from datetime import datetime, timezone, timedelta

cfg_path = 'config.json'
with open(cfg_path, 'r', encoding='utf-8') as f:
    cfg = json.load(f)

window = cfg.setdefault('time_window', {})
if window.get('enabled', False):
    # Window length in days; missing/None/0 values fall back to 1.
    days = int(cfg.get('daily_window_days', 1) or 1)
    if days < 1:
        days = 1
    # Inclusive range: a 1-day window means start == end == today.
    end_dt = datetime.now(timezone.utc).date()
    start_dt = end_dt - timedelta(days=days - 1)
    window['start'] = start_dt.strftime('%Y-%m-%d')
    window['end'] = end_dt.strftime('%Y-%m-%d')
    with open(cfg_path, 'w', encoding='utf-8') as f:
        json.dump(cfg, f, ensure_ascii=False, indent=2)
        f.write('\n')  # keep a trailing newline for clean diffs
    print(
        "updated time_window: "
        f"start={window.get('start')} end={window.get('end')} "
        f"daily_window_days={cfg.get('daily_window_days', 1)}"
    )
PY
# 1) Crawl TG incremental.
# set -e would already abort on failure, but silently — log an explicit
# failure marker so the log always records how the run ended.
uv run main.py >> "$LOG_DIR/daily_job.log" 2>&1 || {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] crawl step failed (exit $?)" >> "$LOG_DIR/daily_job.log"
  exit 1
}
# 2) Clean dejob_official and others into structured table.
uv run clean_to_structured.py >> "$LOG_DIR/daily_job.log" 2>&1 || {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] clean step failed (exit $?)" >> "$LOG_DIR/daily_job.log"
  exit 1
}
echo "[$(date '+%Y-%m-%d %H:%M:%S')] daily job done" >> "$LOG_DIR/daily_job.log"