feat: implement background scheduler for job scraping with Gunicorn support
 web/app.py | 22 +++++++++++++++++++++-
web/app.py
@@ -4,7 +4,7 @@ from flask_wtf import CSRFProtect
 from typing import Dict, List
 from datetime import datetime, timezone
 
-from web.craigslist import scraper
+from web.craigslist import scraper, start_scheduler_in_background
 from web.db import (
     db_init,
     delete_user_by_id,
@@ -60,6 +60,26 @@ app.static_folder = "static"
 csrf = CSRFProtect(app)
 
 
+def _scheduler_enabled() -> bool:
+    flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
+    if flag not in {"1", "true", "yes", "on"}:
+        return False
+    # Avoid starting scheduler in Gunicorn workers (master hook handles it).
+    server_software = (os.environ.get("SERVER_SOFTWARE") or "").lower()
+    if "gunicorn" in server_software:
+        return False
+    # Avoid starting twice under Flask reloader.
+    if os.environ.get("FLASK_RUN_FROM_CLI") == "true" and os.environ.get("WERKZEUG_RUN_MAIN") != "true":
+        return False
+    return True
+
+
+@app.before_first_request
+def _start_scheduler_if_enabled():
+    if _scheduler_enabled():
+        start_scheduler_in_background()
+
+
 def require_admin():
     username = session.get('username')
     if not username:
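The Gunicorn guard above assumes the scheduler is started once by a hook in the Gunicorn master rather than in each worker. That hook is not part of this diff; a minimal sketch of what it could look like, assuming a gunicorn.conf.py alongside the app and using Gunicorn's standard when_ready server hook (which runs once in the master after startup; the log message is illustrative):

# gunicorn.conf.py -- hypothetical companion config, not part of this commit.
from web.craigslist import start_scheduler_in_background

def when_ready(server):
    # Runs once in the Gunicorn master, so workers never each spawn their
    # own scheduler thread (the in-app guard returns False under Gunicorn).
    server.log.info("starting scrape scheduler from Gunicorn master")
    start_scheduler_in_background()

Note that @app.before_first_request was deprecated in Flask 2.2 and removed in Flask 2.3, so the non-Gunicorn startup path here assumes a pre-2.3 Flask.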
web/craigslist.py
@@ -1,4 +1,6 @@
 from datetime import datetime, timezone
+import threading
+import types
 from web.scraper import process_region_keyword, scrape_job_page
 from web.db import (
     db_init,
@@ -269,7 +271,10 @@ def scrape_jobs_with_retry(max_retries=3):
     """Run the scraping process with retry logic for failures."""
     for attempt in range(max_retries):
         try:
-            scraper()
+            result = scraper()
+            if isinstance(result, types.GeneratorType):
+                for _ in result:
+                    pass
             return True
         except Exception as e:
             if attempt < max_retries - 1:
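Context for the generator check above: if scraper() is implemented as a generator, calling it does no work until it is iterated, so without draining it the scrape never actually runs and failures never surface inside the try block where the retry logic can see them. A standalone illustration with a hypothetical scraper body:

import types

def scraper():
    # Hypothetical generator-style scraper: nothing executes until iterated.
    yield "page-1"
    raise RuntimeError("simulated network failure")

result = scraper()                  # no scraping has happened yet
assert isinstance(result, types.GeneratorType)
try:
    for _ in result:                # draining executes the generator body...
        pass
except RuntimeError:
    print("...so the failure surfaces here, inside the retry wrapper")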
@@ -291,6 +296,22 @@ def start_scheduler():
         time.sleep(60)  # Check every minute
 
 
+_scheduler_thread: threading.Thread | None = None
+_scheduler_lock = threading.Lock()
+
+
+def start_scheduler_in_background() -> threading.Thread:
+    """Start the scheduler loop in a daemon thread (idempotent)."""
+    global _scheduler_thread
+    with _scheduler_lock:
+        if _scheduler_thread and _scheduler_thread.is_alive():
+            return _scheduler_thread
+        thread = threading.Thread(target=start_scheduler, daemon=True)
+        thread.start()
+        _scheduler_thread = thread
+        return thread
+
+
 def run_scheduled_scraping():
     """Run the scheduled scraping process."""
     try:
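A sketch of the contract implied by the code above, assuming web.craigslist imports cleanly outside the app. Note that the SCRAPE_SCHEDULER_ENABLED flag gates only the before_first_request path in app.py; calling start_scheduler_in_background() directly bypasses it (and will really start the scrape loop):

from web.craigslist import start_scheduler_in_background

t1 = start_scheduler_in_background()   # first call spawns the daemon thread
t2 = start_scheduler_in_background()   # idempotent: returns the same live thread
assert t1 is t2
assert t1.daemon and t1.is_alive()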