feat: implement background scheduler for job scraping with Gunicorn support
This commit is contained in:
@@ -1,4 +1,6 @@
|
||||
from datetime import datetime, timezone
|
||||
import threading
|
||||
import types
|
||||
from web.scraper import process_region_keyword, scrape_job_page
|
||||
from web.db import (
|
||||
db_init,
|
||||
@@ -269,7 +271,10 @@ def scrape_jobs_with_retry(max_retries=3):
|
||||
"""Run the scraping process with retry logic for failures."""
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
scraper()
|
||||
result = scraper()
|
||||
if isinstance(result, types.GeneratorType):
|
||||
for _ in result:
|
||||
pass
|
||||
return True
|
||||
except Exception as e:
|
||||
if attempt < max_retries - 1:
|
||||
@@ -291,6 +296,22 @@ def start_scheduler():
|
||||
time.sleep(60) # Check every minute
|
||||
|
||||
|
||||
# Handle to the single background scheduler thread (None until first start).
_scheduler_thread: threading.Thread | None = None
# Serializes concurrent start attempts (e.g. multiple Gunicorn init hooks).
_scheduler_lock = threading.Lock()


def start_scheduler_in_background() -> threading.Thread:
    """Launch the scheduler loop in a daemon thread.

    Idempotent: if a previously started scheduler thread is still alive,
    it is returned as-is instead of spawning a duplicate.

    Returns:
        The running scheduler thread (existing or freshly started).
    """
    global _scheduler_thread
    with _scheduler_lock:
        existing = _scheduler_thread
        # Re-check liveness under the lock so a dead thread gets replaced.
        if existing is not None and existing.is_alive():
            return existing
        worker = threading.Thread(target=start_scheduler, daemon=True)
        worker.start()
        _scheduler_thread = worker
    return worker
|
||||
|
||||
|
||||
def run_scheduled_scraping():
|
||||
"""Run the scheduled scraping process."""
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user