feat: implement background scheduler for job scraping with Gunicorn support
Some checks failed
CI/CD Pipeline / test (push) Failing after 13s
CI/CD Pipeline / build-image (push) Has been skipped

2026-01-21 19:07:43 +01:00
parent e8baeb3bcf
commit d84b8f128b
6 changed files with 84 additions and 6 deletions
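The title advertises Gunicorn support, but the diff below only adds the idempotent starter itself. A minimal sketch of how it might be wired into a Gunicorn deployment — the post_fork hook and worker.age attribute are Gunicorn's real API, while the module path web.main is an assumption, since the diff does not show the file name:

    # gunicorn.conf.py -- a sketch, not part of this commit
    def post_fork(server, worker):
        # Start the scheduler in one worker only, so the periodic scrape
        # does not run once per worker process. worker.age is the 1-based
        # spawn order; recycled workers get higher ages, so this guard is
        # best-effort rather than bulletproof.
        if worker.age == 1:
            from web.main import start_scheduler_in_background  # assumed path
            start_scheduler_in_background()

Because start_scheduler_in_background() is guarded by a per-process lock, calling it from several places in the same worker is still safe; the guard above only prevents duplicate schedulers across workers.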


@@ -1,4 +1,6 @@
 from datetime import datetime, timezone
+import threading
+import types
 from web.scraper import process_region_keyword, scrape_job_page
 from web.db import (
     db_init,
@@ -269,7 +271,10 @@ def scrape_jobs_with_retry(max_retries=3):
     """Run the scraping process with retry logic for failures."""
     for attempt in range(max_retries):
         try:
-            scraper()
+            result = scraper()
+            if isinstance(result, types.GeneratorType):
+                for _ in result:
+                    pass
             return True
         except Exception as e:
             if attempt < max_retries - 1:
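For context on the generator check above: if scraper is (or later becomes) a generator function, calling it merely builds a lazy generator, and no scraping happens until something iterates it. Draining it with a bare for loop forces every step to run while keeping the retry semantics. A standalone illustration — the scraper body here is invented:

    import types

    def scraper():
        # A generator-based scraper: nothing executes until iteration begins.
        yield "page-1"
        yield "page-2"

    result = scraper()                       # no work done yet
    if isinstance(result, types.GeneratorType):
        for _ in result:                     # draining runs each scrape step
            pass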
@@ -291,6 +296,22 @@ def start_scheduler():
         time.sleep(60)  # Check every minute
+
+
+_scheduler_thread: threading.Thread | None = None
+_scheduler_lock = threading.Lock()
+
+
+def start_scheduler_in_background() -> threading.Thread:
+    """Start the scheduler loop in a daemon thread (idempotent)."""
+    global _scheduler_thread
+    with _scheduler_lock:
+        if _scheduler_thread and _scheduler_thread.is_alive():
+            return _scheduler_thread
+        thread = threading.Thread(target=start_scheduler, daemon=True)
+        thread.start()
+        _scheduler_thread = thread
+        return thread
 
 
 def run_scheduled_scraping():
     """Run the scheduled scraping process."""
     try:
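A short usage note on the starter added above: the lock plus the is_alive() check make repeated calls return the existing thread rather than spawning a second scheduler loop. A sketch, with an assumed import path since the diff does not name the file:

    # "web.main" is a placeholder module path, not confirmed by the diff.
    from web.main import start_scheduler_in_background

    t1 = start_scheduler_in_background()
    t2 = start_scheduler_in_background()
    assert t1 is t2  # second call returns the same live daemon thread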