feat: implement background scheduler for job scraping with Gunicorn support
Some checks failed
CI/CD Pipeline / test (push) Failing after 13s
CI/CD Pipeline / build-image (push) Has been skipped

2026-01-21 19:07:43 +01:00
parent e8baeb3bcf
commit d84b8f128b
6 changed files with 84 additions and 6 deletions
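The title advertises Gunicorn support, but the diff below only adds the idempotent starter itself. A minimal sketch of how it might be wired into a Gunicorn deployment — the post_fork hook and worker.age attribute are Gunicorn's real API, while the module path web.main is an assumption, since the diff does not show the file name:

    # gunicorn.conf.py -- a sketch, not part of this commit
    def post_fork(server, worker):
        # Start the scheduler in one worker only, so the periodic scrape
        # does not run once per worker process. worker.age is the 1-based
        # spawn order; recycled workers get higher ages, so this guard is
        # best-effort rather than bulletproof.
        if worker.age == 1:
            from web.main import start_scheduler_in_background  # assumed path
            start_scheduler_in_background()

Because start_scheduler_in_background() is guarded by a per-process lock, calling it from several places in the same worker is still safe; the guard above only prevents duplicate schedulers across workers.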


@@ -1,4 +1,6 @@
 from datetime import datetime, timezone
+import threading
+import types
 from web.scraper import process_region_keyword, scrape_job_page
 from web.db import (
     db_init,
@@ -269,7 +271,10 @@ def scrape_jobs_with_retry(max_retries=3):
     """Run the scraping process with retry logic for failures."""
     for attempt in range(max_retries):
         try:
-            scraper()
+            result = scraper()
+            if isinstance(result, types.GeneratorType):
+                for _ in result:
+                    pass
             return True
         except Exception as e:
             if attempt < max_retries - 1:
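For context on the generator check above: if scraper is (or later becomes) a generator function, calling it merely builds a lazy generator, and no scraping happens until something iterates it. Draining it with a bare for loop forces every step to run while keeping the retry semantics. A standalone illustration — the scraper body here is invented:

    import types

    def scraper():
        # A generator-based scraper: nothing executes until iteration begins.
        yield "page-1"
        yield "page-2"

    result = scraper()                       # no work done yet
    if isinstance(result, types.GeneratorType):
        for _ in result:                     # draining runs each scrape step
            pass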
@@ -291,6 +296,22 @@ def start_scheduler():
         time.sleep(60)  # Check every minute
+
+
+_scheduler_thread: threading.Thread | None = None
+_scheduler_lock = threading.Lock()
+
+
+def start_scheduler_in_background() -> threading.Thread:
+    """Start the scheduler loop in a daemon thread (idempotent)."""
+    global _scheduler_thread
+    with _scheduler_lock:
+        if _scheduler_thread and _scheduler_thread.is_alive():
+            return _scheduler_thread
+        thread = threading.Thread(target=start_scheduler, daemon=True)
+        thread.start()
+        _scheduler_thread = thread
+        return thread
 
 
 def run_scheduled_scraping():
     """Run the scheduled scraping process."""
     try:
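A short usage note on the starter added above: the lock plus the is_alive() check make repeated calls return the existing thread rather than spawning a second scheduler loop. A sketch, with an assumed import path since the diff does not name the file:

    # "web.main" is a placeholder module path, not confirmed by the diff.
    from web.main import start_scheduler_in_background

    t1 = start_scheduler_in_background()
    t2 = start_scheduler_in_background()
    assert t1 is t2  # second call returns the same live daemon thread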