feat: implement background scheduler for job scraping with Gunicorn support
Some checks failed: CI/CD Pipeline / test (push) failing after 13s; CI/CD Pipeline / build-image (push) skipped.

2026-01-21 19:07:43 +01:00
parent e8baeb3bcf
commit d84b8f128b
6 changed files with 84 additions and 6 deletions

.gitignore

@@ -166,4 +166,5 @@ cython_debug/
docs/online.md
.github/copilot*
.github/TODO.md
.github/instructions/
.vscode/launch.json


@@ -46,7 +46,14 @@ This layered design makes it straightforward to extend the scraper to new source
## Scheduler Configuration

The application includes an automated scheduler that runs the job scraping process every hour. The scheduler is implemented in `web/craigslist.py` and can run alongside the web app when explicitly enabled.
**Enable background scheduling** by setting the environment variable `SCRAPE_SCHEDULER_ENABLED=1`.
- **Gunicorn**: the scheduler starts once in the Gunicorn master process (see `gunicorn.conf.py`). Worker processes skip scheduler startup to avoid duplicate runs.
- **Flask dev server**: the scheduler starts on the first request (to avoid the reloader starting it twice).
When enabled, the scheduler includes:
- **Automatic Scheduling**: Scraping runs every hour automatically (see the sketch after this list)
- **Failure Handling**: Retry logic with exponential backoff (up to 3 attempts)
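The hourly loop itself is not shown in this diff; as a rough sketch only (assuming the `schedule` package that the test suite imports from `web.craigslist`), the wiring for an hourly job typically looks like this:

```python
import time
import schedule  # the scheduling package the tests import from web.craigslist


def run_scheduled_scraping():
    """Placeholder -- the real function calls scrape_jobs_with_retry()."""


# Sketch only: register an hourly job and poll for due work once a minute,
# matching the "Check every minute" sleep visible in the craigslist.py diff below.
schedule.every().hour.do(run_scheduled_scraping)
while True:
    schedule.run_pending()
    time.sleep(60)
```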
@@ -107,15 +114,12 @@ When scraping individual job listings, the following contact information is extr
The `extract_contact_info()` function intelligently parses various types of reply URLs (a rough sketch follows this list):

1. **Mailto Links**: `mailto:jobs@company.com?subject=...`
   - Extracts the email address directly
2. **Phone Links**: `tel:+1234567890`
   - Extracts the phone number
3. **URL Parameters**: `https://apply.company.com?email=hr@company.com&phone=555-1234&name=HR%20Team`
   - Searches for common parameter names: `email`, `phone`, `contact_name`, etc.
4. **Graceful Fallback**: If contact information cannot be extracted, the fields are set to `"N/A"`
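The function body itself is not part of this diff; the following is only an illustrative sketch of how these URL shapes can be handled with the standard library (the function name, field names, and parameter lists here are assumptions, not the project's actual code):

```python
from urllib.parse import urlparse, parse_qs


def extract_contact_info_sketch(reply_url: str) -> dict:
    """Illustrative only -- not the project's extract_contact_info()."""
    info = {"email": "N/A", "phone": "N/A", "contact_name": "N/A"}  # graceful fallback defaults
    parsed = urlparse(reply_url)
    if parsed.scheme == "mailto":
        # mailto:jobs@company.com?subject=... -> the address is the path component
        info["email"] = parsed.path
    elif parsed.scheme == "tel":
        # tel:+1234567890 -> the number is the path component
        info["phone"] = parsed.path
    else:
        # https://...?email=...&phone=...&name=... -> search common parameter names
        params = parse_qs(parsed.query)
        for key in ("email", "contact_email"):
            if key in params:
                info["email"] = params[key][0]
                break
        for key in ("phone", "contact_phone"):
            if key in params:
                info["phone"] = params[key][0]
                break
        for key in ("name", "contact_name"):
            if key in params:
                info["contact_name"] = params[key][0]
                break
    return info
```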


@@ -36,3 +36,17 @@ certfile = None
# Application
wsgi_module = "web.app:app"
callable = "app"

def when_ready(server):
    """Start background scheduler once in Gunicorn master if enabled."""
    import os

    flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
    if flag not in {"1", "true", "yes", "on"}:
        return
    try:
        from web.craigslist import start_scheduler_in_background

        start_scheduler_in_background()
        server.log.info("Background scraper scheduler started in Gunicorn master.")
    except Exception as exc:
        server.log.warning("Failed to start background scraper scheduler: %s", exc)


@@ -39,6 +39,24 @@ class TestScheduler:
        from web.craigslist import schedule
        assert schedule is not None

    @patch('web.craigslist.threading.Thread')
    def test_start_scheduler_in_background_idempotent(self, mock_thread):
        """Ensure background scheduler starts only once per process."""
        import web.craigslist as craigslist

        thread_instance = MagicMock()
        thread_instance.is_alive.return_value = True
        mock_thread.return_value = thread_instance

        # Reset module state so this test controls whether a scheduler thread exists.
        craigslist._scheduler_thread = None
        first = craigslist.start_scheduler_in_background()
        second = craigslist.start_scheduler_in_background()

        assert first is second
        mock_thread.assert_called_once()
        thread_instance.start.assert_called_once()

    @patch('web.craigslist.db_get_all_job_urls')
    @patch('web.craigslist.seed_regions_keywords_from_listings')
    @patch('web.craigslist.get_all_regions')


@@ -4,7 +4,7 @@ from flask_wtf import CSRFProtect
from typing import Dict, List
from datetime import datetime, timezone
from web.craigslist import scraper, start_scheduler_in_background
from web.db import (
    db_init,
    delete_user_by_id,
@@ -60,6 +60,26 @@ app.static_folder = "static"
csrf = CSRFProtect(app)


def _scheduler_enabled() -> bool:
    flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
    if flag not in {"1", "true", "yes", "on"}:
        return False
    # Avoid starting scheduler in Gunicorn workers (master hook handles it).
    server_software = (os.environ.get("SERVER_SOFTWARE") or "").lower()
    if "gunicorn" in server_software:
        return False
    # Avoid starting twice under Flask reloader.
    if os.environ.get("FLASK_RUN_FROM_CLI") == "true" and os.environ.get("WERKZEUG_RUN_MAIN") != "true":
        return False
    return True


@app.before_first_request
def _start_scheduler_if_enabled():
    if _scheduler_enabled():
        start_scheduler_in_background()


def require_admin():
    username = session.get('username')
    if not username:
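One caveat worth flagging: `before_first_request` was deprecated in Flask 2.2 and removed in Flask 2.3. If the project runs a newer Flask, a minimal alternative (sketch only, reusing the helper names added in this hunk) is to start the scheduler at module import rather than on the first request:

```python
# Sketch, assuming the module-level `app` and the helpers shown above, on Flask >= 2.3
# where @app.before_first_request no longer exists. _scheduler_enabled() already
# guards against the Gunicorn-worker and reloader-parent cases.
if _scheduler_enabled():
    start_scheduler_in_background()
```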


@@ -1,4 +1,6 @@
from datetime import datetime, timezone
import threading
import types
from web.scraper import process_region_keyword, scrape_job_page
from web.db import (
    db_init,
@@ -269,7 +271,10 @@ def scrape_jobs_with_retry(max_retries=3):
"""Run the scraping process with retry logic for failures.""" """Run the scraping process with retry logic for failures."""
for attempt in range(max_retries): for attempt in range(max_retries):
try: try:
scraper() result = scraper()
if isinstance(result, types.GeneratorType):
for _ in result:
pass
return True return True
except Exception as e: except Exception as e:
if attempt < max_retries - 1: if attempt < max_retries - 1:
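Draining the result matters because a generator-based scraper does no work until it is iterated; a tiny standalone illustration (not project code):

```python
def lazy_scraper():
    # The body does not execute until the generator is iterated.
    print("scraping...")
    yield "job-1"


result = lazy_scraper()  # nothing has run yet
for _ in result:         # iteration drives the actual work
    pass                 # "scraping..." is printed here
```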
@@ -291,6 +296,22 @@ def start_scheduler():
        time.sleep(60)  # Check every minute


_scheduler_thread: threading.Thread | None = None
_scheduler_lock = threading.Lock()


def start_scheduler_in_background() -> threading.Thread:
    """Start the scheduler loop in a daemon thread (idempotent)."""
    global _scheduler_thread
    with _scheduler_lock:
        if _scheduler_thread and _scheduler_thread.is_alive():
            return _scheduler_thread
        thread = threading.Thread(target=start_scheduler, daemon=True)
        thread.start()
        _scheduler_thread = thread
        return thread


def run_scheduled_scraping():
    """Run the scheduled scraping process."""
    try: