diff --git a/.gitignore b/.gitignore
index 0e20687..02f0427 100644
--- a/.gitignore
+++ b/.gitignore
@@ -166,4 +166,5 @@ cython_debug/
 docs/online.md
 .github/copilot*
 .github/TODO.md
+.github/instructions/
 .vscode/launch.json
diff --git a/README.md b/README.md
index b39fc4a..6cf2a71 100644
--- a/README.md
+++ b/README.md
@@ -46,7 +46,14 @@ This layered design makes it straightforward to extend the scraper to new source
 
 ## Scheduler Configuration
 
-The application includes an automated scheduler that runs the job scraping process every hour. The scheduler is implemented in `web/craigslist.py` and includes:
+The application includes an automated scheduler that runs the job scraping process every hour. The scheduler is implemented in `web/craigslist.py` and can run alongside the web app when explicitly enabled.
+
+**Enable background scheduling** by setting the environment variable `SCRAPE_SCHEDULER_ENABLED=1`.
+
+- **Gunicorn**: the scheduler starts once in the Gunicorn master process (see `gunicorn.conf.py`). Worker processes skip scheduler startup to avoid duplicate runs.
+- **Flask dev server**: the scheduler starts on the first request (to avoid the reloader starting it twice).
+
+When enabled, the scheduler includes:
 
 - **Automatic Scheduling**: Scraping runs every hour automatically
 - **Failure Handling**: Retry logic with exponential backoff (up to 3 attempts)
@@ -107,15 +114,12 @@ When scraping individual job listings, the following contact information is extr
 
 The `extract_contact_info()` function intelligently parses various types of reply URLs:
 
 1. **Mailto Links**: `mailto:jobs@company.com?subject=...`
-   - Extracts the email address directly
 2. **Phone Links**: `tel:+1234567890`
-   - Extracts the phone number
 3. **URL Parameters**: `https://apply.company.com?email=hr@company.com&phone=555-1234&name=HR%20Team`
-   - Searches for common parameter names: `email`, `phone`, `contact_name`, etc.
 4. **Graceful Fallback**: If contact information cannot be extracted, the fields are set to `"N/A"`
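(Reviewer note, not part of the patch.) The README items above describe how `extract_contact_info()` handles the three reply-URL shapes plus the fallback. The real implementation lives elsewhere in the repo and is not touched by this diff; as a rough illustration of the described behaviour, a standalone sketch using `urllib.parse` could look like the following. The function name, return shape, and accepted parameter names here are assumptions.

```python
# Illustrative sketch only: the repository's real extract_contact_info() is not
# shown in this diff; field names and return shape are assumptions.
from urllib.parse import urlparse, parse_qs


def extract_contact_info_sketch(reply_url: str) -> dict:
    """Map a reply URL to email/phone/name fields, defaulting to "N/A"."""
    contact = {"email": "N/A", "phone": "N/A", "contact_name": "N/A"}
    parsed = urlparse(reply_url or "")

    if parsed.scheme == "mailto":
        # mailto:jobs@company.com?subject=... -> the address is the path component
        contact["email"] = parsed.path
    elif parsed.scheme == "tel":
        # tel:+1234567890 -> the number is the path component
        contact["phone"] = parsed.path
    elif parsed.query:
        # https://apply.company.com?email=...&phone=...&name=HR%20Team
        # parse_qs decodes percent-encoding; take the first value per parameter
        params = {k.lower(): v[0] for k, v in parse_qs(parsed.query).items()}
        contact["email"] = params.get("email", contact["email"])
        contact["phone"] = params.get("phone", contact["phone"])
        contact["contact_name"] = params.get(
            "name", params.get("contact_name", contact["contact_name"])
        )
    return contact
```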
diff --git a/gunicorn.conf.py b/gunicorn.conf.py
index 53d9b95..ee142b0 100644
--- a/gunicorn.conf.py
+++ b/gunicorn.conf.py
@@ -36,3 +36,17 @@ certfile = None
 # Application
 wsgi_module = "web.app:app"
 callable = "app"
+
+
+def when_ready(server):
+    """Start background scheduler once in Gunicorn master if enabled."""
+    import os
+    flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
+    if flag not in {"1", "true", "yes", "on"}:
+        return
+    try:
+        from web.craigslist import start_scheduler_in_background
+        start_scheduler_in_background()
+        server.log.info("Background scraper scheduler started in Gunicorn master.")
+    except Exception as exc:
+        server.log.warning("Failed to start background scraper scheduler: %s", exc)
diff --git a/tests/test_scheduler.py b/tests/test_scheduler.py
index c9c620d..38521cb 100644
--- a/tests/test_scheduler.py
+++ b/tests/test_scheduler.py
@@ -39,6 +39,24 @@ class TestScheduler:
         from web.craigslist import schedule
         assert schedule is not None
 
+    @patch('web.craigslist.threading.Thread')
+    def test_start_scheduler_in_background_idempotent(self, mock_thread):
+        """Ensure background scheduler starts only once per process."""
+        import web.craigslist as craigslist
+
+        thread_instance = MagicMock()
+        thread_instance.is_alive.return_value = True
+        mock_thread.return_value = thread_instance
+
+        craigslist._scheduler_thread = None
+
+        first = craigslist.start_scheduler_in_background()
+        second = craigslist.start_scheduler_in_background()
+
+        assert first is second
+        mock_thread.assert_called_once()
+        thread_instance.start.assert_called_once()
+
     @patch('web.craigslist.db_get_all_job_urls')
     @patch('web.craigslist.seed_regions_keywords_from_listings')
     @patch('web.craigslist.get_all_regions')
diff --git a/web/app.py b/web/app.py
index c82339b..49ce029 100644
--- a/web/app.py
+++ b/web/app.py
@@ -4,7 +4,7 @@ from flask_wtf import CSRFProtect
 from typing import Dict, List
 from datetime import datetime, timezone
 
-from web.craigslist import scraper
+from web.craigslist import scraper, start_scheduler_in_background
 from web.db import (
     db_init,
     delete_user_by_id,
@@ -60,6 +60,26 @@ app.static_folder = "static"
 csrf = CSRFProtect(app)
 
 
+def _scheduler_enabled() -> bool:
+    flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
+    if flag not in {"1", "true", "yes", "on"}:
+        return False
+    # Avoid starting scheduler in Gunicorn workers (master hook handles it).
+    server_software = (os.environ.get("SERVER_SOFTWARE") or "").lower()
+    if "gunicorn" in server_software:
+        return False
+    # Avoid starting twice under Flask reloader.
+    if os.environ.get("FLASK_RUN_FROM_CLI") == "true" and os.environ.get("WERKZEUG_RUN_MAIN") != "true":
+        return False
+    return True
+
+
+@app.before_first_request
+def _start_scheduler_if_enabled():
+    if _scheduler_enabled():
+        start_scheduler_in_background()
+
+
 def require_admin():
     username = session.get('username')
     if not username:
diff --git a/web/craigslist.py b/web/craigslist.py
index f7217e9..b846356 100644
--- a/web/craigslist.py
+++ b/web/craigslist.py
@@ -1,4 +1,6 @@
 from datetime import datetime, timezone
+import threading
+import types
 from web.scraper import process_region_keyword, scrape_job_page
 from web.db import (
     db_init,
@@ -269,7 +271,10 @@ def scrape_jobs_with_retry(max_retries=3):
     """Run the scraping process with retry logic for failures."""
     for attempt in range(max_retries):
         try:
-            scraper()
+            result = scraper()
+            if isinstance(result, types.GeneratorType):
+                for _ in result:
+                    pass
             return True
         except Exception as e:
             if attempt < max_retries - 1:
@@ -291,6 +296,22 @@ def start_scheduler():
         time.sleep(60)  # Check every minute
 
 
+_scheduler_thread: threading.Thread | None = None
+_scheduler_lock = threading.Lock()
+
+
+def start_scheduler_in_background() -> threading.Thread:
+    """Start the scheduler loop in a daemon thread (idempotent)."""
+    global _scheduler_thread
+    with _scheduler_lock:
+        if _scheduler_thread and _scheduler_thread.is_alive():
+            return _scheduler_thread
+        thread = threading.Thread(target=start_scheduler, daemon=True)
+        thread.start()
+        _scheduler_thread = thread
+        return thread
+
+
 def run_scheduled_scraping():
     """Run the scheduled scraping process."""
     try:
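(Reviewer note, not part of the patch.) A quick way to sanity-check the new helper from a Python shell is sketched below. It calls `start_scheduler_in_background()` directly, which bypasses the `SCRAPE_SCHEDULER_ENABLED` gate enforced in `gunicorn.conf.py` and `web/app.py`, so it really does start the scheduler loop in a daemon thread.

```python
# Sketch only (not part of this diff): exercise the idempotent helper directly.
# The SCRAPE_SCHEDULER_ENABLED check lives in gunicorn.conf.py (when_ready) and
# web/app.py (_scheduler_enabled), so a direct call here always starts the loop.
from web.craigslist import start_scheduler_in_background

first = start_scheduler_in_background()   # spawns the daemon scheduler thread
second = start_scheduler_in_background()  # returns the same still-alive thread
assert first is second and first.daemon

# Under Gunicorn, enable the scheduler through the environment instead, e.g.:
#   SCRAPE_SCHEDULER_ENABLED=1 gunicorn -c gunicorn.conf.py web.app:app
```

Splitting the "should we run?" policy (environment gate, master-vs-worker, reloader check) from the idempotent, lock-guarded starter keeps duplicate scheduler threads out of both Gunicorn and the Flask dev server while leaving the helper itself trivial to test.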