feat: implement background scheduler for job scraping with Gunicorn support
Some checks failed: CI/CD Pipeline / test (push) failing after 13s; CI/CD Pipeline / build-image (push) skipped.

2026-01-21 19:07:43 +01:00
parent e8baeb3bcf
commit d84b8f128b
6 changed files with 84 additions and 6 deletions

.gitignore

@@ -166,4 +166,5 @@ cython_debug/
docs/online.md
.github/copilot*
.github/TODO.md
.github/instructions/
.vscode/launch.json


@@ -46,7 +46,14 @@ This layered design makes it straightforward to extend the scraper to new source
## Scheduler Configuration

The application includes an automated scheduler that runs the job scraping process every hour. The scheduler is implemented in `web/craigslist.py` and can run alongside the web app when explicitly enabled.
**Enable background scheduling** by setting the environment variable `SCRAPE_SCHEDULER_ENABLED=1`.
- **Gunicorn**: the scheduler starts once in the Gunicorn master process (see `gunicorn.conf.py`). Worker processes skip scheduler startup to avoid duplicate runs.
- **Flask dev server**: the scheduler starts on the first request (to avoid the reloader starting it twice).
When enabled, the scheduler includes:
- **Automatic Scheduling**: Scraping runs every hour automatically (see the sketch after this list)
- **Failure Handling**: Retry logic with exponential backoff (up to 3 attempts)
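The hourly loop itself is not shown in this diff; as a rough sketch only (assuming the `schedule` package that the test suite imports from `web.craigslist`), the wiring for an hourly job typically looks like this:

```python
import time
import schedule  # the scheduling package the tests import from web.craigslist


def run_scheduled_scraping():
    """Placeholder -- the real function calls scrape_jobs_with_retry()."""


# Sketch only: register an hourly job and poll for due work once a minute,
# matching the "Check every minute" sleep visible in the craigslist.py diff below.
schedule.every().hour.do(run_scheduled_scraping)
while True:
    schedule.run_pending()
    time.sleep(60)
```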
@@ -107,15 +114,12 @@ When scraping individual job listings, the following contact information is extr
The `extract_contact_info()` function intelligently parses various types of reply URLs (a rough sketch follows this list):

1. **Mailto Links**: `mailto:jobs@company.com?subject=...`
   - Extracts the email address directly
2. **Phone Links**: `tel:+1234567890`
   - Extracts the phone number
3. **URL Parameters**: `https://apply.company.com?email=hr@company.com&phone=555-1234&name=HR%20Team`
   - Searches for common parameter names: `email`, `phone`, `contact_name`, etc.
4. **Graceful Fallback**: If contact information cannot be extracted, the fields are set to `"N/A"`
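The function body itself is not part of this diff; the following is only an illustrative sketch of how these URL shapes can be handled with the standard library (the function name, field names, and parameter lists here are assumptions, not the project's actual code):

```python
from urllib.parse import urlparse, parse_qs


def extract_contact_info_sketch(reply_url: str) -> dict:
    """Illustrative only -- not the project's extract_contact_info()."""
    info = {"email": "N/A", "phone": "N/A", "contact_name": "N/A"}  # graceful fallback defaults
    parsed = urlparse(reply_url)
    if parsed.scheme == "mailto":
        # mailto:jobs@company.com?subject=... -> the address is the path component
        info["email"] = parsed.path
    elif parsed.scheme == "tel":
        # tel:+1234567890 -> the number is the path component
        info["phone"] = parsed.path
    else:
        # https://...?email=...&phone=...&name=... -> search common parameter names
        params = parse_qs(parsed.query)
        for key in ("email", "contact_email"):
            if key in params:
                info["email"] = params[key][0]
                break
        for key in ("phone", "contact_phone"):
            if key in params:
                info["phone"] = params[key][0]
                break
        for key in ("name", "contact_name"):
            if key in params:
                info["contact_name"] = params[key][0]
                break
    return info
```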


@@ -36,3 +36,17 @@ certfile = None
# Application
wsgi_module = "web.app:app"
callable = "app"

def when_ready(server):
    """Start background scheduler once in Gunicorn master if enabled."""
    import os

    flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
    if flag not in {"1", "true", "yes", "on"}:
        return
    try:
        from web.craigslist import start_scheduler_in_background

        start_scheduler_in_background()
        server.log.info("Background scraper scheduler started in Gunicorn master.")
    except Exception as exc:
        server.log.warning("Failed to start background scraper scheduler: %s", exc)


@@ -39,6 +39,24 @@ class TestScheduler:
        from web.craigslist import schedule
        assert schedule is not None

    @patch('web.craigslist.threading.Thread')
    def test_start_scheduler_in_background_idempotent(self, mock_thread):
        """Ensure background scheduler starts only once per process."""
        import web.craigslist as craigslist

        thread_instance = MagicMock()
        thread_instance.is_alive.return_value = True
        mock_thread.return_value = thread_instance

        # Reset module state so this test controls whether a scheduler thread exists.
        craigslist._scheduler_thread = None
        first = craigslist.start_scheduler_in_background()
        second = craigslist.start_scheduler_in_background()

        assert first is second
        mock_thread.assert_called_once()
        thread_instance.start.assert_called_once()

    @patch('web.craigslist.db_get_all_job_urls')
    @patch('web.craigslist.seed_regions_keywords_from_listings')
    @patch('web.craigslist.get_all_regions')


@@ -4,7 +4,7 @@ from flask_wtf import CSRFProtect
from typing import Dict, List
from datetime import datetime, timezone
from web.craigslist import scraper, start_scheduler_in_background
from web.db import (
    db_init,
    delete_user_by_id,
@@ -60,6 +60,26 @@ app.static_folder = "static"
csrf = CSRFProtect(app)


def _scheduler_enabled() -> bool:
    flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
    if flag not in {"1", "true", "yes", "on"}:
        return False
    # Avoid starting scheduler in Gunicorn workers (master hook handles it).
    server_software = (os.environ.get("SERVER_SOFTWARE") or "").lower()
    if "gunicorn" in server_software:
        return False
    # Avoid starting twice under Flask reloader.
    if os.environ.get("FLASK_RUN_FROM_CLI") == "true" and os.environ.get("WERKZEUG_RUN_MAIN") != "true":
        return False
    return True


@app.before_first_request
def _start_scheduler_if_enabled():
    if _scheduler_enabled():
        start_scheduler_in_background()


def require_admin():
    username = session.get('username')
    if not username:
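One caveat worth flagging: `before_first_request` was deprecated in Flask 2.2 and removed in Flask 2.3. If the project runs a newer Flask, a minimal alternative (sketch only, reusing the helper names added in this hunk) is to start the scheduler at module import rather than on the first request:

```python
# Sketch, assuming the module-level `app` and the helpers shown above, on Flask >= 2.3
# where @app.before_first_request no longer exists. _scheduler_enabled() already
# guards against the Gunicorn-worker and reloader-parent cases.
if _scheduler_enabled():
    start_scheduler_in_background()
```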


@@ -1,4 +1,6 @@
from datetime import datetime, timezone
import threading
import types
from web.scraper import process_region_keyword, scrape_job_page
from web.db import (
    db_init,
@@ -269,7 +271,10 @@ def scrape_jobs_with_retry(max_retries=3):
"""Run the scraping process with retry logic for failures.""" """Run the scraping process with retry logic for failures."""
for attempt in range(max_retries): for attempt in range(max_retries):
try: try:
scraper() result = scraper()
if isinstance(result, types.GeneratorType):
for _ in result:
pass
return True return True
except Exception as e: except Exception as e:
if attempt < max_retries - 1: if attempt < max_retries - 1:
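Draining the result matters because a generator-based scraper does no work until it is iterated; a tiny standalone illustration (not project code):

```python
def lazy_scraper():
    # The body does not execute until the generator is iterated.
    print("scraping...")
    yield "job-1"


result = lazy_scraper()  # nothing has run yet
for _ in result:         # iteration drives the actual work
    pass                 # "scraping..." is printed here
```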
@@ -291,6 +296,22 @@ def start_scheduler():
        time.sleep(60)  # Check every minute


_scheduler_thread: threading.Thread | None = None
_scheduler_lock = threading.Lock()


def start_scheduler_in_background() -> threading.Thread:
    """Start the scheduler loop in a daemon thread (idempotent)."""
    global _scheduler_thread
    with _scheduler_lock:
        if _scheduler_thread and _scheduler_thread.is_alive():
            return _scheduler_thread
        thread = threading.Thread(target=start_scheduler, daemon=True)
        thread.start()
        _scheduler_thread = thread
        return thread


def run_scheduled_scraping():
    """Run the scheduled scraping process."""
    try: