feat: implement background scheduler for job scraping with Gunicorn support
.gitignore (vendored, 1 line changed)
@@ -166,4 +166,5 @@ cython_debug/
 docs/online.md
 .github/copilot*
 .github/TODO.md
 .github/instructions/
+.vscode/launch.json
README.md (12 lines changed)
@@ -46,7 +46,14 @@ This layered design makes it straightforward to extend the scraper to new source
 
 ## Scheduler Configuration
 
-The application includes an automated scheduler that runs the job scraping process every hour. The scheduler is implemented in `web/craigslist.py` and includes:
+The application includes an automated scheduler that runs the job scraping process every hour. The scheduler is implemented in `web/craigslist.py` and can run alongside the web app when explicitly enabled.
 
+**Enable background scheduling** by setting the environment variable `SCRAPE_SCHEDULER_ENABLED=1`.
+
+- **Gunicorn**: the scheduler starts once in the Gunicorn master process (see `gunicorn.conf.py`). Worker processes skip scheduler startup to avoid duplicate runs.
+- **Flask dev server**: the scheduler starts on the first request (to avoid the reloader starting it twice).
+
+When enabled, the scheduler includes:
+
 - **Automatic Scheduling**: Scraping runs every hour automatically
 - **Failure Handling**: Retry logic with exponential backoff (up to 3 attempts)
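This commit shows only the tail of `start_scheduler()` (the `time.sleep(60)` poll further down), not the loop itself. As orientation, an hourly loop built on the `schedule` library, which the tests import from `web.craigslist`, typically has the following shape; the job name `run_scheduled_scraping` comes from this diff, everything else is illustrative:

```python
# Illustrative sketch only: the real start_scheduler() lives in
# web/craigslist.py and is not shown in full in this commit.
import time

import schedule


def run_scheduled_scraping():
    """Placeholder for the real scraping entry point in web/craigslist.py."""


schedule.every().hour.do(run_scheduled_scraping)  # register the hourly job

while True:
    schedule.run_pending()  # run any job whose interval has elapsed
    time.sleep(60)          # poll once a minute, matching start_scheduler()
```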
@@ -107,15 +114,12 @@ When scraping individual job listings, the following contact information is extr
 The `extract_contact_info()` function intelligently parses various types of reply URLs:
 
 1. **Mailto Links**: `mailto:jobs@company.com?subject=...`
-
    - Extracts the email address directly
 
 2. **Phone Links**: `tel:+1234567890`
-
    - Extracts the phone number
 
 3. **URL Parameters**: `https://apply.company.com?email=hr@company.com&phone=555-1234&name=HR%20Team`
-
    - Searches for common parameter names: `email`, `phone`, `contact_name`, etc.
 
 4. **Graceful Fallback**: If contact information cannot be extracted, the fields are set to `"N/A"`
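The `extract_contact_info()` implementation itself is not part of this commit. A minimal standard-library sketch of the parsing rules documented above (the function name and return shape here are hypothetical) could look like:

```python
# Hypothetical re-implementation of the documented parsing rules; the
# real extract_contact_info() is not shown in this diff.
from urllib.parse import parse_qs, urlparse


def extract_contact_info_sketch(reply_url: str) -> dict:
    info = {"email": "N/A", "phone": "N/A", "contact_name": "N/A"}
    parsed = urlparse(reply_url)
    if parsed.scheme == "mailto":
        info["email"] = parsed.path    # mailto:jobs@company.com?subject=...
    elif parsed.scheme == "tel":
        info["phone"] = parsed.path    # tel:+1234567890
    else:
        params = parse_qs(parsed.query)  # values arrive percent-decoded
        info["email"] = params.get("email", ["N/A"])[0]
        info["phone"] = params.get("phone", ["N/A"])[0]
        name = params.get("contact_name") or params.get("name") or ["N/A"]
        info["contact_name"] = name[0]
    return info


# extract_contact_info_sketch(
#     "https://apply.company.com?email=hr@company.com&phone=555-1234&name=HR%20Team")
# -> {"email": "hr@company.com", "phone": "555-1234", "contact_name": "HR Team"}
```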
gunicorn.conf.py
@@ -36,3 +36,17 @@ certfile = None
 # Application
 wsgi_module = "web.app:app"
 callable = "app"
+
+
+def when_ready(server):
+    """Start background scheduler once in Gunicorn master if enabled."""
+    import os
+    flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
+    if flag not in {"1", "true", "yes", "on"}:
+        return
+    try:
+        from web.craigslist import start_scheduler_in_background
+        start_scheduler_in_background()
+        server.log.info("Background scraper scheduler started in Gunicorn master.")
+    except Exception as exc:
+        server.log.warning("Failed to start background scraper scheduler: %s", exc)
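Design note: `when_ready` is a standard Gunicorn server hook that runs once in the master process just after the server starts. Because workers are forked afterwards and forked children do not inherit running threads, the daemon scheduler thread (and the scraping it drives) lives only in the master. To exercise the hook locally, a command along the lines of `SCRAPE_SCHEDULER_ENABLED=1 gunicorn -c gunicorn.conf.py web.app:app` (target taken from `wsgi_module` above) should log the startup message.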
@@ -39,6 +39,24 @@ class TestScheduler:
         from web.craigslist import schedule
         assert schedule is not None
 
+    @patch('web.craigslist.threading.Thread')
+    def test_start_scheduler_in_background_idempotent(self, mock_thread):
+        """Ensure background scheduler starts only once per process."""
+        import web.craigslist as craigslist
+
+        thread_instance = MagicMock()
+        thread_instance.is_alive.return_value = True
+        mock_thread.return_value = thread_instance
+
+        craigslist._scheduler_thread = None
+
+        first = craigslist.start_scheduler_in_background()
+        second = craigslist.start_scheduler_in_background()
+
+        assert first is second
+        mock_thread.assert_called_once()
+        thread_instance.start.assert_called_once()
+
     @patch('web.craigslist.db_get_all_job_urls')
     @patch('web.craigslist.seed_regions_keywords_from_listings')
     @patch('web.craigslist.get_all_regions')
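The mock setup is what makes this idempotence check meaningful: patching `threading.Thread` keeps any real thread from starting, and forcing `is_alive()` to return `True` sends the second call down the early-return path, so `first is second` together with the single construction and single `start()` call pins down once-per-process behavior.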
web/app.py (22 lines changed)
@@ -4,7 +4,7 @@ from flask_wtf import CSRFProtect
 from typing import Dict, List
 from datetime import datetime, timezone
 
-from web.craigslist import scraper
+from web.craigslist import scraper, start_scheduler_in_background
 from web.db import (
     db_init,
     delete_user_by_id,
@@ -60,6 +60,26 @@ app.static_folder = "static"
 csrf = CSRFProtect(app)
 
 
+def _scheduler_enabled() -> bool:
+    flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
+    if flag not in {"1", "true", "yes", "on"}:
+        return False
+    # Avoid starting scheduler in Gunicorn workers (master hook handles it).
+    server_software = (os.environ.get("SERVER_SOFTWARE") or "").lower()
+    if "gunicorn" in server_software:
+        return False
+    # Avoid starting twice under Flask reloader.
+    if os.environ.get("FLASK_RUN_FROM_CLI") == "true" and os.environ.get("WERKZEUG_RUN_MAIN") != "true":
+        return False
+    return True
+
+
+@app.before_first_request
+def _start_scheduler_if_enabled():
+    if _scheduler_enabled():
+        start_scheduler_in_background()
+
+
 def require_admin():
     username = session.get('username')
     if not username:
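One caveat worth flagging: `@app.before_first_request` was deprecated in Flask 2.2 and removed in Flask 2.3, so this hook will not register on current Flask releases. A sketch of an equivalent once-guarded `before_request`, assuming the surrounding `web/app.py` context (the guard variable `_scheduler_started` is hypothetical):

```python
# Hypothetical Flask >= 2.3 replacement for the before_first_request hook;
# assumes app, _scheduler_enabled, and start_scheduler_in_background from
# the surrounding web/app.py.
_scheduler_started = False


@app.before_request
def _start_scheduler_once():
    global _scheduler_started
    if not _scheduler_started and _scheduler_enabled():
        start_scheduler_in_background()
        _scheduler_started = True
```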
web/craigslist.py
@@ -1,4 +1,6 @@
 from datetime import datetime, timezone
+import threading
+import types
 from web.scraper import process_region_keyword, scrape_job_page
 from web.db import (
     db_init,
@@ -269,7 +271,10 @@ def scrape_jobs_with_retry(max_retries=3):
     """Run the scraping process with retry logic for failures."""
     for attempt in range(max_retries):
         try:
-            scraper()
+            result = scraper()
+            if isinstance(result, types.GeneratorType):
+                for _ in result:
+                    pass
             return True
         except Exception as e:
             if attempt < max_retries - 1:
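The switch from a bare `scraper()` call to draining `result` is the substantive fix here: if `scraper` is a generator function, calling it merely creates a generator object and none of the scraping body executes until iteration. A standalone illustration:

```python
# Generator functions run no code until iterated, which is why the
# retry wrapper above must drain scraper()'s result.
import types


def scraper_like():
    print("scraping...")  # stands in for the real side effects
    yield "job-1"


result = scraper_like()  # nothing has printed yet
assert isinstance(result, types.GeneratorType)

for _ in result:  # the body runs only during iteration
    pass          # "scraping..." is printed here
```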
@@ -291,6 +296,22 @@ def start_scheduler():
         time.sleep(60) # Check every minute
 
 
+_scheduler_thread: threading.Thread | None = None
+_scheduler_lock = threading.Lock()
+
+
+def start_scheduler_in_background() -> threading.Thread:
+    """Start the scheduler loop in a daemon thread (idempotent)."""
+    global _scheduler_thread
+    with _scheduler_lock:
+        if _scheduler_thread and _scheduler_thread.is_alive():
+            return _scheduler_thread
+        thread = threading.Thread(target=start_scheduler, daemon=True)
+        thread.start()
+        _scheduler_thread = thread
+        return thread
+
+
 def run_scheduled_scraping():
     """Run the scheduled scraping process."""
     try:
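Usage-wise, the lock plus the `is_alive()` check means callers never need to coordinate: repeated calls from the Gunicorn hook, the Flask hook, or tests all converge on a single daemon thread (daemon so it never blocks interpreter shutdown). A quick interactive check, assuming `web.craigslist` is importable in the current environment:

```python
# Assumes web.craigslist is importable in the current environment.
from web.craigslist import start_scheduler_in_background

t1 = start_scheduler_in_background()
t2 = start_scheduler_in_background()  # returns the already-running thread

assert t1 is t2
assert t1.daemon  # won't keep the process alive on shutdown
```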