feat: implement background scheduler for job scraping with Gunicorn support

.gitignore | 1
@@ -166,4 +166,5 @@ cython_debug/
 docs/online.md
 .github/copilot*
 .github/TODO.md
+.github/instructions/
 .vscode/launch.json

README.md | 12
@@ -46,7 +46,14 @@ This layered design makes it straightforward to extend the scraper to new source
 
 ## Scheduler Configuration
 
-The application includes an automated scheduler that runs the job scraping process every hour. The scheduler is implemented in `web/craigslist.py` and includes:
+The application includes an automated scheduler that runs the job scraping process every hour. The scheduler is implemented in `web/craigslist.py` and can run alongside the web app when explicitly enabled.
+
+**Enable background scheduling** by setting the environment variable `SCRAPE_SCHEDULER_ENABLED=1`.
+
+- **Gunicorn**: the scheduler starts once in the Gunicorn master process (see `gunicorn.conf.py`). Worker processes skip scheduler startup to avoid duplicate runs.
+- **Flask dev server**: the scheduler starts on the first request (to avoid the reloader starting it twice).
+
+When enabled, the scheduler includes:
 
 - **Automatic Scheduling**: Scraping runs every hour automatically
 - **Failure Handling**: Retry logic with exponential backoff (up to 3 attempts)
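
For orientation on the hourly cadence described above: the scheduler module imports the third-party `schedule` package (see the test hunk further down) and polls once per minute. Below is a minimal illustrative sketch of such a loop; the actual `start_scheduler()` body is only partially visible in this diff, so treat the wiring here as an assumption.

```python
import time
import schedule  # third-party package: pip install schedule


def run_scheduled_scraping():
    # Stand-in for the real job defined in web/craigslist.py.
    print("scraping...")


def start_scheduler():
    # Run the scrape once per hour; poll for due jobs every minute.
    schedule.every().hour.do(run_scheduled_scraping)
    while True:
        schedule.run_pending()
        time.sleep(60)  # Check every minute
```
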
@@ -107,15 +114,12 @@ When scraping individual job listings, the following contact information is extr
 The `extract_contact_info()` function intelligently parses various types of reply URLs:
 
 1. **Mailto Links**: `mailto:jobs@company.com?subject=...`
-
    - Extracts the email address directly
 
 2. **Phone Links**: `tel:+1234567890`
-
    - Extracts the phone number
 
 3. **URL Parameters**: `https://apply.company.com?email=hr@company.com&phone=555-1234&name=HR%20Team`
-
    - Searches for common parameter names: `email`, `phone`, `contact_name`, etc.
 
 4. **Graceful Fallback**: If contact information cannot be extracted, the fields are set to `"N/A"`
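
The README list above describes the parsing only in prose. A minimal sketch of that approach using the standard-library `urllib.parse`; the function body here is illustrative, not the repository's actual implementation:

```python
from urllib.parse import urlparse, parse_qs


def extract_contact_info(reply_url: str) -> dict:
    # Illustrative only: parse a reply URL into contact fields, defaulting to 'N/A'.
    info = {"email": "N/A", "phone": "N/A", "contact_name": "N/A"}
    if reply_url.startswith("mailto:"):
        # mailto:jobs@company.com?subject=... -> keep only the address part
        info["email"] = reply_url[len("mailto:"):].split("?", 1)[0]
    elif reply_url.startswith("tel:"):
        info["phone"] = reply_url[len("tel:"):]
    else:
        # e.g. https://apply.company.com?email=...&phone=...&name=HR%20Team
        params = parse_qs(urlparse(reply_url).query)
        for key, field in (("email", "email"), ("phone", "phone"),
                           ("contact_name", "contact_name"), ("name", "contact_name")):
            if key in params and info[field] == "N/A":
                info[field] = params[key][0]
    return info
```
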

gunicorn.conf.py
@@ -36,3 +36,17 @@ certfile = None
 # Application
 wsgi_module = "web.app:app"
 callable = "app"
+
+
+def when_ready(server):
+    """Start background scheduler once in Gunicorn master if enabled."""
+    import os
+    flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
+    if flag not in {"1", "true", "yes", "on"}:
+        return
+    try:
+        from web.craigslist import start_scheduler_in_background
+        start_scheduler_in_background()
+        server.log.info("Background scraper scheduler started in Gunicorn master.")
+    except Exception as exc:
+        server.log.warning("Failed to start background scraper scheduler: %s", exc)
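
Both this Gunicorn hook and the Flask-side check in `web/app.py` below normalize the flag the same way. A quick illustrative snippet (not taken from the repository) showing which spellings count as enabled:

```python
# Same normalization as when_ready() / _scheduler_enabled(): trim, lowercase,
# then accept a small set of truthy spellings.
def _enabled(raw):
    return (raw or "").strip().lower() in {"1", "true", "yes", "on"}


assert _enabled("1") and _enabled(" True ") and _enabled("YES") and _enabled("on")
assert not _enabled(None) and not _enabled("") and not _enabled("0") and not _enabled("off")
```
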
@@ -39,6 +39,24 @@ class TestScheduler:
         from web.craigslist import schedule
         assert schedule is not None
 
+    @patch('web.craigslist.threading.Thread')
+    def test_start_scheduler_in_background_idempotent(self, mock_thread):
+        """Ensure background scheduler starts only once per process."""
+        import web.craigslist as craigslist
+
+        thread_instance = MagicMock()
+        thread_instance.is_alive.return_value = True
+        mock_thread.return_value = thread_instance
+
+        craigslist._scheduler_thread = None
+
+        first = craigslist.start_scheduler_in_background()
+        second = craigslist.start_scheduler_in_background()
+
+        assert first is second
+        mock_thread.assert_called_once()
+        thread_instance.start.assert_called_once()
+
     @patch('web.craigslist.db_get_all_job_urls')
     @patch('web.craigslist.seed_regions_keywords_from_listings')
     @patch('web.craigslist.get_all_regions')

web/app.py | 22
@@ -4,7 +4,7 @@ from flask_wtf import CSRFProtect
 from typing import Dict, List
 from datetime import datetime, timezone
 
-from web.craigslist import scraper
+from web.craigslist import scraper, start_scheduler_in_background
 from web.db import (
     db_init,
     delete_user_by_id,
@@ -60,6 +60,26 @@ app.static_folder = "static"
 csrf = CSRFProtect(app)
 
 
+def _scheduler_enabled() -> bool:
+    flag = (os.environ.get("SCRAPE_SCHEDULER_ENABLED") or "").strip().lower()
+    if flag not in {"1", "true", "yes", "on"}:
+        return False
+    # Avoid starting scheduler in Gunicorn workers (master hook handles it).
+    server_software = (os.environ.get("SERVER_SOFTWARE") or "").lower()
+    if "gunicorn" in server_software:
+        return False
+    # Avoid starting twice under Flask reloader.
+    if os.environ.get("FLASK_RUN_FROM_CLI") == "true" and os.environ.get("WERKZEUG_RUN_MAIN") != "true":
+        return False
+    return True
+
+
+@app.before_first_request
+def _start_scheduler_if_enabled():
+    if _scheduler_enabled():
+        start_scheduler_in_background()
+
+
 def require_admin():
     username = session.get('username')
     if not username:

web/craigslist.py
@@ -1,4 +1,6 @@
 from datetime import datetime, timezone
+import threading
+import types
 from web.scraper import process_region_keyword, scrape_job_page
 from web.db import (
     db_init,
@@ -269,7 +271,10 @@ def scrape_jobs_with_retry(max_retries=3):
     """Run the scraping process with retry logic for failures."""
     for attempt in range(max_retries):
         try:
-            scraper()
+            result = scraper()
+            if isinstance(result, types.GeneratorType):
+                for _ in result:
+                    pass
             return True
         except Exception as e:
             if attempt < max_retries - 1:
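
The change above guards against `scraper()` being a generator function: calling one returns a generator object without running its body, so the retry wrapper would otherwise report success having done no work. A small self-contained illustration (the region names are made up):

```python
import types


def scraper():
    # Stand-in for a generator-style scraper that yields progress per region.
    for region in ("sfbay", "newyork"):
        yield region


result = scraper()                      # no scraping has happened yet
assert isinstance(result, types.GeneratorType)
for _ in result:                        # iterating drives the work to completion
    pass
```
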
@@ -291,6 +296,22 @@ def start_scheduler():
         time.sleep(60)  # Check every minute
 
 
+_scheduler_thread: threading.Thread | None = None
+_scheduler_lock = threading.Lock()
+
+
+def start_scheduler_in_background() -> threading.Thread:
+    """Start the scheduler loop in a daemon thread (idempotent)."""
+    global _scheduler_thread
+    with _scheduler_lock:
+        if _scheduler_thread and _scheduler_thread.is_alive():
+            return _scheduler_thread
+        thread = threading.Thread(target=start_scheduler, daemon=True)
+        thread.start()
+        _scheduler_thread = thread
+        return thread
+
+
 def run_scheduled_scraping():
     """Run the scheduled scraping process."""
     try: