implement automated job scraping scheduler with retry logic and logging

This commit is contained in:
2025-11-01 18:00:59 +01:00
parent 8e3a6f4f41
commit 504dc8e2b0
5 changed files with 109 additions and 1 deletion

View File

@@ -12,7 +12,8 @@ from web.db import (
insert_log,
get_last_fetch_time,
)
import logging
import schedule
import time
# Import utility functions
from web.utils import (
get_base_url,
@@ -165,5 +166,42 @@ def scraper():
yield "\nScraping completed successfully!\n"
def scrape_jobs_with_retry(max_retries=3):
    """Run the scraping process with retry logic for failures.

    Args:
        max_retries: Maximum number of attempts before giving up.

    Returns:
        True if a scraping run completed successfully, False if every
        attempt raised an exception.
    """
    for attempt in range(max_retries):
        try:
            # scraper() is a generator (it yields progress messages), so it
            # must be consumed for the scraping work to actually run. The
            # original code only created the generator object and discarded
            # it, which meant no scraping ever happened here.
            for _ in scraper():
                pass
            return True
        except Exception:
            # Record the failure instead of silently swallowing it, then
            # back off exponentially (1 min, 2 min, 4 min, ...) before the
            # next attempt.
            logging.getLogger(__name__).exception(
                "Scraping attempt %d/%d failed", attempt + 1, max_retries
            )
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt * 60)  # Exponential backoff
    return False
def start_scheduler():
    """Run the hourly scraping scheduler forever (blocking).

    Clears any previously registered jobs, registers an hourly
    ``scrape_jobs_with_retry`` job, then dispatches pending jobs in an
    endless polling loop.
    """
    poll_interval = 60  # seconds between checks for due jobs

    # Drop any stale jobs so repeated calls don't stack duplicate schedules.
    schedule.clear()
    schedule.every().hour.do(scrape_jobs_with_retry)

    # Blocking dispatch loop: fire whatever is due, then wait a minute.
    while True:
        schedule.run_pending()
        time.sleep(poll_interval)
def run_scheduled_scraping():
    """Run one scheduled scraping pass, logging (not raising) any failure.

    Used as a `schedule` job callback, so exceptions must not propagate —
    but they must not be silently discarded either, or failed runs become
    invisible.
    """
    try:
        scrape_jobs_with_retry()
    except Exception:
        # The original `pass` swallowed every error; log it so operators
        # can see when scheduled runs fail.
        logging.getLogger(__name__).exception("Scheduled scraping run failed")
# Initialize scheduler when module is imported
# NOTE(review): this is a module-level side effect — every import of this
# module registers another hourly job (schedule's registry is global), and
# nothing here runs the scheduler loop. Consider moving this registration
# into start_scheduler(), which already calls schedule.clear() first.
schedule.every().hour.do(run_scheduled_scraping)
if __name__ == "__main__":
    # scraper() is a generator that yields progress messages; it must be
    # iterated for any scraping to happen. The original call only created
    # the generator and discarded it, so running the script did nothing.
    # Echo each progress message to stdout as it arrives.
    for message in scraper():
        print(message, end="")