implement automated job scraping scheduler with retry logic and logging
This commit is contained in:
@@ -12,7 +12,8 @@ from web.db import (
|
||||
insert_log,
|
||||
get_last_fetch_time,
|
||||
)
|
||||
|
||||
import schedule
|
||||
import time
|
||||
# Import utility functions
|
||||
from web.utils import (
|
||||
get_base_url,
|
||||
@@ -165,5 +166,42 @@ def scraper():
|
||||
yield "\nScraping completed successfully!\n"
|
||||
|
||||
|
||||
def scrape_jobs_with_retry(max_retries=3):
    """Run the scraping process with retry logic for failures.

    Args:
        max_retries: Maximum number of attempts before giving up.

    Returns:
        True if a scraping run completed, False if every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            # scraper() is a generator that yields progress strings; it must
            # be consumed, otherwise no scraping work actually executes.
            for _ in scraper():
                pass
            return True
        except Exception:
            # Log the failure instead of silently discarding it.
            logging.exception(
                "Scraping attempt %d/%d failed", attempt + 1, max_retries
            )
            if attempt < max_retries - 1:
                # Exponential backoff between attempts: 1 min, 2 min, 4 min, ...
                time.sleep(2 ** attempt * 60)
    return False
|
||||
|
||||
|
||||
def start_scheduler():
    """Start the scheduler so scraping runs once every hour.

    This call never returns: it wipes any previously registered jobs,
    registers the hourly scraping job, and then polls pending jobs in a
    one-minute loop.
    """
    # Drop any jobs registered earlier (e.g. at module import time) so the
    # hourly job is not duplicated.
    schedule.clear()

    schedule.every().hour.do(scrape_jobs_with_retry)

    # Poll once per minute, forever.
    poll_interval_seconds = 60
    while True:
        schedule.run_pending()
        time.sleep(poll_interval_seconds)
|
||||
|
||||
|
||||
def run_scheduled_scraping():
    """Run one scheduled scraping pass without letting an error escape.

    Exceptions are logged rather than propagated, so a single failed run
    cannot kill the scheduler loop that invokes this job.
    """
    try:
        scrape_jobs_with_retry()
    except Exception:
        # Log instead of silently swallowing (`pass` hid every failure);
        # still never re-raise, preserving the keep-running contract.
        logging.exception("Scheduled scraping run failed")
|
||||
|
||||
|
||||
# Initialize scheduler when module is imported.
# NOTE(review): registering a job at import time is a side effect; any caller
# that later invokes start_scheduler() will have this job removed by its
# schedule.clear() — confirm this double registration is intentional.
schedule.every().hour.do(run_scheduled_scraping)


if __name__ == "__main__":
    # scraper() is a generator (it yields progress strings); iterate it so
    # the scraping work actually runs, streaming its messages to stdout.
    # A bare scraper() call would create the generator and do nothing.
    for message in scraper():
        print(message, end="")
|
||||
|
||||
Reference in New Issue
Block a user