implement automated job scraping scheduler with retry logic and logging
This commit is contained in:
@@ -12,7 +12,8 @@ from web.db import (
|
||||
insert_log,
|
||||
get_last_fetch_time,
|
||||
)
|
||||
|
||||
import schedule
|
||||
import time
|
||||
# Import utility functions
|
||||
from web.utils import (
|
||||
get_base_url,
|
||||
@@ -165,5 +166,42 @@ def scraper():
|
||||
yield "\nScraping completed successfully!\n"
|
||||
|
||||
|
||||
def scrape_jobs_with_retry(max_retries=3):
    """Run the scraping process with retry logic for failures.

    Args:
        max_retries: Maximum number of attempts before giving up.

    Returns:
        True if a scraping run completed, False if every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            # scraper() is a generator that yields progress strings; it must
            # be consumed, otherwise no scraping work actually executes.
            for _ in scraper():
                pass
            return True
        except Exception:
            # Log the failure instead of silently discarding it.
            logging.exception(
                "Scraping attempt %d/%d failed", attempt + 1, max_retries
            )
            if attempt < max_retries - 1:
                # Exponential backoff between attempts: 1 min, 2 min, 4 min, ...
                time.sleep(2 ** attempt * 60)
    return False
|
||||
|
||||
|
||||
def start_scheduler():
    """Start the scheduler so scraping runs once every hour.

    This call never returns: it wipes any previously registered jobs,
    registers the hourly scraping job, and then polls pending jobs in a
    one-minute loop.
    """
    # Drop any jobs registered earlier (e.g. at module import time) so the
    # hourly job is not duplicated.
    schedule.clear()

    schedule.every().hour.do(scrape_jobs_with_retry)

    # Poll once per minute, forever.
    poll_interval_seconds = 60
    while True:
        schedule.run_pending()
        time.sleep(poll_interval_seconds)
|
||||
|
||||
|
||||
def run_scheduled_scraping():
    """Run one scheduled scraping pass without letting an error escape.

    Exceptions are logged rather than propagated, so a single failed run
    cannot kill the scheduler loop that invokes this job.
    """
    try:
        scrape_jobs_with_retry()
    except Exception:
        # Log instead of silently swallowing (`pass` hid every failure);
        # still never re-raise, preserving the keep-running contract.
        logging.exception("Scheduled scraping run failed")
|
||||
|
||||
|
||||
# Initialize scheduler when module is imported.
# NOTE(review): registering a job at import time is a side effect; any caller
# that later invokes start_scheduler() will have this job removed by its
# schedule.clear() — confirm this double registration is intentional.
schedule.every().hour.do(run_scheduled_scraping)


if __name__ == "__main__":
    # scraper() is a generator (it yields progress strings); iterate it so
    # the scraping work actually runs, streaming its messages to stdout.
    # A bare scraper() call would create the generator and do nothing.
    for message in scraper():
        print(message, end="")
|
||||
|
||||
Reference in New Issue
Block a user