diff --git a/web/craigslist.py b/web/craigslist.py index 73fede7..b6ee54e 100644 --- a/web/craigslist.py +++ b/web/craigslist.py @@ -15,6 +15,7 @@ from web.db import ( # Import utility functions from web.utils import ( + get_base_url, make_request_with_retry, now_iso, ) @@ -54,9 +55,10 @@ def fetch_listings(): if not keyword_name: continue # Build a canonical search identifier for this region+keyword combination. + url = get_base_url().format(region=region, keyword=keyword_name.replace(" ", "+")) search_page_id = f"search:{region_name}:{keyword_name}" try: - last = get_last_fetch_time(search_page_id) + last = get_last_fetch_time(url) if last is not None: # skip if fetched within the last 24 hours age = datetime.now( @@ -72,7 +74,7 @@ def fetch_listings(): yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n" # record that we're fetching this search page now try: - insert_log(search_page_id, region=region_name, + insert_log(url, region=region_name, keyword=keyword_name, fetched_at=datetime.now(timezone.utc)) except Exception: pass @@ -99,6 +101,15 @@ def fetch_listings(): def process_job_url(job_url: str, region: str = "", keyword: str = ""): + last = get_last_fetch_time(job_url) + if last is not None: + # skip if fetched within the last 24 hours + age = datetime.now( + timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc)) + if age.total_seconds() < 24 * 3600: + yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n" + return None + try: job_id = url_to_job_id(job_url) yield f"Fetching job page: {job_url}\n" diff --git a/web/db.py b/web/db.py index 627a189..d2d2a89 100644 --- a/web/db.py +++ b/web/db.py @@ -14,10 +14,8 @@ Tables: """ from datetime import datetime, UTC -import os from typing import Optional, Dict, Any, List from web.utils import ( - get_url_from_filename, get_color_from_string, url_to_job_id, normalize_job_id,