remove caching

georg.sinn-schirwitz
2025-09-08 14:44:46 +02:00
parent f8e23d0fba
commit 042a196718
13 changed files with 144 additions and 525 deletions


@@ -2,31 +2,19 @@ from datetime import datetime, timezone
 from web.scraper import process_region_keyword, scrape_job_page
 from web.db import (
     db_init,
-    upsert_cached_page,
     upsert_listing,
     upsert_job_details,
     url_to_job_id,
     upsert_user_interaction,
-    db_remove_cached_url,
-    db_sync_cached_pages,
     db_get_all_job_urls,
-    db_get_cache_url,
     db_delete_job,
     remove_job,
-    normalize_cached_page_paths,
 )
 # Import utility functions
 from web.utils import (
-    get_cache_dir,
     make_request_with_retry,
     now_iso,
-    get_cache_path,
-    cache_page,
-    is_cache_stale,
-    delete_cached_page,
-    get_cached_content,
-    ensure_cache_dir
 )
 from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings
@@ -45,15 +33,26 @@ def fetch_listings():
     except Exception:
         pass
     yield "Initializing database and seeding regions/keywords...\n"
-    # Fetch listings for each region/keyword from DB
-    for region in get_all_regions():
+    regions = get_all_regions()
+    keywords = get_all_keywords()
+    total_combinations = len(regions) * len(keywords)
+    processed = 0
+    yield f"Found {len(regions)} regions and {len(keywords)} keywords. Processing {total_combinations} combinations...\n"
+    for region in regions:
         region_name = region.get("name")
         if not region_name:
             continue
-        for keyword in get_all_keywords():
+        for keyword in keywords:
             keyword_name = keyword.get("name")
             if not keyword_name:
                 continue
+            processed += 1
+            yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
             for row in process_region_keyword(region_name, keyword_name, discovered_urls):
                 timestamp, region, keyword, title, pay, location, url = row
                 discovered_urls.add(url)
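
Note on the new control flow: fetch_listings is now a generator that streams progress strings and still ends by returning a summary dict, while the rewritten scraper() below forwards the messages with a plain for loop and so discards that return value. A minimal sketch of a caller that keeps both, assuming only the fetch_listings shown in this diff (run_fetch is a hypothetical name, not part of the commit):

# Hypothetical caller, not in this commit: "yield from" forwards the progress
# messages to the consumer and also captures the dict that fetch_listings() returns.
def run_fetch():
    summary = yield from fetch_listings()
    yield f"Listings summary: {summary}\n"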
@@ -72,75 +71,68 @@ def fetch_listings():
     # Remove stale listings: those present in DB but not discovered now.
     stale_urls = existing_db_urls - discovered_urls
-    for url in stale_urls:
-        try:
-            jid = url_to_job_id(url)
-            db_delete_job(jid)
-            # Also try to remove cached file and its metadata
-            delete_cached_page(url)
-            db_remove_cached_url(url)
-        except Exception:
-            pass
+    if stale_urls:
+        yield f"Removing {len(stale_urls)} stale listings...\n"
+        for url in stale_urls:
+            try:
+                jid = url_to_job_id(url)
+                db_delete_job(jid)
+            except Exception:
+                pass
+    yield f"Listing fetch complete: {len(discovered_urls)} discovered, {len(new_rows)} new, {len(stale_urls)} stale\n"
     return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}


 def process_job_url(job_url: str):
     try:
         job_id = url_to_job_id(job_url)
-        content = None
-        cached_page = db_get_cache_url(job_url)
-        if cached_page:
-            last_modified = cached_page.get("last_modified")
-            if last_modified and not is_cache_stale(last_modified):
-                content = get_cached_content(job_url)
-            else:
-                content = make_request_with_retry(job_url, 1)
-        else:
-            content = make_request_with_retry(job_url, 1)
+        yield f"Fetching job page: {job_url}\n"
+        content = make_request_with_retry(job_url, 1)
         if content is None:
+            yield f"Failed to fetch content for {job_url}, removing from database\n"
             remove_job(job_url)
             return None
-        # refresh cache and details
-        cache_page(job_url, content)
-        upsert_cached_page(
-            file_path=get_cache_path(job_url),
-            url_guess=job_url,
-            last_modified=now_iso(),
-            size_bytes=len(content),
-            job_id=job_id
-        )
+        yield f"Scraping job data from {job_url}\n"
         job_data = scrape_job_page(content, job_url)
         if job_data:
+            yield f"Upserting job details for {job_id}\n"
             upsert_job_details(job_data)
             upsert_user_interaction(
                 job_id, seen_at=datetime.now(timezone.utc).isoformat())
+            yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
             return job_data
         else:
+            yield f"Failed to scrape job data from {job_url}\n"
             return None
-    except Exception:
+    except Exception as e:
+        yield f"Error processing {job_url}: {str(e)}\n"
         return None


 def scraper():
     """Main function to run the scraper."""
-    ensure_cache_dir()
+    yield "Starting scraper...\n"
     db_init()
+    yield "Database initialized\n"

-    # First, fetch current listings from search pages and make DB reflect them.
-    jl = fetch_listings()
-    # Sync any cached files we have on disk into the cached_pages table.
-    db_sync_cached_pages(get_cache_dir())
-    # Normalize any relative cached file paths to absolute paths in DB
-    normalized = normalize_cached_page_paths()
-    if normalized:
-        pass
+    yield "Fetching listings...\n"
+    for message in fetch_listings():
+        yield message

     # Finally, fetch and refresh individual job pages for current listings
-    for url in db_get_all_job_urls():
-        process_job_url(url)
+    job_urls = db_get_all_job_urls()
+    yield f"Processing {len(job_urls)} job pages...\n"
+    for i, url in enumerate(job_urls, 1):
+        yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
+        for message in process_job_url(url):
+            yield message
+    yield "\nScraping completed successfully!\n"


 if __name__ == "__main__":