initial project commit

2025-08-29 15:07:58 +02:00
parent 38708e6d1d
commit 23a67d7fe1
31 changed files with 3433 additions and 0 deletions
--- a/web/craigslist.py
+++ b/web/craigslist.py
@@ -0,0 +1,147 @@
+import logging
+from datetime import datetime, timezone
+
+from web.scraper import process_region_keyword, scrape_job_page
+from web.db import (
+    db_init,
+    upsert_cached_page,
+    upsert_listing,
+    upsert_job_details,
+    url_to_job_id,
+    upsert_user_interaction,
+    db_remove_cached_url,
+    db_sync_cached_pages,
+    db_get_all_job_urls,
+    db_get_cache_url,
+    db_delete_job,
+    remove_job,
+    normalize_cached_page_paths,
+)
+
+# Import utility functions
+from web.utils import (
+    get_cache_dir,
+    make_request_with_retry,
+    now_iso,
+    get_cache_path,
+    cache_page,
+    is_cache_stale,
+    delete_cached_page,
+    get_cached_content,
+    ensure_cache_dir
+)
+from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings
+
+
+def fetch_listings():
+    """Fetch job listings from all regions and keywords."""
+    # We'll collect URLs discovered in this run and then remove any DB listings
+    # not present in this set (treat DB as reflecting current search results).
+    existing_db_urls = set(db_get_all_job_urls())
+    discovered_urls = set()
+    new_rows = []
+
+    # Ensure regions/keywords master lists exist
+    try:
+        seed_regions_keywords_from_listings()
+    except Exception:
+        pass
+
+    # Fetch listings for each region/keyword from DB
+    for region in get_all_regions():
+        region_name = region.get("name")
+        if not region_name:
+            continue
+        for keyword in get_all_keywords():
+            keyword_name = keyword.get("name")
+            if not keyword_name:
+                continue
+            for row in process_region_keyword(region_name, keyword_name, discovered_urls):
+                timestamp, region, keyword, title, pay, location, url = row
+                discovered_urls.add(url)
+                if url not in existing_db_urls:
+                    new_rows.append(row)
+                # Upsert or update listing to reflect current search result
+                upsert_listing(
+                    url=url,
+                    region=region,
+                    keyword=keyword,
+                    title=title,
+                    pay=pay,
+                    location=location,
+                    timestamp=timestamp,
+                )
+
+    # Remove stale listings: those present in DB but not discovered now.
+    stale_urls = existing_db_urls - discovered_urls
+    for url in stale_urls:
+        try:
+            jid = url_to_job_id(url)
+            db_delete_job(jid)
+            # Also try to remove cached file and its metadata
+            delete_cached_page(url)
+            db_remove_cached_url(url)
+        except Exception:
+            pass
+
+    return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}
+
+
+def process_job_url(job_url: str):
+    try:
+        job_id = url_to_job_id(job_url)
+        content = None
+        cached_page = db_get_cache_url(job_url)
+        if cached_page:
+            last_modified = cached_page.get("last_modified")
+            if last_modified and not is_cache_stale(last_modified):
+                content = get_cached_content(job_url)
+            else:
+                content = make_request_with_retry(job_url, 1)
+        else:
+            content = make_request_with_retry(job_url, 1)
+
+        if content is None:
+            remove_job(job_url)
+            return None
+
+        # refresh cache and details
+        cache_page(job_url, content)
+        upsert_cached_page(
+            file_path=get_cache_path(job_url),
+            url_guess=job_url,
+            last_modified=now_iso(),
+            size_bytes=len(content),
+            job_id=job_id
+        )
+        job_data = scrape_job_page(content, job_url)
+        if job_data:
+            upsert_job_details(job_data)
+            upsert_user_interaction(
+                job_id, seen_at=datetime.now(timezone.utc).isoformat())
+            return job_data
+        return None
+    except Exception:
+        return None
+
+
+def scraper():
+    """Main function to run the scraper."""
+    ensure_cache_dir()
+    db_init()
+    # First, fetch current listings from search pages and make DB reflect them.
+    jl = fetch_listings()
+
+    # Sync any cached files we have on disk into the cached_pages table.
+    db_sync_cached_pages(get_cache_dir())
+
+    # Normalize any relative cached file paths to absolute paths in DB
+    normalized = normalize_cached_page_paths()
+    if normalized:
+        pass
+
+    # Finally, fetch and refresh individual job pages for current listings
+    for url in db_get_all_job_urls():
+        process_job_url(url)
+
+
+if __name__ == "__main__":  # run a full scrape only when executed as a script
+    scraper()