extending logging

2025-09-17 15:53:28 +02:00
parent 94730439a2
commit b6f9d39ad8
2 changed files with 18 additions and 11 deletions

View File

@@ -25,7 +25,7 @@ def fetch_listings():
     """Fetch job listings from all regions and keywords."""
     # We'll collect URLs discovered in this run and then remove any DB listings
     # not present in this set (treat DB as reflecting current search results).
-    existing_db_urls = set(db_get_all_job_urls())
+    existing_db_urls = set(row['url'] for row in db_get_all_job_urls())
     discovered_urls = set()
     new_rows = []
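
Since db_get_all_job_urls() now returns dicts rather than bare URL strings, fetch_listings() has to pull the url key out of each row before building its membership set. A minimal sketch of that consumption, with made-up rows standing in for the real database result:

    rows = [
        {"url": "https://example.com/jobs/1", "region": "berlin", "keyword": "python"},
        {"url": "https://example.com/jobs/2", "region": "munich", "keyword": "data engineer"},
    ]

    # Same expression as in fetch_listings(): keep only the URLs for membership tests.
    existing_db_urls = set(row['url'] for row in rows)
    assert existing_db_urls == {"https://example.com/jobs/1", "https://example.com/jobs/2"}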
@@ -98,7 +98,7 @@ def fetch_listings():
     return {"discovered": len(discovered_urls), "new": len(new_rows)}


-def process_job_url(job_url: str):
+def process_job_url(job_url: str, region: str = "", keyword: str = ""):
     try:
         job_id = url_to_job_id(job_url)
         yield f"Fetching job page: {job_url}\n"
@@ -113,7 +113,7 @@ def process_job_url(job_url: str):
         job_data = scrape_job_page(content, job_url)
         if job_data:
             yield f"Upserting job details for {job_id}\n"
-            upsert_job_details(job_data)
+            upsert_job_details(job_data, region=region, keyword=keyword)
             upsert_user_interaction(
                 job_id, seen_at=datetime.now(timezone.utc).isoformat())
             yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
@@ -141,9 +141,11 @@ def scraper():
     job_urls = db_get_all_job_urls()
     yield f"Processing {len(job_urls)} job pages...\n"
-    for i, url in enumerate(job_urls, 1):
+    i = 0
+    for row in job_urls:
+        i += 1
         yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
-        for message in process_job_url(url):
+        for message in process_job_url(job_url=row["url"], region=row["region"], keyword=row["keyword"]):
             yield message
     yield "\nScraping completed successfully!\n"

View File

@@ -272,7 +272,7 @@ def get_last_fetch_time(page_url: str) -> Optional[datetime]:
     return None


-def upsert_job_details(job_data: Dict[str, Any]):
+def upsert_job_details(job_data: Dict[str, Any], region: str = "", keyword: str = ""):
     """Upsert into job_descriptions table using scraped job details dict.

     Behavior additions:
@@ -356,7 +356,7 @@ SELECT l.job_id
     ,d.posted_time
     ,l.url
 FROM job_listings AS l
 INNER JOIN job_descriptions AS d
     ON l.job_id = d.job_id
     AND l.url = d.url
 ORDER BY d.posted_time DESC
@@ -381,11 +381,16 @@ ORDER BY d.posted_time DESC
     return jobs


-def db_get_all_job_urls() -> List[str]:
-    """Return list of job URLs from job_listings."""
+def db_get_all_job_urls() -> List[dict]:
+    """Return list of job URLs from job_listings.
+
+    Returns:
+    - List of dicts with keys: url, region, keyword
+    """
     with _ensure_session() as session:
-        rows = session.execute(text("SELECT url FROM job_listings")).fetchall()
-        return [r[0] for r in rows]
+        rows = session.execute(
+            text("SELECT url, region, keyword FROM job_listings")).fetchall()
+        return [{"url": r[0], "region": r[1], "keyword": r[2]} for r in rows]


 def db_delete_job(job_id: str | int):
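
For reference, a self-contained sketch of the shape db_get_all_job_urls() now returns, run against a throwaway in-memory SQLite table rather than the application's session and full job_listings schema (only the columns the query selects are created; the sample row is invented):

    from sqlalchemy import create_engine, text

    engine = create_engine("sqlite://")
    with engine.begin() as conn:
        # Minimal stand-in for the real job_listings table.
        conn.execute(text("CREATE TABLE job_listings (url TEXT, region TEXT, keyword TEXT)"))
        conn.execute(text(
            "INSERT INTO job_listings (url, region, keyword) "
            "VALUES ('https://example.com/jobs/1', 'berlin', 'python')"))

    with engine.connect() as session:
        rows = session.execute(
            text("SELECT url, region, keyword FROM job_listings")).fetchall()
        listings = [{"url": r[0], "region": r[1], "keyword": r[2]} for r in rows]

    print(listings)
    # [{'url': 'https://example.com/jobs/1', 'region': 'berlin', 'keyword': 'python'}]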