diff --git a/web/craigslist.py b/web/craigslist.py
index 9816ffe..e5eeb41 100644
--- a/web/craigslist.py
+++ b/web/craigslist.py
@@ -25,7 +25,7 @@ def fetch_listings():
     """Fetch job listings from all regions and keywords."""
     # We'll collect URLs discovered in this run and then remove any DB listings
     # not present in this set (treat DB as reflecting current search results).
-    existing_db_urls = set(db_get_all_job_urls())
+    existing_db_urls = set(row['url'] for row in db_get_all_job_urls())
     discovered_urls = set()
     new_rows = []
 
@@ -98,7 +98,7 @@ def fetch_listings():
     return {"discovered": len(discovered_urls), "new": len(new_rows)}
 
 
-def process_job_url(job_url: str):
+def process_job_url(job_url: str, region: str = "", keyword: str = ""):
     try:
         job_id = url_to_job_id(job_url)
         yield f"Fetching job page: {job_url}\n"
@@ -113,7 +113,7 @@ def process_job_url(job_url: str):
         job_data = scrape_job_page(content, job_url)
         if job_data:
             yield f"Upserting job details for {job_id}\n"
-            upsert_job_details(job_data)
+            upsert_job_details(job_data, region=region, keyword=keyword)
             upsert_user_interaction(
                 job_id, seen_at=datetime.now(timezone.utc).isoformat())
             yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
@@ -141,9 +141,11 @@ def scraper():
     job_urls = db_get_all_job_urls()
     yield f"Processing {len(job_urls)} job pages...\n"
 
-    for i, url in enumerate(job_urls, 1):
+    i = 0
+    for row in job_urls:
+        i += 1
         yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
-        for message in process_job_url(url):
+        for message in process_job_url(job_url=row["url"], region=row["region"], keyword=row["keyword"]):
             yield message
 
     yield "\nScraping completed successfully!\n"
diff --git a/web/db.py b/web/db.py
index 2d61576..627a189 100644
--- a/web/db.py
+++ b/web/db.py
@@ -272,7 +272,7 @@ def get_last_fetch_time(page_url: str) -> Optional[datetime]:
     return None
 
 
-def upsert_job_details(job_data: Dict[str, Any]):
+def upsert_job_details(job_data: Dict[str, Any], region: str = "", keyword: str = ""):
     """Upsert into job_descriptions table using scraped job details dict.
 
     Behavior additions:
@@ -356,7 +356,7 @@ SELECT l.job_id
 ,d.posted_time
 ,l.url
 FROM job_listings AS l
-INNER JOIN job_descriptions AS d 
+INNER JOIN job_descriptions AS d
 ON l.job_id = d.job_id
 AND l.url = d.url
 ORDER BY d.posted_time DESC
@@ -381,11 +381,16 @@ ORDER BY d.posted_time DESC
     return jobs
 
 
-def db_get_all_job_urls() -> List[str]:
-    """Return list of job URLs from job_listings."""
+def db_get_all_job_urls() -> List[dict]:
+    """Return list of job URLs from job_listings.
+
+    Returns:
+    - List of dicts with keys: url, region, keyword
+    """
     with _ensure_session() as session:
-        rows = session.execute(text("SELECT url FROM job_listings")).fetchall()
-        return [r[0] for r in rows]
+        rows = session.execute(
+            text("SELECT url, region, keyword FROM job_listings")).fetchall()
+        return [{"url": r[0], "region": r[1], "keyword": r[2]} for r in rows]
 
 
 def db_delete_job(job_id: str | int):
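
Note (not part of the patch): a minimal sketch of how the reshaped db_get_all_job_urls() return value is consumed after this change, assuming the dict keys introduced above (url, region, keyword). The sample row values are hypothetical.

    # Hypothetical sample of the new return shape of db_get_all_job_urls().
    rows = [
        {"url": "https://example.craigslist.org/abc/1234.html",
         "region": "example-region", "keyword": "example-keyword"},
    ]

    # Mirrors the fetch_listings() change: build a set of known URLs from dict rows.
    existing_db_urls = set(row["url"] for row in rows)

    # Mirrors the scraper() loop: each row carries the region/keyword to pass through.
    for i, row in enumerate(rows, 1):
        print(f"--- Processing job {i}/{len(rows)} ---")
        # process_job_url(job_url=row["url"], region=row["region"], keyword=row["keyword"])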