"""Scrape job listings and individual job pages, keeping the local DB and page cache in sync."""

from datetime import datetime, timezone

from web.scraper import process_region_keyword, scrape_job_page
from web.db import (
    db_init,
    upsert_cached_page,
    upsert_listing,
    upsert_job_details,
    url_to_job_id,
    upsert_user_interaction,
    db_remove_cached_url,
    db_sync_cached_pages,
    db_get_all_job_urls,
    db_get_cache_url,
    db_delete_job,
    remove_job,
    normalize_cached_page_paths,
    get_all_regions,
    get_all_keywords,
    seed_regions_keywords_from_listings,
)
# Import utility functions
from web.utils import (
    get_cache_dir,
    make_request_with_retry,
    now_iso,
    get_cache_path,
    cache_page,
    is_cache_stale,
    delete_cached_page,
    get_cached_content,
    ensure_cache_dir,
)


def fetch_listings():
    """Fetch job listings from all regions and keywords."""
    # Collect URLs discovered in this run, then remove any DB listings not
    # present in this set (treat the DB as reflecting current search results).
    existing_db_urls = set(db_get_all_job_urls())
    discovered_urls = set()
    new_rows = []

    # Ensure the regions/keywords master lists exist.
    try:
        seed_regions_keywords_from_listings()
    except Exception:
        pass

    # Fetch listings for each region/keyword pair stored in the DB.
    for region_row in get_all_regions():
        region_name = region_row.get("name")
        if not region_name:
            continue
        for keyword_row in get_all_keywords():
            keyword_name = keyword_row.get("name")
            if not keyword_name:
                continue
            for row in process_region_keyword(region_name, keyword_name, discovered_urls):
                timestamp, region, keyword, title, pay, location, url = row
                discovered_urls.add(url)
                if url not in existing_db_urls:
                    new_rows.append(row)
                # Insert or update the listing to reflect the current search result.
                upsert_listing(
                    url=url,
                    region=region,
                    keyword=keyword,
                    title=title,
                    pay=pay,
                    location=location,
                    timestamp=timestamp,
                )

    # Remove stale listings: those present in the DB but not discovered now.
    stale_urls = existing_db_urls - discovered_urls
    for url in stale_urls:
        try:
            jid = url_to_job_id(url)
            db_delete_job(jid)
            # Also remove the cached file and its metadata.
            delete_cached_page(url)
            db_remove_cached_url(url)
        except Exception:
            pass

    return {
        "discovered": len(discovered_urls),
        "new": len(new_rows),
        "stale": len(stale_urls),
    }


def process_job_url(job_url: str):
    """Fetch a single job page (from cache when fresh, otherwise over the network),
    refresh its cache entry, and upsert the scraped details.

    Returns the scraped job data, or None on failure.
    """
    try:
        job_id = url_to_job_id(job_url)
        content = None

        cached_page = db_get_cache_url(job_url)
        if cached_page:
            last_modified = cached_page.get("last_modified")
            if last_modified and not is_cache_stale(last_modified):
                content = get_cached_content(job_url)
            else:
                content = make_request_with_retry(job_url, 1)
        else:
            content = make_request_with_retry(job_url, 1)

        if content is None:
            remove_job(job_url)
            return None

        # Refresh the cache and the stored job details.
        cache_page(job_url, content)
        upsert_cached_page(
            file_path=get_cache_path(job_url),
            url_guess=job_url,
            last_modified=now_iso(),
            size_bytes=len(content),
            job_id=job_id,
        )

        job_data = scrape_job_page(content, job_url)
        if job_data:
            upsert_job_details(job_data)
            upsert_user_interaction(
                job_id, seen_at=datetime.now(timezone.utc).isoformat())
            return job_data
        return None
    except Exception:
        return None


def scraper():
    """Main function to run the scraper."""
    ensure_cache_dir()
    db_init()

    # First, fetch current listings from the search pages and make the DB reflect them.
    fetch_listings()

    # Sync any cached files we have on disk into the cached_pages table.
    db_sync_cached_pages(get_cache_dir())

    # Normalize any relative cached file paths in the DB to absolute paths.
    normalize_cached_page_paths()

    # Finally, fetch and refresh the individual job pages for current listings.
    for url in db_get_all_job_urls():
        process_job_url(url)


if __name__ == "__main__":
    scraper()