# jobs/web/craigslist.py
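"""Craigslist scraping job.

Pipeline (see scraper() below): fetch the current search listings into the DB,
sync and normalize the on-disk page cache, then fetch and scrape each job page
to refresh its stored details.
"""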
from datetime import datetime, timezone
from web.scraper import process_region_keyword, scrape_job_page
from web.db import (
    db_init,
    upsert_cached_page,
    upsert_listing,
    upsert_job_details,
    url_to_job_id,
    upsert_user_interaction,
    db_remove_cached_url,
    db_sync_cached_pages,
    db_get_all_job_urls,
    db_get_cache_url,
    db_delete_job,
    remove_job,
    normalize_cached_page_paths,
    get_all_regions,
    get_all_keywords,
    seed_regions_keywords_from_listings,
)
# Import utility functions
from web.utils import (
get_cache_dir,
make_request_with_retry,
now_iso,
get_cache_path,
cache_page,
is_cache_stale,
delete_cached_page,
get_cached_content,
ensure_cache_dir
)
def fetch_listings():
"""Fetch job listings from all regions and keywords."""
# We'll collect URLs discovered in this run and then remove any DB listings
# not present in this set (treat DB as reflecting current search results).
existing_db_urls = set(db_get_all_job_urls())
discovered_urls = set()
new_rows = []
# Ensure regions/keywords master lists exist
try:
seed_regions_keywords_from_listings()
except Exception:
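        # Seeding is best-effort; fall back to whatever is already in the DB.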
pass
# Fetch listings for each region/keyword from DB
for region in get_all_regions():
region_name = region.get("name")
if not region_name:
continue
for keyword in get_all_keywords():
keyword_name = keyword.get("name")
if not keyword_name:
continue
            for row in process_region_keyword(region_name, keyword_name, discovered_urls):
                # Unpack into row_* names so the outer region/keyword loop
                # variables are not shadowed.
                timestamp, row_region, row_keyword, title, pay, location, url = row
                discovered_urls.add(url)
                if url not in existing_db_urls:
                    new_rows.append(row)
                # Upsert the listing so the DB reflects the current search result
                upsert_listing(
                    url=url,
                    region=row_region,
                    keyword=row_keyword,
                    title=title,
                    pay=pay,
                    location=location,
                    timestamp=timestamp,
                )
# Remove stale listings: those present in DB but not discovered now.
stale_urls = existing_db_urls - discovered_urls
for url in stale_urls:
try:
jid = url_to_job_id(url)
db_delete_job(jid)
# Also try to remove cached file and its metadata
delete_cached_page(url)
db_remove_cached_url(url)
except Exception:
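            # Best-effort cleanup; skip any URL that cannot be removed.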
pass
return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}
def process_job_url(job_url: str):
    """Fetch one job page (from cache when fresh), scrape it, and persist the details."""
    try:
job_id = url_to_job_id(job_url)
content = None
cached_page = db_get_cache_url(job_url)
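        # Re-use the cached copy while it is still fresh; otherwise re-fetch the page.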
if cached_page:
last_modified = cached_page.get("last_modified")
if last_modified and not is_cache_stale(last_modified):
content = get_cached_content(job_url)
else:
content = make_request_with_retry(job_url, 1)
else:
content = make_request_with_retry(job_url, 1)
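        # The page could not be retrieved at all: drop the job from the DB.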
if content is None:
remove_job(job_url)
return None
# refresh cache and details
cache_page(job_url, content)
upsert_cached_page(
file_path=get_cache_path(job_url),
url_guess=job_url,
last_modified=now_iso(),
size_bytes=len(content),
job_id=job_id
)
job_data = scrape_job_page(content, job_url)
if job_data:
upsert_job_details(job_data)
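            # Record that this job has now been seen (UTC timestamp).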
upsert_user_interaction(
job_id, seen_at=datetime.now(timezone.utc).isoformat())
return job_data
return None
except Exception:
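        # Swallow per-job failures so one bad page does not abort the whole run.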
return None
def scraper():
"""Main function to run the scraper."""
ensure_cache_dir()
db_init()
# First, fetch current listings from search pages and make DB reflect them.
    fetch_listings()
# Sync any cached files we have on disk into the cached_pages table.
db_sync_cached_pages(get_cache_dir())
    # Normalize any relative cached file paths in the DB to absolute paths
    normalize_cached_page_paths()
# Finally, fetch and refresh individual job pages for current listings
for url in db_get_all_job_urls():
process_job_url(url)
if __name__ == "__main__":
scraper()
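
# Usage sketch (assumption: the directory containing the ``web`` package is the
# current working directory or is on PYTHONPATH, e.g. when run from a scheduler
# such as cron):
#
#   python -m web.craigslist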