# jobs/web/craigslist.py
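"""Craigslist scraping job.

Pipeline (see scraper() below): fetch the current search listings into the DB,
sync and normalize the on-disk page cache, then fetch and scrape each job page
to refresh its stored details.
"""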
from datetime import datetime, timezone
from web.scraper import process_region_keyword, scrape_job_page
from web.db import (
    db_init,
    upsert_cached_page,
    upsert_listing,
    upsert_job_details,
    url_to_job_id,
    upsert_user_interaction,
    db_remove_cached_url,
    db_sync_cached_pages,
    db_get_all_job_urls,
    db_get_cache_url,
    db_delete_job,
    remove_job,
    normalize_cached_page_paths,
    get_all_regions,
    get_all_keywords,
    seed_regions_keywords_from_listings,
)
# Import utility functions
from web.utils import (
get_cache_dir,
make_request_with_retry,
now_iso,
get_cache_path,
cache_page,
is_cache_stale,
delete_cached_page,
get_cached_content,
ensure_cache_dir
)
def fetch_listings():
"""Fetch job listings from all regions and keywords."""
# We'll collect URLs discovered in this run and then remove any DB listings
# not present in this set (treat DB as reflecting current search results).
existing_db_urls = set(db_get_all_job_urls())
discovered_urls = set()
new_rows = []
# Ensure regions/keywords master lists exist
try:
seed_regions_keywords_from_listings()
except Exception:
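        # Seeding is best-effort; fall back to whatever is already in the DB.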
pass
# Fetch listings for each region/keyword from DB
for region in get_all_regions():
region_name = region.get("name")
if not region_name:
continue
for keyword in get_all_keywords():
keyword_name = keyword.get("name")
if not keyword_name:
continue
            for row in process_region_keyword(region_name, keyword_name, discovered_urls):
                # Unpack into row_* names so the outer region/keyword loop
                # variables are not shadowed.
                timestamp, row_region, row_keyword, title, pay, location, url = row
                discovered_urls.add(url)
                if url not in existing_db_urls:
                    new_rows.append(row)
                # Upsert the listing so the DB reflects the current search result
                upsert_listing(
                    url=url,
                    region=row_region,
                    keyword=row_keyword,
                    title=title,
                    pay=pay,
                    location=location,
                    timestamp=timestamp,
                )
# Remove stale listings: those present in DB but not discovered now.
stale_urls = existing_db_urls - discovered_urls
for url in stale_urls:
try:
jid = url_to_job_id(url)
db_delete_job(jid)
# Also try to remove cached file and its metadata
delete_cached_page(url)
db_remove_cached_url(url)
except Exception:
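            # Best-effort cleanup; skip any URL that cannot be removed.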
pass
return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}
def process_job_url(job_url: str):
    """Fetch one job page (from cache when fresh), scrape it, and persist the details."""
    try:
job_id = url_to_job_id(job_url)
content = None
cached_page = db_get_cache_url(job_url)
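        # Re-use the cached copy while it is still fresh; otherwise re-fetch the page.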
if cached_page:
last_modified = cached_page.get("last_modified")
if last_modified and not is_cache_stale(last_modified):
content = get_cached_content(job_url)
else:
content = make_request_with_retry(job_url, 1)
else:
content = make_request_with_retry(job_url, 1)
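        # The page could not be retrieved at all: drop the job from the DB.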
if content is None:
remove_job(job_url)
return None
# refresh cache and details
cache_page(job_url, content)
upsert_cached_page(
file_path=get_cache_path(job_url),
url_guess=job_url,
last_modified=now_iso(),
size_bytes=len(content),
job_id=job_id
)
job_data = scrape_job_page(content, job_url)
if job_data:
upsert_job_details(job_data)
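            # Record that this job has now been seen (UTC timestamp).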
upsert_user_interaction(
job_id, seen_at=datetime.now(timezone.utc).isoformat())
return job_data
return None
except Exception:
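        # Swallow per-job failures so one bad page does not abort the whole run.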
return None
def scraper():
"""Main function to run the scraper."""
ensure_cache_dir()
db_init()
# First, fetch current listings from search pages and make DB reflect them.
    fetch_listings()
# Sync any cached files we have on disk into the cached_pages table.
db_sync_cached_pages(get_cache_dir())
    # Normalize any relative cached file paths in the DB to absolute paths
    normalize_cached_page_paths()
# Finally, fetch and refresh individual job pages for current listings
for url in db_get_all_job_urls():
process_job_url(url)
if __name__ == "__main__":
scraper()
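
# Usage sketch (assumption: the directory containing the ``web`` package is the
# current working directory or is on PYTHONPATH, e.g. when run from a scheduler
# such as cron):
#
#   python -m web.craigslist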