remove caching
@@ -2,31 +2,19 @@ from datetime import datetime, timezone
 from web.scraper import process_region_keyword, scrape_job_page
 from web.db import (
     db_init,
-    upsert_cached_page,
     upsert_listing,
     upsert_job_details,
     url_to_job_id,
     upsert_user_interaction,
-    db_remove_cached_url,
-    db_sync_cached_pages,
     db_get_all_job_urls,
-    db_get_cache_url,
     db_delete_job,
     remove_job,
-    normalize_cached_page_paths,
 )

 # Import utility functions
 from web.utils import (
-    get_cache_dir,
     make_request_with_retry,
     now_iso,
-    get_cache_path,
-    cache_page,
-    is_cache_stale,
-    delete_cached_page,
-    get_cached_content,
-    ensure_cache_dir
 )
 from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings

@@ -45,15 +33,26 @@ def fetch_listings():
     except Exception:
         pass

+    yield "Initializing database and seeding regions/keywords...\n"
+
     # Fetch listings for each region/keyword from DB
-    for region in get_all_regions():
+    regions = get_all_regions()
+    keywords = get_all_keywords()
+    total_combinations = len(regions) * len(keywords)
+    processed = 0
+
+    yield f"Found {len(regions)} regions and {len(keywords)} keywords. Processing {total_combinations} combinations...\n"
+
+    for region in regions:
         region_name = region.get("name")
         if not region_name:
             continue
-        for keyword in get_all_keywords():
+        for keyword in keywords:
             keyword_name = keyword.get("name")
             if not keyword_name:
                 continue
+            processed += 1
+            yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
             for row in process_region_keyword(region_name, keyword_name, discovered_urls):
                 timestamp, region, keyword, title, pay, location, url = row
                 discovered_urls.add(url)
@@ -72,75 +71,68 @@ def fetch_listings():

     # Remove stale listings: those present in DB but not discovered now.
     stale_urls = existing_db_urls - discovered_urls
-    for url in stale_urls:
-        try:
-            jid = url_to_job_id(url)
-            db_delete_job(jid)
-            # Also try to remove cached file and its metadata
-            delete_cached_page(url)
-            db_remove_cached_url(url)
-        except Exception:
-            pass
+    if stale_urls:
+        yield f"Removing {len(stale_urls)} stale listings...\n"
+        for url in stale_urls:
+            try:
+                jid = url_to_job_id(url)
+                db_delete_job(jid)
+            except Exception:
+                pass

     yield f"Listing fetch complete: {len(discovered_urls)} discovered, {len(new_rows)} new, {len(stale_urls)} stale\n"
     return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}


 def process_job_url(job_url: str):
     try:
         job_id = url_to_job_id(job_url)
-        content = None
-        cached_page = db_get_cache_url(job_url)
-        if cached_page:
-            last_modified = cached_page.get("last_modified")
-            if last_modified and not is_cache_stale(last_modified):
-                content = get_cached_content(job_url)
-            else:
-                content = make_request_with_retry(job_url, 1)
-        else:
-            content = make_request_with_retry(job_url, 1)
+        yield f"Fetching job page: {job_url}\n"
+        content = make_request_with_retry(job_url, 1)

         if content is None:
             yield f"Failed to fetch content for {job_url}, removing from database\n"
             remove_job(job_url)
             return None

-        # refresh cache and details
-        cache_page(job_url, content)
-        upsert_cached_page(
-            file_path=get_cache_path(job_url),
-            url_guess=job_url,
-            last_modified=now_iso(),
-            size_bytes=len(content),
-            job_id=job_id
-        )
         yield f"Scraping job data from {job_url}\n"
         job_data = scrape_job_page(content, job_url)
         if job_data:
             yield f"Upserting job details for {job_id}\n"
             upsert_job_details(job_data)
             upsert_user_interaction(
                 job_id, seen_at=datetime.now(timezone.utc).isoformat())
             yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
             return job_data
         else:
             yield f"Failed to scrape job data from {job_url}\n"
             return None
-    except Exception:
+    except Exception as e:
+        yield f"Error processing {job_url}: {str(e)}\n"
         return None


 def scraper():
     """Main function to run the scraper."""
-    ensure_cache_dir()
     yield "Starting scraper...\n"
     db_init()
     yield "Database initialized\n"

-    # First, fetch current listings from search pages and make DB reflect them.
-    jl = fetch_listings()
-
-    # Sync any cached files we have on disk into the cached_pages table.
-    db_sync_cached_pages(get_cache_dir())
-
-    # Normalize any relative cached file paths to absolute paths in DB
-    normalized = normalize_cached_page_paths()
-    if normalized:
-        pass
+    yield "Fetching listings...\n"
+    for message in fetch_listings():
+        yield message

     # Finally, fetch and refresh individual job pages for current listings
-    for url in db_get_all_job_urls():
-        process_job_url(url)
+    job_urls = db_get_all_job_urls()
+    yield f"Processing {len(job_urls)} job pages...\n"
+
+    for i, url in enumerate(job_urls, 1):
+        yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
+        for message in process_job_url(url):
+            yield message

     yield "\nScraping completed successfully!\n"


 if __name__ == "__main__":

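Note on driving the refactored code: with this change, scraper() (like fetch_listings() and process_job_url()) is a generator that only makes progress while its yielded status messages are consumed. A minimal driver sketch follows; the web.scrape module path is an assumption for illustration, not something shown in this diff.

    # Hypothetical driver for the generator-based scraper().
    # Assumption: the file above is importable as web.scrape; adjust the import
    # to the real module path in this repository.
    from web.scrape import scraper

    def main() -> None:
        # Stream progress messages to stdout as the scraper produces them.
        for message in scraper():
            print(message, end="", flush=True)

    if __name__ == "__main__":
        main()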