extending logging

2025-09-17 15:53:28 +02:00
parent 94730439a2
commit b6f9d39ad8
2 changed files with 18 additions and 11 deletions

View File

@@ -25,7 +25,7 @@ def fetch_listings():
     """Fetch job listings from all regions and keywords."""
     # We'll collect URLs discovered in this run and then remove any DB listings
     # not present in this set (treat DB as reflecting current search results).
-    existing_db_urls = set(db_get_all_job_urls())
+    existing_db_urls = set(row['url'] for row in db_get_all_job_urls())
     discovered_urls = set()
     new_rows = []
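
Since db_get_all_job_urls() now returns dicts rather than bare URL strings, fetch_listings() has to pull the url key out of each row before building its membership set. A minimal sketch of that consumption, with made-up rows standing in for the real database result:

    rows = [
        {"url": "https://example.com/jobs/1", "region": "berlin", "keyword": "python"},
        {"url": "https://example.com/jobs/2", "region": "munich", "keyword": "data engineer"},
    ]

    # Same expression as in fetch_listings(): keep only the URLs for membership tests.
    existing_db_urls = set(row['url'] for row in rows)
    assert existing_db_urls == {"https://example.com/jobs/1", "https://example.com/jobs/2"}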
@@ -98,7 +98,7 @@ def fetch_listings():
     return {"discovered": len(discovered_urls), "new": len(new_rows)}


-def process_job_url(job_url: str):
+def process_job_url(job_url: str, region: str = "", keyword: str = ""):
     try:
         job_id = url_to_job_id(job_url)
         yield f"Fetching job page: {job_url}\n"
@@ -113,7 +113,7 @@ def process_job_url(job_url: str):
         job_data = scrape_job_page(content, job_url)
         if job_data:
             yield f"Upserting job details for {job_id}\n"
-            upsert_job_details(job_data)
+            upsert_job_details(job_data, region=region, keyword=keyword)
             upsert_user_interaction(
                 job_id, seen_at=datetime.now(timezone.utc).isoformat())
             yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
@@ -141,9 +141,11 @@ def scraper():
     job_urls = db_get_all_job_urls()
     yield f"Processing {len(job_urls)} job pages...\n"
-    for i, url in enumerate(job_urls, 1):
+    i = 0
+    for row in job_urls:
+        i += 1
         yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
-        for message in process_job_url(url):
+        for message in process_job_url(job_url=row["url"], region=row["region"], keyword=row["keyword"]):
             yield message
     yield "\nScraping completed successfully!\n"

View File

@@ -272,7 +272,7 @@ def get_last_fetch_time(page_url: str) -> Optional[datetime]:
     return None


-def upsert_job_details(job_data: Dict[str, Any]):
+def upsert_job_details(job_data: Dict[str, Any], region: str = "", keyword: str = ""):
     """Upsert into job_descriptions table using scraped job details dict.

     Behavior additions:
@@ -356,7 +356,7 @@ SELECT l.job_id
     ,d.posted_time
     ,l.url
 FROM job_listings AS l
 INNER JOIN job_descriptions AS d
     ON l.job_id = d.job_id
     AND l.url = d.url
 ORDER BY d.posted_time DESC
@@ -381,11 +381,16 @@ ORDER BY d.posted_time DESC
     return jobs


-def db_get_all_job_urls() -> List[str]:
-    """Return list of job URLs from job_listings."""
+def db_get_all_job_urls() -> List[dict]:
+    """Return list of job URLs from job_listings.
+
+    Returns:
+    - List of dicts with keys: url, region, keyword
+    """
     with _ensure_session() as session:
-        rows = session.execute(text("SELECT url FROM job_listings")).fetchall()
-        return [r[0] for r in rows]
+        rows = session.execute(
+            text("SELECT url, region, keyword FROM job_listings")).fetchall()
+        return [{"url": r[0], "region": r[1], "keyword": r[2]} for r in rows]


 def db_delete_job(job_id: str | int):
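
For reference, a self-contained sketch of the shape db_get_all_job_urls() now returns, run against a throwaway in-memory SQLite table rather than the application's session and full job_listings schema (only the columns the query selects are created; the sample row is invented):

    from sqlalchemy import create_engine, text

    engine = create_engine("sqlite://")
    with engine.begin() as conn:
        # Minimal stand-in for the real job_listings table.
        conn.execute(text("CREATE TABLE job_listings (url TEXT, region TEXT, keyword TEXT)"))
        conn.execute(text(
            "INSERT INTO job_listings (url, region, keyword) "
            "VALUES ('https://example.com/jobs/1', 'berlin', 'python')"))

    with engine.connect() as session:
        rows = session.execute(
            text("SELECT url, region, keyword FROM job_listings")).fetchall()
        listings = [{"url": r[0], "region": r[1], "keyword": r[2]} for r in rows]

    print(listings)
    # [{'url': 'https://example.com/jobs/1', 'region': 'berlin', 'keyword': 'python'}]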