extending logging
@@ -25,7 +25,7 @@ def fetch_listings():
     """Fetch job listings from all regions and keywords."""
     # We'll collect URLs discovered in this run and then remove any DB listings
     # not present in this set (treat DB as reflecting current search results).
-    existing_db_urls = set(db_get_all_job_urls())
+    existing_db_urls = set(row['url'] for row in db_get_all_job_urls())
     discovered_urls = set()
     new_rows = []
 
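Note: db_get_all_job_urls() now returns one dict per listing row rather than a bare URL string (see the web/db.py hunk at the bottom), so this call site extracts the url key before building the set. A minimal sketch of the two shapes, with illustrative values that are not part of the diff:

    # Before: the helper returned bare URL strings.
    #   existing_db_urls = set(["https://example.com/jobs/1", ...])
    # After: one dict per row; keep only the URL for set-membership tests.
    rows = [{"url": "https://example.com/jobs/1", "region": "us", "keyword": "python"}]
    existing_db_urls = set(row['url'] for row in rows)
    # -> {"https://example.com/jobs/1"}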
@@ -98,7 +98,7 @@ def fetch_listings():
     return {"discovered": len(discovered_urls), "new": len(new_rows)}
 
 
-def process_job_url(job_url: str):
+def process_job_url(job_url: str, region: str = "", keyword: str = ""):
     try:
         job_id = url_to_job_id(job_url)
         yield f"Fetching job page: {job_url}\n"
@@ -113,7 +113,7 @@ def process_job_url(job_url: str):
         job_data = scrape_job_page(content, job_url)
         if job_data:
             yield f"Upserting job details for {job_id}\n"
-            upsert_job_details(job_data)
+            upsert_job_details(job_data, region=region, keyword=keyword)
             upsert_user_interaction(
                 job_id, seen_at=datetime.now(timezone.utc).isoformat())
             yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
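process_job_url() stays a generator that yields log lines; region and keyword arrive as keyword arguments with empty-string defaults, so callers that lack that metadata keep working unchanged. A hypothetical caller, with URL and values invented for illustration:

    # Hypothetical caller; process_job_url yields log lines as it works.
    for line in process_job_url("https://example.com/jobs/1",
                                region="us", keyword="python"):
        print(line, end="")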
@@ -141,9 +141,10 @@ def scraper():
     job_urls = db_get_all_job_urls()
     yield f"Processing {len(job_urls)} job pages...\n"
 
-    for i, url in enumerate(job_urls, 1):
+    for i, row in enumerate(job_urls, 1):
         yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
-        for message in process_job_url(url):
+        for message in process_job_url(
+                job_url=row["url"], region=row["region"], keyword=row["keyword"]):
             yield message
 
     yield "\nScraping completed successfully!\n"
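The yield statements throughout suggest scraper() is streamed to a client as it runs. A minimal sketch of such a consumer, assuming Flask; the framework and route name are assumptions, not part of this commit:

    # Minimal sketch, assuming Flask; not part of this commit.
    from flask import Flask, Response

    app = Flask(__name__)

    @app.route("/scrape")
    def scrape():
        # Passing a generator to Response streams each yielded line.
        return Response(scraper(), mimetype="text/plain")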
web/db.py (17 changed lines)
@@ -272,7 +272,7 @@ def get_last_fetch_time(page_url: str) -> Optional[datetime]:
     return None
 
 
-def upsert_job_details(job_data: Dict[str, Any]):
+def upsert_job_details(job_data: Dict[str, Any], region: str = "", keyword: str = ""):
     """Upsert into job_descriptions table using scraped job details dict.
 
     Behavior additions:
@@ -356,7 +356,7 @@ SELECT l.job_id
      ,d.posted_time
      ,l.url
 FROM job_listings AS l
 INNER JOIN job_descriptions AS d
    ON l.job_id = d.job_id
    AND l.url = d.url
 ORDER BY d.posted_time DESC
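The SELECT in db_get_all_job_urls() below assumes job_listings already has region and keyword columns. If they do not exist yet, a one-off migration along these lines would be needed; this DDL is a hypothetical sketch, not part of this commit:

    # Hypothetical migration sketch; column names match the SELECT below.
    # Commit/transaction handling depends on what _ensure_session() does.
    with _ensure_session() as session:
        session.execute(text(
            "ALTER TABLE job_listings ADD COLUMN region TEXT DEFAULT ''"))
        session.execute(text(
            "ALTER TABLE job_listings ADD COLUMN keyword TEXT DEFAULT ''"))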
@@ -381,11 +381,16 @@ ORDER BY d.posted_time DESC
     return jobs
 
 
-def db_get_all_job_urls() -> List[str]:
-    """Return list of job URLs from job_listings."""
+def db_get_all_job_urls() -> List[dict]:
+    """Return list of job URLs from job_listings.
+
+    Returns:
+    - List of dicts with keys: url, region, keyword
+    """
     with _ensure_session() as session:
-        rows = session.execute(text("SELECT url FROM job_listings")).fetchall()
-        return [r[0] for r in rows]
+        rows = session.execute(
+            text("SELECT url, region, keyword FROM job_listings")).fetchall()
+        return [{"url": r[0], "region": r[1], "keyword": r[2]} for r in rows]
 
 
 def db_delete_job(job_id: str | int):
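With the new shape, callers unpack named keys instead of relying on tuple positions. Example of consuming the new return value (values are illustrative):

    # Each row is a dict with keys url, region, keyword.
    for row in db_get_all_job_urls():
        print(row["url"], row["region"], row["keyword"])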