initial project commit
web/scraper.py (new file, 121 lines added)
@@ -0,0 +1,121 @@
from datetime import datetime, UTC
from bs4 import BeautifulSoup
from typing import List, Dict, Set
from web.utils import get_base_url, cache_page, safe_get_text, safe_get_attr, is_cached, get_cached_content, make_request_with_retry
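
# NOTE: judging from how they are used below, the helpers imported from
# web.utils are assumed to provide the search-URL template (get_base_url),
# on-disk page caching (is_cached / get_cached_content / cache_page), and an
# HTTP fetch with a retry limit (make_request_with_retry); see web/utils.py.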


def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
    """Parse a single job listing."""
    try:
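        # Locate the title, link, pay, and location nodes inside a single
        # search-result <li>; any of these may be absent.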
        title_elem = listing.find("div", class_="title")
        url_elem = listing.find("a")
        pay_elem = listing.find("div", class_="attr remuneration")
        if pay_elem:
            pay_elem = pay_elem.find("span", class_="valu")
        location_elem = listing.find("div", class_="location")

        if not title_elem or not url_elem:
            return []

        title = title_elem.get_text(strip=True)
        url = url_elem["href"]
        pay = pay_elem.get_text(strip=True) if pay_elem else "N/A"
        location = location_elem.get_text(
            strip=True) if location_elem else "N/A"
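
        # seen_urls holds URLs the caller has already collected, so duplicate
        # listings are dropped here rather than re-emitted.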
        status = "DUPLICATE" if url in seen_urls else "NEW"
        if url in seen_urls:
            return []

        # job_summary is retained for parity but not used; since duplicates
        # return early above, status is always "NEW" by the time it is built.
        job_summary = f"{status} [{region}/{keyword}] | Title: {title[:50]}{'...' if len(title) > 50 else ''} | Location: {location} | URL: {url}"
        _ = job_summary
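
        # Row layout: UTC timestamp, region, keyword, title, pay, location, url.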
        return [datetime.now(UTC).isoformat(), region, keyword, title, pay, location, url]
    except (AttributeError, KeyError):
        return []


def scrape_job_page(content: str, url: str) -> Dict:
    """Scrape job details from a job listing page."""
    soup = BeautifulSoup(content, "html.parser")

    # Extract each field
    title = safe_get_text(soup.find("h1", class_="postingtitle"))
    company = safe_get_text(soup.find("h2", class_="company-name"))
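
    # When present, the #map div carries data-latitude / data-longitude /
    # data-accuracy attributes; otherwise the location falls back to "N/A".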
    map_elem = soup.find("div", id="map")
    if map_elem:
        lat = safe_get_attr(map_elem, "data-latitude")
        lon = safe_get_attr(map_elem, "data-longitude")
        accuracy = safe_get_attr(map_elem, "data-accuracy")
        location = f"Lat: {lat}, Lon: {lon}, Accuracy: {accuracy}"
    else:
        location = "N/A"

    mapaddress = soup.find("div", class_="mapaddress")
    if mapaddress:
        location = safe_get_text(mapaddress) + " " + location
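
    # The posting body is re-parsed so the embedded print-qrcode-label node
    # can be removed before the visible text is extracted.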
    description_elem = soup.find("section", id="postingbody")
    if description_elem:
        de = BeautifulSoup(str(description_elem), "html.parser")
        qr_code_elem = de.find(class_="print-qrcode-label")
        # Remove QR code if it exists
        if qr_code_elem:
            qr_code_elem.decompose()
        description = de.text.strip()
    else:
        description = ''
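
    # The first .postinginfo paragraph is used as the job id; the posted time
    # comes from the datetime attribute of <time class="date timeago">.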
    posting_info = soup.find("div", class_="postinginfos")
    if posting_info:
        pi = BeautifulSoup(str(posting_info), "html.parser")
        postinginfo_tags = pi.find_all("p", class_="postinginfo")
        job_id = safe_get_text(postinginfo_tags[0]) if postinginfo_tags else ""
        posted_time_elem = pi.find("time", class_="date timeago")
        posted_time = safe_get_attr(
            posted_time_elem, "datetime") if posted_time_elem else ""
    else:
        job_id = ""
        posted_time = ""

    return {
        "url": url,
        "title": title,
        "company": company,
        "location": location,
        "description": description,
        "id": job_id,
        "posted_time": posted_time
    }


def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Parse HTML content to extract job listings."""
    soup = BeautifulSoup(content, "html.parser")
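    # Each <li class="cl-static-search-result"> on the results page is one
    # job listing; the class name suggests the static (no-JS) search view.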
    listings = soup.find_all("li", class_="cl-static-search-result")
    new_rows = []

    for listing in listings:
        job_data = scrape_listings_page(listing, region, keyword, seen_urls)
        if job_data:
            new_rows.append(job_data)

    return new_rows


def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Process a single region and keyword."""
    url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
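    # Cache-first: reuse a previously saved copy of the search page when one
    # exists; otherwise fetch it via make_request_with_retry (limit of 3
    # attempts) and cache the result for later runs.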
    if is_cached(url):
        content = get_cached_content(url)
        cache_status = "CACHED"
    else:
        content = make_request_with_retry(url, 3)
        if content is None:
            return []
        cache_page(url, content)
        cache_status = "FETCHED"
    _ = cache_status  # no-op to silence unused var
    return scrape_job_data(content, region, keyword, seen_urls)