from datetime import datetime, UTC
from typing import Dict, List, Set

from bs4 import BeautifulSoup

from web.utils import (
    get_base_url,
    make_request_with_retry,
    safe_get_attr,
    safe_get_text,
)


def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
    """Parse a single job listing into a row of
    [timestamp, region, keyword, title, pay, location, url]."""
    try:
        title_elem = listing.find("div", class_="title")
        url_elem = listing.find("a")
        pay_elem = listing.find("div", class_="attr remuneration")
        if pay_elem:
            pay_elem = pay_elem.find("span", class_="valu")
        location_elem = listing.find("div", class_="location")

        # A listing without a title or a link cannot be recorded.
        if not title_elem or not url_elem:
            return []

        title = title_elem.get_text(strip=True)
        url = url_elem["href"]
        pay = pay_elem.get_text(strip=True) if pay_elem else "N/A"
        location = location_elem.get_text(strip=True) if location_elem else "N/A"

        # Previously-seen URLs are skipped, so job_summary is only ever built
        # for NEW listings; it is retained for parity but not used.
        status = "DUPLICATE" if url in seen_urls else "NEW"
        if url in seen_urls:
            return []

        job_summary = (
            f"{status} [{region}/{keyword}] | "
            f"Title: {title[:50]}{'...' if len(title) > 50 else ''} | "
            f"Location: {location} | URL: {url}"
        )
        _ = job_summary

        return [datetime.now(UTC).isoformat(), region, keyword, title, pay, location, url]
    except (AttributeError, KeyError):
        return []


def scrape_job_page(content: str, url: str) -> Dict:
    """Scrape job details from a job listing page."""
    soup = BeautifulSoup(content, "html.parser")

    # Extract each field.
    title = safe_get_text(soup.find("h1", class_="postingtitle"))
    company = safe_get_text(soup.find("h2", class_="company-name"))

    map_elem = soup.find("div", id="map")
    if map_elem:
        lat = safe_get_attr(map_elem, "data-latitude")
        lon = safe_get_attr(map_elem, "data-longitude")
        accuracy = safe_get_attr(map_elem, "data-accuracy")
        location = f"Lat: {lat}, Lon: {lon}, Accuracy: {accuracy}"
    else:
        location = "N/A"

    mapaddress = soup.find("div", class_="mapaddress")
    if mapaddress:
        location = safe_get_text(mapaddress) + " " + location

    description_elem = soup.find("section", id="postingbody")
    if description_elem:
        # Re-parse a copy so removing the QR-code label does not mutate `soup`.
        de = BeautifulSoup(str(description_elem), "html.parser")
        qr_code_elem = de.find(class_="print-qrcode-label")
        if qr_code_elem:
            qr_code_elem.decompose()
        description = de.text.strip()
    else:
        description = ""

    posting_info = soup.find("div", class_="postinginfos")
    if posting_info:
        pi = BeautifulSoup(str(posting_info), "html.parser")
        postinginfo_tags = pi.find_all("p", class_="postinginfo")
        job_id = safe_get_text(postinginfo_tags[0]) if postinginfo_tags else ""
        posted_time_elem = pi.find("time", class_="date timeago")
        posted_time = safe_get_attr(posted_time_elem, "datetime") if posted_time_elem else ""
    else:
        job_id = ""
        posted_time = ""

    return {
        "url": url,
        "title": title,
        "company": company,
        "location": location,
        "description": description,
        "id": job_id,
        "posted_time": posted_time,
    }


def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Parse HTML content to extract job listings."""
    soup = BeautifulSoup(content, "html.parser")
    listings = soup.find_all("li", class_="cl-static-search-result")

    new_rows = []
    for listing in listings:
        job_data = scrape_listings_page(listing, region, keyword, seen_urls)
        if job_data:
            new_rows.append(job_data)
    return new_rows


def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Fetch and parse the search results for a single region/keyword pair."""
    url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
    content = make_request_with_retry(url, 3)
    if content is None:
        return []
    return scrape_job_data(content, region, keyword, seen_urls)
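

# A minimal usage sketch, not part of the original module. It assumes the
# web.utils helpers imported above behave as this module's calls imply:
# get_base_url() returns a URL template with {region} and {keyword}
# placeholders, and make_request_with_retry() returns page HTML or None.
# The region and keyword values here are purely illustrative.
if __name__ == "__main__":
    seen_urls: Set[str] = set()
    rows = process_region_keyword("newyork", "data analyst", seen_urls)
    for row in rows:
        print(row)
        seen_urls.add(row[-1])  # the URL is the last column of each row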