initial project commit
web/scraper.py (new file, 121 lines added)
@@ -0,0 +1,121 @@
from datetime import datetime, UTC
from bs4 import BeautifulSoup
from typing import List, Dict, Set
from web.utils import get_base_url, cache_page, safe_get_text, safe_get_attr, is_cached, get_cached_content, make_request_with_retry
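
# NOTE: judging from how they are used below, the helpers imported from
# web.utils are assumed to provide the search-URL template (get_base_url),
# on-disk page caching (is_cached / get_cached_content / cache_page), and an
# HTTP fetch with a retry limit (make_request_with_retry); see web/utils.py.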


def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
    """Parse a single job listing."""
    try:
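        # Locate the title, link, pay, and location nodes inside a single
        # search-result <li>; any of these may be absent.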
        title_elem = listing.find("div", class_="title")
        url_elem = listing.find("a")
        pay_elem = listing.find("div", class_="attr remuneration")
        if pay_elem:
            pay_elem = pay_elem.find("span", class_="valu")
        location_elem = listing.find("div", class_="location")

        if not title_elem or not url_elem:
            return []

        title = title_elem.get_text(strip=True)
        url = url_elem["href"]
        pay = pay_elem.get_text(strip=True) if pay_elem else "N/A"
        location = location_elem.get_text(
            strip=True) if location_elem else "N/A"
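
        # seen_urls holds URLs the caller has already collected, so duplicate
        # listings are dropped here rather than re-emitted.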
        status = "DUPLICATE" if url in seen_urls else "NEW"
        if url in seen_urls:
            return []

        # job_summary is retained for parity but not used; since duplicates
        # return early above, status is always "NEW" by the time it is built.
        job_summary = f"{status} [{region}/{keyword}] | Title: {title[:50]}{'...' if len(title) > 50 else ''} | Location: {location} | URL: {url}"
        _ = job_summary
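
        # Row layout: UTC timestamp, region, keyword, title, pay, location, url.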
        return [datetime.now(UTC).isoformat(), region, keyword, title, pay, location, url]
    except (AttributeError, KeyError):
        return []


def scrape_job_page(content: str, url: str) -> Dict:
    """Scrape job details from a job listing page."""
    soup = BeautifulSoup(content, "html.parser")

    # Extract each field
    title = safe_get_text(soup.find("h1", class_="postingtitle"))
    company = safe_get_text(soup.find("h2", class_="company-name"))
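
    # When present, the #map div carries data-latitude / data-longitude /
    # data-accuracy attributes; otherwise the location falls back to "N/A".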
    map_elem = soup.find("div", id="map")
    if map_elem:
        lat = safe_get_attr(map_elem, "data-latitude")
        lon = safe_get_attr(map_elem, "data-longitude")
        accuracy = safe_get_attr(map_elem, "data-accuracy")
        location = f"Lat: {lat}, Lon: {lon}, Accuracy: {accuracy}"
    else:
        location = "N/A"

    mapaddress = soup.find("div", class_="mapaddress")
    if mapaddress:
        location = safe_get_text(mapaddress) + " " + location
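
    # The posting body is re-parsed so the embedded print-qrcode-label node
    # can be removed before the visible text is extracted.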
    description_elem = soup.find("section", id="postingbody")
    if description_elem:
        de = BeautifulSoup(str(description_elem), "html.parser")
        qr_code_elem = de.find(class_="print-qrcode-label")
        # Remove QR code if it exists
        if qr_code_elem:
            qr_code_elem.decompose()
        description = de.text.strip()
    else:
        description = ''
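
    # The first .postinginfo paragraph is used as the job id; the posted time
    # comes from the datetime attribute of <time class="date timeago">.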
    posting_info = soup.find("div", class_="postinginfos")
    if posting_info:
        pi = BeautifulSoup(str(posting_info), "html.parser")
        postinginfo_tags = pi.find_all("p", class_="postinginfo")
        job_id = safe_get_text(postinginfo_tags[0]) if postinginfo_tags else ""
        posted_time_elem = pi.find("time", class_="date timeago")
        posted_time = safe_get_attr(
            posted_time_elem, "datetime") if posted_time_elem else ""
    else:
        job_id = ""
        posted_time = ""

    return {
        "url": url,
        "title": title,
        "company": company,
        "location": location,
        "description": description,
        "id": job_id,
        "posted_time": posted_time
    }


def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Parse HTML content to extract job listings."""
    soup = BeautifulSoup(content, "html.parser")
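    # Each <li class="cl-static-search-result"> on the results page is one
    # job listing; the class name suggests the static (no-JS) search view.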
    listings = soup.find_all("li", class_="cl-static-search-result")
    new_rows = []

    for listing in listings:
        job_data = scrape_listings_page(listing, region, keyword, seen_urls)
        if job_data:
            new_rows.append(job_data)

    return new_rows


def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Process a single region and keyword."""
    url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
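    # Cache-first: reuse a previously saved copy of the search page when one
    # exists; otherwise fetch it via make_request_with_retry (limit of 3
    # attempts) and cache the result for later runs.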
    if is_cached(url):
        content = get_cached_content(url)
        cache_status = "CACHED"
    else:
        content = make_request_with_retry(url, 3)
        if content is None:
            return []
        cache_page(url, content)
        cache_status = "FETCHED"
    _ = cache_status  # no-op to silence unused var
    return scrape_job_data(content, region, keyword, seen_urls)