# jobs/web/scraper.py

from datetime import datetime, UTC
from bs4 import BeautifulSoup
from typing import List, Dict, Set
from urllib.parse import urlparse, parse_qs
import re
from web.utils import (
    get_base_url,
    safe_get_text,
    safe_get_attr,
    make_request_with_retry,
    get_negative_keywords,
)


def extract_contact_info(reply_url) -> Dict[str, str]:
    """Extract contact details from a reply URL.

    Three shapes of input are recognised:

    - ``mailto:`` links: the address before any ``?subject=...`` suffix
      becomes ``email``.
    - ``tel:`` links: the remainder of the link becomes ``phone``.
    - ``http``/``https`` URLs: the query string is searched for well-known
      parameter names carrying an email, a phone number or a contact name.

    Scheme matching is case-insensitive, percent-escapes in ``mailto:`` and
    ``tel:`` links are decoded, and blank values never overwrite the
    ``"N/A"`` placeholder.

    Args:
        reply_url: The reply target. Falsy values and ``"N/A"`` yield the
            defaults; non-string values are converted with ``str()``.

    Returns:
        A dict with ``email``, ``phone`` and ``contact_name`` keys. Fields
        that could not be determined hold ``"N/A"``.
    """
    # Local import: the module-level import list does not provide unquote.
    from urllib.parse import unquote

    contact_info = {
        "email": "N/A",
        "phone": "N/A",
        "contact_name": "N/A",
    }

    if not reply_url:
        return contact_info

    reply_url = str(reply_url).strip()
    if not reply_url or reply_url == "N/A":
        return contact_info

    # Only used to recognise the scheme; values are taken from reply_url so
    # their original casing is preserved.
    lowered = reply_url.lower()

    if lowered.startswith("mailto:"):
        # Slice off the prefix only (str.replace would also strip later
        # occurrences) and drop any "?subject=..." header section.
        address = unquote(reply_url[len("mailto:"):].split("?", 1)[0]).strip()
        if address:
            contact_info["email"] = address
        return contact_info

    if lowered.startswith("tel:"):
        number = unquote(reply_url[len("tel:"):]).strip()
        if number:
            contact_info["phone"] = number
        return contact_info

    if lowered.startswith("http"):
        try:
            params = parse_qs(urlparse(reply_url).query)
        except ValueError:
            # urlparse rejects malformed netlocs (e.g. an unbalanced "[");
            # extraction is best-effort, so fall back to the defaults.
            return contact_info

        # Result field -> query parameter names, in order of preference.
        query_keys = (
            ("email", ("email", "from_email", "sender_email", "contact_email")),
            ("phone", ("phone", "tel", "telephone")),
            ("contact_name", ("contact_name", "from_name", "name")),
        )
        for field, candidates in query_keys:
            for key in candidates:
                values = params.get(key)
                value = values[0].strip() if values else ""
                if value:
                    contact_info[field] = value
                    break

    return contact_info


def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
    """Parse a single job listing."""
    try:
        title_elem = listing.find("div", class_="title")
        url_elem = listing.find("a")
        pay_elem = listing.find("div", class_="attr remuneration")
        if pay_elem:
            pay_elem = pay_elem.find("span", class_="valu")
        location_elem = listing.find("div", class_="location")

        if not title_elem or not url_elem:
            return []

        title = title_elem.get_text(strip=True)
        url = url_elem["href"]
        pay = pay_elem.get_text(strip=True) if pay_elem else "N/A"
        location = location_elem.get_text(
            strip=True) if location_elem else "N/A"

        status = "DUPLICATE" if url in seen_urls else "NEW"
        if url in seen_urls:
            return []

        # job_summary variable retained for parity but not used
        job_summary = f"{status} [{region}/{keyword}] | Title: {title[:50]}{'...' if len(title) > 50 else ''} | Location: {location} | URL: {url}"
        _ = job_summary

        return [datetime.now(UTC).isoformat(), region, keyword, title, pay, location, url]
    except (AttributeError, KeyError):
        return []


def scrape_job_page(content: str, url: str) -> Dict:
    """Extract the details of one job posting from its HTML.

    Args:
        content: HTML of the posting page.
        url: Address of the posting; echoed back in the result unchanged.

    Returns:
        A dict with the keys ``url``, ``title``, ``company``, ``location``,
        ``description``, ``id``, ``posted_time``, ``reply_url``,
        ``contact_email``, ``contact_phone``, ``contact_name``,
        ``negative_keyword_match``, ``negative_match_field`` and
        ``is_negative_match``. The two ``negative_*`` values are ``None``
        unless one of the configured negative keywords occurs as a whole
        word (case-insensitively) in the title, company, location or
        description.
    """
    page = BeautifulSoup(content, "html.parser")

    # Reply target and whatever contact details can be read from it.
    button = page.find("button", class_="reply-button")
    reply_url = safe_get_attr(button, "data-href") if button else "N/A"
    contact = extract_contact_info(reply_url)

    title = safe_get_text(page.find("h1", class_="postingtitle"))
    company = safe_get_text(page.find("h2", class_="company-name"))

    # Location: map coordinates when present, prefixed by the street address.
    location = "N/A"
    map_div = page.find("div", id="map")
    if map_div:
        lat = safe_get_attr(map_div, "data-latitude")
        lon = safe_get_attr(map_div, "data-longitude")
        accuracy = safe_get_attr(map_div, "data-accuracy")
        location = f"Lat: {lat}, Lon: {lon}, Accuracy: {accuracy}"

    address_div = page.find("div", class_="mapaddress")
    if address_div:
        location = safe_get_text(address_div) + " " + location

    # Description: work on a re-parsed copy so the QR-code label can be cut
    # out before the text is read.
    description = ''
    body = page.find("section", id="postingbody")
    if body:
        body_copy = BeautifulSoup(str(body), "html.parser")
        qr_label = body_copy.find(class_="print-qrcode-label")
        if qr_label:
            qr_label.decompose()
        description = body_copy.text.strip()

    # Posting id and timestamp, both optional.
    job_id = ""
    posted_time = ""
    info_div = page.find("div", class_="postinginfos")
    if info_div:
        info = BeautifulSoup(str(info_div), "html.parser")
        info_tags = info.find_all("p", class_="postinginfo")
        if info_tags:
            job_id = safe_get_text(info_tags[0])
        time_tag = info.find("time", class_="date timeago")
        if time_tag:
            posted_time = safe_get_attr(time_tag, "datetime")

    # Negative keyword screening: the first keyword (in configured order)
    # that hits any field wins, and the first matching field is reported.
    matched_keyword = None
    matched_field = None
    blocklist = get_negative_keywords()
    if blocklist:
        searchable = {
            "title": title or "",
            "company": company or "",
            "location": location or "",
            "description": description or "",
        }
        for candidate in blocklist:
            if not candidate:
                continue
            word_re = re.compile(
                r"\b" + re.escape(candidate) + r"\b", re.IGNORECASE)
            hit = next(
                (name for name, text in searchable.items()
                 if text and word_re.search(text)),
                None,
            )
            if hit is not None:
                matched_keyword = candidate
                matched_field = hit
                break

    return {
        "url": url,
        "title": title,
        "company": company,
        "location": location,
        "description": description,
        "id": job_id,
        "posted_time": posted_time,
        "reply_url": reply_url,
        "contact_email": contact["email"],
        "contact_phone": contact["phone"],
        "contact_name": contact["contact_name"],
        "negative_keyword_match": matched_keyword,
        "negative_match_field": matched_field,
        "is_negative_match": bool(matched_keyword),
    }


def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Collect a row for every usable listing in a search-results page.

    Each ``li.cl-static-search-result`` element in ``content`` is handed to
    ``scrape_listings_page``; empty results are dropped and the remaining
    rows are returned in document order.
    """
    result_items = BeautifulSoup(content, "html.parser").find_all(
        "li", class_="cl-static-search-result")
    parsed = (
        scrape_listings_page(item, region, keyword, seen_urls)
        for item in result_items
    )
    return [row for row in parsed if row]


def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Fetch and parse the search results for one region/keyword pair.

    The search URL is built from the ``get_base_url()`` template, which is
    formatted with ``region`` and ``keyword`` fields. The keyword is
    form-encoded, so spaces become ``+`` and reserved characters such as
    ``+``, ``&`` or ``#`` are percent-escaped instead of corrupting the
    query.

    Args:
        region: Region identifier substituted into the URL template as-is.
        keyword: Human-readable search term (not pre-encoded).
        seen_urls: Already-known listing URLs, forwarded to the parser.

    Returns:
        The rows produced by ``scrape_job_data``, or an empty list when the
        request helper returns ``None``.
    """
    # Local import: the module-level import list does not provide quote_plus.
    from urllib.parse import quote_plus

    url = get_base_url().format(region=region, keyword=quote_plus(keyword))
    # Second argument is presumably the retry/attempt count - confirm
    # against web.utils.make_request_with_retry.
    content = make_request_with_retry(url, 3)
    if content is None:
        return []
    return scrape_job_data(content, region, keyword, seen_urls)