from datetime import datetime, UTC
from bs4 import BeautifulSoup
from typing import List, Dict, Set
from urllib.parse import urlparse, parse_qs
import re

from web.utils import (
    get_base_url,
    safe_get_text,
    safe_get_attr,
    make_request_with_retry,
    get_negative_keywords,
)


def extract_contact_info(reply_url) -> Dict[str, str]:
    """Extract contact information from reply URL.

    Parses mailto links, phone links, and contact form URLs to extract:
    - email: Email address (from mailto links)
    - phone: Phone number (from tel links or URL parameters)
    - contact_name: Contact person name (if available in URL parameters)

    Returns a dict with email, phone, and contact_name keys (values may be "N/A").
    """
    contact_info = {
        "email": "N/A",
        "phone": "N/A",
        "contact_name": "N/A"
    }

    # Handle None or empty cases
    if not reply_url or reply_url == "N/A":
        return contact_info

    reply_url = str(reply_url).strip()
    if not reply_url or reply_url == "N/A":
        return contact_info

    try:
        # Check for mailto links
        if reply_url.startswith("mailto:"):
            email_part = reply_url.replace("mailto:", "")
            # Extract email (may contain ?subject=...)
            email = email_part.split("?")[0]
            contact_info["email"] = email
            return contact_info

        # Check for tel links
        if reply_url.startswith("tel:"):
            phone = reply_url.replace("tel:", "")
            contact_info["phone"] = phone
            return contact_info

        # Parse as URL
        if reply_url.startswith("http"):
            parsed = urlparse(reply_url)
            params = parse_qs(parsed.query)

            # Try to extract email from parameters
            for key in ["email", "from_email", "sender_email", "contact_email"]:
                if key in params:
                    contact_info["email"] = params[key][0]
                    break

            # Try to extract phone from parameters
            for key in ["phone", "tel", "telephone"]:
                if key in params:
                    contact_info["phone"] = params[key][0]
                    break

            # Try to extract contact name from parameters
            for key in ["contact_name", "from_name", "name"]:
                if key in params:
                    contact_info["contact_name"] = params[key][0]
                    break
    except Exception:
        pass

    return contact_info


def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
    """Parse a single job listing."""
    try:
        title_elem = listing.find("div", class_="title")
        url_elem = listing.find("a")
        pay_elem = listing.find("div", class_="attr remuneration")
        if pay_elem:
            pay_elem = pay_elem.find("span", class_="valu")
        location_elem = listing.find("div", class_="location")

        if not title_elem or not url_elem:
            return []

        title = title_elem.get_text(strip=True)
        url = url_elem["href"]
        pay = pay_elem.get_text(strip=True) if pay_elem else "N/A"
        location = location_elem.get_text(strip=True) if location_elem else "N/A"

        status = "DUPLICATE" if url in seen_urls else "NEW"
        if url in seen_urls:
            return []

        # job_summary variable retained for parity but not used
        job_summary = (
            f"{status} [{region}/{keyword}] | Title: {title[:50]}"
            f"{'...' if len(title) > 50 else ''} | "
            f"Location: {location} | URL: {url}"
        )
        _ = job_summary

        return [datetime.now(UTC).isoformat(), region, keyword, title, pay, location, url]
    except (AttributeError, KeyError):
        return []


def scrape_job_page(content: str, url: str) -> Dict:
    """Scrape job details from a job listing page."""
    soup = BeautifulSoup(content, "html.parser")

    # Extract reply button
    reply_button = soup.find("button", class_="reply-button")
    if reply_button:
        reply_url = safe_get_attr(reply_button, "data-href")
    else:
        reply_url = "N/A"

    # Extract contact information from reply URL
    contact_info = extract_contact_info(reply_url)

    # Extract each field
    title = safe_get_text(soup.find("h1", class_="postingtitle"))
    company = safe_get_text(soup.find("h2", class_="company-name"))

    map_elem = soup.find("div", id="map")
    if map_elem:
        lat = safe_get_attr(map_elem, "data-latitude")
        lon = safe_get_attr(map_elem, "data-longitude")
        accuracy = safe_get_attr(map_elem, "data-accuracy")
        location = f"Lat: {lat}, Lon: {lon}, Accuracy: {accuracy}"
    else:
        location = "N/A"

    mapaddress = soup.find("div", class_="mapaddress")
    if mapaddress:
        location = safe_get_text(mapaddress) + " " + location

    description_elem = soup.find("section", id="postingbody")
    if description_elem:
        de = BeautifulSoup(str(description_elem), "html.parser")
        qr_code_elem = de.find(class_="print-qrcode-label")
        # Remove QR code if it exists
        if qr_code_elem:
            qr_code_elem.decompose()
        description = de.text.strip()
    else:
        description = ''

    posting_info = soup.find("div", class_="postinginfos")
    if posting_info:
        pi = BeautifulSoup(str(posting_info), "html.parser")
        postinginfo_tags = pi.find_all("p", class_="postinginfo")
        job_id = safe_get_text(postinginfo_tags[0]) if postinginfo_tags else ""
        posted_time_elem = pi.find("time", class_="date timeago")
        posted_time = safe_get_attr(
            posted_time_elem, "datetime") if posted_time_elem else ""
    else:
        job_id = ""
        posted_time = ""

    # Negative keyword detection
    negative_keyword_match = None
    negative_match_field = None
    negative_keywords = get_negative_keywords()
    if negative_keywords:
        fields_to_check = {
            "title": title or "",
            "company": company or "",
            "location": location or "",
            "description": description or "",
        }
        for keyword in negative_keywords:
            if not keyword:
                continue
            pattern = re.compile(
                r"\b" + re.escape(keyword) + r"\b", re.IGNORECASE)
            for field_name, field_value in fields_to_check.items():
                if field_value and pattern.search(field_value):
                    negative_keyword_match = keyword
                    negative_match_field = field_name
                    break
            if negative_keyword_match:
                break

    return {
        "url": url,
        "title": title,
        "company": company,
        "location": location,
        "description": description,
        "id": job_id,
        "posted_time": posted_time,
        "reply_url": reply_url,
        "contact_email": contact_info["email"],
        "contact_phone": contact_info["phone"],
        "contact_name": contact_info["contact_name"],
        "negative_keyword_match": negative_keyword_match,
        "negative_match_field": negative_match_field,
        "is_negative_match": bool(negative_keyword_match),
    }


def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Parse HTML content to extract job listings."""
    soup = BeautifulSoup(content, "html.parser")
    listings = soup.find_all("li", class_="cl-static-search-result")

    new_rows = []
    for i, listing in enumerate(listings):
        job_data = scrape_listings_page(listing, region, keyword, seen_urls)
        if job_data:
            new_rows.append(job_data)
    return new_rows


def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Process a single region and keyword."""
    url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
    content = make_request_with_retry(url, 1)
    if content is None:
        return []
    return scrape_job_data(content, region, keyword, seen_urls)