import re
from datetime import datetime, UTC
from typing import List, Dict, Set
from urllib.parse import urlparse, parse_qs

from bs4 import BeautifulSoup

from web.utils import (
    get_base_url,
    safe_get_text,
    safe_get_attr,
    make_request_with_retry,
    get_negative_keywords,
)


def extract_contact_info(reply_url) -> Dict[str, str]:
    """Extract contact information from a reply URL.

    Parses mailto links, tel links, and contact-form URLs to extract:
    - email: Email address (from mailto links or URL parameters)
    - phone: Phone number (from tel links or URL parameters)
    - contact_name: Contact person name (if available in URL parameters)

    Returns a dict with email, phone, and contact_name keys (values may be "N/A").
    """
    contact_info = {
        "email": "N/A",
        "phone": "N/A",
        "contact_name": "N/A",
    }

    # Handle None or empty input
    if not reply_url or reply_url == "N/A":
        return contact_info

    reply_url = str(reply_url).strip()
    if not reply_url or reply_url == "N/A":
        return contact_info

    try:
        # mailto: links carry the address directly
        if reply_url.startswith("mailto:"):
            email_part = reply_url.replace("mailto:", "")
            # The address may be followed by ?subject=... parameters
            email = email_part.split("?")[0]
            contact_info["email"] = email
            return contact_info

        # tel: links carry the phone number directly
        if reply_url.startswith("tel:"):
            phone = reply_url.replace("tel:", "")
            contact_info["phone"] = phone
            return contact_info

        # Otherwise parse as a regular URL and look for contact details in the query string
        if reply_url.startswith("http"):
            parsed = urlparse(reply_url)
            params = parse_qs(parsed.query)

            # Try to extract an email address from the parameters
            for key in ["email", "from_email", "sender_email", "contact_email"]:
                if key in params:
                    contact_info["email"] = params[key][0]
                    break

            # Try to extract a phone number from the parameters
            for key in ["phone", "tel", "telephone"]:
                if key in params:
                    contact_info["phone"] = params[key][0]
                    break

            # Try to extract a contact name from the parameters
            for key in ["contact_name", "from_name", "name"]:
                if key in params:
                    contact_info["contact_name"] = params[key][0]
                    break
    except Exception:
        pass

    return contact_info


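# Illustrative sketch of the extraction above; the URLs are made-up examples:
#
#   extract_contact_info("mailto:hiring@example.com?subject=Job%20posting")
#   # -> {"email": "hiring@example.com", "phone": "N/A", "contact_name": "N/A"}
#
#   extract_contact_info("https://example.org/reply?phone=555-0100&from_name=Sam")
#   # -> {"email": "N/A", "phone": "555-0100", "contact_name": "Sam"}

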
def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
    """Parse a single job listing."""
    try:
        title_elem = listing.find("div", class_="title")
        url_elem = listing.find("a")
        pay_elem = listing.find("div", class_="attr remuneration")
        if pay_elem:
            pay_elem = pay_elem.find("span", class_="valu")
        location_elem = listing.find("div", class_="location")

        if not title_elem or not url_elem:
            return []

        title = title_elem.get_text(strip=True)
        url = url_elem["href"]
        pay = pay_elem.get_text(strip=True) if pay_elem else "N/A"
        location = location_elem.get_text(strip=True) if location_elem else "N/A"

        status = "DUPLICATE" if url in seen_urls else "NEW"
        if url in seen_urls:
            return []

        # job_summary variable retained for parity but not used
        job_summary = (
            f"{status} [{region}/{keyword}] | "
            f"Title: {title[:50]}{'...' if len(title) > 50 else ''} | "
            f"Location: {location} | URL: {url}"
        )
        _ = job_summary

        return [datetime.now(UTC).isoformat(), region, keyword, title, pay, location, url]
    except (AttributeError, KeyError):
        return []


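# Rough usage sketch with a minimal hand-written search-result fragment; the
# markup is illustrative and not a guaranteed match for the live site:
#
#   html = ('<li class="cl-static-search-result"><div class="title">Line Cook</div>'
#           '<a href="https://example.org/job/123"></a>'
#           '<div class="location">Downtown</div></li>')
#   listing = BeautifulSoup(html, "html.parser").find("li")
#   scrape_listings_page(listing, "sfbay", "cook", set())
#   # -> [<UTC timestamp>, "sfbay", "cook", "Line Cook", "N/A", "Downtown",
#   #     "https://example.org/job/123"]

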
def scrape_job_page(content: str, url: str) -> Dict:
    """Scrape job details from a job listing page."""
    soup = BeautifulSoup(content, "html.parser")

    # Extract the reply button, whose data-href is later mined for contact details
    reply_button = soup.find("button", class_="reply-button")
    if reply_button:
        reply_url = safe_get_attr(reply_button, "data-href")
    else:
        reply_url = "N/A"

    # Extract contact information from the reply URL
    contact_info = extract_contact_info(reply_url)

    # Extract each field
    title = safe_get_text(soup.find("h1", class_="postingtitle"))
    company = safe_get_text(soup.find("h2", class_="company-name"))

    map_elem = soup.find("div", id="map")
    if map_elem:
        lat = safe_get_attr(map_elem, "data-latitude")
        lon = safe_get_attr(map_elem, "data-longitude")
        accuracy = safe_get_attr(map_elem, "data-accuracy")
        location = f"Lat: {lat}, Lon: {lon}, Accuracy: {accuracy}"
    else:
        location = "N/A"

    mapaddress = soup.find("div", class_="mapaddress")
    if mapaddress:
        location = safe_get_text(mapaddress) + " " + location

    description_elem = soup.find("section", id="postingbody")
    if description_elem:
        de = BeautifulSoup(str(description_elem), "html.parser")
        qr_code_elem = de.find(class_="print-qrcode-label")
        # Remove the QR code label if it exists
        if qr_code_elem:
            qr_code_elem.decompose()
        description = de.text.strip()
    else:
        description = ""

    posting_info = soup.find("div", class_="postinginfos")
    if posting_info:
        pi = BeautifulSoup(str(posting_info), "html.parser")
        postinginfo_tags = pi.find_all("p", class_="postinginfo")
        job_id = safe_get_text(postinginfo_tags[0]) if postinginfo_tags else ""
        posted_time_elem = pi.find("time", class_="date timeago")
        posted_time = safe_get_attr(posted_time_elem, "datetime") if posted_time_elem else ""
    else:
        job_id = ""
        posted_time = ""

    # Negative keyword detection: whole-word, case-insensitive match across key fields
    negative_keyword_match = None
    negative_match_field = None
    negative_keywords = get_negative_keywords()
    if negative_keywords:
        fields_to_check = {
            "title": title or "",
            "company": company or "",
            "location": location or "",
            "description": description or "",
        }
        for keyword in negative_keywords:
            if not keyword:
                continue
            pattern = re.compile(r"\b" + re.escape(keyword) + r"\b", re.IGNORECASE)
            for field_name, field_value in fields_to_check.items():
                if field_value and pattern.search(field_value):
                    negative_keyword_match = keyword
                    negative_match_field = field_name
                    break
            if negative_keyword_match:
                break

    return {
        "url": url,
        "title": title,
        "company": company,
        "location": location,
        "description": description,
        "id": job_id,
        "posted_time": posted_time,
        "reply_url": reply_url,
        "contact_email": contact_info["email"],
        "contact_phone": contact_info["phone"],
        "contact_name": contact_info["contact_name"],
        "negative_keyword_match": negative_keyword_match,
        "negative_match_field": negative_match_field,
        "is_negative_match": bool(negative_keyword_match),
    }


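# Hedged usage sketch: fetch one posting with the module's own retry helper and
# parse it. The URL is a placeholder; make_request_with_retry is assumed to
# return the page HTML or None, as it is used in process_region_keyword below.
#
#   page = make_request_with_retry("https://example.org/job/123.html", 1)
#   if page is not None:
#       job = scrape_job_page(page, "https://example.org/job/123.html")
#       if not job["is_negative_match"]:
#           print(job["title"], job["contact_email"])

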
def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Parse HTML content to extract job listings."""
    soup = BeautifulSoup(content, "html.parser")
    listings = soup.find_all("li", class_="cl-static-search-result")
    new_rows = []

    for listing in listings:
        job_data = scrape_listings_page(listing, region, keyword, seen_urls)
        if job_data:
            new_rows.append(job_data)

    return new_rows


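# Minimal sketch of feeding fetched search-results HTML through the parser; the
# URL, region, and keyword are placeholders:
#
#   html = make_request_with_retry("https://example.org/search?query=cook", 1)
#   rows = scrape_job_data(html, "sfbay", "cook", set()) if html else []

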
def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Process a single region and keyword."""
    url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
    content = make_request_with_retry(url, 1)
    if content is None:
        return []
    return scrape_job_data(content, region, keyword, seen_urls)

