feat: Implement email sending utilities and templates for job notifications
Some checks failed
CI/CD Pipeline / test (push) Failing after 4m9s
- Added email_service.py for sending emails with SMTP configuration.
- Introduced email_templates.py to render job alert email subjects and bodies.
- Enhanced scraper.py to extract contact information from job listings.
- Updated settings.js to handle negative keyword input validation.
- Created email.html and email_templates.html for managing email subscriptions and templates in the admin interface.
- Modified base.html to include links for email alerts and templates.
- Expanded the user settings.html page to allow management of negative keywords.
- Updated utils.py to include functions for retrieving negative keywords and email settings.
- Enhanced job filtering logic to exclude jobs containing negative keywords (a sketch of the intended behavior follows this list).
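
As a rough sketch of the filtering behavior described above (the `filter_jobs` wrapper and its `jobs` input are hypothetical wiring; `get_negative_keywords` from web/utils.py and the `is_negative_match` flag set by `scrape_job_page` are what this commit actually adds):

import re

from web.utils import get_negative_keywords

# Matching mirrors the scraper: whole-word and case-insensitive, so a
# negative keyword "senior" flags "Senior Developer" but not "seniority".
def matches_negative_keyword(text: str) -> bool:
    for keyword in get_negative_keywords():
        if keyword and re.search(r"\b" + re.escape(keyword) + r"\b", text, re.IGNORECASE):
            return True
    return False

# Hypothetical downstream filter over the dicts returned by
# scrape_job_page, which now carry an is_negative_match flag.
def filter_jobs(jobs: list) -> list:
    return [job for job in jobs if not job["is_negative_match"]]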
web/scraper.py (120 lines changed)
@@ -1,7 +1,82 @@
 from datetime import datetime, UTC
 from bs4 import BeautifulSoup
 from typing import List, Dict, Set
-from web.utils import get_base_url, safe_get_text, safe_get_attr, make_request_with_retry
+from urllib.parse import urlparse, parse_qs
+import re
+from web.utils import (
+    get_base_url,
+    safe_get_text,
+    safe_get_attr,
+    make_request_with_retry,
+    get_negative_keywords,
+)
 
 
+def extract_contact_info(reply_url) -> Dict[str, str]:
+    """Extract contact information from a reply URL.
+
+    Parses mailto links, phone links, and contact form URLs to extract:
+    - email: Email address (from mailto links)
+    - phone: Phone number (from tel links or URL parameters)
+    - contact_name: Contact person name (if available in URL parameters)
+
+    Returns a dict with email, phone, and contact_name keys (values may be "N/A").
+    """
+    contact_info = {
+        "email": "N/A",
+        "phone": "N/A",
+        "contact_name": "N/A"
+    }
+
+    # Handle None or empty cases
+    if not reply_url or reply_url == "N/A":
+        return contact_info
+
+    reply_url = str(reply_url).strip()
+    if not reply_url or reply_url == "N/A":
+        return contact_info
+
+    try:
+        # Check for mailto links
+        if reply_url.startswith("mailto:"):
+            email_part = reply_url.replace("mailto:", "")
+            # Extract email (may contain ?subject=...)
+            email = email_part.split("?")[0]
+            contact_info["email"] = email
+            return contact_info
+
+        # Check for tel links
+        if reply_url.startswith("tel:"):
+            phone = reply_url.replace("tel:", "")
+            contact_info["phone"] = phone
+            return contact_info
+
+        # Parse as URL
+        if reply_url.startswith("http"):
+            parsed = urlparse(reply_url)
+            params = parse_qs(parsed.query)
+
+            # Try to extract email from parameters
+            for key in ["email", "from_email", "sender_email", "contact_email"]:
+                if key in params:
+                    contact_info["email"] = params[key][0]
+                    break
+
+            # Try to extract phone from parameters
+            for key in ["phone", "tel", "telephone"]:
+                if key in params:
+                    contact_info["phone"] = params[key][0]
+                    break
+
+            # Try to extract contact name from parameters
+            for key in ["contact_name", "from_name", "name"]:
+                if key in params:
+                    contact_info["contact_name"] = params[key][0]
+                    break
+    except Exception:
+        pass
+
+    return contact_info
+
+
 def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
@@ -40,6 +115,16 @@ def scrape_job_page(content: str, url: str) -> Dict:
     """Scrape job details from a job listing page."""
     soup = BeautifulSoup(content, "html.parser")
 
+    # Extract reply button
+    reply_button = soup.find("button", class_="reply-button")
+    if reply_button:
+        reply_url = safe_get_attr(reply_button, "data-href")
+    else:
+        reply_url = "N/A"
+
+    # Extract contact information from reply URL
+    contact_info = extract_contact_info(reply_url)
+
     # Extract each field
     title = safe_get_text(soup.find("h1", class_="postingtitle"))
     company = safe_get_text(soup.find("h2", class_="company-name"))
@@ -80,6 +165,30 @@ def scrape_job_page(content: str, url: str) -> Dict:
     job_id = ""
     posted_time = ""
 
+    # Negative keyword detection
+    negative_keyword_match = None
+    negative_match_field = None
+    negative_keywords = get_negative_keywords()
+    if negative_keywords:
+        fields_to_check = {
+            "title": title or "",
+            "company": company or "",
+            "location": location or "",
+            "description": description or "",
+        }
+        for keyword in negative_keywords:
+            if not keyword:
+                continue
+            pattern = re.compile(
+                r"\b" + re.escape(keyword) + r"\b", re.IGNORECASE)
+            for field_name, field_value in fields_to_check.items():
+                if field_value and pattern.search(field_value):
+                    negative_keyword_match = keyword
+                    negative_match_field = field_name
+                    break
+            if negative_keyword_match:
+                break
+
     return {
         "url": url,
         "title": title,
@@ -87,7 +196,14 @@ def scrape_job_page(content: str, url: str) -> Dict:
         "location": location,
         "description": description,
         "id": job_id,
-        "posted_time": posted_time
+        "posted_time": posted_time,
+        "reply_url": reply_url,
+        "contact_email": contact_info["email"],
+        "contact_phone": contact_info["phone"],
+        "contact_name": contact_info["contact_name"],
+        "negative_keyword_match": negative_keyword_match,
+        "negative_match_field": negative_match_field,
+        "is_negative_match": bool(negative_keyword_match),
     }
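
For reference, a quick usage sketch of the new extract_contact_info helper, based on the diff above (the example addresses and numbers are illustrative):

from web.scraper import extract_contact_info

# mailto links: the address is kept, any ?subject=... suffix is stripped
info = extract_contact_info("mailto:hiring@example.com?subject=Job")
assert info["email"] == "hiring@example.com"

# tel links yield the phone number
info = extract_contact_info("tel:+15551234567")
assert info["phone"] == "+15551234567"

# contact-form URLs are mined for known query parameters
info = extract_contact_info("https://example.com/reply?contact_email=hr@example.com&from_name=Sam")
assert info["email"] == "hr@example.com"
assert info["contact_name"] == "Sam"

# None, empty strings, and "N/A" fall back to the "N/A" defaults
assert extract_contact_info(None)["email"] == "N/A"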