feat: Implement email sending utilities and templates for job notifications
Some checks failed
CI/CD Pipeline / test (push) Failing after 4m9s
- Added email_service.py for sending emails with SMTP configuration.
- Introduced email_templates.py to render job alert email subjects and bodies.
- Enhanced scraper.py to extract contact information from job listings.
- Updated settings.js to handle negative keyword input validation.
- Created email.html and email_templates.html for managing email subscriptions and templates in the admin interface.
- Modified base.html to include links for email alerts and templates.
- Expanded the user settings.html page to allow management of negative keywords.
- Updated utils.py to include functions for retrieving negative keywords and email settings.
- Enhanced job filtering logic to exclude jobs containing negative keywords (a sketch of the intended behavior follows this list).
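
As a rough sketch of the filtering behavior described above (the `filter_jobs` wrapper and its `jobs` input are hypothetical wiring; `get_negative_keywords` from web/utils.py and the `is_negative_match` flag set by `scrape_job_page` are what this commit actually adds):

import re

from web.utils import get_negative_keywords

# Matching mirrors the scraper: whole-word and case-insensitive, so a
# negative keyword "senior" flags "Senior Developer" but not "seniority".
def matches_negative_keyword(text: str) -> bool:
    for keyword in get_negative_keywords():
        if keyword and re.search(r"\b" + re.escape(keyword) + r"\b", text, re.IGNORECASE):
            return True
    return False

# Hypothetical downstream filter over the dicts returned by
# scrape_job_page, which now carry an is_negative_match flag.
def filter_jobs(jobs: list) -> list:
    return [job for job in jobs if not job["is_negative_match"]]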
web/scraper.py (120 lines changed)
@@ -1,7 +1,82 @@
 from datetime import datetime, UTC
 from bs4 import BeautifulSoup
 from typing import List, Dict, Set
-from web.utils import get_base_url, safe_get_text, safe_get_attr, make_request_with_retry
+from urllib.parse import urlparse, parse_qs
+import re
+from web.utils import (
+    get_base_url,
+    safe_get_text,
+    safe_get_attr,
+    make_request_with_retry,
+    get_negative_keywords,
+)
 
 
+def extract_contact_info(reply_url) -> Dict[str, str]:
+    """Extract contact information from a reply URL.
+
+    Parses mailto links, phone links, and contact form URLs to extract:
+    - email: Email address (from mailto links)
+    - phone: Phone number (from tel links or URL parameters)
+    - contact_name: Contact person name (if available in URL parameters)
+
+    Returns a dict with email, phone, and contact_name keys (values may be "N/A").
+    """
+    contact_info = {
+        "email": "N/A",
+        "phone": "N/A",
+        "contact_name": "N/A"
+    }
+
+    # Handle None or empty cases
+    if not reply_url or reply_url == "N/A":
+        return contact_info
+
+    reply_url = str(reply_url).strip()
+    if not reply_url or reply_url == "N/A":
+        return contact_info
+
+    try:
+        # Check for mailto links
+        if reply_url.startswith("mailto:"):
+            email_part = reply_url.replace("mailto:", "")
+            # Extract email (may contain ?subject=...)
+            email = email_part.split("?")[0]
+            contact_info["email"] = email
+            return contact_info
+
+        # Check for tel links
+        if reply_url.startswith("tel:"):
+            phone = reply_url.replace("tel:", "")
+            contact_info["phone"] = phone
+            return contact_info
+
+        # Parse as URL
+        if reply_url.startswith("http"):
+            parsed = urlparse(reply_url)
+            params = parse_qs(parsed.query)
+
+            # Try to extract email from parameters
+            for key in ["email", "from_email", "sender_email", "contact_email"]:
+                if key in params:
+                    contact_info["email"] = params[key][0]
+                    break
+
+            # Try to extract phone from parameters
+            for key in ["phone", "tel", "telephone"]:
+                if key in params:
+                    contact_info["phone"] = params[key][0]
+                    break
+
+            # Try to extract contact name from parameters
+            for key in ["contact_name", "from_name", "name"]:
+                if key in params:
+                    contact_info["contact_name"] = params[key][0]
+                    break
+    except Exception:
+        pass
+
+    return contact_info
+
+
 def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
@@ -40,6 +115,16 @@ def scrape_job_page(content: str, url: str) -> Dict:
     """Scrape job details from a job listing page."""
     soup = BeautifulSoup(content, "html.parser")
 
+    # Extract reply button
+    reply_button = soup.find("button", class_="reply-button")
+    if reply_button:
+        reply_url = safe_get_attr(reply_button, "data-href")
+    else:
+        reply_url = "N/A"
+
+    # Extract contact information from reply URL
+    contact_info = extract_contact_info(reply_url)
+
     # Extract each field
     title = safe_get_text(soup.find("h1", class_="postingtitle"))
     company = safe_get_text(soup.find("h2", class_="company-name"))
@@ -80,6 +165,30 @@ def scrape_job_page(content: str, url: str) -> Dict:
     job_id = ""
     posted_time = ""
 
+    # Negative keyword detection
+    negative_keyword_match = None
+    negative_match_field = None
+    negative_keywords = get_negative_keywords()
+    if negative_keywords:
+        fields_to_check = {
+            "title": title or "",
+            "company": company or "",
+            "location": location or "",
+            "description": description or "",
+        }
+        for keyword in negative_keywords:
+            if not keyword:
+                continue
+            pattern = re.compile(
+                r"\b" + re.escape(keyword) + r"\b", re.IGNORECASE)
+            for field_name, field_value in fields_to_check.items():
+                if field_value and pattern.search(field_value):
+                    negative_keyword_match = keyword
+                    negative_match_field = field_name
+                    break
+            if negative_keyword_match:
+                break
+
     return {
         "url": url,
         "title": title,
@@ -87,7 +196,14 @@ def scrape_job_page(content: str, url: str) -> Dict:
         "location": location,
         "description": description,
         "id": job_id,
-        "posted_time": posted_time
+        "posted_time": posted_time,
+        "reply_url": reply_url,
+        "contact_email": contact_info["email"],
+        "contact_phone": contact_info["phone"],
+        "contact_name": contact_info["contact_name"],
+        "negative_keyword_match": negative_keyword_match,
+        "negative_match_field": negative_match_field,
+        "is_negative_match": bool(negative_keyword_match),
     }
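
For reference, a quick usage sketch of the new extract_contact_info helper, based on the diff above (the example addresses and numbers are illustrative):

from web.scraper import extract_contact_info

# mailto links: the address is kept, any ?subject=... suffix is stripped
info = extract_contact_info("mailto:hiring@example.com?subject=Job")
assert info["email"] == "hiring@example.com"

# tel links yield the phone number
info = extract_contact_info("tel:+15551234567")
assert info["phone"] == "+15551234567"

# contact-form URLs are mined for known query parameters
info = extract_contact_info("https://example.com/reply?contact_email=hr@example.com&from_name=Sam")
assert info["email"] == "hr@example.com"
assert info["contact_name"] == "Sam"

# None, empty strings, and "N/A" fall back to the "N/A" defaults
assert extract_contact_info(None)["email"] == "N/A"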