feat: Implement email sending utilities and templates for job notifications

- Added email_service.py for sending emails with SMTP configuration (a hedged sketch of the assumed send_email interface follows this list).
- Introduced email_templates.py to render job alert email subjects and bodies (its assumed output shape is sketched just before the diff).
- Enhanced scraper.py to extract contact information from job listings.
- Updated settings.js to handle negative keyword input validation.
- Created email.html and email_templates.html for managing email subscriptions and templates in the admin interface.
- Modified base.html to include links for email alerts and templates.
- Expanded user settings.html to allow management of negative keywords.
- Updated utils.py to include functions for retrieving negative keywords and email settings.
- Enhanced job filtering logic to exclude jobs containing negative keywords.
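
email_service.py itself is not part of the excerpt below, so the following is only a minimal sketch of what its send_email helper might look like, built on the standard library smtplib and email.message. The settings keys used here (smtp_host, smtp_port, smtp_user, smtp_password, use_tls, from_address) are assumptions about what get_email_settings() returns, not confirmed names.

import smtplib
from email.message import EmailMessage


def send_email(subject: str, body: str, to: list[str], settings: dict) -> None:
    """Sketch of the send_email helper; key names in `settings` are assumed."""
    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = settings.get("from_address", "job-alerts@localhost")  # assumed key
    msg["To"] = ", ".join(to)
    msg.set_content(body)

    host = settings.get("smtp_host", "localhost")  # assumed key
    port = int(settings.get("smtp_port", 587))     # assumed key
    with smtplib.SMTP(host, port, timeout=30) as smtp:
        if settings.get("use_tls", True):          # assumed key
            smtp.starttls()
        user = settings.get("smtp_user")           # assumed key
        if user:
            smtp.login(user, settings.get("smtp_password", ""))
        smtp.send_message(msg)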
2025-11-28 18:15:08 +01:00
parent 8afb208985
commit 2185a07ff0
23 changed files with 2660 additions and 63 deletions
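
email_templates.py is likewise not shown in the diff. Judging from how the scraper consumes its output (payload.get("subject") and payload.get("body")), render_job_alert_email presumably returns a dict with those two keys; the wording and formatting below are illustrative assumptions only.

def render_job_alert_email(new_jobs: list[dict]) -> dict:
    """Sketch of the job-alert renderer; subject/body wording is assumed."""
    lines = []
    for job in new_jobs:
        lines.append(
            f"- {job.get('title', 'Unknown')} "
            f"({job.get('location', 'n/a')}, {job.get('pay', 'pay not listed')})\n"
            f"  {job.get('url', '')}"
        )
    return {
        "subject": f"{len(new_jobs)} new job(s) found by the scraper",
        "body": "The following new jobs were discovered:\n\n" + "\n".join(lines) + "\n",
    }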


@@ -18,18 +18,67 @@ import time
from web.utils import (
get_base_url,
make_request_with_retry,
now_iso,
get_email_settings,
)
from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings
from web.email_templates import render_job_alert_email
from web.email_service import send_email
def _negative_match_details(job_data: dict) -> tuple[str, str] | None:
"""Return (keyword, field) when job_data indicates a negative match."""
if not job_data or not job_data.get("is_negative_match"):
return None
keyword = (job_data.get("negative_keyword_match") or "").strip()
field = (job_data.get("negative_match_field")
or "unknown").strip() or "unknown"
if not keyword:
keyword = "unknown keyword"
return keyword, field
def _send_new_job_alert(new_jobs: list[dict]) -> tuple[bool, str]:
"""Send an email alert for newly discovered jobs.
Returns (sent, message) where message explains why mail was skipped.
"""
settings = get_email_settings()
if not settings.get("enabled"):
return False, "email alerts disabled"
recipients = settings.get("recipients", []) or []
if not recipients:
return False, "no recipients configured"
payload = render_job_alert_email(new_jobs)
send_email(
subject=payload.get("subject", "New jobs available"),
body=payload.get("body", ""),
to=recipients,
settings=settings,
)
return True, "sent"
def fetch_listings():
"""Fetch job listings from all regions and keywords."""
"""Fetch job listings from all regions and keywords.
Yields progress messages and returns a dict with:
- discovered: total number of unique job URLs discovered
- new: total number of new jobs added to the database
- by_search: list of dicts, each containing:
- region: region name
- keyword: keyword name
- count: number of jobs fetched for this search
"""
# We'll collect URLs discovered in this run and then remove any DB listings
# not present in this set (treat DB as reflecting current search results).
existing_db_urls = set(row['url'] for row in db_get_all_job_urls())
discovered_urls = set()
new_rows = []
new_jobs = []
search_results = [] # Track count per search
# Ensure regions/keywords master lists exist
try:
@@ -58,13 +107,14 @@ def fetch_listings():
# Build a canonical search identifier for this region+keyword combination.
url = get_base_url().format(region=region, keyword=keyword_name.replace(" ", "+"))
search_page_id = f"search:{region_name}:{keyword_name}"
search_count = 0 # Count jobs for this search
try:
last = get_last_fetch_time(url)
if last is not None:
# skip if fetched within the last 24 hours
# skip if fetched within the last hour
age = datetime.now(
timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
if age.total_seconds() < 24 * 3600:
if age.total_seconds() < 1 * 3600:
yield f"Skipping {region_name} + {keyword_name} (fetched {age.seconds//3600}h ago)...\n"
processed += 1
continue
@@ -82,8 +132,18 @@ def fetch_listings():
for row in process_region_keyword(region_name, keyword_name, discovered_urls):
timestamp, region, keyword, title, pay, location, url = row
discovered_urls.add(url)
search_count += 1
if url not in existing_db_urls:
new_rows.append(row)
new_jobs.append({
"timestamp": timestamp,
"region": region,
"keyword": keyword,
"title": title,
"pay": pay,
"location": location,
"url": url,
})
# Upsert or update listing to reflect current search result
upsert_listing(
url=url,
@@ -96,18 +156,29 @@ def fetch_listings():
fetched_from=search_page_id,
fetched_at=datetime.now(timezone.utc),
)
# Record per-search count
search_results.append({
"region": region_name,
"keyword": keyword_name,
"count": search_count
})
yield f"Listing fetch complete: {len(discovered_urls)} discovered, {len(new_rows)} new,\n"
return {"discovered": len(discovered_urls), "new": len(new_rows)}
return {
"discovered": len(discovered_urls),
"new": len(new_rows),
"by_search": search_results,
"new_jobs": new_jobs,
}
def process_job_url(job_url: str, region: str = "", keyword: str = ""):
last = get_last_fetch_time(job_url)
if last is not None:
# skip if fetched within the last 24 hours
# skip if fetched within the last hour
age = datetime.now(
timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
if age.total_seconds() < 24 * 3600:
if age.total_seconds() < 1 * 3600:
yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n"
return None
@@ -124,10 +195,17 @@ def process_job_url(job_url: str, region: str = "", keyword: str = ""):
yield f"Scraping job data from {job_url}\n"
job_data = scrape_job_page(content, job_url)
if job_data:
negative_info = _negative_match_details(job_data)
if negative_info:
keyword, field = negative_info
yield (
f"Skipping job {job_id} due to negative keyword "
f"'{keyword}' in {field}\n"
)
remove_job(job_url)
return None
yield f"Upserting job details for {job_id}\n"
upsert_job_details(job_data, region=region, keyword=keyword)
upsert_user_interaction(
job_id, seen_at=datetime.now(timezone.utc).isoformat())
yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
return job_data
else:
@@ -146,8 +224,29 @@ def scraper():
# First, fetch current listings from search pages and make DB reflect them.
yield "Fetching listings...\n"
for message in fetch_listings():
yield message
listing_summary: dict | None = None
fetch_iter = fetch_listings()
try:
while True:
message = next(fetch_iter)
yield message
except StopIteration as stop:
listing_summary = stop.value if isinstance(stop.value, dict) else {}
new_jobs = []
if listing_summary:
new_jobs = listing_summary.get("new_jobs", []) or []
if new_jobs:
yield f"Preparing email alert for {len(new_jobs)} new jobs...\n"
try:
sent, info = _send_new_job_alert(new_jobs)
if sent:
yield "Job alert email sent.\n"
else:
yield f"Skipping email alert: {info}\n"
except Exception as exc:
yield f"Failed to send job alert email: {exc}\n"
# Finally, fetch and refresh individual job pages for current listings
job_urls = db_get_all_job_urls()