feat: Implement email sending utilities and templates for job notifications
Some checks failed
CI/CD Pipeline / test (push) Failing after 4m9s
Some checks failed
CI/CD Pipeline / test (push) Failing after 4m9s
- Added email_service.py for sending emails with SMTP configuration.
- Introduced email_templates.py to render job alert email subjects and bodies.
- Enhanced scraper.py to extract contact information from job listings.
- Updated settings.js to handle negative keyword input validation.
- Created email.html and email_templates.html for managing email subscriptions and templates in the admin interface.
- Modified base.html to include links for email alerts and templates.
- Expanded user settings.html to allow management of negative keywords.
- Updated utils.py to include functions for retrieving negative keywords and email settings.
- Enhanced job filtering logic to exclude jobs containing negative keywords.
This commit is contained in:
384
tests/test_scraper.py
Normal file
384
tests/test_scraper.py
Normal file
@@ -0,0 +1,384 @@
|
||||
import pytest
|
||||
from web.scraper import scrape_job_page, extract_contact_info
|
||||
from web.craigslist import process_job_url, scraper
|
||||
|
||||
|
||||
def _make_negative_job(url: str) -> dict:
|
||||
return {
|
||||
"url": url,
|
||||
"title": "SCAM role",
|
||||
"company": "Test Co",
|
||||
"location": "Remote",
|
||||
"description": "This is a scam offer",
|
||||
"id": "job123",
|
||||
"posted_time": "",
|
||||
"reply_url": "N/A",
|
||||
"contact_email": "N/A",
|
||||
"contact_phone": "N/A",
|
||||
"contact_name": "N/A",
|
||||
"is_negative_match": True,
|
||||
"negative_keyword_match": "scam",
|
||||
"negative_match_field": "title",
|
||||
}
|
||||
|
||||
|
||||
class TestExtractContactInfo:
    """Tests for extract_contact_info over mailto links, tel links, and URLs."""

    def test_extract_email_from_mailto_link(self):
        """A mailto: link yields the address; the other fields stay N/A."""
        result = extract_contact_info("mailto:contact@example.com?subject=Job%20Inquiry")

        assert result["email"] == "contact@example.com"
        assert result["phone"] == "N/A"
        assert result["contact_name"] == "N/A"

    def test_extract_phone_from_tel_link(self):
        """A tel: link yields the number; the other fields stay N/A."""
        result = extract_contact_info("tel:+1234567890")

        assert result["phone"] == "+1234567890"
        assert result["email"] == "N/A"
        assert result["contact_name"] == "N/A"

    def test_extract_email_from_url_parameter(self):
        """Email and name are recovered from query-string parameters."""
        url = "https://example.com/contact?email=jobs@company.com&name=John%20Doe"
        result = extract_contact_info(url)

        assert result["email"] == "jobs@company.com"
        assert result["contact_name"] == "John Doe"

    def test_extract_phone_from_url_parameter(self):
        """Phone and email are recovered from query-string parameters."""
        url = "https://example.com/apply?phone=555-1234&email=contact@test.com"
        result = extract_contact_info(url)

        assert result["phone"] == "555-1234"
        assert result["email"] == "contact@test.com"

    def test_extract_contact_name_from_url_parameter(self):
        """When both name params appear, contact_name wins over name."""
        url = "https://example.com/reply?name=Alice%20Smith&contact_name=Bob%20Jones"
        result = extract_contact_info(url)

        assert result["contact_name"] == "Bob Jones"

    def test_extract_all_fields_from_url(self):
        """All three contact fields can be recovered from a single URL."""
        url = (
            "https://example.com/contact"
            "?email=hr@company.com&phone=555-9876&contact_name=Jane%20Doe"
        )
        result = extract_contact_info(url)

        assert result["email"] == "hr@company.com"
        assert result["phone"] == "555-9876"
        assert result["contact_name"] == "Jane Doe"

    def test_handle_empty_reply_url(self):
        """An empty string produces all-N/A output."""
        result = extract_contact_info("")

        for field in ("email", "phone", "contact_name"):
            assert result[field] == "N/A"

    def test_handle_na_reply_url(self):
        """The literal N/A sentinel produces all-N/A output."""
        result = extract_contact_info("N/A")

        for field in ("email", "phone", "contact_name"):
            assert result[field] == "N/A"

    def test_handle_none_reply_url(self):
        """None is tolerated and produces all-N/A output."""
        result = extract_contact_info(None)

        for field in ("email", "phone", "contact_name"):
            assert result[field] == "N/A"

    def test_handle_invalid_url(self):
        """Unparseable input falls back to N/A values instead of raising."""
        result = extract_contact_info("not a valid url at all")

        for field in ("email", "phone", "contact_name"):
            assert result[field] == "N/A"

    def test_multiple_parameter_variations(self):
        """Alternative parameter names such as from_email are recognized."""
        result = extract_contact_info(
            "https://example.com/reply?from_email=sender@test.com&other=value"
        )

        assert result["email"] == "sender@test.com"

    def test_telephone_parameter_name(self):
        """The 'telephone' parameter name maps onto the phone field."""
        result = extract_contact_info("https://example.com/contact?telephone=555-0000")

        assert result["phone"] == "555-0000"
|
||||
|
||||
|
||||
class TestScrapeJobPageContactInfo:
    """Tests for contact-info and negative-keyword output of scrape_job_page."""

    def test_scrape_job_page_includes_contact_fields(self):
        """Every scrape result exposes reply_url plus the three contact keys."""
        page_html = """
        <html>
        <h1 class="postingtitle">Software Engineer</h1>
        <h2 class="company-name">Tech Company</h2>
        <button class="reply-button" data-href="mailto:jobs@techco.com"></button>
        <div id="map" data-latitude="37.7749" data-longitude="-122.4194" data-accuracy="rooftop"></div>
        <section id="postingbody">
        <p>This is a test job description</p>
        </section>
        <div class="postinginfos">
        <p class="postinginfo">posting id: 12345abc</p>
        <time class="date timeago" datetime="2025-11-03T10:00:00"></time>
        </div>
        </html>
        """

        scraped = scrape_job_page(page_html, "https://example.com/job/123")

        for key in ("contact_email", "contact_phone", "contact_name", "reply_url"):
            assert key in scraped

    def test_scrape_job_page_extracts_mailto_contact(self):
        """A mailto reply button populates contact_email and reply_url."""
        page_html = """
        <html>
        <h1 class="postingtitle">Job Title</h1>
        <h2 class="company-name">Company</h2>
        <button class="reply-button" data-href="mailto:hiring@company.com?subject=Application"></button>
        <div id="map"></div>
        <section id="postingbody"><p>Job desc</p></section>
        <div class="postinginfos">
        <p class="postinginfo">id: xyz</p>
        </div>
        </html>
        """

        scraped = scrape_job_page(page_html, "https://example.com/job/456")

        assert scraped["contact_email"] == "hiring@company.com"
        assert scraped["reply_url"] == "mailto:hiring@company.com?subject=Application"

    def test_scrape_job_page_no_reply_button(self):
        """Without a reply button, every contact field defaults to N/A."""
        page_html = """
        <html>
        <h1 class="postingtitle">Job Title</h1>
        <h2 class="company-name">Company</h2>
        <div id="map"></div>
        <section id="postingbody"><p>Job desc</p></section>
        <div class="postinginfos">
        <p class="postinginfo">id: xyz</p>
        </div>
        </html>
        """

        scraped = scrape_job_page(page_html, "https://example.com/job/789")

        for key in ("reply_url", "contact_email", "contact_phone", "contact_name"):
            assert scraped[key] == "N/A"

    def test_scrape_job_page_with_url_based_reply(self):
        """A URL-style reply link yields contact data from its query string."""
        page_html = """
        <html>
        <h1 class="postingtitle">Manager Position</h1>
        <h2 class="company-name">BigCorp</h2>
        <button class="reply-button" data-href="https://apply.bigcorp.com?email=hr@bigcorp.com&name=HR%20Team"></button>
        <div id="map"></div>
        <section id="postingbody"><p>Apply now</p></section>
        <div class="postinginfos">
        <p class="postinginfo">id: manager123</p>
        </div>
        </html>
        """

        scraped = scrape_job_page(page_html, "https://example.com/job/999")

        assert scraped["contact_email"] == "hr@bigcorp.com"
        assert scraped["contact_name"] == "HR Team"

    def test_scrape_job_page_negative_keyword_match(self, monkeypatch):
        """A configured negative keyword flags the job and records the match."""
        monkeypatch.setattr("web.scraper.get_negative_keywords", lambda: ["scam"])

        page_html = """
        <html>
        <h1 class="postingtitle">Great Opportunity</h1>
        <h2 class="company-name">SCAM Corp</h2>
        <section id="postingbody"><p>This is a scam offer</p></section>
        </html>
        """

        scraped = scrape_job_page(page_html, "https://example.com/job/negative")

        assert scraped["is_negative_match"] is True
        assert scraped["negative_keyword_match"] == "scam"
        # The keyword appears in several fields; any of these is acceptable.
        assert scraped["negative_match_field"] in {"title", "company", "description"}

    def test_scrape_job_page_no_negative_match(self, monkeypatch):
        """Jobs free of negative keywords come back unflagged."""
        monkeypatch.setattr("web.scraper.get_negative_keywords", lambda: ["scam"])

        page_html = """
        <html>
        <h1 class="postingtitle">Legit Opportunity</h1>
        <h2 class="company-name">Honest Corp</h2>
        <section id="postingbody"><p>We pay well and on time.</p></section>
        </html>
        """

        scraped = scrape_job_page(page_html, "https://example.com/job/positive")

        assert scraped["is_negative_match"] is False
        assert scraped["negative_keyword_match"] is None
        assert scraped["negative_match_field"] is None
|
||||
|
||||
|
||||
class TestProcessJobUrlNegativeFiltering:
    """process_job_url must remove (never store) negatively-flagged jobs."""

    def test_process_job_url_skips_negative_match(self, monkeypatch):
        job_url = "https://example.com/job/negative"
        removed = []
        stored = []

        # Stub out DB and network access so only the filtering logic runs.
        monkeypatch.setattr("web.craigslist.get_last_fetch_time", lambda url: None)
        monkeypatch.setattr("web.craigslist.insert_log", lambda *args, **kwargs: None)
        monkeypatch.setattr(
            "web.craigslist.make_request_with_retry",
            lambda url, attempts: "<html />",
        )
        monkeypatch.setattr(
            "web.craigslist.scrape_job_page",
            lambda content, url: _make_negative_job(url),
        )
        # Record persistence calls so we can assert on what was (not) written.
        monkeypatch.setattr(
            "web.craigslist.upsert_job_details",
            lambda job_data, region="", keyword="": stored.append(job_data),
        )
        monkeypatch.setattr(
            "web.craigslist.remove_job", lambda url: removed.append(url)
        )

        messages = list(process_job_url(job_url, region="test", keyword="kw"))

        assert any("Skipping job" in message for message in messages)
        assert removed == [job_url]
        assert stored == []
|
||||
|
||||
|
||||
class TestScraperPipelineNegativeFiltering:
    """The full scraper pipeline also drops negative-keyword jobs."""

    def test_scraper_skips_negative_jobs(self, monkeypatch):
        job_url = "https://example.com/job/negative"
        removed = []
        stored = []

        def fake_fetch_listings():
            # Yield one progress line, then return an empty summary.
            yield "Fake listing fetch\n"
            return {"discovered": 0, "new": 0, "by_search": [], "new_jobs": []}

        # Stub out DB, listing fetch, and network so only filtering runs.
        monkeypatch.setattr("web.craigslist.db_init", lambda: None)
        monkeypatch.setattr("web.craigslist.fetch_listings", fake_fetch_listings)
        monkeypatch.setattr(
            "web.craigslist.db_get_all_job_urls",
            lambda: [{"url": job_url, "region": "reg", "keyword": "kw"}],
        )
        monkeypatch.setattr("web.craigslist.get_last_fetch_time", lambda url: None)
        monkeypatch.setattr("web.craigslist.insert_log", lambda *args, **kwargs: None)
        monkeypatch.setattr(
            "web.craigslist.make_request_with_retry",
            lambda url, attempts: "<html />",
        )
        monkeypatch.setattr("web.craigslist.url_to_job_id", lambda url: "job123")
        monkeypatch.setattr(
            "web.craigslist.scrape_job_page",
            lambda content, url: _make_negative_job(url),
        )
        # Record persistence calls so we can assert on what was (not) written.
        monkeypatch.setattr(
            "web.craigslist.upsert_job_details",
            lambda job_data, region="", keyword="": stored.append(job_data),
        )
        monkeypatch.setattr(
            "web.craigslist.remove_job", lambda url: removed.append(url)
        )

        messages = list(scraper())

        assert any("Skipping job" in message for message in messages)
        assert removed == [job_url]
        assert stored == []
|
||||
|
||||
|
||||
class TestScraperEmailNotifications:
    """The scraper should send an email alert when new jobs are discovered."""

    def test_scraper_sends_email_for_new_jobs(self, monkeypatch):
        new_jobs = [
            {
                "title": "Python Developer",
                "company": "Acme",
                "location": "Remote",
                "url": "https://example.com/jobs/1",
            }
        ]

        def fake_fetch_listings():
            # One new job discovered during the listing pass.
            yield "Fake listing fetch\n"
            return {
                "discovered": 1,
                "new": 1,
                "by_search": [],
                "new_jobs": new_jobs,
            }

        sent = {}

        def fake_send_alert(jobs):
            sent["jobs"] = jobs
            return True, "sent"

        monkeypatch.setattr("web.craigslist.db_init", lambda: None)
        monkeypatch.setattr("web.craigslist.fetch_listings", fake_fetch_listings)
        monkeypatch.setattr("web.craigslist.db_get_all_job_urls", lambda: [])
        monkeypatch.setattr("web.craigslist._send_new_job_alert", fake_send_alert)

        messages = list(scraper())

        assert sent["jobs"] == new_jobs
        assert any("Job alert email sent." in message for message in messages)
|
||||
Reference in New Issue
Block a user