"""Tests for contact-info extraction, negative-keyword filtering, and scraper alerts.

Covers:
  * ``extract_contact_info`` parsing of mailto/tel links and URL query parameters.
  * ``scrape_job_page`` contact fields and negative-keyword flagging.
  * ``process_job_url`` / ``scraper`` pipeline skipping of negative-matched jobs.
  * ``scraper`` email notification for newly discovered jobs.

NOTE(review): the HTML fixture strings below appear to have lost their markup
tags at some point (e.g. the mailto fixture's assertion expects a
``mailto:hiring@company.com`` reply link that is no longer visible in the
fixture text) — verify the fixtures against the scraper's actual selectors.
"""

import pytest

from web.scraper import scrape_job_page, extract_contact_info
from web.craigslist import process_job_url, scraper


def _make_negative_job(url: str) -> dict:
    """Build a fake job dict that is flagged as a negative-keyword match."""
    return {
        "url": url,
        "title": "SCAM role",
        "company": "Test Co",
        "location": "Remote",
        "description": "This is a scam offer",
        "id": "job123",
        "posted_time": "",
        "reply_url": "N/A",
        "contact_email": "N/A",
        "contact_phone": "N/A",
        "contact_name": "N/A",
        "is_negative_match": True,
        "negative_keyword_match": "scam",
        "negative_match_field": "title",
    }


class TestExtractContactInfo:
    """Test suite for contact information extraction."""

    def test_extract_email_from_mailto_link(self):
        """Test extraction of email from mailto link."""
        reply_url = "mailto:contact@example.com?subject=Job%20Inquiry"
        contact_info = extract_contact_info(reply_url)
        assert contact_info["email"] == "contact@example.com"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_extract_phone_from_tel_link(self):
        """Test extraction of phone from tel link."""
        reply_url = "tel:+1234567890"
        contact_info = extract_contact_info(reply_url)
        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "+1234567890"
        assert contact_info["contact_name"] == "N/A"

    def test_extract_email_from_url_parameter(self):
        """Test extraction of email from URL query parameters."""
        reply_url = "https://example.com/contact?email=jobs@company.com&name=John%20Doe"
        contact_info = extract_contact_info(reply_url)
        assert contact_info["email"] == "jobs@company.com"
        assert contact_info["contact_name"] == "John Doe"

    def test_extract_phone_from_url_parameter(self):
        """Test extraction of phone from URL query parameters."""
        reply_url = "https://example.com/apply?phone=555-1234&email=contact@test.com"
        contact_info = extract_contact_info(reply_url)
        assert contact_info["phone"] == "555-1234"
        assert contact_info["email"] == "contact@test.com"

    def test_extract_contact_name_from_url_parameter(self):
        """Test extraction of contact name from URL query parameters."""
        reply_url = "https://example.com/reply?name=Alice%20Smith&contact_name=Bob%20Jones"
        contact_info = extract_contact_info(reply_url)
        # Should prefer contact_name over name
        assert contact_info["contact_name"] == "Bob Jones"

    def test_extract_all_fields_from_url(self):
        """Test extraction of all fields from URL parameters."""
        reply_url = "https://example.com/contact?email=hr@company.com&phone=555-9876&contact_name=Jane%20Doe"
        contact_info = extract_contact_info(reply_url)
        assert contact_info["email"] == "hr@company.com"
        assert contact_info["phone"] == "555-9876"
        assert contact_info["contact_name"] == "Jane Doe"

    def test_handle_empty_reply_url(self):
        """Test handling of empty reply URL."""
        contact_info = extract_contact_info("")
        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_handle_na_reply_url(self):
        """Test handling of N/A reply URL."""
        contact_info = extract_contact_info("N/A")
        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_handle_none_reply_url(self):
        """Test handling of None reply URL."""
        contact_info = extract_contact_info(None)
        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_handle_invalid_url(self):
        """Test handling of invalid URL (graceful fallback)."""
        reply_url = "not a valid url at all"
        contact_info = extract_contact_info(reply_url)
        # Should return all N/A values without crashing
        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_multiple_parameter_variations(self):
        """Test that function finds email despite multiple parameter name variations."""
        reply_url = "https://example.com/reply?from_email=sender@test.com&other=value"
        contact_info = extract_contact_info(reply_url)
        assert contact_info["email"] == "sender@test.com"

    def test_telephone_parameter_name(self):
        """Test extraction using 'telephone' parameter name."""
        reply_url = "https://example.com/contact?telephone=555-0000"
        contact_info = extract_contact_info(reply_url)
        assert contact_info["phone"] == "555-0000"


class TestScrapeJobPageContactInfo:
    """Test suite for scrape_job_page contact information extraction."""

    def test_scrape_job_page_includes_contact_fields(self):
        """Test that scrape_job_page includes contact information in return dict."""
        html_content = """
 
Software Engineer
 
Tech Company
 
This is a test job description
 
posting id: 12345abc
 
"""
        job_data = scrape_job_page(html_content, "https://example.com/job/123")
        # Verify all expected keys are present
        assert "contact_email" in job_data
        assert "contact_phone" in job_data
        assert "contact_name" in job_data
        assert "reply_url" in job_data

    def test_scrape_job_page_extracts_mailto_contact(self):
        """Test that scrape_job_page correctly extracts email from mailto link."""
        html_content = """
 
Job Title
 
Company
 
Job desc
 
id: xyz
 
"""
        job_data = scrape_job_page(html_content, "https://example.com/job/456")
        assert job_data["contact_email"] == "hiring@company.com"
        assert job_data["reply_url"] == "mailto:hiring@company.com?subject=Application"

    def test_scrape_job_page_no_reply_button(self):
        """Test scrape_job_page when no reply button is present."""
        html_content = """
 
Job Title
 
Company
 
Job desc
 
id: xyz
 
"""
        job_data = scrape_job_page(html_content, "https://example.com/job/789")
        # Should have N/A for all contact fields
        assert job_data["reply_url"] == "N/A"
        assert job_data["contact_email"] == "N/A"
        assert job_data["contact_phone"] == "N/A"
        assert job_data["contact_name"] == "N/A"

    def test_scrape_job_page_with_url_based_reply(self):
        """Test scrape_job_page with URL-based reply link containing contact info."""
        html_content = """
 
Manager Position
 
BigCorp
 
Apply now
 
id: manager123
 
"""
        job_data = scrape_job_page(html_content, "https://example.com/job/999")
        assert job_data["contact_email"] == "hr@bigcorp.com"
        assert job_data["contact_name"] == "HR Team"

    def test_scrape_job_page_negative_keyword_match(self, monkeypatch):
        """Test that negative keyword detection flags matching jobs."""
        monkeypatch.setattr(
            "web.scraper.get_negative_keywords", lambda: ["scam"])
        html_content = """
 
Great Opportunity
 
SCAM Corp
 
This is a scam offer
 
"""
        job_data = scrape_job_page(
            html_content, "https://example.com/job/negative")
        assert job_data["is_negative_match"] is True
        assert job_data["negative_keyword_match"] == "scam"
        assert job_data["negative_match_field"] in {
            "title", "company", "description"}

    def test_scrape_job_page_no_negative_match(self, monkeypatch):
        """Test that jobs without matching keywords are not flagged."""
        monkeypatch.setattr(
            "web.scraper.get_negative_keywords", lambda: ["scam"])
        html_content = """
 
Legit Opportunity
 
Honest Corp
 
We pay well and on time.
 
"""
        job_data = scrape_job_page(
            html_content, "https://example.com/job/positive")
        assert job_data["is_negative_match"] is False
        assert job_data["negative_keyword_match"] is None
        assert job_data["negative_match_field"] is None


class TestProcessJobUrlNegativeFiltering:
    """process_job_url must skip (and remove) jobs flagged as negative matches."""

    def test_process_job_url_skips_negative_match(self, monkeypatch):
        job_url = "https://example.com/job/negative"
        remove_calls = []
        upsert_calls = []

        # Stub out all external collaborators so only the filtering logic runs.
        monkeypatch.setattr(
            "web.craigslist.get_last_fetch_time", lambda url: None)
        monkeypatch.setattr(
            "web.craigslist.insert_log",
            lambda *args, **kwargs: None,
        )
        monkeypatch.setattr(
            "web.craigslist.make_request_with_retry",
            lambda url, attempts: "",
        )
        monkeypatch.setattr(
            "web.craigslist.scrape_job_page",
            lambda content, url: _make_negative_job(url),
        )

        def fake_upsert(job_data, region="", keyword=""):
            upsert_calls.append(job_data)

        def fake_remove(url):
            remove_calls.append(url)

        monkeypatch.setattr("web.craigslist.upsert_job_details", fake_upsert)
        monkeypatch.setattr("web.craigslist.remove_job", fake_remove)

        messages = list(process_job_url(job_url, region="test", keyword="kw"))

        # The job must be reported as skipped, removed, and never upserted.
        assert any("Skipping job" in message for message in messages)
        assert remove_calls == [job_url]
        assert upsert_calls == []


class TestScraperPipelineNegativeFiltering:
    """The full scraper pipeline must also skip negative-matched jobs."""

    def test_scraper_skips_negative_jobs(self, monkeypatch):
        job_url = "https://example.com/job/negative"
        remove_calls = []
        upsert_calls = []

        monkeypatch.setattr("web.craigslist.db_init", lambda: None)

        def fake_fetch_listings():
            yield "Fake listing fetch\n"
            return {"discovered": 0, "new": 0, "by_search": [], "new_jobs": []}

        monkeypatch.setattr("web.craigslist.fetch_listings", fake_fetch_listings)
        monkeypatch.setattr(
            "web.craigslist.db_get_all_job_urls",
            lambda: [{"url": job_url, "region": "reg", "keyword": "kw"}],
        )
        monkeypatch.setattr(
            "web.craigslist.get_last_fetch_time", lambda url: None)
        monkeypatch.setattr("web.craigslist.insert_log", lambda *args, **kwargs: None)
        monkeypatch.setattr(
            "web.craigslist.make_request_with_retry", lambda url, attempts: ""
        )
        monkeypatch.setattr("web.craigslist.url_to_job_id", lambda url: "job123")
        monkeypatch.setattr(
            "web.craigslist.scrape_job_page",
            lambda content, url: _make_negative_job(url),
        )

        def fake_upsert(job_data, region="", keyword=""):
            upsert_calls.append(job_data)

        def fake_remove(url):
            remove_calls.append(url)

        monkeypatch.setattr("web.craigslist.upsert_job_details", fake_upsert)
        monkeypatch.setattr("web.craigslist.remove_job", fake_remove)

        messages = list(scraper())

        assert any("Skipping job" in message for message in messages)
        assert remove_calls == [job_url]
        assert upsert_calls == []


class TestScraperEmailNotifications:
    """scraper() must send an alert email when new jobs are discovered."""

    def test_scraper_sends_email_for_new_jobs(self, monkeypatch):
        monkeypatch.setattr("web.craigslist.db_init", lambda: None)
        new_jobs = [
            {
                "title": "Python Developer",
                "company": "Acme",
                "location": "Remote",
                "url": "https://example.com/jobs/1",
            }
        ]

        def fake_fetch_listings():
            yield "Fake listing fetch\n"
            return {
                "discovered": 1,
                "new": 1,
                "by_search": [],
                "new_jobs": new_jobs,
            }

        monkeypatch.setattr("web.craigslist.fetch_listings", fake_fetch_listings)
        monkeypatch.setattr("web.craigslist.db_get_all_job_urls", lambda: [])

        calls = {}

        def fake_send_alert(jobs):
            calls["jobs"] = jobs
            return True, "sent"

        monkeypatch.setattr("web.craigslist._send_new_job_alert", fake_send_alert)

        messages = list(scraper())

        assert calls["jobs"] == new_jobs
        assert any("Job alert email sent." in message for message in messages)