import pytest
import time
from unittest.mock import patch, MagicMock

from web.craigslist import scrape_jobs_with_retry, run_scheduled_scraping, fetch_listings


def _drain_generator(gen):
    """Exhaust *gen* and return ``(messages, return_value)``.

    A generator's return value travels on ``StopIteration.value``; this
    helper centralizes the drain loop that two fetch_listings tests
    previously duplicated inline.
    """
    messages = []
    try:
        while True:
            messages.append(next(gen))
    except StopIteration as stop:
        return messages, stop.value


class TestScheduler:
    """Tests for the scraping scheduler and fetch_listings result reporting."""

    def test_scrape_jobs_with_retry_success(self):
        """Test that scrape_jobs_with_retry succeeds on first attempt."""
        with patch('web.craigslist.scraper') as mock_scrape:
            result = scrape_jobs_with_retry()
            assert result is True
            mock_scrape.assert_called_once()

    def test_scrape_jobs_with_retry_failure(self):
        """Test that scrape_jobs_with_retry handles failures properly."""
        with patch('web.craigslist.scraper',
                   side_effect=Exception("Test error")) as mock_scrape:
            result = scrape_jobs_with_retry(max_retries=2)
            assert result is False
            # Every attempt raised, so the scraper was invoked max_retries times.
            assert mock_scrape.call_count == 2

    def test_run_scheduled_scraping(self):
        """Test the scheduled scraping wrapper function."""
        with patch('web.craigslist.scrape_jobs_with_retry') as mock_retry:
            mock_retry.return_value = True
            run_scheduled_scraping()
            mock_retry.assert_called_once()

    def test_scheduler_import(self):
        """Test that scheduler functions can be imported."""
        from web.craigslist import start_scheduler
        assert callable(start_scheduler)

    @patch('web.craigslist.schedule')
    def test_scheduler_setup(self, mock_schedule):
        """Test that scheduler setup works correctly."""
        # This is a basic test to ensure the scheduler can be set up
        from web.craigslist import schedule
        assert schedule is not None

    @patch('web.craigslist.db_get_all_job_urls')
    @patch('web.craigslist.seed_regions_keywords_from_listings')
    @patch('web.craigslist.get_all_regions')
    @patch('web.craigslist.get_all_keywords')
    @patch('web.craigslist.get_last_fetch_time')
    @patch('web.craigslist.process_region_keyword')
    @patch('web.craigslist.upsert_listing')
    @patch('web.craigslist.insert_log')
    def test_fetch_listings_return_structure(self, mock_log, mock_upsert,
                                             mock_process, mock_last_fetch,
                                             mock_keywords, mock_regions,
                                             mock_seed, mock_db_urls):
        """Test that fetch_listings returns the correct structure with per-search counts."""
        # Setup mocks
        mock_db_urls.return_value = []
        mock_regions.return_value = [{"name": "sfbay"}]
        mock_keywords.return_value = [{"name": "python"}]
        mock_last_fetch.return_value = None  # Never fetched before
        mock_process.return_value = [
            ("2025-11-03T10:00:00Z", "sfbay", "python", "Python Dev", "$100k",
             "San Francisco", "http://example.com/1"),
            ("2025-11-03T10:00:00Z", "sfbay", "python", "Python Dev", "$100k",
             "San Francisco", "http://example.com/2"),
        ]

        # Collect messages and get return value from generator
        _messages, result = _drain_generator(fetch_listings())

        # Verify return structure
        assert result is not None
        assert "discovered" in result
        assert "new" in result
        assert "by_search" in result
        assert isinstance(result.get("by_search"), list)
        assert result.get("discovered") == 2
        assert result.get("new") == 2

    @patch('web.craigslist.db_get_all_job_urls')
    @patch('web.craigslist.seed_regions_keywords_from_listings')
    @patch('web.craigslist.get_all_regions')
    @patch('web.craigslist.get_all_keywords')
    @patch('web.craigslist.get_last_fetch_time')
    @patch('web.craigslist.process_region_keyword')
    @patch('web.craigslist.upsert_listing')
    @patch('web.craigslist.insert_log')
    def test_fetch_listings_per_search_count(self, mock_log, mock_upsert,
                                             mock_process, mock_last_fetch,
                                             mock_keywords, mock_regions,
                                             mock_seed, mock_db_urls):
        """Test that fetch_listings correctly counts jobs per search."""
        # Setup mocks
        mock_db_urls.return_value = []
        mock_regions.return_value = [{"name": "sfbay"}, {"name": "losangeles"}]
        mock_keywords.return_value = [{"name": "python"}, {"name": "java"}]
        mock_last_fetch.return_value = None  # Never fetched before

        # Mock process_region_keyword to return different counts for each search
        def mock_process_impl(region, keyword, discovered_urls):
            # Use unique URLs per search to get the total discovered count
            base_url = f"http://example.com/{region}/{keyword}"
            counts = {
                ("sfbay", "python"): 3,
                ("sfbay", "java"): 2,
                ("losangeles", "python"): 4,
                ("losangeles", "java"): 1,
            }
            count = counts.get((region, keyword), 0)
            return [("2025-11-03T10:00:00Z", region, keyword, f"Job {i}",
                     "$100k", region, f"{base_url}/{i}")
                    for i in range(count)]

        mock_process.side_effect = mock_process_impl

        # Collect result from generator
        _messages, result = _drain_generator(fetch_listings())

        # Verify per-search counts
        assert result is not None
        by_search = result.get("by_search", [])
        assert len(by_search) == 4
        search_data = {(r.get("region"), r.get("keyword")): r.get("count")
                       for r in by_search}
        assert search_data.get(("sfbay", "python")) == 3
        assert search_data.get(("sfbay", "java")) == 2
        assert search_data.get(("losangeles", "python")) == 4
        assert search_data.get(("losangeles", "java")) == 1
        assert result.get("discovered") == 10  # Total unique jobs