From 042a19671870b0b96b884aac69082e491cc188ef Mon Sep 17 00:00:00 2001 From: "georg.sinn-schirwitz" Date: Mon, 8 Sep 2025 14:44:46 +0200 Subject: [PATCH] remove caching --- setup.py | 2 +- tests/test_cache_paths.py | 66 ----------- tests/test_cached_route.py | 53 --------- tests/test_cachedpage_abs_path.py | 27 ----- tests/test_db_integration.py | 33 ------ web/app.py | 54 +++------ web/craigslist.py | 102 ++++++++-------- web/db.py | 187 ------------------------------ web/scraper.py | 15 +-- web/templates/base.html | 1 + web/templates/job.html | 8 -- web/templates/scrape.html | 67 +++++++++++ web/utils.py | 54 ++------- 13 files changed, 144 insertions(+), 525 deletions(-) delete mode 100644 tests/test_cache_paths.py delete mode 100644 tests/test_cached_route.py delete mode 100644 tests/test_cachedpage_abs_path.py create mode 100644 web/templates/scrape.html diff --git a/setup.py b/setup.py index 371cffc..11dc996 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -MySQL utilities for Craigslist project. +MySQL utility script for initializing database and showing row counts. Usage (PowerShell): # Ensure MySQL database and tables diff --git a/tests/test_cache_paths.py b/tests/test_cache_paths.py deleted file mode 100644 index 11595e6..0000000 --- a/tests/test_cache_paths.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import tempfile -import pytest - -import web.db as db -from web.utils import get_cache_dir - - -# Skip unless explicitly enabled (MySQL integration expected) -if not os.getenv("RUN_DB_TESTS"): - pytest.skip("Set RUN_DB_TESTS=1 to run cache path integration tests", - allow_module_level=True) - - -def test_db_sync_inserts_relative_paths(tmp_path, monkeypatch): - # arrange: create a temporary cache dir and a fake html file - cache_dir = tmp_path / "cache" - cache_dir.mkdir() - f = cache_dir / "example.org_path_to_page_123.html" - f.write_text("ok") - - # point app at this cache dir - monkeypatch.setenv("PYTEST_CACHE_DIR", str(cache_dir)) - # monkeypatch get_cache_dir used by db functions - monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir)) - - # ensure DB initialized - db.db_init() - - # act - db.db_sync_cached_pages(str(cache_dir)) - - # assert: DB contains relative path, not absolute - rows = db.db_get_all_cached_pages() - assert any(r['file_path'] == os.path.relpath( - str(f), start=str(cache_dir)) for r in rows) - - -def test_normalize_cached_page_paths_converts_absolute(tmp_path, monkeypatch): - cache_dir = tmp_path / "cache" - cache_dir.mkdir() - # create an actual file - f = cache_dir / "site_example_page_1.html" - f.write_text("ok") - - monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir)) - db.db_init() - - abs_fp = str(f) - rel_fp = os.path.relpath(abs_fp, start=str(cache_dir)) - - # Insert an absolute path row directly (simulate legacy data) - with db._ensure_session() as session: - session.execute( - db.text("INSERT INTO cached_pages(file_path, url_guess, last_modified, size_bytes, job_id) VALUES(:fp, :ug, :lm, :sz, :jid)"), - {"fp": abs_fp, "ug": "https://example.org/page1.html", - "lm": None, "sz": 10, "jid": None} - ) - session.commit() - - # normalize should convert absolute to relative - changed = db.normalize_cached_page_paths() - assert changed >= 1 - - rows = db.db_get_all_cached_pages() - assert any(r['file_path'] == rel_fp for r in rows) diff --git a/tests/test_cached_route.py b/tests/test_cached_route.py deleted file mode 100644 index bef3aaf..0000000 --- a/tests/test_cached_route.py +++ 
/dev/null @@ -1,53 +0,0 @@ -import os -import tempfile -from web.app import app - - -def test_cached_route_serves_file(monkeypatch): - # Create a temporary file in the configured cache dir - cache_dir = os.path.abspath(os.path.join( - os.path.dirname(__file__), '..', 'cache')) - os.makedirs(cache_dir, exist_ok=True) - fd, tmp_path = tempfile.mkstemp( - prefix='test_cached_', suffix='.html', dir=cache_dir) - os.close(fd) - with open(tmp_path, 'w', encoding='utf-8') as f: - f.write('cached') - - # Fake job record returned by get_job_by_id - fake_job = { - 'id': 'fake123', - 'job_id': 'fake123', - 'file_path': os.path.relpath(tmp_path, cache_dir), - 'file_path_abs': tmp_path, - } - - def fake_get_job_by_id(jid): - if str(jid) in ('fake123',): - return fake_job - return {} - - # Patch the symbol imported into web.app - monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id) - - # Request route - client = app.test_client() - res = client.get('/cached/fake123') - assert res.status_code == 200 - assert b'cached' in res.data - - # Cleanup - try: - os.remove(tmp_path) - except Exception: - pass - - -def test_cached_route_missing(monkeypatch): - def fake_get_job_by_id(jid): - return {} - - monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id) - client = app.test_client() - res = client.get('/cached/nope') - assert res.status_code == 404 diff --git a/tests/test_cachedpage_abs_path.py b/tests/test_cachedpage_abs_path.py deleted file mode 100644 index 021ac1c..0000000 --- a/tests/test_cachedpage_abs_path.py +++ /dev/null @@ -1,27 +0,0 @@ -import os -from web.db import CachedPage -from web.utils import get_cache_dir - - -def test_cachedpage_abs_path(tmp_path, monkeypatch): - # Create a fake cache dir and monkeypatch get_cache_dir - fake_cache = tmp_path / 'cache' - fake_cache.mkdir() - monkeypatch.setenv('PYTHONIOENCODING', 'utf-8') - - # Patch the symbol used by CachedPage.abs_path (imported into web.db) - monkeypatch.setattr('web.db.get_cache_dir', lambda: str(fake_cache)) - - # Create a CachedPage instance and set file_path attribute - cp = CachedPage() - setattr(cp, 'file_path', 'subdir/test.html') - - # Ensure the computed absolute path joins the fake cache dir - expected = os.path.join(os.path.abspath( - str(fake_cache)), 'subdir/test.html') - assert cp.abs_path == expected - - # When file_path is falsy, abs_path should be None - cp2 = CachedPage() - setattr(cp2, 'file_path', None) - assert cp2.abs_path is None diff --git a/tests/test_db_integration.py b/tests/test_db_integration.py index 738ff05..010a99f 100644 --- a/tests/test_db_integration.py +++ b/tests/test_db_integration.py @@ -95,39 +95,6 @@ def test_upsert_listing_details_and_urls(db_ready): pass -def test_cached_page_upsert_and_get(db_ready): - jid_suffix = unique_suffix() - url = f"https://example.org/it/{jid_suffix}.html" - # Ensure a listing exists for FK relation if enforced - db.upsert_listing( - url=url, - region="it", - keyword="cache", - title=f"IT Cache {jid_suffix}", - pay="N/A", - location="Test City", - timestamp=now_iso(), - ) - fp = f"/tmp/integration_{jid_suffix}.html" - db.upsert_cached_page( - file_path=fp, - url_guess=url, - last_modified=now_iso(), - size_bytes=123, - job_id=int(jid_suffix) if jid_suffix.isdigit() else None, - ) - row = db.db_get_cache_url(url) - if row is not None: - assert row["url_guess"] == url - # Cleanup - try: - db.remove_cached_page(fp) - db.db_remove_cached_url(url) - db.db_delete_job(jid_suffix) - except Exception: - pass - - def 
test_user_interactions_mark_and_visit(db_ready): uname = f"it_user_{unique_suffix()}" db.create_or_update_user(uname, is_active=True) diff --git a/web/app.py b/web/app.py index 154ed61..1312dd9 100644 --- a/web/app.py +++ b/web/app.py @@ -1,5 +1,5 @@ import os -from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, send_file +from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, Response from flask_wtf import CSRFProtect from typing import Dict, List @@ -34,7 +34,6 @@ from web.utils import ( initialize_users_from_settings, filter_jobs, get_job_by_id, - get_cache_dir, ) from web.db import get_all_regions, get_all_keywords @@ -232,39 +231,6 @@ def job_by_id(job_id): return jsonify({"error": "Job not found"}), 404 -@app.route('/cached/', methods=['GET']) -def serve_cached(job_id): - """Serve the cached HTML file for a job if available. - - Uses the job record's `file_path_abs` when present, or resolves the DB `file_path` via helper. - Ensures the returned file is located under the configured cache directory to avoid path-traversal. - """ - try: - from web.db import db_get_cached_abs_path - j = get_job_by_id(job_id) - if not j: - return "Job not found", 404 - - # Prefer file_path_abs, fall back to resolving the DB-stored file_path - abs_fp = j.get('file_path_abs') or None - if not abs_fp: - db_fp = j.get('file_path') - abs_fp = db_get_cached_abs_path(db_fp) if db_fp else None - - if not abs_fp or not os.path.isfile(abs_fp): - return "Cached file not available", 404 - - cache_dir = os.path.abspath(get_cache_dir()) - abs_fp = os.path.abspath(abs_fp) - # Ensure the file is inside the cache directory - if os.path.commonpath([cache_dir, abs_fp]) != cache_dir: - return "Forbidden", 403 - - return send_file(abs_fp) - except Exception: - return "Error serving cached file", 500 - - @app.route('/jobs//favorite', methods=['POST']) def set_favorite(job_id): """Mark or unmark a job as favorite for a given user. 
@@ -290,9 +256,21 @@ csrf.exempt(set_favorite) @app.route('/scrape', methods=['GET']) def scrape(): - """Trigger the web scraping process.""" - scraper() - return jsonify({"status": "Scraping completed"}) + """Trigger the web scraping process with streaming output.""" + def generate(): + try: + for message in scraper(): + yield message + except Exception as e: + yield f"Error during scraping: {str(e)}\n" + + return Response(generate(), mimetype='text/plain') + + +@app.route('/scrape-page', methods=['GET']) +def scrape_page(): + """Serve the scrape page with streaming output display.""" + return render_template('scrape.html', title='Scrape Jobs') # ---------------- Auth & Admin UI ------------------------------------------ diff --git a/web/craigslist.py b/web/craigslist.py index 7446366..29b0e44 100644 --- a/web/craigslist.py +++ b/web/craigslist.py @@ -2,31 +2,19 @@ from datetime import datetime, timezone from web.scraper import process_region_keyword, scrape_job_page from web.db import ( db_init, - upsert_cached_page, upsert_listing, upsert_job_details, url_to_job_id, upsert_user_interaction, - db_remove_cached_url, - db_sync_cached_pages, db_get_all_job_urls, - db_get_cache_url, db_delete_job, remove_job, - normalize_cached_page_paths, ) # Import utility functions from web.utils import ( - get_cache_dir, make_request_with_retry, now_iso, - get_cache_path, - cache_page, - is_cache_stale, - delete_cached_page, - get_cached_content, - ensure_cache_dir ) from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings @@ -45,15 +33,26 @@ def fetch_listings(): except Exception: pass + yield "Initializing database and seeding regions/keywords...\n" + # Fetch listings for each region/keyword from DB - for region in get_all_regions(): + regions = get_all_regions() + keywords = get_all_keywords() + total_combinations = len(regions) * len(keywords) + processed = 0 + + yield f"Found {len(regions)} regions and {len(keywords)} keywords. Processing {total_combinations} combinations...\n" + + for region in regions: region_name = region.get("name") if not region_name: continue - for keyword in get_all_keywords(): + for keyword in keywords: keyword_name = keyword.get("name") if not keyword_name: continue + processed += 1 + yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n" for row in process_region_keyword(region_name, keyword_name, discovered_urls): timestamp, region, keyword, title, pay, location, url = row discovered_urls.add(url) @@ -72,75 +71,68 @@ def fetch_listings(): # Remove stale listings: those present in DB but not discovered now. 
stale_urls = existing_db_urls - discovered_urls - for url in stale_urls: - try: - jid = url_to_job_id(url) - db_delete_job(jid) - # Also try to remove cached file and its metadata - delete_cached_page(url) - db_remove_cached_url(url) - except Exception: - pass + if stale_urls: + yield f"Removing {len(stale_urls)} stale listings...\n" + for url in stale_urls: + try: + jid = url_to_job_id(url) + db_delete_job(jid) + except Exception: + pass + yield f"Listing fetch complete: {len(discovered_urls)} discovered, {len(new_rows)} new, {len(stale_urls)} stale\n" return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)} def process_job_url(job_url: str): try: job_id = url_to_job_id(job_url) - content = None - cached_page = db_get_cache_url(job_url) - if cached_page: - last_modified = cached_page.get("last_modified") - if last_modified and not is_cache_stale(last_modified): - content = get_cached_content(job_url) - else: - content = make_request_with_retry(job_url, 1) - else: - content = make_request_with_retry(job_url, 1) + yield f"Fetching job page: {job_url}\n" + content = make_request_with_retry(job_url, 1) if content is None: + yield f"Failed to fetch content for {job_url}, removing from database\n" remove_job(job_url) return None - # refresh cache and details - cache_page(job_url, content) - upsert_cached_page( - file_path=get_cache_path(job_url), - url_guess=job_url, - last_modified=now_iso(), - size_bytes=len(content), - job_id=job_id - ) + yield f"Scraping job data from {job_url}\n" job_data = scrape_job_page(content, job_url) if job_data: + yield f"Upserting job details for {job_id}\n" upsert_job_details(job_data) upsert_user_interaction( job_id, seen_at=datetime.now(timezone.utc).isoformat()) + yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n" return job_data + else: + yield f"Failed to scrape job data from {job_url}\n" return None - except Exception: + except Exception as e: + yield f"Error processing {job_url}: {str(e)}\n" return None def scraper(): """Main function to run the scraper.""" - ensure_cache_dir() + yield "Starting scraper...\n" db_init() + yield "Database initialized\n" + # First, fetch current listings from search pages and make DB reflect them. - jl = fetch_listings() - - # Sync any cached files we have on disk into the cached_pages table. 
- db_sync_cached_pages(get_cache_dir()) - - # Normalize any relative cached file paths to absolute paths in DB - normalized = normalize_cached_page_paths() - if normalized: - pass + yield "Fetching listings...\n" + for message in fetch_listings(): + yield message # Finally, fetch and refresh individual job pages for current listings - for url in db_get_all_job_urls(): - process_job_url(url) + job_urls = db_get_all_job_urls() + yield f"Processing {len(job_urls)} job pages...\n" + + for i, url in enumerate(job_urls, 1): + yield f"\n--- Processing job {i}/{len(job_urls)} ---\n" + for message in process_job_url(url): + yield message + + yield "\nScraping completed successfully!\n" if __name__ == "__main__": diff --git a/web/db.py b/web/db.py index e8a75d0..0b39058 100644 --- a/web/db.py +++ b/web/db.py @@ -4,7 +4,6 @@ from __future__ import annotations Tables: - users(user_id PK, username UNIQUE, created_at) - - cached_pages(file_path PK, url_guess, last_modified, size_bytes, job_id) - job_listings(job_id PK, url UNIQUE, region, keyword, title, pay, location, timestamp) - job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url) - user_interactions(job_id PK FK -> job_listings, user_id FK -> users, seen_at, url_visited, is_user_favorite) @@ -23,8 +22,6 @@ from web.utils import ( url_to_job_id, normalize_job_id, now_iso, - get_cache_path, - get_cache_dir, get_mysql_config, ) @@ -86,8 +83,6 @@ class JobListing(Base): description = relationship( "JobDescription", back_populates="listing", uselist=False, cascade="all, delete-orphan") - cached_pages = relationship( - "CachedPage", back_populates="listing", cascade="all, delete-orphan") interactions = relationship( "UserInteraction", back_populates="listing", cascade="all, delete-orphan") @@ -106,30 +101,6 @@ class JobDescription(Base): listing = relationship("JobListing", back_populates="description") -class CachedPage(Base): - __tablename__ = "cached_pages" - file_path = Column(String(FILE_PATH_LEN), primary_key=True) - url_guess = Column(String(URL_LEN)) - last_modified = Column(String(TIME_LEN)) - size_bytes = Column(Integer) - job_id = Column(String(JOB_ID_LEN), ForeignKey( - "job_listings.job_id", ondelete="CASCADE")) - - listing = relationship("JobListing", back_populates="cached_pages") - - @property - def abs_path(self) -> Optional[str]: - """Return the absolute filesystem path for this cached page. - - The DB stores `file_path` relative to the configured cache dir. This - helper centralizes resolution so callers can use `cached_page.abs_path`. 
- """ - fp = getattr(self, 'file_path', None) - if not fp: - return None - return os.path.join(os.path.abspath(get_cache_dir()), fp) - - class UserInteraction(Base): __tablename__ = "user_interactions" # composite uniqueness on (user_id, job_id) @@ -292,139 +263,6 @@ def upsert_job_details(job_data: Dict[str, Any]): session.commit() -def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modified: Optional[str], size_bytes: Optional[int], job_id: Optional[int]): - # Store file paths relative to the cache directory (keeps DB portable) - cache_dir = os.path.abspath(get_cache_dir()) - abs_fp = os.path.abspath(file_path) - rel_fp = os.path.relpath(abs_fp, start=cache_dir) - with _ensure_session() as session: - obj = session.get(CachedPage, rel_fp) - if obj is None: - obj = CachedPage(file_path=rel_fp) - session.add(obj) - setattr(obj, "url_guess", url_guess) - setattr(obj, "last_modified", last_modified) - setattr(obj, "size_bytes", size_bytes) - setattr(obj, "job_id", str(job_id) if job_id else None) - session.commit() - - -def remove_cached_page(file_path: str): - # Accept absolute or relative input but DB keys are stored relative to cache dir - cache_dir = os.path.abspath(get_cache_dir()) - abs_fp = os.path.abspath(file_path) - rel_fp = os.path.relpath(abs_fp, start=cache_dir) - with _ensure_session() as session: - obj = session.get(CachedPage, rel_fp) - if obj: - session.delete(obj) - session.commit() - - -def db_remove_cached_url(url: str): - """Remove a cached page by URL.""" - # Compute absolute path for the URL and delegate to remove_cached_page - abs_fp = os.path.abspath(get_cache_path(url)) - try: - remove_cached_page(abs_fp) - except Exception: - pass - - -def db_get_all_cached_pages() -> List[Dict[str, Any]]: - with _ensure_session() as session: - rows = session.execute(text( - "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall() - out = [] - for row in rows: - fp = row[0] - out.append({ - "file_path": fp, - "file_path_abs": db_get_cached_abs_path(fp) if fp else None, - "url_guess": row[1], - "last_modified": row[2], - "size_bytes": row[3], - "job_id": row[4], - }) - return out - - -def db_get_cache_url(url: str): - """Return the data for a specific URL from cached_pages. - - Arguments: - url -- The URL to look up in the cache. 
- """ - with _ensure_session() as session: - row = session.execute(text( - "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"), {"u": url}).fetchone() - if not row: - return None - fp = row[0] - return { - "file_path": fp, - "file_path_abs": db_get_cached_abs_path(fp) if fp else None, - "url_guess": row[1], - "last_modified": row[2], - "size_bytes": row[3], - "job_id": row[4], - } - - -def db_sync_cached_pages(cache_dir: str): - """Scan cache_dir and upsert page metadata into cached_pages table.""" - if not os.path.isdir(cache_dir): - return - abs_cache = os.path.abspath(cache_dir) - # read existing DB keys once for quick membership tests - db_cache_paths = {c["file_path"] for c in db_get_all_cached_pages()} - for root, _, files in os.walk(abs_cache): - for name in files: - if not name.lower().endswith(".html"): - continue - fp = os.path.abspath(os.path.join(root, name)) - rel_fp = os.path.relpath(fp, start=abs_cache) - if rel_fp in db_cache_paths: - continue - try: - stat = os.stat(fp) - mtime = datetime.fromtimestamp(stat.st_mtime).isoformat() - size = stat.st_size - except OSError: - mtime = None - size = None - url_guess = get_url_from_filename(name) - job_id = url_to_job_id(url_guess) - upsert_cached_page(file_path=rel_fp, url_guess=url_guess, - last_modified=mtime, size_bytes=size, job_id=job_id) - - -def normalize_cached_page_paths() -> int: - """Ensure all cached_pages.file_path values are absolute. Returns number of rows updated/normalized.""" - # Convert any absolute paths in DB to relative paths (relative to cache dir) - changed = 0 - abs_cache = os.path.abspath(get_cache_dir()) - with _ensure_session() as session: - rows = session.execute(text( - "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall() - for (fp, url_guess, last_modified, size_bytes, job_id) in rows: - if os.path.isabs(fp): - rel_fp = os.path.relpath(fp, start=abs_cache) - upsert_cached_page( - file_path=rel_fp, - url_guess=url_guess, - last_modified=last_modified, - size_bytes=size_bytes, - job_id=job_id, - ) - with _ensure_session() as session: - session.execute( - text("DELETE FROM cached_pages WHERE file_path = :fp"), {"fp": fp}) - session.commit() - changed += 1 - return changed - - def db_get_keywords() -> List[str]: """Return a list of all unique keywords from job listings.""" with _ensure_session() as session: @@ -453,15 +291,10 @@ SELECT l.job_id ,l.timestamp ,d.posted_time ,l.url -,c.file_path -,c.last_modified -,c.url_guess -,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale FROM job_listings AS l INNER JOIN job_descriptions AS d ON l.job_id = d.job_id AND l.url = d.url -LEFT JOIN cached_pages AS c ON l.job_id = c.job_id ORDER BY d.posted_time DESC """ with _ensure_session() as session: @@ -479,27 +312,11 @@ ORDER BY d.posted_time DESC "timestamp": row[7], "posted_time": row[8], "url": row[9], - # file_path is stored relative to cache dir; provide both forms - "file_path": row[10], - "file_path_abs": os.path.join(os.path.abspath(get_cache_dir()), row[10]) if row[10] else None, - "last_modified": row[11], - "url_guess": row[12], - "url_guess_stale": row[13], } jobs.append(job) return jobs -def db_get_cached_abs_path(db_file_path: Optional[str]) -> Optional[str]: - """Return absolute cache file path given a DB-stored (relative) file_path. - - Returns None if input is falsy. 
- """ - if not db_file_path: - return None - return os.path.join(os.path.abspath(get_cache_dir()), db_file_path) - - def db_get_all_job_urls() -> List[str]: """Return list of job URLs from job_listings.""" with _ensure_session() as session: @@ -522,10 +339,6 @@ def remove_job(url): try: jid = url_to_job_id(url) db_delete_job(jid) - cache_fp = get_cache_path(url) - remove_cached_page(os.path.abspath(cache_fp)) - if os.path.exists(cache_fp): - os.remove(cache_fp) except Exception: pass diff --git a/web/scraper.py b/web/scraper.py index f4d89b4..7adfbcc 100644 --- a/web/scraper.py +++ b/web/scraper.py @@ -1,7 +1,7 @@ from datetime import datetime, UTC from bs4 import BeautifulSoup from typing import List, Dict, Set -from web.utils import get_base_url, cache_page, safe_get_text, safe_get_attr, is_cached, get_cached_content, make_request_with_retry +from web.utils import get_base_url, safe_get_text, safe_get_attr, make_request_with_retry def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List: @@ -108,14 +108,7 @@ def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str] def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]: """Process a single region and keyword.""" url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+")) - if is_cached(url): - content = get_cached_content(url) - cache_status = "CACHED" - else: - content = make_request_with_retry(url, 3) - if content is None: - return [] - cache_page(url, content) - cache_status = "FETCHED" - _ = cache_status # no-op to silence unused var + content = make_request_with_retry(url, 3) + if content is None: + return [] return scrape_job_data(content, region, keyword, seen_urls) diff --git a/web/templates/base.html b/web/templates/base.html index ef60b6b..75b2d87 100644 --- a/web/templates/base.html +++ b/web/templates/base.html @@ -20,6 +20,7 @@ Home | Preferences {% if current_user and current_user.is_admin %} | + Scrape Jobs | Taxonomy | Users {% endif %} {% if session.get('username') %} | diff --git a/web/templates/job.html b/web/templates/job.html index bf337a4..85948df 100644 --- a/web/templates/job.html +++ b/web/templates/job.html @@ -23,13 +23,5 @@ styles %}{% endblock %} {% block content %} >{{ job.title }}

-    {% if job.file_path_abs or job.file_path %}
-      Cached copy:
-      View cached copy
-    {% endif %}
 {% endblock %}
diff --git a/web/templates/scrape.html b/web/templates/scrape.html
new file mode 100644
index 0000000..0533405
--- /dev/null
+++ b/web/templates/scrape.html
@@ -0,0 +1,67 @@
+{% extends "base.html" %} {% block title %}Scrape Jobs{% endblock %} {% block
+content %}
+  Job Scraping Progress
+{% endblock %} {% block scripts %} + +{% endblock %} diff --git a/web/utils.py b/web/utils.py index 4645d4d..f58dcf0 100644 --- a/web/utils.py +++ b/web/utils.py @@ -93,8 +93,7 @@ def get_paths() -> dict: return get_config().get('paths', {}) -def get_cache_dir() -> str: - return get_paths().get('cache_dir', 'cache') + def get_logs_dir() -> str: @@ -129,9 +128,7 @@ def get_base_url() -> str: return get_config().get('scraper', {}).get('base_url', "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel") -def ensure_cache_dir(): - """Ensure cache directory exists.""" - os.makedirs(get_cache_dir(), exist_ok=True) + def now_iso() -> str: @@ -173,10 +170,7 @@ def get_url_from_filename(name: str) -> str: return url_guess -def get_cached_content(url: str) -> str: - """Get cached content for URL.""" - with open(get_cache_path(url), "r", encoding="utf-8") as f: - return f.read() + def safe_get_text(element, default="N/A"): @@ -194,51 +188,19 @@ def get_random_delay(min_delay: int = get_min_delay(), max_delay: int = get_max_ return random.uniform(min_delay, max_delay) -def get_cache_path(url: str) -> str: - """Get cache file path for URL.""" - return os.path.join(get_cache_dir(), f"{get_filename_from_url(url)}.html") -def cache_page(url: str, content: str): - """Cache the page content with a timestamp.""" - cache_path = get_cache_path(url) - with open(cache_path, "w", encoding="utf-8") as f: - f.write(content) - # Update the file's modification time to the current time - os.utime(cache_path, None) -def is_cached(url: str) -> bool: - """Check if the page is cached and not older than 24 hours.""" - cache_path = get_cache_path(url) - if not os.path.isfile(cache_path): - return False - - # Check the file's age if it's a search result page - if 'search' in url: - file_age = time.time() - os.path.getmtime(cache_path) - if file_age > 24 * 3600: # 24 hours in seconds - return False - - return True -def is_cache_stale(last_modified: str, days: int = 1) -> bool: - """Check if the cached page is stale (older than 24 hours).""" - if not last_modified: - return True - last_datetime = datetime.fromisoformat(last_modified) - file_age = time.time() - last_datetime.timestamp() - return file_age > days * 24 * 3600 # days in seconds -def delete_cached_page(url: str): - cache_fp = get_cache_path(url) - if os.path.exists(cache_fp): - try: - os.remove(cache_fp) - except Exception: - pass + + + + + def get_color_from_string(s: str) -> str:
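
Note on the new streaming /scrape endpoint: a minimal consumer sketch, not part of this patch. It assumes the Flask app from web.app is running locally; the host, port, and helper name are illustrative only.

# consume_scrape.py - minimal sketch, assumes the app is served at localhost:5000.
import requests

def follow_scrape(base_url: str = "http://localhost:5000") -> None:
    # stream=True keeps the connection open so each line yielded by the
    # scraper generator is printed as it arrives, not after completion.
    with requests.get(f"{base_url}/scrape", stream=True, timeout=None) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if line:
                print(line)

if __name__ == "__main__":
    follow_scrape()

The scrape.html page added by this patch plays the same role in the browser, reading the text/plain response incrementally and appending it to the progress output.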