From 2dac771d478010b3ddc8aa4768f75046f7c0b490 Mon Sep 17 00:00:00 2001
From: "georg.sinn-schirwitz"
Date: Fri, 29 Aug 2025 16:31:10 +0200
Subject: [PATCH] simplify caching

Store cached_pages.file_path relative to the cache directory instead of
as absolute paths, so the database stays valid when the cache directory
moves between machines or deployments. Add a db_get_cached_abs_path()
helper to resolve stored paths back to absolute ones, repurpose
normalize_cached_page_paths() to migrate legacy absolute rows, and add
integration tests for both code paths.
---
 tests/test_cache_paths.py | 66 +++++++++++++++++++++++++++++++++++++++
 web/app.py                |  1 -
 web/db.py                 | 51 +++++++++++++++++++++---------
 web/utils.py              |  1 -
 4 files changed, 102 insertions(+), 17 deletions(-)
 create mode 100644 tests/test_cache_paths.py

diff --git a/tests/test_cache_paths.py b/tests/test_cache_paths.py
new file mode 100644
index 0000000..11595e6
--- /dev/null
+++ b/tests/test_cache_paths.py
@@ -0,0 +1,66 @@
+import os
+
+import pytest
+
+import web.db as db
+
+
+# Skip unless explicitly enabled (MySQL integration expected)
+if not os.getenv("RUN_DB_TESTS"):
+    pytest.skip("Set RUN_DB_TESTS=1 to run cache path integration tests",
+                allow_module_level=True)
+
+
+def test_db_sync_inserts_relative_paths(tmp_path, monkeypatch):
+    # arrange: create a temporary cache dir and a fake html file
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
+    f = cache_dir / "example.org_path_to_page_123.html"
+    f.write_text("ok")
+
+    # point the app at this cache dir
+    monkeypatch.setenv("PYTEST_CACHE_DIR", str(cache_dir))
+    # Patch get_cache_dir on web.db: the db functions call the reference
+    # bound into web.db's namespace at import time, so patching web.utils
+    # alone would have no effect on them.
+    monkeypatch.setattr(db, "get_cache_dir", lambda: str(cache_dir))
+
+    # ensure DB initialized
+    db.db_init()
+
+    # act
+    db.db_sync_cached_pages(str(cache_dir))
+
+    # assert: DB contains the relative path, not the absolute one
+    rows = db.db_get_all_cached_pages()
+    assert any(r["file_path"] == os.path.relpath(
+        str(f), start=str(cache_dir)) for r in rows)
+
+
+def test_normalize_cached_page_paths_converts_absolute(tmp_path, monkeypatch):
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
+    # create an actual file
+    f = cache_dir / "site_example_page_1.html"
+    f.write_text("ok")
+
+    monkeypatch.setattr(db, "get_cache_dir", lambda: str(cache_dir))
+    db.db_init()
+
+    abs_fp = str(f)
+    rel_fp = os.path.relpath(abs_fp, start=str(cache_dir))
+
+    # Insert an absolute-path row directly (simulate legacy data)
+    with db._ensure_session() as session:
+        session.execute(
+            db.text("INSERT INTO cached_pages"
+                    "(file_path, url_guess, last_modified, size_bytes, job_id) "
+                    "VALUES(:fp, :ug, :lm, :sz, :jid)"),
+            {"fp": abs_fp, "ug": "https://example.org/page1.html",
+             "lm": None, "sz": 10, "jid": None}
+        )
+        session.commit()
+
+    # normalize should convert absolute to relative
+    changed = db.normalize_cached_page_paths()
+    assert changed >= 1
+
+    rows = db.db_get_all_cached_pages()
+    assert any(r["file_path"] == rel_fp for r in rows)
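
A note on the monkeypatching in the tests above: web/db.py imports
get_cache_dir with a from-import, which copies the function reference into
web.db's own namespace at import time. A minimal sketch of that binding
behavior, assuming only the import structure visible in this patch:

    import web.db as db
    import web.utils

    # web/db.py did `from web.utils import get_cache_dir`, so:
    web.utils.get_cache_dir = lambda: "/tmp/patched"  # db still holds the old ref
    db.get_cache_dir = lambda: "/tmp/patched"         # this is the name db calls

This is why the tests patch the attribute on the db module rather than on
web.utils.
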
diff --git a/web/app.py b/web/app.py
index e69a789..c19dc6d 100644
--- a/web/app.py
+++ b/web/app.py
@@ -255,7 +255,6 @@ csrf.exempt(set_favorite)
 @app.route('/scrape', methods=['GET'])
 def scrape():
     """Trigger the web scraping process."""
-    # Run the full scraper orchestration (fetch listings, sync cache, process jobs)
     scraper()
     return jsonify({"status": "Scraping completed"})
 
diff --git a/web/db.py b/web/db.py
index 04f709f..8ecd8b8 100644
--- a/web/db.py
+++ b/web/db.py
@@ -24,6 +24,7 @@ from web.utils import (
     normalize_job_id,
     now_iso,
     get_cache_path,
+    get_cache_dir,
     get_mysql_config,
 )
 
@@ -280,12 +281,14 @@ def upsert_job_details(job_data: Dict[str, Any]):
 def upsert_cached_page(*, file_path: str, url_guess: Optional[str],
                        last_modified: Optional[str], size_bytes: Optional[int],
                        job_id: Optional[int]):
-    # Always store absolute paths
-    abs_fp = os.path.abspath(file_path)
+    # Store file paths relative to the cache directory (keeps DB portable).
+    # A relative input is assumed to be cache-relative already; running it
+    # through abspath() would resolve against the current working directory.
+    cache_dir = os.path.abspath(get_cache_dir())
+    rel_fp = (os.path.relpath(file_path, start=cache_dir)
+              if os.path.isabs(file_path) else file_path)
     with _ensure_session() as session:
-        obj = session.get(CachedPage, abs_fp)
+        obj = session.get(CachedPage, rel_fp)
         if obj is None:
-            obj = CachedPage(file_path=abs_fp)
+            obj = CachedPage(file_path=rel_fp)
             session.add(obj)
         setattr(obj, "url_guess", url_guess)
         setattr(obj, "last_modified", last_modified)
@@ -295,10 +298,12 @@ def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modifie
 
 
 def remove_cached_page(file_path: str):
-    # Accept either relative or absolute; remove both variants just in case
-    abs_fp = os.path.abspath(file_path)
+    # Accept absolute or cache-relative input; DB keys are stored relative
+    # to the cache directory.
+    cache_dir = os.path.abspath(get_cache_dir())
+    rel_fp = (os.path.relpath(file_path, start=cache_dir)
+              if os.path.isabs(file_path) else file_path)
     with _ensure_session() as session:
-        obj = session.get(CachedPage, abs_fp)
+        obj = session.get(CachedPage, rel_fp)
         if obj:
             session.delete(obj)
             session.commit()
@@ -306,7 +311,8 @@ def remove_cached_page(file_path: str):
 
 def db_remove_cached_url(url: str):
     """Remove a cached page by URL."""
-    abs_fp = get_cache_path(url)
+    # Compute the absolute path for the URL and delegate to remove_cached_page
+    abs_fp = os.path.abspath(get_cache_path(url))
     try:
         remove_cached_page(abs_fp)
     except Exception:
@@ -353,15 +359,17 @@ def db_sync_cached_pages(cache_dir: str):
     """Scan cache_dir and upsert page metadata into cached_pages table."""
     if not os.path.isdir(cache_dir):
         return
-    db_cache = db_get_all_cached_pages()
-    for root, _, files in os.walk(cache_dir):
+    abs_cache = os.path.abspath(cache_dir)
+    # read existing DB keys once for quick membership tests
+    db_cache_paths = {c["file_path"] for c in db_get_all_cached_pages()}
+    for root, _, files in os.walk(abs_cache):
         for name in files:
             if not name.lower().endswith(".html"):
                 continue
             fp = os.path.abspath(os.path.join(root, name))
-            if fp in [c["file_path"] for c in db_cache]:
+            rel_fp = os.path.relpath(fp, start=abs_cache)
+            if rel_fp in db_cache_paths:
                 continue
-
             try:
                 stat = os.stat(fp)
                 mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
@@ -371,22 +379,23 @@ def db_sync_cached_pages(cache_dir: str):
                 size = None
             url_guess = get_url_from_filename(name)
             job_id = url_to_job_id(url_guess)
-            upsert_cached_page(file_path=fp, url_guess=url_guess,
+            upsert_cached_page(file_path=rel_fp, url_guess=url_guess,
                                last_modified=mtime, size_bytes=size, job_id=job_id)
 
 
 def normalize_cached_page_paths() -> int:
-    """Ensure all cached_pages.file_path values are absolute.
+    """Convert legacy absolute cached_pages.file_path values to paths
+    relative to the cache directory.
 
     Returns number of rows updated/normalized."""
     changed = 0
+    abs_cache = os.path.abspath(get_cache_dir())
     with _ensure_session() as session:
         rows = session.execute(text(
             "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
         for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
-            if not os.path.isabs(fp):
-                abs_fp = os.path.abspath(fp)
-                # Upsert under absolute path, then remove the relative entry
+            if os.path.isabs(fp):
+                rel_fp = os.path.relpath(fp, start=abs_cache)
                 upsert_cached_page(
-                    file_path=abs_fp,
+                    file_path=rel_fp,
                     url_guess=url_guess,
                     last_modified=last_modified,
                     size_bytes=size_bytes,
@@ -454,7 +463,9 @@ ORDER BY d.posted_time DESC
             "timestamp": row[7],
             "posted_time": row[8],
             "url": row[9],
+            # file_path is stored relative to the cache dir; provide both forms
             "file_path": row[10],
+            "file_path_abs": db_get_cached_abs_path(row[10]),
             "last_modified": row[11],
             "url_guess": row[12],
             "url_guess_stale": row[13],
@@ -463,6 +474,16 @@ ORDER BY d.posted_time DESC
     return jobs
 
 
+def db_get_cached_abs_path(db_file_path: Optional[str]) -> Optional[str]:
+    """Return the absolute cache file path for a DB-stored (relative) file_path.
+
+    Returns None if the input is falsy; a legacy absolute input passes
+    through unchanged (os.path.join discards the base in that case).
+    """
+    if not db_file_path:
+        return None
+    return os.path.join(os.path.abspath(get_cache_dir()), db_file_path)
+
+
 def db_get_all_job_urls() -> List[str]:
     """Return list of job URLs from job_listings."""
     with _ensure_session() as session:
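
The portability the db.py changes buy, shown in isolation. This is a
minimal sketch of the path arithmetic only (no database involved), with
hypothetical POSIX directory values:

    import os

    cache_dir = "/var/app/cache"  # hypothetical original location

    # storing: upsert_cached_page() relativizes absolute paths into DB keys
    abs_fp = "/var/app/cache/example.org_page_1.html"
    rel_fp = os.path.relpath(abs_fp, start=cache_dir)
    assert rel_fp == "example.org_page_1.html"

    # resolving: db_get_cached_abs_path() joins the stored key onto whatever
    # the cache dir is *now*, so the DB survives a relocated cache
    cache_dir = "/srv/new-home/cache"
    assert os.path.join(cache_dir, rel_fp) == \
        "/srv/new-home/cache/example.org_page_1.html"
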
diff --git a/web/utils.py b/web/utils.py
index 1371d64..0eeea63 100644
--- a/web/utils.py
+++ b/web/utils.py
@@ -203,7 +203,6 @@ def get_cache_path(url: str) -> str:
 def cache_page(url: str, content: str):
     """Cache the page content with a timestamp."""
     cache_path = get_cache_path(url)
-    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
         f.write(content)
     # Update the file's modification time to the current time
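
For existing deployments, legacy absolute rows need one normalization pass
after this change lands. A sketch of that one-time migration, using only
functions from this patch (the script itself is hypothetical):

    # migrate_cache_paths.py -- run once after deploying this change
    import web.db as db
    from web.utils import get_cache_dir

    db.db_init()                                # ensure tables exist
    changed = db.normalize_cached_page_paths()  # rewrite absolute rows as relative
    db.db_sync_cached_pages(get_cache_dir())    # register files not yet in the DB
    print(f"normalized {changed} legacy rows")

After the pass, upsert_cached_page() writes every new row relative to the
cache directory, so the script never needs to run again.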