From 2dac771d478010b3ddc8aa4768f75046f7c0b490 Mon Sep 17 00:00:00 2001
From: "georg.sinn-schirwitz"
Date: Fri, 29 Aug 2025 16:31:10 +0200
Subject: [PATCH] simplify caching

Store cached_pages.file_path relative to the cache directory instead of
as absolute paths, so the database stays valid when the cache directory
moves between machines or deployments. Add a db_get_cached_abs_path()
helper to resolve stored paths back to absolute ones, repurpose
normalize_cached_page_paths() to migrate legacy absolute rows, and add
integration tests for both code paths.
---
 tests/test_cache_paths.py | 66 +++++++++++++++++++++++++++++++++++++++
 web/app.py                |  1 -
 web/db.py                 | 51 +++++++++++++++++++++---------
 web/utils.py              |  1 -
 4 files changed, 102 insertions(+), 17 deletions(-)
 create mode 100644 tests/test_cache_paths.py

diff --git a/tests/test_cache_paths.py b/tests/test_cache_paths.py
new file mode 100644
index 0000000..11595e6
--- /dev/null
+++ b/tests/test_cache_paths.py
@@ -0,0 +1,66 @@
+import os
+
+import pytest
+
+import web.db as db
+
+
+# Skip unless explicitly enabled (MySQL integration expected)
+if not os.getenv("RUN_DB_TESTS"):
+    pytest.skip("Set RUN_DB_TESTS=1 to run cache path integration tests",
+                allow_module_level=True)
+
+
+def test_db_sync_inserts_relative_paths(tmp_path, monkeypatch):
+    # arrange: create a temporary cache dir and a fake html file
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
+    f = cache_dir / "example.org_path_to_page_123.html"
+    f.write_text("ok")
+
+    # point the app at this cache dir
+    monkeypatch.setenv("PYTEST_CACHE_DIR", str(cache_dir))
+    # Patch get_cache_dir on web.db: the db functions call the reference
+    # bound into web.db's namespace at import time, so patching web.utils
+    # alone would have no effect on them.
+    monkeypatch.setattr(db, "get_cache_dir", lambda: str(cache_dir))
+
+    # ensure DB initialized
+    db.db_init()
+
+    # act
+    db.db_sync_cached_pages(str(cache_dir))
+
+    # assert: DB contains the relative path, not the absolute one
+    rows = db.db_get_all_cached_pages()
+    assert any(r["file_path"] == os.path.relpath(
+        str(f), start=str(cache_dir)) for r in rows)
+
+
+def test_normalize_cached_page_paths_converts_absolute(tmp_path, monkeypatch):
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
+    # create an actual file
+    f = cache_dir / "site_example_page_1.html"
+    f.write_text("ok")
+
+    monkeypatch.setattr(db, "get_cache_dir", lambda: str(cache_dir))
+    db.db_init()
+
+    abs_fp = str(f)
+    rel_fp = os.path.relpath(abs_fp, start=str(cache_dir))
+
+    # Insert an absolute-path row directly (simulate legacy data)
+    with db._ensure_session() as session:
+        session.execute(
+            db.text("INSERT INTO cached_pages"
+                    "(file_path, url_guess, last_modified, size_bytes, job_id) "
+                    "VALUES(:fp, :ug, :lm, :sz, :jid)"),
+            {"fp": abs_fp, "ug": "https://example.org/page1.html",
+             "lm": None, "sz": 10, "jid": None}
+        )
+        session.commit()
+
+    # normalize should convert absolute to relative
+    changed = db.normalize_cached_page_paths()
+    assert changed >= 1
+
+    rows = db.db_get_all_cached_pages()
+    assert any(r["file_path"] == rel_fp for r in rows)
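
A note on the monkeypatching in the tests above: web/db.py imports
get_cache_dir with a from-import, which copies the function reference into
web.db's own namespace at import time. A minimal sketch of that binding
behavior, assuming only the import structure visible in this patch:

    import web.db as db
    import web.utils

    # web/db.py did `from web.utils import get_cache_dir`, so:
    web.utils.get_cache_dir = lambda: "/tmp/patched"  # db still holds the old ref
    db.get_cache_dir = lambda: "/tmp/patched"         # this is the name db calls

This is why the tests patch the attribute on the db module rather than on
web.utils.
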
diff --git a/web/app.py b/web/app.py
index e69a789..c19dc6d 100644
--- a/web/app.py
+++ b/web/app.py
@@ -255,7 +255,6 @@ csrf.exempt(set_favorite)
 @app.route('/scrape', methods=['GET'])
 def scrape():
     """Trigger the web scraping process."""
-    # Run the full scraper orchestration (fetch listings, sync cache, process jobs)
     scraper()
     return jsonify({"status": "Scraping completed"})
 
diff --git a/web/db.py b/web/db.py
index 04f709f..8ecd8b8 100644
--- a/web/db.py
+++ b/web/db.py
@@ -24,6 +24,7 @@ from web.utils import (
     normalize_job_id,
     now_iso,
     get_cache_path,
+    get_cache_dir,
     get_mysql_config,
 )
 
@@ -280,12 +281,14 @@ def upsert_job_details(job_data: Dict[str, Any]):
 def upsert_cached_page(*, file_path: str, url_guess: Optional[str],
                        last_modified: Optional[str], size_bytes: Optional[int],
                        job_id: Optional[int]):
-    # Always store absolute paths
-    abs_fp = os.path.abspath(file_path)
+    # Store file paths relative to the cache directory (keeps DB portable).
+    # A relative input is assumed to be cache-relative already; running it
+    # through abspath() would resolve against the current working directory.
+    cache_dir = os.path.abspath(get_cache_dir())
+    rel_fp = (os.path.relpath(file_path, start=cache_dir)
+              if os.path.isabs(file_path) else file_path)
     with _ensure_session() as session:
-        obj = session.get(CachedPage, abs_fp)
+        obj = session.get(CachedPage, rel_fp)
         if obj is None:
-            obj = CachedPage(file_path=abs_fp)
+            obj = CachedPage(file_path=rel_fp)
             session.add(obj)
         setattr(obj, "url_guess", url_guess)
         setattr(obj, "last_modified", last_modified)
@@ -295,10 +298,12 @@ def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modifie
 
 
 def remove_cached_page(file_path: str):
-    # Accept either relative or absolute; remove both variants just in case
-    abs_fp = os.path.abspath(file_path)
+    # Accept absolute or cache-relative input; DB keys are stored relative
+    # to the cache directory.
+    cache_dir = os.path.abspath(get_cache_dir())
+    rel_fp = (os.path.relpath(file_path, start=cache_dir)
+              if os.path.isabs(file_path) else file_path)
     with _ensure_session() as session:
-        obj = session.get(CachedPage, abs_fp)
+        obj = session.get(CachedPage, rel_fp)
         if obj:
             session.delete(obj)
             session.commit()
@@ -306,7 +311,8 @@ def remove_cached_page(file_path: str):
 
 def db_remove_cached_url(url: str):
     """Remove a cached page by URL."""
-    abs_fp = get_cache_path(url)
+    # Compute the absolute path for the URL and delegate to remove_cached_page
+    abs_fp = os.path.abspath(get_cache_path(url))
     try:
         remove_cached_page(abs_fp)
     except Exception:
@@ -353,15 +359,17 @@ def db_sync_cached_pages(cache_dir: str):
     """Scan cache_dir and upsert page metadata into cached_pages table."""
     if not os.path.isdir(cache_dir):
         return
-    db_cache = db_get_all_cached_pages()
-    for root, _, files in os.walk(cache_dir):
+    abs_cache = os.path.abspath(cache_dir)
+    # read existing DB keys once for quick membership tests
+    db_cache_paths = {c["file_path"] for c in db_get_all_cached_pages()}
+    for root, _, files in os.walk(abs_cache):
         for name in files:
             if not name.lower().endswith(".html"):
                 continue
             fp = os.path.abspath(os.path.join(root, name))
-            if fp in [c["file_path"] for c in db_cache]:
+            rel_fp = os.path.relpath(fp, start=abs_cache)
+            if rel_fp in db_cache_paths:
                 continue
-
             try:
                 stat = os.stat(fp)
                 mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
@@ -371,22 +379,23 @@ def db_sync_cached_pages(cache_dir: str):
                 size = None
             url_guess = get_url_from_filename(name)
             job_id = url_to_job_id(url_guess)
-            upsert_cached_page(file_path=fp, url_guess=url_guess,
+            upsert_cached_page(file_path=rel_fp, url_guess=url_guess,
                                last_modified=mtime, size_bytes=size, job_id=job_id)
 
 
 def normalize_cached_page_paths() -> int:
-    """Ensure all cached_pages.file_path values are absolute.
+    """Convert legacy absolute cached_pages.file_path values to paths
+    relative to the cache directory.
 
     Returns number of rows updated/normalized."""
     changed = 0
+    abs_cache = os.path.abspath(get_cache_dir())
     with _ensure_session() as session:
         rows = session.execute(text(
             "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
         for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
-            if not os.path.isabs(fp):
-                abs_fp = os.path.abspath(fp)
-                # Upsert under absolute path, then remove the relative entry
+            if os.path.isabs(fp):
+                rel_fp = os.path.relpath(fp, start=abs_cache)
                 upsert_cached_page(
-                    file_path=abs_fp,
+                    file_path=rel_fp,
                     url_guess=url_guess,
                     last_modified=last_modified,
                     size_bytes=size_bytes,
@@ -454,7 +463,9 @@ ORDER BY d.posted_time DESC
             "timestamp": row[7],
             "posted_time": row[8],
             "url": row[9],
+            # file_path is stored relative to the cache dir; provide both forms
             "file_path": row[10],
+            "file_path_abs": db_get_cached_abs_path(row[10]),
             "last_modified": row[11],
             "url_guess": row[12],
             "url_guess_stale": row[13],
@@ -463,6 +474,16 @@ ORDER BY d.posted_time DESC
     return jobs
 
 
+def db_get_cached_abs_path(db_file_path: Optional[str]) -> Optional[str]:
+    """Return the absolute cache file path for a DB-stored (relative) file_path.
+
+    Returns None if the input is falsy; a legacy absolute input passes
+    through unchanged (os.path.join discards the base in that case).
+    """
+    if not db_file_path:
+        return None
+    return os.path.join(os.path.abspath(get_cache_dir()), db_file_path)
+
+
 def db_get_all_job_urls() -> List[str]:
     """Return list of job URLs from job_listings."""
     with _ensure_session() as session:
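
The portability the db.py changes buy, shown in isolation. This is a
minimal sketch of the path arithmetic only (no database involved), with
hypothetical POSIX directory values:

    import os

    cache_dir = "/var/app/cache"  # hypothetical original location

    # storing: upsert_cached_page() relativizes absolute paths into DB keys
    abs_fp = "/var/app/cache/example.org_page_1.html"
    rel_fp = os.path.relpath(abs_fp, start=cache_dir)
    assert rel_fp == "example.org_page_1.html"

    # resolving: db_get_cached_abs_path() joins the stored key onto whatever
    # the cache dir is *now*, so the DB survives a relocated cache
    cache_dir = "/srv/new-home/cache"
    assert os.path.join(cache_dir, rel_fp) == \
        "/srv/new-home/cache/example.org_page_1.html"
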
diff --git a/web/utils.py b/web/utils.py
index 1371d64..0eeea63 100644
--- a/web/utils.py
+++ b/web/utils.py
@@ -203,7 +203,6 @@ def get_cache_path(url: str) -> str:
 def cache_page(url: str, content: str):
     """Cache the page content with a timestamp."""
     cache_path = get_cache_path(url)
-    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
         f.write(content)
     # Update the file's modification time to the current time
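
For existing deployments, legacy absolute rows need one normalization pass
after this change lands. A sketch of that one-time migration, using only
functions from this patch (the script itself is hypothetical):

    # migrate_cache_paths.py -- run once after deploying this change
    import web.db as db
    from web.utils import get_cache_dir

    db.db_init()                                # ensure tables exist
    changed = db.normalize_cached_page_paths()  # rewrite absolute rows as relative
    db.db_sync_cached_pages(get_cache_dir())    # register files not yet in the DB
    print(f"normalized {changed} legacy rows")

After the pass, upsert_cached_page() writes every new row relative to the
cache directory, so the script never needs to run again.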