simplify caching

Store cached_pages.file_path relative to the cache directory instead of as an absolute path, normalize legacy absolute-path rows, and add read-side helpers that rebuild absolute paths.
tests/test_cache_paths.py (new file, 66 lines)
@@ -0,0 +1,66 @@
+import os
+import tempfile
+import pytest
+
+import web.db as db
+from web.utils import get_cache_dir
+
+
+# Skip unless explicitly enabled (MySQL integration expected)
+if not os.getenv("RUN_DB_TESTS"):
+    pytest.skip("Set RUN_DB_TESTS=1 to run cache path integration tests",
+                allow_module_level=True)
+
+
+def test_db_sync_inserts_relative_paths(tmp_path, monkeypatch):
+    # arrange: create a temporary cache dir and a fake html file
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
+    f = cache_dir / "example.org_path_to_page_123.html"
+    f.write_text("<html>ok</html>")
+
+    # point app at this cache dir
+    monkeypatch.setenv("PYTEST_CACHE_DIR", str(cache_dir))
+    # patch get_cache_dir as bound in web.db (db imports the name directly)
+    monkeypatch.setattr('web.db.get_cache_dir', lambda: str(cache_dir))
+
+    # ensure DB initialized
+    db.db_init()
+
+    # act
+    db.db_sync_cached_pages(str(cache_dir))
+
+    # assert: DB contains relative path, not absolute
+    rows = db.db_get_all_cached_pages()
+    assert any(r['file_path'] == os.path.relpath(
+        str(f), start=str(cache_dir)) for r in rows)
+
+
+def test_normalize_cached_page_paths_converts_absolute(tmp_path, monkeypatch):
+    cache_dir = tmp_path / "cache"
+    cache_dir.mkdir()
+    # create an actual file
+    f = cache_dir / "site_example_page_1.html"
+    f.write_text("<html>ok</html>")
+
+    monkeypatch.setattr('web.db.get_cache_dir', lambda: str(cache_dir))
+    db.db_init()
+
+    abs_fp = str(f)
+    rel_fp = os.path.relpath(abs_fp, start=str(cache_dir))
+
+    # Insert an absolute path row directly (simulate legacy data)
+    with db._ensure_session() as session:
+        session.execute(
+            db.text("INSERT INTO cached_pages(file_path, url_guess, last_modified, size_bytes, job_id) VALUES(:fp, :ug, :lm, :sz, :jid)"),
+            {"fp": abs_fp, "ug": "https://example.org/page1.html",
+             "lm": None, "sz": 10, "jid": None}
+        )
+        session.commit()
+
+    # normalize should convert absolute to relative
+    changed = db.normalize_cached_page_paths()
+    assert changed >= 1
+
+    rows = db.db_get_all_cached_pages()
+    assert any(r['file_path'] == rel_fp for r in rows)
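
These tests are opt-in: they expect a reachable MySQL instance, so run them with RUN_DB_TESTS=1 pytest tests/test_cache_paths.py. The convention they pin down is the core of this commit: cached_pages.file_path holds a path relative to the cache directory, and readers rebuild the absolute path by joining against get_cache_dir(). A minimal standalone sketch of that round trip (the /var/app/cache location is invented for illustration):

    import os

    cache_dir = "/var/app/cache"  # hypothetical cache location
    abs_fp = os.path.join(cache_dir, "example.org_page_1.html")

    rel_fp = os.path.relpath(abs_fp, start=cache_dir)  # what the DB stores
    assert rel_fp == "example.org_page_1.html"

    # what readers rebuild (see db_get_cached_abs_path below)
    assert os.path.join(os.path.abspath(cache_dir), rel_fp) == abs_fp
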
@@ -255,7 +255,6 @@ csrf.exempt(set_favorite)
 @app.route('/scrape', methods=['GET'])
 def scrape():
     """Trigger the web scraping process."""
-    # Run the full scraper orchestration (fetch listings, sync cache, process jobs)
     scraper()
     return jsonify({"status": "Scraping completed"})
 
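
A quick way to exercise the route above is Flask's test client; the import path here is an assumption, since the diff does not name the module that defines the app object:

    # Sketch only: adjust `from web.app import app` to the real module layout.
    from web.app import app

    with app.test_client() as client:
        resp = client.get("/scrape")  # runs scraper() synchronously; can be slow
        assert resp.get_json() == {"status": "Scraping completed"}
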
web/db.py (51 lines changed)
@@ -24,6 +24,7 @@ from web.utils import (
     normalize_job_id,
     now_iso,
     get_cache_path,
+    get_cache_dir,
     get_mysql_config,
 )
 
@@ -280,12 +281,14 @@ def upsert_job_details(job_data: Dict[str, Any]):
 
 
 def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modified: Optional[str], size_bytes: Optional[int], job_id: Optional[int]):
-    # Always store absolute paths
-    abs_fp = os.path.abspath(file_path)
+    # Store file paths relative to the cache directory (keeps DB portable)
+    cache_dir = os.path.abspath(get_cache_dir())
+    # resolve relative inputs against the cache dir, not the process CWD
+    abs_fp = file_path if os.path.isabs(file_path) else os.path.join(cache_dir, file_path)
+    rel_fp = os.path.relpath(abs_fp, start=cache_dir)
     with _ensure_session() as session:
-        obj = session.get(CachedPage, abs_fp)
+        obj = session.get(CachedPage, rel_fp)
         if obj is None:
-            obj = CachedPage(file_path=abs_fp)
+            obj = CachedPage(file_path=rel_fp)
             session.add(obj)
         setattr(obj, "url_guess", url_guess)
         setattr(obj, "last_modified", last_modified)
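
After this change, upsert_cached_page keys rows by the cache-relative path, so an absolute path and its relative form collapse to the same row. A hedged sketch of the idempotence this buys (assumes db_init() has run and get_cache_dir() points at an existing directory):

    import os
    import web.db as db
    from web.utils import get_cache_dir

    fname = "example.org_page_1.html"
    # both calls hit the same row, keyed by the relative path
    db.upsert_cached_page(file_path=fname, url_guess=None,
                          last_modified=None, size_bytes=None, job_id=None)
    db.upsert_cached_page(file_path=os.path.join(get_cache_dir(), fname),
                          url_guess=None, last_modified=None,
                          size_bytes=None, job_id=None)
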
@@ -295,10 +298,12 @@ def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modifie
 
 
 def remove_cached_page(file_path: str):
-    # Accept either relative or absolute; remove both variants just in case
-    abs_fp = os.path.abspath(file_path)
+    # Accept absolute or relative input; DB keys are stored relative to cache dir
+    cache_dir = os.path.abspath(get_cache_dir())
+    # resolve relative inputs against the cache dir, not the process CWD
+    abs_fp = file_path if os.path.isabs(file_path) else os.path.join(cache_dir, file_path)
+    rel_fp = os.path.relpath(abs_fp, start=cache_dir)
     with _ensure_session() as session:
-        obj = session.get(CachedPage, abs_fp)
+        obj = session.get(CachedPage, rel_fp)
         if obj:
             session.delete(obj)
             session.commit()
@@ -306,7 +311,8 @@ def remove_cached_page(file_path: str):
 
 def db_remove_cached_url(url: str):
     """Remove a cached page by URL."""
-    abs_fp = get_cache_path(url)
+    # Compute the absolute path for the URL and delegate to remove_cached_page
+    abs_fp = os.path.abspath(get_cache_path(url))
     try:
         remove_cached_page(abs_fp)
     except Exception:
@@ -353,15 +359,17 @@ def db_sync_cached_pages(cache_dir: str):
     """Scan cache_dir and upsert page metadata into cached_pages table."""
     if not os.path.isdir(cache_dir):
         return
-    db_cache = db_get_all_cached_pages()
-    for root, _, files in os.walk(cache_dir):
+    abs_cache = os.path.abspath(cache_dir)
+    # read existing DB keys once for quick membership tests
+    db_cache_paths = {c["file_path"] for c in db_get_all_cached_pages()}
+    for root, _, files in os.walk(abs_cache):
         for name in files:
             if not name.lower().endswith(".html"):
                 continue
             fp = os.path.abspath(os.path.join(root, name))
-            if fp in [c["file_path"] for c in db_cache]:
+            rel_fp = os.path.relpath(fp, start=abs_cache)
+            if rel_fp in db_cache_paths:
                 continue
 
             try:
                 stat = os.stat(fp)
                 mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
@@ -371,22 +379,23 @@ def db_sync_cached_pages(cache_dir: str):
                 size = None
             url_guess = get_url_from_filename(name)
             job_id = url_to_job_id(url_guess)
-            upsert_cached_page(file_path=fp, url_guess=url_guess,
+            upsert_cached_page(file_path=rel_fp, url_guess=url_guess,
                                last_modified=mtime, size_bytes=size, job_id=job_id)
 
 
 def normalize_cached_page_paths() -> int:
-    """Ensure all cached_pages.file_path values are absolute. Returns number of rows updated/normalized."""
+    """Ensure all cached_pages.file_path values are relative to the cache dir. Returns number of rows normalized."""
+    # Convert any absolute paths in DB to relative paths (relative to cache dir)
     changed = 0
+    abs_cache = os.path.abspath(get_cache_dir())
     with _ensure_session() as session:
         rows = session.execute(text(
             "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
         for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
-            if not os.path.isabs(fp):
-                abs_fp = os.path.abspath(fp)
-                # Upsert under absolute path, then remove the relative entry
+            if os.path.isabs(fp):
+                rel_fp = os.path.relpath(fp, start=abs_cache)
                 upsert_cached_page(
-                    file_path=abs_fp,
+                    file_path=rel_fp,
                     url_guess=url_guess,
                     last_modified=last_modified,
                     size_bytes=size_bytes,
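
normalize_cached_page_paths is the one-shot migration for legacy rows that still hold absolute paths; the second test in tests/test_cache_paths.py exercises exactly this. A sketch of running it once after deploying the commit (assumes the MySQL config is already in place):

    import web.db as db

    db.db_init()
    changed = db.normalize_cached_page_paths()
    print(f"normalized {changed} cached_pages rows to relative paths")
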
@@ -454,7 +463,9 @@ ORDER BY d.posted_time DESC
             "timestamp": row[7],
             "posted_time": row[8],
             "url": row[9],
+            # file_path is stored relative to cache dir; provide both forms
             "file_path": row[10],
+            "file_path_abs": os.path.join(os.path.abspath(get_cache_dir()), row[10]) if row[10] else None,
             "last_modified": row[11],
             "url_guess": row[12],
             "url_guess_stale": row[13],
@@ -463,6 +474,16 @@ ORDER BY d.posted_time DESC
     return jobs
 
 
+def db_get_cached_abs_path(db_file_path: Optional[str]) -> Optional[str]:
+    """Return the absolute cache file path given a DB-stored (relative) file_path.
+
+    Returns None if the input is falsy.
+    """
+    if not db_file_path:
+        return None
+    return os.path.join(os.path.abspath(get_cache_dir()), db_file_path)
+
+
 def db_get_all_job_urls() -> List[str]:
     """Return list of job URLs from job_listings."""
     with _ensure_session() as session:
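
db_get_cached_abs_path is the read-side counterpart of the relative storage: anything that previously treated file_path as an absolute path should now go through it (or use the file_path_abs field added above). A minimal sketch of reading cached HTML back (assumes rows already exist):

    import os
    import web.db as db

    for row in db.db_get_all_cached_pages():
        abs_fp = db.db_get_cached_abs_path(row["file_path"])
        if abs_fp and os.path.exists(abs_fp):
            with open(abs_fp, encoding="utf-8") as f:
                html = f.read()
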
@@ -203,7 +203,6 @@ def get_cache_path(url: str) -> str:
 def cache_page(url: str, content: str):
     """Cache the page content with a timestamp."""
     cache_path = get_cache_path(url)
-    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
     with open(cache_path, "w", encoding="utf-8") as f:
         f.write(content)
     # Update the file's modification time to the current time
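
With the os.makedirs call removed, cache_page assumes the cache directory already exists, so it should be created once at startup. A hedged sketch (cache_page is presumed to live in web.utils alongside get_cache_path; the diff does not name the file):

    import os
    from web.utils import get_cache_dir, cache_page  # cache_page location assumed

    os.makedirs(get_cache_dir(), exist_ok=True)  # one-time setup
    cache_page("https://example.org/page1.html", "<html>ok</html>")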