simplify caching

georg.sinn-schirwitz
2025-08-29 16:31:10 +02:00
parent d5f686bf1e
commit 2dac771d47
4 changed files with 102 additions and 17 deletions

View File

@@ -255,7 +255,6 @@ csrf.exempt(set_favorite)
 @app.route('/scrape', methods=['GET'])
 def scrape():
     """Trigger the web scraping process."""
+    # Run the full scraper orchestration (fetch listings, sync cache, process jobs)
     scraper()
     return jsonify({"status": "Scraping completed"})
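
A quick way to sanity-check the route after this change is a plain HTTP GET; the snippet below is a minimal sketch assuming the Flask default host and port, which are not shown in the diff:

import requests

resp = requests.get("http://127.0.0.1:5000/scrape")  # host/port are assumptions
print(resp.json())  # expected: {'status': 'Scraping completed'}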

View File

@@ -24,6 +24,7 @@ from web.utils import (
     normalize_job_id,
     now_iso,
     get_cache_path,
+    get_cache_dir,
     get_mysql_config,
 )
@@ -280,12 +281,14 @@ def upsert_job_details(job_data: Dict[str, Any]):
 def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modified: Optional[str], size_bytes: Optional[int], job_id: Optional[int]):
-    # Always store absolute paths
+    # Store file paths relative to the cache directory (keeps DB portable)
+    cache_dir = os.path.abspath(get_cache_dir())
     abs_fp = os.path.abspath(file_path)
+    rel_fp = os.path.relpath(abs_fp, start=cache_dir)
     with _ensure_session() as session:
-        obj = session.get(CachedPage, abs_fp)
+        obj = session.get(CachedPage, rel_fp)
         if obj is None:
-            obj = CachedPage(file_path=abs_fp)
+            obj = CachedPage(file_path=rel_fp)
             session.add(obj)
         setattr(obj, "url_guess", url_guess)
         setattr(obj, "last_modified", last_modified)
@@ -295,10 +298,12 @@ def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modifie
 def remove_cached_page(file_path: str):
-    # Accept either relative or absolute; remove both variants just in case
+    # Accept absolute or relative input but DB keys are stored relative to cache dir
+    cache_dir = os.path.abspath(get_cache_dir())
     abs_fp = os.path.abspath(file_path)
+    rel_fp = os.path.relpath(abs_fp, start=cache_dir)
     with _ensure_session() as session:
-        obj = session.get(CachedPage, abs_fp)
+        obj = session.get(CachedPage, rel_fp)
         if obj:
             session.delete(obj)
             session.commit()
@@ -306,7 +311,8 @@ def remove_cached_page(file_path: str):
 def db_remove_cached_url(url: str):
     """Remove a cached page by URL."""
-    abs_fp = get_cache_path(url)
+    # Compute absolute path for the URL and delegate to remove_cached_page
+    abs_fp = os.path.abspath(get_cache_path(url))
     try:
         remove_cached_page(abs_fp)
     except Exception:
@@ -353,15 +359,17 @@ def db_sync_cached_pages(cache_dir: str):
"""Scan cache_dir and upsert page metadata into cached_pages table."""
if not os.path.isdir(cache_dir):
return
db_cache = db_get_all_cached_pages()
for root, _, files in os.walk(cache_dir):
abs_cache = os.path.abspath(cache_dir)
# read existing DB keys once for quick membership tests
db_cache_paths = {c["file_path"] for c in db_get_all_cached_pages()}
for root, _, files in os.walk(abs_cache):
for name in files:
if not name.lower().endswith(".html"):
continue
fp = os.path.abspath(os.path.join(root, name))
if fp in [c["file_path"] for c in db_cache]:
rel_fp = os.path.relpath(fp, start=abs_cache)
if rel_fp in db_cache_paths:
continue
try:
stat = os.stat(fp)
mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
@@ -371,22 +379,23 @@ def db_sync_cached_pages(cache_dir: str):
                 size = None
             url_guess = get_url_from_filename(name)
             job_id = url_to_job_id(url_guess)
-            upsert_cached_page(file_path=fp, url_guess=url_guess,
+            upsert_cached_page(file_path=rel_fp, url_guess=url_guess,
                                last_modified=mtime, size_bytes=size, job_id=job_id)

 def normalize_cached_page_paths() -> int:
-    """Ensure all cached_pages.file_path values are absolute. Returns number of rows updated/normalized."""
+    # Convert any absolute paths in DB to relative paths (relative to cache dir)
     changed = 0
+    abs_cache = os.path.abspath(get_cache_dir())
     with _ensure_session() as session:
         rows = session.execute(text(
             "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
         for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
-            if not os.path.isabs(fp):
-                abs_fp = os.path.abspath(fp)
-                # Upsert under absolute path, then remove the relative entry
+            if os.path.isabs(fp):
+                rel_fp = os.path.relpath(fp, start=abs_cache)
                 upsert_cached_page(
-                    file_path=abs_fp,
+                    file_path=rel_fp,
                     url_guess=url_guess,
                     last_modified=last_modified,
                     size_bytes=size_bytes,
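
Two things happen in db_sync_cached_pages above: existing DB keys are loaded once into a set, and each scanned file is then tested by its relative path. A small sketch of that membership pattern with hypothetical keys:

import os

abs_cache = os.path.abspath("cache/pages")      # stand-in for the scanned cache dir
db_cache_paths = {"job_1.html", "job_2.html"}   # hypothetical keys already in the DB

fp = os.path.join(abs_cache, "job_3.html")      # a file as found by os.walk
rel_fp = os.path.relpath(fp, start=abs_cache)
if rel_fp not in db_cache_paths:                # O(1) set lookup per file
    print("not cached yet, would be upserted:", rel_fp)

Because the set is built once before the walk, this replaces the old per-file list rebuild, dropping the scan cost from quadratic to linear in the number of cached pages.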
@@ -454,7 +463,9 @@ ORDER BY d.posted_time DESC
"timestamp": row[7],
"posted_time": row[8],
"url": row[9],
# file_path is stored relative to cache dir; provide both forms
"file_path": row[10],
"file_path_abs": os.path.join(os.path.abspath(get_cache_dir()), row[10]) if row[10] else None,
"last_modified": row[11],
"url_guess": row[12],
"url_guess_stale": row[13],
@@ -463,6 +474,16 @@ ORDER BY d.posted_time DESC
     return jobs

+def db_get_cached_abs_path(db_file_path: Optional[str]) -> Optional[str]:
+    """Return absolute cache file path given a DB-stored (relative) file_path.
+
+    Returns None if input is falsy.
+    """
+    if not db_file_path:
+        return None
+    return os.path.join(os.path.abspath(get_cache_dir()), db_file_path)
+
 def db_get_all_job_urls() -> List[str]:
     """Return list of job URLs from job_listings."""
     with _ensure_session() as session:
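
Illustrative usage of the new helper; the importing module path is a guess, since the file name is not shown in the diff:

from web.db import db_get_cached_abs_path  # module name is a guess

print(db_get_cached_abs_path("job_123.html"))  # e.g. /srv/app/cache/pages/job_123.html
print(db_get_cached_abs_path(None))            # None: falsy input short-circuits

Centralizing the join here means callers never concatenate cache paths by hand, so the relative-key convention lives in one place.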

View File

@@ -203,7 +203,6 @@ def get_cache_path(url: str) -> str:
 def cache_page(url: str, content: str):
     """Cache the page content with a timestamp."""
     cache_path = get_cache_path(url)
-    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
     with open(cache_path, "w", encoding="utf-8") as f:
         f.write(content)
     # Update the file's modification time to the current time
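
With the per-write makedirs gone, the cache directory must exist before the first cache_page call; the diff does not show where that guarantee moved, so the sketch below assumes a one-time setup at startup:

import os

# one makedirs at startup instead of one per cached page;
# "cache/pages" stands in for whatever get_cache_dir() returns
os.makedirs(os.path.abspath("cache/pages"), exist_ok=True)

This trades a syscall on every write for a single startup step, at the cost that deleting the directory at runtime would make subsequent writes fail.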