remove caching
This commit is contained in:
187
web/db.py
187
web/db.py
@@ -4,7 +4,6 @@ from __future__ import annotations
|
||||
|
||||
Tables:
|
||||
- users(user_id PK, username UNIQUE, created_at)
|
||||
- cached_pages(file_path PK, url_guess, last_modified, size_bytes, job_id)
|
||||
- job_listings(job_id PK, url UNIQUE, region, keyword, title, pay, location, timestamp)
|
||||
- job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url)
|
||||
- user_interactions(job_id PK FK -> job_listings, user_id FK -> users, seen_at, url_visited, is_user_favorite)
|
||||
@@ -23,8 +22,6 @@ from web.utils import (
|
||||
url_to_job_id,
|
||||
normalize_job_id,
|
||||
now_iso,
|
||||
get_cache_path,
|
||||
get_cache_dir,
|
||||
get_mysql_config,
|
||||
)
|
||||
|
||||
@@ -86,8 +83,6 @@ class JobListing(Base):
|
||||
|
||||
description = relationship(
|
||||
"JobDescription", back_populates="listing", uselist=False, cascade="all, delete-orphan")
|
||||
cached_pages = relationship(
|
||||
"CachedPage", back_populates="listing", cascade="all, delete-orphan")
|
||||
interactions = relationship(
|
||||
"UserInteraction", back_populates="listing", cascade="all, delete-orphan")
|
||||
|
||||
@@ -106,30 +101,6 @@ class JobDescription(Base):
|
||||
listing = relationship("JobListing", back_populates="description")
|
||||
|
||||
|
||||
class CachedPage(Base):
    """ORM mapping for one row of the ``cached_pages`` table.

    ``file_path`` is the primary key and is stored relative to the configured
    cache directory. ``job_id`` is a foreign key to ``job_listings.job_id``
    declared with ``ondelete="CASCADE"``.
    """

    __tablename__ = "cached_pages"

    file_path = Column(String(FILE_PATH_LEN), primary_key=True)
    url_guess = Column(String(URL_LEN))
    last_modified = Column(String(TIME_LEN))
    size_bytes = Column(Integer)
    job_id = Column(
        String(JOB_ID_LEN),
        ForeignKey("job_listings.job_id", ondelete="CASCADE"),
    )

    # Back-reference to the owning listing (JobListing.cached_pages).
    listing = relationship("JobListing", back_populates="cached_pages")

    @property
    def abs_path(self) -> Optional[str]:
        """Resolve this page's stored ``file_path`` to an absolute path.

        The stored value is joined onto the absolute cache directory returned
        by ``get_cache_dir()``. Returns ``None`` when ``file_path`` is unset
        or empty.
        """
        relative = getattr(self, "file_path", None)
        if relative:
            return os.path.join(os.path.abspath(get_cache_dir()), relative)
        return None
|
||||
|
||||
|
||||
class UserInteraction(Base):
|
||||
__tablename__ = "user_interactions"
|
||||
# composite uniqueness on (user_id, job_id)
|
||||
@@ -292,139 +263,6 @@ def upsert_job_details(job_data: Dict[str, Any]):
|
||||
session.commit()
|
||||
|
||||
|
||||
def upsert_cached_page(
    *,
    file_path: str,
    url_guess: Optional[str],
    last_modified: Optional[str],
    size_bytes: Optional[int],
    job_id: Optional[int],
):
    """Insert or update the ``cached_pages`` row for *file_path*.

    The row key is *file_path* expressed relative to the cache directory,
    which keeps the database portable between machines.

    Arguments:
    file_path -- Path of the cached file. It is passed through
                 ``os.path.abspath`` first, so a relative value is resolved
                 against the current working directory, not the cache dir.
    url_guess -- URL the cached file is believed to correspond to.
    last_modified -- Modification timestamp string to store.
    size_bytes -- File size to store.
    job_id -- Related job id; stored as a string, or NULL when falsy.
    """
    cache_root = os.path.abspath(get_cache_dir())
    key = os.path.relpath(os.path.abspath(file_path), start=cache_root)

    with _ensure_session() as session:
        page = session.get(CachedPage, key)
        if page is None:
            # No row yet for this key: create one and attach it to the session.
            page = CachedPage(file_path=key)
            session.add(page)

        page.url_guess = url_guess
        page.last_modified = last_modified
        page.size_bytes = size_bytes
        page.job_id = None if not job_id else str(job_id)
        session.commit()
|
||||
|
||||
|
||||
def remove_cached_page(file_path: str):
    """Delete the ``cached_pages`` row that corresponds to *file_path*.

    *file_path* may be absolute or relative; it is made absolute with
    ``os.path.abspath`` and then converted to the cache-dir-relative form
    used as the table's primary key. Nothing happens when no row matches.
    """
    cache_root = os.path.abspath(get_cache_dir())
    key = os.path.relpath(os.path.abspath(file_path), start=cache_root)

    with _ensure_session() as session:
        page = session.get(CachedPage, key)
        if not page:
            return
        session.delete(page)
        session.commit()
|
||||
|
||||
|
||||
def db_remove_cached_url(url: str):
    """Remove the cached-page row for *url* (best effort).

    The URL is mapped to its cache file path with ``get_cache_path`` and the
    deletion is delegated to ``remove_cached_page``.

    Arguments:
    url -- The URL whose cached page record should be removed.

    Failures while deleting are logged and suppressed so that callers keep
    their existing "never raises from the delete step" behaviour.
    """
    import logging

    abs_fp = os.path.abspath(get_cache_path(url))
    try:
        remove_cached_page(abs_fp)
    except Exception:
        # Deliberately best-effort, but leave a trace instead of hiding the
        # failure completely.
        logging.getLogger(__name__).exception(
            "Failed to remove cached page record for URL %s", url)
|
||||
|
||||
|
||||
def db_get_all_cached_pages() -> List[Dict[str, Any]]:
    """Return every row of ``cached_pages`` as a list of dicts.

    Each dict carries the stored columns plus ``file_path_abs``, the stored
    path resolved through ``db_get_cached_abs_path`` (``None`` when the
    stored path is empty).
    """
    with _ensure_session() as session:
        records = session.execute(text(
            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
        return [
            {
                "file_path": rec[0],
                "file_path_abs": db_get_cached_abs_path(rec[0]) if rec[0] else None,
                "url_guess": rec[1],
                "last_modified": rec[2],
                "size_bytes": rec[3],
                "job_id": rec[4],
            }
            for rec in records
        ]
|
||||
|
||||
|
||||
def db_get_cache_url(url: str):
    """Look up the ``cached_pages`` row whose ``url_guess`` equals *url*.

    Arguments:
    url -- The URL to look up in the cache.

    Returns a dict with the stored columns plus ``file_path_abs``, or
    ``None`` when no row matches.
    """
    with _ensure_session() as session:
        match = session.execute(
            text("SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"),
            {"u": url},
        ).fetchone()
        if not match:
            return None

        stored_path, url_guess, last_modified, size_bytes, job_id = (
            match[0], match[1], match[2], match[3], match[4])
        return {
            "file_path": stored_path,
            "file_path_abs": db_get_cached_abs_path(stored_path) if stored_path else None,
            "url_guess": url_guess,
            "last_modified": last_modified,
            "size_bytes": size_bytes,
            "job_id": job_id,
        }
|
||||
|
||||
|
||||
def db_sync_cached_pages(cache_dir: str):
    """Scan cache_dir and upsert page metadata into cached_pages table.

    Walks *cache_dir* recursively and, for every ``.html`` file whose
    cache-relative path is not already a key in ``cached_pages``, records its
    modification time, size, guessed URL and derived job id.

    Arguments:
    cache_dir -- Directory to scan. Nothing happens if it is not a directory.
    """
    if not os.path.isdir(cache_dir):
        return
    abs_cache = os.path.abspath(cache_dir)
    # read existing DB keys once for quick membership tests
    db_cache_paths = {c["file_path"] for c in db_get_all_cached_pages()}
    for root, _, files in os.walk(abs_cache):
        for name in files:
            if not name.lower().endswith(".html"):
                continue
            fp = os.path.abspath(os.path.join(root, name))
            rel_fp = os.path.relpath(fp, start=abs_cache)
            if rel_fp in db_cache_paths:
                continue
            try:
                stat = os.stat(fp)
                mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
                size = stat.st_size
            except OSError:
                # File vanished or is unreadable: still record it, without
                # the metadata we could not obtain.
                mtime = None
                size = None
            url_guess = get_url_from_filename(name)
            job_id = url_to_job_id(url_guess)
            # Pass the ABSOLUTE path: upsert_cached_page runs os.path.abspath
            # on its argument, so a cache-relative path would be resolved
            # against the current working directory and produce a wrong key.
            upsert_cached_page(file_path=fp, url_guess=url_guess,
                               last_modified=mtime, size_bytes=size, job_id=job_id)
|
||||
|
||||
|
||||
def normalize_cached_page_paths() -> int:
    """Convert absolute ``cached_pages.file_path`` values to relative ones.

    Every row whose ``file_path`` is an absolute path is re-inserted under
    its cache-dir-relative key (via ``upsert_cached_page``) and the original
    absolute-keyed row is deleted. Rows that are already relative are left
    untouched.

    Returns the number of rows that were normalized.
    """
    # Read everything up front and leave the read session before writing, so
    # the per-row sessions below do not shadow or nest inside it.
    with _ensure_session() as session:
        rows = session.execute(text(
            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()

    changed = 0
    for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
        if not fp or not os.path.isabs(fp):
            continue
        # Hand over the ABSOLUTE path: upsert_cached_page derives the
        # cache-relative key itself. Passing an already-relative path would
        # be resolved against the current working directory instead.
        upsert_cached_page(
            file_path=fp,
            url_guess=url_guess,
            last_modified=last_modified,
            size_bytes=size_bytes,
            job_id=job_id,
        )
        # Drop the old row that was keyed by the absolute path.
        with _ensure_session() as session:
            session.execute(
                text("DELETE FROM cached_pages WHERE file_path = :fp"), {"fp": fp})
            session.commit()
        changed += 1
    return changed
|
||||
|
||||
|
||||
def db_get_keywords() -> List[str]:
|
||||
"""Return a list of all unique keywords from job listings."""
|
||||
with _ensure_session() as session:
|
||||
@@ -453,15 +291,10 @@ SELECT l.job_id
|
||||
,l.timestamp
|
||||
,d.posted_time
|
||||
,l.url
|
||||
,c.file_path
|
||||
,c.last_modified
|
||||
,c.url_guess
|
||||
,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale
|
||||
FROM job_listings AS l
|
||||
INNER JOIN job_descriptions AS d
|
||||
ON l.job_id = d.job_id
|
||||
AND l.url = d.url
|
||||
LEFT JOIN cached_pages AS c ON l.job_id = c.job_id
|
||||
ORDER BY d.posted_time DESC
|
||||
"""
|
||||
with _ensure_session() as session:
|
||||
@@ -479,27 +312,11 @@ ORDER BY d.posted_time DESC
|
||||
"timestamp": row[7],
|
||||
"posted_time": row[8],
|
||||
"url": row[9],
|
||||
# file_path is stored relative to cache dir; provide both forms
|
||||
"file_path": row[10],
|
||||
"file_path_abs": os.path.join(os.path.abspath(get_cache_dir()), row[10]) if row[10] else None,
|
||||
"last_modified": row[11],
|
||||
"url_guess": row[12],
|
||||
"url_guess_stale": row[13],
|
||||
}
|
||||
jobs.append(job)
|
||||
return jobs
|
||||
|
||||
|
||||
def db_get_cached_abs_path(db_file_path: Optional[str]) -> Optional[str]:
    """Turn a DB-stored, cache-relative ``file_path`` into an absolute path.

    The value is joined onto the absolute form of ``get_cache_dir()``.
    ``None`` is returned for a falsy input (``None`` or an empty string).
    """
    if db_file_path:
        cache_root = os.path.abspath(get_cache_dir())
        return os.path.join(cache_root, db_file_path)
    return None
|
||||
|
||||
|
||||
def db_get_all_job_urls() -> List[str]:
|
||||
"""Return list of job URLs from job_listings."""
|
||||
with _ensure_session() as session:
|
||||
@@ -522,10 +339,6 @@ def remove_job(url):
|
||||
try:
|
||||
jid = url_to_job_id(url)
|
||||
db_delete_job(jid)
|
||||
cache_fp = get_cache_path(url)
|
||||
remove_cached_page(os.path.abspath(cache_fp))
|
||||
if os.path.exists(cache_fp):
|
||||
os.remove(cache_fp)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
Reference in New Issue
Block a user