Compare commits

...

2 Commits

Author SHA1 Message Date
georg.sinn-schirwitz
8ad52563fa extending abs_path in db and test coverage 2025-08-30 13:06:16 +02:00
georg.sinn-schirwitz
f899439f6a ensuring abs_path 2025-08-30 13:00:17 +02:00
6 changed files with 151 additions and 8 deletions

View File

@@ -0,0 +1,53 @@
import os
import tempfile
from web.app import app
def test_cached_route_serves_file(monkeypatch):
    """GET /cached/<job_id> returns the cached HTML for a job with a file.

    A real temporary file is created under ``<tests parent>/cache`` and
    ``web.app.get_job_by_id`` is patched to return a record pointing at it.
    NOTE(review): this assumes the app's ``get_cache_dir()`` resolves to that
    same directory; otherwise the route's containment check would reject the
    file -- confirm against the project configuration.
    """
    # Create a temporary file in the cache dir the app is expected to use
    cache_dir = os.path.abspath(os.path.join(
        os.path.dirname(__file__), '..', 'cache'))
    os.makedirs(cache_dir, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(
        prefix='test_cached_', suffix='.html', dir=cache_dir)
    try:
        # Write through the descriptor mkstemp handed back instead of
        # closing it and reopening the file by path.
        with os.fdopen(fd, 'w', encoding='utf-8') as f:
            f.write('<html><body>cached</body></html>')

        # Fake job record returned by get_job_by_id
        fake_job = {
            'id': 'fake123',
            'job_id': 'fake123',
            'file_path': os.path.relpath(tmp_path, cache_dir),
            'file_path_abs': tmp_path,
        }

        def fake_get_job_by_id(jid):
            if str(jid) in ('fake123',):
                return fake_job
            return {}

        # Patch the symbol imported into web.app
        monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id)

        # Request route
        client = app.test_client()
        res = client.get('/cached/fake123')
        assert res.status_code == 200
        assert b'cached' in res.data
    finally:
        # Always clean up, even when an assertion above fails, so failed
        # runs do not leave stray files behind in the cache directory.
        try:
            os.remove(tmp_path)
        except OSError:
            # Best effort: the file may already be gone or still be held
            # open by the response on some platforms.
            pass
def test_cached_route_missing(monkeypatch):
    """GET /cached/<job_id> answers 404 when the job lookup yields nothing."""
    # Replace the symbol imported into web.app so every lookup comes back
    # as an empty record.
    monkeypatch.setattr('web.app.get_job_by_id', lambda jid: {})

    response = app.test_client().get('/cached/nope')
    assert response.status_code == 404

View File

@@ -0,0 +1,27 @@
import os
from web.db import CachedPage
from web.utils import get_cache_dir
def test_cachedpage_abs_path(tmp_path, monkeypatch):
    """CachedPage.abs_path joins file_path onto the cache dir, or is None."""
    # Build a throwaway cache directory under pytest's tmp_path
    cache_root = tmp_path / 'cache'
    cache_root.mkdir()
    monkeypatch.setenv('PYTHONIOENCODING', 'utf-8')

    # CachedPage.abs_path looks up get_cache_dir through web.db, so that is
    # the symbol to replace.
    monkeypatch.setattr('web.db.get_cache_dir', lambda: str(cache_root))

    # A page with a relative file_path resolves below the fake cache dir
    page = CachedPage()
    page.file_path = 'subdir/test.html'
    root_abs = os.path.abspath(str(cache_root))
    assert page.abs_path == os.path.join(root_abs, 'subdir/test.html')

    # A falsy file_path yields no absolute path at all
    empty_page = CachedPage()
    empty_page.file_path = None
    assert empty_page.abs_path is None

View File

@@ -1,5 +1,5 @@
import os import os
from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, send_file
from flask_wtf import CSRFProtect from flask_wtf import CSRFProtect
from typing import Dict, List from typing import Dict, List
@@ -32,6 +32,7 @@ from web.utils import (
initialize_users_from_settings, initialize_users_from_settings,
filter_jobs, filter_jobs,
get_job_by_id, get_job_by_id,
get_cache_dir,
) )
from web.db import get_all_regions, get_all_keywords from web.db import get_all_regions, get_all_keywords
@@ -229,6 +230,39 @@ def job_by_id(job_id):
return jsonify({"error": "Job not found"}), 404 return jsonify({"error": "Job not found"}), 404
@app.route('/cached/<job_id>', methods=['GET'])
def serve_cached(job_id):
    """Serve the cached HTML file for a job if available.

    Uses the job record's `file_path_abs` when present, or resolves the DB
    `file_path` via `db_get_cached_abs_path`. The resolved path must lie
    under the configured cache directory; this is checked *before* the
    filesystem is probed so the response never reveals whether a file
    outside the cache exists.

    Returns:
        The file contents on success, otherwise a ``(message, status)``
        tuple: 404 when the job or file is missing, 403 when the path is
        outside the cache directory, 500 on unexpected errors.
    """
    try:
        from web.db import db_get_cached_abs_path

        j = get_job_by_id(job_id)
        if not j:
            return "Job not found", 404

        # Prefer file_path_abs, fall back to resolving the DB-stored file_path
        abs_fp = j.get('file_path_abs') or None
        if not abs_fp:
            db_fp = j.get('file_path')
            abs_fp = db_get_cached_abs_path(db_fp) if db_fp else None
        if not abs_fp:
            return "Cached file not available", 404

        cache_dir = os.path.abspath(get_cache_dir())
        abs_fp = os.path.abspath(abs_fp)

        # Ensure the file is inside the cache directory. commonpath raises
        # ValueError when the two paths cannot share a prefix (e.g. they are
        # on different drives); that is "outside", not a server error.
        try:
            inside_cache = os.path.commonpath([cache_dir, abs_fp]) == cache_dir
        except ValueError:
            inside_cache = False
        if not inside_cache:
            return "Forbidden", 403

        if not os.path.isfile(abs_fp):
            return "Cached file not available", 404
        return send_file(abs_fp)
    except Exception:
        # Boundary handler: keep the generic response but record the cause
        # so failures are diagnosable.
        app.logger.exception("Error serving cached file for job %s", job_id)
        return "Error serving cached file", 500
@app.route('/jobs/<job_id>/favorite', methods=['POST']) @app.route('/jobs/<job_id>/favorite', methods=['POST'])
def set_favorite(job_id): def set_favorite(job_id):
"""Mark or unmark a job as favorite for a given user. """Mark or unmark a job as favorite for a given user.

View File

@@ -117,6 +117,18 @@ class CachedPage(Base):
listing = relationship("JobListing", back_populates="cached_pages") listing = relationship("JobListing", back_populates="cached_pages")
@property
def abs_path(self) -> Optional[str]:
    """Absolute filesystem path of this cached page, or None.

    `file_path` is joined onto the absolute form of the directory returned
    by `get_cache_dir()`, so callers can rely on `cached_page.abs_path`
    instead of resolving the path themselves. A missing or falsy
    `file_path` yields None.
    """
    relative = getattr(self, 'file_path', None)
    if relative:
        cache_root = os.path.abspath(get_cache_dir())
        return os.path.join(cache_root, relative)
    return None
class UserInteraction(Base): class UserInteraction(Base):
__tablename__ = "user_interactions" __tablename__ = "user_interactions"
@@ -323,16 +335,18 @@ def db_get_all_cached_pages() -> List[Dict[str, Any]]:
with _ensure_session() as session: with _ensure_session() as session:
rows = session.execute(text( rows = session.execute(text(
"SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall() "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
return [ out = []
{ for row in rows:
"file_path": row[0], fp = row[0]
out.append({
"file_path": fp,
"file_path_abs": db_get_cached_abs_path(fp) if fp else None,
"url_guess": row[1], "url_guess": row[1],
"last_modified": row[2], "last_modified": row[2],
"size_bytes": row[3], "size_bytes": row[3],
"job_id": row[4], "job_id": row[4],
} })
for row in rows return out
]
def db_get_cache_url(url: str): def db_get_cache_url(url: str):
@@ -346,8 +360,10 @@ def db_get_cache_url(url: str):
"SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"), {"u": url}).fetchone() "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"), {"u": url}).fetchone()
if not row: if not row:
return None return None
fp = row[0]
return { return {
"file_path": row[0], "file_path": fp,
"file_path_abs": db_get_cached_abs_path(fp) if fp else None,
"url_guess": row[1], "url_guess": row[1],
"last_modified": row[2], "last_modified": row[2],
"size_bytes": row[3], "size_bytes": row[3],

View File

@@ -46,6 +46,11 @@
<p class="job-posted-time">{{ job['posted_time'] }}</p> <p class="job-posted-time">{{ job['posted_time'] }}</p>
<span class="job-region region-{{ job['region'] }}">{{ job['region'] }}</span> <span class="job-region region-{{ job['region'] }}">{{ job['region'] }}</span>
<span class="job-keyword keyword-{{ job['keyword']|replace(' ', '')|lower }}">{{ job['keyword'] }}</span> <span class="job-keyword keyword-{{ job['keyword']|replace(' ', '')|lower }}">{{ job['keyword'] }}</span>
{% if job.get('file_path_abs') or job.get('file_path') %}
<div class="job-cached">
<a href="{{ url_for('serve_cached', job_id=job.get('id') or job.get('job_id')) }}" target="_blank">Cached</a>
</div>
{% endif %}
</div> </div>
{% endfor %} {% endfor %}
</div> </div>

View File

@@ -23,5 +23,13 @@ styles %}{% endblock %} {% block content %}
>{{ job.title }}</a >{{ job.title }}</a
> >
</p> </p>
{% if job.file_path_abs or job.file_path %}
<p>
<strong>Cached copy:</strong>
<a href="{{ url_for('serve_cached', job_id=job.id) }}" target="_blank"
>View cached copy</a
>
</p>
{% endif %}
</div> </div>
{% endblock %} {% endblock %}