remove caching

georg.sinn-schirwitz
2025-09-08 14:44:46 +02:00
parent f8e23d0fba
commit 042a196718
13 changed files with 144 additions and 525 deletions

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python
"""
-MySQL utilities for Craigslist project.
+MySQL utility script for initializing database and showing row counts.
Usage (PowerShell):
# Ensure MySQL database and tables

View File

@@ -1,66 +0,0 @@
import os
import tempfile
import pytest
import web.db as db
from web.utils import get_cache_dir
# Skip unless explicitly enabled (MySQL integration expected)
if not os.getenv("RUN_DB_TESTS"):
pytest.skip("Set RUN_DB_TESTS=1 to run cache path integration tests",
allow_module_level=True)
def test_db_sync_inserts_relative_paths(tmp_path, monkeypatch):
# arrange: create a temporary cache dir and a fake html file
cache_dir = tmp_path / "cache"
cache_dir.mkdir()
f = cache_dir / "example.org_path_to_page_123.html"
f.write_text("<html>ok</html>")
# point app at this cache dir
monkeypatch.setenv("PYTEST_CACHE_DIR", str(cache_dir))
# monkeypatch get_cache_dir used by db functions
monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir))
# ensure DB initialized
db.db_init()
# act
db.db_sync_cached_pages(str(cache_dir))
# assert: DB contains relative path, not absolute
rows = db.db_get_all_cached_pages()
assert any(r['file_path'] == os.path.relpath(
str(f), start=str(cache_dir)) for r in rows)
def test_normalize_cached_page_paths_converts_absolute(tmp_path, monkeypatch):
cache_dir = tmp_path / "cache"
cache_dir.mkdir()
# create an actual file
f = cache_dir / "site_example_page_1.html"
f.write_text("<html>ok</html>")
monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir))
db.db_init()
abs_fp = str(f)
rel_fp = os.path.relpath(abs_fp, start=str(cache_dir))
# Insert an absolute path row directly (simulate legacy data)
with db._ensure_session() as session:
session.execute(
db.text("INSERT INTO cached_pages(file_path, url_guess, last_modified, size_bytes, job_id) VALUES(:fp, :ug, :lm, :sz, :jid)"),
{"fp": abs_fp, "ug": "https://example.org/page1.html",
"lm": None, "sz": 10, "jid": None}
)
session.commit()
# normalize should convert absolute to relative
changed = db.normalize_cached_page_paths()
assert changed >= 1
rows = db.db_get_all_cached_pages()
assert any(r['file_path'] == rel_fp for r in rows)

View File

@@ -1,53 +0,0 @@
import os
import tempfile
from web.app import app
def test_cached_route_serves_file(monkeypatch):
# Create a temporary file in the configured cache dir
cache_dir = os.path.abspath(os.path.join(
os.path.dirname(__file__), '..', 'cache'))
os.makedirs(cache_dir, exist_ok=True)
fd, tmp_path = tempfile.mkstemp(
prefix='test_cached_', suffix='.html', dir=cache_dir)
os.close(fd)
with open(tmp_path, 'w', encoding='utf-8') as f:
f.write('<html><body>cached</body></html>')
# Fake job record returned by get_job_by_id
fake_job = {
'id': 'fake123',
'job_id': 'fake123',
'file_path': os.path.relpath(tmp_path, cache_dir),
'file_path_abs': tmp_path,
}
def fake_get_job_by_id(jid):
if str(jid) in ('fake123',):
return fake_job
return {}
# Patch the symbol imported into web.app
monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id)
# Request route
client = app.test_client()
res = client.get('/cached/fake123')
assert res.status_code == 200
assert b'cached' in res.data
# Cleanup
try:
os.remove(tmp_path)
except Exception:
pass
def test_cached_route_missing(monkeypatch):
def fake_get_job_by_id(jid):
return {}
monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id)
client = app.test_client()
res = client.get('/cached/nope')
assert res.status_code == 404

View File

@@ -1,27 +0,0 @@
import os
from web.db import CachedPage
from web.utils import get_cache_dir
def test_cachedpage_abs_path(tmp_path, monkeypatch):
# Create a fake cache dir and monkeypatch get_cache_dir
fake_cache = tmp_path / 'cache'
fake_cache.mkdir()
monkeypatch.setenv('PYTHONIOENCODING', 'utf-8')
# Patch the symbol used by CachedPage.abs_path (imported into web.db)
monkeypatch.setattr('web.db.get_cache_dir', lambda: str(fake_cache))
# Create a CachedPage instance and set file_path attribute
cp = CachedPage()
setattr(cp, 'file_path', 'subdir/test.html')
# Ensure the computed absolute path joins the fake cache dir
expected = os.path.join(os.path.abspath(
str(fake_cache)), 'subdir/test.html')
assert cp.abs_path == expected
# When file_path is falsy, abs_path should be None
cp2 = CachedPage()
setattr(cp2, 'file_path', None)
assert cp2.abs_path is None

View File

@@ -95,39 +95,6 @@ def test_upsert_listing_details_and_urls(db_ready):
pass
def test_cached_page_upsert_and_get(db_ready):
jid_suffix = unique_suffix()
url = f"https://example.org/it/{jid_suffix}.html"
# Ensure a listing exists for FK relation if enforced
db.upsert_listing(
url=url,
region="it",
keyword="cache",
title=f"IT Cache {jid_suffix}",
pay="N/A",
location="Test City",
timestamp=now_iso(),
)
fp = f"/tmp/integration_{jid_suffix}.html"
db.upsert_cached_page(
file_path=fp,
url_guess=url,
last_modified=now_iso(),
size_bytes=123,
job_id=int(jid_suffix) if jid_suffix.isdigit() else None,
)
row = db.db_get_cache_url(url)
if row is not None:
assert row["url_guess"] == url
# Cleanup
try:
db.remove_cached_page(fp)
db.db_remove_cached_url(url)
db.db_delete_job(jid_suffix)
except Exception:
pass
def test_user_interactions_mark_and_visit(db_ready):
uname = f"it_user_{unique_suffix()}"
db.create_or_update_user(uname, is_active=True)

View File

@@ -1,5 +1,5 @@
import os
-from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, send_file
+from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, Response
from flask_wtf import CSRFProtect
from typing import Dict, List
@@ -34,7 +34,6 @@ from web.utils import (
initialize_users_from_settings,
filter_jobs,
get_job_by_id,
-get_cache_dir,
)
from web.db import get_all_regions, get_all_keywords
@@ -232,39 +231,6 @@ def job_by_id(job_id):
return jsonify({"error": "Job not found"}), 404
@app.route('/cached/<job_id>', methods=['GET'])
def serve_cached(job_id):
"""Serve the cached HTML file for a job if available.
Uses the job record's `file_path_abs` when present, or resolves the DB `file_path` via helper.
Ensures the returned file is located under the configured cache directory to avoid path-traversal.
"""
try:
from web.db import db_get_cached_abs_path
j = get_job_by_id(job_id)
if not j:
return "Job not found", 404
# Prefer file_path_abs, fall back to resolving the DB-stored file_path
abs_fp = j.get('file_path_abs') or None
if not abs_fp:
db_fp = j.get('file_path')
abs_fp = db_get_cached_abs_path(db_fp) if db_fp else None
if not abs_fp or not os.path.isfile(abs_fp):
return "Cached file not available", 404
cache_dir = os.path.abspath(get_cache_dir())
abs_fp = os.path.abspath(abs_fp)
# Ensure the file is inside the cache directory
if os.path.commonpath([cache_dir, abs_fp]) != cache_dir:
return "Forbidden", 403
return send_file(abs_fp)
except Exception:
return "Error serving cached file", 500
@app.route('/jobs/<job_id>/favorite', methods=['POST'])
def set_favorite(job_id):
"""Mark or unmark a job as favorite for a given user.
@@ -290,9 +256,21 @@ csrf.exempt(set_favorite)
@app.route('/scrape', methods=['GET'])
def scrape():
-"""Trigger the web scraping process."""
+"""Trigger the web scraping process with streaming output."""
-scraper()
+def generate():
-return jsonify({"status": "Scraping completed"})
+try:
+for message in scraper():
+yield message
+except Exception as e:
+yield f"Error during scraping: {str(e)}\n"
+return Response(generate(), mimetype='text/plain')
+@app.route('/scrape-page', methods=['GET'])
+def scrape_page():
+"""Serve the scrape page with streaming output display."""
+return render_template('scrape.html', title='Scrape Jobs')
# ---------------- Auth & Admin UI ------------------------------------------

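Note on the reworked /scrape handler above: instead of running scraper() to completion and returning a single JSON payload, it now wraps a generator in a Flask Response, so each progress line is streamed to the client as it is produced. A minimal, self-contained sketch of that pattern (the scraper_stub generator and the /scrape-demo route name are illustrative placeholders, not part of this commit):

```python
# Minimal sketch of streaming a generator's output through a Flask Response.
# scraper_stub stands in for the project's scraper() generator (assumption).
from flask import Flask, Response

app = Flask(__name__)


def scraper_stub():
    # Yields progress lines one at a time, like the refactored scraper().
    yield "Starting scraper...\n"
    yield "Processing 3 job pages...\n"
    yield "Scraping completed successfully!\n"


@app.route("/scrape-demo", methods=["GET"])
def scrape_demo():
    def generate():
        try:
            for message in scraper_stub():
                yield message
        except Exception as e:
            # Report the failure in-stream instead of a bare 500.
            yield f"Error during scraping: {e}\n"
    return Response(generate(), mimetype="text/plain")


if __name__ == "__main__":
    app.run(debug=True)
```

One design note: text/plain plus chunked transfer keeps the connection open for the whole scrape, so any proxy or WSGI buffering in front of the app must allow streamed responses for the progress lines to appear incrementally.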
View File

@@ -2,31 +2,19 @@ from datetime import datetime, timezone
from web.scraper import process_region_keyword, scrape_job_page
from web.db import (
db_init,
-upsert_cached_page,
upsert_listing,
upsert_job_details,
url_to_job_id,
upsert_user_interaction,
-db_remove_cached_url,
-db_sync_cached_pages,
db_get_all_job_urls,
-db_get_cache_url,
db_delete_job,
remove_job,
-normalize_cached_page_paths,
)
# Import utility functions
from web.utils import (
-get_cache_dir,
make_request_with_retry,
now_iso,
-get_cache_path,
-cache_page,
-is_cache_stale,
-delete_cached_page,
-get_cached_content,
-ensure_cache_dir
)
from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings
@@ -45,15 +33,26 @@ def fetch_listings():
except Exception:
pass
+yield "Initializing database and seeding regions/keywords...\n"
# Fetch listings for each region/keyword from DB
-for region in get_all_regions():
+regions = get_all_regions()
+keywords = get_all_keywords()
+total_combinations = len(regions) * len(keywords)
+processed = 0
+yield f"Found {len(regions)} regions and {len(keywords)} keywords. Processing {total_combinations} combinations...\n"
+for region in regions:
region_name = region.get("name")
if not region_name:
continue
-for keyword in get_all_keywords():
+for keyword in keywords:
keyword_name = keyword.get("name")
if not keyword_name:
continue
+processed += 1
+yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
for row in process_region_keyword(region_name, keyword_name, discovered_urls):
timestamp, region, keyword, title, pay, location, url = row
discovered_urls.add(url)
@@ -72,75 +71,68 @@ def fetch_listings():
# Remove stale listings: those present in DB but not discovered now.
stale_urls = existing_db_urls - discovered_urls
+if stale_urls:
+yield f"Removing {len(stale_urls)} stale listings...\n"
for url in stale_urls:
try:
jid = url_to_job_id(url)
db_delete_job(jid)
-# Also try to remove cached file and its metadata
-delete_cached_page(url)
-db_remove_cached_url(url)
except Exception:
pass
+yield f"Listing fetch complete: {len(discovered_urls)} discovered, {len(new_rows)} new, {len(stale_urls)} stale\n"
return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}
def process_job_url(job_url: str):
try:
job_id = url_to_job_id(job_url)
-content = None
+yield f"Fetching job page: {job_url}\n"
-cached_page = db_get_cache_url(job_url)
-if cached_page:
-last_modified = cached_page.get("last_modified")
-if last_modified and not is_cache_stale(last_modified):
-content = get_cached_content(job_url)
-else:
-content = make_request_with_retry(job_url, 1)
-else:
content = make_request_with_retry(job_url, 1)
if content is None:
+yield f"Failed to fetch content for {job_url}, removing from database\n"
remove_job(job_url)
return None
-# refresh cache and details
+yield f"Scraping job data from {job_url}\n"
-cache_page(job_url, content)
-upsert_cached_page(
-file_path=get_cache_path(job_url),
-url_guess=job_url,
-last_modified=now_iso(),
-size_bytes=len(content),
-job_id=job_id
-)
job_data = scrape_job_page(content, job_url)
if job_data:
+yield f"Upserting job details for {job_id}\n"
upsert_job_details(job_data)
upsert_user_interaction(
job_id, seen_at=datetime.now(timezone.utc).isoformat())
+yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
return job_data
+else:
+yield f"Failed to scrape job data from {job_url}\n"
return None
-except Exception:
+except Exception as e:
+yield f"Error processing {job_url}: {str(e)}\n"
return None
def scraper():
"""Main function to run the scraper."""
-ensure_cache_dir()
+yield "Starting scraper...\n"
db_init()
+yield "Database initialized\n"
# First, fetch current listings from search pages and make DB reflect them.
-jl = fetch_listings()
+yield "Fetching listings...\n"
+for message in fetch_listings():
-# Sync any cached files we have on disk into the cached_pages table.
+yield message
-db_sync_cached_pages(get_cache_dir())
-# Normalize any relative cached file paths to absolute paths in DB
-normalized = normalize_cached_page_paths()
-if normalized:
-pass
# Finally, fetch and refresh individual job pages for current listings
-for url in db_get_all_job_urls():
+job_urls = db_get_all_job_urls()
-process_job_url(url)
+yield f"Processing {len(job_urls)} job pages...\n"
+for i, url in enumerate(job_urls, 1):
+yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
+for message in process_job_url(url):
+yield message
+yield "\nScraping completed successfully!\n"
if __name__ == "__main__":

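A side note on the generator refactor in scraper() above: fetch_listings() still ends with return {"discovered": ..., "new": ..., "stale": ...}, but the new for message in fetch_listings(): yield message loop discards that summary, because a plain for loop swallows a generator's return value. If the summary is wanted again, yield from both forwards the progress messages and hands the return value back; a small illustrative sketch (the *_stub names are placeholders, not project code):

```python
# Sketch of delegating to a sub-generator while keeping its return value.
# fetch_listings_stub mimics the shape of fetch_listings(): it yields
# progress strings and returns a summary dict (names here are assumptions).
def fetch_listings_stub():
    yield "Found 2 regions and 3 keywords. Processing 6 combinations...\n"
    return {"discovered": 6, "new": 2, "stale": 1}


def scraper_stub():
    yield "Starting scraper...\n"
    # "yield from" re-yields every message and captures the return value,
    # which a plain "for message in ...: yield message" loop would drop.
    summary = yield from fetch_listings_stub()
    yield f"Listing fetch summary: {summary}\n"


if __name__ == "__main__":
    for line in scraper_stub():
        print(line, end="")
```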
web/db.py
View File

@@ -4,7 +4,6 @@ from __future__ import annotations
Tables:
- users(user_id PK, username UNIQUE, created_at)
-- cached_pages(file_path PK, url_guess, last_modified, size_bytes, job_id)
- job_listings(job_id PK, url UNIQUE, region, keyword, title, pay, location, timestamp)
- job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url)
- user_interactions(job_id PK FK -> job_listings, user_id FK -> users, seen_at, url_visited, is_user_favorite)
@@ -23,8 +22,6 @@ from web.utils import (
url_to_job_id,
normalize_job_id,
now_iso,
-get_cache_path,
-get_cache_dir,
get_mysql_config,
)
@@ -86,8 +83,6 @@ class JobListing(Base):
description = relationship(
"JobDescription", back_populates="listing", uselist=False, cascade="all, delete-orphan")
-cached_pages = relationship(
-"CachedPage", back_populates="listing", cascade="all, delete-orphan")
interactions = relationship(
"UserInteraction", back_populates="listing", cascade="all, delete-orphan")
@@ -106,30 +101,6 @@ class JobDescription(Base):
listing = relationship("JobListing", back_populates="description")
class CachedPage(Base):
__tablename__ = "cached_pages"
file_path = Column(String(FILE_PATH_LEN), primary_key=True)
url_guess = Column(String(URL_LEN))
last_modified = Column(String(TIME_LEN))
size_bytes = Column(Integer)
job_id = Column(String(JOB_ID_LEN), ForeignKey(
"job_listings.job_id", ondelete="CASCADE"))
listing = relationship("JobListing", back_populates="cached_pages")
@property
def abs_path(self) -> Optional[str]:
"""Return the absolute filesystem path for this cached page.
The DB stores `file_path` relative to the configured cache dir. This
helper centralizes resolution so callers can use `cached_page.abs_path`.
"""
fp = getattr(self, 'file_path', None)
if not fp:
return None
return os.path.join(os.path.abspath(get_cache_dir()), fp)
class UserInteraction(Base):
__tablename__ = "user_interactions"
# composite uniqueness on (user_id, job_id)
@@ -292,139 +263,6 @@ def upsert_job_details(job_data: Dict[str, Any]):
session.commit()
def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modified: Optional[str], size_bytes: Optional[int], job_id: Optional[int]):
# Store file paths relative to the cache directory (keeps DB portable)
cache_dir = os.path.abspath(get_cache_dir())
abs_fp = os.path.abspath(file_path)
rel_fp = os.path.relpath(abs_fp, start=cache_dir)
with _ensure_session() as session:
obj = session.get(CachedPage, rel_fp)
if obj is None:
obj = CachedPage(file_path=rel_fp)
session.add(obj)
setattr(obj, "url_guess", url_guess)
setattr(obj, "last_modified", last_modified)
setattr(obj, "size_bytes", size_bytes)
setattr(obj, "job_id", str(job_id) if job_id else None)
session.commit()
def remove_cached_page(file_path: str):
# Accept absolute or relative input but DB keys are stored relative to cache dir
cache_dir = os.path.abspath(get_cache_dir())
abs_fp = os.path.abspath(file_path)
rel_fp = os.path.relpath(abs_fp, start=cache_dir)
with _ensure_session() as session:
obj = session.get(CachedPage, rel_fp)
if obj:
session.delete(obj)
session.commit()
def db_remove_cached_url(url: str):
"""Remove a cached page by URL."""
# Compute absolute path for the URL and delegate to remove_cached_page
abs_fp = os.path.abspath(get_cache_path(url))
try:
remove_cached_page(abs_fp)
except Exception:
pass
def db_get_all_cached_pages() -> List[Dict[str, Any]]:
with _ensure_session() as session:
rows = session.execute(text(
"SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
out = []
for row in rows:
fp = row[0]
out.append({
"file_path": fp,
"file_path_abs": db_get_cached_abs_path(fp) if fp else None,
"url_guess": row[1],
"last_modified": row[2],
"size_bytes": row[3],
"job_id": row[4],
})
return out
def db_get_cache_url(url: str):
"""Return the data for a specific URL from cached_pages.
Arguments:
url -- The URL to look up in the cache.
"""
with _ensure_session() as session:
row = session.execute(text(
"SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"), {"u": url}).fetchone()
if not row:
return None
fp = row[0]
return {
"file_path": fp,
"file_path_abs": db_get_cached_abs_path(fp) if fp else None,
"url_guess": row[1],
"last_modified": row[2],
"size_bytes": row[3],
"job_id": row[4],
}
def db_sync_cached_pages(cache_dir: str):
"""Scan cache_dir and upsert page metadata into cached_pages table."""
if not os.path.isdir(cache_dir):
return
abs_cache = os.path.abspath(cache_dir)
# read existing DB keys once for quick membership tests
db_cache_paths = {c["file_path"] for c in db_get_all_cached_pages()}
for root, _, files in os.walk(abs_cache):
for name in files:
if not name.lower().endswith(".html"):
continue
fp = os.path.abspath(os.path.join(root, name))
rel_fp = os.path.relpath(fp, start=abs_cache)
if rel_fp in db_cache_paths:
continue
try:
stat = os.stat(fp)
mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
size = stat.st_size
except OSError:
mtime = None
size = None
url_guess = get_url_from_filename(name)
job_id = url_to_job_id(url_guess)
upsert_cached_page(file_path=rel_fp, url_guess=url_guess,
last_modified=mtime, size_bytes=size, job_id=job_id)
def normalize_cached_page_paths() -> int:
"""Ensure all cached_pages.file_path values are absolute. Returns number of rows updated/normalized."""
# Convert any absolute paths in DB to relative paths (relative to cache dir)
changed = 0
abs_cache = os.path.abspath(get_cache_dir())
with _ensure_session() as session:
rows = session.execute(text(
"SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
if os.path.isabs(fp):
rel_fp = os.path.relpath(fp, start=abs_cache)
upsert_cached_page(
file_path=rel_fp,
url_guess=url_guess,
last_modified=last_modified,
size_bytes=size_bytes,
job_id=job_id,
)
with _ensure_session() as session:
session.execute(
text("DELETE FROM cached_pages WHERE file_path = :fp"), {"fp": fp})
session.commit()
changed += 1
return changed
def db_get_keywords() -> List[str]:
"""Return a list of all unique keywords from job listings."""
with _ensure_session() as session:
@@ -453,15 +291,10 @@ SELECT l.job_id
,l.timestamp
,d.posted_time
,l.url
-,c.file_path
-,c.last_modified
-,c.url_guess
-,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale
FROM job_listings AS l
INNER JOIN job_descriptions AS d
ON l.job_id = d.job_id
AND l.url = d.url
-LEFT JOIN cached_pages AS c ON l.job_id = c.job_id
ORDER BY d.posted_time DESC
"""
with _ensure_session() as session:
@@ -479,27 +312,11 @@ ORDER BY d.posted_time DESC
"timestamp": row[7],
"posted_time": row[8],
"url": row[9],
-# file_path is stored relative to cache dir; provide both forms
-"file_path": row[10],
-"file_path_abs": os.path.join(os.path.abspath(get_cache_dir()), row[10]) if row[10] else None,
-"last_modified": row[11],
-"url_guess": row[12],
-"url_guess_stale": row[13],
}
jobs.append(job)
return jobs
def db_get_cached_abs_path(db_file_path: Optional[str]) -> Optional[str]:
"""Return absolute cache file path given a DB-stored (relative) file_path.
Returns None if input is falsy.
"""
if not db_file_path:
return None
return os.path.join(os.path.abspath(get_cache_dir()), db_file_path)
def db_get_all_job_urls() -> List[str]:
"""Return list of job URLs from job_listings."""
with _ensure_session() as session:
@@ -522,10 +339,6 @@ def remove_job(url):
try:
jid = url_to_job_id(url)
db_delete_job(jid)
-cache_fp = get_cache_path(url)
-remove_cached_page(os.path.abspath(cache_fp))
-if os.path.exists(cache_fp):
-os.remove(cache_fp)
except Exception:
pass

View File

@@ -1,7 +1,7 @@
from datetime import datetime, UTC
from bs4 import BeautifulSoup
from typing import List, Dict, Set
-from web.utils import get_base_url, cache_page, safe_get_text, safe_get_attr, is_cached, get_cached_content, make_request_with_retry
+from web.utils import get_base_url, safe_get_text, safe_get_attr, make_request_with_retry
def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
@@ -108,14 +108,7 @@ def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]
def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
"""Process a single region and keyword."""
url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
-if is_cached(url):
-content = get_cached_content(url)
-cache_status = "CACHED"
-else:
content = make_request_with_retry(url, 3)
if content is None:
return []
-cache_page(url, content)
-cache_status = "FETCHED"
-_ = cache_status # no-op to silence unused var
return scrape_job_data(content, region, keyword, seen_urls)

View File

@@ -20,6 +20,7 @@
<a href="{{ url_for('index') }}">Home</a> |
<a href="{{ url_for('user_settings') }}">Preferences</a>
{% if current_user and current_user.is_admin %} |
+<a href="{{ url_for('scrape_page') }}">Scrape Jobs</a> |
<a href="{{ url_for('admin_taxonomy') }}">Taxonomy</a> |
<a href="{{ url_for('admin_users') }}">Users</a> {% endif %} {% if
session.get('username') %} |

View File

@@ -23,13 +23,5 @@ styles %}{% endblock %} {% block content %}
>{{ job.title }}</a
>
</p>
{% if job.file_path_abs or job.file_path %}
<p>
<strong>Cached copy:</strong>
<a href="{{ url_for('serve_cached', job_id=job.id) }}" target="_blank"
>View cached copy</a
>
</p>
{% endif %}
</div>
{% endblock %}

web/templates/scrape.html (new file)
View File

@@ -0,0 +1,67 @@
{% extends "base.html" %} {% block title %}Scrape Jobs{% endblock %} {% block
content %}
<div id="scrape-container">
<h2>Job Scraping Progress</h2>
<button id="start-scrape" onclick="startScrape()">Start Scraping</button>
<div
id="output"
style="
margin-top: 20px;
padding: 10px;
border: 1px solid #ccc;
height: 400px;
overflow-y: auto;
background-color: #f9f9f9;
font-family: monospace;
white-space: pre-wrap;
"
></div>
</div>
{% endblock %} {% block scripts %}
<script>
function startScrape() {
const output = document.getElementById("output");
const startButton = document.getElementById("start-scrape");
output.textContent = "Starting scrape...\n";
startButton.disabled = true;
startButton.textContent = "Scraping...";
fetch("/scrape")
.then((response) => {
const reader = response.body.getReader();
const decoder = new TextDecoder();
function readStream() {
reader
.read()
.then(({ done, value }) => {
if (done) {
output.textContent += "\nScraping completed!";
startButton.disabled = false;
startButton.textContent = "Start Scraping";
return;
}
const chunk = decoder.decode(value, { stream: true });
output.textContent += chunk;
output.scrollTop = output.scrollHeight;
readStream();
})
.catch((error) => {
output.textContent += `\nError: ${error.message}`;
startButton.disabled = false;
startButton.textContent = "Start Scraping";
});
}
readStream();
})
.catch((error) => {
output.textContent = `Error starting scrape: ${error.message}`;
startButton.disabled = false;
startButton.textContent = "Start Scraping";
});
}
</script>
{% endblock %}

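The fetch-based reader in scrape.html above consumes the streamed plain-text response chunk by chunk in the browser. The same endpoint can also be followed from a terminal; a small sketch assuming the requests package and an app running locally on port 5000 (both are assumptions, not part of this commit):

```python
# Sketch: follow the streamed /scrape output from the command line.
# Assumes `pip install requests` and a local dev server on port 5000.
import requests


def follow_scrape(base_url: str = "http://127.0.0.1:5000") -> None:
    # stream=True keeps the connection open so chunks arrive as Flask
    # flushes them, mirroring response.body.getReader() in scrape.html.
    with requests.get(f"{base_url}/scrape", stream=True, timeout=(5, None)) as resp:
        resp.raise_for_status()
        for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
            if chunk:
                print(chunk, end="", flush=True)


if __name__ == "__main__":
    follow_scrape()
```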
View File

@@ -93,8 +93,7 @@ def get_paths() -> dict:
return get_config().get('paths', {})
-def get_cache_dir() -> str:
-return get_paths().get('cache_dir', 'cache')
def get_logs_dir() -> str:
@@ -129,9 +128,7 @@ def get_base_url() -> str:
return get_config().get('scraper', {}).get('base_url', "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel")
-def ensure_cache_dir():
-"""Ensure cache directory exists."""
-os.makedirs(get_cache_dir(), exist_ok=True)
def now_iso() -> str:
@@ -173,10 +170,7 @@ def get_url_from_filename(name: str) -> str:
return url_guess
-def get_cached_content(url: str) -> str:
-"""Get cached content for URL."""
-with open(get_cache_path(url), "r", encoding="utf-8") as f:
-return f.read()
def safe_get_text(element, default="N/A"):
@@ -194,51 +188,19 @@ def get_random_delay(min_delay: int = get_min_delay(), max_delay: int = get_max_
return random.uniform(min_delay, max_delay)
def get_cache_path(url: str) -> str:
"""Get cache file path for URL."""
return os.path.join(get_cache_dir(), f"{get_filename_from_url(url)}.html")
def cache_page(url: str, content: str):
"""Cache the page content with a timestamp."""
cache_path = get_cache_path(url)
with open(cache_path, "w", encoding="utf-8") as f:
f.write(content)
# Update the file's modification time to the current time
os.utime(cache_path, None)
def is_cached(url: str) -> bool:
"""Check if the page is cached and not older than 24 hours."""
cache_path = get_cache_path(url)
if not os.path.isfile(cache_path):
return False
# Check the file's age if it's a search result page
if 'search' in url:
file_age = time.time() - os.path.getmtime(cache_path)
if file_age > 24 * 3600: # 24 hours in seconds
return False
return True
def is_cache_stale(last_modified: str, days: int = 1) -> bool:
"""Check if the cached page is stale (older than 24 hours)."""
if not last_modified:
return True
last_datetime = datetime.fromisoformat(last_modified)
file_age = time.time() - last_datetime.timestamp()
return file_age > days * 24 * 3600 # days in seconds
def delete_cached_page(url: str):
cache_fp = get_cache_path(url)
if os.path.exists(cache_fp):
try:
os.remove(cache_fp)
except Exception:
pass
def get_color_from_string(s: str) -> str: