From 042a19671870b0b96b884aac69082e491cc188ef Mon Sep 17 00:00:00 2001
From: "georg.sinn-schirwitz"
Date: Mon, 8 Sep 2025 14:44:46 +0200
Subject: [PATCH] remove caching
---
setup.py | 2 +-
tests/test_cache_paths.py | 66 -----------
tests/test_cached_route.py | 53 ---------
tests/test_cachedpage_abs_path.py | 27 -----
tests/test_db_integration.py | 33 ------
web/app.py | 54 +++------
web/craigslist.py | 102 ++++++++--------
web/db.py | 187 ------------------------------
web/scraper.py | 15 +--
web/templates/base.html | 1 +
web/templates/job.html | 8 --
web/templates/scrape.html | 67 +++++++++++
web/utils.py | 54 ++-------
13 files changed, 144 insertions(+), 525 deletions(-)
delete mode 100644 tests/test_cache_paths.py
delete mode 100644 tests/test_cached_route.py
delete mode 100644 tests/test_cachedpage_abs_path.py
create mode 100644 web/templates/scrape.html
diff --git a/setup.py b/setup.py
index 371cffc..11dc996 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python
"""
-MySQL utilities for Craigslist project.
+MySQL utility script for initializing the database and showing row counts.
Usage (PowerShell):
# Ensure MySQL database and tables
diff --git a/tests/test_cache_paths.py b/tests/test_cache_paths.py
deleted file mode 100644
index 11595e6..0000000
--- a/tests/test_cache_paths.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import os
-import tempfile
-import pytest
-
-import web.db as db
-from web.utils import get_cache_dir
-
-
-# Skip unless explicitly enabled (MySQL integration expected)
-if not os.getenv("RUN_DB_TESTS"):
- pytest.skip("Set RUN_DB_TESTS=1 to run cache path integration tests",
- allow_module_level=True)
-
-
-def test_db_sync_inserts_relative_paths(tmp_path, monkeypatch):
- # arrange: create a temporary cache dir and a fake html file
- cache_dir = tmp_path / "cache"
- cache_dir.mkdir()
- f = cache_dir / "example.org_path_to_page_123.html"
- f.write_text("ok")
-
- # point app at this cache dir
- monkeypatch.setenv("PYTEST_CACHE_DIR", str(cache_dir))
- # monkeypatch get_cache_dir used by db functions
- monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir))
-
- # ensure DB initialized
- db.db_init()
-
- # act
- db.db_sync_cached_pages(str(cache_dir))
-
- # assert: DB contains relative path, not absolute
- rows = db.db_get_all_cached_pages()
- assert any(r['file_path'] == os.path.relpath(
- str(f), start=str(cache_dir)) for r in rows)
-
-
-def test_normalize_cached_page_paths_converts_absolute(tmp_path, monkeypatch):
- cache_dir = tmp_path / "cache"
- cache_dir.mkdir()
- # create an actual file
- f = cache_dir / "site_example_page_1.html"
- f.write_text("ok")
-
- monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir))
- db.db_init()
-
- abs_fp = str(f)
- rel_fp = os.path.relpath(abs_fp, start=str(cache_dir))
-
- # Insert an absolute path row directly (simulate legacy data)
- with db._ensure_session() as session:
- session.execute(
- db.text("INSERT INTO cached_pages(file_path, url_guess, last_modified, size_bytes, job_id) VALUES(:fp, :ug, :lm, :sz, :jid)"),
- {"fp": abs_fp, "ug": "https://example.org/page1.html",
- "lm": None, "sz": 10, "jid": None}
- )
- session.commit()
-
- # normalize should convert absolute to relative
- changed = db.normalize_cached_page_paths()
- assert changed >= 1
-
- rows = db.db_get_all_cached_pages()
- assert any(r['file_path'] == rel_fp for r in rows)
diff --git a/tests/test_cached_route.py b/tests/test_cached_route.py
deleted file mode 100644
index bef3aaf..0000000
--- a/tests/test_cached_route.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import os
-import tempfile
-from web.app import app
-
-
-def test_cached_route_serves_file(monkeypatch):
- # Create a temporary file in the configured cache dir
- cache_dir = os.path.abspath(os.path.join(
- os.path.dirname(__file__), '..', 'cache'))
- os.makedirs(cache_dir, exist_ok=True)
- fd, tmp_path = tempfile.mkstemp(
- prefix='test_cached_', suffix='.html', dir=cache_dir)
- os.close(fd)
- with open(tmp_path, 'w', encoding='utf-8') as f:
- f.write('cached')
-
- # Fake job record returned by get_job_by_id
- fake_job = {
- 'id': 'fake123',
- 'job_id': 'fake123',
- 'file_path': os.path.relpath(tmp_path, cache_dir),
- 'file_path_abs': tmp_path,
- }
-
- def fake_get_job_by_id(jid):
- if str(jid) in ('fake123',):
- return fake_job
- return {}
-
- # Patch the symbol imported into web.app
- monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id)
-
- # Request route
- client = app.test_client()
- res = client.get('/cached/fake123')
- assert res.status_code == 200
- assert b'cached' in res.data
-
- # Cleanup
- try:
- os.remove(tmp_path)
- except Exception:
- pass
-
-
-def test_cached_route_missing(monkeypatch):
- def fake_get_job_by_id(jid):
- return {}
-
- monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id)
- client = app.test_client()
- res = client.get('/cached/nope')
- assert res.status_code == 404
diff --git a/tests/test_cachedpage_abs_path.py b/tests/test_cachedpage_abs_path.py
deleted file mode 100644
index 021ac1c..0000000
--- a/tests/test_cachedpage_abs_path.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import os
-from web.db import CachedPage
-from web.utils import get_cache_dir
-
-
-def test_cachedpage_abs_path(tmp_path, monkeypatch):
- # Create a fake cache dir and monkeypatch get_cache_dir
- fake_cache = tmp_path / 'cache'
- fake_cache.mkdir()
- monkeypatch.setenv('PYTHONIOENCODING', 'utf-8')
-
- # Patch the symbol used by CachedPage.abs_path (imported into web.db)
- monkeypatch.setattr('web.db.get_cache_dir', lambda: str(fake_cache))
-
- # Create a CachedPage instance and set file_path attribute
- cp = CachedPage()
- setattr(cp, 'file_path', 'subdir/test.html')
-
- # Ensure the computed absolute path joins the fake cache dir
- expected = os.path.join(os.path.abspath(
- str(fake_cache)), 'subdir/test.html')
- assert cp.abs_path == expected
-
- # When file_path is falsy, abs_path should be None
- cp2 = CachedPage()
- setattr(cp2, 'file_path', None)
- assert cp2.abs_path is None
diff --git a/tests/test_db_integration.py b/tests/test_db_integration.py
index 738ff05..010a99f 100644
--- a/tests/test_db_integration.py
+++ b/tests/test_db_integration.py
@@ -95,39 +95,6 @@ def test_upsert_listing_details_and_urls(db_ready):
pass
-def test_cached_page_upsert_and_get(db_ready):
- jid_suffix = unique_suffix()
- url = f"https://example.org/it/{jid_suffix}.html"
- # Ensure a listing exists for FK relation if enforced
- db.upsert_listing(
- url=url,
- region="it",
- keyword="cache",
- title=f"IT Cache {jid_suffix}",
- pay="N/A",
- location="Test City",
- timestamp=now_iso(),
- )
- fp = f"/tmp/integration_{jid_suffix}.html"
- db.upsert_cached_page(
- file_path=fp,
- url_guess=url,
- last_modified=now_iso(),
- size_bytes=123,
- job_id=int(jid_suffix) if jid_suffix.isdigit() else None,
- )
- row = db.db_get_cache_url(url)
- if row is not None:
- assert row["url_guess"] == url
- # Cleanup
- try:
- db.remove_cached_page(fp)
- db.db_remove_cached_url(url)
- db.db_delete_job(jid_suffix)
- except Exception:
- pass
-
-
def test_user_interactions_mark_and_visit(db_ready):
uname = f"it_user_{unique_suffix()}"
db.create_or_update_user(uname, is_active=True)
diff --git a/web/app.py b/web/app.py
index 154ed61..1312dd9 100644
--- a/web/app.py
+++ b/web/app.py
@@ -1,5 +1,5 @@
import os
-from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, send_file
+from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, Response
from flask_wtf import CSRFProtect
from typing import Dict, List
@@ -34,7 +34,6 @@ from web.utils import (
initialize_users_from_settings,
filter_jobs,
get_job_by_id,
- get_cache_dir,
)
from web.db import get_all_regions, get_all_keywords
@@ -232,39 +231,6 @@ def job_by_id(job_id):
return jsonify({"error": "Job not found"}), 404
-@app.route('/cached/<job_id>', methods=['GET'])
-def serve_cached(job_id):
- """Serve the cached HTML file for a job if available.
-
- Uses the job record's `file_path_abs` when present, or resolves the DB `file_path` via helper.
- Ensures the returned file is located under the configured cache directory to avoid path-traversal.
- """
- try:
- from web.db import db_get_cached_abs_path
- j = get_job_by_id(job_id)
- if not j:
- return "Job not found", 404
-
- # Prefer file_path_abs, fall back to resolving the DB-stored file_path
- abs_fp = j.get('file_path_abs') or None
- if not abs_fp:
- db_fp = j.get('file_path')
- abs_fp = db_get_cached_abs_path(db_fp) if db_fp else None
-
- if not abs_fp or not os.path.isfile(abs_fp):
- return "Cached file not available", 404
-
- cache_dir = os.path.abspath(get_cache_dir())
- abs_fp = os.path.abspath(abs_fp)
- # Ensure the file is inside the cache directory
- if os.path.commonpath([cache_dir, abs_fp]) != cache_dir:
- return "Forbidden", 403
-
- return send_file(abs_fp)
- except Exception:
- return "Error serving cached file", 500
-
-
@app.route('/jobs/<job_id>/favorite', methods=['POST'])
def set_favorite(job_id):
"""Mark or unmark a job as favorite for a given user.
@@ -290,9 +256,21 @@ csrf.exempt(set_favorite)
@app.route('/scrape', methods=['GET'])
def scrape():
- """Trigger the web scraping process."""
- scraper()
- return jsonify({"status": "Scraping completed"})
+ """Trigger the web scraping process with streaming output."""
+ def generate():
+ try:
+ for message in scraper():
+ yield message
+ except Exception as e:
+ yield f"Error during scraping: {str(e)}\n"
+
+ return Response(generate(), mimetype='text/plain')
+
+
+@app.route('/scrape-page', methods=['GET'])
+def scrape_page():
+ """Serve the scrape page with streaming output display."""
+ return render_template('scrape.html', title='Scrape Jobs')
# ---------------- Auth & Admin UI ------------------------------------------
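Note: the new /scrape route streams plain-text progress lines as the server-side generator yields them. A minimal client sketch, assuming the requests library is available and the app is served locally (the base URL is an assumption, not part of this patch):

    import requests  # assumption: requests is installed in the environment

    BASE_URL = "http://127.0.0.1:5000"  # hypothetical local dev address

    # Stream progress lines as the server-side generator yields them.
    with requests.get(f"{BASE_URL}/scrape", stream=True, timeout=None) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines(decode_unicode=True):
            if line:
                print(line)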
diff --git a/web/craigslist.py b/web/craigslist.py
index 7446366..29b0e44 100644
--- a/web/craigslist.py
+++ b/web/craigslist.py
@@ -2,31 +2,19 @@ from datetime import datetime, timezone
from web.scraper import process_region_keyword, scrape_job_page
from web.db import (
db_init,
- upsert_cached_page,
upsert_listing,
upsert_job_details,
url_to_job_id,
upsert_user_interaction,
- db_remove_cached_url,
- db_sync_cached_pages,
db_get_all_job_urls,
- db_get_cache_url,
db_delete_job,
remove_job,
- normalize_cached_page_paths,
)
# Import utility functions
from web.utils import (
- get_cache_dir,
make_request_with_retry,
now_iso,
- get_cache_path,
- cache_page,
- is_cache_stale,
- delete_cached_page,
- get_cached_content,
- ensure_cache_dir
)
from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings
@@ -45,15 +33,26 @@ def fetch_listings():
except Exception:
pass
+ yield "Initializing database and seeding regions/keywords...\n"
+
# Fetch listings for each region/keyword from DB
- for region in get_all_regions():
+ regions = get_all_regions()
+ keywords = get_all_keywords()
+ total_combinations = len(regions) * len(keywords)
+ processed = 0
+
+ yield f"Found {len(regions)} regions and {len(keywords)} keywords. Processing {total_combinations} combinations...\n"
+
+ for region in regions:
region_name = region.get("name")
if not region_name:
continue
- for keyword in get_all_keywords():
+ for keyword in keywords:
keyword_name = keyword.get("name")
if not keyword_name:
continue
+ processed += 1
+ yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
for row in process_region_keyword(region_name, keyword_name, discovered_urls):
timestamp, region, keyword, title, pay, location, url = row
discovered_urls.add(url)
@@ -72,75 +71,68 @@ def fetch_listings():
# Remove stale listings: those present in DB but not discovered now.
stale_urls = existing_db_urls - discovered_urls
- for url in stale_urls:
- try:
- jid = url_to_job_id(url)
- db_delete_job(jid)
- # Also try to remove cached file and its metadata
- delete_cached_page(url)
- db_remove_cached_url(url)
- except Exception:
- pass
+ if stale_urls:
+ yield f"Removing {len(stale_urls)} stale listings...\n"
+ for url in stale_urls:
+ try:
+ jid = url_to_job_id(url)
+ db_delete_job(jid)
+ except Exception:
+ pass
+ yield f"Listing fetch complete: {len(discovered_urls)} discovered, {len(new_rows)} new, {len(stale_urls)} stale\n"
return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}
def process_job_url(job_url: str):
try:
job_id = url_to_job_id(job_url)
- content = None
- cached_page = db_get_cache_url(job_url)
- if cached_page:
- last_modified = cached_page.get("last_modified")
- if last_modified and not is_cache_stale(last_modified):
- content = get_cached_content(job_url)
- else:
- content = make_request_with_retry(job_url, 1)
- else:
- content = make_request_with_retry(job_url, 1)
+ yield f"Fetching job page: {job_url}\n"
+ content = make_request_with_retry(job_url, 1)
if content is None:
+ yield f"Failed to fetch content for {job_url}, removing from database\n"
remove_job(job_url)
return None
- # refresh cache and details
- cache_page(job_url, content)
- upsert_cached_page(
- file_path=get_cache_path(job_url),
- url_guess=job_url,
- last_modified=now_iso(),
- size_bytes=len(content),
- job_id=job_id
- )
+ yield f"Scraping job data from {job_url}\n"
job_data = scrape_job_page(content, job_url)
if job_data:
+ yield f"Upserting job details for {job_id}\n"
upsert_job_details(job_data)
upsert_user_interaction(
job_id, seen_at=datetime.now(timezone.utc).isoformat())
+ yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
return job_data
+ else:
+ yield f"Failed to scrape job data from {job_url}\n"
return None
- except Exception:
+ except Exception as e:
+ yield f"Error processing {job_url}: {str(e)}\n"
return None
def scraper():
"""Main function to run the scraper."""
- ensure_cache_dir()
+ yield "Starting scraper...\n"
db_init()
+ yield "Database initialized\n"
+
# First, fetch current listings from search pages and make DB reflect them.
- jl = fetch_listings()
-
- # Sync any cached files we have on disk into the cached_pages table.
- db_sync_cached_pages(get_cache_dir())
-
- # Normalize any relative cached file paths to absolute paths in DB
- normalized = normalize_cached_page_paths()
- if normalized:
- pass
+ yield "Fetching listings...\n"
+ for message in fetch_listings():
+ yield message
# Finally, fetch and refresh individual job pages for current listings
- for url in db_get_all_job_urls():
- process_job_url(url)
+ job_urls = db_get_all_job_urls()
+ yield f"Processing {len(job_urls)} job pages...\n"
+
+ for i, url in enumerate(job_urls, 1):
+ yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
+ for message in process_job_url(url):
+ yield message
+
+ yield "\nScraping completed successfully!\n"
if __name__ == "__main__":
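Note: the patch forwards output from the nested generators with explicit "for message in ...: yield message" loops. An equivalent spelling uses generator delegation via yield from; a sketch of scraper() written that way, assuming the same module-level imports as web/craigslist.py (an illustration, not part of this patch):

    def scraper():
        """Main function to run the scraper (yield from variant)."""
        yield "Starting scraper...\n"
        db_init()
        yield "Database initialized\n"

        yield "Fetching listings...\n"
        yield from fetch_listings()          # delegate to the listings generator

        job_urls = db_get_all_job_urls()
        yield f"Processing {len(job_urls)} job pages...\n"
        for i, url in enumerate(job_urls, 1):
            yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
            yield from process_job_url(url)  # delegate to the per-job generator

        yield "\nScraping completed successfully!\n"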
diff --git a/web/db.py b/web/db.py
index e8a75d0..0b39058 100644
--- a/web/db.py
+++ b/web/db.py
@@ -4,7 +4,6 @@ from __future__ import annotations
Tables:
- users(user_id PK, username UNIQUE, created_at)
- - cached_pages(file_path PK, url_guess, last_modified, size_bytes, job_id)
- job_listings(job_id PK, url UNIQUE, region, keyword, title, pay, location, timestamp)
- job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url)
- user_interactions(job_id PK FK -> job_listings, user_id FK -> users, seen_at, url_visited, is_user_favorite)
@@ -23,8 +22,6 @@ from web.utils import (
url_to_job_id,
normalize_job_id,
now_iso,
- get_cache_path,
- get_cache_dir,
get_mysql_config,
)
@@ -86,8 +83,6 @@ class JobListing(Base):
description = relationship(
"JobDescription", back_populates="listing", uselist=False, cascade="all, delete-orphan")
- cached_pages = relationship(
- "CachedPage", back_populates="listing", cascade="all, delete-orphan")
interactions = relationship(
"UserInteraction", back_populates="listing", cascade="all, delete-orphan")
@@ -106,30 +101,6 @@ class JobDescription(Base):
listing = relationship("JobListing", back_populates="description")
-class CachedPage(Base):
- __tablename__ = "cached_pages"
- file_path = Column(String(FILE_PATH_LEN), primary_key=True)
- url_guess = Column(String(URL_LEN))
- last_modified = Column(String(TIME_LEN))
- size_bytes = Column(Integer)
- job_id = Column(String(JOB_ID_LEN), ForeignKey(
- "job_listings.job_id", ondelete="CASCADE"))
-
- listing = relationship("JobListing", back_populates="cached_pages")
-
- @property
- def abs_path(self) -> Optional[str]:
- """Return the absolute filesystem path for this cached page.
-
- The DB stores `file_path` relative to the configured cache dir. This
- helper centralizes resolution so callers can use `cached_page.abs_path`.
- """
- fp = getattr(self, 'file_path', None)
- if not fp:
- return None
- return os.path.join(os.path.abspath(get_cache_dir()), fp)
-
-
class UserInteraction(Base):
__tablename__ = "user_interactions"
# composite uniqueness on (user_id, job_id)
@@ -292,139 +263,6 @@ def upsert_job_details(job_data: Dict[str, Any]):
session.commit()
-def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modified: Optional[str], size_bytes: Optional[int], job_id: Optional[int]):
- # Store file paths relative to the cache directory (keeps DB portable)
- cache_dir = os.path.abspath(get_cache_dir())
- abs_fp = os.path.abspath(file_path)
- rel_fp = os.path.relpath(abs_fp, start=cache_dir)
- with _ensure_session() as session:
- obj = session.get(CachedPage, rel_fp)
- if obj is None:
- obj = CachedPage(file_path=rel_fp)
- session.add(obj)
- setattr(obj, "url_guess", url_guess)
- setattr(obj, "last_modified", last_modified)
- setattr(obj, "size_bytes", size_bytes)
- setattr(obj, "job_id", str(job_id) if job_id else None)
- session.commit()
-
-
-def remove_cached_page(file_path: str):
- # Accept absolute or relative input but DB keys are stored relative to cache dir
- cache_dir = os.path.abspath(get_cache_dir())
- abs_fp = os.path.abspath(file_path)
- rel_fp = os.path.relpath(abs_fp, start=cache_dir)
- with _ensure_session() as session:
- obj = session.get(CachedPage, rel_fp)
- if obj:
- session.delete(obj)
- session.commit()
-
-
-def db_remove_cached_url(url: str):
- """Remove a cached page by URL."""
- # Compute absolute path for the URL and delegate to remove_cached_page
- abs_fp = os.path.abspath(get_cache_path(url))
- try:
- remove_cached_page(abs_fp)
- except Exception:
- pass
-
-
-def db_get_all_cached_pages() -> List[Dict[str, Any]]:
- with _ensure_session() as session:
- rows = session.execute(text(
- "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
- out = []
- for row in rows:
- fp = row[0]
- out.append({
- "file_path": fp,
- "file_path_abs": db_get_cached_abs_path(fp) if fp else None,
- "url_guess": row[1],
- "last_modified": row[2],
- "size_bytes": row[3],
- "job_id": row[4],
- })
- return out
-
-
-def db_get_cache_url(url: str):
- """Return the data for a specific URL from cached_pages.
-
- Arguments:
- url -- The URL to look up in the cache.
- """
- with _ensure_session() as session:
- row = session.execute(text(
- "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"), {"u": url}).fetchone()
- if not row:
- return None
- fp = row[0]
- return {
- "file_path": fp,
- "file_path_abs": db_get_cached_abs_path(fp) if fp else None,
- "url_guess": row[1],
- "last_modified": row[2],
- "size_bytes": row[3],
- "job_id": row[4],
- }
-
-
-def db_sync_cached_pages(cache_dir: str):
- """Scan cache_dir and upsert page metadata into cached_pages table."""
- if not os.path.isdir(cache_dir):
- return
- abs_cache = os.path.abspath(cache_dir)
- # read existing DB keys once for quick membership tests
- db_cache_paths = {c["file_path"] for c in db_get_all_cached_pages()}
- for root, _, files in os.walk(abs_cache):
- for name in files:
- if not name.lower().endswith(".html"):
- continue
- fp = os.path.abspath(os.path.join(root, name))
- rel_fp = os.path.relpath(fp, start=abs_cache)
- if rel_fp in db_cache_paths:
- continue
- try:
- stat = os.stat(fp)
- mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
- size = stat.st_size
- except OSError:
- mtime = None
- size = None
- url_guess = get_url_from_filename(name)
- job_id = url_to_job_id(url_guess)
- upsert_cached_page(file_path=rel_fp, url_guess=url_guess,
- last_modified=mtime, size_bytes=size, job_id=job_id)
-
-
-def normalize_cached_page_paths() -> int:
- """Ensure all cached_pages.file_path values are absolute. Returns number of rows updated/normalized."""
- # Convert any absolute paths in DB to relative paths (relative to cache dir)
- changed = 0
- abs_cache = os.path.abspath(get_cache_dir())
- with _ensure_session() as session:
- rows = session.execute(text(
- "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
- for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
- if os.path.isabs(fp):
- rel_fp = os.path.relpath(fp, start=abs_cache)
- upsert_cached_page(
- file_path=rel_fp,
- url_guess=url_guess,
- last_modified=last_modified,
- size_bytes=size_bytes,
- job_id=job_id,
- )
- with _ensure_session() as session:
- session.execute(
- text("DELETE FROM cached_pages WHERE file_path = :fp"), {"fp": fp})
- session.commit()
- changed += 1
- return changed
-
-
def db_get_keywords() -> List[str]:
"""Return a list of all unique keywords from job listings."""
with _ensure_session() as session:
@@ -453,15 +291,10 @@ SELECT l.job_id
,l.timestamp
,d.posted_time
,l.url
-,c.file_path
-,c.last_modified
-,c.url_guess
-,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale
FROM job_listings AS l
INNER JOIN job_descriptions AS d
ON l.job_id = d.job_id
AND l.url = d.url
-LEFT JOIN cached_pages AS c ON l.job_id = c.job_id
ORDER BY d.posted_time DESC
"""
with _ensure_session() as session:
@@ -479,27 +312,11 @@ ORDER BY d.posted_time DESC
"timestamp": row[7],
"posted_time": row[8],
"url": row[9],
- # file_path is stored relative to cache dir; provide both forms
- "file_path": row[10],
- "file_path_abs": os.path.join(os.path.abspath(get_cache_dir()), row[10]) if row[10] else None,
- "last_modified": row[11],
- "url_guess": row[12],
- "url_guess_stale": row[13],
}
jobs.append(job)
return jobs
-def db_get_cached_abs_path(db_file_path: Optional[str]) -> Optional[str]:
- """Return absolute cache file path given a DB-stored (relative) file_path.
-
- Returns None if input is falsy.
- """
- if not db_file_path:
- return None
- return os.path.join(os.path.abspath(get_cache_dir()), db_file_path)
-
-
def db_get_all_job_urls() -> List[str]:
"""Return list of job URLs from job_listings."""
with _ensure_session() as session:
@@ -522,10 +339,6 @@ def remove_job(url):
try:
jid = url_to_job_id(url)
db_delete_job(jid)
- cache_fp = get_cache_path(url)
- remove_cached_page(os.path.abspath(cache_fp))
- if os.path.exists(cache_fp):
- os.remove(cache_fp)
except Exception:
pass
diff --git a/web/scraper.py b/web/scraper.py
index f4d89b4..7adfbcc 100644
--- a/web/scraper.py
+++ b/web/scraper.py
@@ -1,7 +1,7 @@
from datetime import datetime, UTC
from bs4 import BeautifulSoup
from typing import List, Dict, Set
-from web.utils import get_base_url, cache_page, safe_get_text, safe_get_attr, is_cached, get_cached_content, make_request_with_retry
+from web.utils import get_base_url, safe_get_text, safe_get_attr, make_request_with_retry
def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
@@ -108,14 +108,7 @@ def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]
def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
"""Process a single region and keyword."""
url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
- if is_cached(url):
- content = get_cached_content(url)
- cache_status = "CACHED"
- else:
- content = make_request_with_retry(url, 3)
- if content is None:
- return []
- cache_page(url, content)
- cache_status = "FETCHED"
- _ = cache_status # no-op to silence unused var
+ content = make_request_with_retry(url, 3)
+ if content is None:
+ return []
return scrape_job_data(content, region, keyword, seen_urls)
diff --git a/web/templates/base.html b/web/templates/base.html
index ef60b6b..75b2d87 100644
--- a/web/templates/base.html
+++ b/web/templates/base.html
@@ -20,6 +20,7 @@
Home |
Preferences
{% if current_user and current_user.is_admin %} |
+ Scrape Jobs |
Taxonomy |
Users {% endif %} {% if
session.get('username') %} |
diff --git a/web/templates/job.html b/web/templates/job.html
index bf337a4..85948df 100644
--- a/web/templates/job.html
+++ b/web/templates/job.html
@@ -23,13 +23,5 @@ styles %}{% endblock %} {% block content %}
>{{ job.title }}
- {% if job.file_path_abs or job.file_path %}
-
- Cached copy:
- View cached copy
-
- {% endif %}
{% endblock %}
diff --git a/web/templates/scrape.html b/web/templates/scrape.html
new file mode 100644
index 0000000..0533405
--- /dev/null
+++ b/web/templates/scrape.html
@@ -0,0 +1,67 @@
+{% extends "base.html" %} {% block title %}Scrape Jobs{% endblock %} {% block
+content %}
+
+
+Job Scraping Progress
+
+
+
+{% endblock %} {% block scripts %}
+
+{% endblock %}
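Note: the cache-related tests are deleted without a replacement covering the new routes. A minimal sketch of how the streaming route could be exercised with Flask's test client, mirroring the monkeypatch pattern of the removed tests (the file name and fake generator are hypothetical, not part of this patch):

    # tests/test_scrape_stream.py (hypothetical)
    from web.app import app


    def test_scrape_streams_messages(monkeypatch):
        # Substitute a tiny generator so no network or DB work runs.
        def fake_scraper():
            yield "Starting scraper...\n"
            yield "Scraping completed successfully!\n"

        # Patch the symbol imported into web.app, as the removed tests did.
        monkeypatch.setattr('web.app.scraper', fake_scraper)

        client = app.test_client()
        res = client.get('/scrape')
        assert res.status_code == 200
        assert b"Scraping completed successfully!" in res.data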
diff --git a/web/utils.py b/web/utils.py
index 4645d4d..f58dcf0 100644
--- a/web/utils.py
+++ b/web/utils.py
@@ -93,8 +93,7 @@ def get_paths() -> dict:
return get_config().get('paths', {})
-def get_cache_dir() -> str:
- return get_paths().get('cache_dir', 'cache')
+
def get_logs_dir() -> str:
@@ -129,9 +128,7 @@ def get_base_url() -> str:
return get_config().get('scraper', {}).get('base_url', "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel")
-def ensure_cache_dir():
- """Ensure cache directory exists."""
- os.makedirs(get_cache_dir(), exist_ok=True)
+
def now_iso() -> str:
@@ -173,10 +170,7 @@ def get_url_from_filename(name: str) -> str:
return url_guess
-def get_cached_content(url: str) -> str:
- """Get cached content for URL."""
- with open(get_cache_path(url), "r", encoding="utf-8") as f:
- return f.read()
+
def safe_get_text(element, default="N/A"):
@@ -194,51 +188,19 @@ def get_random_delay(min_delay: int = get_min_delay(), max_delay: int = get_max_
return random.uniform(min_delay, max_delay)
-def get_cache_path(url: str) -> str:
- """Get cache file path for URL."""
- return os.path.join(get_cache_dir(), f"{get_filename_from_url(url)}.html")
-def cache_page(url: str, content: str):
- """Cache the page content with a timestamp."""
- cache_path = get_cache_path(url)
- with open(cache_path, "w", encoding="utf-8") as f:
- f.write(content)
- # Update the file's modification time to the current time
- os.utime(cache_path, None)
-def is_cached(url: str) -> bool:
- """Check if the page is cached and not older than 24 hours."""
- cache_path = get_cache_path(url)
- if not os.path.isfile(cache_path):
- return False
-
- # Check the file's age if it's a search result page
- if 'search' in url:
- file_age = time.time() - os.path.getmtime(cache_path)
- if file_age > 24 * 3600: # 24 hours in seconds
- return False
-
- return True
-def is_cache_stale(last_modified: str, days: int = 1) -> bool:
- """Check if the cached page is stale (older than 24 hours)."""
- if not last_modified:
- return True
- last_datetime = datetime.fromisoformat(last_modified)
- file_age = time.time() - last_datetime.timestamp()
- return file_age > days * 24 * 3600 # days in seconds
-def delete_cached_page(url: str):
- cache_fp = get_cache_path(url)
- if os.path.exists(cache_fp):
- try:
- os.remove(cache_fp)
- except Exception:
- pass
+
+
+
+
+
def get_color_from_string(s: str) -> str: