remove caching

setup.py | 2
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """
-MySQL utilities for Craigslist project.
+MySQL utility script for initializing database and showing row counts.
 
 Usage (PowerShell):
     # Ensure MySQL database and tables

@@ -1,66 +0,0 @@
-import os
-import tempfile
-import pytest
-
-import web.db as db
-from web.utils import get_cache_dir
-
-
-# Skip unless explicitly enabled (MySQL integration expected)
-if not os.getenv("RUN_DB_TESTS"):
-    pytest.skip("Set RUN_DB_TESTS=1 to run cache path integration tests",
-                allow_module_level=True)
-
-
-def test_db_sync_inserts_relative_paths(tmp_path, monkeypatch):
-    # arrange: create a temporary cache dir and a fake html file
-    cache_dir = tmp_path / "cache"
-    cache_dir.mkdir()
-    f = cache_dir / "example.org_path_to_page_123.html"
-    f.write_text("<html>ok</html>")
-
-    # point app at this cache dir
-    monkeypatch.setenv("PYTEST_CACHE_DIR", str(cache_dir))
-    # monkeypatch get_cache_dir used by db functions
-    monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir))
-
-    # ensure DB initialized
-    db.db_init()
-
-    # act
-    db.db_sync_cached_pages(str(cache_dir))
-
-    # assert: DB contains relative path, not absolute
-    rows = db.db_get_all_cached_pages()
-    assert any(r['file_path'] == os.path.relpath(
-        str(f), start=str(cache_dir)) for r in rows)
-
-
-def test_normalize_cached_page_paths_converts_absolute(tmp_path, monkeypatch):
-    cache_dir = tmp_path / "cache"
-    cache_dir.mkdir()
-    # create an actual file
-    f = cache_dir / "site_example_page_1.html"
-    f.write_text("<html>ok</html>")
-
-    monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir))
-    db.db_init()
-
-    abs_fp = str(f)
-    rel_fp = os.path.relpath(abs_fp, start=str(cache_dir))
-
-    # Insert an absolute path row directly (simulate legacy data)
-    with db._ensure_session() as session:
-        session.execute(
-            db.text("INSERT INTO cached_pages(file_path, url_guess, last_modified, size_bytes, job_id) VALUES(:fp, :ug, :lm, :sz, :jid)"),
-            {"fp": abs_fp, "ug": "https://example.org/page1.html",
-             "lm": None, "sz": 10, "jid": None}
-        )
-        session.commit()
-
-    # normalize should convert absolute to relative
-    changed = db.normalize_cached_page_paths()
-    assert changed >= 1
-
-    rows = db.db_get_all_cached_pages()
-    assert any(r['file_path'] == rel_fp for r in rows)

@@ -1,53 +0,0 @@
-import os
-import tempfile
-from web.app import app
-
-
-def test_cached_route_serves_file(monkeypatch):
-    # Create a temporary file in the configured cache dir
-    cache_dir = os.path.abspath(os.path.join(
-        os.path.dirname(__file__), '..', 'cache'))
-    os.makedirs(cache_dir, exist_ok=True)
-    fd, tmp_path = tempfile.mkstemp(
-        prefix='test_cached_', suffix='.html', dir=cache_dir)
-    os.close(fd)
-    with open(tmp_path, 'w', encoding='utf-8') as f:
-        f.write('<html><body>cached</body></html>')
-
-    # Fake job record returned by get_job_by_id
-    fake_job = {
-        'id': 'fake123',
-        'job_id': 'fake123',
-        'file_path': os.path.relpath(tmp_path, cache_dir),
-        'file_path_abs': tmp_path,
-    }
-
-    def fake_get_job_by_id(jid):
-        if str(jid) in ('fake123',):
-            return fake_job
-        return {}
-
-    # Patch the symbol imported into web.app
-    monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id)
-
-    # Request route
-    client = app.test_client()
-    res = client.get('/cached/fake123')
-    assert res.status_code == 200
-    assert b'cached' in res.data
-
-    # Cleanup
-    try:
-        os.remove(tmp_path)
-    except Exception:
-        pass
-
-
-def test_cached_route_missing(monkeypatch):
-    def fake_get_job_by_id(jid):
-        return {}
-
-    monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id)
-    client = app.test_client()
-    res = client.get('/cached/nope')
-    assert res.status_code == 404

@@ -1,27 +0,0 @@
-import os
-from web.db import CachedPage
-from web.utils import get_cache_dir
-
-
-def test_cachedpage_abs_path(tmp_path, monkeypatch):
-    # Create a fake cache dir and monkeypatch get_cache_dir
-    fake_cache = tmp_path / 'cache'
-    fake_cache.mkdir()
-    monkeypatch.setenv('PYTHONIOENCODING', 'utf-8')
-
-    # Patch the symbol used by CachedPage.abs_path (imported into web.db)
-    monkeypatch.setattr('web.db.get_cache_dir', lambda: str(fake_cache))
-
-    # Create a CachedPage instance and set file_path attribute
-    cp = CachedPage()
-    setattr(cp, 'file_path', 'subdir/test.html')
-
-    # Ensure the computed absolute path joins the fake cache dir
-    expected = os.path.join(os.path.abspath(
-        str(fake_cache)), 'subdir/test.html')
-    assert cp.abs_path == expected
-
-    # When file_path is falsy, abs_path should be None
-    cp2 = CachedPage()
-    setattr(cp2, 'file_path', None)
-    assert cp2.abs_path is None

@@ -95,39 +95,6 @@ def test_upsert_listing_details_and_urls(db_ready):
         pass
 
 
-def test_cached_page_upsert_and_get(db_ready):
-    jid_suffix = unique_suffix()
-    url = f"https://example.org/it/{jid_suffix}.html"
-    # Ensure a listing exists for FK relation if enforced
-    db.upsert_listing(
-        url=url,
-        region="it",
-        keyword="cache",
-        title=f"IT Cache {jid_suffix}",
-        pay="N/A",
-        location="Test City",
-        timestamp=now_iso(),
-    )
-    fp = f"/tmp/integration_{jid_suffix}.html"
-    db.upsert_cached_page(
-        file_path=fp,
-        url_guess=url,
-        last_modified=now_iso(),
-        size_bytes=123,
-        job_id=int(jid_suffix) if jid_suffix.isdigit() else None,
-    )
-    row = db.db_get_cache_url(url)
-    if row is not None:
-        assert row["url_guess"] == url
-    # Cleanup
-    try:
-        db.remove_cached_page(fp)
-        db.db_remove_cached_url(url)
-        db.db_delete_job(jid_suffix)
-    except Exception:
-        pass
-
-
 def test_user_interactions_mark_and_visit(db_ready):
     uname = f"it_user_{unique_suffix()}"
     db.create_or_update_user(uname, is_active=True)

web/app.py | 54
@@ -1,5 +1,5 @@
 import os
-from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, send_file
+from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, Response
 from flask_wtf import CSRFProtect
 from typing import Dict, List
 
@@ -34,7 +34,6 @@ from web.utils import (
     initialize_users_from_settings,
     filter_jobs,
     get_job_by_id,
-    get_cache_dir,
 )
 from web.db import get_all_regions, get_all_keywords
 
@@ -232,39 +231,6 @@ def job_by_id(job_id):
     return jsonify({"error": "Job not found"}), 404
 
 
-@app.route('/cached/<job_id>', methods=['GET'])
-def serve_cached(job_id):
-    """Serve the cached HTML file for a job if available.
-
-    Uses the job record's `file_path_abs` when present, or resolves the DB `file_path` via helper.
-    Ensures the returned file is located under the configured cache directory to avoid path-traversal.
-    """
-    try:
-        from web.db import db_get_cached_abs_path
-        j = get_job_by_id(job_id)
-        if not j:
-            return "Job not found", 404
-
-        # Prefer file_path_abs, fall back to resolving the DB-stored file_path
-        abs_fp = j.get('file_path_abs') or None
-        if not abs_fp:
-            db_fp = j.get('file_path')
-            abs_fp = db_get_cached_abs_path(db_fp) if db_fp else None
-
-        if not abs_fp or not os.path.isfile(abs_fp):
-            return "Cached file not available", 404
-
-        cache_dir = os.path.abspath(get_cache_dir())
-        abs_fp = os.path.abspath(abs_fp)
-        # Ensure the file is inside the cache directory
-        if os.path.commonpath([cache_dir, abs_fp]) != cache_dir:
-            return "Forbidden", 403
-
-        return send_file(abs_fp)
-    except Exception:
-        return "Error serving cached file", 500
-
-
 @app.route('/jobs/<job_id>/favorite', methods=['POST'])
 def set_favorite(job_id):
     """Mark or unmark a job as favorite for a given user.
@@ -290,9 +256,21 @@ csrf.exempt(set_favorite)
 
 @app.route('/scrape', methods=['GET'])
 def scrape():
-    """Trigger the web scraping process."""
-    scraper()
-    return jsonify({"status": "Scraping completed"})
+    """Trigger the web scraping process with streaming output."""
+    def generate():
+        try:
+            for message in scraper():
+                yield message
+        except Exception as e:
+            yield f"Error during scraping: {str(e)}\n"
+
+    return Response(generate(), mimetype='text/plain')
+
+
+@app.route('/scrape-page', methods=['GET'])
+def scrape_page():
+    """Serve the scrape page with streaming output display."""
+    return render_template('scrape.html', title='Scrape Jobs')
 
 
 # ---------------- Auth & Admin UI ------------------------------------------
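
The reworked `/scrape` route streams progress lines as `text/plain` instead of returning one JSON payload, so a client has to read the body incrementally (for example `curl -N http://localhost:5000/scrape`). A minimal consumption sketch from Python follows; the `requests` dependency and the localhost:5000 address are assumptions, not part of this commit:

    import requests  # assumption: requests is installed and the app is served on localhost:5000

    with requests.get("http://localhost:5000/scrape", stream=True) as resp:
        resp.raise_for_status()
        # each line arrives as the server-side generate() yields another progress message
        for line in resp.iter_lines(decode_unicode=True):
            print(line)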

@@ -2,31 +2,19 @@ from datetime import datetime, timezone
 from web.scraper import process_region_keyword, scrape_job_page
 from web.db import (
     db_init,
-    upsert_cached_page,
     upsert_listing,
     upsert_job_details,
     url_to_job_id,
     upsert_user_interaction,
-    db_remove_cached_url,
-    db_sync_cached_pages,
     db_get_all_job_urls,
-    db_get_cache_url,
     db_delete_job,
     remove_job,
-    normalize_cached_page_paths,
 )
 
 # Import utility functions
 from web.utils import (
-    get_cache_dir,
     make_request_with_retry,
     now_iso,
-    get_cache_path,
-    cache_page,
-    is_cache_stale,
-    delete_cached_page,
-    get_cached_content,
-    ensure_cache_dir
 )
 from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings
 
@@ -45,15 +33,26 @@ def fetch_listings():
     except Exception:
         pass
 
+    yield "Initializing database and seeding regions/keywords...\n"
+
     # Fetch listings for each region/keyword from DB
-    for region in get_all_regions():
+    regions = get_all_regions()
+    keywords = get_all_keywords()
+    total_combinations = len(regions) * len(keywords)
+    processed = 0
+
+    yield f"Found {len(regions)} regions and {len(keywords)} keywords. Processing {total_combinations} combinations...\n"
+
+    for region in regions:
         region_name = region.get("name")
         if not region_name:
             continue
-        for keyword in get_all_keywords():
+        for keyword in keywords:
             keyword_name = keyword.get("name")
             if not keyword_name:
                 continue
+            processed += 1
+            yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
             for row in process_region_keyword(region_name, keyword_name, discovered_urls):
                 timestamp, region, keyword, title, pay, location, url = row
                 discovered_urls.add(url)
@@ -72,75 +71,68 @@ def fetch_listings():
 
     # Remove stale listings: those present in DB but not discovered now.
     stale_urls = existing_db_urls - discovered_urls
+    if stale_urls:
+        yield f"Removing {len(stale_urls)} stale listings...\n"
     for url in stale_urls:
         try:
             jid = url_to_job_id(url)
             db_delete_job(jid)
-            # Also try to remove cached file and its metadata
-            delete_cached_page(url)
-            db_remove_cached_url(url)
         except Exception:
             pass
 
+    yield f"Listing fetch complete: {len(discovered_urls)} discovered, {len(new_rows)} new, {len(stale_urls)} stale\n"
     return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}
 
 
 def process_job_url(job_url: str):
     try:
         job_id = url_to_job_id(job_url)
-        content = None
-        cached_page = db_get_cache_url(job_url)
-        if cached_page:
-            last_modified = cached_page.get("last_modified")
-            if last_modified and not is_cache_stale(last_modified):
-                content = get_cached_content(job_url)
-            else:
-                content = make_request_with_retry(job_url, 1)
-        else:
-            content = make_request_with_retry(job_url, 1)
+        yield f"Fetching job page: {job_url}\n"
+        content = make_request_with_retry(job_url, 1)
 
         if content is None:
+            yield f"Failed to fetch content for {job_url}, removing from database\n"
            remove_job(job_url)
            return None
 
-        # refresh cache and details
-        cache_page(job_url, content)
-        upsert_cached_page(
-            file_path=get_cache_path(job_url),
-            url_guess=job_url,
-            last_modified=now_iso(),
-            size_bytes=len(content),
-            job_id=job_id
-        )
+        yield f"Scraping job data from {job_url}\n"
         job_data = scrape_job_page(content, job_url)
         if job_data:
+            yield f"Upserting job details for {job_id}\n"
             upsert_job_details(job_data)
             upsert_user_interaction(
                 job_id, seen_at=datetime.now(timezone.utc).isoformat())
+            yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
             return job_data
+        else:
+            yield f"Failed to scrape job data from {job_url}\n"
             return None
-    except Exception:
+    except Exception as e:
+        yield f"Error processing {job_url}: {str(e)}\n"
         return None
 
 
 def scraper():
     """Main function to run the scraper."""
-    ensure_cache_dir()
+    yield "Starting scraper...\n"
     db_init()
+    yield "Database initialized\n"
 
     # First, fetch current listings from search pages and make DB reflect them.
-    jl = fetch_listings()
-    # Sync any cached files we have on disk into the cached_pages table.
-    db_sync_cached_pages(get_cache_dir())
-
-    # Normalize any relative cached file paths to absolute paths in DB
-    normalized = normalize_cached_page_paths()
-    if normalized:
-        pass
+    yield "Fetching listings...\n"
+    for message in fetch_listings():
+        yield message
 
     # Finally, fetch and refresh individual job pages for current listings
-    for url in db_get_all_job_urls():
-        process_job_url(url)
+    job_urls = db_get_all_job_urls()
+    yield f"Processing {len(job_urls)} job pages...\n"
+
+    for i, url in enumerate(job_urls, 1):
+        yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
+        for message in process_job_url(url):
+            yield message
+
+    yield "\nScraping completed successfully!\n"
 
 
 if __name__ == "__main__":
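
With this change `scraper()`, `fetch_listings()` and `process_job_url()` are generators: calling them only builds a generator object, and nothing executes until something iterates it, which is why the `/scrape` route above loops over `scraper()`. A minimal standalone driver is sketched below; the module that defines `scraper()` is not named in this capture, so the import path is a placeholder:

    # Hypothetical driver, not part of the commit; replace the import with the
    # real module that defines scraper() (its filename is not visible here).
    from web.scraper_main import scraper  # placeholder import path

    def main() -> None:
        # Drain the generator so the scrape actually runs; each yielded string
        # is a progress line that already ends with "\n".
        for message in scraper():
            print(message, end="")

    if __name__ == "__main__":
        main()

The explicit `for message in fetch_listings(): yield message` loops could equally be written as `yield from fetch_listings()`; either way, a bare call like the old `scraper()` line no longer runs anything on its own.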

web/db.py | 187
@@ -4,7 +4,6 @@ from __future__ import annotations
 
 Tables:
 - users(user_id PK, username UNIQUE, created_at)
-- cached_pages(file_path PK, url_guess, last_modified, size_bytes, job_id)
 - job_listings(job_id PK, url UNIQUE, region, keyword, title, pay, location, timestamp)
 - job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url)
 - user_interactions(job_id PK FK -> job_listings, user_id FK -> users, seen_at, url_visited, is_user_favorite)
@@ -23,8 +22,6 @@ from web.utils import (
     url_to_job_id,
     normalize_job_id,
     now_iso,
-    get_cache_path,
-    get_cache_dir,
     get_mysql_config,
 )
 
@@ -86,8 +83,6 @@ class JobListing(Base):
 
     description = relationship(
         "JobDescription", back_populates="listing", uselist=False, cascade="all, delete-orphan")
-    cached_pages = relationship(
-        "CachedPage", back_populates="listing", cascade="all, delete-orphan")
     interactions = relationship(
         "UserInteraction", back_populates="listing", cascade="all, delete-orphan")
 
@@ -106,30 +101,6 @@ class JobDescription(Base):
     listing = relationship("JobListing", back_populates="description")
 
 
-class CachedPage(Base):
-    __tablename__ = "cached_pages"
-    file_path = Column(String(FILE_PATH_LEN), primary_key=True)
-    url_guess = Column(String(URL_LEN))
-    last_modified = Column(String(TIME_LEN))
-    size_bytes = Column(Integer)
-    job_id = Column(String(JOB_ID_LEN), ForeignKey(
-        "job_listings.job_id", ondelete="CASCADE"))
-
-    listing = relationship("JobListing", back_populates="cached_pages")
-
-    @property
-    def abs_path(self) -> Optional[str]:
-        """Return the absolute filesystem path for this cached page.
-
-        The DB stores `file_path` relative to the configured cache dir. This
-        helper centralizes resolution so callers can use `cached_page.abs_path`.
-        """
-        fp = getattr(self, 'file_path', None)
-        if not fp:
-            return None
-        return os.path.join(os.path.abspath(get_cache_dir()), fp)
-
-
 class UserInteraction(Base):
     __tablename__ = "user_interactions"
     # composite uniqueness on (user_id, job_id)
@@ -292,139 +263,6 @@ def upsert_job_details(job_data: Dict[str, Any]):
         session.commit()
 
 
-def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modified: Optional[str], size_bytes: Optional[int], job_id: Optional[int]):
-    # Store file paths relative to the cache directory (keeps DB portable)
-    cache_dir = os.path.abspath(get_cache_dir())
-    abs_fp = os.path.abspath(file_path)
-    rel_fp = os.path.relpath(abs_fp, start=cache_dir)
-    with _ensure_session() as session:
-        obj = session.get(CachedPage, rel_fp)
-        if obj is None:
-            obj = CachedPage(file_path=rel_fp)
-            session.add(obj)
-        setattr(obj, "url_guess", url_guess)
-        setattr(obj, "last_modified", last_modified)
-        setattr(obj, "size_bytes", size_bytes)
-        setattr(obj, "job_id", str(job_id) if job_id else None)
-        session.commit()
-
-
-def remove_cached_page(file_path: str):
-    # Accept absolute or relative input but DB keys are stored relative to cache dir
-    cache_dir = os.path.abspath(get_cache_dir())
-    abs_fp = os.path.abspath(file_path)
-    rel_fp = os.path.relpath(abs_fp, start=cache_dir)
-    with _ensure_session() as session:
-        obj = session.get(CachedPage, rel_fp)
-        if obj:
-            session.delete(obj)
-            session.commit()
-
-
-def db_remove_cached_url(url: str):
-    """Remove a cached page by URL."""
-    # Compute absolute path for the URL and delegate to remove_cached_page
-    abs_fp = os.path.abspath(get_cache_path(url))
-    try:
-        remove_cached_page(abs_fp)
-    except Exception:
-        pass
-
-
-def db_get_all_cached_pages() -> List[Dict[str, Any]]:
-    with _ensure_session() as session:
-        rows = session.execute(text(
-            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
-        out = []
-        for row in rows:
-            fp = row[0]
-            out.append({
-                "file_path": fp,
-                "file_path_abs": db_get_cached_abs_path(fp) if fp else None,
-                "url_guess": row[1],
-                "last_modified": row[2],
-                "size_bytes": row[3],
-                "job_id": row[4],
-            })
-        return out
-
-
-def db_get_cache_url(url: str):
-    """Return the data for a specific URL from cached_pages.
-
-    Arguments:
-    url -- The URL to look up in the cache.
-    """
-    with _ensure_session() as session:
-        row = session.execute(text(
-            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"), {"u": url}).fetchone()
-        if not row:
-            return None
-        fp = row[0]
-        return {
-            "file_path": fp,
-            "file_path_abs": db_get_cached_abs_path(fp) if fp else None,
-            "url_guess": row[1],
-            "last_modified": row[2],
-            "size_bytes": row[3],
-            "job_id": row[4],
-        }
-
-
-def db_sync_cached_pages(cache_dir: str):
-    """Scan cache_dir and upsert page metadata into cached_pages table."""
-    if not os.path.isdir(cache_dir):
-        return
-    abs_cache = os.path.abspath(cache_dir)
-    # read existing DB keys once for quick membership tests
-    db_cache_paths = {c["file_path"] for c in db_get_all_cached_pages()}
-    for root, _, files in os.walk(abs_cache):
-        for name in files:
-            if not name.lower().endswith(".html"):
-                continue
-            fp = os.path.abspath(os.path.join(root, name))
-            rel_fp = os.path.relpath(fp, start=abs_cache)
-            if rel_fp in db_cache_paths:
-                continue
-            try:
-                stat = os.stat(fp)
-                mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
-                size = stat.st_size
-            except OSError:
-                mtime = None
-                size = None
-            url_guess = get_url_from_filename(name)
-            job_id = url_to_job_id(url_guess)
-            upsert_cached_page(file_path=rel_fp, url_guess=url_guess,
-                               last_modified=mtime, size_bytes=size, job_id=job_id)
-
-
-def normalize_cached_page_paths() -> int:
-    """Ensure all cached_pages.file_path values are absolute. Returns number of rows updated/normalized."""
-    # Convert any absolute paths in DB to relative paths (relative to cache dir)
-    changed = 0
-    abs_cache = os.path.abspath(get_cache_dir())
-    with _ensure_session() as session:
-        rows = session.execute(text(
-            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
-    for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
-        if os.path.isabs(fp):
-            rel_fp = os.path.relpath(fp, start=abs_cache)
-            upsert_cached_page(
-                file_path=rel_fp,
-                url_guess=url_guess,
-                last_modified=last_modified,
-                size_bytes=size_bytes,
-                job_id=job_id,
-            )
-            with _ensure_session() as session:
-                session.execute(
-                    text("DELETE FROM cached_pages WHERE file_path = :fp"), {"fp": fp})
-                session.commit()
-            changed += 1
-    return changed
-
-
 def db_get_keywords() -> List[str]:
     """Return a list of all unique keywords from job listings."""
     with _ensure_session() as session:
@@ -453,15 +291,10 @@ SELECT l.job_id
     ,l.timestamp
     ,d.posted_time
     ,l.url
-    ,c.file_path
-    ,c.last_modified
-    ,c.url_guess
-    ,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale
 FROM job_listings AS l
 INNER JOIN job_descriptions AS d
     ON l.job_id = d.job_id
     AND l.url = d.url
-LEFT JOIN cached_pages AS c ON l.job_id = c.job_id
 ORDER BY d.posted_time DESC
 """
     with _ensure_session() as session:
@@ -479,27 +312,11 @@ ORDER BY d.posted_time DESC
             "timestamp": row[7],
             "posted_time": row[8],
             "url": row[9],
-            # file_path is stored relative to cache dir; provide both forms
-            "file_path": row[10],
-            "file_path_abs": os.path.join(os.path.abspath(get_cache_dir()), row[10]) if row[10] else None,
-            "last_modified": row[11],
-            "url_guess": row[12],
-            "url_guess_stale": row[13],
         }
         jobs.append(job)
     return jobs
 
 
-def db_get_cached_abs_path(db_file_path: Optional[str]) -> Optional[str]:
-    """Return absolute cache file path given a DB-stored (relative) file_path.
-
-    Returns None if input is falsy.
-    """
-    if not db_file_path:
-        return None
-    return os.path.join(os.path.abspath(get_cache_dir()), db_file_path)
-
-
 def db_get_all_job_urls() -> List[str]:
     """Return list of job URLs from job_listings."""
     with _ensure_session() as session:
@@ -522,10 +339,6 @@ def remove_job(url):
     try:
         jid = url_to_job_id(url)
         db_delete_job(jid)
-        cache_fp = get_cache_path(url)
-        remove_cached_page(os.path.abspath(cache_fp))
-        if os.path.exists(cache_fp):
-            os.remove(cache_fp)
     except Exception:
         pass
 
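
With the `CachedPage` model and its helpers gone, `db_init()` presumably no longer creates or touches the `cached_pages` table (assuming it builds tables from the ORM metadata), so a database created before this commit is simply left with an orphaned table rather than migrated. If that leftover table is unwanted, a one-off statement through the session helper used throughout web/db.py is enough; this is an optional sketch, not something the commit itself does:

    # Optional one-off cleanup, not part of the commit: drop the cached_pages
    # table left behind on databases created before this change.
    from sqlalchemy import text

    from web.db import _ensure_session  # session helper shown in the diff above

    def drop_legacy_cached_pages_table() -> None:
        with _ensure_session() as session:
            session.execute(text("DROP TABLE IF EXISTS cached_pages"))
            session.commit()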

@@ -1,7 +1,7 @@
 from datetime import datetime, UTC
 from bs4 import BeautifulSoup
 from typing import List, Dict, Set
-from web.utils import get_base_url, cache_page, safe_get_text, safe_get_attr, is_cached, get_cached_content, make_request_with_retry
+from web.utils import get_base_url, safe_get_text, safe_get_attr, make_request_with_retry
 
 
 def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
@@ -108,14 +108,7 @@ def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]
 def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
     """Process a single region and keyword."""
     url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
-    if is_cached(url):
-        content = get_cached_content(url)
-        cache_status = "CACHED"
-    else:
-        content = make_request_with_retry(url, 3)
+    content = make_request_with_retry(url, 3)
     if content is None:
         return []
-    cache_page(url, content)
-    cache_status = "FETCHED"
-    _ = cache_status  # no-op to silence unused var
     return scrape_job_data(content, region, keyword, seen_urls)

@@ -20,6 +20,7 @@
       <a href="{{ url_for('index') }}">Home</a> |
       <a href="{{ url_for('user_settings') }}">Preferences</a>
       {% if current_user and current_user.is_admin %} |
+      <a href="{{ url_for('scrape_page') }}">Scrape Jobs</a> |
       <a href="{{ url_for('admin_taxonomy') }}">Taxonomy</a> |
       <a href="{{ url_for('admin_users') }}">Users</a> {% endif %} {% if
       session.get('username') %} |

@@ -23,13 +23,5 @@ styles %}{% endblock %} {% block content %}
       >{{ job.title }}</a
     >
   </p>
-  {% if job.file_path_abs or job.file_path %}
-  <p>
-    <strong>Cached copy:</strong>
-    <a href="{{ url_for('serve_cached', job_id=job.id) }}" target="_blank"
-      >View cached copy</a
-    >
-  </p>
-  {% endif %}
 </div>
 {% endblock %}

web/templates/scrape.html | 67 (new file)
@@ -0,0 +1,67 @@
+{% extends "base.html" %} {% block title %}Scrape Jobs{% endblock %} {% block
+content %}
+<div id="scrape-container">
+  <h2>Job Scraping Progress</h2>
+  <button id="start-scrape" onclick="startScrape()">Start Scraping</button>
+  <div
+    id="output"
+    style="
+      margin-top: 20px;
+      padding: 10px;
+      border: 1px solid #ccc;
+      height: 400px;
+      overflow-y: auto;
+      background-color: #f9f9f9;
+      font-family: monospace;
+      white-space: pre-wrap;
+    "
+  ></div>
+</div>
+{% endblock %} {% block scripts %}
+<script>
+  function startScrape() {
+    const output = document.getElementById("output");
+    const startButton = document.getElementById("start-scrape");
+
+    output.textContent = "Starting scrape...\n";
+    startButton.disabled = true;
+    startButton.textContent = "Scraping...";
+
+    fetch("/scrape")
+      .then((response) => {
+        const reader = response.body.getReader();
+        const decoder = new TextDecoder();
+
+        function readStream() {
+          reader
+            .read()
+            .then(({ done, value }) => {
+              if (done) {
+                output.textContent += "\nScraping completed!";
+                startButton.disabled = false;
+                startButton.textContent = "Start Scraping";
+                return;
+              }
+
+              const chunk = decoder.decode(value, { stream: true });
+              output.textContent += chunk;
+              output.scrollTop = output.scrollHeight;
+              readStream();
+            })
+            .catch((error) => {
+              output.textContent += `\nError: ${error.message}`;
+              startButton.disabled = false;
+              startButton.textContent = "Start Scraping";
+            });
+        }
+
+        readStream();
+      })
+      .catch((error) => {
+        output.textContent = `Error starting scrape: ${error.message}`;
+        startButton.disabled = false;
+        startButton.textContent = "Start Scraping";
+      });
+  }
+</script>
+{% endblock %}

web/utils.py | 54
@@ -93,8 +93,7 @@ def get_paths() -> dict:
     return get_config().get('paths', {})
 
 
-def get_cache_dir() -> str:
-    return get_paths().get('cache_dir', 'cache')
-
-
 def get_logs_dir() -> str:
@@ -129,9 +128,7 @@ def get_base_url() -> str:
     return get_config().get('scraper', {}).get('base_url', "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel")
 
 
-def ensure_cache_dir():
-    """Ensure cache directory exists."""
-    os.makedirs(get_cache_dir(), exist_ok=True)
-
-
 def now_iso() -> str:
@@ -173,10 +170,7 @@ def get_url_from_filename(name: str) -> str:
     return url_guess
 
 
-def get_cached_content(url: str) -> str:
-    """Get cached content for URL."""
-    with open(get_cache_path(url), "r", encoding="utf-8") as f:
-        return f.read()
-
-
 def safe_get_text(element, default="N/A"):
@@ -194,51 +188,19 @@ def get_random_delay(min_delay: int = get_min_delay(), max_delay: int = get_max_
     return random.uniform(min_delay, max_delay)
 
 
-def get_cache_path(url: str) -> str:
-    """Get cache file path for URL."""
-    return os.path.join(get_cache_dir(), f"{get_filename_from_url(url)}.html")
-
-
-def cache_page(url: str, content: str):
-    """Cache the page content with a timestamp."""
-    cache_path = get_cache_path(url)
-    with open(cache_path, "w", encoding="utf-8") as f:
-        f.write(content)
-    # Update the file's modification time to the current time
-    os.utime(cache_path, None)
-
-
-def is_cached(url: str) -> bool:
-    """Check if the page is cached and not older than 24 hours."""
-    cache_path = get_cache_path(url)
-    if not os.path.isfile(cache_path):
-        return False
-
-    # Check the file's age if it's a search result page
-    if 'search' in url:
-        file_age = time.time() - os.path.getmtime(cache_path)
-        if file_age > 24 * 3600:  # 24 hours in seconds
-            return False
-
-    return True
-
-
-def is_cache_stale(last_modified: str, days: int = 1) -> bool:
-    """Check if the cached page is stale (older than 24 hours)."""
-    if not last_modified:
-        return True
-    last_datetime = datetime.fromisoformat(last_modified)
-    file_age = time.time() - last_datetime.timestamp()
-    return file_age > days * 24 * 3600  # days in seconds
-
-
-def delete_cached_page(url: str):
-    cache_fp = get_cache_path(url)
-    if os.path.exists(cache_fp):
-        try:
-            os.remove(cache_fp)
-        except Exception:
-            pass
 
 
 def get_color_from_string(s: str) -> str: