initial project commit

georg.sinn-schirwitz
2025-08-29 15:07:58 +02:00
parent 38708e6d1d
commit 23a67d7fe1
31 changed files with 3433 additions and 0 deletions

web/__init__.py Normal file

web/app.py Normal file

@@ -0,0 +1,457 @@
import os
from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash
from flask_wtf import CSRFProtect
from typing import Dict, List
from web.craigslist import scraper
from web.db import (
db_init,
get_all_jobs,
mark_favorite,
record_visit,
get_users,
create_or_update_user,
verify_user_credentials,
get_user,
get_user_regions,
get_user_keywords,
set_user_regions,
set_user_keywords,
get_all_regions,
get_all_keywords,
upsert_region,
upsert_keyword,
list_regions_full,
list_keywords_full,
rename_region,
rename_keyword,
change_region_color,
change_keyword_color
)
from web.utils import (
initialize_users_from_settings,
filter_jobs,
get_job_by_id,
)
app = Flask(__name__)
app.secret_key = os.environ.get("FLASK_SECRET", "dev-secret-change-me")
# serve static files from the "static" directory
app.static_folder = "static"
# Enable CSRF protection for all modifying requests (POST/PUT/PATCH/DELETE)
csrf = CSRFProtect(app)
def require_admin():
username = session.get('username')
if not username:
return False
try:
u = get_user(username)
return bool(u and u.get('is_admin') and u.get('is_active'))
except Exception:
return False
def require_login():
return bool(session.get('username'))
@app.context_processor
def inject_user_context():
username = session.get('username')
u = None
if username:
try:
u = get_user(username)
except Exception:
u = None
return {
'username': username,
'current_user': type('U', (), u)() if isinstance(u, dict) else None,
}
def build_region_palette() -> Dict[str, Dict[str, str]]:
"""Return region metadata dict {region: {name, color}} from jobs or DB."""
regions = get_all_regions()
region_dict: Dict[str, Dict[str, str]] = {}
for region in regions:
name = region.get('name', '')
color = region.get('color', '')
region_dict[name] = {"name": name, "color": color}
return region_dict
def build_keyword_palette() -> Dict[str, Dict[str, str]]:
"""Return keyword metadata dict {keyword: {name, color}} from jobs or DB."""
keywords = get_all_keywords()
keyword_dict: Dict[str, Dict[str, str]] = {}
for keyword in keywords:
        raw_name = keyword.get('name', '')
        # Normalize the key for CSS class lookups; keep the display name intact
        key = raw_name.replace(' ', '').lower()
        color = keyword.get('color', '')
        keyword_dict[key] = {"name": raw_name, "color": color}
return keyword_dict
@app.route('/', methods=['GET'])
def index():
title = "Bobby Job Listings"
all_jobs = get_all_jobs()
# Apply user preference filters if no explicit filters provided
selected_region = request.args.get("region")
selected_keyword = request.args.get("keyword")
    if not selected_region and session.get('username'):
        try:
            prefs = get_user_regions(session['username'])
            if prefs:
                # If the user has region prefs, filter to them by default.
                # get_user_regions returns dicts, so compare by name.
                names = {r.get('name') for r in prefs}
                all_jobs = [j for j in all_jobs if j.get('region') in names]
        except Exception:
            pass
    if not selected_keyword and session.get('username'):
        try:
            prefs = get_user_keywords(session['username'])
            if prefs:
                names = {k.get('name') for k in prefs}
                all_jobs = [j for j in all_jobs if j.get('keyword') in names]
        except Exception:
            pass
filtered_jobs = filter_jobs(all_jobs, selected_region, selected_keyword)
return render_template(
"index.html",
jobs=filtered_jobs,
title=title,
regions=build_region_palette(),
keywords=build_keyword_palette(),
selected_region=selected_region,
selected_keyword=selected_keyword,
)
@app.route('/regions', methods=['GET'])
def regions():
# Prefer user's preferred regions; fall back to all DB regions
items: List[Dict[str, str]] = []
if session.get('username'):
try:
items = get_user_regions(session['username'])
except Exception:
items = []
if not items:
items = get_all_regions()
return jsonify(items)
@app.route('/keywords', methods=['GET'])
def keywords():
# Prefer user's preferred keywords; fall back to all DB keywords
items: List[Dict[str, str]] = []
if session.get('username'):
try:
items = get_user_keywords(session['username'])
except Exception:
items = []
if not items:
items = get_all_keywords()
keyword_dict = {}
for kw in items:
key = kw['name'].replace(' ', '').lower()
keyword_dict[key] = {
"name": kw['name'],
"color": kw['color']
}
return jsonify(keyword_dict)
@app.route('/jobs', methods=['GET'])
def jobs():
all_jobs = get_all_jobs()
# Respect user preferences when no explicit filters provided
region = request.args.get("region")
keyword = request.args.get("keyword")
    if not region and session.get('username'):
        try:
            prefs = get_user_regions(session['username'])
            if prefs:
                # Preference helpers return dicts; compare by name
                names = {r.get('name') for r in prefs}
                all_jobs = [j for j in all_jobs if j.get('region') in names]
        except Exception:
            pass
    if not keyword and session.get('username'):
        try:
            prefs = get_user_keywords(session['username'])
            if prefs:
                names = {k.get('name') for k in prefs}
                all_jobs = [j for j in all_jobs if j.get('keyword') in names]
        except Exception:
            pass
return jsonify(filter_jobs(all_jobs, region, keyword))
@app.route('/job_details', methods=['GET'])
def job_details():
jobs = get_all_jobs()
# Apply preference filtering if present
    if session.get('username'):
        try:
            # Preference helpers return dicts; compare by name
            r = {x.get('name') for x in get_user_regions(session['username'])}
            k = {x.get('name') for x in get_user_keywords(session['username'])}
            if r:
                jobs = [j for j in jobs if j.get('region') in r]
            if k:
                jobs = [j for j in jobs if j.get('keyword') in k]
        except Exception:
            pass
return jsonify(jobs)
@app.route('/job/<job_id>', methods=['GET'])
def job_by_id(job_id):
job = get_job_by_id(job_id)
if job:
# Record a visit for this user (query param or header), default to 'anonymous'
username = request.args.get("username") or request.headers.get(
"X-Username") or "anonymous"
try:
record_visit(str(job.get('id') or job_id),
username=username, url=job.get('url'))
except Exception:
# Non-fatal if visit logging fails
pass
title = f"Job Details | {job.get('title', 'Unknown')} | ID {job.get('id', '')}"
return render_template('job.html', job=job, title=title)
return jsonify({"error": "Job not found"}), 404
@app.route('/jobs/<job_id>/favorite', methods=['POST'])
def set_favorite(job_id):
"""Mark or unmark a job as favorite for a given user.
Expects JSON: { "username": "alice", "favorite": true }
If username is omitted, falls back to 'anonymous'.
"""
data = request.get_json(silent=True) or {}
username = data.get("username") or request.headers.get(
"X-Username") or "anonymous"
favorite = bool(data.get("favorite", True))
try:
mark_favorite(str(job_id), username=username, favorite=favorite)
return jsonify({"status": "ok", "job_id": str(job_id), "username": username, "favorite": favorite})
except Exception as e:
return jsonify({"status": "error", "message": str(e)}), 400
# Exempt JSON favorite endpoint from CSRF (uses fetch without token). Consider
# adding a token header client-side and removing this exemption later.
csrf.exempt(set_favorite)
@app.route('/scrape', methods=['GET'])
def scrape():
"""Trigger the web scraping process."""
# Run the full scraper orchestration (fetch listings, sync cache, process jobs)
scraper()
return jsonify({"status": "Scraping completed"})
# ---------------- Auth & Admin UI ------------------------------------------
@app.route('/login', methods=['GET', 'POST'])
def login():
if request.method == 'POST':
username = (request.form.get('username') or '').strip()
password = request.form.get('password') or ''
        if verify_user_credentials(username, password):
session['username'] = username
flash('Logged in')
            return redirect(url_for('index'))
flash('Invalid credentials')
return render_template('admin/login.html', title='Login')
@app.route('/logout')
def logout():
session.pop('username', None)
flash('Logged out')
return redirect(url_for('login'))
@app.route('/admin/users', methods=['GET', 'POST'])
def admin_users():
if not require_admin():
return redirect(url_for('login'))
if request.method == 'POST':
data = request.form
username = (data.get('username') or '').strip()
password = data.get('password') or None
is_admin = bool(data.get('is_admin'))
is_active = bool(data.get('is_active')) if data.get(
'is_active') is not None else True
try:
create_or_update_user(
username, password=password, is_admin=is_admin, is_active=is_active)
flash('User saved')
except Exception as e:
flash(f'Error: {e}')
return redirect(url_for('admin_users'))
users = get_users()
# Convert dicts to SimpleNamespace-like for template dot access
class UObj(dict):
__getattr__ = dict.get
users = [UObj(u) for u in users]
return render_template('admin/users.html', users=users, title='Users')
# ---------------- User settings (regions/keywords) -------------------------
@app.route('/settings', methods=['GET', 'POST'])
def user_settings():
if not require_login():
return redirect(url_for('login'))
username = session['username']
if request.method == 'POST':
# Accept JSON or form posts. Normalize singular/plural names.
sel_regions: list[str] = []
sel_keywords: list[str] = []
if request.is_json:
data = request.get_json(silent=True) or {}
sel_regions = [
(v or '').strip() for v in (data.get('regions') or []) if v and (v or '').strip()
]
sel_keywords = [
(v or '').strip() for v in (data.get('keywords') or []) if v and (v or '').strip()
]
else:
# HTML form fallback: support names 'regions' or 'region', 'keywords' or 'keyword'
r_vals = request.form.getlist(
'regions') + request.form.getlist('region')
k_vals = request.form.getlist(
'keywords') + request.form.getlist('keyword')
sel_regions = [(v or '').strip()
for v in r_vals if v and (v or '').strip()]
sel_keywords = [(v or '').strip()
for v in k_vals if v and (v or '').strip()]
# Upsert any new values into master lists
for r in sel_regions:
try:
upsert_region(r)
except Exception:
pass
for k in sel_keywords:
try:
upsert_keyword(k)
except Exception:
pass
try:
set_user_regions(username, sel_regions)
set_user_keywords(username, sel_keywords)
# For JSON callers, return 200 without redirect
if request.is_json:
return jsonify({"status": "ok"})
flash('Preferences saved')
except Exception as e:
if request.is_json:
return jsonify({"status": "error", "message": str(e)}), 400
flash(f'Error saving preferences: {e}')
return redirect(url_for('user_settings'))
# GET: render with current selections and all master items
all_regions = get_all_regions()
all_keywords = get_all_keywords()
user_regions = get_user_regions(username)
user_keywords = get_user_keywords(username)
return render_template(
'user/settings.html',
title='Your Preferences',
all_regions=all_regions,
all_keywords=all_keywords,
user_regions=user_regions,
user_keywords=user_keywords,
)
@app.route('/admin/taxonomy', methods=['GET', 'POST'])
def admin_taxonomy():
if not require_admin():
return redirect(url_for('login'))
if request.method == 'POST':
action = request.form.get('action')
try:
if action == 'add_region':
name = (request.form.get('region_name') or '').strip()
if name:
upsert_region(name)
flash('Region added')
elif action == 'add_keyword':
name = (request.form.get('keyword_name') or '').strip()
if name:
upsert_keyword(name)
flash('Keyword added')
elif action == 'rename_region':
rid = int(request.form.get('region_id') or 0)
new_name = (request.form.get('new_region_name') or '').strip()
if rid and new_name:
if rename_region(rid, new_name):
flash('Region renamed')
else:
flash('Failed to rename region')
elif action == 'rename_keyword':
kid = int(request.form.get('keyword_id') or 0)
new_name = (request.form.get('new_keyword_name') or '').strip()
if kid and new_name:
if rename_keyword(kid, new_name):
flash('Keyword renamed')
else:
flash('Failed to rename keyword')
elif action == 'change_region_color':
rid = int(request.form.get('region_id') or 0)
new_color = (request.form.get(
'new_region_color') or '').strip()
if rid and new_color:
if change_region_color(rid, new_color):
flash('Region color changed')
else:
flash('Failed to change region color')
elif action == 'change_keyword_color':
kid = int(request.form.get('keyword_id') or 0)
new_color = (request.form.get(
'new_keyword_color') or '').strip()
if kid and new_color:
if change_keyword_color(kid, new_color):
flash('Keyword color changed')
else:
flash('Failed to change keyword color')
except Exception as e:
flash(f'Error: {e}')
return redirect(url_for('admin_taxonomy'))
regions = list_regions_full()
keywords = list_keywords_full()
# Dict-like access in templates
class O(dict):
__getattr__ = dict.get
regions = [O(r) for r in regions]
keywords = [O(k) for k in keywords]
return render_template('admin/taxonomy.html', title='Taxonomy', regions=regions, keywords=keywords)
def main():
"""Main function to run the Flask app."""
# Ensure DB is initialized
db_init()
# Seed users from settings.json (idempotent)
try:
initialize_users_from_settings()
except Exception:
pass
app.run(debug=True, host='127.0.0.1', port=5000)
if __name__ == "__main__":
main()
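
The favorite endpoint above is CSRF-exempt and accepts plain JSON, so it can be exercised directly over HTTP. A minimal sketch, assuming the dev server started by main() is running on 127.0.0.1:5000 and a listing with this hypothetical job_id exists:

import requests  # third-party HTTP client

resp = requests.post(
    "http://127.0.0.1:5000/jobs/1234567890/favorite",  # hypothetical job_id
    json={"username": "alice", "favorite": True},
)
print(resp.json())  # e.g. {'status': 'ok', 'job_id': '1234567890', 'username': 'alice', 'favorite': True}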

web/craigslist.py Normal file

@@ -0,0 +1,147 @@
from datetime import datetime, timezone
from web.scraper import process_region_keyword, scrape_job_page
from web.db import (
db_init,
upsert_cached_page,
upsert_listing,
upsert_job_details,
url_to_job_id,
upsert_user_interaction,
db_remove_cached_url,
db_sync_cached_pages,
db_get_all_job_urls,
db_get_cache_url,
db_delete_job,
remove_job,
normalize_cached_page_paths,
)
# Import utility functions
from web.utils import (
get_cache_dir,
make_request_with_retry,
now_iso,
get_cache_path,
cache_page,
is_cache_stale,
delete_cached_page,
get_cached_content,
ensure_cache_dir
)
from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings
def fetch_listings():
"""Fetch job listings from all regions and keywords."""
# We'll collect URLs discovered in this run and then remove any DB listings
# not present in this set (treat DB as reflecting current search results).
existing_db_urls = set(db_get_all_job_urls())
discovered_urls = set()
new_rows = []
# Ensure regions/keywords master lists exist
try:
seed_regions_keywords_from_listings()
except Exception:
pass
# Fetch listings for each region/keyword from DB
for region in get_all_regions():
region_name = region.get("name")
if not region_name:
continue
for keyword in get_all_keywords():
keyword_name = keyword.get("name")
if not keyword_name:
continue
            for row in process_region_keyword(region_name, keyword_name, discovered_urls):
                # Unpack into fresh names so the outer loop variables
                # (the region/keyword dicts) are not shadowed
                timestamp, row_region, row_keyword, title, pay, location, url = row
                discovered_urls.add(url)
                if url not in existing_db_urls:
                    new_rows.append(row)
                # Upsert the listing so it reflects the current search result
                upsert_listing(
                    url=url,
                    region=row_region,
                    keyword=row_keyword,
                    title=title,
                    pay=pay,
                    location=location,
                    timestamp=timestamp,
                )
# Remove stale listings: those present in DB but not discovered now.
stale_urls = existing_db_urls - discovered_urls
for url in stale_urls:
try:
jid = url_to_job_id(url)
db_delete_job(jid)
# Also try to remove cached file and its metadata
delete_cached_page(url)
db_remove_cached_url(url)
except Exception:
pass
return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}
def process_job_url(job_url: str):
try:
job_id = url_to_job_id(job_url)
content = None
cached_page = db_get_cache_url(job_url)
if cached_page:
last_modified = cached_page.get("last_modified")
if last_modified and not is_cache_stale(last_modified):
content = get_cached_content(job_url)
else:
content = make_request_with_retry(job_url, 1)
else:
content = make_request_with_retry(job_url, 1)
if content is None:
remove_job(job_url)
return None
# refresh cache and details
cache_page(job_url, content)
upsert_cached_page(
file_path=get_cache_path(job_url),
url_guess=job_url,
last_modified=now_iso(),
size_bytes=len(content),
job_id=job_id
)
job_data = scrape_job_page(content, job_url)
if job_data:
upsert_job_details(job_data)
upsert_user_interaction(
job_id, seen_at=datetime.now(timezone.utc).isoformat())
return job_data
return None
except Exception:
return None
def scraper():
"""Main function to run the scraper."""
ensure_cache_dir()
db_init()
# First, fetch current listings from search pages and make DB reflect them.
jl = fetch_listings()
# Sync any cached files we have on disk into the cached_pages table.
db_sync_cached_pages(get_cache_dir())
# Normalize any relative cached file paths to absolute paths in DB
    normalize_cached_page_paths()
# Finally, fetch and refresh individual job pages for current listings
for url in db_get_all_job_urls():
process_job_url(url)
if __name__ == "__main__":
scraper()
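
fetch_listings() returns a small stats dict, which makes the orchestration easy to smoke-test without inspecting the tables. A hedged sketch, assuming MySQL is reachable via get_mysql_config() and the regions/keywords tables are populated:

from web.craigslist import fetch_listings
from web.db import db_init

db_init()
stats = fetch_listings()
print(stats)  # e.g. {'discovered': 42, 'new': 5, 'stale': 3}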

web/db.py Normal file

@@ -0,0 +1,906 @@
"""MySQL persistence layer for Craigslist scraper (SQLAlchemy ORM only).

Tables:
- users(user_id PK, username UNIQUE, created_at, password_hash, is_admin, is_active, last_login)
- cached_pages(file_path PK, url_guess, last_modified, size_bytes, job_id)
- job_listings(job_id PK, url UNIQUE, region, keyword, title, pay, location, timestamp)
- job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url)
- user_interactions(job_id PK FK -> job_listings, user_id FK -> users, seen_at, url_visited, is_user_favorite)
- regions(region_id PK, name UNIQUE, color)
- keywords(keyword_id PK, name UNIQUE, color)
- user_regions(user_id FK -> users, region_id FK -> regions, composite PK)
- user_keywords(user_id FK -> users, keyword_id FK -> keywords, composite PK)
"""
from __future__ import annotations
from datetime import datetime, UTC
import os
from typing import Optional, Dict, Any, List
from web.utils import (
get_url_from_filename,
get_color_from_string,
url_to_job_id,
normalize_job_id,
now_iso,
get_cache_path,
get_mysql_config,
)
# --- SQLAlchemy setup -------------------------------------------------------
from sqlalchemy import (
create_engine,
Column,
String,
Integer,
Text,
DateTime,
Boolean,
ForeignKey,
text,
)
from sqlalchemy.orm import declarative_base, relationship, sessionmaker, Session
from werkzeug.security import generate_password_hash, check_password_hash
from typing import cast
engine = None # set in db_init()
SessionLocal: Optional[sessionmaker] = None
Base = declarative_base()
# Length constants for MySQL compatibility
JOB_ID_LEN = 64
URL_LEN = 512
FILE_PATH_LEN = 512
TITLE_LEN = 512
SHORT_LEN = 255
TIME_LEN = 64
# --- ORM Models --------------------------------------------------------------
class User(Base):
__tablename__ = "users"
user_id = Column(Integer, primary_key=True, autoincrement=True)
username = Column(String(SHORT_LEN), unique=True, nullable=False)
    # datetime.utcnow is deprecated; use an aware UTC timestamp like the rest of the module
    created_at = Column(DateTime, default=lambda: datetime.now(UTC), nullable=False)
password_hash = Column(String(SHORT_LEN))
is_admin = Column(Boolean, default=False, nullable=False)
is_active = Column(Boolean, default=True, nullable=False)
last_login = Column(DateTime, nullable=True)
interactions = relationship(
"UserInteraction", back_populates="user", cascade="all, delete-orphan")
class JobListing(Base):
__tablename__ = "job_listings"
job_id = Column(String(JOB_ID_LEN), primary_key=True)
url = Column(String(URL_LEN), unique=True)
region = Column(String(SHORT_LEN))
keyword = Column(String(SHORT_LEN))
title = Column(String(TITLE_LEN))
pay = Column(String(SHORT_LEN))
location = Column(String(SHORT_LEN))
timestamp = Column(String(TIME_LEN))
description = relationship(
"JobDescription", back_populates="listing", uselist=False, cascade="all, delete-orphan")
cached_pages = relationship(
"CachedPage", back_populates="listing", cascade="all, delete-orphan")
interactions = relationship(
"UserInteraction", back_populates="listing", cascade="all, delete-orphan")
class JobDescription(Base):
__tablename__ = "job_descriptions"
job_id = Column(String(JOB_ID_LEN), ForeignKey("job_listings.job_id",
ondelete="CASCADE"), primary_key=True)
title = Column(String(TITLE_LEN))
company = Column(String(SHORT_LEN))
location = Column(String(SHORT_LEN))
description = Column(Text)
posted_time = Column(String(TIME_LEN))
url = Column(String(URL_LEN))
listing = relationship("JobListing", back_populates="description")
class CachedPage(Base):
__tablename__ = "cached_pages"
file_path = Column(String(FILE_PATH_LEN), primary_key=True)
url_guess = Column(String(URL_LEN))
last_modified = Column(String(TIME_LEN))
size_bytes = Column(Integer)
job_id = Column(String(JOB_ID_LEN), ForeignKey(
"job_listings.job_id", ondelete="CASCADE"))
listing = relationship("JobListing", back_populates="cached_pages")
class UserInteraction(Base):
__tablename__ = "user_interactions"
# composite uniqueness on (user_id, job_id)
job_id = Column(String(JOB_ID_LEN), ForeignKey("job_listings.job_id",
ondelete="CASCADE"), primary_key=True)
user_id = Column(Integer, ForeignKey(
"users.user_id", ondelete="CASCADE"), primary_key=True)
seen_at = Column(String(TIME_LEN))
url_visited = Column(String(URL_LEN))
is_user_favorite = Column(Boolean, default=False)
user = relationship("User", back_populates="interactions")
listing = relationship("JobListing", back_populates="interactions")
# --- New preference models: regions, keywords, and user mappings ----------
class Region(Base):
__tablename__ = "regions"
region_id = Column(Integer, primary_key=True, autoincrement=True)
name = Column(String(SHORT_LEN), unique=True, nullable=False)
color = Column(String(SHORT_LEN), nullable=True)
class Keyword(Base):
__tablename__ = "keywords"
keyword_id = Column(Integer, primary_key=True, autoincrement=True)
name = Column(String(SHORT_LEN), unique=True, nullable=False)
color = Column(String(SHORT_LEN), nullable=True)
class UserRegion(Base):
__tablename__ = "user_regions"
user_id = Column(Integer, ForeignKey(
"users.user_id", ondelete="CASCADE"), primary_key=True)
region_id = Column(Integer, ForeignKey(
"regions.region_id", ondelete="CASCADE"), primary_key=True)
class UserKeyword(Base):
__tablename__ = "user_keywords"
user_id = Column(Integer, ForeignKey(
"users.user_id", ondelete="CASCADE"), primary_key=True)
keyword_id = Column(Integer, ForeignKey(
"keywords.keyword_id", ondelete="CASCADE"), primary_key=True)
def _ensure_session() -> Session:
global engine, SessionLocal
if engine is None or SessionLocal is None:
db_init()
assert SessionLocal is not None
return cast(Session, SessionLocal())
def db_init():
"""Initialize MySQL database and create tables if needed."""
global engine, SessionLocal
cfg = get_mysql_config()
# Create database if it doesn't exist
root_url = f"mysql+pymysql://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/"
dbname = cfg["database"]
root_engine = create_engine(root_url, future=True)
with root_engine.begin() as conn:
conn.execute(text(
f"CREATE DATABASE IF NOT EXISTS `{dbname}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"))
# Create tables in target DB
mysql_url = f"mysql+pymysql://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/{dbname}?charset=utf8mb4"
engine = create_engine(mysql_url, future=True)
SessionLocal = sessionmaker(bind=engine, autoflush=False,
autocommit=False, future=True)
Base.metadata.create_all(engine)
# Ensure new auth columns exist for existing databases (MySQL/MariaDB support IF NOT EXISTS)
with engine.begin() as conn:
try:
conn.execute(text(
"ALTER TABLE users ADD COLUMN IF NOT EXISTS password_hash VARCHAR(255) NULL"))
except Exception:
pass
try:
conn.execute(text(
"ALTER TABLE users ADD COLUMN IF NOT EXISTS is_admin TINYINT(1) NOT NULL DEFAULT 0"))
except Exception:
pass
try:
conn.execute(text(
"ALTER TABLE users ADD COLUMN IF NOT EXISTS is_active TINYINT(1) NOT NULL DEFAULT 1"))
except Exception:
pass
try:
conn.execute(
text("ALTER TABLE users ADD COLUMN IF NOT EXISTS last_login DATETIME NULL"))
except Exception:
pass
def upsert_user_interaction(job_id: str | int, *, user_id: Optional[int] = None, seen_at: Optional[str] = None, url_visited: Optional[str] = None, is_user_favorite: Optional[bool] = None):
"""Upsert a single interaction row for this job.
Any provided field will be updated; absent fields keep their current value.
"""
if user_id is None:
user_id = get_or_create_user("anonymous")
job_id_str = str(job_id)
with _ensure_session() as session:
ui = session.get(UserInteraction, {
"job_id": job_id_str, "user_id": int(user_id)})
if ui is None:
ui = UserInteraction(job_id=job_id_str, user_id=int(user_id))
session.add(ui)
if seen_at is not None:
setattr(ui, "seen_at", seen_at)
if url_visited is not None:
setattr(ui, "url_visited", url_visited)
if is_user_favorite is not None:
setattr(ui, "is_user_favorite", bool(is_user_favorite))
session.commit()
def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str, location: str, timestamp: str):
"""Insert or update a job listing row based on job_id derived from URL."""
job_id = str(url_to_job_id(url))
with _ensure_session() as session:
obj = session.get(JobListing, job_id)
if obj is None:
obj = JobListing(job_id=job_id)
session.add(obj)
setattr(obj, "url", url)
setattr(obj, "region", region)
setattr(obj, "keyword", keyword)
setattr(obj, "title", title)
setattr(obj, "pay", pay)
setattr(obj, "location", location)
setattr(obj, "timestamp", timestamp)
session.commit()
def upsert_job_details(job_data: Dict[str, Any]):
"""Upsert into job_descriptions table using scraped job details dict."""
url = job_data.get("url")
job_id = normalize_job_id(job_data.get("id"), url)
if not job_id:
return
title = job_data.get("title") or None
company = job_data.get("company") or None
location = job_data.get("location") or None
description = job_data.get("description") or None
posted_time = job_data.get("posted_time") or None
job_id = str(job_id)
with _ensure_session() as session:
obj = session.get(JobDescription, job_id)
if obj is None:
obj = JobDescription(job_id=job_id)
session.add(obj)
setattr(obj, "title", title)
setattr(obj, "company", company)
setattr(obj, "location", location)
setattr(obj, "description", description)
setattr(obj, "posted_time", posted_time)
setattr(obj, "url", url)
session.commit()
def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modified: Optional[str], size_bytes: Optional[int], job_id: Optional[str | int]):
# Always store absolute paths
abs_fp = os.path.abspath(file_path)
with _ensure_session() as session:
obj = session.get(CachedPage, abs_fp)
if obj is None:
obj = CachedPage(file_path=abs_fp)
session.add(obj)
setattr(obj, "url_guess", url_guess)
setattr(obj, "last_modified", last_modified)
setattr(obj, "size_bytes", size_bytes)
setattr(obj, "job_id", str(job_id) if job_id else None)
session.commit()
def remove_cached_page(file_path: str):
# Accept either relative or absolute; remove both variants just in case
abs_fp = os.path.abspath(file_path)
with _ensure_session() as session:
obj = session.get(CachedPage, abs_fp)
if obj:
session.delete(obj)
session.commit()
def db_remove_cached_url(url: str):
"""Remove a cached page by URL."""
abs_fp = get_cache_path(url)
try:
remove_cached_page(abs_fp)
except Exception:
pass
def db_get_all_cached_pages() -> List[Dict[str, Any]]:
with _ensure_session() as session:
rows = session.execute(text(
"SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
return [
{
"file_path": row[0],
"url_guess": row[1],
"last_modified": row[2],
"size_bytes": row[3],
"job_id": row[4],
}
for row in rows
]
def db_get_cache_url(url: str):
"""Return the data for a specific URL from cached_pages.
Arguments:
url -- The URL to look up in the cache.
"""
with _ensure_session() as session:
row = session.execute(text(
"SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"), {"u": url}).fetchone()
if not row:
return None
return {
"file_path": row[0],
"url_guess": row[1],
"last_modified": row[2],
"size_bytes": row[3],
"job_id": row[4],
}
def db_sync_cached_pages(cache_dir: str):
"""Scan cache_dir and upsert page metadata into cached_pages table."""
if not os.path.isdir(cache_dir):
return
    known_paths = {c["file_path"] for c in db_get_all_cached_pages()}
    for root, _, files in os.walk(cache_dir):
        for name in files:
            if not name.lower().endswith(".html"):
                continue
            fp = os.path.abspath(os.path.join(root, name))
            if fp in known_paths:
                continue
try:
stat = os.stat(fp)
mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
size = stat.st_size
except OSError:
mtime = None
size = None
url_guess = get_url_from_filename(name)
job_id = url_to_job_id(url_guess)
upsert_cached_page(file_path=fp, url_guess=url_guess,
last_modified=mtime, size_bytes=size, job_id=job_id)
def normalize_cached_page_paths() -> int:
"""Ensure all cached_pages.file_path values are absolute. Returns number of rows updated/normalized."""
changed = 0
with _ensure_session() as session:
rows = session.execute(text(
"SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
if not os.path.isabs(fp):
abs_fp = os.path.abspath(fp)
# Upsert under absolute path, then remove the relative entry
upsert_cached_page(
file_path=abs_fp,
url_guess=url_guess,
last_modified=last_modified,
size_bytes=size_bytes,
job_id=job_id,
)
with _ensure_session() as session:
session.execute(
text("DELETE FROM cached_pages WHERE file_path = :fp"), {"fp": fp})
session.commit()
changed += 1
return changed
def db_get_keywords() -> List[str]:
"""Return a list of all unique keywords from job listings."""
with _ensure_session() as session:
rows = session.execute(
text("SELECT DISTINCT keyword FROM job_listings")).fetchall()
return [r[0] for r in rows]
def db_get_regions() -> List[str]:
"""Return a list of all unique regions from job listings."""
with _ensure_session() as session:
rows = session.execute(
text("SELECT DISTINCT region FROM job_listings")).fetchall()
return [r[0] for r in rows]
def get_all_jobs():
query = """
SELECT l.job_id
,l.title
,d.description
,l.region
,l.keyword
,d.company
,l.location
,l.timestamp
,d.posted_time
,l.url
,c.file_path
,c.last_modified
,c.url_guess
,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale
FROM job_listings AS l
INNER JOIN job_descriptions AS d
ON l.job_id = d.job_id
AND l.url = d.url
LEFT JOIN cached_pages AS c ON l.job_id = c.job_id
ORDER BY d.posted_time DESC
"""
with _ensure_session() as session:
rows = session.execute(text(query)).fetchall()
jobs = []
for row in rows:
job = {
"id": row[0],
"title": row[1],
"description": row[2].replace('\n', '<br />').strip(),
"region": row[3],
"keyword": row[4],
"company": row[5],
"location": row[6],
"timestamp": row[7],
"posted_time": row[8],
"url": row[9],
"file_path": row[10],
"last_modified": row[11],
"url_guess": row[12],
"url_guess_stale": row[13],
}
jobs.append(job)
return jobs
def db_get_all_job_urls() -> List[str]:
"""Return list of job URLs from job_listings."""
with _ensure_session() as session:
rows = session.execute(text("SELECT url FROM job_listings")).fetchall()
return [r[0] for r in rows]
def db_delete_job(job_id: str | int):
"""Delete a job row (cascades to details and interactions)."""
jid = str(job_id)
with _ensure_session() as session:
obj = session.get(JobListing, jid)
if obj:
session.delete(obj)
session.commit()
def remove_job(url):
"""Remove a job from the database."""
try:
jid = url_to_job_id(url)
db_delete_job(jid)
cache_fp = get_cache_path(url)
remove_cached_page(os.path.abspath(cache_fp))
if os.path.exists(cache_fp):
os.remove(cache_fp)
except Exception:
pass
# ---------------- New ORM convenience helpers ------------------------------
def get_or_create_user(username: str) -> int:
"""Return user_id for username, creating if missing."""
created_at = datetime.now(UTC).isoformat()
with _ensure_session() as session:
row = session.execute(
text("SELECT user_id FROM users WHERE username = :u"), {
"u": username}
).fetchone()
if row:
return int(row[0])
session.execute(
text("INSERT INTO users(username, created_at) VALUES(:u, :c)"),
{"u": username, "c": created_at},
)
session.commit()
# open a new session to fetch the id
with _ensure_session() as session:
row2 = session.execute(
text("SELECT user_id FROM users WHERE username = :u"), {
"u": username}
).fetchone()
if row2:
return int(row2[0])
# Edge case retry
return get_or_create_user(username)
def mark_favorite(job_id: str | int, username: str, favorite: bool = True):
user_id = get_or_create_user(username)
upsert_user_interaction(job_id, user_id=user_id, is_user_favorite=favorite)
def record_visit(job_id: str | int, username: str, url: Optional[str] = None):
user_id = get_or_create_user(username)
ts = now_iso()
upsert_user_interaction(job_id, user_id=user_id,
seen_at=ts, url_visited=url)
# ---------------- User auth/admin helpers ----------------------------------
def create_or_update_user(username: str, password: Optional[str] = None, *, is_admin: Optional[bool] = None, is_active: Optional[bool] = None) -> int:
"""Create user if missing; update password/admin/active if provided. Returns user_id."""
username = (username or "").strip()
if not username:
raise ValueError("username required")
uid = get_or_create_user(username)
with _ensure_session() as session:
# Build dynamic update
fields = []
params: Dict[str, Any] = {"u": uid}
if password is not None:
fields.append("password_hash = :ph")
params["ph"] = generate_password_hash(password)
if is_admin is not None:
fields.append("is_admin = :ia")
params["ia"] = 1 if is_admin else 0
if is_active is not None:
fields.append("is_active = :ac")
params["ac"] = 1 if is_active else 0
if fields:
q = f"UPDATE users SET {', '.join(fields)} WHERE user_id = :u"
session.execute(text(q), params)
session.commit()
return uid
def set_user_password(username: str, password: str) -> None:
create_or_update_user(username, password=password)
def set_user_admin(username: str, is_admin: bool) -> None:
create_or_update_user(username, is_admin=is_admin)
def set_user_active(username: str, is_active: bool) -> None:
create_or_update_user(username, is_active=is_active)
def verify_user_credentials(username: str, password: str) -> bool:
"""Validate username/password against stored password_hash."""
with _ensure_session() as session:
row = session.execute(text("SELECT password_hash, is_active FROM users WHERE username = :u"), {
"u": username}).fetchone()
if not row:
return False
ph, active = row[0], bool(row[1])
if not active or not ph:
return False
ok = check_password_hash(ph, password)
if ok:
# record last_login
try:
session.execute(text("UPDATE users SET last_login = :ts WHERE username = :u"), {
"ts": datetime.now(UTC), "u": username})
session.commit()
except Exception:
pass
return ok
def get_users() -> List[Dict[str, Any]]:
with _ensure_session() as session:
rows = session.execute(text(
"SELECT user_id, username, created_at, is_admin, is_active, last_login, (password_hash IS NOT NULL) AS has_pw FROM users ORDER BY username ASC")).fetchall()
out: List[Dict[str, Any]] = []
for r in rows:
out.append({
"user_id": int(r[0]),
"username": r[1],
"created_at": r[2].isoformat() if isinstance(r[2], datetime) else (r[2] or None),
"is_admin": bool(r[3]),
"is_active": bool(r[4]),
"last_login": r[5].isoformat() if r[5] else None,
"has_password": bool(r[6]),
})
return out
def get_user(username: str) -> Optional[Dict[str, Any]]:
"""Return single user dict or None."""
with _ensure_session() as session:
row = session.execute(text(
"SELECT user_id, username, is_admin, is_active, password_hash, last_login, created_at FROM users WHERE username = :u"
), {"u": username}).fetchone()
if not row:
return None
return {
"user_id": int(row[0]),
"username": row[1],
"is_admin": bool(row[2]),
"is_active": bool(row[3]),
"password_hash": row[4],
"last_login": row[5].isoformat() if row[5] else None,
"created_at": row[6].isoformat() if isinstance(row[6], datetime) else (row[6] or None),
}
# ---------------- Regions/Keywords helpers ---------------------------------
def upsert_region(name: str) -> int:
"""Get or create a region by name; return region_id."""
name = (name or "").strip()
if not name:
raise ValueError("Region name cannot be empty")
with _ensure_session() as session:
row = session.execute(text("SELECT region_id FROM regions WHERE name = :n"), {
"n": name}).fetchone()
if row:
return int(row[0])
session.execute(
text("INSERT INTO regions(name) VALUES (:n)"), {"n": name})
session.commit()
with _ensure_session() as session:
row2 = session.execute(text("SELECT region_id FROM regions WHERE name = :n"), {
"n": name}).fetchone()
if row2:
return int(row2[0])
# unlikely retry
return upsert_region(name)
def upsert_keyword(name: str) -> int:
"""Get or create a keyword by name; return keyword_id."""
name = (name or "").strip()
if not name:
raise ValueError("Keyword name cannot be empty")
with _ensure_session() as session:
row = session.execute(text("SELECT keyword_id FROM keywords WHERE name = :n"), {
"n": name}).fetchone()
if row:
return int(row[0])
session.execute(
text("INSERT INTO keywords(name) VALUES (:n)"), {"n": name})
session.commit()
with _ensure_session() as session:
row2 = session.execute(text("SELECT keyword_id FROM keywords WHERE name = :n"), {
"n": name}).fetchone()
if row2:
return int(row2[0])
return upsert_keyword(name)
def set_user_regions(username: str, region_names: List[str]) -> None:
"""Replace user's preferred regions with given names."""
user_id = get_or_create_user(username)
# Normalize and get ids
names = sorted({(n or "").strip()
for n in region_names if (n or "").strip()})
region_ids: List[int] = [upsert_region(n) for n in names]
if not region_ids and not names:
# Clear all if explicitly empty list
with _ensure_session() as session:
session.execute(
text("DELETE FROM user_regions WHERE user_id = :u"), {"u": user_id})
session.commit()
return
desired = set(region_ids)
with _ensure_session() as session:
rows = session.execute(text("SELECT region_id FROM user_regions WHERE user_id = :u"), {
"u": user_id}).fetchall()
current = set(int(r[0]) for r in rows)
to_add = desired - current
to_remove = current - desired
for rid in to_remove:
session.execute(text("DELETE FROM user_regions WHERE user_id = :u AND region_id = :r"), {
"u": user_id, "r": int(rid)})
for rid in to_add:
session.execute(text("INSERT INTO user_regions(user_id, region_id) VALUES(:u, :r)"), {
"u": user_id, "r": int(rid)})
session.commit()
def set_user_keywords(username: str, keyword_names: List[str]) -> None:
"""Replace user's preferred keywords with given names."""
user_id = get_or_create_user(username)
names = sorted({(n or "").strip()
for n in keyword_names if (n or "").strip()})
keyword_ids: List[int] = [upsert_keyword(n) for n in names]
if not keyword_ids and not names:
with _ensure_session() as session:
session.execute(
text("DELETE FROM user_keywords WHERE user_id = :u"), {"u": user_id})
session.commit()
return
desired = set(keyword_ids)
with _ensure_session() as session:
rows = session.execute(text("SELECT keyword_id FROM user_keywords WHERE user_id = :u"), {
"u": user_id}).fetchall()
current = set(int(r[0]) for r in rows)
to_add = desired - current
to_remove = current - desired
for kid in to_remove:
session.execute(text("DELETE FROM user_keywords WHERE user_id = :u AND keyword_id = :k"), {
"u": user_id, "k": int(kid)})
for kid in to_add:
session.execute(text("INSERT INTO user_keywords(user_id, keyword_id) VALUES(:u, :k)"), {
"u": user_id, "k": int(kid)})
session.commit()
def get_user_regions(username: str) -> List[Dict[str, str]]:
"""Return preferred region names for a user (empty if none)."""
with _ensure_session() as session:
row = session.execute(text("SELECT user_id FROM users WHERE username = :u"), {
"u": username}).fetchone()
if not row:
return []
user_id = int(row[0])
rows = session.execute(text(
"""
SELECT r.name, r.color
FROM regions r
INNER JOIN user_regions ur ON ur.region_id = r.region_id
WHERE ur.user_id = :u
ORDER BY r.name ASC
"""
), {"u": user_id}).fetchall()
return [{"name": r[0], "color": r[1]} for r in rows]
def get_user_keywords(username: str) -> List[Dict[str, str]]:
"""Return preferred keyword names for a user (empty if none)."""
with _ensure_session() as session:
row = session.execute(text("SELECT user_id FROM users WHERE username = :u"), {
"u": username}).fetchone()
if not row:
return []
user_id = int(row[0])
rows = session.execute(text(
"""
SELECT k.name, k.color
FROM keywords k
INNER JOIN user_keywords uk ON uk.keyword_id = k.keyword_id
WHERE uk.user_id = :u
ORDER BY k.name ASC
"""
), {"u": user_id}).fetchall()
return [{"name": r[0], "color": r[1]} for r in rows]
def get_all_regions() -> List[Dict[str, str]]:
"""Return all region names from regions table (sorted)."""
with _ensure_session() as session:
rows = session.execute(
text("SELECT name, color FROM regions ORDER BY name ASC")).fetchall()
return [{"name": r[0], "color": r[1]} for r in rows]
def get_all_keywords() -> List[Dict[str, str]]:
"""Return all keyword names from keywords table (sorted)."""
with _ensure_session() as session:
rows = session.execute(
text("SELECT name, color FROM keywords ORDER BY name ASC")).fetchall()
return [{"name": r[0], "color": r[1]} for r in rows]
def seed_regions_keywords_from_listings() -> Dict[str, int]:
"""Seed regions/keywords tables from distinct values in job_listings if empty.
Returns dict with counts inserted: {"regions": n1, "keywords": n2}.
"""
inserted = {"regions": 0, "keywords": 0}
with _ensure_session() as session:
# Regions
existing_regions = session.execute(
text("SELECT COUNT(*) FROM regions")).scalar_one()
if int(existing_regions or 0) == 0:
rows = session.execute(text(
"SELECT DISTINCT region FROM job_listings WHERE region IS NOT NULL AND region != ''")).fetchall()
for r in rows:
name = r[0]
if name:
try:
session.execute(
text("INSERT IGNORE INTO regions(name, color) VALUES(:n, :c)"), {"n": name, "c": get_color_from_string(name)})
inserted["regions"] += 1
except Exception:
pass
session.commit()
# Keywords
existing_keywords = session.execute(
text("SELECT COUNT(*) FROM keywords")).scalar_one()
if int(existing_keywords or 0) == 0:
rows = session.execute(text(
"SELECT DISTINCT keyword FROM job_listings WHERE keyword IS NOT NULL AND keyword != ''")).fetchall()
for r in rows:
name = r[0]
if name:
try:
session.execute(
text("INSERT IGNORE INTO keywords(name, color) VALUES(:n, :c)"), {"n": name, "c": get_color_from_string(name)})
inserted["keywords"] += 1
except Exception:
pass
session.commit()
return inserted
def list_regions_full() -> List[Dict[str, Any]]:
with _ensure_session() as session:
rows = session.execute(
text("SELECT region_id, name, color FROM regions ORDER BY name ASC")).fetchall()
return [{"region_id": int(r[0]), "name": r[1], "color": r[2]} for r in rows]
def list_keywords_full() -> List[Dict[str, Any]]:
with _ensure_session() as session:
rows = session.execute(
text("SELECT keyword_id, name, color FROM keywords ORDER BY name ASC")).fetchall()
return [{"keyword_id": int(r[0]), "name": r[1], "color": r[2]} for r in rows]
def rename_region(region_id: int, new_name: str) -> bool:
new_name = (new_name or "").strip()
if not new_name:
raise ValueError("new_name required")
with _ensure_session() as session:
try:
session.execute(text("UPDATE regions SET name = :n WHERE region_id = :id"), {
"n": new_name, "id": int(region_id)})
session.commit()
return True
except Exception:
session.rollback()
return False
def rename_keyword(keyword_id: int, new_name: str) -> bool:
new_name = (new_name or "").strip()
if not new_name:
raise ValueError("new_name required")
with _ensure_session() as session:
try:
session.execute(text("UPDATE keywords SET name = :n WHERE keyword_id = :id"), {
"n": new_name, "id": int(keyword_id)})
session.commit()
return True
except Exception:
session.rollback()
return False
def change_region_color(region_id: int, new_color: str) -> bool:
new_color = (new_color or "").strip()
if not new_color:
raise ValueError("new_color required")
with _ensure_session() as session:
try:
session.execute(text("UPDATE regions SET color = :c WHERE region_id = :id"), {
"c": new_color, "id": int(region_id)})
session.commit()
return True
except Exception:
session.rollback()
return False
def change_keyword_color(keyword_id: int, new_color: str) -> bool:
new_color = (new_color or "").strip()
if not new_color:
raise ValueError("new_color required")
with _ensure_session() as session:
try:
session.execute(text("UPDATE keywords SET color = :c WHERE keyword_id = :id"), {
"c": new_color, "id": int(keyword_id)})
session.commit()
return True
except Exception:
session.rollback()
return False
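
The preference setters above diff the desired ids against the current mapping rows instead of clearing and re-inserting everything. A short round-trip sketch, assuming a reachable MySQL instance (colors come back as None because upsert_region() inserts the name only):

from web.db import db_init, set_user_regions, get_user_regions

db_init()
set_user_regions("alice", ["sfbay", "seattle"])  # upserts unknown regions, then syncs the mapping
print(get_user_regions("alice"))  # [{'name': 'seattle', 'color': None}, {'name': 'sfbay', 'color': None}]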

web/scraper.py Normal file

@@ -0,0 +1,121 @@
from datetime import datetime, UTC
from bs4 import BeautifulSoup
from typing import List, Dict, Set
from web.utils import get_base_url, cache_page, safe_get_text, safe_get_attr, is_cached, get_cached_content, make_request_with_retry
def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
"""Parse a single job listing."""
try:
title_elem = listing.find("div", class_="title")
url_elem = listing.find("a")
pay_elem = listing.find("div", class_="attr remuneration")
if pay_elem:
pay_elem = pay_elem.find("span", class_="valu")
location_elem = listing.find("div", class_="location")
if not title_elem or not url_elem:
return []
title = title_elem.get_text(strip=True)
url = url_elem["href"]
pay = pay_elem.get_text(strip=True) if pay_elem else "N/A"
location = location_elem.get_text(
strip=True) if location_elem else "N/A"
status = "DUPLICATE" if url in seen_urls else "NEW"
if url in seen_urls:
return []
# job_summary variable retained for parity but not used
job_summary = f"{status} [{region}/{keyword}] | Title: {title[:50]}{'...' if len(title) > 50 else ''} | Location: {location} | URL: {url}"
_ = job_summary
return [datetime.now(UTC).isoformat(), region, keyword, title, pay, location, url]
except (AttributeError, KeyError):
return []
def scrape_job_page(content: str, url: str) -> Dict:
"""Scrape job details from a job listing page."""
soup = BeautifulSoup(content, "html.parser")
# Extract each field
title = safe_get_text(soup.find("h1", class_="postingtitle"))
company = safe_get_text(soup.find("h2", class_="company-name"))
map_elem = soup.find("div", id="map")
if map_elem:
lat = safe_get_attr(map_elem, "data-latitude")
lon = safe_get_attr(map_elem, "data-longitude")
accuracy = safe_get_attr(map_elem, "data-accuracy")
location = f"Lat: {lat}, Lon: {lon}, Accuracy: {accuracy}"
else:
location = "N/A"
mapaddress = soup.find("div", class_="mapaddress")
if mapaddress:
location = safe_get_text(mapaddress) + " " + location
description_elem = soup.find("section", id="postingbody")
if description_elem:
de = BeautifulSoup(str(description_elem), "html.parser")
qr_code_elem = de.find(class_="print-qrcode-label")
# Remove QR code if it exists
if qr_code_elem:
qr_code_elem.decompose()
description = de.text.strip()
else:
description = ''
posting_info = soup.find("div", class_="postinginfos")
if posting_info:
pi = BeautifulSoup(str(posting_info), "html.parser")
postinginfo_tags = pi.find_all("p", class_="postinginfo")
job_id = safe_get_text(postinginfo_tags[0]) if postinginfo_tags else ""
posted_time_elem = pi.find("time", class_="date timeago")
posted_time = safe_get_attr(
posted_time_elem, "datetime") if posted_time_elem else ""
else:
job_id = ""
posted_time = ""
return {
"url": url,
"title": title,
"company": company,
"location": location,
"description": description,
"id": job_id,
"posted_time": posted_time
}
def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
"""Parse HTML content to extract job listings."""
soup = BeautifulSoup(content, "html.parser")
listings = soup.find_all("li", class_="cl-static-search-result")
new_rows = []
    for listing in listings:
job_data = scrape_listings_page(listing, region, keyword, seen_urls)
if job_data:
new_rows.append(job_data)
return new_rows
def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
"""Process a single region and keyword."""
url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
    if is_cached(url):
        content = get_cached_content(url)
    else:
        content = make_request_with_retry(url, 3)
        if content is None:
            return []
        cache_page(url, content)
    return scrape_job_data(content, region, keyword, seen_urls)
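
scrape_job_page() needs only raw HTML and a URL, so it can be exercised offline. A hedged sketch with a minimal fixture, assuming safe_get_text()/safe_get_attr() tolerate missing elements (their names suggest they do); the element classes are the ones the parser above looks for:

from web.scraper import scrape_job_page

html = """
<h1 class="postingtitle">Carpenter needed</h1>
<section id="postingbody">Framing work, tools provided.</section>
"""
job = scrape_job_page(html, "https://example.org/job/123")  # hypothetical URL
print(job["title"], "|", job["description"])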

web/static/index.js Normal file

@@ -0,0 +1,102 @@
// Update the table with job data
function updateTableData(jobs) {
const jobsContainer = document.getElementById("jobs");
jobsContainer.innerHTML = ""; // Clear existing jobs
jobs.forEach((job) => {
const jobElement = document.createElement("div");
jobElement.classList.add("job");
jobElement.innerHTML = `
<h3><a href="${job.url}" target="_blank">${job.title}</a></h3>
<p class="job-posted-time">${job.posted_time}</p>
<span class="job-region region-${job.region
.replace(" ", "")
.toLowerCase()}">${job.region}</span>
<span class="job-keyword keyword-${job.keyword
.replace(" ", "")
.toLowerCase()}">${job.keyword}</span>
`;
jobsContainer.appendChild(jobElement);
});
}
// Fetch job data from the server
function fetchJobs() {
fetch("/jobs")
.then((response) => response.json())
.then((data) => {
updateTableData(data);
})
.catch((error) => console.error("Error fetching jobs:", error));
}
// scrape form submission
function updateScrapeInfo(message, color) {
let scrapingInfo = document.getElementById("scrape-info");
scrapingInfo.style.display = "inline-block"; // Show the scraping info
scrapingInfo.innerText = message;
scrapingInfo.style.color = color;
}
function scrape(event) {
event.preventDefault(); // Prevent the default form submission
updateScrapeInfo("Scraping in progress...", "blue");
fetch("/scrape")
.then((response) => response.json())
.then((data) => {
if (data.status) {
updateScrapeInfo(data.status, "green");
} else {
updateScrapeInfo("Scraping failed. Please try again.", "red");
}
})
.catch((error) => console.error("Error:", error));
}
function updateJobsFiltered() {
const selectedRegion = document.getElementById("region").value;
const selectedKeyword = document.getElementById("keyword").value;
const filterForm = document.getElementById("filter-form");
const queryString = new URLSearchParams({
region: selectedRegion,
keyword: selectedKeyword,
}).toString();
filterForm.action = `/?${queryString}`;
filterForm.submit(); // Submit the form to apply filters
}
function regionClick(event) {
const region = event.target.innerText;
const regionInput = document.getElementById("region");
regionInput.value = region;
updateJobsFiltered();
}
function keywordClick(event) {
const keyword = event.target.innerText;
const keywordInput = document.getElementById("keyword");
keywordInput.value = keyword;
updateJobsFiltered();
}
document.querySelectorAll(".job-keyword").forEach((element) => {
element.addEventListener("click", keywordClick);
});
document.querySelectorAll(".job-region").forEach((element) => {
element.addEventListener("click", regionClick);
});
document.getElementById("scrape-form").addEventListener("submit", scrape);
document
.getElementById("region")
.addEventListener("change", updateJobsFiltered);
document
.getElementById("keyword")
.addEventListener("change", updateJobsFiltered);
document
  .getElementById("filter-form")
  .addEventListener("submit", (event) => {
    event.preventDefault(); // updateJobsFiltered() submits the form itself
    updateJobsFiltered();
  });
document.getElementById("reset-filters").addEventListener("click", () => {
document.getElementById("region").value = "";
document.getElementById("keyword").value = "";
updateJobsFiltered();
});
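
fetchJobs() above expects /jobs to return a JSON array of objects carrying at least url, title, posted_time, region, and keyword, which matches what the Flask route serializes. A quick contract check from Python, assuming the dev server is running:

import requests

jobs = requests.get("http://127.0.0.1:5000/jobs").json()
for job in jobs[:3]:
    print(job["posted_time"], job["region"], job["keyword"], job["title"])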

web/static/settings.js Normal file

@@ -0,0 +1,61 @@
/* javascript form handling */
document
.getElementById("user-settings-form")
.addEventListener("submit", function (event) {
event.preventDefault(); // Prevent default form submission
const form = event.target;
const formData = new FormData(form);
// Collect selected regions and keywords
const selectedRegions = [];
const selectedKeywords = [];
formData.forEach((value, key) => {
if (key === "region") {
selectedRegions.push(value);
} else if (key === "keyword") {
selectedKeywords.push(value);
}
});
    // Add new region if provided (formData.get() returns null when the field is absent)
    const newRegion = (formData.get("new-region") || "").trim();
    if (newRegion) {
      selectedRegions.push(newRegion);
    }
    // Add new keyword if provided
    const newKeyword = (formData.get("new-keyword") || "").trim();
    if (newKeyword) {
      selectedKeywords.push(newKeyword);
    }
// Prepare data to send
const dataToSend = {
regions: selectedRegions,
keywords: selectedKeywords,
csrf_token: formData.get("csrf_token"),
};
// Send data via Fetch API
fetch(form.action, {
method: "POST",
headers: {
"Content-Type": "application/json",
"X-CSRF-Token": document.querySelector('meta[name="csrf-token"]')
.content,
},
body: JSON.stringify(dataToSend),
})
.then((response) => {
if (response.ok) {
window.location.reload(); // Reload to reflect changes
} else {
alert("Error saving preferences.");
}
})
.catch((error) => {
console.error("Error:", error);
alert("Error saving preferences.");
});
});
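
The /settings handler accepts this same payload as JSON, which is exactly what the fetch above sends. A hedged sketch of that contract from Python; the session cookie and CSRF token are assumptions here (CSRFProtect requires the token and the route requires a logged-in user):

import requests

s = requests.Session()
# ... authenticate first via POST /login so session['username'] is set ...
resp = s.post(
    "http://127.0.0.1:5000/settings",
    json={"regions": ["sfbay"], "keywords": ["carpenter"]},
    headers={"X-CSRF-Token": "<token from the rendered page>"},  # placeholder
)
print(resp.json())  # {'status': 'ok'} on success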

web/static/styles.css Normal file

@@ -0,0 +1,144 @@
body {
font-family: Arial, sans-serif;
margin: 10px;
font-size: 16px;
}
h1 {
color: #333;
font-size: 1.2em;
}
a {
text-decoration: none;
}
a:hover {
text-decoration: underline;
}
footer {
margin-top: 20px;
text-align: center;
font-size: 0.9em;
color: #666;
}
nav {
margin-bottom: 10px;
}
#filters {
display: block;
margin-bottom: 1rem;
}
#filters #filter-form {
display: inline-block;
max-width: 500px;
}
#filters #scrape-form {
display: inline-block;
margin-left: 1rem;
}
#filters #scrape-form span#scrape-info {
display: none;
color: blue;
font-size: 0.9em;
}
#jobs {
margin: 0;
padding: 0;
display: grid;
grid-template-columns: repeat(auto-fill, minmax(360px, 1fr));
gap: 1rem;
}
.job {
border: 1px solid #ccc;
padding: 1rem;
border-radius: 5px;
background-color: #f9f9f9;
}
.job a {
display: inline-block;
}
.job h3 {
margin: 0 0 0.25rem 0;
font-size: 1.1em;
}
.job-posted-time {
font-weight: normal;
font-size: 0.8em;
color: #666;
margin: 0.25rem 0;
}
.job-region,
.job-keyword {
border: 1px solid #ccc;
border-radius: 0.8rem;
padding: 0.2rem 0.4rem;
display: inline;
margin-right: 0.5rem;
background-color: rgb(255, 255, 255);
}
#job-details {
max-width: 100%;
margin: auto;
}
.job-description {
margin-top: 5px;
color: #333;
margin: 0;
padding: 0;
line-height: 1.25;
font-size: 14px;
}
.job-description br {
margin: -5px 0;
}
.job-title {
font-weight: bold;
color: #333;
text-decoration: underline;
font-size: 16px;
}
/* Taxonomy Management */
#regions-table,
#keywords-table {
margin-top: 20px;
}
#regions-table table,
#keywords-table table {
max-width: 100%;
border-collapse: collapse;
}
#regions-table th,
#regions-table td,
#keywords-table th,
#keywords-table td {
border: 1px solid #ccc;
padding: 8px;
text-align: left;
}
#regions-table th,
#keywords-table th {
background-color: #f9f9f9;
}
/* Admin User Management */
#users {
margin-top: 20px;
}
#users table {
max-width: 100%;
border-collapse: collapse;
}
#users th,
#users td {
border: 1px solid #ccc;
padding: 8px;
text-align: left;
}
#users th {
background-color: #f9f9f9;
}

web/static/taxonomy.js Normal file

@@ -0,0 +1,41 @@
function updateColor(id, type, newColor) {
  // The /admin/taxonomy handler reads request.form, so send URL-encoded fields
  // with the names it expects (region_id/keyword_id plus new_region_color or
  // new_keyword_color). A JSON body would never reach request.form.
  const body = new URLSearchParams({
    action:
      type === "region" ? "change_region_color" : "change_keyword_color",
    [type + "_id"]: id,
    ["new_" + type + "_color"]: newColor,
  });
  fetch("/admin/taxonomy", {
    method: "POST",
    headers: {
      "X-CSRF-Token": document.querySelector('meta[name="csrf-token"]').content,
    },
    body: body,
  }).then((response) => {
    if (response.ok) {
      location.reload();
    } else {
      alert("Failed to update " + type + " color");
    }
  });
}
// Guard the lookups: these standalone color forms are not present when the
// template renders per-row forms instead.
const regionColorForm = document.getElementById("region-color-form");
if (regionColorForm) {
  regionColorForm.addEventListener("submit", function (event) {
    event.preventDefault();
    const regionId = this.querySelector('input[name="region_id"]').value;
    const newColor = this.querySelector('input[name="new_region_color"]').value;
    updateColor(regionId, "region", newColor);
  });
}
const keywordColorForm = document.getElementById("keyword-color-form");
if (keywordColorForm) {
  keywordColorForm.addEventListener("submit", function (event) {
    event.preventDefault();
    const keywordId = this.querySelector('input[name="keyword_id"]').value;
    const newColor = this.querySelector('input[name="new_keyword_color"]').value;
    updateColor(keywordId, "keyword", newColor);
  });
}

web/templates/admin/login.html Normal file

@@ -0,0 +1,9 @@
{% extends 'base.html' %} {% block content %}
<h2>Login</h2>
<form method="post">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
<label>Username <input type="text" name="username" required /></label>
<label>Password <input type="password" name="password" /></label>
<button type="submit">Login</button>
</form>
{% endblock %}

web/templates/admin/taxonomy.html Normal file

@@ -0,0 +1,142 @@
{% extends 'base.html' %} {% block content %}
<h2>Taxonomy</h2>
<section>
<h3>Regions</h3>
<form method="post">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
<input type="hidden" name="action" value="add_region" />
<input type="text" name="region_name" placeholder="New region" required />
<label for="region_color">Color:</label>
<input type="color" name="region_color" id="region_color" value="#ffffff" />
<button type="submit">Add Region</button>
</form>
<div id="regions-table">
<table>
<thead>
<tr>
<th>ID</th>
<th>Name</th>
<th>Rename</th>
<th>Color</th>
</tr>
</thead>
<tbody>
{% for r in regions %}
<tr>
<td>{{ r.region_id }}</td>
<td>{{ r.name }}</td>
<td>
<form
method="post"
style="display: flex; gap: 0.5rem; align-items: center"
>
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
<input type="hidden" name="action" value="rename_region" />
<input type="hidden" name="region_id" value="{{ r.region_id }}" />
<input
type="text"
name="new_region_name"
placeholder="New name"
required
/>
<button type="submit">Rename</button>
</form>
</td>
<td>
<form
method="post"
style="display: flex; gap: 0.5rem; align-items: center"
>
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
<input type="hidden" name="action" value="change_region_color" />
<input type="hidden" name="region_id" value="{{ r.region_id }}" />
<input
type="color"
name="new_region_color"
value="{{ r.color }}"
required
/>
<button type="submit">Change Color</button>
</form>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</section>
<section>
<h3>Keywords</h3>
<form method="post">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
<input type="hidden" name="action" value="add_keyword" />
<input type="text" name="keyword_name" placeholder="New keyword" required />
<label for="keyword_color">Color:</label>
<input
type="color"
name="keyword_color"
id="keyword_color"
value="#ffffff"
/>
<button type="submit">Add Keyword</button>
</form>
<div id="keywords-table">
<table>
<thead>
<tr>
<th>ID</th>
<th>Name</th>
<th>Rename</th>
<th>Color</th>
</tr>
</thead>
<tbody>
{% for k in keywords %}
<tr>
<td>{{ k.keyword_id }}</td>
<td>{{ k.name }}</td>
<td>
<form
method="post"
style="display: flex; gap: 0.5rem; align-items: center"
>
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
<input type="hidden" name="action" value="rename_keyword" />
<input type="hidden" name="keyword_id" value="{{ k.keyword_id }}" />
<input
type="text"
name="new_keyword_name"
placeholder="New name"
required
/>
<button type="submit">Rename</button>
</form>
</td>
<td>
<form
method="post"
style="display: flex; gap: 0.5rem; align-items: center"
>
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
<input type="hidden" name="action" value="change_keyword_color" />
<input type="hidden" name="keyword_id" value="{{ k.keyword_id }}" />
<input
type="color"
name="new_keyword_color"
value="{{ k.color }}"
required
/>
<button type="submit">Change Color</button>
</form>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</div>
</section>
{% endblock %} {% block footer_scripts %}
<script src="{{ url_for('static', filename='taxonomy.js') }}"></script>
{% endblock %}

View File

@@ -0,0 +1,139 @@
{% extends 'base.html' %} {% block content %}
<div id="users">
<h2>Users</h2>
<form id="user-form" method="post" action="{{ url_for('admin_users') }}">
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
<table>
<thead>
<tr>
<th>ID</th>
<th>Username</th>
<th>Admin</th>
<th>Active</th>
<th colspan="2">Password</th>
<th>Created</th>
<th>Last Login</th>
<th></th>
</tr>
</thead>
<tbody>
{% for u in users %}
<tr class="user-row" data-user-id="{{ u.user_id }}">
<td>
{{ u.user_id }}<input
type="hidden"
name="user_id"
value="{{ u.user_id }}"
/>
</td>
<td>
<input
type="text"
name="username"
value="{{ u.username }}"
required
/>
</td>
<td>
<input type="checkbox" name="is_admin" {{ 'checked' if u.is_admin
else '' }} />
</td>
<td>
<input type="checkbox" name="is_active" {{ 'checked' if u.is_active
else '' }} />
</td>
<td>{{ '✅' if u.has_password else '❌' }}</td>
<td><input type="password" name="password" /></td>
<td>{{ u.created_at }}</td>
<td>{{ u.last_login or 'never' }}</td>
<td>
<button type="submit" data-user-id="{{ u.user_id }}">Save</button>
</td>
</tr>
{% endfor %}
</tbody>
</table>
</form>
</div>
<h3>Create / Update User</h3>
<form
id="create-update-user-form"
method="post"
action="{{ url_for('admin_users') }}"
>
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
<label>Username <input type="text" name="username" required /></label>
<label>Password <input type="password" name="password" /></label>
<label>Admin <input type="checkbox" name="is_admin" value="1" /></label>
<label
>Active <input type="checkbox" name="is_active" value="1" checked
/></label>
<button type="submit">Save</button>
</form>
{% endblock %} {% block footer_scripts %}
<script>
  function updateUser(userId) {
    const row = document.querySelector(`.user-row[data-user-id="${userId}"]`);
    const passwordInput = row.querySelector('input[name="password"]');
    // Read values from this row only; the shared form's elements collection
    // would mix in the inputs of every other row.
    const username = row.querySelector('input[name="username"]').value;
    const isAdmin = row.querySelector('input[name="is_admin"]').checked;
    const isActive = row.querySelector('input[name="is_active"]').checked;
    // Only send a password when one was typed, so saving other fields never
    // overwrites an existing password with an empty one.
    const password = passwordInput.value || undefined;
    const csrfToken = document.querySelector('meta[name="csrf-token"]').content;
    fetch("/admin/users", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-CSRF-Token": csrfToken,
      },
      body: JSON.stringify({
        user_id: userId,
        username: username,
        password: password,
        is_admin: isAdmin,
        is_active: isActive,
        csrf_token: csrfToken,
      }),
    })
      .then((response) => {
        if (response.ok) {
          alert("User updated successfully");
          // Clear the password field after successful update
          passwordInput.value = "";
        } else {
          alert("Error updating user");
        }
      })
      .catch((error) => {
        console.error("Error:", error);
        alert("Error updating user");
      });
  }
  function initUserForm() {
    const form = document.getElementById("user-form");
    form.addEventListener("submit", function (event) {
      event.preventDefault(); // Prevent the default form submission
      // event.submitter is the row's Save button, which carries the user id.
      const button = event.submitter;
      if (button && button.dataset.userId) {
        updateUser(button.dataset.userId);
      }
    });
  }
  initUserForm();
</script>
{% endblock %}

43
web/templates/base.html Normal file
View File

@@ -0,0 +1,43 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>{% block title %}{{ title }}{% endblock %}</title>
<meta name="csrf-token" content="{{ csrf_token() }}" />
<link
rel="stylesheet"
href="{{ url_for('static', filename='styles.css') }}"
/>
{% block styles %}{% endblock %} {% block scripts %}{% endblock %}
</head>
<body>
{% block header %}
<header>
<h1><a href="/">{{ title or 'Admin' }}</a></h1>
<nav>
{% if username %}<span>Hi, {{ username }}</span> | {% endif %}
<a href="{{ url_for('index') }}">Home</a> |
<a href="{{ url_for('user_settings') }}">Preferences</a> {% if
current_user and current_user.is_admin %} |
<a href="{{ url_for('admin_taxonomy') }}">Taxonomy</a> |
<a href="{{ url_for('admin_users') }}">Users</a> {% endif %} | {% if
session.get('username') %}
<a href="{{ url_for('logout') }}">Logout</a> {% else %} |
<a href="{{ url_for('login') }}">Login</a>{% endif %}
</nav>
{% with messages = get_flashed_messages() %} {% if messages %}
<ul>
{% for m in messages %}
<li>{{ m }}</li>
{% endfor %}
</ul>
{% endif %} {% endwith %}
</header>
{% endblock %} {% block content %}{% endblock %}
<footer>
<p>&copy; 2025 Job Listings</p>
</footer>
{% block footer_scripts %}{% endblock %}
</body>
</html>

55
web/templates/index.html Normal file
View File

@@ -0,0 +1,55 @@
{% extends "base.html" %} {% block styles %}
<style>
/* for each keyword, create a different background color */
{% for keyword in keywords %}
.keyword-{{ keywords[keyword].name }} {
background-color: {{ keywords[keyword].color }};
}
{% endfor %}
/* for each region, create a different background color */
{% for region in regions %}
.region-{{ region }} {
background-color: {{ regions[region].color }};
}{% endfor %}
</style>
{% endblock %}
{% block title %}Job Listings{% endblock %}
{% block content %}
<div id="filters">
<form id="filter-form" method="GET" action="/">
<label for="region">Region:</label>
<select name="region" id="region">
<option value="">All</option>
{% for region in regions %}
<option value="{{ region }}" {% if region == selected_region %}selected{% endif %}>{{ region }}</option>
{% endfor %}
</select>
<label for="keyword">Keyword:</label>
<select name="keyword" id="keyword">
<option value="">All</option>
{% for keyword in keywords %}
<option value="{{ keyword }}" {% if keyword == selected_keyword %}selected{% endif %}>{{ keyword }}</option>
{% endfor %}
</select>
<button type="submit">Filter</button>
<button type="button" id="reset-filters">Reset</button>
</form>
<form id="scrape-form" method="GET" action="/scrape">
<button type="submit">Scrape Jobs</button>
<span id="scrape-info"></span>
</form>
</div>
<div id="jobs">
{% for job in jobs %}
<div class="job">
<h3><a href="{{ job['url'] }}" target="_blank">{{ job['title'] }}</a></h3>
<p class="job-posted-time">{{ job['posted_time'] }}</p>
<span class="job-region region-{{ job['region'] }}">{{ job['region'] }}</span>
<span class="job-keyword keyword-{{ job['keyword']|replace(' ', '')|lower }}">{{ job['keyword'] }}</span>
</div>
{% endfor %}
</div>
{% endblock %}
{% block footer_scripts %}
<script src="{{ url_for('static', filename='index.js') }}"></script>
{% endblock %}

27
web/templates/job.html Normal file
View File

@@ -0,0 +1,27 @@
{% extends "base.html" %} {% block title %}Job Details{% endblock %} {% block
styles %}{% endblock %} {% block content %}
<div id="job-details">
<p><strong>ID:</strong> {{ job.id }}</p>
<p>
<strong>Title:</strong> {{ job.title }} | <strong>Company:</strong> {{
job.company }} | <strong>Location:</strong> {{ job.location }}
</p>
<p>
<strong>Salary:</strong> {{ job.salary }} | <strong>Posted on:</strong> {{
job.posted_date }}
</p>
<h2>Job Description</h2>
<hr />
<p class="job-description">{{ job.description|safe }}</p>
<hr />
<p>
<strong>Original URL:</strong>
</p>
<p>
<a href="{{ job.url }}" target="_blank" class="job-title"
>{{ job.title }}</a
>
</p>
</div>
{% endblock %}

View File

@@ -0,0 +1,84 @@
{% extends 'base.html' %} {% block title %}Your Preferences{% endblock %} {%
block content %}
<h2>Your Preferences</h2>
<form
id="user-settings-form"
method="post"
action="{{ url_for('user_settings') }}"
>
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
<fieldset>
<legend>Regions</legend>
<p>
<small>Add new Region:</small>
<input
type="text"
name="new-region"
id="new-region"
value=""
placeholder="Type a region and save to add & select"
size="30"
/>
</p>
{% if all_regions %} {% for r in all_regions %}
<label style="display: block; background-color: {{ r.color }}">
      <input
        type="checkbox"
        name="region"
        id="region-{{ r.name }}"
        value="{{ r.name }}"
        {% if r.name in user_regions %}checked{% endif %}
      />
{{ r.name }}
</label>
{% endfor %} {% else %}
<p>No regions available. Ask an admin to add some.</p>
{% endif %}
</fieldset>
<fieldset>
<legend>Keywords</legend>
<p>
<small>Add new Keyword:</small>
<input
type="text"
name="new-keyword"
id="new-keyword"
value=""
placeholder="Type a keyword and save to add & select"
size="30"
/>
</p>
{% if all_keywords %} {% for k in all_keywords %}
<label style="display: block; background-color: {{ k.color }}">
      <input
        type="checkbox"
        name="keyword"
        id="keyword-{{ k.name }}"
        value="{{ k.name }}"
        {% if k.name in user_keywords %}checked{% endif %}
      />
{{ k.name }}
</label>
{% endfor %} {% else %}
<p>No keywords available. Ask an admin to add some.</p>
{% endif %}
</fieldset>
<button type="submit">Save</button>
</form>
{% endblock %} {% block footer_scripts %}
<script src="{{ url_for('static', filename='settings.js') }}"></script>
{% endblock %}

336
web/utils.py Normal file
View File

@@ -0,0 +1,336 @@
"""
Utility functions for the Craigslist scraper.
"""
import hashlib
import json
import os
import random
import re
import time
from datetime import datetime, UTC
from typing import Any, Dict, List, Optional
import requests
def get_config_file() -> str:
"""Return the path to the main config file."""
return os.path.abspath(os.path.join(
os.path.dirname(__file__), '..', 'config', 'settings.json'))
def get_config() -> dict:
    """Return the loaded configuration dict (empty if missing or invalid)."""
    try:
        with open(get_config_file(), 'r', encoding='utf-8') as f:
            return json.load(f)
    except Exception:
        return {}
def get_users_from_settings() -> List[Dict]:
"""Return user entries from settings.json (array of dicts)."""
users = get_config().get('users', [])
if not isinstance(users, list):
return []
out: List[Dict] = []
for u in users:
if not isinstance(u, dict):
continue
username = (u.get('username') or '').strip()
if not username:
continue
out.append({
'username': username,
'is_admin': bool(u.get('is_admin', False)),
'password': u.get('password') or ''
})
return out
def initialize_users_from_settings() -> int:
"""Ensure users from settings.json exist in DB; set admin/active and passwords.
Returns number of users processed.
"""
from web.db import create_or_update_user # local import to avoid cycles
users = get_users_from_settings()
count = 0
for u in users:
pw = u.get('password') or None
create_or_update_user(u['username'], password=pw, is_admin=bool(
u.get('is_admin', False)), is_active=True)
count += 1
return count
def verify_credentials(username: str, password: str) -> bool:
"""Proxy to db.verify_user_credentials"""
from web.db import verify_user_credentials
return verify_user_credentials(username, password)
# --- Database configuration helpers ---
def get_mysql_config() -> dict:
"""Return MySQL/MariaDB connection settings."""
db = get_config().get('database', {}).get('mysql', {})
return {
'host': db.get('host', '127.0.0.1'),
'user': db.get('user', 'root'),
'password': db.get('password', ''),
'database': db.get('database', 'jobs'),
'port': db.get('port', 3306),
}
def get_http_setting(key: str, default=None):
return get_config().get('http', {}).get(key, default)
def get_paths() -> dict:
return get_config().get('paths', {})
def get_cache_dir() -> str:
return get_paths().get('cache_dir', 'cache')
def get_logs_dir() -> str:
return get_paths().get('logs_dir', 'logs')
# The fallback values below are assumptions for installs whose settings.json
# omits the corresponding http settings; without them these getters would
# return None despite their int/str annotations.
def get_user_agent() -> str:
    return get_http_setting('user_agent', 'Mozilla/5.0')
def get_request_timeout() -> int:
    return get_http_setting('request_timeout', 30)
def get_max_retries() -> int:
    return get_http_setting('max_retries', 3)
def get_backoff_factor() -> int:
    return get_http_setting('backoff_factor', 2)
def get_min_delay() -> int:
    return get_http_setting('min_delay', 1)
def get_max_delay() -> int:
    return get_http_setting('max_delay', 5)
def get_base_url() -> str:
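    # The template keeps {region} and {keyword} placeholders for the scraper to fill.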
return get_config().get('scraper', {}).get('base_url', "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel")
def ensure_cache_dir():
"""Ensure cache directory exists."""
os.makedirs(get_cache_dir(), exist_ok=True)
def now_iso() -> str:
"""Get the current time in ISO format."""
return datetime.now(UTC).isoformat()
def get_filename_from_url(url: str) -> str:
"""Convert URL to a safe filename."""
return url.replace("https://", "").replace("/", "_").replace("?", "_").replace("&", "_")
def url_to_job_id(url: str) -> int:
"""Extract the job id from a Craigslist URL (last path segment without .html)."""
last = url.rstrip("/").split("/")[-1].replace(".html", "")
if last.isdigit():
return int(last)
return 0
def normalize_job_id(raw_id: Optional[str], url: Optional[str]) -> Optional[int]:
"""Normalize job id coming from details page (e.g., 'post id: 1234567890').
Fallback to URL-derived id when needed.
"""
if raw_id:
m = re.search(r"(\d{5,})", raw_id)
if m:
return int(m.group(1))
if url:
return url_to_job_id(url)
return None
def get_url_from_filename(name: str) -> str:
"""Generate a URL guess based on the name."""
# Best-effort URL guess from filename convention (underscores to slashes)
base = os.path.splitext(name)[0]
url_guess = f"https://{base.replace('_', '/')}"
return url_guess
def get_cached_content(url: str) -> str:
"""Get cached content for URL."""
with open(get_cache_path(url), "r", encoding="utf-8") as f:
return f.read()
def safe_get_text(element, default="N/A"):
"""Safely extract text from BeautifulSoup element."""
return element.get_text(strip=True) if element else default
def safe_get_attr(element, attr, default="N/A"):
"""Safely extract attribute from BeautifulSoup element."""
return element.get(attr, default) if element else default
def get_random_delay(min_delay: Optional[int] = None, max_delay: Optional[int] = None) -> float:
    """Get a random delay between min_delay and max_delay seconds."""
    # Resolve defaults at call time; default arguments would be evaluated
    # once at import time and ignore later config changes.
    if min_delay is None:
        min_delay = get_min_delay()
    if max_delay is None:
        max_delay = get_max_delay()
    return random.uniform(min_delay, max_delay)
def get_cache_path(url: str) -> str:
"""Get cache file path for URL."""
return os.path.join(get_cache_dir(), f"{get_filename_from_url(url)}.html")
def cache_page(url: str, content: str):
"""Cache the page content with a timestamp."""
cache_path = get_cache_path(url)
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
with open(cache_path, "w", encoding="utf-8") as f:
f.write(content)
# Update the file's modification time to the current time
os.utime(cache_path, None)
def is_cached(url: str) -> bool:
"""Check if the page is cached and not older than 24 hours."""
cache_path = get_cache_path(url)
if not os.path.isfile(cache_path):
return False
# Check the file's age if it's a search result page
if 'search' in url:
file_age = time.time() - os.path.getmtime(cache_path)
if file_age > 24 * 3600: # 24 hours in seconds
return False
return True
def is_cache_stale(last_modified: str, days: int = 1) -> bool:
"""Check if the cached page is stale (older than 24 hours)."""
if not last_modified:
return True
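    # last_modified is an ISO-8601 timestamp, as produced by now_iso()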
last_datetime = datetime.fromisoformat(last_modified)
file_age = time.time() - last_datetime.timestamp()
return file_age > days * 24 * 3600 # days in seconds
def delete_cached_page(url: str):
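    """Remove the cached copy of a URL's page, ignoring missing files."""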
cache_fp = get_cache_path(url)
if os.path.exists(cache_fp):
try:
os.remove(cache_fp)
except Exception:
pass
def get_color_from_string(s: str) -> str:
    """Generate a stable light color code from a string."""
    # hash() is randomized per process for strings, so use a digest to keep
    # colors stable across runs.
    digest = hashlib.md5(s.encode('utf-8')).digest()
    # Take one byte per RGB channel and clamp into 128-255 for light colors
    r, g, b = (max(128, byte) for byte in digest[:3])
    return f"#{r:02X}{g:02X}{b:02X}"
# ---- App helpers moved from app.py for reuse and readability -------------
def filter_jobs(
jobs: List[Dict[str, Any]],
    region: Optional[str] = None,
    keyword: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""Filter jobs by optional region and keyword."""
filtered = jobs
if region:
filtered = [j for j in filtered if j.get("region") == region]
if keyword:
filtered = [j for j in filtered if j.get("keyword") == keyword]
return filtered
def get_job_by_id(job_id):
"""Fetch job details by job ID from the database."""
try:
from web.db import get_all_jobs # lazy import to avoid cycles
for j in get_all_jobs():
if str(j.get("id")) == str(job_id) or str(j.get("job_id")) == str(job_id):
return j
except Exception:
pass
return {}
def make_request_with_retry(url: str, max_retries: Optional[int] = None) -> Optional[str]:
    """Make an HTTP GET request with exponential-backoff retries."""
    if max_retries is None:
        max_retries = get_max_retries()
    headers = {'User-Agent': get_user_agent()}
    for attempt in range(max_retries):
        # Randomized delay that grows exponentially with each retry.
        delay = get_random_delay() * (get_backoff_factor() ** attempt)
        if attempt > 0:
            time.sleep(delay)
        try:
            resp = requests.get(url, headers=headers,
                                timeout=get_request_timeout())
            if resp.status_code in (403, 404, 410):
                # Blocked or gone; retrying will not help.
                return None
            if resp.status_code == 429:
                time.sleep(delay * 3)  # Longer delay for rate limiting
                continue
            if resp.status_code >= 400:
                continue
            return resp.text
        except requests.exceptions.RequestException:
            # Timeouts and connection errors fall through to the next retry.
            continue
    return None