From 23a67d7fe1a41c305b9b6dd1580af0d2cc304896 Mon Sep 17 00:00:00 2001 From: "georg.sinn-schirwitz" Date: Fri, 29 Aug 2025 15:07:58 +0200 Subject: [PATCH] initial project commit --- .vscode/settings.json | 5 + analytics.py | 82 +++ config/keywords.txt | 7 + config/regions.txt | 6 + config/settings.json | 32 ++ main.py | 11 + online.md | 213 +++++++ pytest.ini | 5 + requirements.txt | 9 + setup.py | 53 ++ tests/conftest.py | 7 + tests/test_db_integration.py | 152 +++++ tests/test_users.py | 19 + tests/test_utils_config.py | 18 + web/__init__.py | 0 web/app.py | 457 +++++++++++++++ web/craigslist.py | 147 +++++ web/db.py | 906 ++++++++++++++++++++++++++++++ web/scraper.py | 121 ++++ web/static/index.js | 102 ++++ web/static/settings.js | 61 ++ web/static/styles.css | 144 +++++ web/static/taxonomy.js | 41 ++ web/templates/admin/login.html | 9 + web/templates/admin/taxonomy.html | 142 +++++ web/templates/admin/users.html | 139 +++++ web/templates/base.html | 43 ++ web/templates/index.html | 55 ++ web/templates/job.html | 27 + web/templates/user/settings.html | 84 +++ web/utils.py | 336 +++++++++++ 31 files changed, 3433 insertions(+) create mode 100644 .vscode/settings.json create mode 100644 analytics.py create mode 100644 config/keywords.txt create mode 100644 config/regions.txt create mode 100644 config/settings.json create mode 100644 main.py create mode 100644 online.md create mode 100644 pytest.ini create mode 100644 requirements.txt create mode 100644 setup.py create mode 100644 tests/conftest.py create mode 100644 tests/test_db_integration.py create mode 100644 tests/test_users.py create mode 100644 tests/test_utils_config.py create mode 100644 web/__init__.py create mode 100644 web/app.py create mode 100644 web/craigslist.py create mode 100644 web/db.py create mode 100644 web/scraper.py create mode 100644 web/static/index.js create mode 100644 web/static/settings.js create mode 100644 web/static/styles.css create mode 100644 web/static/taxonomy.js create mode 100644 web/templates/admin/login.html create mode 100644 web/templates/admin/taxonomy.html create mode 100644 web/templates/admin/users.html create mode 100644 web/templates/base.html create mode 100644 web/templates/index.html create mode 100644 web/templates/job.html create mode 100644 web/templates/user/settings.html create mode 100644 web/utils.py diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..d969f96 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,5 @@ +{ + "python.testing.pytestArgs": ["tests"], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true +} diff --git a/analytics.py b/analytics.py new file mode 100644 index 0000000..d17c1ee --- /dev/null +++ b/analytics.py @@ -0,0 +1,82 @@ +import pandas as pd +from sqlalchemy import create_engine, text +from web.utils import get_mysql_config + + +def get_engine(): + cfg = get_mysql_config() + url = f"mysql+pymysql://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/{cfg['database']}?charset=utf8mb4" + return create_engine(url, future=True) + + +def get_all_jobs(): + query = """ +SELECT l.job_id +,l.title +,d.description +,l.region +,l.keyword +,d.company +,l.location +,l.timestamp +,d.posted_time +,l.url +,c.file_path +,c.last_modified +,c.url_guess +,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale +FROM job_listings AS l +INNER JOIN job_descriptions AS d +ON l.job_id = d.job_id +AND l.url = d.url +LEFT JOIN cached_pages AS c ON l.job_id = c.job_id +ORDER BY 
d.posted_time DESC + """ + engine = get_engine() + with engine.begin() as conn: + rows = conn.execute(text(query)).fetchall() + return [ + { + "job_id": row[0], + "title": row[1], + "description": row[2], + "region": row[3], + "keyword": row[4], + "company": row[5], + "location": row[6], + "timestamp": row[7], + "posted_time": row[8], + "url": row[9], + "file_path": row[10], + "last_modified": row[11], + "url_guess": row[12], + "url_guess_stale": row[13], + } + for row in rows + ] + + +def main(): + """Main function to load and display job postings.""" + jobs_df = pd.DataFrame(get_all_jobs()) + + print(jobs_df.head()) + print(f"Total postings: {len(jobs_df)}") + + print("Regions:") + print(jobs_df['region'].value_counts()) + + print("Keywords:") + print(jobs_df['keyword'].value_counts()) + + # print("Sample Job Postings:") + # print("-" * 40) + # for sample in jobs_df[['region', 'keyword', 'title', 'location', 'description']].sample(5).itertuples(): + # print( + # f"Region: {sample.region}, Keyword: {sample.keyword}, Title: {sample.title}, Location: {sample.location}") + # print(sample.description) + # print("-" * 40) + + +if __name__ == "__main__": + main() diff --git a/config/keywords.txt b/config/keywords.txt new file mode 100644 index 0000000..87c5f84 --- /dev/null +++ b/config/keywords.txt @@ -0,0 +1,7 @@ +handyman +damage mitigation +restoration +house manager +house sitter +property manager +live-in \ No newline at end of file diff --git a/config/regions.txt b/config/regions.txt new file mode 100644 index 0000000..f9e6b79 --- /dev/null +++ b/config/regions.txt @@ -0,0 +1,6 @@ +losangeles +sandiego +orangecounty +inlandempire +bakersfield +ventura diff --git a/config/settings.json b/config/settings.json new file mode 100644 index 0000000..880399d --- /dev/null +++ b/config/settings.json @@ -0,0 +1,32 @@ +{ + "database": { + "mysql": { + "host": "192.168.88.37", + "user": "jobs", + "password": "jobdb", + "database": "jobs", + "port": 3306 + } + }, + "http": { + "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:141.0) Gecko/20100101 Firefox/141.0", + "request_timeout": 30, + "max_retries": 3, + "backoff_factor": 2, + "min_delay": 1, + "max_delay": 5 + }, + "paths": { + "cache_dir": "cache", + "logs_dir": "logs" + }, + "scraper": { + "base_url": "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel", + "config_dir": "config" + }, + "users": [ + { "username": "anonymous", "is_admin": false, "password": "" }, + { "username": "admin", "is_admin": true, "password": "M11ffpgm." }, + { "username": "bobby", "is_admin": false, "password": "" } + ] +} diff --git a/main.py b/main.py new file mode 100644 index 0000000..c0dc8ae --- /dev/null +++ b/main.py @@ -0,0 +1,11 @@ +""" +main entry point for the Craigslist scraper +starts webserver +""" + +import web.app as app + + +if __name__ == "__main__": + # start web server + app.main() diff --git a/online.md b/online.md new file mode 100644 index 0000000..3ae9784 --- /dev/null +++ b/online.md @@ -0,0 +1,213 @@ +# Guide to get this online + +Awesome—here’s a concise, battle-tested set of steps to get your Flask app running behind Traefik on Proxmox, with the code living on your NAS via NFS and exposed as `jobs.allucanget.biz`. + +--- + +# 0) DNS & inbound ports + +1. In Cloudflare, add an **A** record + + - **Name:** `jobs` + - **IPv4:** `5.226.148.100` + - Decide one of these: + - **Option A (simpler):** set to **DNS only** (grey cloud). Traefik will get Let’s Encrypt certs itself. 
+ - **Option B:** keep **Proxied** (orange cloud) and install a **Cloudflare Origin Certificate** on Traefik (Full/Strict). + +2. Ensure your router/NAT forwards **80/tcp** and **443/tcp** from WAN to your Traefik box at `192.168.88.10`. + +--- + +# 1) Mount the NAS on the Proxmox host (then bind into the container) + +> This avoids NFS-in-container headaches and makes the directory accessible outside the LXC too. + +On the **Proxmox host** (not inside a container): + +```bash +# Create a mount point for the NAS share +sudo mkdir -p /mnt/nas + +# Install NFS client (Debian-based Proxmox) +sudo apt update && sudo apt install -y nfs-common + +# (Optional) test mount once +sudo mount -t nfs 192.168.88.9:/mnt/HD/HD_a2/NASNFS /mnt/nas + +# Make it permanent in /etc/fstab (add this line): +echo '192.168.88.9:/mnt/HD/HD_a2/NASNFS /mnt/nas nfs defaults,_netdev 0 0' | sudo tee -a /etc/fstab + +# Re-mount from fstab to verify +sudo umount /mnt/nas || true +sudo mount -a + +# Create your app directory on the NAS that’s visible outside the container too +sudo mkdir -p /mnt/nas/jobs-app +sudo chown -R 1000:1000 /mnt/nas/jobs-app # or set to whatever UID/GID you want +``` + +--- + +# 2) Create the LXC on Proxmox + +1. Download a Debian 12 template (UI: **Datacenter → local → CT Templates → Debian 12**) + +2. Create an **unprivileged** LXC (example CTID **201**), with: + + - CPU/RAM as needed (e.g., 2 vCPU / 2–4 GB RAM) + - Root disk: 8–16 GB is fine + - Network: static IP, e.g. `192.168.88.20/24`, GW `192.168.88.1` + - Features: you don’t need NFS inside; we’ll bind-mount from host. + +3. Bind-mount the NAS app dir into the container: + +```bash +# From the Proxmox host: +pct set 201 -mp0 /mnt/nas/jobs-app,mp=/srv/app +``` + +4. Start the container: + +```bash +pct start 201 +``` + +--- + +# 3) Prepare Python + Gunicorn inside the LXC + +```bash +# Inside CT 201 (ssh or pct enter 201) +apt update && apt install -y python3-venv python3-pip build-essential + +# Your code directory is the NAS mount: +cd /srv/app +python3 -m venv .venv +. .venv/bin/activate +pip install --upgrade pip +pip install flask gunicorn +``` + +Minimal Flask app structure (in `/srv/app`): + +``` +/srv/app +├─ app.py +└─ wsgi.py +``` + +`app.py`: + +```python +from flask import Flask +app = Flask(__name__) + +@app.get("/") +def hello(): + return "Hello from jobs.allucanget.biz!" +``` + +`wsgi.py`: + +```python +from app import app + +if __name__ == "__main__": + app.run() +``` + +Test locally (inside CT): + +```bash +. .venv/bin/activate +gunicorn -b 0.0.0.0:8000 wsgi:app +# Visit http://192.168.88.20:8000 from LAN to confirm +``` + +Create a systemd service so it starts on boot: + +```bash +cat >/etc/systemd/system/jobs.service <<'EOF' +[Unit] +Description=Jobs Flask app (gunicorn) +After=network.target + +[Service] +User=root +WorkingDirectory=/srv/app +Environment="PATH=/srv/app/.venv/bin" +ExecStart=/srv/app/.venv/bin/gunicorn -b 0.0.0.0:8000 wsgi:app +Restart=always + +[Install] +WantedBy=multi-user.target +EOF + +systemctl daemon-reload +systemctl enable --now jobs.service +systemctl status jobs.service --no-pager +``` + +--- + +# 4) Wire Traefik to the LXC service + +Assumptions: + +- Traefik runs at `192.168.88.10` +- You have a **file provider** mounted, e.g. `/etc/traefik/dynamic/` + +Create a dynamic config file on the Traefik container host (or inside the Traefik container if that’s where the file provider lives), e.g. 
`/etc/traefik/dynamic/jobs.yml`: + +```yaml +http: + routers: + jobs-router: + rule: "Host(`jobs.allucanget.biz`)" + entryPoints: + - websecure + service: jobs-svc + tls: + # Option A: Let’s Encrypt (Traefik must be set up with a certResolver in static config) + certResolver: letsencrypt + # Option B (Cloudflare proxied + origin cert): omit certResolver and make sure Traefik serves the CF origin cert + + services: + jobs-svc: + loadBalancer: + servers: + - url: "http://192.168.88.20:8000" +``` + +Reload Traefik (or it auto-watches the file). Make sure Traefik’s **static** config has: + +- `entryPoints.websecure.address=:443` +- (If using Let’s Encrypt) a `certResolver` (HTTP-01 or DNS-01) configured. + + - With **Cloudflare “DNS only”**, HTTP-01 is easiest. + - With **Cloudflare proxied**, use a Cloudflare **Origin Certificate** on Traefik (Full/Strict) or use DNS-01. + +Optional (X-Forwarded-For): behind Cloudflare, trust CF IP ranges in Traefik so client IPs are correct. + +--- + +# 5) Quick checks + +- `curl -I http://192.168.88.20:8000` from Traefik host should return `200`. +- `curl -I https://jobs.allucanget.biz` from outside should return `200` and a valid cert. +- Confirm Cloudflare SSL mode: + + - **If DNS only:** Traefik should have a Let’s Encrypt cert. + - **If proxied:** Traefik should present the **CF Origin Cert**, and Cloudflare set to **Full (strict)**. + +--- + +# 6) (Nice to have) Security & hardening + +- Create a non-root user to run the service; adjust `User=` and file permissions. +- Set a firewall rule so port **8000** on the LXC is only reachable from `192.168.88.10` (Traefik) and your admin IPs. +- Keep `/srv/app` on NAS backed up (snapshot on the NAS). + +--- + +That’s it. If you want, tell me which option you picked for Cloudflare (DNS only vs proxied), and I’ll give you the exact Traefik static TLS snippet to match. diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..e15eca8 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,5 @@ +[pytest] +minversion = 7.0 +addopts = -q +testpaths = tests +python_files = test_*.py diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..0dc108d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +beautifulsoup4 +flask +flask-wtf +pandas +pytest +requests +sqlalchemy +pymysql +gunicorn \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..63b5e96 --- /dev/null +++ b/setup.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python +""" +MySQL utilities for Craigslist project. 
+ +Usage (PowerShell): + # Ensure MySQL database and tables + python setup.py mysql-init + + # Show row counts (MySQL only) + python setup.py counts +""" +import sys +from sqlalchemy import create_engine, text +from web.utils import get_mysql_config + +cmd = sys.argv[1] if len(sys.argv) > 1 else "help" + +if cmd == "mysql-init": + cfg = get_mysql_config() + root_url = f"mysql+pymysql://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/" + dbname = cfg["database"] + root_engine = create_engine(root_url, future=True) + with root_engine.begin() as conn: + conn.execute(text( + f"CREATE DATABASE IF NOT EXISTS `{dbname}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")) + mysql_url = f"mysql+pymysql://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/{dbname}?charset=utf8mb4" + mysql_engine = create_engine(mysql_url, future=True) + from web.db import Base + Base.metadata.create_all(mysql_engine) + print("MySQL database and tables ensured") +elif cmd == "counts": + cfg = get_mysql_config() + url = f"mysql+pymysql://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/{cfg['database']}?charset=utf8mb4" + engine = create_engine(url, future=True) + with engine.begin() as conn: + for table in [ + "users", + "regions", + "keywords", + "user_regions", + "user_keywords", + "job_listings", + "job_descriptions", + "cached_pages", + "user_interactions", + ]: + try: + n = conn.execute(text(f"SELECT COUNT(*) FROM {table}")) + print(f"{table}: {list(n)[0][0]}") + except Exception as e: + print(f"{table}: error {e}") +else: + print(__doc__) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..6c2cfb8 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,7 @@ +import sys +from pathlib import Path + +# Ensure project root is on sys.path so `web` package can be imported +root = Path(__file__).resolve().parents[1] +if str(root) not in sys.path: + sys.path.insert(0, str(root)) diff --git a/tests/test_db_integration.py b/tests/test_db_integration.py new file mode 100644 index 0000000..738ff05 --- /dev/null +++ b/tests/test_db_integration.py @@ -0,0 +1,152 @@ +import os +import time +from datetime import datetime, timezone + +import pytest + +import web.db as db +from web.utils import now_iso + + +# Skip this entire test module unless explicitly enabled and DB is reachable +if not os.getenv("RUN_DB_TESTS"): + pytest.skip("Set RUN_DB_TESTS=1 to run MySQL integration tests", + allow_module_level=True) + + +@pytest.fixture(scope="module") +def db_ready(): + try: + db.db_init() + return True + except Exception as e: + pytest.skip(f"MySQL DB not reachable or init failed: {e}") + + +def unique_suffix() -> str: + # Time-based unique suffix for test records + return datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S%f") + + +def test_user_create_and_auth(db_ready): + uname = f"it_user_{unique_suffix()}" + pw = "P@ssw0rd!" 
+ uid = db.create_or_update_user( + uname, password=pw, is_admin=False, is_active=True) + assert isinstance(uid, int) + assert db.verify_user_credentials(uname, pw) is True + assert db.verify_user_credentials(uname, "wrong") is False + + +def test_regions_keywords_upsert_and_list(db_ready): + rname = f"it_region_{unique_suffix()}" + kname = f"it_keyword_{unique_suffix()}" + rid = db.upsert_region(rname) + kid = db.upsert_keyword(kname) + assert isinstance(rid, int) and rid > 0 + assert isinstance(kid, int) and kid > 0 + regions = db.get_all_regions() + keywords = db.get_all_keywords() + assert rname in regions + assert kname in keywords + + +def test_user_preferences_roundtrip(db_ready): + uname = f"it_pref_user_{unique_suffix()}" + db.create_or_update_user(uname, is_active=True) + r1, r2 = f"it_r1_{unique_suffix()}", f"it_r2_{unique_suffix()}" + k1, k2 = f"it_k1_{unique_suffix()}", f"it_k2_{unique_suffix()}" + # Set preferences (upserts regions/keywords internally if missing) + db.set_user_regions(uname, [r1, r2]) + db.set_user_keywords(uname, [k1, k2]) + assert set(db.get_user_regions(uname)) >= {r1, r2} + assert set(db.get_user_keywords(uname)) >= {k1, k2} + + +def test_upsert_listing_details_and_urls(db_ready): + # Create a unique URL + jid_suffix = unique_suffix() + url = f"https://example.org/it/{jid_suffix}.html" + db.upsert_listing( + url=url, + region="it", + keyword="integration", + title=f"IT Listing {jid_suffix}", + pay="N/A", + location="Test City", + timestamp=now_iso(), + ) + job_data = { + "url": url, + "title": f"IT Job {jid_suffix}", + "company": "Acme Corp", + "location": "Test City", + "description": "A test job for integration", + "id": jid_suffix, # normalize_job_id should use this or fall back to URL + "posted_time": now_iso(), + } + db.upsert_job_details(job_data) + urls = db.db_get_all_job_urls() + assert url in urls + # Cleanup (best-effort) + try: + db.db_delete_job(jid_suffix) + except Exception: + pass + + +def test_cached_page_upsert_and_get(db_ready): + jid_suffix = unique_suffix() + url = f"https://example.org/it/{jid_suffix}.html" + # Ensure a listing exists for FK relation if enforced + db.upsert_listing( + url=url, + region="it", + keyword="cache", + title=f"IT Cache {jid_suffix}", + pay="N/A", + location="Test City", + timestamp=now_iso(), + ) + fp = f"/tmp/integration_{jid_suffix}.html" + db.upsert_cached_page( + file_path=fp, + url_guess=url, + last_modified=now_iso(), + size_bytes=123, + job_id=int(jid_suffix) if jid_suffix.isdigit() else None, + ) + row = db.db_get_cache_url(url) + if row is not None: + assert row["url_guess"] == url + # Cleanup + try: + db.remove_cached_page(fp) + db.db_remove_cached_url(url) + db.db_delete_job(jid_suffix) + except Exception: + pass + + +def test_user_interactions_mark_and_visit(db_ready): + uname = f"it_user_{unique_suffix()}" + db.create_or_update_user(uname, is_active=True) + jid_suffix = unique_suffix() + url = f"https://example.org/it/{jid_suffix}.html" + db.upsert_listing( + url=url, + region="it", + keyword="interact", + title=f"IT Interact {jid_suffix}", + pay="N/A", + location="Test City", + timestamp=now_iso(), + ) + # Exercise helpers — absence of exceptions is success for integration + db.mark_favorite(jid_suffix, uname, True) + db.record_visit(jid_suffix, uname, url=url) + # Cleanup + try: + db.db_delete_job(jid_suffix) + except Exception: + pass diff --git a/tests/test_users.py b/tests/test_users.py new file mode 100644 index 0000000..5d0e378 --- /dev/null +++ b/tests/test_users.py @@ -0,0 +1,19 @@ 
+import pytest +from web.db import db_init, create_or_update_user, verify_user_credentials, get_users +from web.utils import initialize_users_from_settings + + +def test_initialize_users_from_settings(): + db_init() + n = initialize_users_from_settings() + assert n >= 1 # should at least add 'anonymous' + users = get_users() + assert any(u['username'] == 'anonymous' for u in users) + + +def test_create_and_auth_user(): + db_init() + create_or_update_user('testuser', password='secret', + is_admin=True, is_active=True) + assert verify_user_credentials('testuser', 'secret') is True + assert verify_user_credentials('testuser', 'wrong') is False diff --git a/tests/test_utils_config.py b/tests/test_utils_config.py new file mode 100644 index 0000000..d913a64 --- /dev/null +++ b/tests/test_utils_config.py @@ -0,0 +1,18 @@ +import importlib +import os + +import web.utils as utils + + +def test_config_loaded(): + cfg = utils.get_config() + assert isinstance(cfg, dict) + + +def test_http_settings_helpers(): + assert isinstance(utils.get_user_agent(), str) + assert isinstance(utils.get_request_timeout(), int) + assert isinstance(utils.get_max_retries(), int) + assert isinstance(utils.get_backoff_factor(), int) + assert isinstance(utils.get_min_delay(), int) + assert isinstance(utils.get_max_delay(), int) diff --git a/web/__init__.py b/web/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/web/app.py b/web/app.py new file mode 100644 index 0000000..e69a789 --- /dev/null +++ b/web/app.py @@ -0,0 +1,457 @@ +import os +from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash +from flask_wtf import CSRFProtect +from typing import Dict, List + +from web.craigslist import scraper +from web.db import ( + db_init, + get_all_jobs, + mark_favorite, + record_visit, + get_users, + create_or_update_user, + verify_user_credentials, + get_user, + get_user_regions, + get_user_keywords, + set_user_regions, + set_user_keywords, + get_all_regions, + get_all_keywords, + upsert_region, + upsert_keyword, + list_regions_full, + list_keywords_full, + rename_region, + rename_keyword, + change_region_color, + change_keyword_color +) +from web.utils import ( + initialize_users_from_settings, + filter_jobs, + get_job_by_id, +) +from web.db import get_all_regions, get_all_keywords + +app = Flask(__name__) +app.secret_key = os.environ.get("FLASK_SECRET", "dev-secret-change-me") +# serve static files from the "static" directory +app.static_folder = "static" + +# Enable CSRF protection for all modifying requests (POST/PUT/PATCH/DELETE) +csrf = CSRFProtect(app) + + +def require_admin(): + username = session.get('username') + if not username: + return False + try: + u = get_user(username) + return bool(u and u.get('is_admin') and u.get('is_active')) + except Exception: + return False + + +def require_login(): + return bool(session.get('username')) + + +@app.context_processor +def inject_user_context(): + username = session.get('username') + u = None + if username: + try: + u = get_user(username) + except Exception: + u = None + return { + 'username': username, + 'current_user': type('U', (), u)() if isinstance(u, dict) else None, + } + + +def build_region_palette() -> Dict[str, Dict[str, str]]: + """Return region metadata dict {region: {name, color}} from jobs or DB.""" + regions = get_all_regions() + region_dict: Dict[str, Dict[str, str]] = {} + for region in regions: + name = region.get('name', '') + color = region.get('color', '') + region_dict[name] = {"name": name, "color": 
color}
+    return region_dict
+
+
+def build_keyword_palette() -> Dict[str, Dict[str, str]]:
+    """Return keyword metadata dict {keyword: {name, color}} from jobs or DB."""
+    keywords = get_all_keywords()
+    keyword_dict: Dict[str, Dict[str, str]] = {}
+    for keyword in keywords:
+        name = keyword.get('name', '').replace(' ', '').lower()
+        color = keyword.get('color', '')
+        keyword_dict[name] = {"name": name, "color": color}
+    return keyword_dict
+
+
+@app.route('/', methods=['GET'])
+def index():
+    title = "Bobby Job Listings"
+    all_jobs = get_all_jobs()
+    # Apply user preference filters if no explicit filters provided
+    selected_region = request.args.get("region")
+    selected_keyword = request.args.get("keyword")
+    if not selected_region and session.get('username'):
+        try:
+            prefs = get_user_regions(session['username'])
+            if prefs:
+                # If user has region prefs, filter to them by default
+                all_jobs = [j for j in all_jobs if j.get('region') in set(prefs)]
+        except Exception:
+            pass
+    if not selected_keyword and session.get('username'):
+        try:
+            prefs = get_user_keywords(session['username'])
+            if prefs:
+                all_jobs = [j for j in all_jobs if j.get('keyword') in set(prefs)]
+        except Exception:
+            pass
+    filtered_jobs = filter_jobs(all_jobs, selected_region, selected_keyword)
+
+    return render_template(
+        "index.html",
+        jobs=filtered_jobs,
+        title=title,
+        regions=build_region_palette(),
+        keywords=build_keyword_palette(),
+        selected_region=selected_region,
+        selected_keyword=selected_keyword,
+    )
+
+
+@app.route('/regions', methods=['GET'])
+def regions():
+    # Prefer user's preferred regions; fall back to all DB regions
+    items: List[Dict[str, str]] = []
+    if session.get('username'):
+        try:
+            items = get_user_regions(session['username'])
+        except Exception:
+            items = []
+    if not items:
+        items = get_all_regions()
+    return jsonify(items)
+
+
+@app.route('/keywords', methods=['GET'])
+def keywords():
+    # Prefer user's preferred keywords; fall back to all DB keywords
+    items: List[Dict[str, str]] = []
+    if session.get('username'):
+        try:
+            items = get_user_keywords(session['username'])
+        except Exception:
+            items = []
+    if not items:
+        items = get_all_keywords()
+    keyword_dict = {}
+    for kw in items:
+        key = kw['name'].replace(' ', '').lower()
+        keyword_dict[key] = {
+            "name": kw['name'],
+            "color": kw['color']
+        }
+    return jsonify(keyword_dict)
+
+
+@app.route('/jobs', methods=['GET'])
+def jobs():
+    all_jobs = get_all_jobs()
+    # Respect user preferences when no explicit filters provided
+    region = request.args.get("region")
+    keyword = request.args.get("keyword")
+    if not region and session.get('username'):
+        try:
+            prefs = get_user_regions(session['username'])
+            if prefs:
+                all_jobs = [j for j in all_jobs if j.get('region') in set(prefs)]
+        except Exception:
+            pass
+    if not keyword and session.get('username'):
+        try:
+            prefs = get_user_keywords(session['username'])
+            if prefs:
+                all_jobs = [j for j in all_jobs if j.get('keyword') in set(prefs)]
+        except Exception:
+            pass
+    return jsonify(filter_jobs(all_jobs, region, keyword))
+
+
+@app.route('/job_details', methods=['GET'])
+def job_details():
+    jobs = get_all_jobs()
+    # Apply preference filtering if present
+    if session.get('username'):
+        try:
+            r = set(get_user_regions(session['username']))
+            k = set(get_user_keywords(session['username']))
+            if r:
+                jobs = [j for j in jobs if j.get('region') in r]
+            if k:
+                jobs = [j for j in jobs if j.get('keyword') in k]
+        except Exception:
+            pass
+    return jsonify(jobs)
+
+
+@app.route('/job/<job_id>',
methods=['GET'])
+def job_by_id(job_id):
+    job = get_job_by_id(job_id)
+    if job:
+        # Record a visit for this user (query param or header), default to 'anonymous'
+        username = request.args.get("username") or request.headers.get("X-Username") or "anonymous"
+        try:
+            record_visit(str(job.get('id') or job_id), username=username, url=job.get('url'))
+        except Exception:
+            # Non-fatal if visit logging fails
+            pass
+        title = f"Job Details | {job.get('title', 'Unknown')} | ID {job.get('id', '')}"
+        return render_template('job.html', job=job, title=title)
+    return jsonify({"error": "Job not found"}), 404
+
+
+@app.route('/jobs/<job_id>/favorite', methods=['POST'])
+def set_favorite(job_id):
+    """Mark or unmark a job as favorite for a given user.
+
+    Expects JSON: { "username": "alice", "favorite": true }
+    If username is omitted, falls back to 'anonymous'.
+    """
+    data = request.get_json(silent=True) or {}
+    username = data.get("username") or request.headers.get("X-Username") or "anonymous"
+    favorite = bool(data.get("favorite", True))
+    try:
+        mark_favorite(str(job_id), username=username, favorite=favorite)
+        return jsonify({"status": "ok", "job_id": str(job_id), "username": username, "favorite": favorite})
+    except Exception as e:
+        return jsonify({"status": "error", "message": str(e)}), 400
+
+
+# Exempt JSON favorite endpoint from CSRF (uses fetch without token). Consider
+# adding a token header client-side and removing this exemption later.
+csrf.exempt(set_favorite)
+
+
+@app.route('/scrape', methods=['GET'])
+def scrape():
+    """Trigger the web scraping process."""
+    # Run the full scraper orchestration (fetch listings, sync cache, process jobs)
+    scraper()
+    return jsonify({"status": "Scraping completed"})
+
+
+# ---------------- Auth & Admin UI ------------------------------------------
+
+@app.route('/login', methods=['GET', 'POST'])
+def login():
+    if request.method == 'POST':
+        username = (request.form.get('username') or '').strip()
+        password = request.form.get('password') or ''
+        if verify_user_credentials(username, password) or username:
+            session['username'] = username
+            flash('Logged in')
+            return redirect(url_for('admin_users'))
+        flash('Invalid credentials')
+    return render_template('admin/login.html', title='Login')
+
+
+@app.route('/logout')
+def logout():
+    session.pop('username', None)
+    flash('Logged out')
+    return redirect(url_for('login'))
+
+
+@app.route('/admin/users', methods=['GET', 'POST'])
+def admin_users():
+    if not require_admin():
+        return redirect(url_for('login'))
+    if request.method == 'POST':
+        data = request.form
+        username = (data.get('username') or '').strip()
+        password = data.get('password') or None
+        is_admin = bool(data.get('is_admin'))
+        is_active = bool(data.get('is_active')) if data.get('is_active') is not None else True
+        try:
+            create_or_update_user(username, password=password, is_admin=is_admin, is_active=is_active)
+            flash('User saved')
+        except Exception as e:
+            flash(f'Error: {e}')
+        return redirect(url_for('admin_users'))
+    users = get_users()
+    # Convert dicts to SimpleNamespace-like for template dot access
+
+    class UObj(dict):
+        __getattr__ = dict.get
+    users = [UObj(u) for u in users]
+    return render_template('admin/users.html', users=users, title='Users')
+
+
+# ---------------- User settings (regions/keywords) -------------------------
+
+@app.route('/settings', methods=['GET', 'POST'])
+def user_settings():
+    if not require_login():
+        return redirect(url_for('login'))
+    username = session['username']
+    if request.method == 'POST':
+ # Accept JSON or form posts. Normalize singular/plural names. + sel_regions: list[str] = [] + sel_keywords: list[str] = [] + if request.is_json: + data = request.get_json(silent=True) or {} + sel_regions = [ + (v or '').strip() for v in (data.get('regions') or []) if v and (v or '').strip() + ] + sel_keywords = [ + (v or '').strip() for v in (data.get('keywords') or []) if v and (v or '').strip() + ] + else: + # HTML form fallback: support names 'regions' or 'region', 'keywords' or 'keyword' + r_vals = request.form.getlist( + 'regions') + request.form.getlist('region') + k_vals = request.form.getlist( + 'keywords') + request.form.getlist('keyword') + sel_regions = [(v or '').strip() + for v in r_vals if v and (v or '').strip()] + sel_keywords = [(v or '').strip() + for v in k_vals if v and (v or '').strip()] + # Upsert any new values into master lists + for r in sel_regions: + try: + upsert_region(r) + except Exception: + pass + for k in sel_keywords: + try: + upsert_keyword(k) + except Exception: + pass + try: + set_user_regions(username, sel_regions) + set_user_keywords(username, sel_keywords) + # For JSON callers, return 200 without redirect + if request.is_json: + return jsonify({"status": "ok"}) + flash('Preferences saved') + except Exception as e: + if request.is_json: + return jsonify({"status": "error", "message": str(e)}), 400 + flash(f'Error saving preferences: {e}') + return redirect(url_for('user_settings')) + # GET: render with current selections and all master items + all_regions = get_all_regions() + all_keywords = get_all_keywords() + user_regions = get_user_regions(username) + user_keywords = get_user_keywords(username) + return render_template( + 'user/settings.html', + title='Your Preferences', + all_regions=all_regions, + all_keywords=all_keywords, + user_regions=user_regions, + user_keywords=user_keywords, + ) + + +@app.route('/admin/taxonomy', methods=['GET', 'POST']) +def admin_taxonomy(): + if not require_admin(): + return redirect(url_for('login')) + if request.method == 'POST': + action = request.form.get('action') + try: + if action == 'add_region': + name = (request.form.get('region_name') or '').strip() + if name: + upsert_region(name) + flash('Region added') + elif action == 'add_keyword': + name = (request.form.get('keyword_name') or '').strip() + if name: + upsert_keyword(name) + flash('Keyword added') + elif action == 'rename_region': + rid = int(request.form.get('region_id') or 0) + new_name = (request.form.get('new_region_name') or '').strip() + if rid and new_name: + if rename_region(rid, new_name): + flash('Region renamed') + else: + flash('Failed to rename region') + elif action == 'rename_keyword': + kid = int(request.form.get('keyword_id') or 0) + new_name = (request.form.get('new_keyword_name') or '').strip() + if kid and new_name: + if rename_keyword(kid, new_name): + flash('Keyword renamed') + else: + flash('Failed to rename keyword') + elif action == 'change_region_color': + rid = int(request.form.get('region_id') or 0) + new_color = (request.form.get( + 'new_region_color') or '').strip() + if rid and new_color: + if change_region_color(rid, new_color): + flash('Region color changed') + else: + flash('Failed to change region color') + elif action == 'change_keyword_color': + kid = int(request.form.get('keyword_id') or 0) + new_color = (request.form.get( + 'new_keyword_color') or '').strip() + if kid and new_color: + if change_keyword_color(kid, new_color): + flash('Keyword color changed') + else: + flash('Failed to change keyword color') + 
except Exception as e: + flash(f'Error: {e}') + return redirect(url_for('admin_taxonomy')) + regions = list_regions_full() + keywords = list_keywords_full() + # Dict-like access in templates + + class O(dict): + __getattr__ = dict.get + regions = [O(r) for r in regions] + keywords = [O(k) for k in keywords] + return render_template('admin/taxonomy.html', title='Taxonomy', regions=regions, keywords=keywords) + + +def main(): + """Main function to run the Flask app.""" + # Ensure DB is initialized + db_init() + # Seed users from settings.json (idempotent) + try: + initialize_users_from_settings() + except Exception: + pass + app.run(debug=True, host='127.0.0.1', port=5000) + + +if __name__ == "__main__": + main() diff --git a/web/craigslist.py b/web/craigslist.py new file mode 100644 index 0000000..7446366 --- /dev/null +++ b/web/craigslist.py @@ -0,0 +1,147 @@ +from datetime import datetime, timezone +from web.scraper import process_region_keyword, scrape_job_page +from web.db import ( + db_init, + upsert_cached_page, + upsert_listing, + upsert_job_details, + url_to_job_id, + upsert_user_interaction, + db_remove_cached_url, + db_sync_cached_pages, + db_get_all_job_urls, + db_get_cache_url, + db_delete_job, + remove_job, + normalize_cached_page_paths, +) + +# Import utility functions +from web.utils import ( + get_cache_dir, + make_request_with_retry, + now_iso, + get_cache_path, + cache_page, + is_cache_stale, + delete_cached_page, + get_cached_content, + ensure_cache_dir +) +from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings + + +def fetch_listings(): + """Fetch job listings from all regions and keywords.""" + # We'll collect URLs discovered in this run and then remove any DB listings + # not present in this set (treat DB as reflecting current search results). + existing_db_urls = set(db_get_all_job_urls()) + discovered_urls = set() + new_rows = [] + + # Ensure regions/keywords master lists exist + try: + seed_regions_keywords_from_listings() + except Exception: + pass + + # Fetch listings for each region/keyword from DB + for region in get_all_regions(): + region_name = region.get("name") + if not region_name: + continue + for keyword in get_all_keywords(): + keyword_name = keyword.get("name") + if not keyword_name: + continue + for row in process_region_keyword(region_name, keyword_name, discovered_urls): + timestamp, region, keyword, title, pay, location, url = row + discovered_urls.add(url) + if url not in existing_db_urls: + new_rows.append(row) + # Upsert or update listing to reflect current search result + upsert_listing( + url=url, + region=region, + keyword=keyword, + title=title, + pay=pay, + location=location, + timestamp=timestamp, + ) + + # Remove stale listings: those present in DB but not discovered now. 
+ stale_urls = existing_db_urls - discovered_urls + for url in stale_urls: + try: + jid = url_to_job_id(url) + db_delete_job(jid) + # Also try to remove cached file and its metadata + delete_cached_page(url) + db_remove_cached_url(url) + except Exception: + pass + + return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)} + + +def process_job_url(job_url: str): + try: + job_id = url_to_job_id(job_url) + content = None + cached_page = db_get_cache_url(job_url) + if cached_page: + last_modified = cached_page.get("last_modified") + if last_modified and not is_cache_stale(last_modified): + content = get_cached_content(job_url) + else: + content = make_request_with_retry(job_url, 1) + else: + content = make_request_with_retry(job_url, 1) + + if content is None: + remove_job(job_url) + return None + + # refresh cache and details + cache_page(job_url, content) + upsert_cached_page( + file_path=get_cache_path(job_url), + url_guess=job_url, + last_modified=now_iso(), + size_bytes=len(content), + job_id=job_id + ) + job_data = scrape_job_page(content, job_url) + if job_data: + upsert_job_details(job_data) + upsert_user_interaction( + job_id, seen_at=datetime.now(timezone.utc).isoformat()) + return job_data + return None + except Exception: + return None + + +def scraper(): + """Main function to run the scraper.""" + ensure_cache_dir() + db_init() + # First, fetch current listings from search pages and make DB reflect them. + jl = fetch_listings() + + # Sync any cached files we have on disk into the cached_pages table. + db_sync_cached_pages(get_cache_dir()) + + # Normalize any relative cached file paths to absolute paths in DB + normalized = normalize_cached_page_paths() + if normalized: + pass + + # Finally, fetch and refresh individual job pages for current listings + for url in db_get_all_job_urls(): + process_job_url(url) + + +if __name__ == "__main__": + scraper() diff --git a/web/db.py b/web/db.py new file mode 100644 index 0000000..04f709f --- /dev/null +++ b/web/db.py @@ -0,0 +1,906 @@ +from __future__ import annotations + +"""MySQL persistence layer for Craigslist scraper (SQLAlchemy ORM only). 
+ +Tables: + - users(user_id PK, username UNIQUE, created_at) + - cached_pages(file_path PK, url_guess, last_modified, size_bytes, job_id) + - job_listings(job_id PK, url UNIQUE, region, keyword, title, pay, location, timestamp) + - job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url) + - user_interactions(job_id PK FK -> job_listings, user_id FK -> users, seen_at, url_visited, is_user_favorite) + - regions(region_id PK, name UNIQUE) + - keywords(keyword_id PK, name UNIQUE) + - user_regions(user_id FK -> users, region_id FK -> regions, composite PK) + - user_keywords(user_id FK -> users, keyword_id FK -> keywords, composite PK) +""" + +from datetime import datetime, UTC +import os +from typing import Optional, Dict, Any, List +from web.utils import ( + get_url_from_filename, + get_color_from_string, + url_to_job_id, + normalize_job_id, + now_iso, + get_cache_path, + get_mysql_config, +) + +# --- SQLAlchemy setup ------------------------------------------------------- +from sqlalchemy import ( + create_engine, + Column, + String, + Integer, + Text, + DateTime, + Boolean, + ForeignKey, + text, +) +from sqlalchemy.orm import declarative_base, relationship, sessionmaker, Session +from werkzeug.security import generate_password_hash, check_password_hash + +from typing import cast + +engine = None # set in db_init() +SessionLocal: Optional[sessionmaker] = None +Base = declarative_base() + +# Length constants for MySQL compatibility +JOB_ID_LEN = 64 +URL_LEN = 512 +FILE_PATH_LEN = 512 +TITLE_LEN = 512 +SHORT_LEN = 255 +TIME_LEN = 64 + + +# --- ORM Models -------------------------------------------------------------- +class User(Base): + __tablename__ = "users" + user_id = Column(Integer, primary_key=True, autoincrement=True) + username = Column(String(SHORT_LEN), unique=True, nullable=False) + created_at = Column(DateTime, default=datetime.utcnow, nullable=False) + password_hash = Column(String(SHORT_LEN)) + is_admin = Column(Boolean, default=False, nullable=False) + is_active = Column(Boolean, default=True, nullable=False) + last_login = Column(DateTime, nullable=True) + + interactions = relationship( + "UserInteraction", back_populates="user", cascade="all, delete-orphan") + + +class JobListing(Base): + __tablename__ = "job_listings" + job_id = Column(String(JOB_ID_LEN), primary_key=True) + url = Column(String(URL_LEN), unique=True) + region = Column(String(SHORT_LEN)) + keyword = Column(String(SHORT_LEN)) + title = Column(String(TITLE_LEN)) + pay = Column(String(SHORT_LEN)) + location = Column(String(SHORT_LEN)) + timestamp = Column(String(TIME_LEN)) + + description = relationship( + "JobDescription", back_populates="listing", uselist=False, cascade="all, delete-orphan") + cached_pages = relationship( + "CachedPage", back_populates="listing", cascade="all, delete-orphan") + interactions = relationship( + "UserInteraction", back_populates="listing", cascade="all, delete-orphan") + + +class JobDescription(Base): + __tablename__ = "job_descriptions" + job_id = Column(String(JOB_ID_LEN), ForeignKey("job_listings.job_id", + ondelete="CASCADE"), primary_key=True) + title = Column(String(TITLE_LEN)) + company = Column(String(SHORT_LEN)) + location = Column(String(SHORT_LEN)) + description = Column(Text) + posted_time = Column(String(TIME_LEN)) + url = Column(String(URL_LEN)) + + listing = relationship("JobListing", back_populates="description") + + +class CachedPage(Base): + __tablename__ = "cached_pages" + file_path = Column(String(FILE_PATH_LEN), 
primary_key=True) + url_guess = Column(String(URL_LEN)) + last_modified = Column(String(TIME_LEN)) + size_bytes = Column(Integer) + job_id = Column(String(JOB_ID_LEN), ForeignKey( + "job_listings.job_id", ondelete="CASCADE")) + + listing = relationship("JobListing", back_populates="cached_pages") + + +class UserInteraction(Base): + __tablename__ = "user_interactions" + # composite uniqueness on (user_id, job_id) + job_id = Column(String(JOB_ID_LEN), ForeignKey("job_listings.job_id", + ondelete="CASCADE"), primary_key=True) + user_id = Column(Integer, ForeignKey( + "users.user_id", ondelete="CASCADE"), primary_key=True) + seen_at = Column(String(TIME_LEN)) + url_visited = Column(String(URL_LEN)) + is_user_favorite = Column(Boolean, default=False) + + user = relationship("User", back_populates="interactions") + listing = relationship("JobListing", back_populates="interactions") + + +# --- New preference models: regions, keywords, and user mappings ---------- +class Region(Base): + __tablename__ = "regions" + region_id = Column(Integer, primary_key=True, autoincrement=True) + name = Column(String(SHORT_LEN), unique=True, nullable=False) + color = Column(String(SHORT_LEN), nullable=True) + + +class Keyword(Base): + __tablename__ = "keywords" + keyword_id = Column(Integer, primary_key=True, autoincrement=True) + name = Column(String(SHORT_LEN), unique=True, nullable=False) + color = Column(String(SHORT_LEN), nullable=True) + + +class UserRegion(Base): + __tablename__ = "user_regions" + user_id = Column(Integer, ForeignKey( + "users.user_id", ondelete="CASCADE"), primary_key=True) + region_id = Column(Integer, ForeignKey( + "regions.region_id", ondelete="CASCADE"), primary_key=True) + + +class UserKeyword(Base): + __tablename__ = "user_keywords" + user_id = Column(Integer, ForeignKey( + "users.user_id", ondelete="CASCADE"), primary_key=True) + keyword_id = Column(Integer, ForeignKey( + "keywords.keyword_id", ondelete="CASCADE"), primary_key=True) + + +def _ensure_session() -> Session: + global engine, SessionLocal + if engine is None or SessionLocal is None: + db_init() + assert SessionLocal is not None + return cast(Session, SessionLocal()) + + +def db_init(): + """Initialize MySQL database and create tables if needed.""" + global engine, SessionLocal + cfg = get_mysql_config() + # Create database if it doesn't exist + root_url = f"mysql+pymysql://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/" + dbname = cfg["database"] + root_engine = create_engine(root_url, future=True) + with root_engine.begin() as conn: + conn.execute(text( + f"CREATE DATABASE IF NOT EXISTS `{dbname}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci")) + # Create tables in target DB + mysql_url = f"mysql+pymysql://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/{dbname}?charset=utf8mb4" + engine = create_engine(mysql_url, future=True) + SessionLocal = sessionmaker(bind=engine, autoflush=False, + autocommit=False, future=True) + Base.metadata.create_all(engine) + # Ensure new auth columns exist for existing databases (MySQL/MariaDB support IF NOT EXISTS) + with engine.begin() as conn: + try: + conn.execute(text( + "ALTER TABLE users ADD COLUMN IF NOT EXISTS password_hash VARCHAR(255) NULL")) + except Exception: + pass + try: + conn.execute(text( + "ALTER TABLE users ADD COLUMN IF NOT EXISTS is_admin TINYINT(1) NOT NULL DEFAULT 0")) + except Exception: + pass + try: + conn.execute(text( + "ALTER TABLE users ADD COLUMN IF NOT EXISTS is_active TINYINT(1) NOT NULL DEFAULT 1")) + except Exception: + 
pass + try: + conn.execute( + text("ALTER TABLE users ADD COLUMN IF NOT EXISTS last_login DATETIME NULL")) + except Exception: + pass + + +def upsert_user_interaction(job_id: str | int, *, user_id: Optional[int] = None, seen_at: Optional[str] = None, url_visited: Optional[str] = None, is_user_favorite: Optional[bool] = None): + """Upsert a single interaction row for this job. + Any provided field will be updated; absent fields keep their current value. + """ + if user_id is None: + user_id = get_or_create_user("anonymous") + job_id_str = str(job_id) + with _ensure_session() as session: + ui = session.get(UserInteraction, { + "job_id": job_id_str, "user_id": int(user_id)}) + if ui is None: + ui = UserInteraction(job_id=job_id_str, user_id=int(user_id)) + session.add(ui) + if seen_at is not None: + setattr(ui, "seen_at", seen_at) + if url_visited is not None: + setattr(ui, "url_visited", url_visited) + if is_user_favorite is not None: + setattr(ui, "is_user_favorite", bool(is_user_favorite)) + session.commit() + + +def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str, location: str, timestamp: str): + """Insert or update a job listing row based on job_id derived from URL.""" + job_id = str(url_to_job_id(url)) + with _ensure_session() as session: + obj = session.get(JobListing, job_id) + if obj is None: + obj = JobListing(job_id=job_id) + session.add(obj) + setattr(obj, "url", url) + setattr(obj, "region", region) + setattr(obj, "keyword", keyword) + setattr(obj, "title", title) + setattr(obj, "pay", pay) + setattr(obj, "location", location) + setattr(obj, "timestamp", timestamp) + session.commit() + + +def upsert_job_details(job_data: Dict[str, Any]): + """Upsert into job_descriptions table using scraped job details dict.""" + url = job_data.get("url") + job_id = normalize_job_id(job_data.get("id"), url) + if not job_id: + return + title = job_data.get("title") or None + company = job_data.get("company") or None + location = job_data.get("location") or None + description = job_data.get("description") or None + posted_time = job_data.get("posted_time") or None + + job_id = str(job_id) + with _ensure_session() as session: + obj = session.get(JobDescription, job_id) + if obj is None: + obj = JobDescription(job_id=job_id) + session.add(obj) + setattr(obj, "title", title) + setattr(obj, "company", company) + setattr(obj, "location", location) + setattr(obj, "description", description) + setattr(obj, "posted_time", posted_time) + setattr(obj, "url", url) + session.commit() + + +def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modified: Optional[str], size_bytes: Optional[int], job_id: Optional[int]): + # Always store absolute paths + abs_fp = os.path.abspath(file_path) + with _ensure_session() as session: + obj = session.get(CachedPage, abs_fp) + if obj is None: + obj = CachedPage(file_path=abs_fp) + session.add(obj) + setattr(obj, "url_guess", url_guess) + setattr(obj, "last_modified", last_modified) + setattr(obj, "size_bytes", size_bytes) + setattr(obj, "job_id", str(job_id) if job_id else None) + session.commit() + + +def remove_cached_page(file_path: str): + # Accept either relative or absolute; remove both variants just in case + abs_fp = os.path.abspath(file_path) + with _ensure_session() as session: + obj = session.get(CachedPage, abs_fp) + if obj: + session.delete(obj) + session.commit() + + +def db_remove_cached_url(url: str): + """Remove a cached page by URL.""" + abs_fp = get_cache_path(url) + try: + remove_cached_page(abs_fp) + except 
Exception: + pass + + +def db_get_all_cached_pages() -> List[Dict[str, Any]]: + with _ensure_session() as session: + rows = session.execute(text( + "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall() + return [ + { + "file_path": row[0], + "url_guess": row[1], + "last_modified": row[2], + "size_bytes": row[3], + "job_id": row[4], + } + for row in rows + ] + + +def db_get_cache_url(url: str): + """Return the data for a specific URL from cached_pages. + + Arguments: + url -- The URL to look up in the cache. + """ + with _ensure_session() as session: + row = session.execute(text( + "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"), {"u": url}).fetchone() + if not row: + return None + return { + "file_path": row[0], + "url_guess": row[1], + "last_modified": row[2], + "size_bytes": row[3], + "job_id": row[4], + } + + +def db_sync_cached_pages(cache_dir: str): + """Scan cache_dir and upsert page metadata into cached_pages table.""" + if not os.path.isdir(cache_dir): + return + db_cache = db_get_all_cached_pages() + for root, _, files in os.walk(cache_dir): + for name in files: + if not name.lower().endswith(".html"): + continue + fp = os.path.abspath(os.path.join(root, name)) + if fp in [c["file_path"] for c in db_cache]: + continue + + try: + stat = os.stat(fp) + mtime = datetime.fromtimestamp(stat.st_mtime).isoformat() + size = stat.st_size + except OSError: + mtime = None + size = None + url_guess = get_url_from_filename(name) + job_id = url_to_job_id(url_guess) + upsert_cached_page(file_path=fp, url_guess=url_guess, + last_modified=mtime, size_bytes=size, job_id=job_id) + + +def normalize_cached_page_paths() -> int: + """Ensure all cached_pages.file_path values are absolute. 
Returns number of rows updated/normalized.""" + changed = 0 + with _ensure_session() as session: + rows = session.execute(text( + "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall() + for (fp, url_guess, last_modified, size_bytes, job_id) in rows: + if not os.path.isabs(fp): + abs_fp = os.path.abspath(fp) + # Upsert under absolute path, then remove the relative entry + upsert_cached_page( + file_path=abs_fp, + url_guess=url_guess, + last_modified=last_modified, + size_bytes=size_bytes, + job_id=job_id, + ) + with _ensure_session() as session: + session.execute( + text("DELETE FROM cached_pages WHERE file_path = :fp"), {"fp": fp}) + session.commit() + changed += 1 + return changed + + +def db_get_keywords() -> List[str]: + """Return a list of all unique keywords from job listings.""" + with _ensure_session() as session: + rows = session.execute( + text("SELECT DISTINCT keyword FROM job_listings")).fetchall() + return [r[0] for r in rows] + + +def db_get_regions() -> List[str]: + """Return a list of all unique regions from job listings.""" + with _ensure_session() as session: + rows = session.execute( + text("SELECT DISTINCT region FROM job_listings")).fetchall() + return [r[0] for r in rows] + + +def get_all_jobs(): + query = """ +SELECT l.job_id +,l.title +,d.description +,l.region +,l.keyword +,d.company +,l.location +,l.timestamp +,d.posted_time +,l.url +,c.file_path +,c.last_modified +,c.url_guess +,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale +FROM job_listings AS l +INNER JOIN job_descriptions AS d +ON l.job_id = d.job_id +AND l.url = d.url +LEFT JOIN cached_pages AS c ON l.job_id = c.job_id +ORDER BY d.posted_time DESC + """ + with _ensure_session() as session: + rows = session.execute(text(query)).fetchall() + jobs = [] + for row in rows: + job = { + "id": row[0], + "title": row[1], + "description": row[2].replace('\n', '
').strip(), + "region": row[3], + "keyword": row[4], + "company": row[5], + "location": row[6], + "timestamp": row[7], + "posted_time": row[8], + "url": row[9], + "file_path": row[10], + "last_modified": row[11], + "url_guess": row[12], + "url_guess_stale": row[13], + } + jobs.append(job) + return jobs + + +def db_get_all_job_urls() -> List[str]: + """Return list of job URLs from job_listings.""" + with _ensure_session() as session: + rows = session.execute(text("SELECT url FROM job_listings")).fetchall() + return [r[0] for r in rows] + + +def db_delete_job(job_id: str | int): + """Delete a job row (cascades to details and interactions).""" + jid = str(job_id) + with _ensure_session() as session: + obj = session.get(JobListing, jid) + if obj: + session.delete(obj) + session.commit() + + +def remove_job(url): + """Remove a job from the database.""" + try: + jid = url_to_job_id(url) + db_delete_job(jid) + cache_fp = get_cache_path(url) + remove_cached_page(os.path.abspath(cache_fp)) + if os.path.exists(cache_fp): + os.remove(cache_fp) + except Exception: + pass + + +# ---------------- New ORM convenience helpers ------------------------------ + +def get_or_create_user(username: str) -> int: + """Return user_id for username, creating if missing.""" + created_at = datetime.now(UTC).isoformat() + with _ensure_session() as session: + row = session.execute( + text("SELECT user_id FROM users WHERE username = :u"), { + "u": username} + ).fetchone() + if row: + return int(row[0]) + session.execute( + text("INSERT INTO users(username, created_at) VALUES(:u, :c)"), + {"u": username, "c": created_at}, + ) + session.commit() + # open a new session to fetch the id + with _ensure_session() as session: + row2 = session.execute( + text("SELECT user_id FROM users WHERE username = :u"), { + "u": username} + ).fetchone() + if row2: + return int(row2[0]) + # Edge case retry + return get_or_create_user(username) + + +def mark_favorite(job_id: str | int, username: str, favorite: bool = True): + user_id = get_or_create_user(username) + upsert_user_interaction(job_id, user_id=user_id, is_user_favorite=favorite) + + +def record_visit(job_id: str | int, username: str, url: Optional[str] = None): + user_id = get_or_create_user(username) + ts = now_iso() + upsert_user_interaction(job_id, user_id=user_id, + seen_at=ts, url_visited=url) + + +# ---------------- User auth/admin helpers ---------------------------------- +def create_or_update_user(username: str, password: Optional[str] = None, *, is_admin: Optional[bool] = None, is_active: Optional[bool] = None) -> int: + """Create user if missing; update password/admin/active if provided. 
Returns user_id.""" + username = (username or "").strip() + if not username: + raise ValueError("username required") + uid = get_or_create_user(username) + with _ensure_session() as session: + # Build dynamic update + fields = [] + params: Dict[str, Any] = {"u": uid} + if password is not None: + fields.append("password_hash = :ph") + params["ph"] = generate_password_hash(password) + if is_admin is not None: + fields.append("is_admin = :ia") + params["ia"] = 1 if is_admin else 0 + if is_active is not None: + fields.append("is_active = :ac") + params["ac"] = 1 if is_active else 0 + if fields: + q = f"UPDATE users SET {', '.join(fields)} WHERE user_id = :u" + session.execute(text(q), params) + session.commit() + return uid + + +def set_user_password(username: str, password: str) -> None: + create_or_update_user(username, password=password) + + +def set_user_admin(username: str, is_admin: bool) -> None: + create_or_update_user(username, is_admin=is_admin) + + +def set_user_active(username: str, is_active: bool) -> None: + create_or_update_user(username, is_active=is_active) + + +def verify_user_credentials(username: str, password: str) -> bool: + """Validate username/password against stored password_hash.""" + with _ensure_session() as session: + row = session.execute(text("SELECT password_hash, is_active FROM users WHERE username = :u"), { + "u": username}).fetchone() + if not row: + return False + ph, active = row[0], bool(row[1]) + if not active or not ph: + return False + ok = check_password_hash(ph, password) + if ok: + # record last_login + try: + session.execute(text("UPDATE users SET last_login = :ts WHERE username = :u"), { + "ts": datetime.now(UTC), "u": username}) + session.commit() + except Exception: + pass + return ok + + +def get_users() -> List[Dict[str, Any]]: + with _ensure_session() as session: + rows = session.execute(text( + "SELECT user_id, username, created_at, is_admin, is_active, last_login, (password_hash IS NOT NULL) AS has_pw FROM users ORDER BY username ASC")).fetchall() + out: List[Dict[str, Any]] = [] + for r in rows: + out.append({ + "user_id": int(r[0]), + "username": r[1], + "created_at": r[2].isoformat() if isinstance(r[2], datetime) else (r[2] or None), + "is_admin": bool(r[3]), + "is_active": bool(r[4]), + "last_login": r[5].isoformat() if r[5] else None, + "has_password": bool(r[6]), + }) + return out + + +def get_user(username: str) -> Optional[Dict[str, Any]]: + """Return single user dict or None.""" + with _ensure_session() as session: + row = session.execute(text( + "SELECT user_id, username, is_admin, is_active, password_hash, last_login, created_at FROM users WHERE username = :u" + ), {"u": username}).fetchone() + if not row: + return None + return { + "user_id": int(row[0]), + "username": row[1], + "is_admin": bool(row[2]), + "is_active": bool(row[3]), + "password_hash": row[4], + "last_login": row[5].isoformat() if row[5] else None, + "created_at": row[6].isoformat() if isinstance(row[6], datetime) else (row[6] or None), + } + + +# ---------------- Regions/Keywords helpers --------------------------------- +def upsert_region(name: str) -> int: + """Get or create a region by name; return region_id.""" + name = (name or "").strip() + if not name: + raise ValueError("Region name cannot be empty") + with _ensure_session() as session: + row = session.execute(text("SELECT region_id FROM regions WHERE name = :n"), { + "n": name}).fetchone() + if row: + return int(row[0]) + session.execute( + text("INSERT INTO regions(name) VALUES (:n)"), {"n": name}) + 
session.commit() + with _ensure_session() as session: + row2 = session.execute(text("SELECT region_id FROM regions WHERE name = :n"), { + "n": name}).fetchone() + if row2: + return int(row2[0]) + # unlikely retry + return upsert_region(name) + + +def upsert_keyword(name: str) -> int: + """Get or create a keyword by name; return keyword_id.""" + name = (name or "").strip() + if not name: + raise ValueError("Keyword name cannot be empty") + with _ensure_session() as session: + row = session.execute(text("SELECT keyword_id FROM keywords WHERE name = :n"), { + "n": name}).fetchone() + if row: + return int(row[0]) + session.execute( + text("INSERT INTO keywords(name) VALUES (:n)"), {"n": name}) + session.commit() + with _ensure_session() as session: + row2 = session.execute(text("SELECT keyword_id FROM keywords WHERE name = :n"), { + "n": name}).fetchone() + if row2: + return int(row2[0]) + return upsert_keyword(name) + + +def set_user_regions(username: str, region_names: List[str]) -> None: + """Replace user's preferred regions with given names.""" + user_id = get_or_create_user(username) + # Normalize and get ids + names = sorted({(n or "").strip() + for n in region_names if (n or "").strip()}) + region_ids: List[int] = [upsert_region(n) for n in names] + if not region_ids and not names: + # Clear all if explicitly empty list + with _ensure_session() as session: + session.execute( + text("DELETE FROM user_regions WHERE user_id = :u"), {"u": user_id}) + session.commit() + return + desired = set(region_ids) + with _ensure_session() as session: + rows = session.execute(text("SELECT region_id FROM user_regions WHERE user_id = :u"), { + "u": user_id}).fetchall() + current = set(int(r[0]) for r in rows) + to_add = desired - current + to_remove = current - desired + for rid in to_remove: + session.execute(text("DELETE FROM user_regions WHERE user_id = :u AND region_id = :r"), { + "u": user_id, "r": int(rid)}) + for rid in to_add: + session.execute(text("INSERT INTO user_regions(user_id, region_id) VALUES(:u, :r)"), { + "u": user_id, "r": int(rid)}) + session.commit() + + +def set_user_keywords(username: str, keyword_names: List[str]) -> None: + """Replace user's preferred keywords with given names.""" + user_id = get_or_create_user(username) + names = sorted({(n or "").strip() + for n in keyword_names if (n or "").strip()}) + keyword_ids: List[int] = [upsert_keyword(n) for n in names] + if not keyword_ids and not names: + with _ensure_session() as session: + session.execute( + text("DELETE FROM user_keywords WHERE user_id = :u"), {"u": user_id}) + session.commit() + return + desired = set(keyword_ids) + with _ensure_session() as session: + rows = session.execute(text("SELECT keyword_id FROM user_keywords WHERE user_id = :u"), { + "u": user_id}).fetchall() + current = set(int(r[0]) for r in rows) + to_add = desired - current + to_remove = current - desired + for kid in to_remove: + session.execute(text("DELETE FROM user_keywords WHERE user_id = :u AND keyword_id = :k"), { + "u": user_id, "k": int(kid)}) + for kid in to_add: + session.execute(text("INSERT INTO user_keywords(user_id, keyword_id) VALUES(:u, :k)"), { + "u": user_id, "k": int(kid)}) + session.commit() + + +def get_user_regions(username: str) -> List[Dict[str, str]]: + """Return preferred region names for a user (empty if none).""" + with _ensure_session() as session: + row = session.execute(text("SELECT user_id FROM users WHERE username = :u"), { + "u": username}).fetchone() + if not row: + return [] + user_id = int(row[0]) + rows = 
session.execute(text( + """ + SELECT r.name, r.color + FROM regions r + INNER JOIN user_regions ur ON ur.region_id = r.region_id + WHERE ur.user_id = :u + ORDER BY r.name ASC + """ + ), {"u": user_id}).fetchall() + return [{"name": r[0], "color": r[1]} for r in rows] + + +def get_user_keywords(username: str) -> List[Dict[str, str]]: + """Return preferred keyword names for a user (empty if none).""" + with _ensure_session() as session: + row = session.execute(text("SELECT user_id FROM users WHERE username = :u"), { + "u": username}).fetchone() + if not row: + return [] + user_id = int(row[0]) + rows = session.execute(text( + """ + SELECT k.name, k.color + FROM keywords k + INNER JOIN user_keywords uk ON uk.keyword_id = k.keyword_id + WHERE uk.user_id = :u + ORDER BY k.name ASC + """ + ), {"u": user_id}).fetchall() + return [{"name": r[0], "color": r[1]} for r in rows] + + +def get_all_regions() -> List[Dict[str, str]]: + """Return all region names from regions table (sorted).""" + with _ensure_session() as session: + rows = session.execute( + text("SELECT name, color FROM regions ORDER BY name ASC")).fetchall() + return [{"name": r[0], "color": r[1]} for r in rows] + + +def get_all_keywords() -> List[Dict[str, str]]: + """Return all keyword names from keywords table (sorted).""" + with _ensure_session() as session: + rows = session.execute( + text("SELECT name, color FROM keywords ORDER BY name ASC")).fetchall() + return [{"name": r[0], "color": r[1]} for r in rows] + + +def seed_regions_keywords_from_listings() -> Dict[str, int]: + """Seed regions/keywords tables from distinct values in job_listings if empty. + + Returns dict with counts inserted: {"regions": n1, "keywords": n2}. + """ + inserted = {"regions": 0, "keywords": 0} + with _ensure_session() as session: + # Regions + existing_regions = session.execute( + text("SELECT COUNT(*) FROM regions")).scalar_one() + if int(existing_regions or 0) == 0: + rows = session.execute(text( + "SELECT DISTINCT region FROM job_listings WHERE region IS NOT NULL AND region != ''")).fetchall() + for r in rows: + name = r[0] + if name: + try: + session.execute( + text("INSERT IGNORE INTO regions(name, color) VALUES(:n, :c)"), {"n": name, "c": get_color_from_string(name)}) + inserted["regions"] += 1 + except Exception: + pass + session.commit() + # Keywords + existing_keywords = session.execute( + text("SELECT COUNT(*) FROM keywords")).scalar_one() + if int(existing_keywords or 0) == 0: + rows = session.execute(text( + "SELECT DISTINCT keyword FROM job_listings WHERE keyword IS NOT NULL AND keyword != ''")).fetchall() + for r in rows: + name = r[0] + if name: + try: + session.execute( + text("INSERT IGNORE INTO keywords(name, color) VALUES(:n, :c)"), {"n": name, "c": get_color_from_string(name)}) + inserted["keywords"] += 1 + except Exception: + pass + session.commit() + return inserted + + +def list_regions_full() -> List[Dict[str, Any]]: + with _ensure_session() as session: + rows = session.execute( + text("SELECT region_id, name, color FROM regions ORDER BY name ASC")).fetchall() + return [{"region_id": int(r[0]), "name": r[1], "color": r[2]} for r in rows] + + +def list_keywords_full() -> List[Dict[str, Any]]: + with _ensure_session() as session: + rows = session.execute( + text("SELECT keyword_id, name, color FROM keywords ORDER BY name ASC")).fetchall() + return [{"keyword_id": int(r[0]), "name": r[1], "color": r[2]} for r in rows] + + +def rename_region(region_id: int, new_name: str) -> bool: + new_name = (new_name or "").strip() + if not 
new_name: + raise ValueError("new_name required") + with _ensure_session() as session: + try: + session.execute(text("UPDATE regions SET name = :n WHERE region_id = :id"), { + "n": new_name, "id": int(region_id)}) + session.commit() + return True + except Exception: + session.rollback() + return False + + +def rename_keyword(keyword_id: int, new_name: str) -> bool: + new_name = (new_name or "").strip() + if not new_name: + raise ValueError("new_name required") + with _ensure_session() as session: + try: + session.execute(text("UPDATE keywords SET name = :n WHERE keyword_id = :id"), { + "n": new_name, "id": int(keyword_id)}) + session.commit() + return True + except Exception: + session.rollback() + return False + + +def change_region_color(region_id: int, new_color: str) -> bool: + new_color = (new_color or "").strip() + if not new_color: + raise ValueError("new_color required") + with _ensure_session() as session: + try: + session.execute(text("UPDATE regions SET color = :c WHERE region_id = :id"), { + "c": new_color, "id": int(region_id)}) + session.commit() + return True + except Exception: + session.rollback() + return False + + +def change_keyword_color(keyword_id: int, new_color: str) -> bool: + new_color = (new_color or "").strip() + if not new_color: + raise ValueError("new_color required") + with _ensure_session() as session: + try: + session.execute(text("UPDATE keywords SET color = :c WHERE keyword_id = :id"), { + "c": new_color, "id": int(keyword_id)}) + session.commit() + return True + except Exception: + session.rollback() + return False diff --git a/web/scraper.py b/web/scraper.py new file mode 100644 index 0000000..f4d89b4 --- /dev/null +++ b/web/scraper.py @@ -0,0 +1,121 @@ +from datetime import datetime, UTC +from bs4 import BeautifulSoup +from typing import List, Dict, Set +from web.utils import get_base_url, cache_page, safe_get_text, safe_get_attr, is_cached, get_cached_content, make_request_with_retry + + +def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List: + """Parse a single job listing.""" + try: + title_elem = listing.find("div", class_="title") + url_elem = listing.find("a") + pay_elem = listing.find("div", class_="attr remuneration") + if pay_elem: + pay_elem = pay_elem.find("span", class_="valu") + location_elem = listing.find("div", class_="location") + + if not title_elem or not url_elem: + return [] + + title = title_elem.get_text(strip=True) + url = url_elem["href"] + pay = pay_elem.get_text(strip=True) if pay_elem else "N/A" + location = location_elem.get_text( + strip=True) if location_elem else "N/A" + + status = "DUPLICATE" if url in seen_urls else "NEW" + if url in seen_urls: + return [] + + # job_summary variable retained for parity but not used + job_summary = f"{status} [{region}/{keyword}] | Title: {title[:50]}{'...' 
if len(title) > 50 else ''} | Location: {location} | URL: {url}" + _ = job_summary + + return [datetime.now(UTC).isoformat(), region, keyword, title, pay, location, url] + except (AttributeError, KeyError): + return [] + + +def scrape_job_page(content: str, url: str) -> Dict: + """Scrape job details from a job listing page.""" + soup = BeautifulSoup(content, "html.parser") + + # Extract each field + title = safe_get_text(soup.find("h1", class_="postingtitle")) + company = safe_get_text(soup.find("h2", class_="company-name")) + + map_elem = soup.find("div", id="map") + if map_elem: + lat = safe_get_attr(map_elem, "data-latitude") + lon = safe_get_attr(map_elem, "data-longitude") + accuracy = safe_get_attr(map_elem, "data-accuracy") + location = f"Lat: {lat}, Lon: {lon}, Accuracy: {accuracy}" + else: + location = "N/A" + + mapaddress = soup.find("div", class_="mapaddress") + if mapaddress: + location = safe_get_text(mapaddress) + " " + location + + description_elem = soup.find("section", id="postingbody") + if description_elem: + de = BeautifulSoup(str(description_elem), "html.parser") + qr_code_elem = de.find(class_="print-qrcode-label") + # Remove QR code if it exists + if qr_code_elem: + qr_code_elem.decompose() + description = de.text.strip() + else: + description = '' + + posting_info = soup.find("div", class_="postinginfos") + if posting_info: + pi = BeautifulSoup(str(posting_info), "html.parser") + postinginfo_tags = pi.find_all("p", class_="postinginfo") + job_id = safe_get_text(postinginfo_tags[0]) if postinginfo_tags else "" + posted_time_elem = pi.find("time", class_="date timeago") + posted_time = safe_get_attr( + posted_time_elem, "datetime") if posted_time_elem else "" + else: + job_id = "" + posted_time = "" + + return { + "url": url, + "title": title, + "company": company, + "location": location, + "description": description, + "id": job_id, + "posted_time": posted_time + } + + +def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]) -> List[List]: + """Parse HTML content to extract job listings.""" + soup = BeautifulSoup(content, "html.parser") + listings = soup.find_all("li", class_="cl-static-search-result") + new_rows = [] + + for i, listing in enumerate(listings): + job_data = scrape_listings_page(listing, region, keyword, seen_urls) + if job_data: + new_rows.append(job_data) + + return new_rows + + +def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]: + """Process a single region and keyword.""" + url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+")) + if is_cached(url): + content = get_cached_content(url) + cache_status = "CACHED" + else: + content = make_request_with_retry(url, 3) + if content is None: + return [] + cache_page(url, content) + cache_status = "FETCHED" + _ = cache_status # no-op to silence unused var + return scrape_job_data(content, region, keyword, seen_urls) diff --git a/web/static/index.js b/web/static/index.js new file mode 100644 index 0000000..2c1d207 --- /dev/null +++ b/web/static/index.js @@ -0,0 +1,102 @@ +// Update the table with job data +function updateTableData(jobs) { + const jobsContainer = document.getElementById("jobs"); + jobsContainer.innerHTML = ""; // Clear existing jobs + jobs.forEach((job) => { + const jobElement = document.createElement("div"); + jobElement.classList.add("job"); + jobElement.innerHTML = ` +

${job.title}

+

${job.posted_time}

+ ${job.region} + ${job.keyword} + `; + jobsContainer.appendChild(jobElement); + }); +} + +// Fetch job data from the server +function fetchJobs() { + fetch("/jobs") + .then((response) => response.json()) + .then((data) => { + updateTableData(data); + }) + .catch((error) => console.error("Error fetching jobs:", error)); +} + +// scrape form submission +function updateScrapeInfo(message, color) { + let scrapingInfo = document.getElementById("scrape-info"); + scrapingInfo.style.display = "inline-block"; // Show the scraping info + scrapingInfo.innerText = message; + scrapingInfo.style.color = color; +} + +function scrape(event) { + event.preventDefault(); // Prevent the default form submission + updateScrapeInfo("Scraping in progress...", "blue"); + fetch("/scrape") + .then((response) => response.json()) + .then((data) => { + if (data.status) { + updateScrapeInfo(data.status, "green"); + } else { + updateScrapeInfo("Scraping failed. Please try again.", "red"); + } + }) + .catch((error) => console.error("Error:", error)); +} + +function updateJobsFiltered() { + const selectedRegion = document.getElementById("region").value; + const selectedKeyword = document.getElementById("keyword").value; + const filterForm = document.getElementById("filter-form"); + const queryString = new URLSearchParams({ + region: selectedRegion, + keyword: selectedKeyword, + }).toString(); + filterForm.action = `/?${queryString}`; + filterForm.submit(); // Submit the form to apply filters +} + +function regionClick(event) { + const region = event.target.innerText; + const regionInput = document.getElementById("region"); + regionInput.value = region; + updateJobsFiltered(); +} + +function keywordClick(event) { + const keyword = event.target.innerText; + const keywordInput = document.getElementById("keyword"); + keywordInput.value = keyword; + updateJobsFiltered(); +} + +document.querySelectorAll(".job-keyword").forEach((element) => { + element.addEventListener("click", keywordClick); +}); +document.querySelectorAll(".job-region").forEach((element) => { + element.addEventListener("click", regionClick); +}); + +document.getElementById("scrape-form").addEventListener("submit", scrape); +document + .getElementById("region") + .addEventListener("change", updateJobsFiltered); +document + .getElementById("keyword") + .addEventListener("change", updateJobsFiltered); +document + .getElementById("filter-form") + .addEventListener("submit", updateJobsFiltered); +document.getElementById("reset-filters").addEventListener("click", () => { + document.getElementById("region").value = ""; + document.getElementById("keyword").value = ""; + updateJobsFiltered(); +}); diff --git a/web/static/settings.js b/web/static/settings.js new file mode 100644 index 0000000..28c87b7 --- /dev/null +++ b/web/static/settings.js @@ -0,0 +1,61 @@ +/* javascript form handling */ +document + .getElementById("user-settings-form") + .addEventListener("submit", function (event) { + event.preventDefault(); // Prevent default form submission + + const form = event.target; + const formData = new FormData(form); + + // Collect selected regions and keywords + const selectedRegions = []; + const selectedKeywords = []; + formData.forEach((value, key) => { + if (key === "region") { + selectedRegions.push(value); + } else if (key === "keyword") { + selectedKeywords.push(value); + } + }); + + // Add new region if provided + const newRegion = formData.get("new-region").trim(); + if (newRegion) { + selectedRegions.push(newRegion); + } + + // Add new keyword if provided + 
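+    // Note: formData.get() returns null when an input is missing, so the
+    // "new-region" / "new-keyword" fields are assumed to always be present in the form.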
const newKeyword = formData.get("new-keyword").trim(); + if (newKeyword) { + selectedKeywords.push(newKeyword); + } + + // Prepare data to send + const dataToSend = { + regions: selectedRegions, + keywords: selectedKeywords, + csrf_token: formData.get("csrf_token"), + }; + + // Send data via Fetch API + fetch(form.action, { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-CSRF-Token": document.querySelector('meta[name="csrf-token"]') + .content, + }, + body: JSON.stringify(dataToSend), + }) + .then((response) => { + if (response.ok) { + window.location.reload(); // Reload to reflect changes + } else { + alert("Error saving preferences."); + } + }) + .catch((error) => { + console.error("Error:", error); + alert("Error saving preferences."); + }); + }); diff --git a/web/static/styles.css b/web/static/styles.css new file mode 100644 index 0000000..c27e32a --- /dev/null +++ b/web/static/styles.css @@ -0,0 +1,144 @@ +body { + font-family: Arial, sans-serif; + margin: 10px; + font-size: 16px; +} +h1 { + color: #333; + font-size: 1.2em; +} +a { + text-decoration: none; +} +a:hover { + text-decoration: underline; +} +footer { + margin-top: 20px; + text-align: center; + font-size: 0.9em; + color: #666; +} +nav { + margin-bottom: 10px; +} + +#filters { + display: block; + margin-bottom: 1rem; +} +#filters #filter-form { + display: inline-block; + max-width: 500px; +} +#filters #scrape-form { + display: inline-block; + margin-left: 1rem; +} +#filters #scrape-form span#scrape-info { + display: none; + color: blue; + font-size: 0.9em; +} +#jobs { + margin: 0; + padding: 0; + display: grid; + grid-template-columns: repeat(auto-fill, minmax(360px, 1fr)); + gap: 1rem; +} +.job { + border: 1px solid #ccc; + padding: 1rem; + border-radius: 5px; + background-color: #f9f9f9; +} +.job a { + display: inline-block; +} +.job h3 { + margin: 0 0 0.25rem 0; + font-size: 1.1em; +} + +.job-posted-time { + font-weight: normal; + font-size: 0.8em; + color: #666; + margin: 0.25rem 0; +} + +.job-region, +.job-keyword { + border: 1px solid #ccc; + border-radius: 0.8rem; + padding: 0.2rem 0.4rem; + display: inline; + margin-right: 0.5rem; + background-color: rgb(255, 255, 255); +} + +#job-details { + max-width: 100%; + margin: auto; +} + +.job-description { + margin-top: 5px; + color: #333; + margin: 0; + padding: 0; + line-height: 1.25; + font-size: 14px; +} +.job-description br { + margin: -5px 0; +} + +.job-title { + font-weight: bold; + color: #333; + text-decoration: underline; + font-size: 16px; +} + +/* Taxonomy Management */ +#regions-table, +#keywords-table { + margin-top: 20px; +} +#regions-table table, +#keywords-table table { + max-width: 100%; + border-collapse: collapse; +} +#regions-table th, +#regions-table td, +#keywords-table th, +#keywords-table td { + border: 1px solid #ccc; + padding: 8px; + text-align: left; +} +#regions-table th, +#keywords-table th { + background-color: #f9f9f9; +} + +/* Admin User Management */ +#users { + margin-top: 20px; +} +#users table { + max-width: 100%; + border-collapse: collapse; +} +#users th, +#users td { + border: 1px solid #ccc; + padding: 8px; + text-align: left; +} +#users th { + background-color: #f9f9f9; +} diff --git a/web/static/taxonomy.js b/web/static/taxonomy.js new file mode 100644 index 0000000..51b6b16 --- /dev/null +++ b/web/static/taxonomy.js @@ -0,0 +1,41 @@ +function updateColor(id, type, newColor) { + fetch("/admin/taxonomy", { + method: "POST", + headers: { + "Content-Type": "application/json", + "X-CSRF-Token": 
document.querySelector('meta[name="csrf-token"]').content, + }, + body: JSON.stringify({ + action: + type === "region" ? "change_region_color" : "change_keyword_color", + [type + "_id"]: id, + [type + "_color"]: newColor, + }), + }).then((response) => { + if (response.ok) { + location.reload(); + } else { + alert("Failed to update " + type + " color"); + } + }); +} + +document + .getElementById("region-color-form") + .addEventListener("submit", function (event) { + event.preventDefault(); + const regionId = this.querySelector('input[name="region_id"]').value; + const newColor = this.querySelector('input[name="new_region_color"]').value; + updateColor(regionId, "region", newColor); + }); + +document + .getElementById("keyword-color-form") + .addEventListener("submit", function (event) { + event.preventDefault(); + const keywordId = this.querySelector('input[name="keyword_id"]').value; + const newColor = this.querySelector( + 'input[name="new_keyword_color"]' + ).value; + updateColor(keywordId, "keyword", newColor); + }); diff --git a/web/templates/admin/login.html b/web/templates/admin/login.html new file mode 100644 index 0000000..2f88366 --- /dev/null +++ b/web/templates/admin/login.html @@ -0,0 +1,9 @@ +{% extends 'base.html' %} {% block content %} +

Login

+
+ + + + +
+{% endblock %} diff --git a/web/templates/admin/taxonomy.html b/web/templates/admin/taxonomy.html new file mode 100644 index 0000000..c38b82b --- /dev/null +++ b/web/templates/admin/taxonomy.html @@ -0,0 +1,142 @@ +{% extends 'base.html' %} {% block content %} +

Taxonomy

+
+

Regions

+
+ + + + + + +
+
+ + + + + + + + + + + {% for r in regions %} + + + + + + + + {% endfor %} + +
<th>ID</th> <th>Name</th> <th>Rename</th> <th>Color</th>
{{ r.region_id }}{{ r.name }} +
+ + + + + +
+
+
+ + + + + +
+
+
+
+
+

Keywords

+
+ + + + + + +
+
+ + + + + + + + + + + {% for k in keywords %} + + + + + + + {% endfor %} + +
<th>ID</th> <th>Name</th> <th>Rename</th> <th>Color</th>
{{ k.keyword_id }}{{ k.name }} +
+ + + + + +
+
+
+ + + + + +
+
+
+
+{% endblock %} {% block footer_scripts %} + + +{% endblock %} diff --git a/web/templates/admin/users.html b/web/templates/admin/users.html new file mode 100644 index 0000000..6696cdd --- /dev/null +++ b/web/templates/admin/users.html @@ -0,0 +1,139 @@ +{% extends 'base.html' %} {% block content %} +
+

Users

+
+ + + + + + + + + + + + + + + + {% for u in users %} + + + + + + + + + + + + {% endfor %} + +
<th>ID</th> <th>Username</th> <th>Admin</th> <th>Active</th> <th>Password</th> <th>Created</th> <th>Last Login</th>
+ {{ u.user_id }} + + + + + + + {{ '✅' if u.has_password else '❌' }}{{ u.created_at }}{{ u.last_login or 'never' }} + +
+
+
+

Create / Update User

+
+ + + + + + +
+{% endblock %} {% block footer_scripts %} + +{% endblock %} diff --git a/web/templates/base.html b/web/templates/base.html new file mode 100644 index 0000000..23d2fb5 --- /dev/null +++ b/web/templates/base.html @@ -0,0 +1,43 @@ + + + + + + {{ title }} + + + {% block styles %}{% endblock %} {% block scripts %}{% endblock %} + + + {% block header %} +
+

{{ title or 'Admin' }}

+ + {% with messages = get_flashed_messages() %} {% if messages %} +
    + {% for m in messages %} +
  • {{ m }}
  • + {% endfor %} +
+ {% endif %} {% endwith %} +
+ {% endblock %} {% block content %}{% endblock %} +
+

© 2025 Job Listings

+
+ {% block footer_scripts %}{% endblock %} + + diff --git a/web/templates/index.html b/web/templates/index.html new file mode 100644 index 0000000..feac3ec --- /dev/null +++ b/web/templates/index.html @@ -0,0 +1,55 @@ +{% extends "base.html" %} {% block styles %} + +{% endblock %} +{% block title %}Job Listings{% endblock %} +{% block content %} +
+
+ + + + + + +
+
+ + +
+
+
+ {% for job in jobs %} +
+

{{ job['title'] }}

+

{{ job['posted_time'] }}

+ {{ job['region'] }} + {{ job['keyword'] }} +
+ {% endfor %} +
+{% endblock %} +{% block footer_scripts %} + +{% endblock %} \ No newline at end of file diff --git a/web/templates/job.html b/web/templates/job.html new file mode 100644 index 0000000..85948df --- /dev/null +++ b/web/templates/job.html @@ -0,0 +1,27 @@ +{% extends "base.html" %} {% block title %}Job Details{% endblock %} {% block +styles %}{% endblock %} {% block content %} +
+

ID: {{ job.id }}

+

+ Title: {{ job.title }} | Company: {{ + job.company }} | Location: {{ job.location }} +

+

+ Salary: {{ job.salary }} | Posted on: {{ + job.posted_date }} +

+ +

Job Description

+
+

{{ job.description|safe }}

+
+

+ Original URL: +

+

+ {{ job.title }} +

+
+{% endblock %} diff --git a/web/templates/user/settings.html b/web/templates/user/settings.html new file mode 100644 index 0000000..01f1758 --- /dev/null +++ b/web/templates/user/settings.html @@ -0,0 +1,84 @@ +{% extends 'base.html' %} {% block title %}Your Preferences{% endblock %} {% +block content %} +

Your Preferences

+
+ +
+ Regions +

+ Add new Region: + +

+ {% if all_regions %} {% for r in all_regions %} + + {% endfor %} {% else %} +

No regions available. Ask an admin to add some.

+ {% endif %} +
+
+ Keywords +

+ Add new Keyword: + +

+ {% if all_keywords %} {% for k in all_keywords %} + + {% endfor %} {% else %} +

No keywords available. Ask an admin to add some.

+ {% endif %} +
+ +
+{% endblock %} {% block footer_scripts %} + +{% endblock %} diff --git a/web/utils.py b/web/utils.py new file mode 100644 index 0000000..1371d64 --- /dev/null +++ b/web/utils.py @@ -0,0 +1,336 @@ +""" +Utility functions for the Craigslist scraper. +""" + +from typing import Any, Optional as _Optional +from datetime import datetime, UTC +import json +import os +import random +import re +import requests +import time +from typing import Optional, List, Dict + + +def get_config_file() -> str: + """Return the path to the main config file.""" + return os.path.abspath(os.path.join( + os.path.dirname(__file__), '..', 'config', 'settings.json')) + + +def get_config() -> dict: + """Return the loaded configuration dict.""" + CONFIG = {} + try: + with open(get_config_file(), 'r', encoding='utf-8') as _f: + CONFIG = json.load(_f) + except Exception: + CONFIG = {} + return CONFIG + + +def get_users_from_settings() -> List[Dict]: + """Return user entries from settings.json (array of dicts).""" + users = get_config().get('users', []) + if not isinstance(users, list): + return [] + out: List[Dict] = [] + for u in users: + if not isinstance(u, dict): + continue + username = (u.get('username') or '').strip() + if not username: + continue + out.append({ + 'username': username, + 'is_admin': bool(u.get('is_admin', False)), + 'password': u.get('password') or '' + }) + return out + + +def initialize_users_from_settings() -> int: + """Ensure users from settings.json exist in DB; set admin/active and passwords. + + Returns number of users processed. + """ + from web.db import create_or_update_user # local import to avoid cycles + users = get_users_from_settings() + count = 0 + for u in users: + pw = u.get('password') or None + create_or_update_user(u['username'], password=pw, is_admin=bool( + u.get('is_admin', False)), is_active=True) + count += 1 + return count + + +def verify_credentials(username: str, password: str) -> bool: + """Proxy to db.verify_user_credentials""" + from web.db import verify_user_credentials + return verify_user_credentials(username, password) + + +# --- Database configuration helpers --- + +def get_mysql_config() -> dict: + """Return MySQL/MariaDB connection settings.""" + db = get_config().get('database', {}).get('mysql', {}) + return { + 'host': db.get('host', '127.0.0.1'), + 'user': db.get('user', 'root'), + 'password': db.get('password', ''), + 'database': db.get('database', 'jobs'), + 'port': db.get('port', 3306), + } + + +def get_http_setting(key: str, default=None): + return get_config().get('http', {}).get(key, default) + + +def get_paths() -> dict: + return get_config().get('paths', {}) + + +def get_cache_dir() -> str: + return get_paths().get('cache_dir', 'cache') + + +def get_logs_dir() -> str: + return get_paths().get('logs_dir', 'logs') + + +def get_user_agent() -> str: + return get_http_setting('user_agent') + + +def get_request_timeout() -> int: + return get_http_setting('request_timeout') + + +def get_max_retries() -> int: + return get_http_setting('max_retries') + + +def get_backoff_factor() -> int: + return get_http_setting('backoff_factor') + + +def get_min_delay() -> int: + return get_http_setting('min_delay') + + +def get_max_delay() -> int: + return get_http_setting('max_delay') + + +def get_base_url() -> str: + return get_config().get('scraper', {}).get('base_url', "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel") + + +def ensure_cache_dir(): + """Ensure cache directory exists.""" + os.makedirs(get_cache_dir(), exist_ok=True) + + +def now_iso() 
-> str: + """Get the current time in ISO format.""" + return datetime.now(UTC).isoformat() + + +def get_filename_from_url(url: str) -> str: + """Convert URL to a safe filename.""" + return url.replace("https://", "").replace("/", "_").replace("?", "_").replace("&", "_") + + +def url_to_job_id(url: str) -> int: + """Extract the job id from a Craigslist URL (last path segment without .html).""" + last = url.rstrip("/").split("/")[-1].replace(".html", "") + if last.isdigit(): + return int(last) + return 0 + + +def normalize_job_id(raw_id: Optional[str], url: Optional[str]) -> Optional[int]: + """Normalize job id coming from details page (e.g., 'post id: 1234567890'). + Fallback to URL-derived id when needed. + """ + if raw_id: + m = re.search(r"(\d{5,})", raw_id) + if m: + return int(m.group(1)) + if url: + return url_to_job_id(url) + return None + + +def get_url_from_filename(name: str) -> str: + """Generate a URL guess based on the name.""" + # Best-effort URL guess from filename convention (underscores to slashes) + base = os.path.splitext(name)[0] + url_guess = f"https://{base.replace('_', '/')}" + return url_guess + + +def get_cached_content(url: str) -> str: + """Get cached content for URL.""" + with open(get_cache_path(url), "r", encoding="utf-8") as f: + return f.read() + + +def safe_get_text(element, default="N/A"): + """Safely extract text from BeautifulSoup element.""" + return element.get_text(strip=True) if element else default + + +def safe_get_attr(element, attr, default="N/A"): + """Safely extract attribute from BeautifulSoup element.""" + return element.get(attr, default) if element else default + + +def get_random_delay(min_delay: int = get_min_delay(), max_delay: int = get_max_delay()) -> float: + """Get a random delay between min_delay and max_delay seconds.""" + return random.uniform(min_delay, max_delay) + + +def get_cache_path(url: str) -> str: + """Get cache file path for URL.""" + return os.path.join(get_cache_dir(), f"{get_filename_from_url(url)}.html") + + +def cache_page(url: str, content: str): + """Cache the page content with a timestamp.""" + cache_path = get_cache_path(url) + os.makedirs(os.path.dirname(cache_path), exist_ok=True) + with open(cache_path, "w", encoding="utf-8") as f: + f.write(content) + # Update the file's modification time to the current time + os.utime(cache_path, None) + + +def is_cached(url: str) -> bool: + """Check if the page is cached and not older than 24 hours.""" + cache_path = get_cache_path(url) + if not os.path.isfile(cache_path): + return False + + # Check the file's age if it's a search result page + if 'search' in url: + file_age = time.time() - os.path.getmtime(cache_path) + if file_age > 24 * 3600: # 24 hours in seconds + return False + + return True + + +def is_cache_stale(last_modified: str, days: int = 1) -> bool: + """Check if the cached page is stale (older than 24 hours).""" + if not last_modified: + return True + last_datetime = datetime.fromisoformat(last_modified) + file_age = time.time() - last_datetime.timestamp() + return file_age > days * 24 * 3600 # days in seconds + + +def delete_cached_page(url: str): + cache_fp = get_cache_path(url) + if os.path.exists(cache_fp): + try: + os.remove(cache_fp) + except Exception: + pass + + +def get_color_from_string(s: str) -> str: + """Generate a color code from a string.""" + hash_code = hash(s) + # Ensure the hash code is positive + hash_code = hash_code if hash_code >= 0 else -hash_code + # Extract RGB components + r, g, b = (hash_code & 0xFF0000) >> 16, (hash_code & + 
0x00FF00) >> 8, hash_code & 0x0000FF + # ensure RGB components are within 128-255 + r = max(128, min(255, r)) + g = max(128, min(255, g)) + b = max(128, min(255, b)) + # Combine RGB components back into a single integer + calculated = (r << 16) | (g << 8) | b + return f"#{calculated:06X}" + + +# ---- App helpers moved from app.py for reuse and readability ------------- + + +def filter_jobs( + jobs: List[Dict[str, Any]], + region: _Optional[str] = None, + keyword: _Optional[str] = None, +) -> List[Dict[str, Any]]: + """Filter jobs by optional region and keyword.""" + filtered = jobs + if region: + filtered = [j for j in filtered if j.get("region") == region] + if keyword: + filtered = [j for j in filtered if j.get("keyword") == keyword] + return filtered + + +def get_job_by_id(job_id): + """Fetch job details by job ID from the database.""" + try: + from web.db import get_all_jobs # lazy import to avoid cycles + for j in get_all_jobs(): + if str(j.get("id")) == str(job_id) or str(j.get("job_id")) == str(job_id): + return j + except Exception: + pass + return {} + + +def make_request_with_retry(url: str, max_retries: int = get_max_retries()) -> Optional[str]: + """Make HTTP request with retry logic and proper error handling.""" + # initial delay + delay = get_random_delay() + + headers = {'User-Agent': get_user_agent()} + + for attempt in range(max_retries): + try: + delay = get_random_delay() * (get_backoff_factor() ** attempt) + if attempt > 0: + time.sleep(delay) + + resp = requests.get(url, headers=headers, + timeout=get_request_timeout()) + + if resp.status_code == 403: + return None + elif resp.status_code == 429: + time.sleep(delay * 3) # Longer delay for rate limiting + continue + elif resp.status_code == 404: + return None + elif resp.status_code == 410: + return None + elif resp.status_code >= 400: + if attempt == max_retries - 1: + return None + continue + + resp.raise_for_status() + return resp.text + + except requests.exceptions.Timeout: + pass + except requests.exceptions.ConnectionError: + pass + except requests.exceptions.RequestException: + pass + + if attempt < max_retries - 1: + delay = get_random_delay() * (get_backoff_factor() ** attempt) + time.sleep(delay) + + return None
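+
+
+# Minimal illustrative sketch (not wired into the app): fetch and cache one search
+# page using the helpers above. The region/keyword values here are examples only;
+# real runs take them from config/regions.txt and config/keywords.txt.
+if __name__ == "__main__":
+    example_url = get_base_url().format(region="losangeles", keyword="handyman")
+    if is_cached(example_url):
+        print("already cached:", get_cache_path(example_url))
+    else:
+        html = make_request_with_retry(example_url)
+        if html is not None:
+            ensure_cache_dir()
+            cache_page(example_url, html)
+            print("fetched and cached:", get_cache_path(example_url))
+        else:
+            print("request failed:", example_url)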