initial project commit
This commit is contained in:
5
.vscode/settings.json
vendored
Normal file
5
.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
{
|
||||||
|
"python.testing.pytestArgs": ["tests"],
|
||||||
|
"python.testing.unittestEnabled": false,
|
||||||
|
"python.testing.pytestEnabled": true
|
||||||
|
}
|
||||||
82
analytics.py
Normal file
82
analytics.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from sqlalchemy import create_engine, text
|
||||||
|
from web.utils import get_mysql_config
|
||||||
|
|
||||||
|
|
||||||
|
def get_engine():
    """Build a SQLAlchemy engine for the configured MySQL database."""
    cfg = get_mysql_config()
    url = (
        f"mysql+pymysql://{cfg['user']}:{cfg['password']}"
        f"@{cfg['host']}:{cfg['port']}/{cfg['database']}?charset=utf8mb4"
    )
    return create_engine(url, future=True)
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_jobs():
    """Fetch every job posting (listing + description + cache metadata).

    Joins job_listings to job_descriptions on both job_id and url, and
    left-joins cached_pages so cache info is optional. Results are ordered
    by posted_time, newest first.

    Returns:
        list[dict]: one dict per posting, keyed by the column names below.
    """
    # Column names in the same order as the SELECT list. Keeping the
    # mapping in one place replaces the fragile, duplicated
    # row[0]..row[13] positional indexing of the original.
    columns = [
        "job_id", "title", "description", "region", "keyword", "company",
        "location", "timestamp", "posted_time", "url", "file_path",
        "last_modified", "url_guess", "url_guess_stale",
    ]
    query = """
        SELECT l.job_id
              ,l.title
              ,d.description
              ,l.region
              ,l.keyword
              ,d.company
              ,l.location
              ,l.timestamp
              ,d.posted_time
              ,l.url
              ,c.file_path
              ,c.last_modified
              ,c.url_guess
              ,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale
        FROM job_listings AS l
        INNER JOIN job_descriptions AS d
            ON l.job_id = d.job_id
            AND l.url = d.url
        LEFT JOIN cached_pages AS c ON l.job_id = c.job_id
        ORDER BY d.posted_time DESC
    """
    engine = get_engine()
    with engine.begin() as conn:
        rows = conn.execute(text(query)).fetchall()
    return [dict(zip(columns, row)) for row in rows]
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Load all job postings into a DataFrame and print summary stats."""
    jobs_df = pd.DataFrame(get_all_jobs())

    print(jobs_df.head())
    print(f"Total postings: {len(jobs_df)}")

    print("Regions:")
    print(jobs_df['region'].value_counts())

    print("Keywords:")
    print(jobs_df['keyword'].value_counts())


if __name__ == "__main__":
    main()
|
||||||
7
config/keywords.txt
Normal file
7
config/keywords.txt
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
handyman
|
||||||
|
damage mitigation
|
||||||
|
restoration
|
||||||
|
house manager
|
||||||
|
house sitter
|
||||||
|
property manager
|
||||||
|
live-in
|
||||||
6
config/regions.txt
Normal file
6
config/regions.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
losangeles
|
||||||
|
sandiego
|
||||||
|
orangecounty
|
||||||
|
inlandempire
|
||||||
|
bakersfield
|
||||||
|
ventura
|
||||||
32
config/settings.json
Normal file
32
config/settings.json
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
{
|
||||||
|
"database": {
|
||||||
|
"mysql": {
|
||||||
|
"host": "192.168.88.37",
|
||||||
|
"user": "jobs",
|
||||||
|
"password": "jobdb",
|
||||||
|
"database": "jobs",
|
||||||
|
"port": 3306
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"http": {
|
||||||
|
"user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:141.0) Gecko/20100101 Firefox/141.0",
|
||||||
|
"request_timeout": 30,
|
||||||
|
"max_retries": 3,
|
||||||
|
"backoff_factor": 2,
|
||||||
|
"min_delay": 1,
|
||||||
|
"max_delay": 5
|
||||||
|
},
|
||||||
|
"paths": {
|
||||||
|
"cache_dir": "cache",
|
||||||
|
"logs_dir": "logs"
|
||||||
|
},
|
||||||
|
"scraper": {
|
||||||
|
"base_url": "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel",
|
||||||
|
"config_dir": "config"
|
||||||
|
},
|
||||||
|
"users": [
|
||||||
|
{ "username": "anonymous", "is_admin": false, "password": "" },
|
||||||
|
{ "username": "admin", "is_admin": true, "password": "M11ffpgm." },
|
||||||
|
{ "username": "bobby", "is_admin": false, "password": "" }
|
||||||
|
]
|
||||||
|
}
|
||||||
11
main.py
Normal file
11
main.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
"""
|
||||||
|
main entry point for the Craigslist scraper
|
||||||
|
starts webserver
|
||||||
|
"""
|
||||||
|
|
||||||
|
import web.app as app
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# start web server
|
||||||
|
app.main()
|
||||||
213
online.md
Normal file
213
online.md
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
# Guide to get this online
|
||||||
|
|
||||||
|
Awesome—here’s a concise, battle-tested set of steps to get your Flask app running behind Traefik on Proxmox, with the code living on your NAS via NFS and exposed as `jobs.allucanget.biz`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# 0) DNS & inbound ports
|
||||||
|
|
||||||
|
1. In Cloudflare, add an **A** record
|
||||||
|
|
||||||
|
- **Name:** `jobs`
|
||||||
|
- **IPv4:** `5.226.148.100`
|
||||||
|
- Decide one of these:
|
||||||
|
- **Option A (simpler):** set to **DNS only** (grey cloud). Traefik will get Let’s Encrypt certs itself.
|
||||||
|
- **Option B:** keep **Proxied** (orange cloud) and install a **Cloudflare Origin Certificate** on Traefik (Full/Strict).
|
||||||
|
|
||||||
|
2. Ensure your router/NAT forwards **80/tcp** and **443/tcp** from WAN to your Traefik box at `192.168.88.10`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# 1) Mount the NAS on the Proxmox host (then bind into the container)
|
||||||
|
|
||||||
|
> This avoids NFS-in-container headaches and makes the directory accessible outside the LXC too.
|
||||||
|
|
||||||
|
On the **Proxmox host** (not inside a container):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create a mount point for the NAS share
|
||||||
|
sudo mkdir -p /mnt/nas
|
||||||
|
|
||||||
|
# Install NFS client (Debian-based Proxmox)
|
||||||
|
sudo apt update && sudo apt install -y nfs-common
|
||||||
|
|
||||||
|
# (Optional) test mount once
|
||||||
|
sudo mount -t nfs 192.168.88.9:/mnt/HD/HD_a2/NASNFS /mnt/nas
|
||||||
|
|
||||||
|
# Make it permanent in /etc/fstab (add this line):
|
||||||
|
echo '192.168.88.9:/mnt/HD/HD_a2/NASNFS /mnt/nas nfs defaults,_netdev 0 0' | sudo tee -a /etc/fstab
|
||||||
|
|
||||||
|
# Re-mount from fstab to verify
|
||||||
|
sudo umount /mnt/nas || true
|
||||||
|
sudo mount -a
|
||||||
|
|
||||||
|
# Create your app directory on the NAS that’s visible outside the container too
|
||||||
|
sudo mkdir -p /mnt/nas/jobs-app
|
||||||
|
sudo chown -R 1000:1000 /mnt/nas/jobs-app # or set to whatever UID/GID you want
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# 2) Create the LXC on Proxmox
|
||||||
|
|
||||||
|
1. Download a Debian 12 template (UI: **Datacenter → local → CT Templates → Debian 12**)
|
||||||
|
|
||||||
|
2. Create an **unprivileged** LXC (example CTID **201**), with:
|
||||||
|
|
||||||
|
- CPU/RAM as needed (e.g., 2 vCPU / 2–4 GB RAM)
|
||||||
|
- Root disk: 8–16 GB is fine
|
||||||
|
- Network: static IP, e.g. `192.168.88.20/24`, GW `192.168.88.1`
|
||||||
|
- Features: you don’t need NFS inside; we’ll bind-mount from host.
|
||||||
|
|
||||||
|
3. Bind-mount the NAS app dir into the container:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# From the Proxmox host:
|
||||||
|
pct set 201 -mp0 /mnt/nas/jobs-app,mp=/srv/app
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Start the container:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pct start 201
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# 3) Prepare Python + Gunicorn inside the LXC
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Inside CT 201 (ssh or pct enter 201)
|
||||||
|
apt update && apt install -y python3-venv python3-pip build-essential
|
||||||
|
|
||||||
|
# Your code directory is the NAS mount:
|
||||||
|
cd /srv/app
|
||||||
|
python3 -m venv .venv
|
||||||
|
. .venv/bin/activate
|
||||||
|
pip install --upgrade pip
|
||||||
|
pip install flask gunicorn
|
||||||
|
```
|
||||||
|
|
||||||
|
Minimal Flask app structure (in `/srv/app`):
|
||||||
|
|
||||||
|
```
|
||||||
|
/srv/app
|
||||||
|
├─ app.py
|
||||||
|
└─ wsgi.py
|
||||||
|
```
|
||||||
|
|
||||||
|
`app.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from flask import Flask
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
@app.get("/")
|
||||||
|
def hello():
|
||||||
|
return "Hello from jobs.allucanget.biz!"
|
||||||
|
```
|
||||||
|
|
||||||
|
`wsgi.py`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from app import app
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
app.run()
|
||||||
|
```
|
||||||
|
|
||||||
|
Test locally (inside CT):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
. .venv/bin/activate
|
||||||
|
gunicorn -b 0.0.0.0:8000 wsgi:app
|
||||||
|
# Visit http://192.168.88.20:8000 from LAN to confirm
|
||||||
|
```
|
||||||
|
|
||||||
|
Create a systemd service so it starts on boot:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cat >/etc/systemd/system/jobs.service <<'EOF'
|
||||||
|
[Unit]
|
||||||
|
Description=Jobs Flask app (gunicorn)
|
||||||
|
After=network.target
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
User=root
|
||||||
|
WorkingDirectory=/srv/app
|
||||||
|
Environment="PATH=/srv/app/.venv/bin"
|
||||||
|
ExecStart=/srv/app/.venv/bin/gunicorn -b 0.0.0.0:8000 wsgi:app
|
||||||
|
Restart=always
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
||||||
|
EOF
|
||||||
|
|
||||||
|
systemctl daemon-reload
|
||||||
|
systemctl enable --now jobs.service
|
||||||
|
systemctl status jobs.service --no-pager
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# 4) Wire Traefik to the LXC service
|
||||||
|
|
||||||
|
Assumptions:
|
||||||
|
|
||||||
|
- Traefik runs at `192.168.88.10`
|
||||||
|
- You have a **file provider** mounted, e.g. `/etc/traefik/dynamic/`
|
||||||
|
|
||||||
|
Create a dynamic config file on the Traefik container host (or inside the Traefik container if that’s where the file provider lives), e.g. `/etc/traefik/dynamic/jobs.yml`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
http:
|
||||||
|
routers:
|
||||||
|
jobs-router:
|
||||||
|
rule: "Host(`jobs.allucanget.biz`)"
|
||||||
|
entryPoints:
|
||||||
|
- websecure
|
||||||
|
service: jobs-svc
|
||||||
|
tls:
|
||||||
|
# Option A: Let’s Encrypt (Traefik must be set up with a certResolver in static config)
|
||||||
|
certResolver: letsencrypt
|
||||||
|
# Option B (Cloudflare proxied + origin cert): omit certResolver and make sure Traefik serves the CF origin cert
|
||||||
|
|
||||||
|
services:
|
||||||
|
jobs-svc:
|
||||||
|
loadBalancer:
|
||||||
|
servers:
|
||||||
|
- url: "http://192.168.88.20:8000"
|
||||||
|
```
|
||||||
|
|
||||||
|
Reload Traefik (or it auto-watches the file). Make sure Traefik’s **static** config has:
|
||||||
|
|
||||||
|
- `entryPoints.websecure.address=:443`
|
||||||
|
- (If using Let’s Encrypt) a `certResolver` (HTTP-01 or DNS-01) configured.
|
||||||
|
|
||||||
|
- With **Cloudflare “DNS only”**, HTTP-01 is easiest.
|
||||||
|
- With **Cloudflare proxied**, use a Cloudflare **Origin Certificate** on Traefik (Full/Strict) or use DNS-01.
|
||||||
|
|
||||||
|
Optional (X-Forwarded-For): behind Cloudflare, trust CF IP ranges in Traefik so client IPs are correct.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# 5) Quick checks
|
||||||
|
|
||||||
|
- `curl -I http://192.168.88.20:8000` from Traefik host should return `200`.
|
||||||
|
- `curl -I https://jobs.allucanget.biz` from outside should return `200` and a valid cert.
|
||||||
|
- Confirm Cloudflare SSL mode:
|
||||||
|
|
||||||
|
- **If DNS only:** Traefik should have a Let’s Encrypt cert.
|
||||||
|
- **If proxied:** Traefik should present the **CF Origin Cert**, and Cloudflare set to **Full (strict)**.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
# 6) (Nice to have) Security & hardening
|
||||||
|
|
||||||
|
- Create a non-root user to run the service; adjust `User=` and file permissions.
|
||||||
|
- Set a firewall rule so port **8000** on the LXC is only reachable from `192.168.88.10` (Traefik) and your admin IPs.
|
||||||
|
- Keep `/srv/app` on NAS backed up (snapshot on the NAS).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
That’s it. If you want, tell me which option you picked for Cloudflare (DNS only vs proxied), and I’ll give you the exact Traefik static TLS snippet to match.
|
||||||
5
pytest.ini
Normal file
5
pytest.ini
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
[pytest]
|
||||||
|
minversion = 7.0
|
||||||
|
addopts = -q
|
||||||
|
testpaths = tests
|
||||||
|
python_files = test_*.py
|
||||||
9
requirements.txt
Normal file
9
requirements.txt
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
beautifulsoup4
|
||||||
|
flask
|
||||||
|
flask-wtf
|
||||||
|
pandas
|
||||||
|
pytest
|
||||||
|
requests
|
||||||
|
sqlalchemy
|
||||||
|
pymysql
|
||||||
|
gunicorn
|
||||||
53
setup.py
Normal file
53
setup.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
#!/usr/bin/env python
"""
MySQL utilities for Craigslist project.

Usage (PowerShell):
    # Ensure MySQL database and tables
    python setup.py mysql-init

    # Show row counts (MySQL only)
    python setup.py counts
"""
import sys

from sqlalchemy import create_engine, text

from web.utils import get_mysql_config


cmd = sys.argv[1] if len(sys.argv) > 1 else "help"

if cmd == "mysql-init":
    cfg = get_mysql_config()
    # Connect to the server without selecting a database so it can be created.
    server_url = (
        f"mysql+pymysql://{cfg['user']}:{cfg['password']}"
        f"@{cfg['host']}:{cfg['port']}/"
    )
    dbname = cfg["database"]
    server_engine = create_engine(server_url, future=True)
    with server_engine.begin() as conn:
        conn.execute(text(
            f"CREATE DATABASE IF NOT EXISTS `{dbname}` "
            "CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"))
    db_url = (
        f"mysql+pymysql://{cfg['user']}:{cfg['password']}"
        f"@{cfg['host']}:{cfg['port']}/{dbname}?charset=utf8mb4"
    )
    db_engine = create_engine(db_url, future=True)
    # Imported here so a plain `python setup.py help` never touches web.db.
    from web.db import Base
    Base.metadata.create_all(db_engine)
    print("MySQL database and tables ensured")
elif cmd == "counts":
    cfg = get_mysql_config()
    url = (
        f"mysql+pymysql://{cfg['user']}:{cfg['password']}"
        f"@{cfg['host']}:{cfg['port']}/{cfg['database']}?charset=utf8mb4"
    )
    engine = create_engine(url, future=True)
    tables = (
        "users",
        "regions",
        "keywords",
        "user_regions",
        "user_keywords",
        "job_listings",
        "job_descriptions",
        "cached_pages",
        "user_interactions",
    )
    with engine.begin() as conn:
        for table in tables:
            try:
                result = conn.execute(text(f"SELECT COUNT(*) FROM {table}"))
                print(f"{table}: {list(result)[0][0]}")
            except Exception as e:
                # Best-effort report: a missing table shouldn't stop the rest.
                print(f"{table}: error {e}")
else:
    print(__doc__)
|
||||||
7
tests/conftest.py
Normal file
7
tests/conftest.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
import sys
from pathlib import Path

# Tests import the `web` package, which lives at the project root; make
# sure that directory is importable no matter where pytest was invoked.
root = Path(__file__).resolve().parent.parent
root_str = str(root)
if root_str not in sys.path:
    sys.path.insert(0, root_str)
|
||||||
152
tests/test_db_integration.py
Normal file
152
tests/test_db_integration.py
Normal file
@@ -0,0 +1,152 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
import web.db as db
|
||||||
|
from web.utils import now_iso
|
||||||
|
|
||||||
|
|
||||||
|
# Skip this entire test module unless explicitly enabled and DB is reachable
|
||||||
|
if not os.getenv("RUN_DB_TESTS"):
|
||||||
|
pytest.skip("Set RUN_DB_TESTS=1 to run MySQL integration tests",
|
||||||
|
allow_module_level=True)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(scope="module")
|
||||||
|
def db_ready():
|
||||||
|
try:
|
||||||
|
db.db_init()
|
||||||
|
return True
|
||||||
|
except Exception as e:
|
||||||
|
pytest.skip(f"MySQL DB not reachable or init failed: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def unique_suffix() -> str:
|
||||||
|
# Time-based unique suffix for test records
|
||||||
|
return datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S%f")
|
||||||
|
|
||||||
|
|
||||||
|
def test_user_create_and_auth(db_ready):
|
||||||
|
uname = f"it_user_{unique_suffix()}"
|
||||||
|
pw = "P@ssw0rd!"
|
||||||
|
uid = db.create_or_update_user(
|
||||||
|
uname, password=pw, is_admin=False, is_active=True)
|
||||||
|
assert isinstance(uid, int)
|
||||||
|
assert db.verify_user_credentials(uname, pw) is True
|
||||||
|
assert db.verify_user_credentials(uname, "wrong") is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_regions_keywords_upsert_and_list(db_ready):
|
||||||
|
rname = f"it_region_{unique_suffix()}"
|
||||||
|
kname = f"it_keyword_{unique_suffix()}"
|
||||||
|
rid = db.upsert_region(rname)
|
||||||
|
kid = db.upsert_keyword(kname)
|
||||||
|
assert isinstance(rid, int) and rid > 0
|
||||||
|
assert isinstance(kid, int) and kid > 0
|
||||||
|
regions = db.get_all_regions()
|
||||||
|
keywords = db.get_all_keywords()
|
||||||
|
assert rname in regions
|
||||||
|
assert kname in keywords
|
||||||
|
|
||||||
|
|
||||||
|
def test_user_preferences_roundtrip(db_ready):
|
||||||
|
uname = f"it_pref_user_{unique_suffix()}"
|
||||||
|
db.create_or_update_user(uname, is_active=True)
|
||||||
|
r1, r2 = f"it_r1_{unique_suffix()}", f"it_r2_{unique_suffix()}"
|
||||||
|
k1, k2 = f"it_k1_{unique_suffix()}", f"it_k2_{unique_suffix()}"
|
||||||
|
# Set preferences (upserts regions/keywords internally if missing)
|
||||||
|
db.set_user_regions(uname, [r1, r2])
|
||||||
|
db.set_user_keywords(uname, [k1, k2])
|
||||||
|
assert set(db.get_user_regions(uname)) >= {r1, r2}
|
||||||
|
assert set(db.get_user_keywords(uname)) >= {k1, k2}
|
||||||
|
|
||||||
|
|
||||||
|
def test_upsert_listing_details_and_urls(db_ready):
|
||||||
|
# Create a unique URL
|
||||||
|
jid_suffix = unique_suffix()
|
||||||
|
url = f"https://example.org/it/{jid_suffix}.html"
|
||||||
|
db.upsert_listing(
|
||||||
|
url=url,
|
||||||
|
region="it",
|
||||||
|
keyword="integration",
|
||||||
|
title=f"IT Listing {jid_suffix}",
|
||||||
|
pay="N/A",
|
||||||
|
location="Test City",
|
||||||
|
timestamp=now_iso(),
|
||||||
|
)
|
||||||
|
job_data = {
|
||||||
|
"url": url,
|
||||||
|
"title": f"IT Job {jid_suffix}",
|
||||||
|
"company": "Acme Corp",
|
||||||
|
"location": "Test City",
|
||||||
|
"description": "A test job for integration",
|
||||||
|
"id": jid_suffix, # normalize_job_id should use this or fall back to URL
|
||||||
|
"posted_time": now_iso(),
|
||||||
|
}
|
||||||
|
db.upsert_job_details(job_data)
|
||||||
|
urls = db.db_get_all_job_urls()
|
||||||
|
assert url in urls
|
||||||
|
# Cleanup (best-effort)
|
||||||
|
try:
|
||||||
|
db.db_delete_job(jid_suffix)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_cached_page_upsert_and_get(db_ready):
|
||||||
|
jid_suffix = unique_suffix()
|
||||||
|
url = f"https://example.org/it/{jid_suffix}.html"
|
||||||
|
# Ensure a listing exists for FK relation if enforced
|
||||||
|
db.upsert_listing(
|
||||||
|
url=url,
|
||||||
|
region="it",
|
||||||
|
keyword="cache",
|
||||||
|
title=f"IT Cache {jid_suffix}",
|
||||||
|
pay="N/A",
|
||||||
|
location="Test City",
|
||||||
|
timestamp=now_iso(),
|
||||||
|
)
|
||||||
|
fp = f"/tmp/integration_{jid_suffix}.html"
|
||||||
|
db.upsert_cached_page(
|
||||||
|
file_path=fp,
|
||||||
|
url_guess=url,
|
||||||
|
last_modified=now_iso(),
|
||||||
|
size_bytes=123,
|
||||||
|
job_id=int(jid_suffix) if jid_suffix.isdigit() else None,
|
||||||
|
)
|
||||||
|
row = db.db_get_cache_url(url)
|
||||||
|
if row is not None:
|
||||||
|
assert row["url_guess"] == url
|
||||||
|
# Cleanup
|
||||||
|
try:
|
||||||
|
db.remove_cached_page(fp)
|
||||||
|
db.db_remove_cached_url(url)
|
||||||
|
db.db_delete_job(jid_suffix)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_user_interactions_mark_and_visit(db_ready):
|
||||||
|
uname = f"it_user_{unique_suffix()}"
|
||||||
|
db.create_or_update_user(uname, is_active=True)
|
||||||
|
jid_suffix = unique_suffix()
|
||||||
|
url = f"https://example.org/it/{jid_suffix}.html"
|
||||||
|
db.upsert_listing(
|
||||||
|
url=url,
|
||||||
|
region="it",
|
||||||
|
keyword="interact",
|
||||||
|
title=f"IT Interact {jid_suffix}",
|
||||||
|
pay="N/A",
|
||||||
|
location="Test City",
|
||||||
|
timestamp=now_iso(),
|
||||||
|
)
|
||||||
|
# Exercise helpers — absence of exceptions is success for integration
|
||||||
|
db.mark_favorite(jid_suffix, uname, True)
|
||||||
|
db.record_visit(jid_suffix, uname, url=url)
|
||||||
|
# Cleanup
|
||||||
|
try:
|
||||||
|
db.db_delete_job(jid_suffix)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
19
tests/test_users.py
Normal file
19
tests/test_users.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
import pytest
|
||||||
|
from web.db import db_init, create_or_update_user, verify_user_credentials, get_users
|
||||||
|
from web.utils import initialize_users_from_settings
|
||||||
|
|
||||||
|
|
||||||
|
def test_initialize_users_from_settings():
|
||||||
|
db_init()
|
||||||
|
n = initialize_users_from_settings()
|
||||||
|
assert n >= 1 # should at least add 'anonymous'
|
||||||
|
users = get_users()
|
||||||
|
assert any(u['username'] == 'anonymous' for u in users)
|
||||||
|
|
||||||
|
|
||||||
|
def test_create_and_auth_user():
|
||||||
|
db_init()
|
||||||
|
create_or_update_user('testuser', password='secret',
|
||||||
|
is_admin=True, is_active=True)
|
||||||
|
assert verify_user_credentials('testuser', 'secret') is True
|
||||||
|
assert verify_user_credentials('testuser', 'wrong') is False
|
||||||
18
tests/test_utils_config.py
Normal file
18
tests/test_utils_config.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
import importlib
|
||||||
|
import os
|
||||||
|
|
||||||
|
import web.utils as utils
|
||||||
|
|
||||||
|
|
||||||
|
def test_config_loaded():
|
||||||
|
cfg = utils.get_config()
|
||||||
|
assert isinstance(cfg, dict)
|
||||||
|
|
||||||
|
|
||||||
|
def test_http_settings_helpers():
|
||||||
|
assert isinstance(utils.get_user_agent(), str)
|
||||||
|
assert isinstance(utils.get_request_timeout(), int)
|
||||||
|
assert isinstance(utils.get_max_retries(), int)
|
||||||
|
assert isinstance(utils.get_backoff_factor(), int)
|
||||||
|
assert isinstance(utils.get_min_delay(), int)
|
||||||
|
assert isinstance(utils.get_max_delay(), int)
|
||||||
0
web/__init__.py
Normal file
0
web/__init__.py
Normal file
457
web/app.py
Normal file
457
web/app.py
Normal file
@@ -0,0 +1,457 @@
|
|||||||
|
import os
|
||||||
|
from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash
|
||||||
|
from flask_wtf import CSRFProtect
|
||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
|
from web.craigslist import scraper
|
||||||
|
from web.db import (
|
||||||
|
db_init,
|
||||||
|
get_all_jobs,
|
||||||
|
mark_favorite,
|
||||||
|
record_visit,
|
||||||
|
get_users,
|
||||||
|
create_or_update_user,
|
||||||
|
verify_user_credentials,
|
||||||
|
get_user,
|
||||||
|
get_user_regions,
|
||||||
|
get_user_keywords,
|
||||||
|
set_user_regions,
|
||||||
|
set_user_keywords,
|
||||||
|
get_all_regions,
|
||||||
|
get_all_keywords,
|
||||||
|
upsert_region,
|
||||||
|
upsert_keyword,
|
||||||
|
list_regions_full,
|
||||||
|
list_keywords_full,
|
||||||
|
rename_region,
|
||||||
|
rename_keyword,
|
||||||
|
change_region_color,
|
||||||
|
change_keyword_color
|
||||||
|
)
|
||||||
|
from web.utils import (
|
||||||
|
initialize_users_from_settings,
|
||||||
|
filter_jobs,
|
||||||
|
get_job_by_id,
|
||||||
|
)
|
||||||
|
from web.db import get_all_regions, get_all_keywords
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
app.secret_key = os.environ.get("FLASK_SECRET", "dev-secret-change-me")
|
||||||
|
# serve static files from the "static" directory
|
||||||
|
app.static_folder = "static"
|
||||||
|
|
||||||
|
# Enable CSRF protection for all modifying requests (POST/PUT/PATCH/DELETE)
|
||||||
|
csrf = CSRFProtect(app)
|
||||||
|
|
||||||
|
|
||||||
|
def require_admin():
    """Return True iff the session user resolves to an active admin."""
    username = session.get('username')
    if not username:
        return False
    try:
        record = get_user(username)
        allowed = bool(
            record and record.get('is_admin') and record.get('is_active'))
    except Exception:
        # Lookup failure is treated as "not an admin", never as an error.
        return False
    return allowed
|
||||||
|
|
||||||
|
|
||||||
|
def require_login():
    """Return True when the current session carries a (truthy) username."""
    username = session.get('username')
    return bool(username)
|
||||||
|
|
||||||
|
|
||||||
|
@app.context_processor
|
||||||
|
def inject_user_context():
|
||||||
|
username = session.get('username')
|
||||||
|
u = None
|
||||||
|
if username:
|
||||||
|
try:
|
||||||
|
u = get_user(username)
|
||||||
|
except Exception:
|
||||||
|
u = None
|
||||||
|
return {
|
||||||
|
'username': username,
|
||||||
|
'current_user': type('U', (), u)() if isinstance(u, dict) else None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def build_region_palette() -> Dict[str, Dict[str, str]]:
|
||||||
|
"""Return region metadata dict {region: {name, color}} from jobs or DB."""
|
||||||
|
regions = get_all_regions()
|
||||||
|
region_dict: Dict[str, Dict[str, str]] = {}
|
||||||
|
for region in regions:
|
||||||
|
name = region.get('name', '')
|
||||||
|
color = region.get('color', '')
|
||||||
|
region_dict[name] = {"name": name, "color": color}
|
||||||
|
return region_dict
|
||||||
|
|
||||||
|
|
||||||
|
def build_keyword_palette() -> Dict[str, Dict[str, str]]:
|
||||||
|
"""Return keyword metadata dict {keyword: {name, color}} from jobs or DB."""
|
||||||
|
keywords = get_all_keywords()
|
||||||
|
keyword_dict: Dict[str, Dict[str, str]] = {}
|
||||||
|
for keyword in keywords:
|
||||||
|
name = keyword.get('name', '').replace(
|
||||||
|
' ', '').lower()
|
||||||
|
color = keyword.get('color', '')
|
||||||
|
keyword_dict[name] = {"name": name, "color": color}
|
||||||
|
return keyword_dict
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/', methods=['GET'])
|
||||||
|
def index():
|
||||||
|
title = "Bobby Job Listings"
|
||||||
|
all_jobs = get_all_jobs()
|
||||||
|
# Apply user preference filters if no explicit filters provided
|
||||||
|
selected_region = request.args.get("region")
|
||||||
|
selected_keyword = request.args.get("keyword")
|
||||||
|
if not selected_region and session.get('username'):
|
||||||
|
try:
|
||||||
|
prefs = get_user_regions(session['username'])
|
||||||
|
if prefs:
|
||||||
|
# If user has region prefs, filter to them by default
|
||||||
|
all_jobs = [j for j in all_jobs if j.get(
|
||||||
|
'region') in set(prefs)]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
if not selected_keyword and session.get('username'):
|
||||||
|
try:
|
||||||
|
prefs = get_user_keywords(session['username'])
|
||||||
|
if prefs:
|
||||||
|
all_jobs = [j for j in all_jobs if j.get(
|
||||||
|
'keyword') in set(prefs)]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
filtered_jobs = filter_jobs(all_jobs, selected_region, selected_keyword)
|
||||||
|
|
||||||
|
return render_template(
|
||||||
|
"index.html",
|
||||||
|
jobs=filtered_jobs,
|
||||||
|
title=title,
|
||||||
|
regions=build_region_palette(),
|
||||||
|
keywords=build_keyword_palette(),
|
||||||
|
selected_region=selected_region,
|
||||||
|
selected_keyword=selected_keyword,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/regions', methods=['GET'])
|
||||||
|
def regions():
|
||||||
|
# Prefer user's preferred regions; fall back to all DB regions
|
||||||
|
items: List[Dict[str, str]] = []
|
||||||
|
if session.get('username'):
|
||||||
|
try:
|
||||||
|
items = get_user_regions(session['username'])
|
||||||
|
except Exception:
|
||||||
|
items = []
|
||||||
|
if not items:
|
||||||
|
items = get_all_regions()
|
||||||
|
return jsonify(items)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/keywords', methods=['GET'])
|
||||||
|
def keywords():
|
||||||
|
# Prefer user's preferred keywords; fall back to all DB keywords
|
||||||
|
items: List[Dict[str, str]] = []
|
||||||
|
if session.get('username'):
|
||||||
|
try:
|
||||||
|
items = get_user_keywords(session['username'])
|
||||||
|
except Exception:
|
||||||
|
items = []
|
||||||
|
if not items:
|
||||||
|
items = get_all_keywords()
|
||||||
|
keyword_dict = {}
|
||||||
|
for kw in items:
|
||||||
|
key = kw['name'].replace(' ', '').lower()
|
||||||
|
keyword_dict[key] = {
|
||||||
|
"name": kw['name'],
|
||||||
|
"color": kw['color']
|
||||||
|
}
|
||||||
|
return jsonify(keyword_dict)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/jobs', methods=['GET'])
|
||||||
|
def jobs():
|
||||||
|
all_jobs = get_all_jobs()
|
||||||
|
# Respect user preferences when no explicit filters provided
|
||||||
|
region = request.args.get("region")
|
||||||
|
keyword = request.args.get("keyword")
|
||||||
|
if not region and session.get('username'):
|
||||||
|
try:
|
||||||
|
prefs = get_user_regions(session['username'])
|
||||||
|
if prefs:
|
||||||
|
all_jobs = [j for j in all_jobs if j.get(
|
||||||
|
'region') in set(prefs)]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
if not keyword and session.get('username'):
|
||||||
|
try:
|
||||||
|
prefs = get_user_keywords(session['username'])
|
||||||
|
if prefs:
|
||||||
|
all_jobs = [j for j in all_jobs if j.get(
|
||||||
|
'keyword') in set(prefs)]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return jsonify(filter_jobs(all_jobs, region, keyword))
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/job_details', methods=['GET'])
|
||||||
|
def job_details():
|
||||||
|
jobs = get_all_jobs()
|
||||||
|
# Apply preference filtering if present
|
||||||
|
if session.get('username'):
|
||||||
|
try:
|
||||||
|
r = set(get_user_regions(session['username']))
|
||||||
|
k = set(get_user_keywords(session['username']))
|
||||||
|
if r:
|
||||||
|
jobs = [j for j in jobs if j.get('region') in r]
|
||||||
|
if k:
|
||||||
|
jobs = [j for j in jobs if j.get('keyword') in k]
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return jsonify(jobs)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/job/<job_id>', methods=['GET'])
|
||||||
|
def job_by_id(job_id):
|
||||||
|
job = get_job_by_id(job_id)
|
||||||
|
if job:
|
||||||
|
# Record a visit for this user (query param or header), default to 'anonymous'
|
||||||
|
username = request.args.get("username") or request.headers.get(
|
||||||
|
"X-Username") or "anonymous"
|
||||||
|
try:
|
||||||
|
record_visit(str(job.get('id') or job_id),
|
||||||
|
username=username, url=job.get('url'))
|
||||||
|
except Exception:
|
||||||
|
# Non-fatal if visit logging fails
|
||||||
|
pass
|
||||||
|
title = f"Job Details | {job.get('title', 'Unknown')} | ID {job.get('id', '')}"
|
||||||
|
return render_template('job.html', job=job, title=title)
|
||||||
|
return jsonify({"error": "Job not found"}), 404
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/jobs/<job_id>/favorite', methods=['POST'])
def set_favorite(job_id):
    """Mark or unmark a job as favorite for a given user.

    Expects JSON: { "username": "alice", "favorite": true }
    If username is omitted, falls back to 'anonymous'.
    """
    payload = request.get_json(silent=True) or {}
    username = (
        payload.get("username")
        or request.headers.get("X-Username")
        or "anonymous"
    )
    favorite = bool(payload.get("favorite", True))
    try:
        mark_favorite(str(job_id), username=username, favorite=favorite)
    except Exception as e:
        return jsonify({"status": "error", "message": str(e)}), 400
    return jsonify({
        "status": "ok",
        "job_id": str(job_id),
        "username": username,
        "favorite": favorite,
    })
|
||||||
|
|
||||||
|
|
||||||
|
# Exempt JSON favorite endpoint from CSRF (uses fetch without token). Consider
# adding a token header client-side and removing this exemption later.
# TODO(security): this leaves /jobs/<id>/favorite callable cross-site.
csrf.exempt(set_favorite)
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/scrape', methods=['GET'])
def scrape():
    """Trigger the web scraping process."""
    # Run the full scraper orchestration (fetch listings, sync cache, process jobs).
    # NOTE(review): a GET with heavy side effects is easy to trigger accidentally
    # (crawlers, link prefetch) — consider switching to POST; confirm callers first.
    scraper()
    return jsonify({"status": "Scraping completed"})
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------- Auth & Admin UI ------------------------------------------
|
||||||
|
|
||||||
|
@app.route('/login', methods=['GET', 'POST'])
def login():
    """Render the login form and authenticate POSTed credentials.

    On success the username is stored in the session and the user is
    redirected to the admin user list; on failure the form is re-rendered
    with a flash message.
    """
    if request.method == 'POST':
        username = (request.form.get('username') or '').strip()
        password = request.form.get('password') or ''
        # BUG FIX: the previous condition was
        # `verify_user_credentials(username, password) or username`,
        # which let ANY non-empty username in without a valid password.
        if verify_user_credentials(username, password):
            session['username'] = username
            flash('Logged in')
            return redirect(url_for('admin_users'))
        flash('Invalid credentials')
    return render_template('admin/login.html', title='Login')
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/logout')
def logout():
    """Clear the session user and send the visitor back to the login page."""
    session.pop('username', None)
    flash('Logged out')
    return redirect(url_for('login'))
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/admin/users', methods=['GET', 'POST'])
def admin_users():
    """Admin-only user management: create/update a user on POST, list on GET."""
    if not require_admin():
        return redirect(url_for('login'))

    if request.method == 'POST':
        form = request.form
        username = (form.get('username') or '').strip()
        password = form.get('password') or None
        is_admin = bool(form.get('is_admin'))
        # An absent is_active field means "leave the user active".
        raw_active = form.get('is_active')
        is_active = True if raw_active is None else bool(raw_active)
        try:
            create_or_update_user(
                username, password=password, is_admin=is_admin, is_active=is_active)
            flash('User saved')
        except Exception as e:
            flash(f'Error: {e}')
        return redirect(url_for('admin_users'))

    # Wrap plain dicts so templates can use attribute-style access (u.username).
    class UObj(dict):
        __getattr__ = dict.get

    user_rows = [UObj(u) for u in get_users()]
    return render_template('admin/users.html', users=user_rows, title='Users')
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------- User settings (regions/keywords) -------------------------
|
||||||
|
|
||||||
|
def _clean_strings(values):
    """Strip whitespace from each user-supplied value and drop blank/None entries."""
    return [v.strip() for v in values if v and v.strip()]


@app.route('/settings', methods=['GET', 'POST'])
def user_settings():
    """Show and save the current user's preferred regions and keywords.

    Accepts either a JSON body ({"regions": [...], "keywords": [...]}) or an
    HTML form (fields 'regions'/'region' and 'keywords'/'keyword').  JSON
    callers get a JSON status response; form callers get a flash + redirect.
    """
    if not require_login():
        return redirect(url_for('login'))
    username = session['username']
    if request.method == 'POST':
        if request.is_json:
            data = request.get_json(silent=True) or {}
            sel_regions = _clean_strings(data.get('regions') or [])
            sel_keywords = _clean_strings(data.get('keywords') or [])
        else:
            # HTML form fallback: accept both singular and plural field names.
            sel_regions = _clean_strings(
                request.form.getlist('regions') + request.form.getlist('region'))
            sel_keywords = _clean_strings(
                request.form.getlist('keywords') + request.form.getlist('keyword'))
        # Upsert any newly-typed values into the master lists (best-effort).
        for r in sel_regions:
            try:
                upsert_region(r)
            except Exception:
                pass
        for k in sel_keywords:
            try:
                upsert_keyword(k)
            except Exception:
                pass
        try:
            set_user_regions(username, sel_regions)
            set_user_keywords(username, sel_keywords)
            # For JSON callers, return 200 without redirect.
            if request.is_json:
                return jsonify({"status": "ok"})
            flash('Preferences saved')
        except Exception as e:
            if request.is_json:
                return jsonify({"status": "error", "message": str(e)}), 400
            flash(f'Error saving preferences: {e}')
        return redirect(url_for('user_settings'))
    # GET: render with current selections and all master items.
    return render_template(
        'user/settings.html',
        title='Your Preferences',
        all_regions=get_all_regions(),
        all_keywords=get_all_keywords(),
        user_regions=get_user_regions(username),
        user_keywords=get_user_keywords(username),
    )
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/admin/taxonomy', methods=['GET', 'POST'])
def admin_taxonomy():
    """Admin UI for the regions/keywords master lists.

    POST dispatches on the form 'action' field (add/rename/recolor for either
    taxonomy); GET renders both lists.  All mutation errors are reported via
    flash and the user is redirected back to this page.
    """
    if not require_admin():
        return redirect(url_for('login'))
    if request.method == 'POST':
        action = request.form.get('action')
        # A single try covers every action: int() on a bad id, or a helper
        # failure, lands in the flash-and-redirect error path below.
        try:
            if action == 'add_region':
                name = (request.form.get('region_name') or '').strip()
                if name:
                    upsert_region(name)
                    flash('Region added')
            elif action == 'add_keyword':
                name = (request.form.get('keyword_name') or '').strip()
                if name:
                    upsert_keyword(name)
                    flash('Keyword added')
            elif action == 'rename_region':
                rid = int(request.form.get('region_id') or 0)
                new_name = (request.form.get('new_region_name') or '').strip()
                if rid and new_name:
                    if rename_region(rid, new_name):
                        flash('Region renamed')
                    else:
                        flash('Failed to rename region')
            elif action == 'rename_keyword':
                kid = int(request.form.get('keyword_id') or 0)
                new_name = (request.form.get('new_keyword_name') or '').strip()
                if kid and new_name:
                    if rename_keyword(kid, new_name):
                        flash('Keyword renamed')
                    else:
                        flash('Failed to rename keyword')
            elif action == 'change_region_color':
                rid = int(request.form.get('region_id') or 0)
                new_color = (request.form.get(
                    'new_region_color') or '').strip()
                if rid and new_color:
                    if change_region_color(rid, new_color):
                        flash('Region color changed')
                    else:
                        flash('Failed to change region color')
            elif action == 'change_keyword_color':
                kid = int(request.form.get('keyword_id') or 0)
                new_color = (request.form.get(
                    'new_keyword_color') or '').strip()
                if kid and new_color:
                    if change_keyword_color(kid, new_color):
                        flash('Keyword color changed')
                    else:
                        flash('Failed to change keyword color')
        except Exception as e:
            flash(f'Error: {e}')
        return redirect(url_for('admin_taxonomy'))
    regions = list_regions_full()
    keywords = list_keywords_full()
    # Dict-like access in templates (r.name instead of r['name']).

    class O(dict):
        __getattr__ = dict.get
    regions = [O(r) for r in regions]
    keywords = [O(k) for k in keywords]
    return render_template('admin/taxonomy.html', title='Taxonomy', regions=regions, keywords=keywords)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Main function to run the Flask app."""
    # Ensure DB is initialized
    db_init()
    # Seed users from settings.json (idempotent)
    try:
        initialize_users_from_settings()
    except Exception:
        pass
    # NOTE(review): debug=True enables the Werkzeug debugger/reloader — suitable
    # for local development only; do not deploy with it enabled.
    app.run(debug=True, host='127.0.0.1', port=5000)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the web app directly.
if __name__ == "__main__":
    main()
|
||||||
147
web/craigslist.py
Normal file
147
web/craigslist.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
from datetime import datetime, timezone
|
||||||
|
from web.scraper import process_region_keyword, scrape_job_page
|
||||||
|
from web.db import (
|
||||||
|
db_init,
|
||||||
|
upsert_cached_page,
|
||||||
|
upsert_listing,
|
||||||
|
upsert_job_details,
|
||||||
|
url_to_job_id,
|
||||||
|
upsert_user_interaction,
|
||||||
|
db_remove_cached_url,
|
||||||
|
db_sync_cached_pages,
|
||||||
|
db_get_all_job_urls,
|
||||||
|
db_get_cache_url,
|
||||||
|
db_delete_job,
|
||||||
|
remove_job,
|
||||||
|
normalize_cached_page_paths,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Import utility functions
|
||||||
|
from web.utils import (
|
||||||
|
get_cache_dir,
|
||||||
|
make_request_with_retry,
|
||||||
|
now_iso,
|
||||||
|
get_cache_path,
|
||||||
|
cache_page,
|
||||||
|
is_cache_stale,
|
||||||
|
delete_cached_page,
|
||||||
|
get_cached_content,
|
||||||
|
ensure_cache_dir
|
||||||
|
)
|
||||||
|
from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_listings():
    """Fetch job listings from all regions and keywords.

    The DB is treated as a snapshot of the current search results: every
    listing discovered in this run is upserted, and anything else (plus its
    cached page) is deleted.  Returns discovered/new/stale counts.
    """
    # URLs already known to the DB, and those seen during this run.
    existing_db_urls = set(db_get_all_job_urls())
    discovered_urls = set()
    new_rows = []

    # Make sure the regions/keywords master lists are populated (best-effort).
    try:
        seed_regions_keywords_from_listings()
    except Exception:
        pass

    # Walk every (region, keyword) pair from the master lists.
    for region_row in get_all_regions():
        region_name = region_row.get("name")
        if not region_name:
            continue
        for keyword_row in get_all_keywords():
            keyword_name = keyword_row.get("name")
            if not keyword_name:
                continue
            for listing in process_region_keyword(region_name, keyword_name, discovered_urls):
                ts, reg, kw, title, pay, location, url = listing
                discovered_urls.add(url)
                if url not in existing_db_urls:
                    new_rows.append(listing)
                # Upsert or update listing to reflect the current search result.
                upsert_listing(
                    url=url,
                    region=reg,
                    keyword=kw,
                    title=title,
                    pay=pay,
                    location=location,
                    timestamp=ts,
                )

    # Remove stale listings: present in DB but not discovered now.
    stale_urls = existing_db_urls - discovered_urls
    for url in stale_urls:
        try:
            jid = url_to_job_id(url)
            db_delete_job(jid)
            # Also try to remove the cached file and its metadata.
            delete_cached_page(url)
            db_remove_cached_url(url)
        except Exception:
            pass

    return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}
|
||||||
|
|
||||||
|
|
||||||
|
def process_job_url(job_url: str):
    """Fetch (or reuse cached) HTML for one job URL and refresh its cache
    entry and DB details.  Returns the scraped job dict, or None on failure."""
    try:
        job_id = url_to_job_id(job_url)

        # Prefer a fresh cached copy; otherwise hit the network once.
        cache_row = db_get_cache_url(job_url)
        cache_is_fresh = bool(
            cache_row
            and cache_row.get("last_modified")
            and not is_cache_stale(cache_row.get("last_modified"))
        )
        if cache_is_fresh:
            content = get_cached_content(job_url)
        else:
            content = make_request_with_retry(job_url, 1)

        if content is None:
            # Page is gone or unreachable — drop the job entirely.
            remove_job(job_url)
            return None

        # Refresh the on-disk cache and its DB metadata.
        cache_page(job_url, content)
        upsert_cached_page(
            file_path=get_cache_path(job_url),
            url_guess=job_url,
            last_modified=now_iso(),
            size_bytes=len(content),
            job_id=job_id,
        )

        job_data = scrape_job_page(content, job_url)
        if not job_data:
            return None
        upsert_job_details(job_data)
        upsert_user_interaction(
            job_id, seen_at=datetime.now(timezone.utc).isoformat())
        return job_data
    except Exception:
        # Deliberately best-effort: one bad page must not stop the run.
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def scraper():
    """Main function to run the scraper.

    Pipeline: refresh listings from search pages, sync on-disk cache metadata
    into the DB, normalize cached paths, then (re)process every job page.
    """
    ensure_cache_dir()
    db_init()
    # First, fetch current listings from search pages and make DB reflect them.
    # (The returned counts are not used here; the old `jl = ...` local was unused.)
    fetch_listings()
    # Sync any cached files we have on disk into the cached_pages table.
    db_sync_cached_pages(get_cache_dir())
    # Normalize any relative cached file paths to absolute paths in DB.
    # The old `if normalized: pass` was dead code and has been removed.
    normalize_cached_page_paths()
    # Finally, fetch and refresh individual job pages for current listings.
    for url in db_get_all_job_urls():
        process_job_url(url)
|
||||||
|
|
||||||
|
|
||||||
|
# Allow running this module directly as a one-shot scrape.
if __name__ == "__main__":
    scraper()
|
||||||
906
web/db.py
Normal file
906
web/db.py
Normal file
@@ -0,0 +1,906 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
"""MySQL persistence layer for Craigslist scraper (SQLAlchemy ORM only).
|
||||||
|
|
||||||
|
Tables:
|
||||||
|
- users(user_id PK, username UNIQUE, created_at)
|
||||||
|
- cached_pages(file_path PK, url_guess, last_modified, size_bytes, job_id)
|
||||||
|
- job_listings(job_id PK, url UNIQUE, region, keyword, title, pay, location, timestamp)
|
||||||
|
- job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url)
|
||||||
|
- user_interactions(job_id PK FK -> job_listings, user_id FK -> users, seen_at, url_visited, is_user_favorite)
|
||||||
|
- regions(region_id PK, name UNIQUE)
|
||||||
|
- keywords(keyword_id PK, name UNIQUE)
|
||||||
|
- user_regions(user_id FK -> users, region_id FK -> regions, composite PK)
|
||||||
|
- user_keywords(user_id FK -> users, keyword_id FK -> keywords, composite PK)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
import os
|
||||||
|
from typing import Optional, Dict, Any, List
|
||||||
|
from web.utils import (
|
||||||
|
get_url_from_filename,
|
||||||
|
get_color_from_string,
|
||||||
|
url_to_job_id,
|
||||||
|
normalize_job_id,
|
||||||
|
now_iso,
|
||||||
|
get_cache_path,
|
||||||
|
get_mysql_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
# --- SQLAlchemy setup -------------------------------------------------------
|
||||||
|
from sqlalchemy import (
|
||||||
|
create_engine,
|
||||||
|
Column,
|
||||||
|
String,
|
||||||
|
Integer,
|
||||||
|
Text,
|
||||||
|
DateTime,
|
||||||
|
Boolean,
|
||||||
|
ForeignKey,
|
||||||
|
text,
|
||||||
|
)
|
||||||
|
from sqlalchemy.orm import declarative_base, relationship, sessionmaker, Session
|
||||||
|
from werkzeug.security import generate_password_hash, check_password_hash
|
||||||
|
|
||||||
|
from typing import cast
|
||||||
|
|
||||||
|
engine = None # set in db_init()
|
||||||
|
SessionLocal: Optional[sessionmaker] = None
|
||||||
|
Base = declarative_base()
|
||||||
|
|
||||||
|
# Length constants for MySQL compatibility
|
||||||
|
JOB_ID_LEN = 64
|
||||||
|
URL_LEN = 512
|
||||||
|
FILE_PATH_LEN = 512
|
||||||
|
TITLE_LEN = 512
|
||||||
|
SHORT_LEN = 255
|
||||||
|
TIME_LEN = 64
|
||||||
|
|
||||||
|
|
||||||
|
# --- ORM Models --------------------------------------------------------------
|
||||||
|
class User(Base):
    """Application user account, including auth fields added after launch."""
    __tablename__ = "users"
    user_id = Column(Integer, primary_key=True, autoincrement=True)
    username = Column(String(SHORT_LEN), unique=True, nullable=False)
    # NOTE(review): datetime.utcnow is deprecated (3.12+) and this file already
    # imports UTC; `lambda: datetime.now(UTC)` is the modern form, but confirm
    # the MySQL driver accepts tz-aware values before changing stored behavior.
    created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
    password_hash = Column(String(SHORT_LEN))
    is_admin = Column(Boolean, default=False, nullable=False)
    is_active = Column(Boolean, default=True, nullable=False)
    last_login = Column(DateTime, nullable=True)

    interactions = relationship(
        "UserInteraction", back_populates="user", cascade="all, delete-orphan")
|
||||||
|
|
||||||
|
|
||||||
|
class JobListing(Base):
    """One search-result row; job_id is derived from the listing URL."""
    __tablename__ = "job_listings"
    job_id = Column(String(JOB_ID_LEN), primary_key=True)
    url = Column(String(URL_LEN), unique=True)
    region = Column(String(SHORT_LEN))
    keyword = Column(String(SHORT_LEN))
    title = Column(String(TITLE_LEN))
    pay = Column(String(SHORT_LEN))
    location = Column(String(SHORT_LEN))
    timestamp = Column(String(TIME_LEN))

    # Child rows are removed with the listing (delete-orphan cascades).
    description = relationship(
        "JobDescription", back_populates="listing", uselist=False, cascade="all, delete-orphan")
    cached_pages = relationship(
        "CachedPage", back_populates="listing", cascade="all, delete-orphan")
    interactions = relationship(
        "UserInteraction", back_populates="listing", cascade="all, delete-orphan")
|
||||||
|
|
||||||
|
|
||||||
|
class JobDescription(Base):
    """Details scraped from an individual job page, 1:1 with a listing."""
    __tablename__ = "job_descriptions"
    job_id = Column(String(JOB_ID_LEN), ForeignKey("job_listings.job_id",
                                                   ondelete="CASCADE"), primary_key=True)
    title = Column(String(TITLE_LEN))
    company = Column(String(SHORT_LEN))
    location = Column(String(SHORT_LEN))
    description = Column(Text)
    posted_time = Column(String(TIME_LEN))
    url = Column(String(URL_LEN))

    listing = relationship("JobListing", back_populates="description")
|
||||||
|
|
||||||
|
|
||||||
|
class CachedPage(Base):
    """Metadata for an on-disk cached HTML page, keyed by absolute file path."""
    __tablename__ = "cached_pages"
    file_path = Column(String(FILE_PATH_LEN), primary_key=True)
    url_guess = Column(String(URL_LEN))
    last_modified = Column(String(TIME_LEN))
    size_bytes = Column(Integer)
    job_id = Column(String(JOB_ID_LEN), ForeignKey(
        "job_listings.job_id", ondelete="CASCADE"))

    listing = relationship("JobListing", back_populates="cached_pages")
|
||||||
|
|
||||||
|
|
||||||
|
class UserInteraction(Base):
    """Per-(user, job) state: last-seen time, visited URL, favorite flag."""
    __tablename__ = "user_interactions"
    # composite uniqueness on (user_id, job_id)
    job_id = Column(String(JOB_ID_LEN), ForeignKey("job_listings.job_id",
                                                   ondelete="CASCADE"), primary_key=True)
    user_id = Column(Integer, ForeignKey(
        "users.user_id", ondelete="CASCADE"), primary_key=True)
    seen_at = Column(String(TIME_LEN))
    url_visited = Column(String(URL_LEN))
    is_user_favorite = Column(Boolean, default=False)

    user = relationship("User", back_populates="interactions")
    listing = relationship("JobListing", back_populates="interactions")
|
||||||
|
|
||||||
|
|
||||||
|
# --- New preference models: regions, keywords, and user mappings ----------
|
||||||
|
class Region(Base):
    """Master list of searchable regions, with an optional display color."""
    __tablename__ = "regions"
    region_id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(SHORT_LEN), unique=True, nullable=False)
    color = Column(String(SHORT_LEN), nullable=True)
|
||||||
|
|
||||||
|
|
||||||
|
class Keyword(Base):
    """Master list of search keywords, with an optional display color."""
    __tablename__ = "keywords"
    keyword_id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(SHORT_LEN), unique=True, nullable=False)
    color = Column(String(SHORT_LEN), nullable=True)
|
||||||
|
|
||||||
|
|
||||||
|
class UserRegion(Base):
    """Many-to-many link: a user's preferred regions (composite PK)."""
    __tablename__ = "user_regions"
    user_id = Column(Integer, ForeignKey(
        "users.user_id", ondelete="CASCADE"), primary_key=True)
    region_id = Column(Integer, ForeignKey(
        "regions.region_id", ondelete="CASCADE"), primary_key=True)
|
||||||
|
|
||||||
|
|
||||||
|
class UserKeyword(Base):
    """Many-to-many link: a user's preferred keywords (composite PK)."""
    __tablename__ = "user_keywords"
    user_id = Column(Integer, ForeignKey(
        "users.user_id", ondelete="CASCADE"), primary_key=True)
    keyword_id = Column(Integer, ForeignKey(
        "keywords.keyword_id", ondelete="CASCADE"), primary_key=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_session() -> Session:
    """Return a new ORM session, lazily initializing the engine on first use."""
    global engine, SessionLocal
    if engine is None or SessionLocal is None:
        db_init()
    # db_init() always assigns SessionLocal; the assert narrows the Optional.
    assert SessionLocal is not None
    return cast(Session, SessionLocal())
|
||||||
|
|
||||||
|
|
||||||
|
def db_init():
    """Initialize MySQL database and create tables if needed.

    Creates the database itself (utf8mb4) when absent, builds the engine and
    session factory, creates all ORM tables, and best-effort adds the auth
    columns that older databases may be missing.
    """
    global engine, SessionLocal
    cfg = get_mysql_config()
    # Create database if it doesn't exist (connect without selecting a DB).
    root_url = f"mysql+pymysql://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/"
    dbname = cfg["database"]
    root_engine = create_engine(root_url, future=True)
    with root_engine.begin() as conn:
        conn.execute(text(
            f"CREATE DATABASE IF NOT EXISTS `{dbname}` CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci"))
    # Create tables in target DB
    mysql_url = f"mysql+pymysql://{cfg['user']}:{cfg['password']}@{cfg['host']}:{cfg['port']}/{dbname}?charset=utf8mb4"
    engine = create_engine(mysql_url, future=True)
    SessionLocal = sessionmaker(bind=engine, autoflush=False,
                                autocommit=False, future=True)
    Base.metadata.create_all(engine)
    # Ensure new auth columns exist for existing databases. Each ALTER is
    # best-effort because not every MySQL build supports ADD COLUMN IF NOT
    # EXISTS; the previous four copy-pasted try/except blocks are now a loop.
    auth_column_ddls = (
        "ALTER TABLE users ADD COLUMN IF NOT EXISTS password_hash VARCHAR(255) NULL",
        "ALTER TABLE users ADD COLUMN IF NOT EXISTS is_admin TINYINT(1) NOT NULL DEFAULT 0",
        "ALTER TABLE users ADD COLUMN IF NOT EXISTS is_active TINYINT(1) NOT NULL DEFAULT 1",
        "ALTER TABLE users ADD COLUMN IF NOT EXISTS last_login DATETIME NULL",
    )
    with engine.begin() as conn:
        for ddl in auth_column_ddls:
            try:
                conn.execute(text(ddl))
            except Exception:
                pass
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_user_interaction(job_id: str | int, *, user_id: Optional[int] = None, seen_at: Optional[str] = None, url_visited: Optional[str] = None, is_user_favorite: Optional[bool] = None):
    """Upsert a single interaction row for this job.

    Any provided field will be updated; absent fields keep their current value.
    """
    if user_id is None:
        # Unattributed interactions are recorded against the shared account.
        user_id = get_or_create_user("anonymous")
    uid = int(user_id)
    jid = str(job_id)
    with _ensure_session() as session:
        row = session.get(UserInteraction, {"job_id": jid, "user_id": uid})
        if row is None:
            row = UserInteraction(job_id=jid, user_id=uid)
            session.add(row)
        if seen_at is not None:
            row.seen_at = seen_at
        if url_visited is not None:
            row.url_visited = url_visited
        if is_user_favorite is not None:
            row.is_user_favorite = bool(is_user_favorite)
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str, location: str, timestamp: str):
    """Insert or update a job listing row based on job_id derived from URL."""
    job_id = str(url_to_job_id(url))
    fields = {
        "url": url,
        "region": region,
        "keyword": keyword,
        "title": title,
        "pay": pay,
        "location": location,
        "timestamp": timestamp,
    }
    with _ensure_session() as session:
        listing = session.get(JobListing, job_id)
        if listing is None:
            listing = JobListing(job_id=job_id)
            session.add(listing)
        # Overwrite every field so the row mirrors the latest search result.
        for attr, value in fields.items():
            setattr(listing, attr, value)
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_job_details(job_data: Dict[str, Any]):
    """Upsert into job_descriptions table using scraped job details dict."""
    url = job_data.get("url")
    job_id = normalize_job_id(job_data.get("id"), url)
    if not job_id:
        # Without a usable id there is nothing to key the row on.
        return
    job_id = str(job_id)
    # Empty strings are stored as NULLs.
    fields = {
        "title": job_data.get("title") or None,
        "company": job_data.get("company") or None,
        "location": job_data.get("location") or None,
        "description": job_data.get("description") or None,
        "posted_time": job_data.get("posted_time") or None,
        "url": url,
    }
    with _ensure_session() as session:
        desc = session.get(JobDescription, job_id)
        if desc is None:
            desc = JobDescription(job_id=job_id)
            session.add(desc)
        for attr, value in fields.items():
            setattr(desc, attr, value)
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modified: Optional[str], size_bytes: Optional[int], job_id: Optional[int]):
    """Insert or update cached-page metadata, keyed by the absolute file path."""
    # Always store absolute paths.
    abs_fp = os.path.abspath(file_path)
    with _ensure_session() as session:
        page = session.get(CachedPage, abs_fp)
        if page is None:
            page = CachedPage(file_path=abs_fp)
            session.add(page)
        page.url_guess = url_guess
        page.last_modified = last_modified
        page.size_bytes = size_bytes
        page.job_id = str(job_id) if job_id else None
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def remove_cached_page(file_path: str):
    """Delete the cached_pages row keyed by the absolute form of file_path."""
    # Accept either a relative or absolute path; rows are keyed by absolute path.
    target = os.path.abspath(file_path)
    with _ensure_session() as session:
        page = session.get(CachedPage, target)
        if page is not None:
            session.delete(page)
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def db_remove_cached_url(url: str):
    """Remove a cached page by URL."""
    cache_fp = get_cache_path(url)
    try:
        remove_cached_page(cache_fp)
    except Exception:
        # Best-effort: the row may already be gone.
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def db_get_all_cached_pages() -> List[Dict[str, Any]]:
    """Return every cached_pages row as a column-name -> value dict."""
    columns = ("file_path", "url_guess", "last_modified",
               "size_bytes", "job_id")
    with _ensure_session() as session:
        rows = session.execute(text(
            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
        return [dict(zip(columns, row)) for row in rows]
|
||||||
|
|
||||||
|
|
||||||
|
def db_get_cache_url(url: str):
    """Return the data for a specific URL from cached_pages.

    Arguments:
    url -- The URL to look up in the cache.
    """
    columns = ("file_path", "url_guess", "last_modified",
               "size_bytes", "job_id")
    with _ensure_session() as session:
        row = session.execute(text(
            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"), {"u": url}).fetchone()
        return dict(zip(columns, row)) if row else None
|
||||||
|
|
||||||
|
|
||||||
|
def db_sync_cached_pages(cache_dir: str):
    """Scan cache_dir and upsert page metadata into cached_pages table.

    Files already present in cached_pages are skipped; new .html files get a
    best-effort mtime/size plus a URL/job-id guessed from the filename.
    """
    if not os.path.isdir(cache_dir):
        return
    # PERF FIX: build the known-paths set once. The previous code rebuilt the
    # list of file_paths for every file in the walk (O(n*m) membership tests).
    known_paths = {c["file_path"] for c in db_get_all_cached_pages()}
    for root, _, files in os.walk(cache_dir):
        for name in files:
            if not name.lower().endswith(".html"):
                continue
            fp = os.path.abspath(os.path.join(root, name))
            if fp in known_paths:
                continue

            try:
                stat = os.stat(fp)
                mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
                size = stat.st_size
            except OSError:
                # File vanished between walk and stat; record it without metadata.
                mtime = None
                size = None
            url_guess = get_url_from_filename(name)
            job_id = url_to_job_id(url_guess)
            upsert_cached_page(file_path=fp, url_guess=url_guess,
                               last_modified=mtime, size_bytes=size, job_id=job_id)
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_cached_page_paths() -> int:
    """Ensure all cached_pages.file_path values are absolute. Returns number of rows updated/normalized."""
    changed = 0
    with _ensure_session() as session:
        rows = session.execute(text(
            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
        for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
            if not os.path.isabs(fp):
                abs_fp = os.path.abspath(fp)
                # Upsert under absolute path, then remove the relative entry.
                # NOTE(review): upsert_cached_page opens its own session, and
                # the inner `with` below shadows the outer `session` — the
                # statement order matters here; do not reorder.
                upsert_cached_page(
                    file_path=abs_fp,
                    url_guess=url_guess,
                    last_modified=last_modified,
                    size_bytes=size_bytes,
                    job_id=job_id,
                )
                with _ensure_session() as session:
                    session.execute(
                        text("DELETE FROM cached_pages WHERE file_path = :fp"), {"fp": fp})
                    session.commit()
                changed += 1
    return changed
|
||||||
|
|
||||||
|
|
||||||
|
def db_get_keywords() -> List[str]:
    """Return a list of all unique keywords from job listings."""
    with _ensure_session() as session:
        result = session.execute(
            text("SELECT DISTINCT keyword FROM job_listings"))
        return [keyword for (keyword,) in result.fetchall()]
|
||||||
|
|
||||||
|
|
||||||
|
def db_get_regions() -> List[str]:
    """Return every distinct region present in job_listings."""
    with _ensure_session() as session:
        result = session.execute(
            text("SELECT DISTINCT region FROM job_listings"))
        return [region for (region,) in result.fetchall()]
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_jobs():
    """Return every job joined with its description and cached-page info.

    Rows come back newest first (ORDER BY d.posted_time DESC). Each job is
    returned as a dict of presentation-ready fields; the description has
    newlines converted to ``<br />`` for HTML rendering.
    """
    query = """
    SELECT l.job_id
        ,l.title
        ,d.description
        ,l.region
        ,l.keyword
        ,d.company
        ,l.location
        ,l.timestamp
        ,d.posted_time
        ,l.url
        ,c.file_path
        ,c.last_modified
        ,c.url_guess
        ,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale
    FROM job_listings AS l
    INNER JOIN job_descriptions AS d
        ON l.job_id = d.job_id
        AND l.url = d.url
    LEFT JOIN cached_pages AS c ON l.job_id = c.job_id
    ORDER BY d.posted_time DESC
    """
    with _ensure_session() as session:
        rows = session.execute(text(query)).fetchall()
        jobs = []
        for row in rows:
            jobs.append({
                "id": row[0],
                "title": row[1],
                # description may be NULL in the DB; guard before string ops
                "description": (row[2] or "").replace('\n', '<br />').strip(),
                "region": row[3],
                "keyword": row[4],
                "company": row[5],
                "location": row[6],
                "timestamp": row[7],
                "posted_time": row[8],
                "url": row[9],
                "file_path": row[10],
                "last_modified": row[11],
                "url_guess": row[12],
                "url_guess_stale": row[13],
            })
        return jobs
|
||||||
|
|
||||||
|
|
||||||
|
def db_get_all_job_urls() -> List[str]:
    """Return list of job URLs from job_listings."""
    with _ensure_session() as session:
        result = session.execute(text("SELECT url FROM job_listings"))
        return [url for (url,) in result.fetchall()]
|
||||||
|
|
||||||
|
|
||||||
|
def db_delete_job(job_id: str | int):
    """Delete a job row (cascades to details and interactions)."""
    with _ensure_session() as session:
        listing = session.get(JobListing, str(job_id))
        if listing is None:
            return
        session.delete(listing)
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def remove_job(url):
    """Remove a job from the database.

    Best-effort: also deletes the cached-page DB row and the cached file on
    disk when present.
    """
    try:
        jid = url_to_job_id(url)
        db_delete_job(jid)
        cache_fp = get_cache_path(url)
        # cached_pages stores absolute paths, so normalize before deleting
        remove_cached_page(os.path.abspath(cache_fp))
        if os.path.exists(cache_fp):
            os.remove(cache_fp)
    except Exception:
        # NOTE(review): deliberately swallows all errors so a failed cleanup
        # never surfaces to the caller — consider logging here.
        pass
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------- New ORM convenience helpers ------------------------------
|
||||||
|
|
||||||
|
def get_or_create_user(username: str) -> int:
    """Return user_id for username, creating if missing."""
    # Timestamp captured up front so it reflects the request time, not the
    # moment the INSERT finally runs.
    created_at = datetime.now(UTC).isoformat()
    with _ensure_session() as session:
        row = session.execute(
            text("SELECT user_id FROM users WHERE username = :u"), {
                "u": username}
        ).fetchone()
        if row:
            return int(row[0])
        session.execute(
            text("INSERT INTO users(username, created_at) VALUES(:u, :c)"),
            {"u": username, "c": created_at},
        )
        session.commit()
    # open a new session to fetch the id
    with _ensure_session() as session:
        row2 = session.execute(
            text("SELECT user_id FROM users WHERE username = :u"), {
                "u": username}
        ).fetchone()
        if row2:
            return int(row2[0])
    # Edge case retry
    # NOTE(review): recursion assumes a concurrent writer created the row;
    # an unbounded retry could loop if the INSERT silently failed.
    return get_or_create_user(username)
|
||||||
|
|
||||||
|
|
||||||
|
def mark_favorite(job_id: str | int, username: str, favorite: bool = True):
    """Flag (or unflag) a job as a favorite for the given user."""
    uid = get_or_create_user(username)
    upsert_user_interaction(job_id, user_id=uid, is_user_favorite=favorite)
|
||||||
|
|
||||||
|
|
||||||
|
def record_visit(job_id: str | int, username: str, url: Optional[str] = None):
    """Record that the user viewed a job now, optionally with the visited URL."""
    uid = get_or_create_user(username)
    upsert_user_interaction(
        job_id, user_id=uid, seen_at=now_iso(), url_visited=url)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------- User auth/admin helpers ----------------------------------
|
||||||
|
def create_or_update_user(username: str, password: Optional[str] = None, *, is_admin: Optional[bool] = None, is_active: Optional[bool] = None) -> int:
    """Create user if missing; update password/admin/active if provided. Returns user_id.

    Only fields passed as non-None are written, so callers can update one
    attribute without touching the others.
    """
    username = (username or "").strip()
    if not username:
        raise ValueError("username required")
    uid = get_or_create_user(username)
    with _ensure_session() as session:
        # Build dynamic update
        fields = []
        params: Dict[str, Any] = {"u": uid}
        if password is not None:
            # Stored hashed, never plaintext.
            fields.append("password_hash = :ph")
            params["ph"] = generate_password_hash(password)
        if is_admin is not None:
            fields.append("is_admin = :ia")
            params["ia"] = 1 if is_admin else 0
        if is_active is not None:
            fields.append("is_active = :ac")
            params["ac"] = 1 if is_active else 0
        if fields:
            # Field names are fixed literals above; only values are bound.
            q = f"UPDATE users SET {', '.join(fields)} WHERE user_id = :u"
            session.execute(text(q), params)
            session.commit()
    return uid
|
||||||
|
|
||||||
|
|
||||||
|
def set_user_password(username: str, password: str) -> None:
    """Set (or reset) the stored password hash for username."""
    create_or_update_user(username=username, password=password)
|
||||||
|
|
||||||
|
|
||||||
|
def set_user_admin(username: str, is_admin: bool) -> None:
    """Grant or revoke admin status for username."""
    create_or_update_user(username=username, is_admin=is_admin)
|
||||||
|
|
||||||
|
|
||||||
|
def set_user_active(username: str, is_active: bool) -> None:
    """Enable or disable the account for username."""
    create_or_update_user(username=username, is_active=is_active)
|
||||||
|
|
||||||
|
|
||||||
|
def verify_user_credentials(username: str, password: str) -> bool:
    """Validate username/password against stored password_hash.

    Returns False for unknown users, inactive accounts, and accounts with no
    password set. On success, best-effort updates last_login.
    """
    with _ensure_session() as session:
        row = session.execute(text("SELECT password_hash, is_active FROM users WHERE username = :u"), {
            "u": username}).fetchone()
        if not row:
            return False
        ph, active = row[0], bool(row[1])
        # Inactive or passwordless accounts can never log in.
        if not active or not ph:
            return False
        ok = check_password_hash(ph, password)
        if ok:
            # record last_login
            try:
                session.execute(text("UPDATE users SET last_login = :ts WHERE username = :u"), {
                    "ts": datetime.now(UTC), "u": username})
                session.commit()
            except Exception:
                # last_login is informational only; never fail the login over it
                pass
        return ok
|
||||||
|
|
||||||
|
|
||||||
|
def get_users() -> List[Dict[str, Any]]:
    """Return all users as dicts, sorted by username."""
    q = ("SELECT user_id, username, created_at, is_admin, is_active, last_login, "
         "(password_hash IS NOT NULL) AS has_pw FROM users ORDER BY username ASC")
    with _ensure_session() as session:
        records = session.execute(text(q)).fetchall()
    users: List[Dict[str, Any]] = []
    for rec in records:
        created = rec[2]
        users.append({
            "user_id": int(rec[0]),
            "username": rec[1],
            # created_at may be a datetime or already a string depending on driver
            "created_at": created.isoformat() if isinstance(created, datetime) else (created or None),
            "is_admin": bool(rec[3]),
            "is_active": bool(rec[4]),
            "last_login": rec[5].isoformat() if rec[5] else None,
            "has_password": bool(rec[6]),
        })
    return users
|
||||||
|
|
||||||
|
|
||||||
|
def get_user(username: str) -> Optional[Dict[str, Any]]:
    """Return single user dict or None."""
    with _ensure_session() as session:
        rec = session.execute(text(
            "SELECT user_id, username, is_admin, is_active, password_hash, last_login, created_at FROM users WHERE username = :u"
        ), {"u": username}).fetchone()
    if rec is None:
        return None
    created = rec[6]
    return {
        "user_id": int(rec[0]),
        "username": rec[1],
        "is_admin": bool(rec[2]),
        "is_active": bool(rec[3]),
        "password_hash": rec[4],
        "last_login": rec[5].isoformat() if rec[5] else None,
        # created_at may be a datetime or already a string depending on driver
        "created_at": created.isoformat() if isinstance(created, datetime) else (created or None),
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------- Regions/Keywords helpers ---------------------------------
|
||||||
|
def upsert_region(name: str) -> int:
    """Get or create a region by name; return region_id."""
    name = (name or "").strip()
    if not name:
        raise ValueError("Region name cannot be empty")
    with _ensure_session() as session:
        row = session.execute(text("SELECT region_id FROM regions WHERE name = :n"), {
            "n": name}).fetchone()
        if row:
            return int(row[0])
        session.execute(
            text("INSERT INTO regions(name) VALUES (:n)"), {"n": name})
        session.commit()
    # Re-read in a fresh session to pick up the generated id.
    with _ensure_session() as session:
        row2 = session.execute(text("SELECT region_id FROM regions WHERE name = :n"), {
            "n": name}).fetchone()
        if row2:
            return int(row2[0])
    # unlikely retry
    return upsert_region(name)
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_keyword(name: str) -> int:
    """Get or create a keyword by name; return keyword_id."""
    name = (name or "").strip()
    if not name:
        raise ValueError("Keyword name cannot be empty")
    with _ensure_session() as session:
        row = session.execute(text("SELECT keyword_id FROM keywords WHERE name = :n"), {
            "n": name}).fetchone()
        if row:
            return int(row[0])
        session.execute(
            text("INSERT INTO keywords(name) VALUES (:n)"), {"n": name})
        session.commit()
    # Re-read in a fresh session to pick up the generated id.
    with _ensure_session() as session:
        row2 = session.execute(text("SELECT keyword_id FROM keywords WHERE name = :n"), {
            "n": name}).fetchone()
        if row2:
            return int(row2[0])
    # unlikely retry (e.g. concurrent insert)
    return upsert_keyword(name)
|
||||||
|
|
||||||
|
|
||||||
|
def set_user_regions(username: str, region_names: List[str]) -> None:
    """Replace user's preferred regions with given names.

    Applies a minimal diff (only inserts/deletes the changed links) rather
    than rewriting the whole join table.
    """
    user_id = get_or_create_user(username)
    # Normalize and get ids
    names = sorted({(n or "").strip()
                    for n in region_names if (n or "").strip()})
    region_ids: List[int] = [upsert_region(n) for n in names]
    if not region_ids and not names:
        # Clear all if explicitly empty list
        with _ensure_session() as session:
            session.execute(
                text("DELETE FROM user_regions WHERE user_id = :u"), {"u": user_id})
            session.commit()
        return
    desired = set(region_ids)
    with _ensure_session() as session:
        rows = session.execute(text("SELECT region_id FROM user_regions WHERE user_id = :u"), {
            "u": user_id}).fetchall()
        current = set(int(r[0]) for r in rows)
        to_add = desired - current
        to_remove = current - desired
        for rid in to_remove:
            session.execute(text("DELETE FROM user_regions WHERE user_id = :u AND region_id = :r"), {
                "u": user_id, "r": int(rid)})
        for rid in to_add:
            session.execute(text("INSERT INTO user_regions(user_id, region_id) VALUES(:u, :r)"), {
                "u": user_id, "r": int(rid)})
        # Single commit covers the whole diff.
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def set_user_keywords(username: str, keyword_names: List[str]) -> None:
    """Replace user's preferred keywords with given names.

    Mirrors set_user_regions: minimal insert/delete diff on the join table.
    """
    user_id = get_or_create_user(username)
    # Normalize, dedupe and sort the incoming names.
    names = sorted({(n or "").strip()
                    for n in keyword_names if (n or "").strip()})
    keyword_ids: List[int] = [upsert_keyword(n) for n in names]
    if not keyword_ids and not names:
        # An explicitly empty list clears all preferences.
        with _ensure_session() as session:
            session.execute(
                text("DELETE FROM user_keywords WHERE user_id = :u"), {"u": user_id})
            session.commit()
        return
    desired = set(keyword_ids)
    with _ensure_session() as session:
        rows = session.execute(text("SELECT keyword_id FROM user_keywords WHERE user_id = :u"), {
            "u": user_id}).fetchall()
        current = set(int(r[0]) for r in rows)
        to_add = desired - current
        to_remove = current - desired
        for kid in to_remove:
            session.execute(text("DELETE FROM user_keywords WHERE user_id = :u AND keyword_id = :k"), {
                "u": user_id, "k": int(kid)})
        for kid in to_add:
            session.execute(text("INSERT INTO user_keywords(user_id, keyword_id) VALUES(:u, :k)"), {
                "u": user_id, "k": int(kid)})
        # Single commit covers the whole diff.
        session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def get_user_regions(username: str) -> List[Dict[str, str]]:
    """Return preferred region names for a user (empty if none)."""
    with _ensure_session() as session:
        user_row = session.execute(
            text("SELECT user_id FROM users WHERE username = :u"),
            {"u": username}).fetchone()
        # Unknown users simply have no preferences.
        if user_row is None:
            return []
        region_rows = session.execute(text(
            """
            SELECT r.name, r.color
            FROM regions r
            INNER JOIN user_regions ur ON ur.region_id = r.region_id
            WHERE ur.user_id = :u
            ORDER BY r.name ASC
            """
        ), {"u": int(user_row[0])}).fetchall()
        return [{"name": name, "color": color} for name, color in region_rows]
|
||||||
|
|
||||||
|
|
||||||
|
def get_user_keywords(username: str) -> List[Dict[str, str]]:
    """Return preferred keyword names for a user (empty if none)."""
    with _ensure_session() as session:
        user_row = session.execute(
            text("SELECT user_id FROM users WHERE username = :u"),
            {"u": username}).fetchone()
        # Unknown users simply have no preferences.
        if user_row is None:
            return []
        keyword_rows = session.execute(text(
            """
            SELECT k.name, k.color
            FROM keywords k
            INNER JOIN user_keywords uk ON uk.keyword_id = k.keyword_id
            WHERE uk.user_id = :u
            ORDER BY k.name ASC
            """
        ), {"u": int(user_row[0])}).fetchall()
        return [{"name": name, "color": color} for name, color in keyword_rows]
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_regions() -> List[Dict[str, str]]:
    """Return all region names from regions table (sorted)."""
    with _ensure_session() as session:
        fetched = session.execute(
            text("SELECT name, color FROM regions ORDER BY name ASC")).fetchall()
    return [{"name": name, "color": color} for name, color in fetched]
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_keywords() -> List[Dict[str, str]]:
    """Return all keyword names from keywords table (sorted)."""
    with _ensure_session() as session:
        fetched = session.execute(
            text("SELECT name, color FROM keywords ORDER BY name ASC")).fetchall()
    return [{"name": name, "color": color} for name, color in fetched]
|
||||||
|
|
||||||
|
|
||||||
|
def seed_regions_keywords_from_listings() -> Dict[str, int]:
    """Seed regions/keywords tables from distinct values in job_listings if empty.

    Returns dict with counts inserted: {"regions": n1, "keywords": n2}.
    """
    with _ensure_session() as session:
        # Both tables are seeded with the identical procedure; keep it in one
        # helper instead of two copy-pasted halves.
        return {
            "regions": _seed_lookup_table(session, "regions", "region"),
            "keywords": _seed_lookup_table(session, "keywords", "keyword"),
        }


def _seed_lookup_table(session, table: str, column: str) -> int:
    """Populate `table` from distinct job_listings.`column` values if the table
    is empty; return the number of rows inserted.

    `table`/`column` are fixed internal identifiers, never user input, so the
    f-string SQL below is safe.
    """
    existing = session.execute(
        text(f"SELECT COUNT(*) FROM {table}")).scalar_one()
    if int(existing or 0) != 0:
        # Already seeded; leave the table untouched.
        return 0
    rows = session.execute(text(
        f"SELECT DISTINCT {column} FROM job_listings WHERE {column} IS NOT NULL AND {column} != ''")).fetchall()
    inserted = 0
    for r in rows:
        name = r[0]
        if not name:
            continue
        try:
            session.execute(
                text(f"INSERT IGNORE INTO {table}(name, color) VALUES(:n, :c)"),
                {"n": name, "c": get_color_from_string(name)})
            inserted += 1
        except Exception:
            # best-effort: skip values that violate constraints
            pass
    session.commit()
    return inserted
|
||||||
|
|
||||||
|
|
||||||
|
def list_regions_full() -> List[Dict[str, Any]]:
    """Return all regions with ids and colors, sorted by name."""
    with _ensure_session() as session:
        records = session.execute(
            text("SELECT region_id, name, color FROM regions ORDER BY name ASC")).fetchall()
    return [{"region_id": int(rid), "name": name, "color": color}
            for rid, name, color in records]
|
||||||
|
|
||||||
|
|
||||||
|
def list_keywords_full() -> List[Dict[str, Any]]:
    """Return all keywords with ids and colors, sorted by name."""
    with _ensure_session() as session:
        records = session.execute(
            text("SELECT keyword_id, name, color FROM keywords ORDER BY name ASC")).fetchall()
    return [{"keyword_id": int(kid), "name": name, "color": color}
            for kid, name, color in records]
|
||||||
|
|
||||||
|
|
||||||
|
def rename_region(region_id: int, new_name: str) -> bool:
    """Rename a region; True on success, False if the update failed."""
    cleaned = (new_name or "").strip()
    if not cleaned:
        raise ValueError("new_name required")
    with _ensure_session() as session:
        try:
            session.execute(
                text("UPDATE regions SET name = :n WHERE region_id = :id"),
                {"n": cleaned, "id": int(region_id)})
            session.commit()
        except Exception:
            # e.g. duplicate name — roll back and report failure
            session.rollback()
            return False
        return True
|
||||||
|
|
||||||
|
|
||||||
|
def rename_keyword(keyword_id: int, new_name: str) -> bool:
    """Rename a keyword; True on success, False if the update failed."""
    cleaned = (new_name or "").strip()
    if not cleaned:
        raise ValueError("new_name required")
    with _ensure_session() as session:
        try:
            session.execute(
                text("UPDATE keywords SET name = :n WHERE keyword_id = :id"),
                {"n": cleaned, "id": int(keyword_id)})
            session.commit()
        except Exception:
            # e.g. duplicate name — roll back and report failure
            session.rollback()
            return False
        return True
|
||||||
|
|
||||||
|
|
||||||
|
def change_region_color(region_id: int, new_color: str) -> bool:
    """Set a region's display color; True on success, False on failure."""
    color = (new_color or "").strip()
    if not color:
        raise ValueError("new_color required")
    with _ensure_session() as session:
        try:
            session.execute(
                text("UPDATE regions SET color = :c WHERE region_id = :id"),
                {"c": color, "id": int(region_id)})
            session.commit()
        except Exception:
            session.rollback()
            return False
        return True
|
||||||
|
|
||||||
|
|
||||||
|
def change_keyword_color(keyword_id: int, new_color: str) -> bool:
    """Set a keyword's display color; True on success, False on failure."""
    color = (new_color or "").strip()
    if not color:
        raise ValueError("new_color required")
    with _ensure_session() as session:
        try:
            session.execute(
                text("UPDATE keywords SET color = :c WHERE keyword_id = :id"),
                {"c": color, "id": int(keyword_id)})
            session.commit()
        except Exception:
            session.rollback()
            return False
        return True
|
||||||
121
web/scraper.py
Normal file
121
web/scraper.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
from datetime import datetime, UTC
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from typing import List, Dict, Set
|
||||||
|
from web.utils import get_base_url, cache_page, safe_get_text, safe_get_attr, is_cached, get_cached_content, make_request_with_retry
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
    """Parse a single job listing element into a row.

    Returns [timestamp, region, keyword, title, pay, location, url], or []
    when the listing is malformed or its URL was already seen.
    """
    try:
        title_elem = listing.find("div", class_="title")
        url_elem = listing.find("a")
        pay_elem = listing.find("div", class_="attr remuneration")
        if pay_elem:
            pay_elem = pay_elem.find("span", class_="valu")
        location_elem = listing.find("div", class_="location")

        # A listing without a title or link is unusable.
        if not title_elem or not url_elem:
            return []

        url = url_elem["href"]
        # Skip duplicates early, before any further text extraction.
        if url in seen_urls:
            return []

        title = title_elem.get_text(strip=True)
        pay = pay_elem.get_text(strip=True) if pay_elem else "N/A"
        location = location_elem.get_text(
            strip=True) if location_elem else "N/A"

        return [datetime.now(UTC).isoformat(), region, keyword, title, pay, location, url]
    except (AttributeError, KeyError):
        # Malformed markup (missing tag/attribute) — treat as no result.
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_job_page(content: str, url: str) -> Dict:
    """Scrape job details from a job listing page.

    Returns a dict with url/title/company/location/description/id/posted_time;
    missing fields fall back to "N/A" or the empty string.
    """
    soup = BeautifulSoup(content, "html.parser")

    # Extract each field
    title = safe_get_text(soup.find("h1", class_="postingtitle"))
    company = safe_get_text(soup.find("h2", class_="company-name"))

    # Coordinates come from data-* attributes on the map element.
    map_elem = soup.find("div", id="map")
    if map_elem:
        lat = safe_get_attr(map_elem, "data-latitude")
        lon = safe_get_attr(map_elem, "data-longitude")
        accuracy = safe_get_attr(map_elem, "data-accuracy")
        location = f"Lat: {lat}, Lon: {lon}, Accuracy: {accuracy}"
    else:
        location = "N/A"

    # Street address (when present) is prepended to the coordinate string.
    mapaddress = soup.find("div", class_="mapaddress")
    if mapaddress:
        location = safe_get_text(mapaddress) + " " + location

    description_elem = soup.find("section", id="postingbody")
    if description_elem:
        # Re-parse a copy so the decompose() below can't mutate `soup`.
        de = BeautifulSoup(str(description_elem), "html.parser")
        qr_code_elem = de.find(class_="print-qrcode-label")
        # Remove QR code if it exists
        if qr_code_elem:
            qr_code_elem.decompose()
        description = de.text.strip()
    else:
        description = ''

    posting_info = soup.find("div", class_="postinginfos")
    if posting_info:
        pi = BeautifulSoup(str(posting_info), "html.parser")
        postinginfo_tags = pi.find_all("p", class_="postinginfo")
        # First postinginfo paragraph carries the post id text.
        job_id = safe_get_text(postinginfo_tags[0]) if postinginfo_tags else ""
        posted_time_elem = pi.find("time", class_="date timeago")
        posted_time = safe_get_attr(
            posted_time_elem, "datetime") if posted_time_elem else ""
    else:
        job_id = ""
        posted_time = ""

    return {
        "url": url,
        "title": title,
        "company": company,
        "location": location,
        "description": description,
        "id": job_id,
        "posted_time": posted_time
    }
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Parse HTML content to extract job listings.

    Returns one row per usable listing (see scrape_listings_page); empty
    results from malformed or duplicate listings are dropped.
    """
    soup = BeautifulSoup(content, "html.parser")
    listings = soup.find_all("li", class_="cl-static-search-result")
    rows = (scrape_listings_page(listing, region, keyword, seen_urls)
            for listing in listings)
    return [row for row in rows if row]
|
||||||
|
|
||||||
|
|
||||||
|
def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Fetch (or load from cache) the search page for region/keyword and parse it.

    Returns the parsed job rows, or [] when the page could not be fetched.
    """
    url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
    if is_cached(url):
        content = get_cached_content(url)
    else:
        content = make_request_with_retry(url, 3)
        if content is None:
            # All retries failed; nothing to parse.
            return []
        # Cache the freshly fetched page for subsequent runs.
        cache_page(url, content)
    return scrape_job_data(content, region, keyword, seen_urls)
|
||||||
102
web/static/index.js
Normal file
102
web/static/index.js
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
// Update the table with job data
|
||||||
|
function updateTableData(jobs) {
|
||||||
|
const jobsContainer = document.getElementById("jobs");
|
||||||
|
jobsContainer.innerHTML = ""; // Clear existing jobs
|
||||||
|
jobs.forEach((job) => {
|
||||||
|
const jobElement = document.createElement("div");
|
||||||
|
jobElement.classList.add("job");
|
||||||
|
jobElement.innerHTML = `
|
||||||
|
<h3><a href="${job.url}" target="_blank">${job.title}</a></h3>
|
||||||
|
<p class="job-posted-time">${job.posted_time}</p>
|
||||||
|
<span class="job-region region-${job.region
|
||||||
|
.replace(" ", "")
|
||||||
|
.toLowerCase()}">${job.region}</span>
|
||||||
|
<span class="job-keyword keyword-${job.keyword
|
||||||
|
.replace(" ", "")
|
||||||
|
.toLowerCase()}">${job.keyword}</span>
|
||||||
|
`;
|
||||||
|
jobsContainer.appendChild(jobElement);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fetch job data from the server
|
||||||
|
function fetchJobs() {
|
||||||
|
fetch("/jobs")
|
||||||
|
.then((response) => response.json())
|
||||||
|
.then((data) => {
|
||||||
|
updateTableData(data);
|
||||||
|
})
|
||||||
|
.catch((error) => console.error("Error fetching jobs:", error));
|
||||||
|
}
|
||||||
|
|
||||||
|
// scrape form submission
|
||||||
|
function updateScrapeInfo(message, color) {
|
||||||
|
let scrapingInfo = document.getElementById("scrape-info");
|
||||||
|
scrapingInfo.style.display = "inline-block"; // Show the scraping info
|
||||||
|
scrapingInfo.innerText = message;
|
||||||
|
scrapingInfo.style.color = color;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Handle the scrape form: kick off /scrape and report its outcome.
function scrape(event) {
  event.preventDefault(); // Prevent the default form submission
  updateScrapeInfo("Scraping in progress...", "blue");
  fetch("/scrape")
    .then((response) => response.json())
    .then((data) => {
      if (!data.status) {
        updateScrapeInfo("Scraping failed. Please try again.", "red");
        return;
      }
      updateScrapeInfo(data.status, "green");
    })
    .catch((error) => console.error("Error:", error));
}
|
||||||
|
|
||||||
|
// Apply the current region/keyword filters by resubmitting the filter form.
function updateJobsFiltered() {
  const params = new URLSearchParams({
    region: document.getElementById("region").value,
    keyword: document.getElementById("keyword").value,
  });
  const filterForm = document.getElementById("filter-form");
  filterForm.action = `/?${params.toString()}`;
  filterForm.submit(); // full-page reload with the new query string
}
|
||||||
|
|
||||||
|
// Clicking a region badge filters the list by that region.
function regionClick(event) {
  document.getElementById("region").value = event.target.innerText;
  updateJobsFiltered();
}
|
||||||
|
|
||||||
|
// Clicking a keyword badge filters the list by that keyword.
function keywordClick(event) {
  document.getElementById("keyword").value = event.target.innerText;
  updateJobsFiltered();
}
|
||||||
|
|
||||||
|
// Click-to-filter on the keyword/region badges rendered by the server.
document.querySelectorAll(".job-keyword").forEach((element) => {
  element.addEventListener("click", keywordClick);
});
document.querySelectorAll(".job-region").forEach((element) => {
  element.addEventListener("click", regionClick);
});

// Scrape button triggers a background scrape.
document.getElementById("scrape-form").addEventListener("submit", scrape);
// Changing either dropdown immediately re-filters the job list.
document
  .getElementById("region")
  .addEventListener("change", updateJobsFiltered);
document
  .getElementById("keyword")
  .addEventListener("change", updateJobsFiltered);
document
  .getElementById("filter-form")
  .addEventListener("submit", updateJobsFiltered);
// Reset clears both filters and reloads via the filter form.
document.getElementById("reset-filters").addEventListener("click", () => {
  document.getElementById("region").value = "";
  document.getElementById("keyword").value = "";
  updateJobsFiltered();
});
|
||||||
61
web/static/settings.js
Normal file
61
web/static/settings.js
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
/* javascript form handling */
|
||||||
|
document
|
||||||
|
.getElementById("user-settings-form")
|
||||||
|
.addEventListener("submit", function (event) {
|
||||||
|
event.preventDefault(); // Prevent default form submission
|
||||||
|
|
||||||
|
const form = event.target;
|
||||||
|
const formData = new FormData(form);
|
||||||
|
|
||||||
|
// Collect selected regions and keywords
|
||||||
|
const selectedRegions = [];
|
||||||
|
const selectedKeywords = [];
|
||||||
|
formData.forEach((value, key) => {
|
||||||
|
if (key === "region") {
|
||||||
|
selectedRegions.push(value);
|
||||||
|
} else if (key === "keyword") {
|
||||||
|
selectedKeywords.push(value);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// Add new region if provided
|
||||||
|
const newRegion = formData.get("new-region").trim();
|
||||||
|
if (newRegion) {
|
||||||
|
selectedRegions.push(newRegion);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add new keyword if provided
|
||||||
|
const newKeyword = formData.get("new-keyword").trim();
|
||||||
|
if (newKeyword) {
|
||||||
|
selectedKeywords.push(newKeyword);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prepare data to send
|
||||||
|
const dataToSend = {
|
||||||
|
regions: selectedRegions,
|
||||||
|
keywords: selectedKeywords,
|
||||||
|
csrf_token: formData.get("csrf_token"),
|
||||||
|
};
|
||||||
|
|
||||||
|
// Send data via Fetch API
|
||||||
|
fetch(form.action, {
|
||||||
|
method: "POST",
|
||||||
|
headers: {
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
"X-CSRF-Token": document.querySelector('meta[name="csrf-token"]')
|
||||||
|
.content,
|
||||||
|
},
|
||||||
|
body: JSON.stringify(dataToSend),
|
||||||
|
})
|
||||||
|
.then((response) => {
|
||||||
|
if (response.ok) {
|
||||||
|
window.location.reload(); // Reload to reflect changes
|
||||||
|
} else {
|
||||||
|
alert("Error saving preferences.");
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.catch((error) => {
|
||||||
|
console.error("Error:", error);
|
||||||
|
alert("Error saving preferences.");
|
||||||
|
});
|
||||||
|
});
|
||||||
144
web/static/styles.css
Normal file
144
web/static/styles.css
Normal file
@@ -0,0 +1,144 @@
|
|||||||
|
/* Base typography and chrome ------------------------------------------- */
body {
  font-family: Arial, sans-serif;
  margin: 10px;
  font-size: 16px;
}
h1 {
  color: #333;
  font-size: 1.2em;
}
a {
  text-decoration: none;
}
a:hover {
  text-decoration: underline;
}
footer {
  margin-top: 20px;
  text-align: center;
  font-size: 0.9em;
  color: #666;
}
nav {
  margin-bottom: 10px;
}

/* Filter / scrape toolbar ----------------------------------------------- */
#filters {
  display: block;
  margin-bottom: 1rem;
}
#filters #filter-form {
  display: inline-block;
  max-width: 500px;
}
#filters #scrape-form {
  display: inline-block;
  margin-left: 1rem;
}
#filters #scrape-form span#scrape-info {
  display: none;
  color: blue;
  font-size: 0.9em;
}

/* Job listing grid ------------------------------------------------------ */
#jobs {
  margin: 0;
  padding: 0;
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(360px, 1fr));
  gap: 1rem;
}
.job {
  border: 1px solid #ccc;
  padding: 1rem;
  border-radius: 5px;
  background-color: #f9f9f9;
}
.job a {
  display: inline-block;
}
.job h3 {
  margin: 0 0 0.25rem 0;
  font-size: 1.1em;
}

.job-posted-time {
  font-weight: normal;
  font-size: 0.8em;
  color: #666;
  margin: 0.25rem 0;
}

.job-region,
.job-keyword {
  border: 1px solid #ccc;
  border-radius: 0.8rem;
  padding: 0.2rem 0.4rem;
  display: inline;
  margin-right: 0.5rem;
  background-color: rgb(255, 255, 255);
}

/* Job detail page ------------------------------------------------------- */
#job-details {
  max-width: 100%;
  margin: auto;
}

/* FIX: dropped a dead `margin-top: 5px` that was immediately overridden
   by the `margin: 0` shorthand below it (same rule, later declaration). */
.job-description {
  color: #333;
  margin: 0;
  padding: 0;
  line-height: 1.25;
  font-size: 14px;
}
.job-description br {
  margin: -5px 0;
}

.job-title {
  font-weight: bold;
  color: #333;
  text-decoration: underline;
  font-size: 16px;
}

/* Taxonomy Management --------------------------------------------------- */
#regions-table,
#keywords-table {
  margin-top: 20px;
}
#regions-table table,
#keywords-table table {
  max-width: 100%;
  border-collapse: collapse;
}
#regions-table th,
#regions-table td,
#keywords-table th,
#keywords-table td {
  border: 1px solid #ccc;
  padding: 8px;
  text-align: left;
}
#regions-table th,
#keywords-table th {
  background-color: #f9f9f9;
}

/* Admin User Management ------------------------------------------------- */
#users {
  margin-top: 20px;
}
#users table {
  max-width: 100%;
  border-collapse: collapse;
}
#users th,
#users td {
  border: 1px solid #ccc;
  padding: 8px;
  text-align: left;
}
#users th {
  background-color: #f9f9f9;
}
|
||||||
41
web/static/taxonomy.js
Normal file
41
web/static/taxonomy.js
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
// Submit a color change for a region or keyword to the taxonomy admin
// endpoint, then reload the page so the new color is rendered.
// `type` must be "region" or "keyword"; the payload's key names
// (e.g. region_id / region_color) are derived from it.
function updateColor(id, type, newColor) {
  const payload = {
    action:
      type === "region" ? "change_region_color" : "change_keyword_color",
    [type + "_id"]: id,
    [type + "_color"]: newColor,
  };
  const csrfToken = document.querySelector('meta[name="csrf-token"]').content;

  fetch("/admin/taxonomy", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "X-CSRF-Token": csrfToken,
    },
    body: JSON.stringify(payload),
  }).then((response) => {
    if (response.ok) {
      location.reload();
    } else {
      alert("Failed to update " + type + " color");
    }
  });
}
|
||||||
|
|
||||||
|
// Wire the region/keyword color forms (when present) to submit via
// updateColor instead of a full-page POST.
//
// BUG FIX: the original called addEventListener on the result of
// getElementById unconditionally; the admin taxonomy template's forms do
// not carry these ids, so the lookup can return null and the whole script
// crashed on load. Both lookups are now guarded.
const regionColorForm = document.getElementById("region-color-form");
if (regionColorForm) {
  regionColorForm.addEventListener("submit", function (event) {
    event.preventDefault();
    const regionId = this.querySelector('input[name="region_id"]').value;
    const newColor = this.querySelector('input[name="new_region_color"]').value;
    updateColor(regionId, "region", newColor);
  });
}

const keywordColorForm = document.getElementById("keyword-color-form");
if (keywordColorForm) {
  keywordColorForm.addEventListener("submit", function (event) {
    event.preventDefault();
    const keywordId = this.querySelector('input[name="keyword_id"]').value;
    const newColor = this.querySelector(
      'input[name="new_keyword_color"]'
    ).value;
    updateColor(keywordId, "keyword", newColor);
  });
}
|
||||||
9
web/templates/admin/login.html
Normal file
9
web/templates/admin/login.html
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
{% extends 'base.html' %} {% block content %}
|
||||||
|
<h2>Login</h2>
|
||||||
|
<form method="post">
|
||||||
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
|
<label>Username <input type="text" name="username" required /></label>
|
||||||
|
<label>Password <input type="password" name="password" /></label>
|
||||||
|
<button type="submit">Login</button>
|
||||||
|
</form>
|
||||||
|
{% endblock %}
|
||||||
142
web/templates/admin/taxonomy.html
Normal file
142
web/templates/admin/taxonomy.html
Normal file
@@ -0,0 +1,142 @@
|
|||||||
|
{% extends 'base.html' %} {% block content %}
|
||||||
|
<h2>Taxonomy</h2>
|
||||||
|
<section>
|
||||||
|
<h3>Regions</h3>
|
||||||
|
<form method="post">
|
||||||
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
|
<input type="hidden" name="action" value="add_region" />
|
||||||
|
<input type="text" name="region_name" placeholder="New region" required />
|
||||||
|
<label for="region_color">Color:</label>
|
||||||
|
<input type="color" name="region_color" id="region_color" value="#ffffff" />
|
||||||
|
<button type="submit">Add Region</button>
|
||||||
|
</form>
|
||||||
|
<div id="regions-table">
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>ID</th>
|
||||||
|
<th>Name</th>
|
||||||
|
<th>Rename</th>
|
||||||
|
<th>Color</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for r in regions %}
|
||||||
|
<tr>
|
||||||
|
<td>{{ r.region_id }}</td>
|
||||||
|
|
||||||
|
<td>{{ r.name }}</td>
|
||||||
|
<td>
|
||||||
|
<form
|
||||||
|
method="post"
|
||||||
|
style="display: flex; gap: 0.5rem; align-items: center"
|
||||||
|
>
|
||||||
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
|
<input type="hidden" name="action" value="rename_region" />
|
||||||
|
<input type="hidden" name="region_id" value="{{ r.region_id }}" />
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
name="new_region_name"
|
||||||
|
placeholder="New name"
|
||||||
|
required
|
||||||
|
/>
|
||||||
|
<button type="submit">Rename</button>
|
||||||
|
</form>
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<form
|
||||||
|
method="post"
|
||||||
|
style="display: flex; gap: 0.5rem; align-items: center"
|
||||||
|
>
|
||||||
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
|
<input type="hidden" name="action" value="change_region_color" />
|
||||||
|
<input type="hidden" name="region_id" value="{{ r.region_id }}" />
|
||||||
|
<input
|
||||||
|
type="color"
|
||||||
|
name="new_region_color"
|
||||||
|
value="{{ r.color }}"
|
||||||
|
required
|
||||||
|
/>
|
||||||
|
<button type="submit">Change Color</button>
|
||||||
|
</form>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
<section>
|
||||||
|
<h3>Keywords</h3>
|
||||||
|
<form method="post">
|
||||||
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
|
<input type="hidden" name="action" value="add_keyword" />
|
||||||
|
<input type="text" name="keyword_name" placeholder="New keyword" required />
|
||||||
|
<label for="keyword_color">Color:</label>
|
||||||
|
<input
|
||||||
|
type="color"
|
||||||
|
name="keyword_color"
|
||||||
|
id="keyword_color"
|
||||||
|
value="#ffffff"
|
||||||
|
/>
|
||||||
|
<button type="submit">Add Keyword</button>
|
||||||
|
</form>
|
||||||
|
<div id="keywords-table">
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>ID</th>
|
||||||
|
<th>Name</th>
|
||||||
|
<th>Rename</th>
|
||||||
|
<th>Color</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for k in keywords %}
|
||||||
|
<tr>
|
||||||
|
<td>{{ k.keyword_id }}</td>
|
||||||
|
<td>{{ k.name }}</td>
|
||||||
|
<td>
|
||||||
|
<form
|
||||||
|
method="post"
|
||||||
|
style="display: flex; gap: 0.5rem; align-items: center"
|
||||||
|
>
|
||||||
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
|
<input type="hidden" name="action" value="rename_keyword" />
|
||||||
|
<input type="hidden" name="keyword_id" value="{{ k.keyword_id }}" />
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
name="new_keyword_name"
|
||||||
|
placeholder="New name"
|
||||||
|
required
|
||||||
|
/>
|
||||||
|
<button type="submit">Rename</button>
|
||||||
|
</form>
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<form
|
||||||
|
method="post"
|
||||||
|
style="display: flex; gap: 0.5rem; align-items: center"
|
||||||
|
>
|
||||||
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
|
<input type="hidden" name="action" value="change_keyword_color" />
|
||||||
|
<input type="hidden" name="keyword_id" value="{{ k.keyword_id }}" />
|
||||||
|
<input
|
||||||
|
type="color"
|
||||||
|
name="new_keyword_color"
|
||||||
|
value="{{ k.color }}"
|
||||||
|
required
|
||||||
|
/>
|
||||||
|
<button type="submit">Change Color</button>
|
||||||
|
</form>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</section>
|
||||||
|
{% endblock %} {% block footer_scripts %}
|
||||||
|
<script src="{{ url_for('static', filename='taxonomy.js') }}"></script>
|
||||||
|
{% endblock %}
|
||||||
139
web/templates/admin/users.html
Normal file
139
web/templates/admin/users.html
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
{% extends 'base.html' %} {% block content %}
|
||||||
|
<div id="users">
|
||||||
|
<h2>Users</h2>
|
||||||
|
<form id="user-form" method="post" action="{{ url_for('admin_users') }}">
|
||||||
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
|
<table>
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>ID</th>
|
||||||
|
<th>Username</th>
|
||||||
|
<th>Admin</th>
|
||||||
|
<th>Active</th>
|
||||||
|
<th colspan="2">Password</th>
|
||||||
|
<th>Created</th>
|
||||||
|
<th>Last Login</th>
|
||||||
|
<th></th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for u in users %}
|
||||||
|
<tr class="user-row" data-user-id="{{ u.user_id }}">
|
||||||
|
<td>
|
||||||
|
{{ u.user_id }}<input
|
||||||
|
type="hidden"
|
||||||
|
name="user_id"
|
||||||
|
value="{{ u.user_id }}"
|
||||||
|
/>
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
name="username"
|
||||||
|
value="{{ u.username }}"
|
||||||
|
required
|
||||||
|
/>
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<input type="checkbox" name="is_admin" {{ 'checked' if u.is_admin
|
||||||
|
else '' }} />
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<input type="checkbox" name="is_active" {{ 'checked' if u.is_active
|
||||||
|
else '' }} />
|
||||||
|
</td>
|
||||||
|
<td>{{ '✅' if u.has_password else '❌' }}</td>
|
||||||
|
<td><input type="password" name="password" /></td>
|
||||||
|
<td>{{ u.created_at }}</td>
|
||||||
|
<td>{{ u.last_login or 'never' }}</td>
|
||||||
|
<td>
|
||||||
|
<button type="submit" data-user-id="{{ u.user_id }}">Save</button>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
<h3>Create / Update User</h3>
|
||||||
|
<form
|
||||||
|
id="create-update-user-form"
|
||||||
|
method="post"
|
||||||
|
action="{{ url_for('admin_users') }}"
|
||||||
|
>
|
||||||
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
|
<label>Username <input type="text" name="username" required /></label>
|
||||||
|
<label>Password <input type="password" name="password" /></label>
|
||||||
|
<label>Admin <input type="checkbox" name="is_admin" value="1" /></label>
|
||||||
|
<label
|
||||||
|
>Active <input type="checkbox" name="is_active" value="1" checked
|
||||||
|
/></label>
|
||||||
|
<button type="submit">Save</button>
|
||||||
|
</form>
|
||||||
|
{% endblock %} {% block footer_scripts %}
|
||||||
|
<script>
|
||||||
|
// Read the edited values out of one user's table row and send them to the
// admin users endpoint as JSON.
//
// BUG FIXES vs. the original:
//  - `row.querySelector("form")` was null (the form WRAPS the table; it is
//    not inside the row) and `form.elements.username` is ambiguous across
//    rows anyway — all inputs are now looked up row-scoped, and the form is
//    found with `row.closest("form")`.
//  - The password was only transmitted when the user ALREADY had one (the
//    "✅" cell), so an initial password could never be set from the row
//    editor. It is now sent whenever the field is non-empty.
function updateUser(userId) {
  const row = document.querySelector(`.user-row[data-user-id="${userId}"]`);
  const passwordInput = row.querySelector('input[name="password"]');
  const form = row.closest("form");

  const payload = {
    user_id: userId,
    username: row.querySelector('input[name="username"]').value,
    // Omit the key entirely when blank so the server keeps the old password.
    password: passwordInput.value || undefined,
    is_admin: row.querySelector('input[name="is_admin"]').checked,
    is_active: row.querySelector('input[name="is_active"]').checked,
    csrf_token: form.querySelector('input[name="csrf_token"]').value,
  };

  fetch("/admin/users", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  })
    .then((response) => {
      if (response.ok) {
        alert("User updated successfully");
        // Clear the password field after successful update
        passwordInput.value = "";
      } else {
        alert("Error updating user");
      }
    })
    .catch((error) => {
      console.error("Error:", error);
      alert("Error updating user");
    });
}
|
||||||
|
|
||||||
|
// Wire the per-row user editor so each row's Save button submits that
// row's values via updateUser (JSON) instead of a normal form POST.
//
// BUG FIXES vs. the original:
//  - A second "click" listener on the whole form called updateUser for ANY
//    click (toggling a checkbox fired a save) and threw when the click
//    landed outside a .user-row. The submit listener alone is sufficient.
//  - The submit handler read the FIRST `input[name="user_id"]` in the form,
//    so saving any row always saved the first user. The row is now derived
//    from the Save button that triggered the submit (event.submitter).
//  - The create/update form's submit handler only read the password input
//    and did nothing; it is dropped so that form performs its normal POST.
function initUserForm() {
  const form = document.getElementById("user-form");

  form.addEventListener("submit", function (event) {
    event.preventDefault(); // row edits go out as JSON, not a form POST
    const submitter = event.submitter;
    const userId = submitter
      ? submitter.dataset.userId
      : event.target.querySelector('input[name="user_id"]').value;
    updateUser(userId);
  });
}

initUserForm();
|
||||||
|
</script>
|
||||||
|
{% endblock %}
|
||||||
43
web/templates/base.html
Normal file
43
web/templates/base.html
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8" />
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1" />
|
||||||
|
<title>{{ title }}</title>
|
||||||
|
<meta name="csrf-token" content="{{ csrf_token() }}" />
|
||||||
|
<link
|
||||||
|
rel="stylesheet"
|
||||||
|
href="{{ url_for('static', filename='styles.css') }}"
|
||||||
|
/>
|
||||||
|
{% block styles %}{% endblock %} {% block scripts %}{% endblock %}
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
{% block header %}
|
||||||
|
<header>
|
||||||
|
<h1><a href="/">{{ title or 'Admin' }}</a></h1>
|
||||||
|
<nav>
|
||||||
|
{% if username %}<span>Hi, {{ username }}</span> | {% endif %}
|
||||||
|
<a href="{{ url_for('index') }}">Home</a> |
|
||||||
|
<a href="{{ url_for('user_settings') }}">Preferences</a> {% if
|
||||||
|
current_user and current_user.is_admin %} |
|
||||||
|
<a href="{{ url_for('admin_taxonomy') }}">Taxonomy</a> |
|
||||||
|
<a href="{{ url_for('admin_users') }}">Users</a> {% endif %} | {% if
|
||||||
|
session.get('username') %}
|
||||||
|
<a href="{{ url_for('logout') }}">Logout</a> {% else %} |
|
||||||
|
<a href="{{ url_for('login') }}">Login</a>{% endif %}
|
||||||
|
</nav>
|
||||||
|
{% with messages = get_flashed_messages() %} {% if messages %}
|
||||||
|
<ul>
|
||||||
|
{% for m in messages %}
|
||||||
|
<li>{{ m }}</li>
|
||||||
|
{% endfor %}
|
||||||
|
</ul>
|
||||||
|
{% endif %} {% endwith %}
|
||||||
|
</header>
|
||||||
|
{% endblock %} {% block content %}{% endblock %}
|
||||||
|
<footer>
|
||||||
|
<p>© 2025 Job Listings</p>
|
||||||
|
</footer>
|
||||||
|
{% block footer_scripts %}{% endblock %}
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
55
web/templates/index.html
Normal file
55
web/templates/index.html
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
{% extends "base.html" %} {% block styles %}
|
||||||
|
<style>
|
||||||
|
/* for each keyword, create a different background color */
|
||||||
|
{% for keyword in keywords %}
|
||||||
|
.keyword-{{ keywords[keyword].name|replace(' ', '')|lower }} {
|
||||||
|
background-color: {{ keywords[keyword].color }};
|
||||||
|
}
|
||||||
|
{% endfor %}
|
||||||
|
/* for each region, create a different background color */
|
||||||
|
{% for region in regions %}
|
||||||
|
.region-{{ region }} {
|
||||||
|
background-color: {{ regions[region].color }};
|
||||||
|
}{% endfor %}
|
||||||
|
</style>
|
||||||
|
{% endblock %}
|
||||||
|
{% block title %}Job Listings{% endblock %}
|
||||||
|
{% block content %}
|
||||||
|
<div id="filters">
|
||||||
|
<form id="filter-form" method="GET" action="/">
|
||||||
|
<label for="region">Region:</label>
|
||||||
|
<select name="region" id="region">
|
||||||
|
<option value="">All</option>
|
||||||
|
{% for region in regions %}
|
||||||
|
<option value="{{ region }}" {% if region == selected_region %}selected{% endif %}>{{ region }}</option>
|
||||||
|
{% endfor %}
|
||||||
|
</select>
|
||||||
|
<label for="keyword">Keyword:</label>
|
||||||
|
<select name="keyword" id="keyword">
|
||||||
|
<option value="">All</option>
|
||||||
|
{% for keyword in keywords %}
|
||||||
|
<option value="{{ keyword }}" {% if keyword == selected_keyword %}selected{% endif %}>{{ keyword }}</option>
|
||||||
|
{% endfor %}
|
||||||
|
</select>
|
||||||
|
<button type="submit">Filter</button>
|
||||||
|
<button type="button" id="reset-filters">Reset</button>
|
||||||
|
</form>
|
||||||
|
<form id="scrape-form" method="GET" action="/scrape">
|
||||||
|
<button type="submit">Scrape Jobs</button>
|
||||||
|
<span id="scrape-info"></span>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
<div id="jobs">
|
||||||
|
{% for job in jobs %}
|
||||||
|
<div class="job">
|
||||||
|
<h3><a href="{{ job['url'] }}" target="_blank">{{ job['title'] }}</a></h3>
|
||||||
|
<p class="job-posted-time">{{ job['posted_time'] }}</p>
|
||||||
|
<span class="job-region region-{{ job['region'] }}">{{ job['region'] }}</span>
|
||||||
|
<span class="job-keyword keyword-{{ job['keyword']|replace(' ', '')|lower }}">{{ job['keyword'] }}</span>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
</div>
|
||||||
|
{% endblock %}
|
||||||
|
{% block footer_scripts %}
|
||||||
|
<script src="{{ url_for('static', filename='index.js') }}"></script>
|
||||||
|
{% endblock %}
|
||||||
27
web/templates/job.html
Normal file
27
web/templates/job.html
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
{% extends "base.html" %} {% block title %}Job Details{% endblock %} {% block
|
||||||
|
styles %}{% endblock %} {% block content %}
|
||||||
|
<div id="job-details">
|
||||||
|
<p><strong>ID:</strong> {{ job.id }}</p>
|
||||||
|
<p>
|
||||||
|
<strong>Title:</strong> {{ job.title }} | <strong>Company:</strong> {{
|
||||||
|
job.company }} | <strong>Location:</strong> {{ job.location }}
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
<strong>Salary:</strong> {{ job.salary }} | <strong>Posted on:</strong> {{
|
||||||
|
job.posted_date }}
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>Job Description</h2>
|
||||||
|
<hr />
|
||||||
|
<p class="job-description">{{ job.description|safe }}</p>
|
||||||
|
<hr />
|
||||||
|
<p>
|
||||||
|
<strong>Original URL:</strong>
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
<a href="{{ job.url }}" target="_blank" class="job-title"
|
||||||
|
>{{ job.title }}</a
|
||||||
|
>
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
{% endblock %}
|
||||||
84
web/templates/user/settings.html
Normal file
84
web/templates/user/settings.html
Normal file
@@ -0,0 +1,84 @@
|
|||||||
|
{% extends 'base.html' %} {% block title %}Your Preferences{% endblock %} {%
|
||||||
|
block content %}
|
||||||
|
<h2>Your Preferences</h2>
|
||||||
|
<form
|
||||||
|
id="user-settings-form"
|
||||||
|
method="post"
|
||||||
|
action="{{ url_for('user_settings') }}"
|
||||||
|
>
|
||||||
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
|
<fieldset>
|
||||||
|
<legend>Regions</legend>
|
||||||
|
<p>
|
||||||
|
<small>Add new Region:</small>
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
name="new-region"
|
||||||
|
id="new-region"
|
||||||
|
value=""
|
||||||
|
placeholder="Type a region and save to add & select"
|
||||||
|
size="30"
|
||||||
|
/>
|
||||||
|
</p>
|
||||||
|
{% if all_regions %} {% for r in all_regions %}
|
||||||
|
<label style="display: block; background-color: {{ r.color }}">
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
name="region"
|
||||||
|
id="region-{{ r.name }}"
|
||||||
|
value="{{ r.name }}"
|
||||||
|
{%
|
||||||
|
if
|
||||||
|
r
|
||||||
|
in
|
||||||
|
user_regions
|
||||||
|
%}checked{%
|
||||||
|
endif
|
||||||
|
%}
|
||||||
|
/>
|
||||||
|
{{ r.name }}
|
||||||
|
</label>
|
||||||
|
{% endfor %} {% else %}
|
||||||
|
<p>No regions available. Ask an admin to add some.</p>
|
||||||
|
{% endif %}
|
||||||
|
</fieldset>
|
||||||
|
<fieldset>
|
||||||
|
<legend>Keywords</legend>
|
||||||
|
<p>
|
||||||
|
<small>Add new Keyword:</small>
|
||||||
|
<input
|
||||||
|
type="text"
|
||||||
|
name="new-keyword"
|
||||||
|
id="new-keyword"
|
||||||
|
value=""
|
||||||
|
placeholder="Type a keyword and save to add & select"
|
||||||
|
size="30"
|
||||||
|
/>
|
||||||
|
</p>
|
||||||
|
{% if all_keywords %} {% for k in all_keywords %}
|
||||||
|
<label style="display: block; background-color: {{ k.color }}">
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
name="keyword"
|
||||||
|
id="keyword-{{ k.name }}"
|
||||||
|
value="{{ k.name }}"
|
||||||
|
{%
|
||||||
|
if
|
||||||
|
k
|
||||||
|
in
|
||||||
|
user_keywords
|
||||||
|
%}checked{%
|
||||||
|
endif
|
||||||
|
%}
|
||||||
|
/>
|
||||||
|
{{ k.name }}
|
||||||
|
</label>
|
||||||
|
{% endfor %} {% else %}
|
||||||
|
<p>No keywords available. Ask an admin to add some.</p>
|
||||||
|
{% endif %}
|
||||||
|
</fieldset>
|
||||||
|
<button type="submit">Save</button>
|
||||||
|
</form>
|
||||||
|
{% endblock %} {% block footer_scripts %}
|
||||||
|
<script src="{{ url_for('static', filename='settings.js') }}"></script>
|
||||||
|
{% endblock %}
|
||||||
336
web/utils.py
Normal file
336
web/utils.py
Normal file
@@ -0,0 +1,336 @@
|
|||||||
|
"""
|
||||||
|
Utility functions for the Craigslist scraper.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Any, Optional as _Optional
|
||||||
|
from datetime import datetime, UTC
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
import requests
|
||||||
|
import time
|
||||||
|
from typing import Optional, List, Dict
|
||||||
|
|
||||||
|
|
||||||
|
def get_config_file() -> str:
    """Absolute path of the project's main settings file.

    Resolves to ``<repo>/config/settings.json`` relative to this module.
    """
    here = os.path.dirname(__file__)
    return os.path.abspath(os.path.join(here, '..', 'config', 'settings.json'))
|
||||||
|
|
||||||
|
|
||||||
|
def get_config() -> dict:
    """Load and return the settings file as a dict.

    Any failure (missing file, bad JSON, I/O error) yields an empty dict so
    callers can rely on ``.get`` chains with defaults.
    """
    try:
        with open(get_config_file(), 'r', encoding='utf-8') as fh:
            return json.load(fh)
    except Exception:
        # Best-effort: configuration is optional everywhere downstream.
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
def get_users_from_settings() -> List[Dict]:
    """Return sanitized user entries from settings.json.

    Non-list ``users`` values yield []; non-dict entries and entries with a
    blank/missing username are skipped. Each result has exactly the keys
    ``username``, ``is_admin`` (bool) and ``password`` (str, '' if unset).
    """
    raw = get_config().get('users', [])
    if not isinstance(raw, list):
        return []

    cleaned: List[Dict] = []
    for entry in raw:
        if not isinstance(entry, dict):
            continue
        name = (entry.get('username') or '').strip()
        if not name:
            continue
        cleaned.append({
            'username': name,
            'is_admin': bool(entry.get('is_admin', False)),
            'password': entry.get('password') or '',
        })
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def initialize_users_from_settings() -> int:
    """Sync users declared in settings.json into the database.

    Every settings user is created/updated as active, with the configured
    admin flag and (optional) password applied.

    Returns the number of users processed.
    """
    from web.db import create_or_update_user  # local import to avoid cycles

    processed = 0
    for user in get_users_from_settings():
        create_or_update_user(
            user['username'],
            password=user.get('password') or None,
            is_admin=bool(user.get('is_admin', False)),
            is_active=True,
        )
        processed += 1
    return processed
|
||||||
|
|
||||||
|
|
||||||
|
def verify_credentials(username: str, password: str) -> bool:
    """Check a username/password pair against the database.

    Thin proxy over ``web.db.verify_user_credentials`` (imported lazily to
    avoid an import cycle with the db module).
    """
    from web.db import verify_user_credentials
    return verify_user_credentials(username, password)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Database configuration helpers ---
|
||||||
|
|
||||||
|
def get_mysql_config() -> dict:
    """MySQL/MariaDB connection settings with sensible local defaults.

    Reads ``database.mysql`` from the config; missing keys fall back to a
    local root connection against the ``jobs`` schema.
    """
    mysql = get_config().get('database', {}).get('mysql', {})
    defaults = {
        'host': '127.0.0.1',
        'user': 'root',
        'password': '',
        'database': 'jobs',
        'port': 3306,
    }
    return {key: mysql.get(key, fallback) for key, fallback in defaults.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def get_http_setting(key: str, default=None):
    """Look up one key in the config's 'http' section."""
    return get_config().get('http', {}).get(key, default)


def get_paths() -> dict:
    """The 'paths' section of the config (empty dict when absent)."""
    return get_config().get('paths', {})


def get_cache_dir() -> str:
    """Directory used for cached pages (defaults to 'cache')."""
    return get_paths().get('cache_dir', 'cache')


def get_logs_dir() -> str:
    """Directory used for log files (defaults to 'logs')."""
    return get_paths().get('logs_dir', 'logs')


def get_user_agent() -> str:
    """Configured HTTP User-Agent string (None when unset)."""
    return get_http_setting('user_agent')


def get_request_timeout() -> int:
    """Configured per-request timeout in seconds (None when unset)."""
    return get_http_setting('request_timeout')


def get_max_retries() -> int:
    """Configured maximum number of HTTP retries (None when unset)."""
    return get_http_setting('max_retries')


def get_backoff_factor() -> int:
    """Configured retry backoff factor (None when unset)."""
    return get_http_setting('backoff_factor')


def get_min_delay() -> int:
    """Configured minimum inter-request delay, seconds (None when unset)."""
    return get_http_setting('min_delay')


def get_max_delay() -> int:
    """Configured maximum inter-request delay, seconds (None when unset)."""
    return get_http_setting('max_delay')


def get_base_url() -> str:
    """Search URL template for the scraper (Craigslist jobs default)."""
    default = "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel"
    return get_config().get('scraper', {}).get('base_url', default)
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_cache_dir():
    """Create the cache directory if it does not already exist."""
    os.makedirs(get_cache_dir(), exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
def now_iso() -> str:
|
||||||
|
"""Get the current time in ISO format."""
|
||||||
|
return datetime.now(UTC).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def get_filename_from_url(url: str) -> str:
    """Convert a URL into a filesystem-safe cache file name.

    Strips the ``https://`` scheme and maps '/', '?' and '&' to underscores.
    """
    trimmed = url.replace("https://", "")
    for ch in "/?&":
        trimmed = trimmed.replace(ch, "_")
    return trimmed
|
||||||
|
|
||||||
|
|
||||||
|
def url_to_job_id(url: str) -> int:
    """Job id from a Craigslist URL: last path segment minus '.html'.

    Returns 0 when that segment is not purely numeric.
    """
    tail = url.rstrip("/").split("/")[-1].replace(".html", "")
    return int(tail) if tail.isdigit() else 0
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_job_id(raw_id: Optional[str], url: Optional[str]) -> Optional[int]:
    """Best-effort numeric job id.

    Prefers the first run of 5+ digits in *raw_id* (e.g. 'post id:
    1234567890'); falls back to the id embedded in *url*; None when
    neither source yields an id.
    """
    if raw_id:
        match = re.search(r"(\d{5,})", raw_id)
        if match:
            return int(match.group(1))
    if url:
        return url_to_job_id(url)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def get_url_from_filename(name: str) -> str:
    """Best-effort inverse of get_filename_from_url.

    Drops the extension and turns underscores back into path slashes.
    Query separators are not recoverable, so the result is only a guess.
    """
    stem, _ext = os.path.splitext(name)
    return "https://" + stem.replace("_", "/")
|
||||||
|
|
||||||
|
|
||||||
|
def get_cached_content(url: str) -> str:
    """Read and return the cached HTML for *url*.

    Raises FileNotFoundError when the page has not been cached; callers
    should check is_cached() first.
    """
    path = get_cache_path(url)
    with open(path, "r", encoding="utf-8") as fh:
        return fh.read()
|
||||||
|
|
||||||
|
|
||||||
|
def safe_get_text(element, default="N/A"):
    """Stripped text of a BeautifulSoup element, or *default* when falsy."""
    if not element:
        return default
    return element.get_text(strip=True)
|
||||||
|
|
||||||
|
|
||||||
|
def safe_get_attr(element, attr, default="N/A"):
    """Attribute value of a BeautifulSoup element, or *default* when the
    element is falsy or the attribute is missing."""
    if not element:
        return default
    return element.get(attr, default)
|
||||||
|
|
||||||
|
|
||||||
|
def get_random_delay(min_delay: Optional[int] = None, max_delay: Optional[int] = None) -> float:
    """Random inter-request delay, uniform between the two bounds (seconds).

    BUG FIX: the defaults were previously written as
    ``min_delay: int = get_min_delay()``, which evaluates the config lookup
    ONCE at import time — freezing the configured values for the process
    lifetime and failing at import when the config is unavailable. The
    bounds are now resolved lazily on each call.

    Args:
        min_delay: Lower bound in seconds; defaults to the configured value.
        max_delay: Upper bound in seconds; defaults to the configured value.
    """
    if min_delay is None:
        min_delay = get_min_delay()
    if max_delay is None:
        max_delay = get_max_delay()
    return random.uniform(min_delay, max_delay)
|
||||||
|
|
||||||
|
|
||||||
|
def get_cache_path(url: str) -> str:
    """Filesystem path where the page for *url* is (or would be) cached."""
    filename = f"{get_filename_from_url(url)}.html"
    return os.path.join(get_cache_dir(), filename)
|
||||||
|
|
||||||
|
|
||||||
|
def cache_page(url: str, content: str):
    """Write *content* to the cache file for *url*.

    Creates the cache directory on demand and touches the file's mtime so
    age-based checks (is_cached) see it as freshly written.
    """
    path = get_cache_path(url)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(content)
    # Refresh the modification time explicitly.
    os.utime(path, None)
|
||||||
|
|
||||||
|
|
||||||
|
def is_cached(url: str) -> bool:
    """Return True if *url* has a cache file that is still usable.

    Search-result pages expire after 24 hours; any other page counts as
    cached for as long as its file exists.
    """
    path = get_cache_path(url)
    if not os.path.isfile(path):
        return False
    # Only search pages age out; other cached pages are kept indefinitely.
    if 'search' in url:
        age_seconds = time.time() - os.path.getmtime(path)
        return age_seconds <= 24 * 3600  # 24-hour freshness window
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def is_cache_stale(last_modified: str, days: int = 1) -> bool:
    """Return True when a cached record is older than *days* days.

    An empty or missing *last_modified* counts as stale, forcing a
    refresh.  The timestamp is parsed as ISO-8601 (datetime.fromisoformat);
    naive timestamps are interpreted in local time by .timestamp().
    """
    if not last_modified:
        return True
    cached_at = datetime.fromisoformat(last_modified)
    age_seconds = time.time() - cached_at.timestamp()
    max_age_seconds = days * 24 * 3600
    return age_seconds > max_age_seconds
|
||||||
|
|
||||||
|
|
||||||
|
def delete_cached_page(url: str):
    """Best-effort removal of the cached file for *url*.

    Failing to evict a cache entry must never break the caller, so
    filesystem errors are swallowed.  Fixes two issues in the original:
    the ``exists()``-then-``remove()`` race (EAFP instead), and the
    over-broad ``except Exception`` that could mask real bugs — only
    OSError subclasses are expected from os.remove().
    """
    cache_fp = get_cache_path(url)
    try:
        os.remove(cache_fp)
    except FileNotFoundError:
        pass  # already gone — nothing to do
    except OSError:
        pass  # permissions / transient FS error: keep best-effort contract
|
||||||
|
|
||||||
|
|
||||||
|
def get_color_from_string(s: str) -> str:
    """Deterministically map a string to a light hex color ("#RRGGBB").

    Bug fix: this previously used the built-in ``hash()``, which is
    salted per process for strings (PYTHONHASHSEED), so the same input
    produced a different color on every run.  An MD5 digest gives a
    stable mapping across processes and platforms (MD5 is fine here:
    this is not a security use, just a spread function).
    """
    import hashlib  # local import keeps the fix self-contained

    digest = hashlib.md5(s.encode("utf-8")).digest()
    # First three digest bytes become the R, G, B channels.
    r, g, b = digest[0], digest[1], digest[2]
    # Clamp each channel into 128-255 so the color stays light enough
    # to serve as a background behind dark text (same rule as before).
    r = max(128, min(255, r))
    g = max(128, min(255, g))
    b = max(128, min(255, b))
    return f"#{(r << 16) | (g << 8) | b:06X}"
|
||||||
|
|
||||||
|
|
||||||
|
# ---- App helpers moved from app.py for reuse and readability -------------
|
||||||
|
|
||||||
|
|
||||||
|
def filter_jobs(
    jobs: List[Dict[str, Any]],
    region: _Optional[str] = None,
    keyword: _Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Return the subset of *jobs* matching the given region and keyword.

    A None (or empty) region/keyword means "don't filter on that field".
    """
    def _matches(job: Dict[str, Any]) -> bool:
        # Each active filter must match exactly; inactive filters pass.
        if region and job.get("region") != region:
            return False
        if keyword and job.get("keyword") != keyword:
            return False
        return True

    return [job for job in jobs if _matches(job)]
|
||||||
|
|
||||||
|
|
||||||
|
def get_job_by_id(job_id):
    """Fetch job details by job ID from the database.

    Matches either the "id" or "job_id" field, compared as strings so
    int/str callers behave the same.  Returns an empty dict when nothing
    matches or the database layer is unavailable.
    """
    try:
        from web.db import get_all_jobs  # lazy import to avoid cycles
        wanted = str(job_id)
        for job in get_all_jobs():
            if wanted in (str(job.get("id")), str(job.get("job_id"))):
                return job
    except Exception:
        # Best effort: a missing or broken DB simply reads as "not found".
        pass
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def make_request_with_retry(url: str, max_retries: Optional[int] = None) -> Optional[str]:
    """Fetch *url* with retries, exponential backoff, and random jitter.

    Returns the response body text, or None for non-retryable failures
    (403/404/410), for any other >=400 status on the final attempt, or
    when every attempt raised a network error.

    Fixes relative to the original:
    - ``max_retries`` used to default to ``get_max_retries()`` evaluated
      once at import time; it is now looked up per call (None sentinel),
      so configuration changes take effect immediately.
    - Dropped a dead pre-loop ``delay`` assignment and a no-op
      ``raise_for_status()`` (every >=400 status is already handled, and
      raise_for_status() does nothing for <400).
    - Collapsed the three except clauses into their common base
      ``RequestException`` (Timeout and ConnectionError both subclass it).
    """
    if max_retries is None:
        max_retries = get_max_retries()

    headers = {'User-Agent': get_user_agent()}

    for attempt in range(max_retries):
        # Exponential backoff with jitter; we only sleep before retries.
        delay = get_random_delay() * (get_backoff_factor() ** attempt)
        try:
            if attempt > 0:
                time.sleep(delay)

            resp = requests.get(url, headers=headers,
                                timeout=get_request_timeout())

            if resp.status_code in (403, 404, 410):
                # Permanent failures: retrying won't help.
                return None
            elif resp.status_code == 429:
                time.sleep(delay * 3)  # back off harder when rate-limited
                continue
            elif resp.status_code >= 400:
                if attempt == max_retries - 1:
                    return None
                continue

            return resp.text

        except requests.exceptions.RequestException:
            # Transient network error (timeout, connection reset, ...):
            # fall through to the retry sleep below.
            pass

        if attempt < max_retries - 1:
            time.sleep(get_random_delay() * (get_backoff_factor() ** attempt))

    return None
|
||||||
Reference in New Issue
Block a user