Compare commits

43 commits, 7a3d665538...main:

f8a5b1b5ef, 02e3e77f78, e0bc295936, 2185a07ff0, 8afb208985, d9a224fc36, 1678e1366e, fee955f01d, a51d500777, 2238a286d4, 92b6efb550, f48f5dc036, 053a9988a8, 504dc8e2b0, 8e3a6f4f41, 5cbb760005, 2ae1e2058d, e549fae3f6, c4761c257c, b6f9d39ad8, 94730439a2, 39900ea564, e26dc9c164, c4a5ed56b5, e947520be9, 89d38c91cf, e7c7861b52, dd9772997d, 1a3e6ce3de, 9e18323e5f, 7c743a56cc, 8a40e7115f, 56315fe147, 0f5c2fcf31, a369972119, 008eb7906b, 5940e1f8b4, 042a196718, f8e23d0fba, 2201185599, fe2a579fc4, 7379d3040d, 932a85e279
.dockerignore (new file, 63 lines)
@@ -0,0 +1,63 @@

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
.venv/
venv/
ENV/
env/

# IDE
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Git
.git/
.gitignore

# Logs
logs/
*.log

# Cache
cache/

# Testing
.pytest_cache/
.coverage
htmlcov/

# Documentation
docs/_build/

# Docker
Dockerfile*
docker-compose*.yml
.dockerignore
README-Docker.md
deploy.sh
.gitea/workflows/ci.yml (new file, 98 lines)
@@ -0,0 +1,98 @@

name: CI/CD Pipeline

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run tests
        run: |
          python -m pytest tests/ -v

      # - name: Run linting
      #   run: |
      #     python -m flake8 web/ tests/ --max-line-length=120

  build-image:
    runs-on: ubuntu-latest
    needs: test
    env:
      DEFAULT_BRANCH: main
      REGISTRY_URL: ${{ secrets.REGISTRY_URL }}
      REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
      REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Collect workflow metadata
        id: meta
        shell: bash
        run: |
          ref_name="${GITHUB_REF_NAME:-${GITHUB_REF##*/}}"
          event_name="${GITHUB_EVENT_NAME:-}"
          sha="${GITHUB_SHA:-}"

          if [ "$ref_name" = "${DEFAULT_BRANCH:-main}" ]; then
            echo "on_default=true" >> "$GITHUB_OUTPUT"
          else
            echo "on_default=false" >> "$GITHUB_OUTPUT"
          fi

          echo "ref_name=$ref_name" >> "$GITHUB_OUTPUT"
          echo "event_name=$event_name" >> "$GITHUB_OUTPUT"
          echo "sha=$sha" >> "$GITHUB_OUTPUT"

      - name: Set up QEMU and Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to registry (best-effort)
        if: ${{ steps.meta.outputs.on_default == 'true' }}
        uses: docker/login-action@v3
        continue-on-error: true
        with:
          registry: ${{ env.REGISTRY_URL }}
          username: ${{ env.REGISTRY_USERNAME }}
          password: ${{ env.REGISTRY_PASSWORD }}

      - name: Build (and optionally push) image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: Dockerfile
          push: ${{ steps.meta.outputs.on_default == 'true' && steps.meta.outputs.event_name != 'pull_request' && (env.REGISTRY_URL != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '') }}
          tags: |
            ${{ env.REGISTRY_URL }}/allucanget/jobs:latest
            ${{ env.REGISTRY_URL }}/allucanget/jobs:${{ steps.meta.outputs.sha }}

  # deploy:
  #   runs-on: ubuntu-latest
  #   needs: test
  #   if: github.ref == 'refs/heads/main'

  #   steps:
  #     - name: Checkout code
  #       uses: actions/checkout@v4

  #     - name: Deploy to production
  #       run: |
  #         echo "Deploying to production..."
  #         docker-compose up -d
.gitignore (vendored, 4 changed lines)
@@ -1,4 +1,3 @@
-.github/copilot*
 cache/
 logs/
 
@@ -165,3 +164,6 @@ cython_debug/
 #.idea/
 
 docs/online.md
+.github/copilot*
+.github/TODO.md
+.vscode/launch.json
Dockerfile (new file, 51 lines)
@@ -0,0 +1,51 @@

# Use Python 3.11 slim image
FROM python:3.11-slim-bookworm

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
ENV FLASK_ENV=production

# Add apt-cacher-ng configuration (if APT_CACHER_NG is set)
RUN if [ -n "$APT_CACHER_NG" ]; then echo 'Acquire::http { Proxy "'"$APT_CACHER_NG"'/"; };' > /etc/apt/apt.conf.d/01proxy; fi

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    default-libmysqlclient-dev \
    default-mysql-client \
    pkg-config \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Create app directory
WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create necessary directories
RUN mkdir -p cache logs

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/ || exit 1

# Copy entrypoint script
COPY docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh

# Set entrypoint
ENTRYPOINT ["docker-entrypoint.sh"]

# Run gunicorn
CMD ["gunicorn", "--config", "gunicorn.conf.py", "web.app:app"]
README-Docker.md (new file, 409 lines)
@@ -0,0 +1,409 @@

# Jobs App - Docker Deployment

This application is a Craigslist job scraper with a Flask web interface.

## Quick Start with Docker

### Prerequisites

- Docker
- Docker Compose

### Deployment

1. **Clone the repository and navigate to the project directory**

   ```bash
   cd /path/to/jobs-app/jobs
   ```

2. **Start the application**

   ```bash
   docker-compose up --build -d
   ```

3. **Wait for services to be ready** (about 30 seconds)

   ```bash
   # Check if the app is running
   curl http://localhost:8000
   ```

4. **Access the application**
   - Main app: <http://localhost:8000>
   - Admin interface: <http://localhost:8000/admin/users>
     - Username: `admin`
     - Password: `M11ffpgm.`
   - Scraper interface: <http://localhost:8000/scrape-page>

## Docker Architecture

### Services

- **jobs-app**: Flask application with Gunicorn WSGI server
- **mysql**: MySQL 8.0 database

### Ports

- 8000: Flask application
- 3306: MySQL database (exposed for external access if needed)

### Volumes

- `mysql_data`: Persistent MySQL data storage

## Configuration

### Environment Variables

- `FLASK_ENV`: Set to `production`
- `FLASK_SECRET`: Secret key for Flask sessions (required)
- `APT_CACHER_NG`: Optional URL for apt-cacher-ng proxy to speed up package downloads (e.g., `http://192.168.88.14:3142`)
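A value for `FLASK_SECRET` can be generated with Python's standard-library `secrets` module, for example:

```python
# Print a random hex string suitable for use as FLASK_SECRET
import secrets

print(secrets.token_hex(32))
```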
### Database Configuration

The database configuration is in `config/settings.json`:

```json
{
  "database": {
    "mysql": {
      "host": "mysql",
      "user": "jobs",
      "password": "jobdb",
      "database": "jobs",
      "port": 3306
    }
  }
}
```

## Useful Commands

```bash
# View logs
docker-compose logs -f

# Stop services
docker-compose down

# Restart services
docker-compose restart

# Rebuild and restart
docker-compose up --build

# View running containers
docker-compose ps

# Execute commands in the app container
docker-compose exec jobs-app bash

# Check database
docker-compose exec mysql mysql -u jobs -p jobs
```

## Production Considerations

1. **Security**:

   - Change default passwords in `docker-compose.yml`
   - Use environment variables for secrets
   - Configure proper firewall rules

2. **Scaling**:

   - Adjust Gunicorn workers in `gunicorn.conf.py`
   - Consider using a reverse proxy (nginx)
   - Implement proper logging and monitoring

3. **Database**:

   - Use external MySQL for production
   - Configure backups
   - Set up connection pooling

4. **Networking**:
   - Use proper domain names
   - Configure SSL/TLS
   - Set up load balancing if needed

## Troubleshooting

### Common Issues

1. **Port conflicts**: Change ports in `docker-compose.yml`
2. **Database connection**: Ensure MySQL container is healthy
3. **Memory issues**: Increase Docker memory limits
4. **Permission issues**: Check file permissions in mounted volumes

### Logs

```bash
# Application logs
docker-compose logs jobs-app

# Database logs
docker-compose logs mysql

# All logs
docker-compose logs
```

## Development

For development with hot reload:

```bash
# Run in development mode
docker-compose -f docker-compose.dev.yml up --build
```

Create `docker-compose.dev.yml`:

```yaml
version: "3.8"
services:
  jobs-app:
    build: .
    ports:
      - "8000:8000"
    environment:
      - FLASK_ENV=development
    volumes:
      - .:/app
    command: ["flask", "run", "--host", "0.0.0.0", "--port", "8000"]
```
## Coolify Deployment

This application can be deployed on [Coolify](https://coolify.io) using Docker Compose. Coolify provides additional features and management capabilities for containerized applications.

### Coolify Prerequisites

- Coolify instance (self-hosted or cloud)
- Git repository accessible to Coolify

### Coolify-Specific Configuration

#### 1. Environment Variables

Coolify automatically detects environment variables in your `docker-compose.yml` and provides a UI to manage them. Use the following syntax for better integration:

```yaml
services:
  jobs-app:
    environment:
      # Required variables (will show red border if empty)
      - FLASK_SECRET=${FLASK_SECRET:?}

      # Required with default (prefilled but editable)
      - FLASK_ENV=${FLASK_ENV:?production}

      # Optional with default
      - GUNICORN_WORKERS=${GUNICORN_WORKERS:-4}
```

#### 2. Coolify Magic Environment Variables

Leverage Coolify's dynamic environment variables:

```yaml
services:
  jobs-app:
    environment:
      # Generate FQDN for the application
      - SERVICE_FQDN_JOBS_APP

      # Generate secure password for admin user
      - ADMIN_PASSWORD=${SERVICE_PASSWORD_ADMIN:?M11ffpgm.}

      # Generate database credentials
      - DB_USER=${SERVICE_USER_DB:?jobs}
      - DB_PASSWORD=${SERVICE_PASSWORD_DB:?jobdb}
```

#### 3. Storage Configuration

Coolify supports advanced storage options:

```yaml
services:
  jobs-app:
    volumes:
      # Create empty directories
      - type: bind
        source: ./cache
        target: /app/cache
        is_directory: true

      - type: bind
        source: ./logs
        target: /app/logs
        is_directory: true

  mysql:
    volumes:
      # Persistent database storage
      - mysql_data:/var/lib/mysql
```

#### 4. Health Checks and Service Management

```yaml
services:
  jobs-app:
    # Exclude from health checks if needed
    exclude_from_hc: false

    # Labels for Coolify management and Traefik routing
    labels:
      - coolify.managed=true
      - traefik.enable=true
      - "traefik.http.routers.jobs-app.rule=Host(`${SERVICE_FQDN_JOBS_APP:-localhost}`)"
      - traefik.http.routers.jobs-app.entryPoints=https
      - "traefik.http.routers.jobs-app.middlewares=https-redirect"
      - "traefik.http.middlewares.https-redirect.redirectscheme.scheme=https"
```

#### 5. Database Configuration for Coolify

Update your `config/settings.json` to use Coolify environment variables:

```json
{
  "database": {
    "mysql": {
      "host": "mysql",
      "user": "${DB_USER:-jobs}",
      "password": "${DB_PASSWORD:-jobdb}",
      "database": "jobs",
      "port": 3306
    }
  }
}
```
### Complete Coolify docker-compose.yml

Here's a complete `docker-compose.yml` optimized for Coolify:

```yaml
version: "3.8"

services:
  jobs-app:
    build: .
    ports:
      - "8000:8000"
    environment:
      # Required environment variables
      - FLASK_SECRET=${FLASK_SECRET:?}
      - FLASK_ENV=${FLASK_ENV:?production}

      # Coolify magic variables
      - SERVICE_FQDN_JOBS_APP
      - ADMIN_PASSWORD=${SERVICE_PASSWORD_ADMIN:?M11ffpgm.}
      - DB_USER=${SERVICE_USER_DB:?jobs}
      - DB_PASSWORD=${SERVICE_PASSWORD_DB:?jobdb}

      # Optional configuration
      - GUNICORN_WORKERS=${GUNICORN_WORKERS:-4}
      - APT_CACHER_NG=${APT_CACHER_NG}
    volumes:
      - type: bind
        source: ./cache
        target: /app/cache
        is_directory: true
      - type: bind
        source: ./logs
        target: /app/logs
        is_directory: true
    depends_on:
      - mysql
    labels:
      - coolify.managed=true
      - traefik.enable=true
      - "traefik.http.routers.jobs-app.rule=Host(`${SERVICE_FQDN_JOBS_APP:-localhost}`)"
      - traefik.http.routers.jobs-app.entryPoints=https
      - "traefik.http.routers.jobs-app.middlewares=https-redirect"
      - "traefik.http.middlewares.https-redirect.redirectscheme.scheme=https"
    networks:
      - jobs-network
    restart: unless-stopped

  mysql:
    image: mysql:8.0
    environment:
      - MYSQL_ROOT_PASSWORD=${MYSQL_ROOT_PASSWORD:?rootpassword}
      - MYSQL_DATABASE=jobs
      - MYSQL_USER=${DB_USER:-jobs}
      - MYSQL_PASSWORD=${DB_PASSWORD:-jobdb}
    ports:
      - "3306:3306"
    volumes:
      - mysql_data:/var/lib/mysql
      - ./mysql-init:/docker-entrypoint-initdb.d
    networks:
      - jobs-network

volumes:
  mysql_data:

networks:
  jobs-network:
    driver: bridge
```

### Coolify Deployment Steps

1. **Connect Repository**: Link your Git repository to Coolify
2. **Create Service Stack**: Choose "Docker Compose" as the build pack
3. **Configure Environment Variables**: Set required variables in Coolify's UI:
   - `FLASK_SECRET`: Generate a secure random string
   - `FLASK_ENV`: Set to "production"
   - `MYSQL_ROOT_PASSWORD`: Set a secure password
4. **Deploy**: Coolify will automatically build and deploy your application
5. **Access**: Use the generated FQDN to access your application

### Coolify Benefits

- **Automatic SSL**: HTTPS certificates are automatically managed
- **Environment Management**: Easy variable management through UI
- **Monitoring**: Built-in logging and health monitoring
- **Scaling**: Easy horizontal scaling
- **Backups**: Automated backup capabilities
- **Security**: Isolated networks and secure defaults

### Troubleshooting Coolify Deployments

1. **Environment Variables**: Check that all required variables are set in Coolify's UI
2. **Build Logs**: Review build logs for any compilation errors
3. **Network Issues**: Ensure services can communicate within the stack
4. **Storage Permissions**: Verify volume permissions are correct
5. **FQDN Configuration**: Check that the generated FQDN is accessible

#### Common Coolify Errors

**Error: "The scheme `domain.sslip.io` isn't valid. It should be either `http`, `https`"**

- **Cause**: Incorrect Traefik router configuration using full URLs instead of hostnames
- **Solution**: Use `SERVICE_FQDN_*` variables (which contain just the domain) in `Host()` rules, not `SERVICE_URL_*` variables

**Example**:

```yaml
# Correct
- "traefik.http.routers.app.rule=Host(`${SERVICE_FQDN_APP:-localhost}`)"

# Incorrect
- "traefik.http.routers.app.rule=Host(`${SERVICE_URL_APP:-localhost}`)"
```

##### Container fails to start with permission denied

- **Cause**: Volume mount permissions in Coolify environment
- **Solution**: Ensure `is_directory: true` is set for bind mounts that need directory creation

For more information, visit the [Coolify Docker Compose documentation](https://coolify.io/docs/knowledge-base/docker/compose).
README-Traefik.md (new file, 87 lines)
@@ -0,0 +1,87 @@

# Setting Up Coolify Deployment Behind an Existing Traefik Proxy

This guide explains how to configure your existing Traefik instance (running at `192.168.88.10`) to proxy traffic to a Coolify-deployed jobs-app service running on `192.168.88.13:8001`.

## Prerequisites

- Traefik is running and accessible at `192.168.88.10`
- Your external IP is configured to point to Traefik for domain resolution
- The jobs-app is deployed via Coolify and running on `192.168.88.13:8001`
- You have access to Traefik's configuration files (assuming file-based provider)

## Step 1: Verify Jobs-App Accessibility

Ensure the jobs-app is running and accessible:

```bash
curl http://192.168.88.13:8001
```

You should receive a response from the Flask application.

## Step 2: Configure Traefik

Since Traefik is on a separate machine (`192.168.88.10`) and cannot directly watch the Docker containers on `192.168.88.13`, you'll need to manually configure the routing in Traefik's configuration.

### Option 1: Using Traefik's File Provider

Add the following configuration to your Traefik dynamic configuration file (e.g., `dynamic.yml`):

```yaml
http:
  routers:
    jobs-app:
      rule: "Host(`your-domain.com`)" # Replace with your actual domain
      service: jobs-app
      entryPoints:
        - https # Assuming Traefik handles SSL termination
      middlewares:
        - https-redirect # Optional: redirect HTTP to HTTPS

  services:
    jobs-app:
      loadBalancer:
        servers:
          - url: "http://192.168.88.13:8001"

  middlewares:
    https-redirect:
      redirectScheme:
        scheme: https
        permanent: true
```

### Option 2: Using Docker Labels (if Traefik can access the Docker socket)

If Traefik has access to the Docker socket on `192.168.88.13` (e.g., via network mount or API), the Docker labels in `docker-compose.yml` will automatically configure the routing. No additional configuration is needed.

## Step 3: Reload Traefik Configuration

After updating the configuration, reload Traefik:

```bash
# If using Docker
docker-compose restart traefik

# Or if running directly
systemctl reload traefik
```

## Step 4: Test the Setup

1. Ensure your DNS points `your-domain.com` to your external IP, which routes to Traefik.
2. Visit `https://your-domain.com` in your browser.
3. Traefik should proxy the request to `http://192.168.88.13:8001` and serve the jobs-app.

## Troubleshooting

- **Port not accessible**: Ensure firewall rules allow traffic from `192.168.88.10` to `192.168.88.13:8001`.
- **SSL issues**: If Traefik is not terminating SSL, adjust the `entryPoints` and remove HTTPS redirects.
- **Routing not working**: Check Traefik logs for errors in router/service configuration.
- **Domain mismatch**: Verify the `Host` rule matches your actual domain.

## Notes

- The jobs-app runs on port 8000 internally in the container, exposed on host port 8001.
- If you need to change the external port, update the `ports` mapping in `docker-compose.yml` and the Traefik service URL accordingly.
- For production, consider adding authentication, rate limiting, or other middlewares in Traefik.
README.md (231 changed lines)
@@ -9,11 +9,32 @@ job scraper

- Users can search for job listings by keywords and region
- Selection of job listings based on user preferences

-## Requirements
-- Database (MySQL/MariaDB)
-- Python 3.x
-- Required Python packages (see requirements.txt)

## Architecture Overview

The application is built as a modular Flask-based service with clear separation of concerns:

| Layer | Module | Responsibility |
|---|---|---|
| **Web UI** | `web/app.py` | Flask application that serves HTML pages, REST endpoints, and admin interfaces (users, taxonomy, health, email management). |
| **Orchestrator** | `web/craigslist.py` | Coordinates the scraping workflow: schedules runs, fetches listings, updates the DB, and triggers email alerts. |
| **Scraper** | `web/scraper.py` | Contains the low-level HTML parsing logic (`scrape_job_data`, `scrape_job_page`, `extract_contact_info`). |
| **Persistence** | `web/db.py` | SQLAlchemy ORM models (`User`, `JobListing`, `JobDescription`, `UserInteraction`, `Region`, `Keyword`, `EmailSubscription`, **`EmailTemplate`**) and helper functions for upserts, queries, and subscription management. |
| **Email Rendering** | `web/email_templates.py` | Renders job-alert emails using a pluggable template system. Supports default placeholders (`{count_label}`, `{scope}`, `{timestamp}`, `{jobs_section}`, `{jobs_message}`) and custom admin-defined templates. |
| **Email Delivery** | `web/email_service.py` | Sends rendered messages via SMTP, handling TLS/SSL, authentication, and graceful disabling. |
| **Configuration** | `config/settings.json` | Centralised JSON config for database, HTTP, scraper options, negative keywords, and email settings. |
| **Static Assets & Templates** | `web/static/`, `web/templates/` | Front-end resources (JS, CSS) and Jinja2 templates for the public UI and admin pages (including the new **Email Templates** management UI). |
| **Scheduler** | `schedule` (used in `web/craigslist.py`) | Runs the scraper automatically at configurable intervals (default hourly). |
| **Testing** | `tests/` | Pytest suite covering scheduler, scraper, DB helpers, email service, and the new admin UI for email subscriptions and templates. |

**Key architectural notes**

- **Email Subscriptions** are stored in the `email_subscriptions` table and managed via `/admin/emails`.
- **Email Templates** are persisted in the new `email_templates` table, editable through `/admin/email-templates`, and used by the alert system.
- The orchestrator (`fetch_listings`) returns a detailed result dict (`discovered`, `new`, `by_search`) that drives UI metrics and health checks.
- Contact information (`reply_url`, `contact_email`, `contact_phone`, `contact_name`) extracted by the scraper is saved in `job_descriptions`.
- Negative keyword filtering is applied early in the pipeline to prevent unwanted listings from reaching the DB or email alerts.

This layered design makes it straightforward to extend the scraper to new sources, swap out the email backend, or add additional admin features without impacting other components.

## Installation

@@ -22,3 +43,205 @@ job scraper

3. Install dependencies
4. Set up environment variables
5. Run the application

## Scheduler Configuration

The application includes an automated scheduler that runs the job scraping process every hour. The scheduler is implemented in `web/craigslist.py` and includes:

- **Automatic Scheduling**: Scraping runs every hour automatically
- **Failure Handling**: Retry logic with exponential backoff (up to 3 attempts)
- **Background Operation**: Runs in a separate daemon thread
- **Graceful Error Recovery**: Continues running even if individual scraping attempts fail

### Scheduler Features

- **Retry Mechanism**: Automatically retries failed scraping attempts
- **Logging**: Comprehensive logging of scheduler operations and failures
- **Testing**: Comprehensive test suite in `tests/test_scheduler.py`

To modify the scheduling interval, edit the `start_scheduler()` function in `web/craigslist.py`.
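A minimal sketch of how the hourly schedule and retry loop can be structured with the `schedule` library; the exact logic in `web/craigslist.py` may differ, and `run_scrape_with_retries` and its parameters are illustrative names:

```python
# Illustrative sketch only; the real scheduler lives in web/craigslist.py,
# where fetch_listings() is defined.
import logging
import time

import schedule  # the "schedule" package added to requirements.txt

logger = logging.getLogger(__name__)


def run_scrape_with_retries(max_attempts=3, base_delay=60):
    """Run one scrape, retrying with exponential backoff on failure."""
    for attempt in range(1, max_attempts + 1):
        try:
            fetch_listings()  # orchestrator entry point in web/craigslist.py
            return
        except Exception as exc:
            logger.warning("Scrape attempt %d failed: %s", attempt, exc)
            time.sleep(base_delay * 2 ** (attempt - 1))
    logger.error("All %d scrape attempts failed", max_attempts)


def start_scheduler():
    """Run the scrape every hour; intended to be called from a daemon thread."""
    schedule.every(1).hours.do(run_scrape_with_retries)
    while True:
        schedule.run_pending()
        time.sleep(60)
```

Changing the argument to `schedule.every(...)` is the kind of edit the paragraph above refers to.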
## Job Scraping Output

The `fetch_listings()` function in `web/craigslist.py` extends its output to provide detailed metrics about each scraping operation. It returns a dictionary containing:

- **discovered**: Total number of unique job URLs discovered across all region/keyword combinations
- **new**: Total number of newly added jobs (jobs not previously in the database)
- **by_search**: List of dictionaries, each containing:
  - **region**: The region name for this search
  - **keyword**: The keyword used for this search
  - **count**: Number of jobs fetched for this specific region/keyword combination

### Example Output

```python
{
    "discovered": 150,
    "new": 42,
    "by_search": [
        {"region": "sfbay", "keyword": "python", "count": 25},
        {"region": "sfbay", "keyword": "java", "count": 18},
        {"region": "losangeles", "keyword": "python", "count": 45},
        {"region": "losangeles", "keyword": "java", "count": 62}
    ]
}
```

This per-search breakdown allows for better monitoring and debugging of the scraping process, enabling identification of searches that may be failing or returning fewer results than expected.

## Contact Information Extraction

The scraper now automatically extracts contact information from job listing pages:

### Extracted Fields

When scraping individual job listings, the following contact information is extracted and stored:

- **contact_email**: Email address extracted from reply button or contact form links
- **contact_phone**: Phone number extracted from tel links or contact parameters
- **contact_name**: Contact person or department name if available
- **reply_url**: The full reply/contact URL from the job listing

### How Contact Information is Extracted

The `extract_contact_info()` function intelligently parses various types of reply URLs:

1. **Mailto Links**: `mailto:jobs@company.com?subject=...`

   - Extracts the email address directly

2. **Phone Links**: `tel:+1234567890`

   - Extracts the phone number

3. **URL Parameters**: `https://apply.company.com?email=hr@company.com&phone=555-1234&name=HR%20Team`

   - Searches for common parameter names: `email`, `phone`, `contact_name`, etc.

4. **Graceful Fallback**: If contact information cannot be extracted, the fields are set to `"N/A"`
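A simplified sketch of this parsing strategy using the standard-library `urllib.parse` helpers; the real `extract_contact_info()` in `web/scraper.py` may recognise more cases and parameter names:

```python
# Simplified sketch; the real implementation lives in web/scraper.py.
from urllib.parse import parse_qs, urlparse


def extract_contact_info(reply_url):
    """Best-effort extraction of contact details from a reply/contact URL."""
    info = {
        "reply_url": reply_url or "N/A",
        "contact_email": "N/A",
        "contact_phone": "N/A",
        "contact_name": "N/A",
    }
    if not reply_url:
        return info

    parsed = urlparse(reply_url)
    if parsed.scheme == "mailto":
        info["contact_email"] = parsed.path  # address before any ?subject=
    elif parsed.scheme == "tel":
        info["contact_phone"] = parsed.path
    else:
        # Look for common query parameters on contact/apply URLs
        params = parse_qs(parsed.query)
        for param, field in (("email", "contact_email"),
                             ("phone", "contact_phone"),
                             ("contact_name", "contact_name"),
                             ("name", "contact_name")):
            if params.get(param) and info[field] == "N/A":
                info[field] = params[param][0]
    return info
```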
### Database Storage

Contact information is stored in the `job_descriptions` table with the following columns:

- `reply_url` (VARCHAR(512)): The complete reply/contact URL
- `contact_email` (VARCHAR(255)): Extracted email address
- `contact_phone` (VARCHAR(255)): Extracted phone number
- `contact_name` (VARCHAR(255)): Extracted contact person/department name

### Example

For a job listing with reply button `mailto:hiring@acme.com?subject=Job%20Application`:

```python
{
    "reply_url": "mailto:hiring@acme.com?subject=Job%20Application",
    "contact_email": "hiring@acme.com",
    "contact_phone": "N/A",
    "contact_name": "N/A"
}
```

This contact information is automatically extracted during job page scraping and persisted to the database for easy access and filtering.

## Negative Keyword Filtering

The scraper inspects each job's title, company, location, and description for configurable "negative" keywords. When a keyword matches, the scraped result indicates the match so downstream workflows can skip or flag the job.

### Keyword Configuration

Define keywords in `config/settings.json` under `scraper.negative_keywords`. Keywords are matched case-insensitively and should be supplied without surrounding whitespace:

```json
{
  "scraper": {
    "negative_keywords": ["scam", "mlm", "unpaid"]
  }
}
```

### Scrape Output

Each `scrape_job_page` result contains three new fields:

- `is_negative_match`: `True` when any keyword matches
- `negative_keyword_match`: the keyword that triggered the match
- `negative_match_field`: which field (title, company, location, description) contained the keyword
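A sketch of how such a case-insensitive check can report both the keyword and the field that matched; the helper name and exact behaviour in `web/scraper.py` are assumptions:

```python
# Illustrative helper; the real matching logic lives in web/scraper.py.
def find_negative_match(job, negative_keywords):
    """Return (keyword, field) for the first case-insensitive match, else (None, None)."""
    for field in ("title", "company", "location", "description"):
        text = (job.get(field) or "").lower()
        for keyword in negative_keywords:
            if keyword.strip().lower() in text:
                return keyword, field
    return None, None


# Example: a listing titled "Exciting MLM opportunity" with
# negative_keywords ["scam", "mlm", "unpaid"] would yield ("mlm", "title").
```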
### Processing Behavior

- `process_job_url` stops when `is_negative_match` is `True`, yielding a log message and calling `remove_job` so stale results never remain in `job_listings`.
- `upsert_job_details` now returns immediately for negative matches, ensuring `job_descriptions` never stores filtered listings.
- Regression coverage lives in `tests/test_scraper.py::TestScraperPipelineNegativeFiltering` and `tests/test_db_negative_filtering.py::test_upsert_job_details_skips_negative_match`.

Together, these checks mean negative matches are dropped before any persistence and never shown in the UI.

### User-Specific Negative Keywords

In addition to the global negative keywords defined in `settings.json`, users can define their own personal negative keywords via the **Preferences** page (`/settings`).

- **Management**: Users can add new negative keywords and remove existing ones.
- **Filtering**: Jobs matching any of the user's negative keywords are filtered out from the job listings view (`/` and `/jobs`).
- **Validation**: The UI prevents adding duplicate keywords.
- **Storage**: User-specific negative keywords are stored in the database (`negative_keywords` and `user_negative_keywords` tables).

## Email Notifications

Optional job-alert emails are generated whenever the scraper discovers new listings.

### Configuration

Edit `config/settings.json` under the `email` section:

```json
{
  "email": {
    "enabled": true,
    "from_address": "jobs@example.com",
    "recipients": ["alerts@example.com"],
    "smtp": {
      "host": "smtp.example.com",
      "port": 587,
      "username": "smtp-user",
      "password": "secret",
      "use_tls": true,
      "use_ssl": false,
      "timeout": 30
    }
  }
}
```

- Leave `enabled` set to `false` for local development or when credentials are unavailable.
- Provide at least one recipient; otherwise alerts are skipped with a log message.
- Omit real credentials from source control; inject them via environment variables or a secrets manager in production.

### How Alerts Are Sent

- After `fetch_listings()` completes, the scraper gathers new listings and, when configured, renders a plaintext digest via `web.email_templates.render_job_alert_email`.
- Delivery is handled by `web.email_service.send_email`, which supports TLS/SSL SMTP connections and gracefully skips when disabled.
- Success or failure is streamed in the scraper log output (`Job alert email sent.` or the reason for skipping).
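A hedged sketch of that flow; the argument names accepted by `send_email` are assumptions, and in practice recipients come from the config and the subscription table rather than being passed explicitly:

```python
# Sketch of the alert flow; send_email's exact signature may differ.
from web.email_templates import render_job_alert_email
from web.email_service import send_email

new_jobs = [
    {
        "title": "Python Developer",
        "company": "Acme",
        "location": "Remote",
        "url": "https://example.com/job/123",
    }
]

rendered = render_job_alert_email(new_jobs)  # returns a dict with "subject" and "body"
send_email(subject=rendered["subject"], body=rendered["body"])
```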
### Managing Recipients

- Admin users can visit `/admin/emails` to add or deactivate subscription addresses through the web UI.
- Deactivated rows remain in the table so they can be reactivated later; the scraper only mails active recipients.
- The navigation bar exposes an **Email Alerts** link to the management screen after logging in as an admin user.

### Customising Templates

- Use the **Email Templates** admin page (`/admin/email-templates`) to create, edit, preview, or delete alert templates.
- Templates support placeholder tokens such as `{count_label}`, `{scope}`, `{timestamp}`, `{jobs_section}`, and `{jobs_message}`; the UI lists all available tokens.
- Preview renders the selected template with sample data so changes can be reviewed before saving.
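A small sketch of how these placeholder tokens could be filled with Python's `str.format`; the sample values and the use of `str.format` are assumptions, and the actual rendering is done by `web/email_templates.py`:

```python
# Illustrative rendering of placeholder tokens; the sample values are made up.
template_subject = "{count_label} on Craigslist ({scope})"
template_body = "Found {count_label} as of {timestamp}:\n\n{jobs_section}\n\n{jobs_message}"

context = {
    "count_label": "3 new jobs",
    "scope": "all searches",
    "timestamp": "2024-01-01 12:00",
    "jobs_section": "- Python Developer at Acme (Remote)\n  https://example.com/job/123",
    "jobs_message": "3 new jobs were found since the last run.",
}

subject = template_subject.format(**context)
body = template_body.format(**context)
print(subject)
print(body)
```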
### Tests

- `tests/test_email_templates.py` verifies the rendered subject/body for both populated and empty alerts.
- `tests/test_email_service.py` covers SMTP configuration, disabled mode, and login/send flows using fakes.
- `tests/test_admin_email.py` exercises the admin UI for listing, subscribing, and unsubscribing recipients.
- `tests/test_admin_email_templates.py` verifies CRUD operations and previews for template management.
- `tests/test_scraper.py::TestScraperEmailNotifications` ensures the scraping pipeline invokes the alert sender when new jobs are found.

## Docker Deployment

Please see [README-Docker.md](README-Docker.md) for instructions on deploying the application using Docker.
@@ -9,7 +9,7 @@
     }
   },
   "http": {
-    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:141.0) Gecko/20100101 Firefox/141.0",
+    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:145.0) Gecko/20100101 Firefox/145.0",
     "request_timeout": 30,
     "max_retries": 3,
     "backoff_factor": 2,
@@ -22,7 +22,22 @@
   },
   "scraper": {
     "base_url": "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel",
-    "config_dir": "config"
+    "config_dir": "config",
+    "negative_keywords": []
+  },
+  "email": {
+    "enabled": false,
+    "from_address": "jobs@example.com",
+    "recipients": [],
+    "smtp": {
+      "host": "smtp.example.com",
+      "port": 587,
+      "username": "",
+      "password": "",
+      "use_tls": true,
+      "use_ssl": false,
+      "timeout": 30
+    }
   },
   "users": [
     { "username": "anonymous", "is_admin": false, "password": "" },
deploy.sh (new file, 41 lines)
@@ -0,0 +1,41 @@

#!/bin/bash
# Deployment script for Jobs App

set -e

echo "🚀 Starting Jobs App deployment..."

# Check if Docker is installed
if ! command -v docker &> /dev/null; then
    echo "❌ Docker is not installed. Please install Docker first."
    exit 1
fi

# Check if docker-compose is installed
if ! command -v docker-compose &> /dev/null; then
    echo "❌ docker-compose is not installed. Please install docker-compose first."
    exit 1
fi

echo "📦 Building and starting services..."
docker-compose up --build -d

echo "⏳ Waiting for services to be ready..."
sleep 30

echo "🔍 Checking service health..."
if curl -f http://localhost:8000/ &> /dev/null; then
    echo "✅ Jobs App is running successfully!"
    echo "🌐 Access the application at: http://localhost:8000"
    echo "📊 Admin interface: http://localhost:8000/admin/users (login: admin/M11ffpgm.)"
    echo "🔄 Scraper interface: http://localhost:8000/scrape-page"
else
    echo "❌ Jobs App failed to start. Check logs with: docker-compose logs"
    exit 1
fi

echo "📋 Useful commands:"
echo "  View logs: docker-compose logs -f"
echo "  Stop services: docker-compose down"
echo "  Restart: docker-compose restart"
echo "  Rebuild: docker-compose up --build"
docker-compose-test.yml (new file, 40 lines)
@@ -0,0 +1,40 @@

version: "3.8"

services:
  jobs-app:
    build: .
    ports:
      - "8001:8000"
    environment:
      # Required environment variables
      - FLASK_SECRET="localtest8462851903856136136"
      - FLASK_ENV=production

      # Coolify magic variables
      - SERVICE_FQDN_JOBS_APP=https://jobs.allucanget.biz
      - ADMIN_PASSWORD=M11ffpgm.
      - DB_USER=jobs
      - DB_PASSWORD=jobdb

      # Optional configuration
      - GUNICORN_WORKERS=4
    volumes:
      - type: bind
        source: ./cache
        target: /app/cache
      - type: bind
        source: ./logs
        target: /app/logs
    labels:
      - coolify.managed=true
      - traefik.enable=true
      - "traefik.http.routers.jobs-app.rule=Host(`${SERVICE_FQDN_JOBS_APP:-localhost}`)"
      - traefik.http.routers.jobs-app.entryPoints=http
      - "traefik.http.services.jobs-app.loadbalancer.server.port=8000"
    networks:
      - jobs-network
    restart: unless-stopped

networks:
  jobs-network:
    driver: bridge
docker-compose.yml (new file, 61 lines)
@@ -0,0 +1,61 @@

version: "3.8"

services:
  jobs-app:
    build: .
    ports:
      - "8001:8000"
    environment:
      # Required environment variables
      - FLASK_SECRET=${FLASK_SECRET:?}
      - FLASK_ENV=${FLASK_ENV:?production}

      # Coolify magic variables
      - SERVICE_FQDN_JOBS_APP
      - ADMIN_PASSWORD=${SERVICE_PASSWORD_ADMIN:?M11ffpgm.}
      - DB_USER=${SERVICE_USER_DB:?jobs}
      - DB_PASSWORD=${SERVICE_PASSWORD_DB:?jobdb}

      # Optional configuration
      - GUNICORN_WORKERS=${GUNICORN_WORKERS:-4}
      - APT_CACHER_NG=${APT_CACHER_NG}
    volumes:
      - type: bind
        source: ./cache
        target: /app/cache
      - type: bind
        source: ./logs
        target: /app/logs
    depends_on:
      - mysql
    labels:
      - coolify.managed=true
      - traefik.enable=true
      - "traefik.http.routers.jobs-app.rule=Host(`${SERVICE_FQDN_JOBS_APP:-localhost}`)"
      - traefik.http.routers.jobs-app.entryPoints=http
      - "traefik.http.services.jobs-app.loadbalancer.server.port=8000"
    networks:
      - jobs-network
    restart: unless-stopped

  mysql:
    image: mysql:8.0
    environment:
      - MYSQL_ROOT_PASSWORD=${MYSQL_ROOT_PASSWORD:?rootpassword}
      - MYSQL_DATABASE=jobs
      - MYSQL_USER=${DB_USER:-jobs}
      - MYSQL_PASSWORD=${DB_PASSWORD:-jobdb}
    ports:
      - "3306:3306"
    volumes:
      - mysql_data:/var/lib/mysql
      - ./mysql-init:/docker-entrypoint-initdb.d
    networks:
      - jobs-network

volumes:
  mysql_data:

networks:
  jobs-network:
    driver: bridge
docker-entrypoint.sh (new file, 48 lines)
@@ -0,0 +1,48 @@

#!/bin/bash
# Docker entrypoint script for Jobs App

set -e

echo "🚀 Starting Jobs App container..."

# Wait for MySQL to be ready
echo "⏳ Waiting for MySQL to be ready..."
python -c "
import time
import pymysql
while True:
    try:
        conn = pymysql.connect(
            host='192.168.88.37',
            user='jobs',
            password='jobdb',
            database='jobs',
            connect_timeout=5
        )
        conn.close()
        print('✅ MySQL is ready!')
        break
    except pymysql.Error as e:
        print(f'MySQL is not ready: {e}, waiting...')
        time.sleep(2)
"

# Run database setup
echo "🗄️ Setting up database..."
python setup.py mysql-init

# Seed initial data if needed
echo "🌱 Seeding initial data..."
python -c "
from web.db import db_init
from web.utils import initialize_users_from_settings
db_init()
try:
    initialize_users_from_settings()
    print('✅ Users seeded successfully')
except Exception as e:
    print(f'⚠️ User seeding failed: {e}')
"

echo "🎯 Starting Gunicorn server..."
exec "$@"
gunicorn.conf.py (new file, 38 lines)
@@ -0,0 +1,38 @@

# Gunicorn configuration file
import multiprocessing

# Server socket
bind = "0.0.0.0:8000"
backlog = 2048

# Worker processes
workers = multiprocessing.cpu_count() * 2 + 1
worker_class = "sync"
worker_connections = 1000
max_requests = 1000
max_requests_jitter = 50
timeout = 30
keepalive = 2

# Logging
loglevel = "info"
accesslog = "-"
errorlog = "-"

# Process naming
proc_name = "jobs_app"

# Server mechanics
daemon = False
pidfile = "/tmp/gunicorn.pid"
user = None
group = None
tmp_upload_dir = None

# SSL (if needed)
keyfile = None
certfile = None

# Application
wsgi_module = "web.app:app"
callable = "app"
main.py (12 changed lines)
@@ -4,8 +4,20 @@ starts webserver
 """
 
 import web.app as app
+import threading
+from web.craigslist import start_scheduler
+
+
+def start_background_scheduler():
+    """Start the scheduler in a background thread."""
+    scheduler_thread = threading.Thread(target=start_scheduler, daemon=True)
+    scheduler_thread.start()
+    print("Background scheduler started")
 
 
 if __name__ == "__main__":
+    # Start scheduler in background thread
+    start_background_scheduler()
+
     # start web server
     app.main()
@@ -3,6 +3,7 @@ flask
 flask-wtf
 pytest
 requests
+schedule
 sqlalchemy
 pymysql
 gunicorn
setup.py (16 changed lines)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """
-MySQL utilities for Craigslist project.
+MySQL utility script for initializing database and showing row counts.
 
 Usage (PowerShell):
     # Ensure MySQL database and tables
@@ -36,15 +36,15 @@ try:
     engine = create_engine(url, future=True)
     with engine.begin() as conn:
         for table in [
-            "users",
-            "regions",
-            "keywords",
-            "user_regions",
-            "user_keywords",
-            "job_listings",
             "job_descriptions",
-            "cached_pages",
+            "job_listings",
+            "keywords",
+            "logs",
+            "regions",
+            "users",
             "user_interactions",
+            "user_keywords",
+            "user_regions",
         ]:
             try:
                 n = conn.execute(text(f"SELECT COUNT(*) FROM {table}"))
tests/test_admin_email.py (new file, 84 lines)
@@ -0,0 +1,84 @@

import pytest
from sqlalchemy import text

from web.app import app
from web.db import (
    db_init,
    create_or_update_user,
    subscribe_email,
    list_email_subscriptions,
    _ensure_session,
)


@pytest.fixture(scope="function", autouse=True)
def initialize_app():
    app.config.update(TESTING=True, WTF_CSRF_ENABLED=False)
    with app.app_context():
        db_init()
        create_or_update_user("admin", password="secret",
                              is_admin=True, is_active=True)
        # Clear subscriptions before and after each test to avoid leakage
        with _ensure_session() as session:
            session.execute(text("DELETE FROM email_subscriptions"))
            session.commit()
        yield
        with _ensure_session() as session:
            session.execute(text("DELETE FROM email_subscriptions"))
            session.commit()


@pytest.fixture
def client():
    with app.test_client() as test_client:
        with test_client.session_transaction() as sess:
            sess["username"] = "admin"
        yield test_client


@pytest.fixture
def anon_client():
    with app.test_client() as test_client:
        # Ensure no admin session present
        with test_client.session_transaction() as sess:
            sess.pop("username", None)
        yield test_client


def test_admin_emails_requires_admin(anon_client):
    response = anon_client.get("/admin/emails")
    assert response.status_code == 302
    assert "/login" in response.headers.get("Location", "")


def test_admin_emails_lists_subscriptions(client):
    subscribe_email("alice@example.com")
    response = client.get("/admin/emails")
    assert response.status_code == 200
    assert b"alice@example.com" in response.data


def test_admin_emails_can_subscribe(client):
    response = client.post(
        "/admin/emails",
        data={"action": "subscribe", "email": "bob@example.com"},
        follow_redirects=False,
    )
    assert response.status_code == 302
    emails = list_email_subscriptions()
    assert any(sub["email"] == "bob@example.com" and sub["is_active"]
               for sub in emails)


def test_admin_emails_can_unsubscribe(client):
    subscribe_email("carol@example.com")
    response = client.post(
        "/admin/emails",
        data={"action": "unsubscribe", "email": "carol@example.com"},
        follow_redirects=False,
    )
    assert response.status_code == 302
    emails = list_email_subscriptions()
    matching = [sub for sub in emails if sub["email"] == "carol@example.com"]
    assert matching
    assert matching[0]["is_active"] is False
tests/test_admin_email_templates.py (new file, 138 lines)
@@ -0,0 +1,138 @@

import pytest
from sqlalchemy import text

from web.app import app
from web.db import (
    db_init,
    create_or_update_user,
    list_email_templates,
    update_email_template,
    _ensure_session,
    ensure_default_email_template,
)
from web.email_templates import render_job_alert_email


@pytest.fixture(scope="function", autouse=True)
def setup_database():
    app.config.update(TESTING=True, WTF_CSRF_ENABLED=False)
    with app.app_context():
        db_init()
        create_or_update_user("admin", password="secret", is_admin=True, is_active=True)
        with _ensure_session() as session:
            session.execute(text("DELETE FROM email_templates"))
            session.commit()
        ensure_default_email_template()
        yield
        with _ensure_session() as session:
            session.execute(text("DELETE FROM email_templates"))
            session.commit()
        ensure_default_email_template()


@pytest.fixture
def client():
    with app.test_client() as test_client:
        with test_client.session_transaction() as sess:
            sess["username"] = "admin"
        yield test_client


@pytest.fixture
def anon_client():
    with app.test_client() as test_client:
        with test_client.session_transaction() as sess:
            sess.pop("username", None)
        yield test_client


def test_email_templates_requires_admin(anon_client):
    response = anon_client.get("/admin/email-templates")
    assert response.status_code == 302
    assert "/login" in response.headers.get("Location", "")


def test_email_templates_lists_default(client):
    response = client.get("/admin/email-templates")
    assert response.status_code == 200
    assert b"job-alert" in response.data


def test_email_templates_create_update_delete(client):
    # Create
    response = client.post(
        "/admin/email-templates",
        data={
            "action": "create",
            "name": "Daily Summary",
            "slug": "daily-summary",
            "subject": "Summary: {count_label}",
            "body": "Jobs:{jobs_section}",
            "is_active": "on",
        },
        follow_redirects=False,
    )
    assert response.status_code == 302
    templates = list_email_templates()
    assert any(t["slug"] == "daily-summary" for t in templates)

    # Update
    template_row = next(t for t in templates if t["slug"] == "daily-summary")
    response = client.post(
        "/admin/email-templates",
        data={
            "action": "update",
            "template_id": template_row["template_id"],
            "name": "Daily Summary",
            "slug": "daily-summary",
            "subject": "Updated: {count_label}",
            "body": "Updated body {jobs_section}",
        },
        follow_redirects=False,
    )
    assert response.status_code == 302
    updated = list_email_templates()
    updated_row = next(t for t in updated if t["slug"] == "daily-summary")
    assert "Updated:" in updated_row["subject"]

    # Delete
    response = client.post(
        "/admin/email-templates",
        data={
            "action": "delete",
            "template_id": updated_row["template_id"],
        },
        follow_redirects=False,
    )
    assert response.status_code == 302
    slugs = [t["slug"] for t in list_email_templates()]
    assert "daily-summary" not in slugs


def test_email_templates_preview(client):
    templates = list_email_templates()
    job_alert = next(t for t in templates if t["slug"] == "job-alert")
    response = client.get(f"/admin/email-templates?preview_id={job_alert['template_id']}")
    assert response.status_code == 200
    assert b"Preview" in response.data
    assert b"Subject" in response.data


def test_render_job_alert_email_uses_template_override(client):
    templates = list_email_templates()
    job_alert = next(t for t in templates if t["slug"] == "job-alert")
    update_email_template(
        job_alert["template_id"],
        subject="Custom Subject {count}",
        body="Body {jobs_message}",
    )
    rendered = render_job_alert_email([
        {
            "title": "Python Developer",
            "company": "Acme",
            "location": "Remote",
            "url": "https://example.com",
        }
    ])
    assert rendered["subject"].startswith("Custom Subject")
    assert "Python Developer" in rendered["body"]
@@ -1,66 +0,0 @@
import os
import tempfile
import pytest

import web.db as db
from web.utils import get_cache_dir


# Skip unless explicitly enabled (MySQL integration expected)
if not os.getenv("RUN_DB_TESTS"):
    pytest.skip("Set RUN_DB_TESTS=1 to run cache path integration tests",
                allow_module_level=True)


def test_db_sync_inserts_relative_paths(tmp_path, monkeypatch):
    # arrange: create a temporary cache dir and a fake html file
    cache_dir = tmp_path / "cache"
    cache_dir.mkdir()
    f = cache_dir / "example.org_path_to_page_123.html"
    f.write_text("<html>ok</html>")

    # point app at this cache dir
    monkeypatch.setenv("PYTEST_CACHE_DIR", str(cache_dir))
    # monkeypatch get_cache_dir used by db functions
    monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir))

    # ensure DB initialized
    db.db_init()

    # act
    db.db_sync_cached_pages(str(cache_dir))

    # assert: DB contains relative path, not absolute
    rows = db.db_get_all_cached_pages()
    assert any(r['file_path'] == os.path.relpath(
        str(f), start=str(cache_dir)) for r in rows)


def test_normalize_cached_page_paths_converts_absolute(tmp_path, monkeypatch):
    cache_dir = tmp_path / "cache"
    cache_dir.mkdir()
    # create an actual file
    f = cache_dir / "site_example_page_1.html"
    f.write_text("<html>ok</html>")

    monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir))
    db.db_init()

    abs_fp = str(f)
    rel_fp = os.path.relpath(abs_fp, start=str(cache_dir))

    # Insert an absolute path row directly (simulate legacy data)
    with db._ensure_session() as session:
        session.execute(
            db.text("INSERT INTO cached_pages(file_path, url_guess, last_modified, size_bytes, job_id) VALUES(:fp, :ug, :lm, :sz, :jid)"),
            {"fp": abs_fp, "ug": "https://example.org/page1.html",
             "lm": None, "sz": 10, "jid": None}
        )
        session.commit()

    # normalize should convert absolute to relative
    changed = db.normalize_cached_page_paths()
    assert changed >= 1

    rows = db.db_get_all_cached_pages()
    assert any(r['file_path'] == rel_fp for r in rows)
@@ -1,53 +0,0 @@
import os
import tempfile
from web.app import app


def test_cached_route_serves_file(monkeypatch):
    # Create a temporary file in the configured cache dir
    cache_dir = os.path.abspath(os.path.join(
        os.path.dirname(__file__), '..', 'cache'))
    os.makedirs(cache_dir, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(
        prefix='test_cached_', suffix='.html', dir=cache_dir)
    os.close(fd)
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.write('<html><body>cached</body></html>')

    # Fake job record returned by get_job_by_id
    fake_job = {
        'id': 'fake123',
        'job_id': 'fake123',
        'file_path': os.path.relpath(tmp_path, cache_dir),
        'file_path_abs': tmp_path,
    }

    def fake_get_job_by_id(jid):
        if str(jid) in ('fake123',):
            return fake_job
        return {}

    # Patch the symbol imported into web.app
    monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id)

    # Request route
    client = app.test_client()
    res = client.get('/cached/fake123')
    assert res.status_code == 200
    assert b'cached' in res.data

    # Cleanup
    try:
        os.remove(tmp_path)
    except Exception:
        pass


def test_cached_route_missing(monkeypatch):
    def fake_get_job_by_id(jid):
        return {}

    monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id)
    client = app.test_client()
    res = client.get('/cached/nope')
    assert res.status_code == 404
@@ -1,27 +0,0 @@
import os
from web.db import CachedPage
from web.utils import get_cache_dir


def test_cachedpage_abs_path(tmp_path, monkeypatch):
    # Create a fake cache dir and monkeypatch get_cache_dir
    fake_cache = tmp_path / 'cache'
    fake_cache.mkdir()
    monkeypatch.setenv('PYTHONIOENCODING', 'utf-8')

    # Patch the symbol used by CachedPage.abs_path (imported into web.db)
    monkeypatch.setattr('web.db.get_cache_dir', lambda: str(fake_cache))

    # Create a CachedPage instance and set file_path attribute
    cp = CachedPage()
    setattr(cp, 'file_path', 'subdir/test.html')

    # Ensure the computed absolute path joins the fake cache dir
    expected = os.path.join(os.path.abspath(
        str(fake_cache)), 'subdir/test.html')
    assert cp.abs_path == expected

    # When file_path is falsy, abs_path should be None
    cp2 = CachedPage()
    setattr(cp2, 'file_path', None)
    assert cp2.abs_path is None
@@ -95,39 +95,6 @@ def test_upsert_listing_details_and_urls(db_ready):
     pass
 
 
-def test_cached_page_upsert_and_get(db_ready):
-    jid_suffix = unique_suffix()
-    url = f"https://example.org/it/{jid_suffix}.html"
-    # Ensure a listing exists for FK relation if enforced
-    db.upsert_listing(
-        url=url,
-        region="it",
-        keyword="cache",
-        title=f"IT Cache {jid_suffix}",
-        pay="N/A",
-        location="Test City",
-        timestamp=now_iso(),
-    )
-    fp = f"/tmp/integration_{jid_suffix}.html"
-    db.upsert_cached_page(
-        file_path=fp,
-        url_guess=url,
-        last_modified=now_iso(),
-        size_bytes=123,
-        job_id=int(jid_suffix) if jid_suffix.isdigit() else None,
-    )
-    row = db.db_get_cache_url(url)
-    if row is not None:
-        assert row["url_guess"] == url
-    # Cleanup
-    try:
-        db.remove_cached_page(fp)
-        db.db_remove_cached_url(url)
-        db.db_delete_job(jid_suffix)
-    except Exception:
-        pass
-
-
 def test_user_interactions_mark_and_visit(db_ready):
     uname = f"it_user_{unique_suffix()}"
     db.create_or_update_user(uname, is_active=True)
21
tests/test_db_negative_filtering.py
Normal file
21
tests/test_db_negative_filtering.py
Normal file
@@ -0,0 +1,21 @@
import pytest
import web.db as db


def test_upsert_job_details_skips_negative_match(monkeypatch):
    def fail(*args, **kwargs):  # pragma: no cover - guard against unwanted calls
        raise AssertionError("should not reach database layers when negative")

    monkeypatch.setattr(db, "_ensure_session", fail)
    monkeypatch.setattr(db, "insert_log", fail)

    job_data = {
        "url": "https://example.com/job/neg",
        "id": "neg123",
        "is_negative_match": True,
        "negative_keyword_match": "scam",
        "negative_match_field": "title",
    }

    # Should return early without touching the database helpers.
    db.upsert_job_details(job_data)
106
tests/test_email_service.py
Normal file
106
tests/test_email_service.py
Normal file
@@ -0,0 +1,106 @@
import pytest

from web.email_service import (
    EmailConfigurationError,
    send_email,
)


def test_send_email_disabled(monkeypatch):
    called = {}

    def _fake_smtp(*args, **kwargs):  # pragma: no cover - should not be called
        called["used"] = True
        raise AssertionError(
            "SMTP should not be invoked when email is disabled")

    monkeypatch.setattr("web.email_service.smtplib.SMTP", _fake_smtp)
    monkeypatch.setattr("web.email_service.smtplib.SMTP_SSL", _fake_smtp)

    result = send_email(
        subject="Hi",
        body="Test",
        to="user@example.com",
        settings={"enabled": False},
    )
    assert result is False
    assert called == {}


def test_send_email_sends_message(monkeypatch):
    events = {"starttls": False, "login": None, "sent": None}

    class FakeSMTP:
        def __init__(self, *, host, port, timeout):
            self.host = host
            self.port = port
            self.timeout = timeout

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            return False

        def ehlo(self):
            events.setdefault("ehlo", 0)
            events["ehlo"] += 1

        def starttls(self):
            events["starttls"] = True

        def login(self, username, password):
            events["login"] = (username, password)

        def send_message(self, message, *, from_addr, to_addrs):
            events["sent"] = {
                "from": from_addr,
                "to": tuple(to_addrs),
                "subject": message["Subject"],
            }

    monkeypatch.setattr("web.email_service.smtplib.SMTP", FakeSMTP)
    monkeypatch.setattr("web.email_service.smtplib.SMTP_SSL", FakeSMTP)

    settings = {
        "enabled": True,
        "from_address": "jobs@example.com",
        "smtp": {
            "host": "smtp.example.com",
            "port": 2525,
            "timeout": 15,
            "username": "jobs",
            "password": "secret",
            "use_tls": True,
            "use_ssl": False,
        },
    }

    result = send_email(
        subject="New Jobs",
        body="You have new jobs waiting.",
        to=["a@example.com", "b@example.com"],
        cc="c@example.com",
        bcc=["d@example.com"],
        settings=settings,
    )

    assert result is True
    assert events["starttls"] is True
    assert events["login"] == ("jobs", "secret")
    assert events["sent"] == {
        "from": "jobs@example.com",
        "to": ("a@example.com", "b@example.com", "c@example.com", "d@example.com"),
        "subject": "New Jobs",
    }


def test_send_email_requires_host():
    settings = {
        "enabled": True,
        "from_address": "jobs@example.com",
        "smtp": {"host": "", "port": 587},
    }
    with pytest.raises(EmailConfigurationError):
        send_email(subject="Hi", body="Test",
                   to="user@example.com", settings=settings)
40
tests/test_email_templates.py
Normal file
40
tests/test_email_templates.py
Normal file
@@ -0,0 +1,40 @@
from datetime import datetime

from web.email_templates import render_job_alert_email


def test_render_job_alert_email_with_jobs():
    jobs = [
        {
            "title": "Python Developer",
            "company": "Acme",
            "location": "Remote",
            "url": "https://example.com/jobs/1",
        },
        {
            "title": "Data Engineer",
            "company": "Globex",
            "location": "NYC",
            "url": "https://example.com/jobs/2",
        },
    ]
    ts = datetime(2025, 11, 3, 12, 0)
    rendered = render_job_alert_email(
        jobs, region="sfbay", keyword="python", generated_at=ts)

    assert rendered["subject"] == "2 new jobs (region: sfbay, keyword: python)"
    assert "1. Python Developer" in rendered["body"]
    assert "Generated at 2025-11-03 12:00 UTC." in rendered["body"]
    assert rendered["context"]["count"] == 2
    assert rendered["context"]["jobs_section"].startswith(
        "\n1. Python Developer")


def test_render_job_alert_email_empty():
    ts = datetime(2025, 11, 3, 12, 0)
    rendered = render_job_alert_email([], generated_at=ts)

    assert rendered["subject"] == "No new jobs"
    assert "No jobs matched this alert." in rendered["body"]
    assert rendered["body"].count("Generated at") == 1
    assert rendered["context"]["count"] == 0
137
tests/test_scheduler.py
Normal file
137
tests/test_scheduler.py
Normal file
@@ -0,0 +1,137 @@
import pytest
import time
from unittest.mock import patch, MagicMock
from web.craigslist import scrape_jobs_with_retry, run_scheduled_scraping, fetch_listings


class TestScheduler:

    def test_scrape_jobs_with_retry_success(self):
        """Test that scrape_jobs_with_retry succeeds on first attempt."""
        with patch('web.craigslist.scraper') as mock_scrape:
            result = scrape_jobs_with_retry()
            assert result is True
            mock_scrape.assert_called_once()

    def test_scrape_jobs_with_retry_failure(self):
        """Test that scrape_jobs_with_retry handles failures properly."""
        with patch('web.craigslist.scraper', side_effect=Exception("Test error")) as mock_scrape:
            result = scrape_jobs_with_retry(max_retries=2)
            assert result is False
            assert mock_scrape.call_count == 2

    def test_run_scheduled_scraping(self):
        """Test the scheduled scraping wrapper function."""
        with patch('web.craigslist.scrape_jobs_with_retry') as mock_retry:
            mock_retry.return_value = True
            run_scheduled_scraping()
            mock_retry.assert_called_once()

    def test_scheduler_import(self):
        """Test that scheduler functions can be imported."""
        from web.craigslist import start_scheduler
        assert callable(start_scheduler)

    @patch('web.craigslist.schedule')
    def test_scheduler_setup(self, mock_schedule):
        """Test that scheduler setup works correctly."""
        # This is a basic test to ensure the scheduler can be set up
        from web.craigslist import schedule
        assert schedule is not None

    @patch('web.craigslist.db_get_all_job_urls')
    @patch('web.craigslist.seed_regions_keywords_from_listings')
    @patch('web.craigslist.get_all_regions')
    @patch('web.craigslist.get_all_keywords')
    @patch('web.craigslist.get_last_fetch_time')
    @patch('web.craigslist.process_region_keyword')
    @patch('web.craigslist.upsert_listing')
    @patch('web.craigslist.insert_log')
    def test_fetch_listings_return_structure(self, mock_log, mock_upsert, mock_process, mock_last_fetch,
                                             mock_keywords, mock_regions, mock_seed, mock_db_urls):
        """Test that fetch_listings returns the correct structure with per-search counts."""
        # Setup mocks
        mock_db_urls.return_value = []
        mock_regions.return_value = [{"name": "sfbay"}]
        mock_keywords.return_value = [{"name": "python"}]
        mock_last_fetch.return_value = None  # Never fetched before
        mock_process.return_value = [
            ("2025-11-03T10:00:00Z", "sfbay", "python", "Python Dev",
             "$100k", "San Francisco", "http://example.com/1"),
            ("2025-11-03T10:00:00Z", "sfbay", "python", "Python Dev",
             "$100k", "San Francisco", "http://example.com/2"),
        ]

        # Collect messages and get return value from generator
        gen = fetch_listings()
        messages = []
        result = None
        try:
            while True:
                messages.append(next(gen))
        except StopIteration as e:
            result = e.value

        # Verify return structure
        assert result is not None
        assert "discovered" in result
        assert "new" in result
        assert "by_search" in result
        assert isinstance(result.get("by_search"), list)
        assert result.get("discovered") == 2
        assert result.get("new") == 2

    @patch('web.craigslist.db_get_all_job_urls')
    @patch('web.craigslist.seed_regions_keywords_from_listings')
    @patch('web.craigslist.get_all_regions')
    @patch('web.craigslist.get_all_keywords')
    @patch('web.craigslist.get_last_fetch_time')
    @patch('web.craigslist.process_region_keyword')
    @patch('web.craigslist.upsert_listing')
    @patch('web.craigslist.insert_log')
    def test_fetch_listings_per_search_count(self, mock_log, mock_upsert, mock_process, mock_last_fetch,
                                             mock_keywords, mock_regions, mock_seed, mock_db_urls):
        """Test that fetch_listings correctly counts jobs per search."""
        # Setup mocks
        mock_db_urls.return_value = []
        mock_regions.return_value = [{"name": "sfbay"}, {"name": "losangeles"}]
        mock_keywords.return_value = [{"name": "python"}, {"name": "java"}]
        mock_last_fetch.return_value = None  # Never fetched before

        # Mock process_region_keyword to return different counts for each search
        def mock_process_impl(region, keyword, discovered_urls):
            # Use unique URLs per search to get the total discovered count
            base_url = f"http://example.com/{region}/{keyword}"
            counts = {
                ("sfbay", "python"): 3,
                ("sfbay", "java"): 2,
                ("losangeles", "python"): 4,
                ("losangeles", "java"): 1,
            }
            count = counts.get((region, keyword), 0)
            return [(f"2025-11-03T10:00:00Z", region, keyword, f"Job {i}", "$100k", region, f"{base_url}/{i}")
                    for i in range(count)]

        mock_process.side_effect = mock_process_impl

        # Collect result from generator
        gen = fetch_listings()
        messages = []
        result = None
        try:
            while True:
                messages.append(next(gen))
        except StopIteration as e:
            result = e.value

        # Verify per-search counts
        assert result is not None
        by_search = result.get("by_search", [])
        assert len(by_search) == 4

        search_data = {(r.get("region"), r.get("keyword")): r.get("count") for r in by_search}
        assert search_data.get(("sfbay", "python")) == 3
        assert search_data.get(("sfbay", "java")) == 2
        assert search_data.get(("losangeles", "python")) == 4
        assert search_data.get(("losangeles", "java")) == 1
        assert result.get("discovered") == 10  # Total unique jobs
384
tests/test_scraper.py
Normal file
384
tests/test_scraper.py
Normal file
@@ -0,0 +1,384 @@
import pytest
from web.scraper import scrape_job_page, extract_contact_info
from web.craigslist import process_job_url, scraper


def _make_negative_job(url: str) -> dict:
    return {
        "url": url,
        "title": "SCAM role",
        "company": "Test Co",
        "location": "Remote",
        "description": "This is a scam offer",
        "id": "job123",
        "posted_time": "",
        "reply_url": "N/A",
        "contact_email": "N/A",
        "contact_phone": "N/A",
        "contact_name": "N/A",
        "is_negative_match": True,
        "negative_keyword_match": "scam",
        "negative_match_field": "title",
    }


class TestExtractContactInfo:
    """Test suite for contact information extraction."""

    def test_extract_email_from_mailto_link(self):
        """Test extraction of email from mailto link."""
        reply_url = "mailto:contact@example.com?subject=Job%20Inquiry"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["email"] == "contact@example.com"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_extract_phone_from_tel_link(self):
        """Test extraction of phone from tel link."""
        reply_url = "tel:+1234567890"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "+1234567890"
        assert contact_info["contact_name"] == "N/A"

    def test_extract_email_from_url_parameter(self):
        """Test extraction of email from URL query parameters."""
        reply_url = "https://example.com/contact?email=jobs@company.com&name=John%20Doe"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["email"] == "jobs@company.com"
        assert contact_info["contact_name"] == "John Doe"

    def test_extract_phone_from_url_parameter(self):
        """Test extraction of phone from URL query parameters."""
        reply_url = "https://example.com/apply?phone=555-1234&email=contact@test.com"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["phone"] == "555-1234"
        assert contact_info["email"] == "contact@test.com"

    def test_extract_contact_name_from_url_parameter(self):
        """Test extraction of contact name from URL query parameters."""
        reply_url = "https://example.com/reply?name=Alice%20Smith&contact_name=Bob%20Jones"
        contact_info = extract_contact_info(reply_url)

        # Should prefer contact_name over name
        assert contact_info["contact_name"] == "Bob Jones"

    def test_extract_all_fields_from_url(self):
        """Test extraction of all fields from URL parameters."""
        reply_url = "https://example.com/contact?email=hr@company.com&phone=555-9876&contact_name=Jane%20Doe"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["email"] == "hr@company.com"
        assert contact_info["phone"] == "555-9876"
        assert contact_info["contact_name"] == "Jane Doe"

    def test_handle_empty_reply_url(self):
        """Test handling of empty reply URL."""
        contact_info = extract_contact_info("")

        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_handle_na_reply_url(self):
        """Test handling of N/A reply URL."""
        contact_info = extract_contact_info("N/A")

        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_handle_none_reply_url(self):
        """Test handling of None reply URL."""
        contact_info = extract_contact_info(None)

        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_handle_invalid_url(self):
        """Test handling of invalid URL (graceful fallback)."""
        reply_url = "not a valid url at all"
        contact_info = extract_contact_info(reply_url)

        # Should return all N/A values without crashing
        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_multiple_parameter_variations(self):
        """Test that function finds email despite multiple parameter name variations."""
        reply_url = "https://example.com/reply?from_email=sender@test.com&other=value"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["email"] == "sender@test.com"

    def test_telephone_parameter_name(self):
        """Test extraction using 'telephone' parameter name."""
        reply_url = "https://example.com/contact?telephone=555-0000"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["phone"] == "555-0000"


class TestScrapeJobPageContactInfo:
    """Test suite for scrape_job_page contact information extraction."""

    def test_scrape_job_page_includes_contact_fields(self):
        """Test that scrape_job_page includes contact information in return dict."""
        html_content = """
        <html>
        <h1 class="postingtitle">Software Engineer</h1>
        <h2 class="company-name">Tech Company</h2>
        <button class="reply-button" data-href="mailto:jobs@techco.com"></button>
        <div id="map" data-latitude="37.7749" data-longitude="-122.4194" data-accuracy="rooftop"></div>
        <section id="postingbody">
        <p>This is a test job description</p>
        </section>
        <div class="postinginfos">
        <p class="postinginfo">posting id: 12345abc</p>
        <time class="date timeago" datetime="2025-11-03T10:00:00"></time>
        </div>
        </html>
        """

        job_data = scrape_job_page(html_content, "https://example.com/job/123")

        # Verify all expected keys are present
        assert "contact_email" in job_data
        assert "contact_phone" in job_data
        assert "contact_name" in job_data
        assert "reply_url" in job_data

    def test_scrape_job_page_extracts_mailto_contact(self):
        """Test that scrape_job_page correctly extracts email from mailto link."""
        html_content = """
        <html>
        <h1 class="postingtitle">Job Title</h1>
        <h2 class="company-name">Company</h2>
        <button class="reply-button" data-href="mailto:hiring@company.com?subject=Application"></button>
        <div id="map"></div>
        <section id="postingbody"><p>Job desc</p></section>
        <div class="postinginfos">
        <p class="postinginfo">id: xyz</p>
        </div>
        </html>
        """

        job_data = scrape_job_page(html_content, "https://example.com/job/456")

        assert job_data["contact_email"] == "hiring@company.com"
        assert job_data["reply_url"] == "mailto:hiring@company.com?subject=Application"

    def test_scrape_job_page_no_reply_button(self):
        """Test scrape_job_page when no reply button is present."""
        html_content = """
        <html>
        <h1 class="postingtitle">Job Title</h1>
        <h2 class="company-name">Company</h2>
        <div id="map"></div>
        <section id="postingbody"><p>Job desc</p></section>
        <div class="postinginfos">
        <p class="postinginfo">id: xyz</p>
        </div>
        </html>
        """

        job_data = scrape_job_page(html_content, "https://example.com/job/789")

        # Should have N/A for all contact fields
        assert job_data["reply_url"] == "N/A"
        assert job_data["contact_email"] == "N/A"
        assert job_data["contact_phone"] == "N/A"
        assert job_data["contact_name"] == "N/A"

    def test_scrape_job_page_with_url_based_reply(self):
        """Test scrape_job_page with URL-based reply link containing contact info."""
        html_content = """
        <html>
        <h1 class="postingtitle">Manager Position</h1>
        <h2 class="company-name">BigCorp</h2>
        <button class="reply-button" data-href="https://apply.bigcorp.com?email=hr@bigcorp.com&name=HR%20Team"></button>
        <div id="map"></div>
        <section id="postingbody"><p>Apply now</p></section>
        <div class="postinginfos">
        <p class="postinginfo">id: manager123</p>
        </div>
        </html>
        """

        job_data = scrape_job_page(html_content, "https://example.com/job/999")

        assert job_data["contact_email"] == "hr@bigcorp.com"
        assert job_data["contact_name"] == "HR Team"

    def test_scrape_job_page_negative_keyword_match(self, monkeypatch):
        """Test that negative keyword detection flags matching jobs."""

        monkeypatch.setattr(
            "web.scraper.get_negative_keywords", lambda: ["scam"])

        html_content = """
        <html>
        <h1 class="postingtitle">Great Opportunity</h1>
        <h2 class="company-name">SCAM Corp</h2>
        <section id="postingbody"><p>This is a scam offer</p></section>
        </html>
        """

        job_data = scrape_job_page(
            html_content, "https://example.com/job/negative")

        assert job_data["is_negative_match"] is True
        assert job_data["negative_keyword_match"] == "scam"
        assert job_data["negative_match_field"] in {
            "title", "company", "description"}

    def test_scrape_job_page_no_negative_match(self, monkeypatch):
        """Test that jobs without matching keywords are not flagged."""

        monkeypatch.setattr(
            "web.scraper.get_negative_keywords", lambda: ["scam"])

        html_content = """
        <html>
        <h1 class="postingtitle">Legit Opportunity</h1>
        <h2 class="company-name">Honest Corp</h2>
        <section id="postingbody"><p>We pay well and on time.</p></section>
        </html>
        """

        job_data = scrape_job_page(
            html_content, "https://example.com/job/positive")

        assert job_data["is_negative_match"] is False
        assert job_data["negative_keyword_match"] is None
        assert job_data["negative_match_field"] is None


class TestProcessJobUrlNegativeFiltering:
    def test_process_job_url_skips_negative_match(self, monkeypatch):
        job_url = "https://example.com/job/negative"
        remove_calls = []
        upsert_calls = []

        monkeypatch.setattr(
            "web.craigslist.get_last_fetch_time", lambda url: None)
        monkeypatch.setattr(
            "web.craigslist.insert_log",
            lambda *args, **kwargs: None,
        )
        monkeypatch.setattr(
            "web.craigslist.make_request_with_retry",
            lambda url, attempts: "<html />",
        )
        monkeypatch.setattr(
            "web.craigslist.scrape_job_page",
            lambda content, url: _make_negative_job(url),
        )

        def fake_upsert(job_data, region="", keyword=""):
            upsert_calls.append(job_data)

        def fake_remove(url):
            remove_calls.append(url)

        monkeypatch.setattr("web.craigslist.upsert_job_details", fake_upsert)
        monkeypatch.setattr("web.craigslist.remove_job", fake_remove)

        messages = list(process_job_url(job_url, region="test", keyword="kw"))

        assert any("Skipping job" in message for message in messages)
        assert remove_calls == [job_url]
        assert upsert_calls == []


class TestScraperPipelineNegativeFiltering:
    def test_scraper_skips_negative_jobs(self, monkeypatch):
        job_url = "https://example.com/job/negative"
        remove_calls = []
        upsert_calls = []

        monkeypatch.setattr("web.craigslist.db_init", lambda: None)

        def fake_fetch_listings():
            yield "Fake listing fetch\n"
            return {"discovered": 0, "new": 0, "by_search": [], "new_jobs": []}

        monkeypatch.setattr("web.craigslist.fetch_listings",
                            fake_fetch_listings)
        monkeypatch.setattr(
            "web.craigslist.db_get_all_job_urls",
            lambda: [{"url": job_url, "region": "reg", "keyword": "kw"}],
        )
        monkeypatch.setattr(
            "web.craigslist.get_last_fetch_time", lambda url: None)
        monkeypatch.setattr("web.craigslist.insert_log",
                            lambda *args, **kwargs: None)
        monkeypatch.setattr(
            "web.craigslist.make_request_with_retry", lambda url, attempts: "<html />"
        )
        monkeypatch.setattr("web.craigslist.url_to_job_id",
                            lambda url: "job123")
        monkeypatch.setattr(
            "web.craigslist.scrape_job_page",
            lambda content, url: _make_negative_job(url),
        )

        def fake_upsert(job_data, region="", keyword=""):
            upsert_calls.append(job_data)

        def fake_remove(url):
            remove_calls.append(url)

        monkeypatch.setattr("web.craigslist.upsert_job_details", fake_upsert)
        monkeypatch.setattr("web.craigslist.remove_job", fake_remove)

        messages = list(scraper())

        assert any("Skipping job" in message for message in messages)
        assert remove_calls == [job_url]
        assert upsert_calls == []


class TestScraperEmailNotifications:
    def test_scraper_sends_email_for_new_jobs(self, monkeypatch):
        monkeypatch.setattr("web.craigslist.db_init", lambda: None)

        new_jobs = [
            {
                "title": "Python Developer",
                "company": "Acme",
                "location": "Remote",
                "url": "https://example.com/jobs/1",
            }
        ]

        def fake_fetch_listings():
            yield "Fake listing fetch\n"
            return {
                "discovered": 1,
                "new": 1,
                "by_search": [],
                "new_jobs": new_jobs,
            }

        monkeypatch.setattr("web.craigslist.fetch_listings", fake_fetch_listings)
        monkeypatch.setattr("web.craigslist.db_get_all_job_urls", lambda: [])

        calls = {}

        def fake_send_alert(jobs):
            calls["jobs"] = jobs
            return True, "sent"

        monkeypatch.setattr("web.craigslist._send_new_job_alert", fake_send_alert)

        messages = list(scraper())

        assert calls["jobs"] == new_jobs
        assert any("Job alert email sent." in message for message in messages)
148
tests/test_user_negative_keywords.py
Normal file
148
tests/test_user_negative_keywords.py
Normal file
@@ -0,0 +1,148 @@
import pytest
from web.db import (
    db_init,
    create_or_update_user,
    upsert_negative_keyword,
    set_user_negative_keywords,
    get_user_negative_keywords,
    upsert_listing,
    upsert_job_details,
    get_all_jobs,
    UserNegativeKeyword,
    NegativeKeyword
)
from web.app import app
from web.utils import filter_jobs


@pytest.fixture
def client():
    app.config['TESTING'] = True
    app.config['WTF_CSRF_ENABLED'] = False
    with app.test_client() as client:
        with app.app_context():
            db_init()
        yield client


def test_negative_keyword_db_ops():
    db_init()
    username = "test_neg_user"
    create_or_update_user(username, "password")

    # Test upsert
    kid = upsert_negative_keyword("scam")
    assert kid > 0
    kid2 = upsert_negative_keyword("scam")
    assert kid == kid2

    # Test set/get
    set_user_negative_keywords(username, ["scam", "unpaid"])
    nks = get_user_negative_keywords(username)
    assert len(nks) == 2
    assert "scam" in nks
    assert "unpaid" in nks

    # Test update
    set_user_negative_keywords(username, ["scam"])
    nks = get_user_negative_keywords(username)
    assert len(nks) == 1
    assert "scam" in nks
    assert "unpaid" not in nks

    # Test clear
    set_user_negative_keywords(username, [])
    nks = get_user_negative_keywords(username)
    assert len(nks) == 0


def test_settings_endpoint(client):
    username = "test_settings_user"
    create_or_update_user(username, "password")

    # Login
    client.post('/login', data={'username': username, 'password': 'password'})

    # Post settings
    resp = client.post('/settings', json={
        'regions': [],
        'keywords': [],
        'negative_keywords': ['spam', 'junk']
    })
    assert resp.status_code == 200

    # Verify DB
    nks = get_user_negative_keywords(username)
    assert "spam" in nks
    assert "junk" in nks


def test_job_filtering_with_negative_keywords():
    # Setup jobs
    jobs = [
        {"title": "Great Job", "description": "Good pay"},
        {"title": "Bad Job", "description": "This is a scam"},
        {"title": "Okay Job", "description": "Average pay"},
    ]

    # Filter
    filtered = filter_jobs(jobs, negative_keywords=["scam"])
    assert len(filtered) == 2
    assert "Bad Job" not in [j['title'] for j in filtered]

    filtered = filter_jobs(jobs, negative_keywords=["pay"])
    assert len(filtered) == 1
    assert "Bad Job" in [j['title']
                         for j in filtered]  # "scam" job doesn't have "pay"


def test_jobs_endpoint_filtering(client):
    username = "test_filter_user"
    create_or_update_user(username, "password")

    # Setup DB with jobs
    upsert_listing(
        url="http://example.com/1",
        region="sfbay",
        keyword="python",
        title="Good Python Job",
        pay="$100k",
        location="SF",
        timestamp="now"
    )
    upsert_job_details({
        "url": "http://example.com/1",
        "id": "1",
        "title": "Good Python Job",
        "description": "This is a legit job."
    })

    upsert_listing(
        url="http://example.com/2",
        region="sfbay",
        keyword="python",
        title="Bad Python Job",
        pay="$100k",
        location="SF",
        timestamp="now"
    )
    upsert_job_details({
        "url": "http://example.com/2",
        "id": "2",
        "title": "Bad Python Job",
        "description": "This is a scam job."
    })

    # Login
    client.post('/login', data={'username': username, 'password': 'password'})

    # Set negative keywords
    set_user_negative_keywords(username, ["scam"])

    # Fetch jobs
    resp = client.get('/jobs')
    data = resp.get_json()

    titles = [j['title'] for j in data]
    assert "Good Python Job" in titles
    assert "Bad Python Job" not in titles
@@ -16,3 +16,23 @@ def test_http_settings_helpers():
     assert isinstance(utils.get_backoff_factor(), int)
     assert isinstance(utils.get_min_delay(), int)
     assert isinstance(utils.get_max_delay(), int)
+
+
+def test_negative_keywords_helper():
+    keywords = utils.get_negative_keywords()
+    assert isinstance(keywords, list)
+    for kw in keywords:
+        assert isinstance(kw, str)
+        assert kw == kw.lower()
+
+
+def test_email_settings_helper():
+    settings = utils.get_email_settings()
+    assert isinstance(settings, dict)
+    assert 'enabled' in settings
+    assert 'from_address' in settings
+    smtp = settings.get('smtp')
+    assert isinstance(smtp, dict)
+    assert 'host' in smtp
+    assert isinstance(smtp.get('port'), int)
+    assert isinstance(settings.get('recipients'), list)
358
web/app.py
358
web/app.py
@@ -1,11 +1,13 @@
 import os
-from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, send_file
+from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, Response
 from flask_wtf import CSRFProtect
 from typing import Dict, List
+from datetime import datetime, timezone
 
 from web.craigslist import scraper
 from web.db import (
     db_init,
+    delete_user_by_id,
     get_all_jobs,
     mark_favorite,
     record_visit,
@@ -13,12 +15,16 @@ from web.db import (
     create_or_update_user,
     verify_user_credentials,
     get_user,
+    get_user_by_id,
     get_user_regions,
     get_user_keywords,
+    get_user_negative_keywords,
     set_user_regions,
     set_user_keywords,
+    set_user_negative_keywords,
     get_all_regions,
     get_all_keywords,
+    stats_overview,
     upsert_region,
     upsert_keyword,
     list_regions_full,
@@ -26,15 +32,24 @@ from web.db import (
     rename_region,
     rename_keyword,
     change_region_color,
-    change_keyword_color
+    change_keyword_color,
+    subscribe_email,
+    unsubscribe_email,
+    list_email_subscriptions,
+    list_email_templates,
+    create_email_template,
+    update_email_template,
+    delete_email_template,
+    get_email_template,
 )
 from web.utils import (
     initialize_users_from_settings,
     filter_jobs,
     get_job_by_id,
-    get_cache_dir,
+    now_iso,
 )
 from web.db import get_all_regions, get_all_keywords
+from web.email_templates import render_job_alert_email
 
 app = Flask(__name__)
 app.secret_key = os.environ.get("FLASK_SECRET", "dev-secret-change-me")
@@ -105,24 +120,30 @@ def index():
     # Apply user preference filters if no explicit filters provided
     selected_region = request.args.get("region")
     selected_keyword = request.args.get("keyword")
-    if not selected_region and session.get('username'):
+    user_negative_keywords = []
+
+    if session.get('username'):
         try:
-            prefs = get_user_regions(session['username'])
-            if prefs:
-                # If user has region prefs, filter to them by default
-                all_jobs = [j for j in all_jobs if j.get(
-                    'region') in set(prefs)]
+            username = session['username']
+            if not selected_region:
+                prefs = get_user_regions(username)
+                if prefs:
+                    # If user has region prefs, filter to them by default
+                    all_jobs = [j for j in all_jobs if j.get(
+                        'region') in set(prefs)]
+            if not selected_keyword:
+                prefs = get_user_keywords(username)
+                if prefs:
+                    all_jobs = [j for j in all_jobs if j.get(
+                        'keyword') in set(prefs)]
+
+            # Always fetch negative keywords for logged-in users
+            user_negative_keywords = get_user_negative_keywords(username)
         except Exception:
             pass
-    if not selected_keyword and session.get('username'):
-        try:
-            prefs = get_user_keywords(session['username'])
-            if prefs:
-                all_jobs = [j for j in all_jobs if j.get(
-                    'keyword') in set(prefs)]
-        except Exception:
-            pass
-    filtered_jobs = filter_jobs(all_jobs, selected_region, selected_keyword)
+
+    filtered_jobs = filter_jobs(
+        all_jobs, selected_region, selected_keyword, negative_keywords=user_negative_keywords)
 
     return render_template(
         "index.html",
@@ -176,23 +197,26 @@ def jobs():
     # Respect user preferences when no explicit filters provided
     region = request.args.get("region")
     keyword = request.args.get("keyword")
-    if not region and session.get('username'):
+    user_negative_keywords = []
+
+    if session.get('username'):
         try:
-            prefs = get_user_regions(session['username'])
-            if prefs:
-                all_jobs = [j for j in all_jobs if j.get(
-                    'region') in set(prefs)]
+            username = session['username']
+            if not region:
+                prefs = get_user_regions(username)
+                if prefs:
+                    all_jobs = [j for j in all_jobs if j.get(
+                        'region') in set(prefs)]
+            if not keyword:
+                prefs = get_user_keywords(username)
+                if prefs:
+                    all_jobs = [j for j in all_jobs if j.get(
+                        'keyword') in set(prefs)]
+
+            user_negative_keywords = get_user_negative_keywords(username)
         except Exception:
             pass
-    if not keyword and session.get('username'):
-        try:
-            prefs = get_user_keywords(session['username'])
-            if prefs:
-                all_jobs = [j for j in all_jobs if j.get(
-                    'keyword') in set(prefs)]
-        except Exception:
-            pass
-    return jsonify(filter_jobs(all_jobs, region, keyword))
+    return jsonify(filter_jobs(all_jobs, region, keyword, negative_keywords=user_negative_keywords))
 
 
 @app.route('/job_details', methods=['GET'])
@@ -202,9 +226,9 @@ def job_details():
     if session.get('username'):
         try:
             r = set(get_user_regions(session['username']))
-            k = set(get_user_keywords(session['username']))
             if r:
                 jobs = [j for j in jobs if j.get('region') in r]
+            k = set(get_user_keywords(session['username']))
             if k:
                 jobs = [j for j in jobs if j.get('keyword') in k]
         except Exception:
@@ -230,39 +254,6 @@ def job_by_id(job_id):
     return jsonify({"error": "Job not found"}), 404
 
 
-@app.route('/cached/<job_id>', methods=['GET'])
-def serve_cached(job_id):
-    """Serve the cached HTML file for a job if available.
-
-    Uses the job record's `file_path_abs` when present, or resolves the DB `file_path` via helper.
-    Ensures the returned file is located under the configured cache directory to avoid path-traversal.
-    """
-    try:
-        from web.db import db_get_cached_abs_path
-        j = get_job_by_id(job_id)
-        if not j:
-            return "Job not found", 404
-
-        # Prefer file_path_abs, fall back to resolving the DB-stored file_path
-        abs_fp = j.get('file_path_abs') or None
-        if not abs_fp:
-            db_fp = j.get('file_path')
-            abs_fp = db_get_cached_abs_path(db_fp) if db_fp else None
-
-        if not abs_fp or not os.path.isfile(abs_fp):
-            return "Cached file not available", 404
-
-        cache_dir = os.path.abspath(get_cache_dir())
-        abs_fp = os.path.abspath(abs_fp)
-        # Ensure the file is inside the cache directory
-        if os.path.commonpath([cache_dir, abs_fp]) != cache_dir:
-            return "Forbidden", 403
-
-        return send_file(abs_fp)
-    except Exception:
-        return "Error serving cached file", 500
-
-
 @app.route('/jobs/<job_id>/favorite', methods=['POST'])
 def set_favorite(job_id):
     """Mark or unmark a job as favorite for a given user.
@@ -288,9 +279,21 @@ csrf.exempt(set_favorite)
 
 @app.route('/scrape', methods=['GET'])
 def scrape():
-    """Trigger the web scraping process."""
-    scraper()
-    return jsonify({"status": "Scraping completed"})
+    """Trigger the web scraping process with streaming output."""
+    def generate():
+        try:
+            for message in scraper():
+                yield message
+        except Exception as e:
+            yield f"Error during scraping: {str(e)}\n"
+
+    return Response(generate(), mimetype='text/plain')
+
+
+@app.route('/scrape-page', methods=['GET'])
+def scrape_page():
+    """Serve the scrape page with streaming output display."""
+    return render_template('scrape.html', title='Scrape Jobs')
 
 
 # ---------------- Auth & Admin UI ------------------------------------------
@@ -322,7 +325,7 @@ def admin_users():
     if request.method == 'POST':
         data = request.form
         username = (data.get('username') or '').strip()
-        password = data.get('password') or None
+        password = data.get('new_password') or None
         is_admin = bool(data.get('is_admin'))
         is_active = bool(data.get('is_active')) if data.get(
             'is_active') is not None else True
@@ -342,6 +345,163 @@ def admin_users():
|
|||||||
     return render_template('admin/users.html', users=users, title='Users')
+
+
+@app.route('/admin/user/<user_id>', methods=['GET', 'POST'])
+def admin_user(user_id):
+    if not require_admin():
+        return redirect(url_for('login'))
+    user = get_user_by_id(user_id)
+    if request.method == 'POST':
+        data = request.form
+        username = (data.get('username') or '').strip()
+        password = data.get('new_password')
+        is_admin = bool(data.get('is_admin'))
+        is_active = bool(data.get('is_active')) if data.get(
+            'is_active') is not None else True
+        try:
+            create_or_update_user(
+                username, password=password, is_admin=is_admin, is_active=is_active)
+            flash('User saved')
+        except Exception as e:
+            flash(f'Error: {e}')
+        return redirect(url_for('admin_users'))
+    return render_template('admin/user.html', user=user, title='User')
+
+
+@app.route('/admin/user/<user_id>/delete', methods=['POST'])
+def admin_user_delete(user_id):
+    if not require_admin():
+        return redirect(url_for('login'))
+    if delete_user_by_id(user_id):
+        flash('User deleted')
+    else:
+        flash('Error deleting user')
+    return redirect(url_for('admin_users'))
+
+
+@app.route('/admin/emails', methods=['GET', 'POST'])
+def admin_emails():
+    if not require_admin():
+        return redirect(url_for('login'))
+    if request.method == 'POST':
+        action = (request.form.get('action') or '').strip().lower()
+        email = (request.form.get('email') or '').strip()
+        try:
+            if action == 'subscribe':
+                subscribe_email(email)
+                flash('Subscription saved')
+            elif action == 'unsubscribe':
+                if unsubscribe_email(email):
+                    flash('Subscription deactivated')
+                else:
+                    flash('No matching subscription found')
+            elif action == 'reactivate':
+                subscribe_email(email)
+                flash('Subscription reactivated')
+            else:
+                flash('Unknown action')
+        except ValueError as exc:
+            flash(f'Error: {exc}')
+        except Exception as exc:
+            flash(f'Error: {exc}')
+        return redirect(url_for('admin_emails'))
+    subscriptions = list_email_subscriptions()
+
+    class Sub(dict):
+        __getattr__ = dict.get
+
+    subscription_rows = [Sub(s) for s in subscriptions]
+    active_count = sum(1 for s in subscription_rows if s.get('is_active'))
+    return render_template(
+        'admin/email.html',
+        title='Email Subscriptions',
+        subscriptions=subscription_rows,
+        total_active=active_count,
+        total=len(subscription_rows),
+    )
+
+
+@app.route('/admin/email-templates', methods=['GET', 'POST'])
+def admin_email_templates():
+    if not require_admin():
+        return redirect(url_for('login'))
+
+    if request.method == 'POST':
+        action = (request.form.get('action') or '').strip().lower()
+        template_id = request.form.get('template_id')
+        name = request.form.get('name') or ''
+        slug = request.form.get('slug') or ''
+        subject = request.form.get('subject') or ''
+        body = request.form.get('body') or ''
+        is_active = request.form.get('is_active') == 'on'
+        try:
+            if action == 'create':
+                create_email_template(
+                    name=name, slug=slug, subject=subject, body=body, is_active=is_active)
+                flash('Template created')
+            elif action == 'update':
+                update_email_template(
+                    int(template_id or 0),
+                    name=name,
+                    slug=slug or None,
+                    subject=subject,
+                    body=body,
+                    is_active=is_active,
+                )
+                flash('Template updated')
+            elif action == 'delete':
+                if delete_email_template(int(template_id or 0)):
+                    flash('Template deleted')
+                else:
+                    flash('Template not found')
+            else:
+                flash('Unknown action')
+        except ValueError as exc:
+            flash(f'Error: {exc}')
+        except Exception as exc:
+            flash(f'Error: {exc}')
+        return redirect(url_for('admin_email_templates'))
+
+    templates = list_email_templates(include_inactive=True)
+    edit_id = request.args.get('template_id', type=int)
+    editing = get_email_template(edit_id) if edit_id else None
+
+    preview_payload = None
+    preview_template = None
+    preview_id = request.args.get('preview_id', type=int)
+    if preview_id:
+        preview_template = get_email_template(preview_id)
+    if preview_template:
+        sample_jobs = [
+            {
+                'title': 'Senior Python Engineer',
+                'company': 'ACME Corp',
+                'location': 'Remote',
+                'url': 'https://example.com/jobs/1',
+            },
+            {
+                'title': 'Data Engineer',
+                'company': 'Globex',
+                'location': 'New York, NY',
+                'url': 'https://example.com/jobs/2',
+            },
+        ]
+        preview_payload = render_job_alert_email(
+            sample_jobs,
+            region='preview-region',
+            keyword='preview-keyword',
+            template_override=preview_template,
+        )
+
+    return render_template(
+        'admin/email_templates.html',
+        title='Email Templates',
+        templates=templates,
+        editing=editing,
+        preview=preview_payload,
+        preview_template=preview_template,
+    )

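Note on the Sub(dict) helper used in admin_emails() above: assigning __getattr__ = dict.get gives templates attribute-style access to plain dict rows (s.email instead of s['email']) and makes missing keys resolve to None instead of raising AttributeError. A standalone sketch of the same pattern (values here are illustrative):

    class Sub(dict):
        # Unknown attributes fall back to dict.get, so missing keys yield None.
        __getattr__ = dict.get

    row = Sub({"email": "user@example.com", "is_active": True})
    print(row.email)       # user@example.com
    print(row.created_at)  # None, not an AttributeError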

 # ---------------- User settings (regions/keywords) -------------------------

 @app.route('/settings', methods=['GET', 'POST'])
@@ -353,6 +513,8 @@ def user_settings():
     # Accept JSON or form posts. Normalize singular/plural names.
     sel_regions: list[str] = []
     sel_keywords: list[str] = []
+    sel_negative_keywords: list[str] = []
+
     if request.is_json:
         data = request.get_json(silent=True) or {}
         sel_regions = [
@@ -361,16 +523,25 @@ def user_settings():
         sel_keywords = [
             (v or '').strip() for v in (data.get('keywords') or []) if v and (v or '').strip()
         ]
+        sel_negative_keywords = [
+            (v or '').strip() for v in (data.get('negative_keywords') or []) if v and (v or '').strip()
+        ]
     else:
         # HTML form fallback: support names 'regions' or 'region', 'keywords' or 'keyword'
         r_vals = request.form.getlist(
             'regions') + request.form.getlist('region')
         k_vals = request.form.getlist(
             'keywords') + request.form.getlist('keyword')
+        nk_vals = request.form.getlist(
+            'negative_keywords') + request.form.getlist('negative_keyword')
+
         sel_regions = [(v or '').strip()
                        for v in r_vals if v and (v or '').strip()]
         sel_keywords = [(v or '').strip()
                         for v in k_vals if v and (v or '').strip()]
+        sel_negative_keywords = [(v or '').strip()
+                                 for v in nk_vals if v and (v or '').strip()]

     # Upsert any new values into master lists
     for r in sel_regions:
         try:
@@ -382,9 +553,14 @@ def user_settings():
             upsert_keyword(k)
         except Exception:
             pass
+    # Negative keywords are upserted inside set_user_negative_keywords implicitly if we wanted,
+    # but let's stick to the pattern. Actually set_user_negative_keywords calls upsert_negative_keyword.
+
     try:
         set_user_regions(username, sel_regions)
         set_user_keywords(username, sel_keywords)
+        set_user_negative_keywords(username, sel_negative_keywords)
+
         # For JSON callers, return 200 without redirect
         if request.is_json:
             return jsonify({"status": "ok"})
@@ -399,6 +575,8 @@ def user_settings():
     all_keywords = get_all_keywords()
     user_regions = get_user_regions(username)
     user_keywords = get_user_keywords(username)
+    user_negative_keywords = get_user_negative_keywords(username)
+
     return render_template(
         'user/settings.html',
         title='Your Preferences',
@@ -406,6 +584,7 @@ def user_settings():
         all_keywords=all_keywords,
         user_regions=user_regions,
         user_keywords=user_keywords,
+        user_negative_keywords=user_negative_keywords,
     )


@@ -474,6 +653,45 @@ def admin_taxonomy():
     return render_template('admin/taxonomy.html', title='Taxonomy', regions=regions, keywords=keywords)
+
+
+@app.route('/admin/stats', methods=['GET'])
+def admin_stats():
+    if not require_admin():
+        return redirect(url_for('login'))
+    # Optional filters via query params
+    keyword = request.args.get('keyword')
+    region = request.args.get('region')
+    try:
+        stats = stats_overview()
+        # For detailed jobs table, reuse get_all_jobs() and filter
+        jobs = get_all_jobs()
+        if keyword:
+            jobs = [j for j in jobs if (j.get('keyword') or '') == keyword]
+        if region:
+            jobs = [j for j in jobs if (j.get('region') or '') == region]
+    except Exception as e:
+        flash(f'Error computing stats: {e}')
+        stats = {
+            'total_jobs': 0,
+            'total_keywords': 0,
+            'total_regions': 0,
+            'jobs_per_keyword': [],
+            'jobs_per_region': []
+        }
+        jobs = []
+    return render_template('admin/stats.html', title='Statistics', stats=stats, jobs=jobs, regions=get_all_regions(), keywords=get_all_keywords())
+
+
+@app.route('/health', methods=['GET'])
+def health_check():
+    """Health check endpoint for monitoring application status."""
+    return jsonify({
+        "status": "healthy",
+        "timestamp": now_iso(),
+        "service": "jobs-scraper",
+        "version": "1.0.0"
+    }), 200
+
+
 def init():
     """Main function to run the Flask app."""
     # Ensure DB is initialized

@@ -2,42 +2,83 @@ from datetime import datetime, timezone
 from web.scraper import process_region_keyword, scrape_job_page
 from web.db import (
     db_init,
-    upsert_cached_page,
     upsert_listing,
     upsert_job_details,
     url_to_job_id,
     upsert_user_interaction,
-    db_remove_cached_url,
-    db_sync_cached_pages,
     db_get_all_job_urls,
-    db_get_cache_url,
     db_delete_job,
     remove_job,
-    normalize_cached_page_paths,
+    insert_log,
+    get_last_fetch_time,
 )
+import schedule
+import time
 # Import utility functions
 from web.utils import (
-    get_cache_dir,
+    get_base_url,
     make_request_with_retry,
-    now_iso,
-    get_cache_path,
-    cache_page,
-    is_cache_stale,
-    delete_cached_page,
-    get_cached_content,
-    ensure_cache_dir
+    get_email_settings,
 )
 from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings
+from web.email_templates import render_job_alert_email
+from web.email_service import send_email
+
+
+def _negative_match_details(job_data: dict) -> tuple[str, str] | None:
+    """Return (keyword, field) when job_data indicates a negative match."""
+    if not job_data or not job_data.get("is_negative_match"):
+        return None
+    keyword = (job_data.get("negative_keyword_match") or "").strip()
+    field = (job_data.get("negative_match_field")
+             or "unknown").strip() or "unknown"
+    if not keyword:
+        keyword = "unknown keyword"
+    return keyword, field
+
+
+def _send_new_job_alert(new_jobs: list[dict]) -> tuple[bool, str]:
+    """Send an email alert for newly discovered jobs.
+
+    Returns (sent, message) where message explains why mail was skipped.
+    """
+
+    settings = get_email_settings()
+    if not settings.get("enabled"):
+        return False, "email alerts disabled"
+
+    recipients = settings.get("recipients", []) or []
+    if not recipients:
+        return False, "no recipients configured"
+
+    payload = render_job_alert_email(new_jobs)
+    send_email(
+        subject=payload.get("subject", "New jobs available"),
+        body=payload.get("body", ""),
+        to=recipients,
+        settings=settings,
+    )
+    return True, "sent"
+
+
 def fetch_listings():
-    """Fetch job listings from all regions and keywords."""
+    """Fetch job listings from all regions and keywords.
+
+    Yields progress messages and returns a dict with:
+    - discovered: total number of unique job URLs discovered
+    - new: total number of new jobs added to the database
+    - by_search: list of dicts, each containing:
+        - region: region name
+        - keyword: keyword name
+        - count: number of jobs fetched for this search
+    """
     # We'll collect URLs discovered in this run and then remove any DB listings
     # not present in this set (treat DB as reflecting current search results).
-    existing_db_urls = set(db_get_all_job_urls())
+    existing_db_urls = set(row['url'] for row in db_get_all_job_urls())
     discovered_urls = set()
     new_rows = []
+    new_jobs = []
+    search_results = []  # Track count per search
+
     # Ensure regions/keywords master lists exist
     try:
@@ -45,20 +86,64 @@ def fetch_listings():
     except Exception:
         pass
+
+    yield "Initializing database and seeding regions/keywords...\n"
+
     # Fetch listings for each region/keyword from DB
-    for region in get_all_regions():
+    regions = get_all_regions()
+    keywords = get_all_keywords()
+    total_combinations = len(regions) * len(keywords)
+    processed = 0
+
+    yield f"Found {len(regions)} regions and {len(keywords)} keywords. Processing {total_combinations} combinations...\n"
+
+    for region in regions:
         region_name = region.get("name")
         if not region_name:
             continue
-        for keyword in get_all_keywords():
+        for keyword in keywords:
             keyword_name = keyword.get("name")
             if not keyword_name:
                 continue
+            # Build a canonical search identifier for this region+keyword combination.
+            url = get_base_url().format(region=region, keyword=keyword_name.replace(" ", "+"))
+            search_page_id = f"search:{region_name}:{keyword_name}"
+            search_count = 0  # Count jobs for this search
+            try:
+                last = get_last_fetch_time(url)
+                if last is not None:
+                    # skip if fetched within the last hour
+                    age = datetime.now(
+                        timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
+                    if age.total_seconds() < 1 * 3600:
+                        yield f"Skipping {region_name} + {keyword_name} (fetched {age.seconds//3600}h ago)...\n"
+                        processed += 1
+                        continue
+            except Exception:
+                # if logging lookup fails, proceed with fetch
+                pass
+            processed += 1
+            yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
+            # record that we're fetching this search page now
+            try:
+                insert_log(url, region=region_name,
+                           keyword=keyword_name, fetched_at=datetime.now(timezone.utc))
+            except Exception:
+                pass
             for row in process_region_keyword(region_name, keyword_name, discovered_urls):
                 timestamp, region, keyword, title, pay, location, url = row
                 discovered_urls.add(url)
+                search_count += 1
                 if url not in existing_db_urls:
                     new_rows.append(row)
+                    new_jobs.append({
+                        "timestamp": timestamp,
+                        "region": region,
+                        "keyword": keyword,
+                        "title": title,
+                        "pay": pay,
+                        "location": location,
+                        "url": url,
+                    })
                 # Upsert or update listing to reflect current search result
                 upsert_listing(
                     url=url,
@@ -68,80 +153,154 @@ def fetch_listings():
                     pay=pay,
                     location=location,
                     timestamp=timestamp,
+                    fetched_from=search_page_id,
+                    fetched_at=datetime.now(timezone.utc),
                 )
+            # Record per-search count
+            search_results.append({
+                "region": region_name,
+                "keyword": keyword_name,
+                "count": search_count
+            })

-    # Remove stale listings: those present in DB but not discovered now.
-    stale_urls = existing_db_urls - discovered_urls
-    for url in stale_urls:
-        try:
-            jid = url_to_job_id(url)
-            db_delete_job(jid)
-            # Also try to remove cached file and its metadata
-            delete_cached_page(url)
-            db_remove_cached_url(url)
-        except Exception:
-            pass
-
-    return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}
+    yield f"Listing fetch complete: {len(discovered_urls)} discovered, {len(new_rows)} new,\n"
+    return {
+        "discovered": len(discovered_urls),
+        "new": len(new_rows),
+        "by_search": search_results,
+        "new_jobs": new_jobs,
+    }


-def process_job_url(job_url: str):
+def process_job_url(job_url: str, region: str = "", keyword: str = ""):
+    last = get_last_fetch_time(job_url)
+    if last is not None:
+        # skip if fetched within the last 24 hours
+        age = datetime.now(
+            timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
+        if age.total_seconds() < 24 * 3600:
+            yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n"
+            return None
+
     try:
         job_id = url_to_job_id(job_url)
-        content = None
-        cached_page = db_get_cache_url(job_url)
-        if cached_page:
-            last_modified = cached_page.get("last_modified")
-            if last_modified and not is_cache_stale(last_modified):
-                content = get_cached_content(job_url)
-            else:
-                content = make_request_with_retry(job_url, 1)
-        else:
-            content = make_request_with_retry(job_url, 1)
+        yield f"Fetching job page: {job_url}\n"
+        content = make_request_with_retry(job_url, 1)

         if content is None:
+            yield f"Failed to fetch content for {job_url}, removing from database\n"
             remove_job(job_url)
             return None

-        # refresh cache and details
-        cache_page(job_url, content)
-        upsert_cached_page(
-            file_path=get_cache_path(job_url),
-            url_guess=job_url,
-            last_modified=now_iso(),
-            size_bytes=len(content),
-            job_id=job_id
-        )
+        yield f"Scraping job data from {job_url}\n"
         job_data = scrape_job_page(content, job_url)
         if job_data:
-            upsert_job_details(job_data)
-            upsert_user_interaction(
-                job_id, seen_at=datetime.now(timezone.utc).isoformat())
+            negative_info = _negative_match_details(job_data)
+            if negative_info:
+                keyword, field = negative_info
+                yield (
+                    f"Skipping job {job_id} due to negative keyword "
+                    f"'{keyword}' in {field}\n"
+                )
+                remove_job(job_url)
+                return None
+            yield f"Upserting job details for {job_id}\n"
+            upsert_job_details(job_data, region=region, keyword=keyword)
+            yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
             return job_data
+        else:
+            yield f"Failed to scrape job data from {job_url}\n"
         return None
-    except Exception:
+    except Exception as e:
+        yield f"Error processing {job_url}: {str(e)}\n"
        return None

 def scraper():
     """Main function to run the scraper."""
-    ensure_cache_dir()
+    yield "Starting scraper...\n"
     db_init()
+    yield "Database initialized\n"
+
     # First, fetch current listings from search pages and make DB reflect them.
-    jl = fetch_listings()
+    yield "Fetching listings...\n"
+    listing_summary: dict | None = None
+    fetch_iter = fetch_listings()
+    try:
+        while True:
+            message = next(fetch_iter)
+            yield message
+    except StopIteration as stop:
+        listing_summary = stop.value if isinstance(stop.value, dict) else {}

-    # Sync any cached files we have on disk into the cached_pages table.
-    db_sync_cached_pages(get_cache_dir())
-
-    # Normalize any relative cached file paths to absolute paths in DB
-    normalized = normalize_cached_page_paths()
-    if normalized:
-        pass
+    new_jobs = []
+    if listing_summary:
+        new_jobs = listing_summary.get("new_jobs", []) or []
+
+    if new_jobs:
+        yield f"Preparing email alert for {len(new_jobs)} new jobs...\n"
+        try:
+            sent, info = _send_new_job_alert(new_jobs)
+            if sent:
+                yield "Job alert email sent.\n"
+            else:
+                yield f"Skipping email alert: {info}\n"
+        except Exception as exc:
+            yield f"Failed to send job alert email: {exc}\n"

     # Finally, fetch and refresh individual job pages for current listings
-    for url in db_get_all_job_urls():
-        process_job_url(url)
+    job_urls = db_get_all_job_urls()
+    yield f"Processing {len(job_urls)} job pages...\n"
+
+    for i, url_dict in enumerate(job_urls, start=1):
+        url = url_dict.get("url")
+        region = url_dict.get("region", "")
+        keyword = url_dict.get("keyword", "")
+        if not url:
+            continue
+        yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
+        for message in process_job_url(job_url=url, region=region, keyword=keyword):
+            yield message
+
+    yield "\nScraping completed successfully!\n"
+
+
+def scrape_jobs_with_retry(max_retries=3):
+    """Run the scraping process with retry logic for failures."""
+    for attempt in range(max_retries):
+        try:
+            scraper()
+            return True
+        except Exception as e:
+            if attempt < max_retries - 1:
+                time.sleep(2 ** attempt * 10)  # Exponential backoff
+    return False
+
+
+def start_scheduler():
+    """Start the scheduler to run scraping every hour."""
+    # Clear any existing jobs
+    schedule.clear()
+
+    # Schedule scraping every hour
+    schedule.every().hour.do(scrape_jobs_with_retry)
+
+    # Run the scheduler in a loop
+    while True:
+        schedule.run_pending()
+        time.sleep(60)  # Check every minute
+
+
+def run_scheduled_scraping():
+    """Run the scheduled scraping process."""
+    try:
+        scrape_jobs_with_retry()
+    except Exception as e:
+        pass
+
+
+# Initialize scheduler when module is imported
+schedule.every().hour.do(run_scheduled_scraping)
+
 if __name__ == "__main__":
     scraper()
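Note: scraper() drains fetch_listings() with next() inside a while loop so it can capture the generator's return value from StopIteration.value; a plain for loop would silently discard that summary dict. A standalone sketch of the same pattern (function names are illustrative, not from this diff):

    # Sketch: reading a generator's return value via StopIteration.value.
    def produce():
        yield "step 1\n"
        yield "step 2\n"
        return {"discovered": 2, "new": 1}

    def consume():
        gen = produce()
        summary = None
        try:
            while True:
                print(next(gen), end="")
        except StopIteration as stop:
            summary = stop.value if isinstance(stop.value, dict) else {}
        return summary

    print(consume())  # {'discovered': 2, 'new': 1}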
759 web/db.py
@@ -4,27 +4,24 @@ from __future__ import annotations

 Tables:
 - users(user_id PK, username UNIQUE, created_at)
-- cached_pages(file_path PK, url_guess, last_modified, size_bytes, job_id)
 - job_listings(job_id PK, url UNIQUE, region, keyword, title, pay, location, timestamp)
-- job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url)
+- job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url, reply_url)
 - user_interactions(job_id PK FK -> job_listings, user_id FK -> users, seen_at, url_visited, is_user_favorite)
 - regions(region_id PK, name UNIQUE)
 - keywords(keyword_id PK, name UNIQUE)
 - user_regions(user_id FK -> users, region_id FK -> regions, composite PK)
 - user_keywords(user_id FK -> users, keyword_id FK -> keywords, composite PK)
+- logs(id PK, page_url, region, keyword, fetched_at)
 """

 from datetime import datetime, UTC
-import os
 from typing import Optional, Dict, Any, List
+import re
 from web.utils import (
-    get_url_from_filename,
     get_color_from_string,
     url_to_job_id,
     normalize_job_id,
     now_iso,
-    get_cache_path,
-    get_cache_dir,
     get_mysql_config,
 )

@@ -86,8 +83,6 @@ class JobListing(Base):

     description = relationship(
         "JobDescription", back_populates="listing", uselist=False, cascade="all, delete-orphan")
-    cached_pages = relationship(
-        "CachedPage", back_populates="listing", cascade="all, delete-orphan")
     interactions = relationship(
         "UserInteraction", back_populates="listing", cascade="all, delete-orphan")

@@ -102,32 +97,277 @@ class JobDescription(Base):
     description = Column(Text)
     posted_time = Column(String(TIME_LEN))
     url = Column(String(URL_LEN))
+    reply_url = Column(String(URL_LEN))
+    contact_email = Column(String(SHORT_LEN))
+    contact_phone = Column(String(SHORT_LEN))
+    contact_name = Column(String(SHORT_LEN))

     listing = relationship("JobListing", back_populates="description")


-class CachedPage(Base):
-    __tablename__ = "cached_pages"
-    file_path = Column(String(FILE_PATH_LEN), primary_key=True)
-    url_guess = Column(String(URL_LEN))
-    last_modified = Column(String(TIME_LEN))
-    size_bytes = Column(Integer)
-    job_id = Column(String(JOB_ID_LEN), ForeignKey(
-        "job_listings.job_id", ondelete="CASCADE"))
-
-    listing = relationship("JobListing", back_populates="cached_pages")
-
-    @property
-    def abs_path(self) -> Optional[str]:
-        """Return the absolute filesystem path for this cached page.
-
-        The DB stores `file_path` relative to the configured cache dir. This
-        helper centralizes resolution so callers can use `cached_page.abs_path`.
-        """
-        fp = getattr(self, 'file_path', None)
-        if not fp:
-            return None
-        return os.path.join(os.path.abspath(get_cache_dir()), fp)
+def _normalize_email(value: Optional[str]) -> str:
+    if not value or not isinstance(value, str):
+        return ""
+    return value.strip().lower()
+
+
+def subscribe_email(email: str) -> bool:
+    """Add or reactivate an email subscription."""
+    address = _normalize_email(email)
+    if not address:
+        raise ValueError("email address required")
+    with _ensure_session() as session:
+        existing = session.execute(
+            text(
+                "SELECT subscription_id, is_active FROM email_subscriptions WHERE email = :e"
+            ),
+            {"e": address},
+        ).fetchone()
+        now = datetime.now(UTC)
+        if existing:
+            session.execute(
+                text(
+                    "UPDATE email_subscriptions SET is_active = 1, updated_at = :u WHERE subscription_id = :sid"
+                ),
+                {"u": now, "sid": existing[0]},
+            )
+        else:
+            session.execute(
+                text(
+                    "INSERT INTO email_subscriptions(email, is_active, created_at, updated_at) "
+                    "VALUES(:e, 1, :u, :u)"
+                ),
+                {"e": address, "u": now},
+            )
+        session.commit()
+    return True
+
+
+def unsubscribe_email(email: str) -> bool:
+    """Deactivate an email subscription."""
+    address = _normalize_email(email)
+    if not address:
+        raise ValueError("email address required")
+    with _ensure_session() as session:
+        now = datetime.now(UTC)
+        result = session.execute(
+            text(
+                "UPDATE email_subscriptions SET is_active = 0, updated_at = :u WHERE email = :e"
+            ),
+            {"u": now, "e": address},
+        )
+        session.commit()
+        rowcount = getattr(result, "rowcount", None)
+        if rowcount is None:
+            return False
+        return rowcount > 0
+
+
+def list_email_subscriptions(*, active_only: bool = False) -> List[Dict[str, Any]]:
+    """Return subscription rows as dicts."""
+    query = "SELECT subscription_id, email, is_active, created_at, updated_at FROM email_subscriptions"
+    params: Dict[str, Any] = {}
+    if active_only:
+        query += " WHERE is_active = 1"
+    query += " ORDER BY email"
+    with _ensure_session() as session:
+        rows = session.execute(text(query), params).fetchall()
+    result: List[Dict[str, Any]] = []
+    for row in rows:
+        result.append(
+            {
+                "subscription_id": row[0],
+                "email": row[1],
+                "is_active": bool(row[2]),
+                "created_at": row[3],
+                "updated_at": row[4],
+            }
+        )
+    return result
+
+
+def get_active_email_recipients() -> List[str]:
+    """Return list of active subscription email addresses."""
+    return [s["email"] for s in list_email_subscriptions(active_only=True)]
+
+
+def _normalize_slug(value: Optional[str]) -> str:
+    if not value:
+        return ""
+    slug = re.sub(r"[^a-zA-Z0-9-]+", "-", value.strip().lower())
+    slug = re.sub(r"-+", "-", slug).strip("-")
+    return slug
+
+
+def _template_to_dict(template: EmailTemplate) -> Dict[str, Any]:
+    created = getattr(template, "created_at", None)
+    updated = getattr(template, "updated_at", None)
+    return {
+        "template_id": template.template_id,
+        "slug": template.slug,
+        "name": template.name,
+        "subject": template.subject,
+        "body": template.body,
+        "is_active": bool(template.is_active),
+        "created_at": created.isoformat() if isinstance(created, datetime) else created,
+        "updated_at": updated.isoformat() if isinstance(updated, datetime) else updated,
+    }
+
+
+def list_email_templates(*, include_inactive: bool = True) -> List[Dict[str, Any]]:
+    with _ensure_session() as session:
+        query = session.query(EmailTemplate)
+        if not include_inactive:
+            query = query.filter(EmailTemplate.is_active.is_(True))
+        items = query.order_by(EmailTemplate.name.asc()).all()
+        return [_template_to_dict(obj) for obj in items]
+
+
+def get_email_template(template_id: int) -> Optional[Dict[str, Any]]:
+    if not template_id:
+        return None
+    with _ensure_session() as session:
+        obj = session.get(EmailTemplate, int(template_id))
+        return _template_to_dict(obj) if obj else None
+
+
+def get_email_template_by_slug(slug: str) -> Optional[Dict[str, Any]]:
+    normalized = _normalize_slug(slug)
+    if not normalized:
+        return None
+    with _ensure_session() as session:
+        obj = session.query(EmailTemplate).filter(
+            EmailTemplate.slug == normalized).one_or_none()
+        return _template_to_dict(obj) if obj else None
+
+
+def create_email_template(
+    *,
+    name: str,
+    subject: str,
+    body: str,
+    slug: Optional[str] = None,
+    is_active: bool = True,
+) -> Dict[str, Any]:
+    name_clean = (name or "").strip()
+    if not name_clean:
+        raise ValueError("Template name is required")
+    subject_clean = (subject or "").strip()
+    if not subject_clean:
+        raise ValueError("Template subject is required")
+    body_clean = (body or "").strip()
+    if not body_clean:
+        raise ValueError("Template body is required")
+
+    slug_clean = _normalize_slug(slug or name_clean)
+    if not slug_clean:
+        raise ValueError("Template slug is required")
+
+    with _ensure_session() as session:
+        existing = session.query(EmailTemplate).filter(
+            EmailTemplate.slug == slug_clean).one_or_none()
+        if existing:
+            raise ValueError("A template with this slug already exists")
+        template = EmailTemplate(
+            name=name_clean,
+            slug=slug_clean,
+            subject=subject_clean,
+            body=body_clean,
+            is_active=bool(is_active),
+        )
+        session.add(template)
+        session.commit()
+        session.refresh(template)
+        return _template_to_dict(template)
+
+
+def update_email_template(
+    template_id: int,
+    *,
+    name: Optional[str] = None,
+    subject: Optional[str] = None,
+    body: Optional[str] = None,
+    slug: Optional[str] = None,
+    is_active: Optional[bool] = None,
+) -> Dict[str, Any]:
+    if not template_id:
+        raise ValueError("template_id is required")
+    with _ensure_session() as session:
+        template = session.get(EmailTemplate, int(template_id))
+        if template is None:
+            raise ValueError("Template not found")
+        if name is not None:
+            name_clean = name.strip()
+            if not name_clean:
+                raise ValueError("Template name is required")
+            setattr(template, "name", name_clean)
+        if subject is not None:
+            subject_clean = subject.strip()
+            if not subject_clean:
+                raise ValueError("Template subject is required")
+            setattr(template, "subject", subject_clean)
+        if body is not None:
+            body_clean = body.strip()
+            if not body_clean:
+                raise ValueError("Template body is required")
+            setattr(template, "body", body_clean)
+        if slug is not None:
+            slug_clean = _normalize_slug(slug)
+            if not slug_clean:
+                raise ValueError("Template slug is required")
+            existing = (
+                session.query(EmailTemplate)
+                .filter(EmailTemplate.slug == slug_clean, EmailTemplate.template_id != template.template_id)
+                .one_or_none()
+            )
+            if existing:
+                raise ValueError("A template with this slug already exists")
+            setattr(template, "slug", slug_clean)
+        if is_active is not None:
+            setattr(template, "is_active", bool(is_active))
+        template.touch()
+        session.commit()
+        session.refresh(template)
+        return _template_to_dict(template)
+
+
+def delete_email_template(template_id: int) -> bool:
+    if not template_id:
+        return False
+    with _ensure_session() as session:
+        template = session.get(EmailTemplate, int(template_id))
+        if template is None:
+            return False
+        session.delete(template)
+        session.commit()
+        return True
+
+
+def ensure_default_email_template() -> None:
+    try:
+        from web.email_templates import DEFAULT_JOB_ALERT_SUBJECT, DEFAULT_JOB_ALERT_BODY
+    except Exception:
+        DEFAULT_JOB_ALERT_SUBJECT = "{count_label}{scope}"
+        DEFAULT_JOB_ALERT_BODY = (
+            "Hi,\n\n{intro_line}\n{jobs_message}\n\nGenerated at {timestamp} UTC.\n"
+            "You are receiving this message because job alerts are enabled.\n"
+        )
+    try:
+        with _ensure_session() as session:
+            existing = session.query(EmailTemplate).filter(
+                EmailTemplate.slug == "job-alert").one_or_none()
+            if existing is None:
+                template = EmailTemplate(
+                    name="Job Alert",
+                    slug="job-alert",
+                    subject=DEFAULT_JOB_ALERT_SUBJECT,
+                    body=DEFAULT_JOB_ALERT_BODY,
+                    is_active=True,
+                )
+                session.add(template)
+                session.commit()
+    except Exception:
+        pass
+
+
 class UserInteraction(Base):

@@ -176,6 +416,58 @@ class UserKeyword(Base):
"keywords.keyword_id", ondelete="CASCADE"), primary_key=True)
|
"keywords.keyword_id", ondelete="CASCADE"), primary_key=True)
|
||||||
|
|
||||||
|
|
||||||
|
class NegativeKeyword(Base):
|
||||||
|
__tablename__ = "negative_keywords"
|
||||||
|
keyword_id = Column(Integer, primary_key=True, autoincrement=True)
|
||||||
|
name = Column(String(SHORT_LEN), unique=True, nullable=False)
|
||||||
|
|
||||||
|
|
||||||
|
class UserNegativeKeyword(Base):
|
||||||
|
__tablename__ = "user_negative_keywords"
|
||||||
|
user_id = Column(Integer, ForeignKey(
|
||||||
|
"users.user_id", ondelete="CASCADE"), primary_key=True)
|
||||||
|
keyword_id = Column(Integer, ForeignKey(
|
||||||
|
"negative_keywords.keyword_id", ondelete="CASCADE"), primary_key=True)
|
||||||
|
|
||||||
|
|
||||||
|
class Log(Base):
|
||||||
|
__tablename__ = "logs"
|
||||||
|
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||||
|
page_url = Column(String(URL_LEN))
|
||||||
|
region = Column(String(SHORT_LEN))
|
||||||
|
keyword = Column(String(SHORT_LEN))
|
||||||
|
fetched_at = Column(DateTime)
|
||||||
|
|
||||||
|
|
||||||
|
class EmailSubscription(Base):
|
||||||
|
__tablename__ = "email_subscriptions"
|
||||||
|
subscription_id = Column(Integer, primary_key=True, autoincrement=True)
|
||||||
|
email = Column(String(SHORT_LEN), unique=True, nullable=False)
|
||||||
|
is_active = Column(Boolean, default=True, nullable=False)
|
||||||
|
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||||
|
updated_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||||
|
|
||||||
|
def touch(self):
|
||||||
|
setattr(self, "updated_at", datetime.utcnow())
|
||||||
|
|
||||||
|
|
||||||
|
class EmailTemplate(Base):
|
||||||
|
__tablename__ = "email_templates"
|
||||||
|
template_id = Column(Integer, primary_key=True, autoincrement=True)
|
||||||
|
slug = Column(String(SHORT_LEN), unique=True, nullable=False)
|
||||||
|
name = Column(String(SHORT_LEN), nullable=False)
|
||||||
|
subject = Column(Text, nullable=False)
|
||||||
|
body = Column(Text, nullable=False)
|
||||||
|
is_active = Column(Boolean, default=True, nullable=False)
|
||||||
|
created_at = Column(
|
||||||
|
DateTime, default=lambda: datetime.now(UTC), nullable=False)
|
||||||
|
updated_at = Column(
|
||||||
|
DateTime, default=lambda: datetime.now(UTC), nullable=False)
|
||||||
|
|
||||||
|
def touch(self):
|
||||||
|
setattr(self, "updated_at", datetime.now(UTC))
|
||||||
|
|
||||||
|
|
||||||
def _ensure_session() -> Session:
|
def _ensure_session() -> Session:
|
||||||
global engine, SessionLocal
|
global engine, SessionLocal
|
||||||
if engine is None or SessionLocal is None:
|
if engine is None or SessionLocal is None:
|
||||||
@@ -223,6 +515,31 @@ def db_init():
|
|||||||
text("ALTER TABLE users ADD COLUMN IF NOT EXISTS last_login DATETIME NULL"))
|
text("ALTER TABLE users ADD COLUMN IF NOT EXISTS last_login DATETIME NULL"))
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
try:
|
||||||
|
conn.execute(text(
|
||||||
|
"ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS reply_url VARCHAR(512) NULL"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
conn.execute(text(
|
||||||
|
"ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS contact_email VARCHAR(255) NULL"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
conn.execute(text(
|
||||||
|
"ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS contact_phone VARCHAR(255) NULL"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
conn.execute(text(
|
||||||
|
"ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS contact_name VARCHAR(255) NULL"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
ensure_default_email_template()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
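Note: the try/except blocks added to db_init() above lean on MariaDB/MySQL's ADD COLUMN IF NOT EXISTS so initialization can be re-run safely; on engines that reject the clause the statement fails and the except swallows it. A compact sketch of the same idempotent-migration idea (the connection URL and column list are illustrative):

    # Sketch: idempotent column additions along the lines of db_init().
    from sqlalchemy import create_engine, text

    def add_columns_if_missing(engine_url: str) -> None:
        engine = create_engine(engine_url)
        statements = [
            "ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS reply_url VARCHAR(512) NULL",
            "ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS contact_email VARCHAR(255) NULL",
        ]
        with engine.begin() as conn:
            for stmt in statements:
                try:
                    conn.execute(text(stmt))
                except Exception:
                    # The statement may fail on engines without IF NOT EXISTS;
                    # ignore so re-running initialization stays safe.
                    pass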
 def upsert_user_interaction(job_id: str | int, *, user_id: Optional[int] = None, seen_at: Optional[str] = None, url_visited: Optional[str] = None, is_user_favorite: Optional[bool] = None):
@@ -247,7 +564,7 @@ def upsert_user_interaction(job_id: str | int, *, user_id: Optional[int] = None,
         session.commit()


-def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str, location: str, timestamp: str):
+def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str, location: str, timestamp: str, fetched_from: str | None = None, fetched_at: Optional[datetime] = None):
     """Insert or update a job listing row based on job_id derived from URL."""
     job_id = str(url_to_job_id(url))
     with _ensure_session() as session:
@@ -263,19 +580,74 @@ def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str,
setattr(obj, "location", location)
|
setattr(obj, "location", location)
|
||||||
setattr(obj, "timestamp", timestamp)
|
setattr(obj, "timestamp", timestamp)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
# Optionally record a fetch log for the listing if source provided
|
||||||
|
if fetched_from:
|
||||||
|
try:
|
||||||
|
insert_log(fetched_from, region=region, keyword=keyword,
|
||||||
|
fetched_at=fetched_at or datetime.now())
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def upsert_job_details(job_data: Dict[str, Any]):
|
def insert_log(page_url: str, region: str | None = None, keyword: str | None = None, fetched_at: Optional[datetime] = None):
|
||||||
"""Upsert into job_descriptions table using scraped job details dict."""
|
"""Insert a log row for a fetched page."""
|
||||||
|
fetched_at = fetched_at or datetime.now()
|
||||||
|
with _ensure_session() as session:
|
||||||
|
l = Log(page_url=page_url, region=region or '',
|
||||||
|
keyword=keyword or '', fetched_at=fetched_at)
|
||||||
|
session.add(l)
|
||||||
|
session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_fetch_time(page_url: str) -> Optional[datetime]:
|
||||||
|
"""Return the latest fetched_at for a given page_url, or None if never fetched."""
|
||||||
|
with _ensure_session() as session:
|
||||||
|
row = session.execute(text("SELECT fetched_at FROM logs WHERE page_url = :u ORDER BY fetched_at DESC LIMIT 1"), {
|
||||||
|
"u": page_url}).fetchone()
|
||||||
|
if row and row[0]:
|
||||||
|
return row[0]
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_job_details(job_data: Dict[str, Any], region: str = "", keyword: str = ""):
|
||||||
|
"""Upsert into job_descriptions table using scraped job details dict.
|
||||||
|
|
||||||
|
Behavior additions:
|
||||||
|
- If the provided job `url` has a log entry with fetched_at less than 24 hours ago,
|
||||||
|
the function will skip updating to avoid unnecessary work.
|
||||||
|
- On successful upsert, a log entry is recorded with `insert_log(url, ...)`.
|
||||||
|
"""
|
||||||
|
if not job_data or job_data.get("is_negative_match"):
|
||||||
|
return
|
||||||
|
|
||||||
url = job_data.get("url")
|
url = job_data.get("url")
|
||||||
job_id = normalize_job_id(job_data.get("id"), url)
|
job_id = normalize_job_id(job_data.get("id"), url)
|
||||||
if not job_id:
|
if not job_id:
|
||||||
return
|
return
|
||||||
|
# Skip if job page was fetched recently (24 hours)
|
||||||
|
try:
|
||||||
|
if isinstance(url, str) and url:
|
||||||
|
last = get_last_fetch_time(url)
|
||||||
|
if last is not None:
|
||||||
|
# normalize tz-awareness
|
||||||
|
from datetime import timezone as _tz
|
||||||
|
now = datetime.now(_tz.utc)
|
||||||
|
last_dt = last if getattr(
|
||||||
|
last, 'tzinfo', None) is not None else last.replace(tzinfo=_tz.utc)
|
||||||
|
if (now - last_dt).total_seconds() < 24 * 3600:
|
||||||
|
return
|
||||||
|
except Exception:
|
||||||
|
# if log lookup fails, proceed normally
|
||||||
|
pass
|
||||||
title = job_data.get("title") or None
|
title = job_data.get("title") or None
|
||||||
company = job_data.get("company") or None
|
company = job_data.get("company") or None
|
||||||
location = job_data.get("location") or None
|
location = job_data.get("location") or None
|
||||||
description = job_data.get("description") or None
|
description = job_data.get("description") or None
|
||||||
posted_time = job_data.get("posted_time") or None
|
posted_time = job_data.get("posted_time") or None
|
||||||
|
reply_url = job_data.get("reply_url") or None
|
||||||
|
contact_email = job_data.get("contact_email") or None
|
||||||
|
contact_phone = job_data.get("contact_phone") or None
|
||||||
|
contact_name = job_data.get("contact_name") or None
|
||||||
|
|
||||||
job_id = str(job_id)
|
job_id = str(job_id)
|
||||||
with _ensure_session() as session:
|
with _ensure_session() as session:
|
||||||
@@ -289,142 +661,20 @@ def upsert_job_details(job_data: Dict[str, Any]):
|
|||||||
setattr(obj, "description", description)
|
setattr(obj, "description", description)
|
||||||
setattr(obj, "posted_time", posted_time)
|
setattr(obj, "posted_time", posted_time)
|
||||||
setattr(obj, "url", url)
|
setattr(obj, "url", url)
|
||||||
|
setattr(obj, "reply_url", reply_url)
|
||||||
|
setattr(obj, "contact_email", contact_email)
|
||||||
|
setattr(obj, "contact_phone", contact_phone)
|
||||||
|
setattr(obj, "contact_name", contact_name)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
# Record that we fetched/updated this job page
|
||||||
|
|
||||||
def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modified: Optional[str], size_bytes: Optional[int], job_id: Optional[int]):
|
|
||||||
# Store file paths relative to the cache directory (keeps DB portable)
|
|
||||||
cache_dir = os.path.abspath(get_cache_dir())
|
|
||||||
abs_fp = os.path.abspath(file_path)
|
|
||||||
rel_fp = os.path.relpath(abs_fp, start=cache_dir)
|
|
||||||
with _ensure_session() as session:
|
|
||||||
obj = session.get(CachedPage, rel_fp)
|
|
||||||
if obj is None:
|
|
||||||
obj = CachedPage(file_path=rel_fp)
|
|
||||||
session.add(obj)
|
|
||||||
setattr(obj, "url_guess", url_guess)
|
|
||||||
setattr(obj, "last_modified", last_modified)
|
|
||||||
setattr(obj, "size_bytes", size_bytes)
|
|
||||||
setattr(obj, "job_id", str(job_id) if job_id else None)
|
|
||||||
session.commit()
|
|
||||||
|
|
||||||
|
|
||||||
def remove_cached_page(file_path: str):
|
|
||||||
# Accept absolute or relative input but DB keys are stored relative to cache dir
|
|
||||||
cache_dir = os.path.abspath(get_cache_dir())
|
|
||||||
abs_fp = os.path.abspath(file_path)
|
|
||||||
rel_fp = os.path.relpath(abs_fp, start=cache_dir)
|
|
||||||
-    with _ensure_session() as session:
-        obj = session.get(CachedPage, rel_fp)
-        if obj:
-            session.delete(obj)
-            session.commit()


-def db_remove_cached_url(url: str):
-    """Remove a cached page by URL."""
-    # Compute absolute path for the URL and delegate to remove_cached_page
-    abs_fp = os.path.abspath(get_cache_path(url))
    try:
-        remove_cached_page(abs_fp)
+        if isinstance(url, str) and url:
+            insert_log(url, region=None, keyword=None,
+                       fetched_at=datetime.now())
    except Exception:
        pass


-def db_get_all_cached_pages() -> List[Dict[str, Any]]:
-    with _ensure_session() as session:
-        rows = session.execute(text(
-            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
-        out = []
-        for row in rows:
-            fp = row[0]
-            out.append({
-                "file_path": fp,
-                "file_path_abs": db_get_cached_abs_path(fp) if fp else None,
-                "url_guess": row[1],
-                "last_modified": row[2],
-                "size_bytes": row[3],
-                "job_id": row[4],
-            })
-        return out


-def db_get_cache_url(url: str):
-    """Return the data for a specific URL from cached_pages.

-    Arguments:
-    url -- The URL to look up in the cache.
-    """
-    with _ensure_session() as session:
-        row = session.execute(text(
-            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"), {"u": url}).fetchone()
-        if not row:
-            return None
-        fp = row[0]
-        return {
-            "file_path": fp,
-            "file_path_abs": db_get_cached_abs_path(fp) if fp else None,
-            "url_guess": row[1],
-            "last_modified": row[2],
-            "size_bytes": row[3],
-            "job_id": row[4],
-        }


-def db_sync_cached_pages(cache_dir: str):
-    """Scan cache_dir and upsert page metadata into cached_pages table."""
-    if not os.path.isdir(cache_dir):
-        return
-    abs_cache = os.path.abspath(cache_dir)
-    # read existing DB keys once for quick membership tests
-    db_cache_paths = {c["file_path"] for c in db_get_all_cached_pages()}
-    for root, _, files in os.walk(abs_cache):
-        for name in files:
-            if not name.lower().endswith(".html"):
-                continue
-            fp = os.path.abspath(os.path.join(root, name))
-            rel_fp = os.path.relpath(fp, start=abs_cache)
-            if rel_fp in db_cache_paths:
-                continue
-            try:
-                stat = os.stat(fp)
-                mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
-                size = stat.st_size
-            except OSError:
-                mtime = None
-                size = None
-            url_guess = get_url_from_filename(name)
-            job_id = url_to_job_id(url_guess)
-            upsert_cached_page(file_path=rel_fp, url_guess=url_guess,
-                               last_modified=mtime, size_bytes=size, job_id=job_id)


-def normalize_cached_page_paths() -> int:
-    """Ensure all cached_pages.file_path values are absolute. Returns number of rows updated/normalized."""
-    # Convert any absolute paths in DB to relative paths (relative to cache dir)
-    changed = 0
-    abs_cache = os.path.abspath(get_cache_dir())
-    with _ensure_session() as session:
-        rows = session.execute(text(
-            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
-    for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
-        if os.path.isabs(fp):
-            rel_fp = os.path.relpath(fp, start=abs_cache)
-            upsert_cached_page(
-                file_path=rel_fp,
-                url_guess=url_guess,
-                last_modified=last_modified,
-                size_bytes=size_bytes,
-                job_id=job_id,
-            )
-            with _ensure_session() as session:
-                session.execute(
-                    text("DELETE FROM cached_pages WHERE file_path = :fp"), {"fp": fp})
-                session.commit()
-            changed += 1
-    return changed


def db_get_keywords() -> List[str]:
    """Return a list of all unique keywords from job listings."""
    with _ensure_session() as session:
@@ -453,15 +703,10 @@ SELECT l.job_id
,l.timestamp
,d.posted_time
,l.url
-,c.file_path
-,c.last_modified
-,c.url_guess
-,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale
FROM job_listings AS l
INNER JOIN job_descriptions AS d
ON l.job_id = d.job_id
AND l.url = d.url
-LEFT JOIN cached_pages AS c ON l.job_id = c.job_id
ORDER BY d.posted_time DESC
"""
    with _ensure_session() as session:
@@ -479,32 +724,21 @@ ORDER BY d.posted_time DESC
                "timestamp": row[7],
                "posted_time": row[8],
                "url": row[9],
-                # file_path is stored relative to cache dir; provide both forms
-                "file_path": row[10],
-                "file_path_abs": os.path.join(os.path.abspath(get_cache_dir()), row[10]) if row[10] else None,
-                "last_modified": row[11],
-                "url_guess": row[12],
-                "url_guess_stale": row[13],
            }
            jobs.append(job)
        return jobs


-def db_get_cached_abs_path(db_file_path: Optional[str]) -> Optional[str]:
-    """Return absolute cache file path given a DB-stored (relative) file_path.
+def db_get_all_job_urls() -> List[dict]:
+    """Return list of job URLs from job_listings.

-    Returns None if input is falsy.
+    Returns:
+    - List of dicts with keys: url, region, keyword
    """
-    if not db_file_path:
-        return None
-    return os.path.join(os.path.abspath(get_cache_dir()), db_file_path)
-
-
-def db_get_all_job_urls() -> List[str]:
-    """Return list of job URLs from job_listings."""
    with _ensure_session() as session:
-        rows = session.execute(text("SELECT url FROM job_listings")).fetchall()
-        return [r[0] for r in rows]
+        rows = session.execute(
+            text("SELECT url, region, keyword FROM job_listings")).fetchall()
+        return [{"url": r[0], "region": r[1], "keyword": r[2]} for r in rows]


def db_delete_job(job_id: str | int):
@@ -522,10 +756,6 @@ def remove_job(url):
    try:
        jid = url_to_job_id(url)
        db_delete_job(jid)
-        cache_fp = get_cache_path(url)
-        remove_cached_page(os.path.abspath(cache_fp))
-        if os.path.exists(cache_fp):
-            os.remove(cache_fp)
    except Exception:
        pass

@@ -534,7 +764,8 @@ def remove_job(url):

def get_or_create_user(username: str) -> int:
    """Return user_id for username, creating if missing."""
-    created_at = datetime.now(UTC).isoformat()
+    # 2025-08-30T16:04:29.660245+00:00 is wrong. should be 2025-08-30T16:04:29
+    created_at = datetime.now(UTC).isoformat().split('.')[0]
    with _ensure_session() as session:
        row = session.execute(
            text("SELECT user_id FROM users WHERE username = :u"), {
@@ -654,22 +885,54 @@ def get_user(username: str) -> Optional[Dict[str, Any]]:
    """Return single user dict or None."""
    with _ensure_session() as session:
        row = session.execute(text(
-            "SELECT user_id, username, is_admin, is_active, password_hash, last_login, created_at FROM users WHERE username = :u"
+            "SELECT user_id, username, created_at, is_admin, is_active, last_login, (password_hash IS NOT NULL) AS has_pw FROM users WHERE username = :u"
        ), {"u": username}).fetchone()
        if not row:
            return None
        return {
            "user_id": int(row[0]),
            "username": row[1],
-            "is_admin": bool(row[2]),
-            "is_active": bool(row[3]),
-            "password_hash": row[4],
+            "created_at": row[2].isoformat() if isinstance(row[2], datetime) else (row[2] or None),
+            "is_admin": bool(row[3]),
+            "is_active": bool(row[4]),
            "last_login": row[5].isoformat() if row[5] else None,
-            "created_at": row[6].isoformat() if isinstance(row[6], datetime) else (row[6] or None),
+            "has_password": bool(row[6]),
        }


+def get_user_by_id(user_id: int) -> Optional[Dict[str, Any]]:
+    """Return single user dict or None."""
+    with _ensure_session() as session:
+        row = session.execute(text(
+            "SELECT user_id, username, created_at, is_admin, is_active, last_login, (password_hash IS NOT NULL) AS has_pw FROM users WHERE user_id = :u"
+        ), {"u": user_id}).fetchone()
+        if not row:
+            return None
+        return {
+            "user_id": int(row[0]),
+            "username": row[1],
+            "created_at": row[2].isoformat() if isinstance(row[2], datetime) else (row[2] or None),
+            "is_admin": bool(row[3]),
+            "is_active": bool(row[4]),
+            "last_login": row[5].isoformat() if row[5] else None,
+            "has_password": bool(row[6]),
+        }
+
+
+def delete_user_by_id(user_id: int) -> bool:
+    with _ensure_session() as session:
+        result = session.execute(
+            text("DELETE FROM users WHERE user_id = :u"), {"u": user_id})
+        session.commit()
+        rc = getattr(result, 'rowcount', None)
+        if rc is None:
+            # Unable to determine rowcount; assume success if no exception
+            return True
+        return rc > 0

# ---------------- Regions/Keywords helpers ---------------------------------


def upsert_region(name: str) -> int:
    """Get or create a region by name; return region_id."""
    name = (name or "").strip()
@@ -713,6 +976,27 @@ def upsert_keyword(name: str) -> int:
    return upsert_keyword(name)


+def upsert_negative_keyword(name: str) -> int:
+    """Get or create a negative keyword by name; return keyword_id."""
+    name = (name or "").strip().lower()
+    if not name:
+        raise ValueError("Negative keyword cannot be empty")
+    with _ensure_session() as session:
+        row = session.execute(text("SELECT keyword_id FROM negative_keywords WHERE name = :n"), {
+            "n": name}).fetchone()
+        if row:
+            return int(row[0])
+        session.execute(
+            text("INSERT INTO negative_keywords(name) VALUES (:n)"), {"n": name})
+        session.commit()
+    with _ensure_session() as session:
+        row2 = session.execute(text("SELECT keyword_id FROM negative_keywords WHERE name = :n"), {
+            "n": name}).fetchone()
+        if row2:
+            return int(row2[0])
+    return upsert_negative_keyword(name)
+
+
def set_user_regions(username: str, region_names: List[str]) -> None:
    """Replace user's preferred regions with given names."""
    user_id = get_or_create_user(username)
@@ -771,6 +1055,34 @@ def set_user_keywords(username: str, keyword_names: List[str]) -> None:
        session.commit()


+def set_user_negative_keywords(username: str, keyword_names: List[str]) -> None:
+    """Replace user's negative keywords with given names."""
+    user_id = get_or_create_user(username)
+    names = sorted({(n or "").strip().lower()
+                    for n in keyword_names if (n or "").strip()})
+    keyword_ids: List[int] = [upsert_negative_keyword(n) for n in names]
+    if not keyword_ids and not names:
+        with _ensure_session() as session:
+            session.execute(
+                text("DELETE FROM user_negative_keywords WHERE user_id = :u"), {"u": user_id})
+            session.commit()
+        return
+    desired = set(keyword_ids)
+    with _ensure_session() as session:
+        rows = session.execute(text("SELECT keyword_id FROM user_negative_keywords WHERE user_id = :u"), {
+            "u": user_id}).fetchall()
+        current = set(int(r[0]) for r in rows)
+        to_add = desired - current
+        to_remove = current - desired
+        for kid in to_remove:
+            session.execute(text("DELETE FROM user_negative_keywords WHERE user_id = :u AND keyword_id = :k"), {
+                "u": user_id, "k": int(kid)})
+        for kid in to_add:
+            session.execute(text("INSERT INTO user_negative_keywords(user_id, keyword_id) VALUES(:u, :k)"), {
+                "u": user_id, "k": int(kid)})
+        session.commit()
+
+
def get_user_regions(username: str) -> List[Dict[str, str]]:
    """Return preferred region names for a user (empty if none)."""
    with _ensure_session() as session:
@@ -811,6 +1123,26 @@ def get_user_keywords(username: str) -> List[Dict[str, str]]:
        return [{"name": r[0], "color": r[1]} for r in rows]


+def get_user_negative_keywords(username: str) -> List[str]:
+    """Return negative keyword names for a user (empty if none)."""
+    with _ensure_session() as session:
+        row = session.execute(text("SELECT user_id FROM users WHERE username = :u"), {
+            "u": username}).fetchone()
+        if not row:
+            return []
+        user_id = int(row[0])
+        rows = session.execute(text(
+            """
+            SELECT k.name
+            FROM negative_keywords k
+            INNER JOIN user_negative_keywords uk ON uk.keyword_id = k.keyword_id
+            WHERE uk.user_id = :u
+            ORDER BY k.name ASC
+            """
+        ), {"u": user_id}).fetchall()
+        return [r[0] for r in rows]
+
+
def get_all_regions() -> List[Dict[str, str]]:
    """Return all region names from regions table (sorted)."""
    with _ensure_session() as session:
@@ -941,3 +1273,44 @@ def change_keyword_color(keyword_id: int, new_color: str) -> bool:
        except Exception:
            session.rollback()
            return False
+
+
+def stats_overview() -> Dict[str, Any]:
+    """Return an overview of job DB statistics.
+
+    Returns a dict with keys:
+    - total_jobs: int
+    - total_keywords: int (distinct keywords in listings)
+    - total_regions: int (distinct regions in listings)
+    - jobs_per_keyword: List[{"keyword": str, "count": int}]
+    - jobs_per_region: List[{"region": str, "count": int}]
+    """
+    with _ensure_session() as session:
+        total_jobs = session.execute(text(
+            "SELECT COUNT(*) FROM job_listings l INNER JOIN job_descriptions d ON l.job_id = d.job_id AND l.url = d.url"
+        )).scalar_one()
+        total_keywords = session.execute(text(
+            "SELECT COUNT(DISTINCT keyword) FROM job_listings WHERE keyword IS NOT NULL AND keyword != ''"
+        )).scalar_one()
+        total_regions = session.execute(text(
+            "SELECT COUNT(DISTINCT region) FROM job_listings WHERE region IS NOT NULL AND region != ''"
+        )).scalar_one()
+
+        rows = session.execute(text(
+            "SELECT COALESCE(keyword, '') AS keyword, COUNT(*) as cnt FROM job_listings l INNER JOIN job_descriptions d ON l.job_id = d.job_id AND l.url = d.url GROUP BY keyword ORDER BY cnt DESC"
+        )).fetchall()
+        jobs_per_keyword = [
+            {"keyword": r[0], "count": int(r[1])} for r in rows]
+
+        rows = session.execute(text(
+            "SELECT COALESCE(region, '') AS region, COUNT(*) as cnt FROM job_listings l INNER JOIN job_descriptions d ON l.job_id = d.job_id AND l.url = d.url GROUP BY region ORDER BY cnt DESC"
+        )).fetchall()
+        jobs_per_region = [{"region": r[0], "count": int(r[1])} for r in rows]
+
+        return {
+            "total_jobs": int(total_jobs or 0),
+            "total_keywords": int(total_keywords or 0),
+            "total_regions": int(total_regions or 0),
+            "jobs_per_keyword": jobs_per_keyword,
+            "jobs_per_region": jobs_per_region,
+        }
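A minimal sketch of how the new stats_overview() helper could back the statistics page added below. Only the helper and the stats keys come from this diff; the Flask route body and the app object are assumptions.

from flask import render_template
from web.db import stats_overview


@app.route("/admin/stats")  # assumed route registration
def admin_stats():
    # stats_overview() returns total_jobs, total_keywords, total_regions,
    # jobs_per_keyword and jobs_per_region, which admin/stats.html renders.
    return render_template("admin/stats.html", stats=stats_overview())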
130
web/email_service.py
Normal file
@@ -0,0 +1,130 @@
"""Email sending utilities for the jobs scraper."""

from __future__ import annotations

from email.message import EmailMessage
from typing import Iterable, Sequence
import smtplib

from web.utils import get_email_settings


class EmailConfigurationError(RuntimeError):
    """Raised when email settings are missing or invalid."""


class EmailDeliveryError(RuntimeError):
    """Raised when an email fails to send."""


def _normalize_addresses(addresses: Sequence[str] | str | None) -> list[str]:
    if not addresses:
        return []
    if isinstance(addresses, str):
        items = [addresses]
    else:
        items = list(addresses)
    cleaned: list[str] = []
    seen: set[str] = set()
    for raw in items:
        if not isinstance(raw, str):
            continue
        addr = raw.strip()
        if not addr:
            continue
        lower = addr.lower()
        if lower in seen:
            continue
        seen.add(lower)
        cleaned.append(addr)
    return cleaned


def _ensure_recipients(*recipient_groups: Iterable[str]) -> list[str]:
    merged: list[str] = []
    seen: set[str] = set()
    for group in recipient_groups:
        for addr in group:
            lower = addr.lower()
            if lower in seen:
                continue
            seen.add(lower)
            merged.append(addr)
    if not merged:
        raise EmailConfigurationError(
            "At least one recipient address is required")
    return merged


def send_email(
    *,
    subject: str,
    body: str,
    to: Sequence[str] | str,
    cc: Sequence[str] | str | None = None,
    bcc: Sequence[str] | str | None = None,
    reply_to: Sequence[str] | str | None = None,
    settings: dict | None = None,
) -> bool:
    """Send an email using configured SMTP settings.

    Returns True when a message is sent, False when email is disabled.
    Raises EmailConfigurationError for invalid config and EmailDeliveryError for SMTP failures.
    """

    config = settings or get_email_settings()
    if not config.get("enabled"):
        return False

    smtp_cfg = config.get("smtp", {})
    host = (smtp_cfg.get("host") or "").strip()
    if not host:
        raise EmailConfigurationError("SMTP host is not configured")

    port = int(smtp_cfg.get("port", 587) or 587)
    timeout = int(smtp_cfg.get("timeout", 30) or 30)
    use_ssl = bool(smtp_cfg.get("use_ssl", False))
    use_tls = bool(smtp_cfg.get("use_tls", True))

    from_address = (config.get("from_address")
                    or smtp_cfg.get("username") or "").strip()
    if not from_address:
        raise EmailConfigurationError("From address is not configured")

    to_list = _normalize_addresses(to)
    cc_list = _normalize_addresses(cc)
    bcc_list = _normalize_addresses(bcc)
    reply_to_list = _normalize_addresses(reply_to)
    all_recipients = _ensure_recipients(to_list, cc_list, bcc_list)

    message = EmailMessage()
    message["Subject"] = subject
    message["From"] = from_address
    message["To"] = ", ".join(to_list)
    if cc_list:
        message["Cc"] = ", ".join(cc_list)
    if reply_to_list:
        message["Reply-To"] = ", ".join(reply_to_list)
    message.set_content(body)

    username = (smtp_cfg.get("username") or "").strip()
    password = smtp_cfg.get("password") or ""

    client_cls = smtplib.SMTP_SSL if use_ssl else smtplib.SMTP

    try:
        with client_cls(host=host, port=port, timeout=timeout) as client:
            client.ehlo()
            if use_tls and not use_ssl:
                client.starttls()
                client.ehlo()
            if username:
                client.login(username, password)
            client.send_message(message, from_addr=from_address,
                                to_addrs=all_recipients)
    except EmailConfigurationError:
        raise
    except Exception as exc:  # pragma: no cover - network errors depend on env
        raise EmailDeliveryError(str(exc)) from exc

    return True
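A short usage sketch for the helper above. The settings dict mirrors the keys send_email() reads; every address and credential here is a placeholder, not a project default.

from web.email_service import (
    send_email,
    EmailConfigurationError,
    EmailDeliveryError,
)

settings = {
    "enabled": True,
    "from_address": "alerts@example.com",  # placeholder sender
    "smtp": {
        "host": "smtp.example.com",
        "port": 587,
        "use_tls": True,
        "username": "alerts@example.com",
        "password": "app-password",  # placeholder credential
    },
}

try:
    sent = send_email(subject="3 new jobs", body="...",
                      to="me@example.com", settings=settings)
except EmailConfigurationError as exc:
    print(f"email misconfigured: {exc}")
except EmailDeliveryError as exc:
    print(f"SMTP delivery failed: {exc}")
else:
    print("sent" if sent else "email disabled")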
106
web/email_templates.py
Normal file
@@ -0,0 +1,106 @@
"""Email templates for job notifications."""

from __future__ import annotations

from datetime import datetime, UTC
from typing import Iterable, Mapping, Dict, Any

DEFAULT_DATETIME_FORMAT = "%Y-%m-%d %H:%M"
DEFAULT_JOB_ALERT_SUBJECT = "{count_label}{scope}"
DEFAULT_JOB_ALERT_BODY = (
    "Hi,\n\n{intro_line}{jobs_section}\n\nGenerated at {timestamp} UTC.\n"
    "You are receiving this message because job alerts are enabled.\n"
)


class _SafeDict(dict):
    def __missing__(self, key: str) -> str:
        return ""


def _format_template(template: str, context: Dict[str, Any]) -> str:
    safe_context = _SafeDict(
        {k: ("\n".join(str(v) for v in context[k]) if isinstance(
            context[k], list) else context[k]) for k in context}
    )
    return template.format_map(safe_context)


def render_job_alert_email(
    jobs: Iterable[Mapping[str, object]],
    *,
    region: str | None = None,
    keyword: str | None = None,
    generated_at: datetime | None = None,
    template_override: Mapping[str, str] | None = None,
) -> dict[str, Any]:
    """Render the subject/body for a job alert email.

    Returns a dict with subject/body strings and the context used to render them.
    """

    job_list = list(jobs)
    generated_at = generated_at or datetime.now(UTC)
    timestamp = generated_at.strftime(DEFAULT_DATETIME_FORMAT)

    scope_parts = []
    if region:
        scope_parts.append(f"region: {region}")
    if keyword:
        scope_parts.append(f"keyword: {keyword}")
    scope = " (" + ", ".join(scope_parts) + ")" if scope_parts else ""

    job_lines: list[str] = []
    for index, job in enumerate(job_list, start=1):
        title = str(job.get("title", "Untitled"))
        company = str(job.get("company", "Unknown company"))
        location = str(job.get("location", "N/A"))
        url = str(job.get("url", ""))
        line = f"{index}. {title} — {company} ({location})"
        job_lines.append(line)
        if url:
            job_lines.append(f" {url}")

    if job_lines:
        jobs_section = "\n" + "\n".join(job_lines)
    else:
        jobs_section = "\nNo jobs matched this alert."
    jobs_message = jobs_section.strip()
    context: Dict[str, Any] = {
        "count": len(job_list),
        "count_label": "No new jobs" if not job_list else f"{len(job_list)} new jobs",
        "scope": scope,
        "region": region or "",
        "keyword": keyword or "",
        "timestamp": timestamp,
        "generated_at": generated_at,
        "intro_line": "Here are the latest jobs discovered by the scraper:",
        "jobs_message": jobs_message,
        "jobs_section": jobs_section,
        "jobs_lines": job_lines,
        "has_jobs": bool(job_list),
    }

    template = template_override
    if template is None:
        try:
            from web.db import get_email_template_by_slug

            template = get_email_template_by_slug("job-alert")
        except Exception:
            template = None

    template_subject = (template or {}).get(
        "subject") or DEFAULT_JOB_ALERT_SUBJECT
    template_body = (template or {}).get("body") or DEFAULT_JOB_ALERT_BODY

    subject = _format_template(template_subject, context)
    body = _format_template(template_body, context)

    result = {
        "subject": subject,
        "body": body,
        "context": context,
        "template_slug": (template or {}).get("slug", "job-alert"),
    }
    return result
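Render-then-send is the intended flow: web/email_templates.py builds the subject and body, web/email_service.py delivers them. A rough sketch of wiring the two together; the job record and recipient are illustrative, only the title/company/location/url keys read above are assumed.

from web.email_templates import render_job_alert_email
from web.email_service import send_email

jobs = [
    {"title": "Backend Developer", "company": "Acme", "location": "Berlin",
     "url": "https://example.com/jobs/123"},  # illustrative job record
]
rendered = render_job_alert_email(jobs, region="berlin", keyword="python")
send_email(subject=rendered["subject"], body=rendered["body"],
           to=["alerts@example.com"])  # placeholder recipient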
133
web/scraper.py
@@ -1,7 +1,82 @@
from datetime import datetime, UTC
from bs4 import BeautifulSoup
from typing import List, Dict, Set
-from web.utils import get_base_url, cache_page, safe_get_text, safe_get_attr, is_cached, get_cached_content, make_request_with_retry
+from urllib.parse import urlparse, parse_qs
+import re
+from web.utils import (
+    get_base_url,
+    safe_get_text,
+    safe_get_attr,
+    make_request_with_retry,
+    get_negative_keywords,
+)
+
+
+def extract_contact_info(reply_url) -> Dict[str, str]:
+    """Extract contact information from reply URL.
+
+    Parses mailto links, phone links, and contact form URLs to extract:
+    - email: Email address (from mailto links)
+    - phone: Phone number (from tel links or URL parameters)
+    - contact_name: Contact person name (if available in URL parameters)
+
+    Returns a dict with email, phone, and contact_name keys (values may be "N/A").
+    """
+    contact_info = {
+        "email": "N/A",
+        "phone": "N/A",
+        "contact_name": "N/A"
+    }
+
+    # Handle None or empty cases
+    if not reply_url or reply_url == "N/A":
+        return contact_info
+
+    reply_url = str(reply_url).strip()
+    if not reply_url or reply_url == "N/A":
+        return contact_info
+
+    try:
+        # Check for mailto links
+        if reply_url.startswith("mailto:"):
+            email_part = reply_url.replace("mailto:", "")
+            # Extract email (may contain ?subject=...)
+            email = email_part.split("?")[0]
+            contact_info["email"] = email
+            return contact_info
+
+        # Check for tel links
+        if reply_url.startswith("tel:"):
+            phone = reply_url.replace("tel:", "")
+            contact_info["phone"] = phone
+            return contact_info
+
+        # Parse as URL
+        if reply_url.startswith("http"):
+            parsed = urlparse(reply_url)
+            params = parse_qs(parsed.query)
+
+            # Try to extract email from parameters
+            for key in ["email", "from_email", "sender_email", "contact_email"]:
+                if key in params:
+                    contact_info["email"] = params[key][0]
+                    break
+
+            # Try to extract phone from parameters
+            for key in ["phone", "tel", "telephone"]:
+                if key in params:
+                    contact_info["phone"] = params[key][0]
+                    break
+
+            # Try to extract contact name from parameters
+            for key in ["contact_name", "from_name", "name"]:
+                if key in params:
+                    contact_info["contact_name"] = params[key][0]
+                    break
+    except Exception:
+        pass
+
+    return contact_info


def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
@@ -40,6 +115,16 @@ def scrape_job_page(content: str, url: str) -> Dict:
    """Scrape job details from a job listing page."""
    soup = BeautifulSoup(content, "html.parser")

+    # Extract reply button
+    reply_button = soup.find("button", class_="reply-button")
+    if reply_button:
+        reply_url = safe_get_attr(reply_button, "data-href")
+    else:
+        reply_url = "N/A"
+
+    # Extract contact information from reply URL
+    contact_info = extract_contact_info(reply_url)
+
    # Extract each field
    title = safe_get_text(soup.find("h1", class_="postingtitle"))
    company = safe_get_text(soup.find("h2", class_="company-name"))
@@ -80,6 +165,30 @@ def scrape_job_page(content: str, url: str) -> Dict:
        job_id = ""
        posted_time = ""

+    # Negative keyword detection
+    negative_keyword_match = None
+    negative_match_field = None
+    negative_keywords = get_negative_keywords()
+    if negative_keywords:
+        fields_to_check = {
+            "title": title or "",
+            "company": company or "",
+            "location": location or "",
+            "description": description or "",
+        }
+        for keyword in negative_keywords:
+            if not keyword:
+                continue
+            pattern = re.compile(
+                r"\b" + re.escape(keyword) + r"\b", re.IGNORECASE)
+            for field_name, field_value in fields_to_check.items():
+                if field_value and pattern.search(field_value):
+                    negative_keyword_match = keyword
+                    negative_match_field = field_name
+                    break
+            if negative_keyword_match:
+                break
+
    return {
        "url": url,
        "title": title,
@@ -87,7 +196,14 @@ def scrape_job_page(content: str, url: str) -> Dict:
        "location": location,
        "description": description,
        "id": job_id,
-        "posted_time": posted_time
+        "posted_time": posted_time,
+        "reply_url": reply_url,
+        "contact_email": contact_info["email"],
+        "contact_phone": contact_info["phone"],
+        "contact_name": contact_info["contact_name"],
+        "negative_keyword_match": negative_keyword_match,
+        "negative_match_field": negative_match_field,
+        "is_negative_match": bool(negative_keyword_match),
    }


@@ -108,14 +224,7 @@ def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]
def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Process a single region and keyword."""
    url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
-    if is_cached(url):
-        content = get_cached_content(url)
-        cache_status = "CACHED"
-    else:
-        content = make_request_with_retry(url, 3)
-        if content is None:
-            return []
-        cache_page(url, content)
-        cache_status = "FETCHED"
-    _ = cache_status  # no-op to silence unused var
+    content = make_request_with_retry(url, 1)
+    if content is None:
+        return []
    return scrape_job_data(content, region, keyword, seen_urls)
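For reference, a small sketch of what the new extract_contact_info() returns for typical reply links; the URLs are made up, and the expected results simply follow the code above.

from web.scraper import extract_contact_info

# mailto: links yield the address before any ?subject=... suffix
print(extract_contact_info("mailto:hr@example.com?subject=Job"))
# {'email': 'hr@example.com', 'phone': 'N/A', 'contact_name': 'N/A'}

# contact-form URLs are mined for known query parameters
print(extract_contact_info("https://example.com/reply?phone=555-0100&from_name=Sam"))
# {'email': 'N/A', 'phone': '555-0100', 'contact_name': 'Sam'}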
@@ -41,12 +41,16 @@ function scrape(event) {
  event.preventDefault(); // Prevent the default form submission
  updateScrapeInfo("Scraping in progress...", "blue");
  fetch("/scrape")
-    .then((response) => response.json())
+    // expect HTML response containing "Scraping completed successfully!"
+    .then((response) => response.text())
    .then((data) => {
-      if (data.status) {
-        updateScrapeInfo(data.status, "green");
+      if (data.includes("Scraping completed successfully!")) {
+        updateScrapeInfo("Scraping completed successfully!", "green");
      } else {
-        updateScrapeInfo("Scraping failed. Please try again.", "red");
+        updateScrapeInfo(
+          "Scraping failed or timed out. Please try again.",
+          "red"
+        );
      }
    })
    .catch((error) => console.error("Error:", error));
44
web/static/scrape.js
Normal file
@@ -0,0 +1,44 @@
function startScrape() {
  const output = document.getElementById("output");
  const startButton = document.getElementById("start-scrape");

  output.textContent = "Starting scrape...\n";
  startButton.disabled = true;
  startButton.textContent = "Scraping...";

  fetch("/scrape")
    .then((response) => {
      const reader = response.body.getReader();
      const decoder = new TextDecoder();

      function readStream() {
        reader
          .read()
          .then(({ done, value }) => {
            if (done) {
              output.textContent += "\nScraping completed!";
              startButton.disabled = false;
              startButton.textContent = "Start Scraping";
              return;
            }

            const chunk = decoder.decode(value, { stream: true });
            output.textContent += chunk;
            output.scrollTop = output.scrollHeight;
            readStream();
          })
          .catch((error) => {
            output.textContent += `\nError: ${error.message}`;
            startButton.disabled = false;
            startButton.textContent = "Start Scraping";
          });
      }

      readStream();
    })
    .catch((error) => {
      output.textContent = `Error starting scrape: ${error.message}`;
      startButton.disabled = false;
      startButton.textContent = "Start Scraping";
    });
}
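scrape.js consumes the /scrape response as a stream, so progress only appears if the server writes its output incrementally. A hedged sketch of what such a Flask route could look like; none of this route code is in the diff, and pairs_to_scrape() is a hypothetical helper.

from flask import Response, stream_with_context


@app.route("/scrape")  # assumed registration on the existing app object
def scrape_stream():
    def progress():
        yield "Starting scrape...\n"
        for region, keyword in pairs_to_scrape():  # hypothetical helper
            yield f"Scraping {region} / {keyword}\n"
        # the form handler above checks the response text for this phrase
        yield "Scraping completed successfully!\n"

    return Response(stream_with_context(progress()), mimetype="text/plain")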
@@ -1,4 +1,22 @@
/* javascript form handling */
+document.addEventListener("DOMContentLoaded", function () {
+  const newNkInput = document.getElementById("new-negative-keyword");
+  if (newNkInput) {
+    newNkInput.addEventListener("input", function () {
+      const val = this.value.trim();
+      const existing = Array.from(
+        document.querySelectorAll('input[name="negative_keyword"]')
+      ).map((el) => el.value);
+      if (existing.includes(val)) {
+        this.setCustomValidity("Keyword already exists");
+        this.reportValidity();
+      } else {
+        this.setCustomValidity("");
+      }
+    });
+  }
+});
+
document
  .getElementById("user-settings-form")
  .addEventListener("submit", function (event) {
@@ -10,11 +28,15 @@ document
    // Collect selected regions and keywords
    const selectedRegions = [];
    const selectedKeywords = [];
+    const selectedNegativeKeywords = [];
+
    formData.forEach((value, key) => {
      if (key === "region") {
        selectedRegions.push(value);
      } else if (key === "keyword") {
        selectedKeywords.push(value);
+      } else if (key === "negative_keyword") {
+        selectedNegativeKeywords.push(value);
      }
    });

@@ -30,10 +52,21 @@ document
      selectedKeywords.push(newKeyword);
    }

+    // Add new negative keyword if provided
+    const newNegativeKeyword = formData.get("new-negative-keyword").trim();
+    if (newNegativeKeyword) {
+      if (selectedNegativeKeywords.includes(newNegativeKeyword)) {
+        alert("Negative keyword already exists!");
+        return;
+      }
+      selectedNegativeKeywords.push(newNegativeKeyword);
+    }
+
    // Prepare data to send
    const dataToSend = {
      regions: selectedRegions,
      keywords: selectedKeywords,
+      negative_keywords: selectedNegativeKeywords,
      csrf_token: formData.get("csrf_token"),
    };

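The settings form now posts a negative_keywords list alongside regions and keywords. A rough sketch of matching server-side handling that reuses the db helpers added earlier; the route name, session-based auth lookup, and request shape are assumptions.

from flask import request, jsonify, session
from web.db import (
    set_user_regions,
    set_user_keywords,
    set_user_negative_keywords,
)


@app.route("/user/settings", methods=["POST"])  # assumed route
def save_user_settings():
    data = request.get_json() or {}
    username = session["username"]  # assumed auth lookup
    set_user_regions(username, data.get("regions", []))
    set_user_keywords(username, data.get("keywords", []))
    set_user_negative_keywords(username, data.get("negative_keywords", []))
    return jsonify({"status": "ok"})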
62
web/templates/admin/email.html
Normal file
@@ -0,0 +1,62 @@
{% extends 'base.html' %} {% block content %}
<h2>Email Subscriptions</h2>
<section>
  <h3>Add Subscription</h3>
  <form method="post">
    <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
    <input type="hidden" name="action" value="subscribe" />
    <label for="email">Email address</label>
    <input
      type="email"
      id="email"
      name="email"
      placeholder="alerts@example.com"
      required
    />
    <button type="submit">Subscribe</button>
  </form>
</section>
<section>
  <h3>Current Recipients</h3>
  {% if not subscriptions %}
  <p>No subscriptions yet. Add one above to start sending alerts.</p>
  <p>You can customize alert content from the <a href="{{ url_for('admin_email_templates') }}">Email Templates</a> page.</p>
  {% else %}
  <p>{{ total_active }} active of {{ total }} total.</p>
  <table>
    <thead>
      <tr>
        <th>Email</th>
        <th>Status</th>
        <th>Created</th>
        <th>Updated</th>
        <th>Action</th>
      </tr>
    </thead>
    <tbody>
      {% for sub in subscriptions %}
      <tr>
        <td>{{ sub.email }}</td>
        <td>{{ 'Active' if sub.is_active else 'Inactive' }}</td>
        <td>{{ sub.created_at }}</td>
        <td>{{ sub.updated_at }}</td>
        <td>
          <form method="post" style="display: inline-flex; gap: 0.5rem">
            <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
            <input type="hidden" name="email" value="{{ sub.email }}" />
            {% if sub.is_active %}
            <input type="hidden" name="action" value="unsubscribe" />
            <button type="submit">Deactivate</button>
            {% else %}
            <input type="hidden" name="action" value="reactivate" />
            <button type="submit">Reactivate</button>
            {% endif %}
          </form>
        </td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
  {% endif %}
</section>
{% endblock %}
102
web/templates/admin/email_templates.html
Normal file
@@ -0,0 +1,102 @@
{% extends 'base.html' %}
{% block content %}
<h2>Email Templates</h2>
<section>
  <h3>Available Templates</h3>
  {% if not templates %}
  <p>No templates found. Create one below to get started.</p>
  {% else %}
  <table>
    <thead>
      <tr>
        <th>Name</th>
        <th>Slug</th>
        <th>Status</th>
        <th>Updated</th>
        <th>Actions</th>
      </tr>
    </thead>
    <tbody>
      {% for template in templates %}
      <tr>
        <td>{{ template.name }}</td>
        <td>{{ template.slug }}</td>
        <td>{{ 'Active' if template.is_active else 'Inactive' }}</td>
        <td>{{ template.updated_at or template.created_at or '' }}</td>
        <td style="display: flex; gap: 0.5rem;">
          <a class="button" href="{{ url_for('admin_email_templates', template_id=template.template_id) }}">Edit</a>
          <a class="button" href="{{ url_for('admin_email_templates', preview_id=template.template_id) }}">Preview</a>
          <form method="post" onsubmit="return confirm('Delete template {{ template.name }}?');">
            <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
            <input type="hidden" name="action" value="delete" />
            <input type="hidden" name="template_id" value="{{ template.template_id }}" />
            <button type="submit">Delete</button>
          </form>
        </td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
  {% endif %}
</section>
<section>
  <h3>{{ 'Edit Template' if editing else 'Create Template' }}</h3>
  <form method="post">
    <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
    <input type="hidden" name="action" value="{{ 'update' if editing else 'create' }}" />
    {% if editing %}
    <input type="hidden" name="template_id" value="{{ editing.template_id }}" />
    {% endif %}
    <div>
      <label for="name">Name</label>
      <input type="text" id="name" name="name" value="{{ editing.name if editing else '' }}" required />
    </div>
    <div>
      <label for="slug">Slug</label>
      <input type="text" id="slug" name="slug" placeholder="job-alert" value="{{ editing.slug if editing else '' }}" />
      <small>Leave blank to reuse the name. Slug must be URL friendly (letters, numbers, dashes).</small>
    </div>
    <div>
      <label for="subject">Subject Template</label>
      <input type="text" id="subject" name="subject" value="{{ editing.subject if editing else '' }}" required />
    </div>
    <div>
      <label for="body">Body Template</label>
      <textarea id="body" name="body" rows="12" required>{{ editing.body if editing else '' }}</textarea>
    </div>
    <div>
      <label>
        <input type="checkbox" name="is_active" {% if editing is none or editing.is_active %}checked{% endif %} />
        Active
      </label>
    </div>
    <button type="submit">{{ 'Update Template' if editing else 'Create Template' }}</button>
    {% if editing %}
    <a class="button" href="{{ url_for('admin_email_templates') }}">Cancel</a>
    {% endif %}
  </form>
  <aside>
    <h4>Available placeholders</h4>
    <ul>
      <li><code>{count}</code> – number of jobs in the alert</li>
      <li><code>{count_label}</code> – "No new jobs" or "X new jobs"</li>
      <li><code>{scope}</code> – formatted region/keyword context</li>
      <li><code>{region}</code>, <code>{keyword}</code></li>
      <li><code>{timestamp}</code> – formatted timestamp</li>
      <li><code>{jobs_section}</code> – newline-prefixed block of job entries</li>
      <li><code>{jobs_message}</code> – jobs block without leading newline</li>
    </ul>
  </aside>
</section>
{% if preview %}
<section>
  <h3>Preview: {{ preview_template.name if preview_template else 'Job Alert' }}</h3>
  <article>
    <h4>Subject</h4>
    <pre>{{ preview.subject }}</pre>
    <h4>Body</h4>
    <pre>{{ preview.body }}</pre>
  </article>
</section>
{% endif %}
{% endblock %}
46
web/templates/admin/stats.html
Normal file
@@ -0,0 +1,46 @@
{% extends 'base.html' %} {% block content %}
<div id="admin-stats">
  <h2>Database Statistics</h2>
  <div class="stats-summary">
    <p><strong>Total jobs:</strong> {{ stats.total_jobs }}</p>
    <p><strong>Total keywords:</strong> {{ stats.total_keywords }}</p>
    <p><strong>Total regions:</strong> {{ stats.total_regions }}</p>
  </div>

  <h3>Jobs per keyword</h3>
  <table>
    <thead>
      <tr>
        <th>Keyword</th>
        <th>Count</th>
      </tr>
    </thead>
    <tbody>
      {% for row in stats.jobs_per_keyword %}
      <tr>
        <td>{{ row.keyword or '(empty)' }}</td>
        <td>{{ row.count }}</td>
      </tr>
      {% endfor %}
    </tbody>
  </table>

  <h3>Jobs per region</h3>
  <table>
    <thead>
      <tr>
        <th>Region</th>
        <th>Count</th>
      </tr>
    </thead>
    <tbody>
      {% for row in stats.jobs_per_region %}
      <tr>
        <td>{{ row.region or '(empty)' }}</td>
        <td>{{ row.count }}</td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
</div>
{% endblock %}
114
web/templates/admin/user.html
Normal file
@@ -0,0 +1,114 @@
{% extends 'base.html' %} {% block content %}
<div id="user-details">
  {% if not user %}
  <h2>Create new user</h2>
  <form id="new-user-form" method="post" action="{{ url_for('admin_users') }}">
    <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
    <div id="user-info">
      <p>
        <strong>Username:</strong>
        <input type="text" name="username" required />
      </p>
      <p>
        <strong>Password:</strong>
        <input type="password" name="password" required />
      </p>
      <p>
        <strong>Admin:</strong>
        <input type="checkbox" name="is_admin" />
      </p>
      <p>
        <strong>Active:</strong>
        <input type="checkbox" name="is_active" />
      </p>
      <button type="submit">Create User</button>
    </div>
  </form>
  {% else %}
  <h2>User {{ user.username }}</h2>
  <form
    id="user-form"
    method="post"
    action="{{ url_for('admin_user', user_id=user.user_id) }}"
  >
    <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
    <input type="hidden" name="user_id" value="{{ user.user_id }}" />
    <input type="hidden" name="username" value="{{ user.username }}" />
    <div id="user-info">
      <p><strong>ID:</strong> {{ user.user_id }}</p>
      <p><strong>Username:</strong> {{ user.username }}</p>
      <p><strong>Created At:</strong> {{ user.created_at }}</p>
      <p><strong>Last Login:</strong> {{ user.last_login }}</p>
      <p>
        <strong>Admin:</strong>
        <input type="checkbox" name="is_admin" {{ 'checked' if user.is_admin
        else '' }} />
      </p>
      <p>
        <strong>Active:</strong>
        <input type="checkbox" name="is_active" {{ 'checked' if user.is_active
        else '' }} />
      </p>
      <p>
        <strong>Has Password:</strong> {{ '✅' if user.has_password else '❌' }}
      </p>
      <p>
        <strong>New Password:</strong>
        <input type="password" id="new_password" name="new_password" />
      </p>
      <button type="submit">Save</button>
    </div>
  </form>
</div>
<script>
  const userForm = document.getElementById("user-form");

  userForm.addEventListener("submit", function (event) {
    const userId = document.getElementById("user_id").value;
    event.preventDefault(); // Prevent the default form submission
    updateUser(userId);
  });

  function updateUser(userId) {
    const passwordInput = document.getElementById("new_password");
    const formData = userForm.elements;
    const username = formData.username.value;
    const password = passwordInput.value;
    const isAdmin = formData.is_admin.checked;
    const isActive = formData.is_active.checked;
    const hasPassword = passwordInput.value.trim() !== "";

    fetch("/admin/user/" + userId, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-CSRF-Token": formData.csrf_token.value,
      },
      body: JSON.stringify({
        user_id: userId,
        password: password,
        username: username,
        is_admin: isAdmin,
        is_active: isActive,
      }),
    })
      .then((response) => {
        if (response.ok) {
          alert("User updated successfully");
          // Clear the password field after successful update
          passwordInput.value = "";
          // Set 'has_password' indicator
          userForm.querySelector('input[name="has_password"]').value =
            hasPassword ? "✅" : "❌";
        } else {
          alert("Error updating user");
        }
      })
      .catch((error) => {
        console.error("Error:", error);
        alert("Error updating user");
      });
  }
</script>

{% endif %} {% endblock %} {% block footer_scripts %} {% endblock %}
@@ -1,139 +1,100 @@
|
|||||||
{% extends 'base.html' %} {% block content %}
|
{% extends 'base.html' %} {% block content %}
|
||||||
<div id="users">
|
<div id="users">
|
||||||
<h2>Users</h2>
|
<h2>Users</h2>
|
||||||
<form id="user-form" method="post" action="{{ url_for('admin_users') }}">
|
|
||||||
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
|
||||||
<table>
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th>ID</th>
|
|
||||||
<th>Username</th>
|
|
||||||
<th>Admin</th>
|
|
||||||
<th>Active</th>
|
|
||||||
<th colspan="2">Password</th>
|
|
||||||
<th>Created</th>
|
|
||||||
<th>Last Login</th>
|
|
||||||
<th></th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody>
|
|
||||||
{% for u in users %}
|
|
||||||
<tr class="user-row" data-user-id="{{ u.user_id }}">
|
|
||||||
<td>
|
|
||||||
{{ u.user_id }}<input
|
|
||||||
type="hidden"
|
|
||||||
name="user_id"
|
|
||||||
value="{{ u.user_id }}"
|
|
||||||
/>
|
|
||||||
</td>
|
|
||||||
<td>
|
|
||||||
<input
|
|
||||||
type="text"
|
|
||||||
name="username"
|
|
||||||
value="{{ u.username }}"
|
|
||||||
required
|
|
||||||
/>
|
|
||||||
</td>
|
|
||||||
<td>
|
|
||||||
<input type="checkbox" name="is_admin" {{ 'checked' if u.is_admin
|
|
||||||
else '' }} />
|
|
||||||
</td>
|
|
||||||
<td>
|
|
||||||
<input type="checkbox" name="is_active" {{ 'checked' if u.is_active
|
|
||||||
else '' }} />
|
|
||||||
</td>
|
|
||||||
<td>{{ '✅' if u.has_password else '❌' }}</td>
|
|
||||||
<td><input type="password" name="password" /></td>
|
|
||||||
<td>{{ u.created_at }}</td>
|
|
||||||
<td>{{ u.last_login or 'never' }}</td>
|
|
||||||
<td>
|
|
||||||
<button type="submit" data-user-id="{{ u.user_id }}">Save</button>
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
{% endfor %}
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
</form>
|
|
||||||
</div>
|
|
||||||
<h3>Create / Update User</h3>
|
|
||||||
<form
|
|
||||||
id="create-update-user-form"
|
|
||||||
method="post"
|
|
||||||
action="{{ url_for('admin_users') }}"
|
|
||||||
>
|
|
||||||
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
<label>Username <input type="text" name="username" required /></label>
|
<table>
|
||||||
<label>Password <input type="password" name="password" /></label>
|
<thead>
|
||||||
<label>Admin <input type="checkbox" name="is_admin" value="1" /></label>
|
<tr>
|
||||||
<label
|
<th>ID</th>
|
||||||
>Active <input type="checkbox" name="is_active" value="1" checked
|
<th>Username</th>
|
||||||
/></label>
|
<th>Admin</th>
|
||||||
<button type="submit">Save</button>
|
<th>Active</th>
|
||||||
</form>
|
<th>Password</th>
|
||||||
|
<th>Created</th>
|
||||||
|
<th>Last Login</th>
|
||||||
|
|
||||||
|
<th>Edit</th>
|
||||||
|
<th>Delete</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for u in users %}
|
||||||
|
<tr class="user-row" data-user-id="{{ u.user_id }}">
|
||||||
|
<td>{{ u.user_id }}</td>
|
||||||
|
<td>
|
||||||
|
<a href="{{ url_for('admin_user', user_id=u.user_id) }}"
|
||||||
|
>{{ u.username }}</a
|
||||||
|
>
|
||||||
|
</td>
|
||||||
|
<td>{{ '✅' if u.is_admin else '❌' }}</td>
|
||||||
|
<td>{{ '✅' if u.is_active else '❌' }}</td>
|
||||||
|
<td>{{ '✅' if u.has_password else '❌' }}</td>
|
||||||
|
<td>{{ u.created_at }}</td>
|
||||||
|
<td>{{ u.last_login or 'never' }}</td>
|
||||||
|
<td>
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
class="edit-user"
|
||||||
|
data-user-id="{{ u.user_id }}"
|
||||||
|
onclick="editUser({{ u.user_id }})"
|
||||||
|
>
|
||||||
|
Edit
|
||||||
|
</button>
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
class="delete-user"
|
||||||
|
data-user-id="{{ u.user_id }}"
|
||||||
|
onclick="deleteUser({{ u.user_id }})"
|
||||||
|
>
|
||||||
|
Delete
|
||||||
|
</button>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<h2>Create New User</h2>
|
||||||
|
<a href="{{ url_for('admin_user', user_id='new') }}">Create User</a>
|
||||||
 {% endblock %} {% block footer_scripts %}
 <script>
-  function updateUser(userId) {
-    const row = document.querySelector(`.user-row[data-user-id="${userId}"]`);
-    const passwordInput = row.querySelector('input[name="password"]');
-    const hasPassword =
-      row.querySelector("td:nth-child(5)").textContent.trim() === "✅";
-    const formData = row.querySelector("form").elements;
-    const username = formData.username.value;
-    const password = hasPassword ? passwordInput.value : undefined;
-    const isAdmin = formData.is_admin.checked;
-    const isActive = formData.is_active.checked;
-    fetch("/admin/users", {
-      method: "POST",
-      headers: {
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify({
-        user_id: userId,
-        password: password,
-        username: username,
-        is_admin: isAdmin,
-        is_active: isActive,
-        csrf_token: formData.csrf_token.value,
-      }),
-    })
-      .then((response) => {
-        if (response.ok) {
-          alert("User updated successfully");
-          // Clear the password field after successful update
-          passwordInput.value = "";
-        } else {
-          alert("Error updating user");
-        }
-      })
-      .catch((error) => {
-        console.error("Error:", error);
-        alert("Error updating user");
-      });
-  }
-
-  function initUserForm() {
-    const form = document.getElementById("user-form");
-    const createUpdateForm = document.getElementById("create-update-user-form");
-
-    form.addEventListener("submit", function (event) {
-      const userId = event.target.querySelector('input[name="user_id"]').value;
-      event.preventDefault(); // Prevent the default form submission
-      updateUser(userId);
-    });
-
-    form.addEventListener("click", function (event) {
-      const userId = event.target.closest(".user-row").dataset.userId;
-      updateUser(userId);
-    });
-
-    createUpdateForm.addEventListener("submit", function (event) {
-      const passwordInput = createUpdateForm.querySelector(
-        'input[name="password"]'
-      );
-    });
-  }
-
-  initUserForm();
+  function editUser(userId) {
+    window.location.href = `/admin/user/${userId}`;
+  }
+  function deleteUser(userId) {
+    if (
+      confirm(
+        "Are you sure you want to delete this user? This action cannot be undone."
+      )
+    ) {
+      fetch(`/admin/user/${userId}/delete`, {
+        method: "POST",
+        headers: {
+          "Content-Type": "application/json",
+          "X-CSRFToken": document.querySelector('input[name="csrf_token"]')
+            .value,
+        },
+      })
+        .then((response) => {
+          if (response.ok) {
+            // Remove the user row from the table
+            const row = document.querySelector(
+              `.user-row[data-user-id="${userId}"]`
+            );
+            if (row) {
+              row.remove();
+            }
+          } else {
+            alert("Error deleting user.");
+          }
+        })
+        .catch((error) => {
+          console.error("Error:", error);
+          alert("Error deleting user.");
+        });
+    }
+  }
 </script>
 {% endblock %}
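Note: the rewritten deleteUser() above posts to /admin/user/<user_id>/delete with an X-CSRFToken header, but the matching server-side route is not part of this comparison. A minimal Flask sketch of such a handler, assuming the route path from the script, a session-stored CSRF token, and a hypothetical delete_user() data-layer helper, could look like this:

# Hypothetical sketch only -- this route is NOT shown in the diff.
# The names delete_user(), the session keys, and the secret key are assumptions.
from flask import Flask, abort, jsonify, request, session

app = Flask(__name__)
app.secret_key = "change-me"


def delete_user(user_id: int) -> None:
    """Placeholder for the real data-layer delete (assumed helper)."""


@app.post("/admin/user/<int:user_id>/delete")
def admin_delete_user(user_id: int):
    if not session.get("is_admin"):
        abort(403)
    # deleteUser() sends the page's hidden csrf_token value in this header.
    token = request.headers.get("X-CSRFToken", "")
    if not token or token != session.get("csrf_token"):
        abort(400)
    delete_user(user_id)
    return jsonify({"ok": True})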
@@ -16,15 +16,21 @@
 <header>
   <h1><a href="/">{{ title or 'Admin' }}</a></h1>
   <nav>
-    {% if username %}<span>Hi, {{ username }}</span> | {% endif %}
-    <a href="{{ url_for('index') }}">Home</a> |
-    <a href="{{ url_for('user_settings') }}">Preferences</a>
-    {% if current_user and current_user.is_admin %} |
-    <a href="{{ url_for('admin_taxonomy') }}">Taxonomy</a> |
-    <a href="{{ url_for('admin_users') }}">Users</a> {% endif %} {% if
-    session.get('username') %} |
-    <a href="{{ url_for('logout') }}">Logout</a> {% else %} |
-    <a href="{{ url_for('login') }}">Login</a>{% endif %}
+    <div id="navigation">
+      {% if username %}<span>Hi, {{ username }}</span> | {% endif %}
+      <a href="{{ url_for('index') }}">Home</a> |
+      <a href="{{ url_for('user_settings') }}">Preferences</a>
+      {% if current_user and current_user.is_admin %} |
+      <a href="{{ url_for('scrape_page') }}">Scrape Jobs</a> |
+      <a href="{{ url_for('admin_taxonomy') }}">Taxonomy</a> |
+      <a href="{{ url_for('admin_stats') }}">Statistics</a> |
+      <a href="{{ url_for('admin_emails') }}">Email Alerts</a> |
+      <a href="{{ url_for('admin_email_templates') }}">Email Templates</a> |
+      <a href="{{ url_for('admin_users') }}">Users</a> {% endif %} {% if
+      session.get('username') %} |
+      <a href="{{ url_for('logout') }}">Logout</a> {% else %} |
+      <a href="{{ url_for('login') }}">Login</a>{% endif %}
+    </div>
   </nav>
   {% with messages = get_flashed_messages() %} {% if messages %}
   <ul>
@@ -44,7 +44,8 @@
 <div id="jobs">
   {% for job in jobs %}
   <div class="job">
-    <h3><a href="{{ job['url'] }}" target="_blank">{{ job['title'] }}</a></h3>
+    <!--<h3><a href="{{ job['url'] }}" target="_blank">{{ job['title'] }}</a></h3>-->
+    <h3><a href="{{ url_for('job_by_id', job_id=job['id']) }}" target="_blank">{{ job['title'] }}</a></h3>
     <p class="job-posted-time">{{ job['posted_time'] }}</p>
     <span class="job-region region-{{ job['region'] }}">{{ job['region'] }}</span>
     <span class="job-keyword keyword-{{ job['keyword']|replace(' ', '')|lower }}">{{ job['keyword'] }}</span>
@@ -23,13 +23,5 @@ styles %}{% endblock %} {% block content %}
     >{{ job.title }}</a
   >
   </p>
-  {% if job.file_path_abs or job.file_path %}
-  <p>
-    <strong>Cached copy:</strong>
-    <a href="{{ url_for('serve_cached', job_id=job.id) }}" target="_blank"
-      >View cached copy</a
-    >
-  </p>
-  {% endif %}
 </div>
 {% endblock %}
22 web/templates/scrape.html Normal file
@@ -0,0 +1,22 @@
+{% extends "base.html" %} {% block title %}Scrape Jobs{% endblock %} {% block
+content %}
+<div id="scrape-container">
+  <h2>Job Scraping Progress</h2>
+  <button id="start-scrape" onclick="startScrape()">Start Scraping</button>
+  <div
+    id="output"
+    style="
+      margin-top: 20px;
+      padding: 10px;
+      border: 1px solid #ccc;
+      height: 400px;
+      overflow-y: auto;
+      background-color: #f9f9f9;
+      font-family: monospace;
+      white-space: pre-wrap;
+    "
+  ></div>
+</div>
+{% endblock %} {% block scripts %}
+<script src="{{ url_for('static', filename='scrape.js') }}"></script>
+{% endblock %}
@@ -77,6 +77,29 @@ block content %}
   <p>No keywords available. Ask an admin to add some.</p>
   {% endif %}
 </fieldset>
+<fieldset>
+  <legend>Negative Keywords</legend>
+  <p>
+    <small>Add new Negative Keyword:</small>
+    <input
+      type="text"
+      name="new-negative-keyword"
+      id="new-negative-keyword"
+      value=""
+      placeholder="Type a keyword and save to add"
+      size="30"
+    />
+  </p>
+  {% if user_negative_keywords %} {% for nk in user_negative_keywords %}
+  <label style="display: block">
+    <input type="checkbox" name="negative_keyword" value="{{ nk }}" checked />
+    {{ nk }}
+  </label>
+  {% endfor %} {% else %}
+  <p>No negative keywords set.</p>
+  {% endif %}
+  <p><small>Uncheck to remove.</small></p>
+</fieldset>
 <button type="submit">Save</button>
 </form>
 {% endblock %} {% block footer_scripts %}
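Note: this hunk only adds the form fields (the new-negative-keyword text input and the negative_keyword checkboxes, where unchecking removes a keyword); the handler that reads them is not shown in this comparison. A rough sketch, assuming a Flask request context and treating checked boxes as the keywords to keep, might be:

# Hypothetical sketch only -- the settings handler is not part of this diff.
from flask import request


def read_negative_keywords_from_form() -> list[str]:
    # Checked boxes are the negative keywords the user wants to keep.
    kept = [k.strip().lower()
            for k in request.form.getlist("negative_keyword") if k.strip()]
    # The free-text input contributes at most one new keyword per save.
    new_kw = (request.form.get("new-negative-keyword") or "").strip().lower()
    if new_kw and new_kw not in kept:
        kept.append(new_kw)
    return kept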
153 web/utils.py
@@ -2,7 +2,7 @@
 Utility functions for the Craigslist scraper.
 """

-from typing import Any, Optional as _Optional
+from typing import Any, Optional, List, Dict
 from datetime import datetime, UTC
 import json
 import os
@@ -10,7 +10,6 @@ import random
 import re
 import requests
 import time
-from typing import Optional, List, Dict


 def get_config_file() -> str:
@@ -94,10 +93,6 @@ def get_paths() -> dict:
     return get_config().get('paths', {})


-def get_cache_dir() -> str:
-    return get_paths().get('cache_dir', 'cache')
-
-
 def get_logs_dir() -> str:
     return get_paths().get('logs_dir', 'logs')

@@ -130,9 +125,64 @@ def get_base_url() -> str:
     return get_config().get('scraper', {}).get('base_url', "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel")


-def ensure_cache_dir():
-    """Ensure cache directory exists."""
-    os.makedirs(get_cache_dir(), exist_ok=True)
+def get_negative_keywords() -> List[str]:
+    """Return normalized list of negative keywords from config."""
+    raw = get_config().get('scraper', {}).get('negative_keywords', [])
+    if not isinstance(raw, list):
+        return []
+    cleaned: List[str] = []
+    for item in raw:
+        if not isinstance(item, str):
+            continue
+        val = item.strip()
+        if not val:
+            continue
+        cleaned.append(val.lower())
+    return cleaned
+
+
+def get_email_settings() -> Dict[str, Any]:
+    """Return normalized email settings from config."""
+    cfg = get_config().get('email', {})
+    if not isinstance(cfg, dict):
+        cfg = {}
+    raw_smtp = cfg.get('smtp', {}) if isinstance(cfg.get('smtp'), dict) else {}
+    raw_recipients = cfg.get('recipients', [])
+
+    def _to_int(value, default):
+        try:
+            return int(value)
+        except (TypeError, ValueError):
+            return default
+
+    recipients: List[str] = []
+    if isinstance(raw_recipients, list):
+        for item in raw_recipients:
+            if isinstance(item, str):
+                addr = item.strip()
+                if addr:
+                    recipients.append(addr)
+
+    smtp = {
+        'host': (raw_smtp.get('host') or '').strip(),
+        'port': _to_int(raw_smtp.get('port', 587), 587),
+        'username': (raw_smtp.get('username') or '').strip(),
+        'password': raw_smtp.get('password') or '',
+        'use_tls': bool(raw_smtp.get('use_tls', True)),
+        'use_ssl': bool(raw_smtp.get('use_ssl', False)),
+        'timeout': _to_int(raw_smtp.get('timeout', 30), 30),
+    }
+    if smtp['port'] <= 0:
+        smtp['port'] = 587
+    if smtp['timeout'] <= 0:
+        smtp['timeout'] = 30
+
+    return {
+        'enabled': bool(cfg.get('enabled', False)),
+        'from_address': (cfg.get('from_address') or '').strip(),
+        'smtp': smtp,
+        'recipients': recipients,
+    }


 def now_iso() -> str:
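Note: the two new helpers read config keys that are not themselves shown in this comparison. An illustrative config shape they would accept (key names taken only from the code above; the real config file may contain more) is:

# Illustrative only: a dict that get_negative_keywords() and
# get_email_settings() would accept once loaded by get_config().
example_config = {
    "scraper": {
        "negative_keywords": ["unpaid", "commission only", "  MLM  "],
    },
    "email": {
        "enabled": True,
        "from_address": "alerts@example.com",
        "recipients": ["me@example.com"],
        "smtp": {
            "host": "smtp.example.com",
            "port": "587",        # strings are coerced by _to_int()
            "username": "alerts",
            "password": "secret",
            "use_tls": True,
        },
    },
}
# With this config, get_negative_keywords() returns
# ["unpaid", "commission only", "mlm"], and get_email_settings() fills in
# the defaults for use_ssl (False) and timeout (30).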
@@ -174,12 +224,6 @@ def get_url_from_filename(name: str) -> str:
     return url_guess


-def get_cached_content(url: str) -> str:
-    """Get cached content for URL."""
-    with open(get_cache_path(url), "r", encoding="utf-8") as f:
-        return f.read()
-
-
 def safe_get_text(element, default="N/A"):
     """Safely extract text from BeautifulSoup element."""
     return element.get_text(strip=True) if element else default
@@ -195,53 +239,6 @@ def get_random_delay(min_delay: int = get_min_delay(), max_delay: int = get_max_
     return random.uniform(min_delay, max_delay)


-def get_cache_path(url: str) -> str:
-    """Get cache file path for URL."""
-    return os.path.join(get_cache_dir(), f"{get_filename_from_url(url)}.html")
-
-
-def cache_page(url: str, content: str):
-    """Cache the page content with a timestamp."""
-    cache_path = get_cache_path(url)
-    with open(cache_path, "w", encoding="utf-8") as f:
-        f.write(content)
-    # Update the file's modification time to the current time
-    os.utime(cache_path, None)
-
-
-def is_cached(url: str) -> bool:
-    """Check if the page is cached and not older than 24 hours."""
-    cache_path = get_cache_path(url)
-    if not os.path.isfile(cache_path):
-        return False
-
-    # Check the file's age if it's a search result page
-    if 'search' in url:
-        file_age = time.time() - os.path.getmtime(cache_path)
-        if file_age > 24 * 3600:  # 24 hours in seconds
-            return False
-
-    return True
-
-
-def is_cache_stale(last_modified: str, days: int = 1) -> bool:
-    """Check if the cached page is stale (older than 24 hours)."""
-    if not last_modified:
-        return True
-    last_datetime = datetime.fromisoformat(last_modified)
-    file_age = time.time() - last_datetime.timestamp()
-    return file_age > days * 24 * 3600  # days in seconds
-
-
-def delete_cached_page(url: str):
-    cache_fp = get_cache_path(url)
-    if os.path.exists(cache_fp):
-        try:
-            os.remove(cache_fp)
-        except Exception:
-            pass
-
-
 def get_color_from_string(s: str) -> str:
     """Generate a color code from a string."""
     hash_code = hash(s)
@@ -264,15 +261,41 @@

 def filter_jobs(
     jobs: List[Dict[str, Any]],
-    region: _Optional[str] = None,
-    keyword: _Optional[str] = None,
+    region: Optional[str] = None,
+    keyword: Optional[str] = None,
+    negative_keywords: Optional[List[str]] = None,
 ) -> List[Dict[str, Any]]:
-    """Filter jobs by optional region and keyword."""
+    """Filter jobs by optional region, keyword, and negative keywords."""
     filtered = jobs
     if region:
         filtered = [j for j in filtered if j.get("region") == region]
     if keyword:
         filtered = [j for j in filtered if j.get("keyword") == keyword]
+    if negative_keywords:
+        # Pre-compile regexes or just check substring?
+        # Scraper uses substring check. Let's do the same for consistency.
+        # Fields to check: title, company, location, description
+        # Note: description might contain HTML or be long.
+
+        # Normalize negative keywords
+        nks = [nk.lower() for nk in negative_keywords if nk]
+
+        def is_clean(job):
+            # Check all fields
+            text_blob = " ".join([
+                str(job.get("title") or ""),
+                str(job.get("company") or ""),
+                str(job.get("location") or ""),
+                str(job.get("description") or "")
+            ]).lower()
+
+            for nk in nks:
+                if nk in text_blob:
+                    return False
+            return True
+
+        filtered = [j for j in filtered if is_clean(j)]
+
     return filtered


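Note: an illustrative call of the extended filter_jobs(), with made-up job dicts, showing the effect of the new negative_keywords argument:

# Illustrative only; the import path is an assumption -- adjust to the project layout.
from web.utils import filter_jobs

jobs = [
    {"title": "Python Developer", "region": "sfbay", "keyword": "python"},
    {"title": "Sales - commission only", "region": "sfbay", "keyword": "python"},
]
visible = filter_jobs(jobs, region="sfbay", negative_keywords=["commission only"])
# visible keeps only the "Python Developer" posting: the second job's title
# contains a negative keyword (case-insensitive substring match, as above).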