Compare commits

43 commits, 7a3d665538...main:

f8a5b1b5ef, 02e3e77f78, e0bc295936, 2185a07ff0, 8afb208985, d9a224fc36, 1678e1366e, fee955f01d, a51d500777, 2238a286d4, 92b6efb550, f48f5dc036, 053a9988a8, 504dc8e2b0, 8e3a6f4f41, 5cbb760005, 2ae1e2058d, e549fae3f6, c4761c257c, b6f9d39ad8, 94730439a2, 39900ea564, e26dc9c164, c4a5ed56b5, e947520be9, 89d38c91cf, e7c7861b52, dd9772997d, 1a3e6ce3de, 9e18323e5f, 7c743a56cc, 8a40e7115f, 56315fe147, 0f5c2fcf31, a369972119, 008eb7906b, 5940e1f8b4, 042a196718, f8e23d0fba, 2201185599, fe2a579fc4, 7379d3040d, 932a85e279
.dockerignore (new file, 63 lines)
@@ -0,0 +1,63 @@

# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual environments
.venv/
venv/
ENV/
env/

# IDE
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Git
.git/
.gitignore

# Logs
logs/
*.log

# Cache
cache/

# Testing
.pytest_cache/
.coverage
htmlcov/

# Documentation
docs/_build/

# Docker
Dockerfile*
docker-compose*.yml
.dockerignore
README-Docker.md
deploy.sh
.gitea/workflows/ci.yml (new file, 98 lines)
@@ -0,0 +1,98 @@

name: CI/CD Pipeline

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.11"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run tests
        run: |
          python -m pytest tests/ -v

      # - name: Run linting
      #   run: |
      #     python -m flake8 web/ tests/ --max-line-length=120

  build-image:
    runs-on: ubuntu-latest
    needs: test
    env:
      DEFAULT_BRANCH: main
      REGISTRY_URL: ${{ secrets.REGISTRY_URL }}
      REGISTRY_USERNAME: ${{ secrets.REGISTRY_USERNAME }}
      REGISTRY_PASSWORD: ${{ secrets.REGISTRY_PASSWORD }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Collect workflow metadata
        id: meta
        shell: bash
        run: |
          ref_name="${GITHUB_REF_NAME:-${GITHUB_REF##*/}}"
          event_name="${GITHUB_EVENT_NAME:-}"
          sha="${GITHUB_SHA:-}"

          if [ "$ref_name" = "${DEFAULT_BRANCH:-main}" ]; then
            echo "on_default=true" >> "$GITHUB_OUTPUT"
          else
            echo "on_default=false" >> "$GITHUB_OUTPUT"
          fi

          echo "ref_name=$ref_name" >> "$GITHUB_OUTPUT"
          echo "event_name=$event_name" >> "$GITHUB_OUTPUT"
          echo "sha=$sha" >> "$GITHUB_OUTPUT"

      - name: Set up QEMU and Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to registry (best-effort)
        if: ${{ steps.meta.outputs.on_default == 'true' }}
        uses: docker/login-action@v3
        continue-on-error: true
        with:
          registry: ${{ env.REGISTRY_URL }}
          username: ${{ env.REGISTRY_USERNAME }}
          password: ${{ env.REGISTRY_PASSWORD }}

      - name: Build (and optionally push) image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: Dockerfile
          push: ${{ steps.meta.outputs.on_default == 'true' && steps.meta.outputs.event_name != 'pull_request' && (env.REGISTRY_URL != '' && env.REGISTRY_USERNAME != '' && env.REGISTRY_PASSWORD != '') }}
          tags: |
            ${{ env.REGISTRY_URL }}/allucanget/jobs:latest
            ${{ env.REGISTRY_URL }}/allucanget/jobs:${{ steps.meta.outputs.sha }}

  # deploy:
  #   runs-on: ubuntu-latest
  #   needs: test
  #   if: github.ref == 'refs/heads/main'

  #   steps:
  #     - name: Checkout code
  #       uses: actions/checkout@v4

  #     - name: Deploy to production
  #       run: |
  #         echo "Deploying to production..."
  #         docker-compose up -d
.gitignore (vendored, 4 changed lines)
@@ -1,4 +1,3 @@
-.github/copilot*
 cache/
 logs/
 
@@ -165,3 +164,6 @@ cython_debug/
 #.idea/
 
 docs/online.md
+.github/copilot*
+.github/TODO.md
+.vscode/launch.json
Dockerfile (new file, 51 lines)
@@ -0,0 +1,51 @@

# Use Python 3.11 slim image
FROM python:3.11-slim-bookworm

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
ENV FLASK_ENV=production

# Add apt-cacher-ng configuration (if APT_CACHER_NG is set)
RUN if [ -n "$APT_CACHER_NG" ]; then echo 'Acquire::http { Proxy "'"$APT_CACHER_NG"'/"; };' > /etc/apt/apt.conf.d/01proxy; fi

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    default-libmysqlclient-dev \
    default-mysql-client \
    pkg-config \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Create app directory
WORKDIR /app

# Copy requirements first for better caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create necessary directories
RUN mkdir -p cache logs

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:8000/ || exit 1

# Copy entrypoint script
COPY docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh

# Set entrypoint
ENTRYPOINT ["docker-entrypoint.sh"]

# Run gunicorn
CMD ["gunicorn", "--config", "gunicorn.conf.py", "web.app:app"]
README-Docker.md (new file, 409 lines)
@@ -0,0 +1,409 @@

# Jobs App - Docker Deployment

This application is a Craigslist job scraper with a Flask web interface.

## Quick Start with Docker

### Prerequisites

- Docker
- Docker Compose

### Deployment

1. **Clone the repository and navigate to the project directory**

   ```bash
   cd /path/to/jobs-app/jobs
   ```

2. **Start the application**

   ```bash
   docker-compose up --build -d
   ```

3. **Wait for services to be ready** (about 30 seconds)

   ```bash
   # Check if the app is running
   curl http://localhost:8000
   ```

4. **Access the application**
   - Main app: <http://localhost:8000>
   - Admin interface: <http://localhost:8000/admin/users>
     - Username: `admin`
     - Password: `M11ffpgm.`
   - Scraper interface: <http://localhost:8000/scrape-page>

## Docker Architecture

### Services

- **jobs-app**: Flask application with Gunicorn WSGI server
- **mysql**: MySQL 8.0 database

### Ports

- 8000: Flask application
- 3306: MySQL database (exposed for external access if needed)

### Volumes

- `mysql_data`: Persistent MySQL data storage

## Configuration

### Environment Variables

- `FLASK_ENV`: Set to `production`
- `FLASK_SECRET`: Secret key for Flask sessions (required)
- `APT_CACHER_NG`: Optional URL for apt-cacher-ng proxy to speed up package downloads (e.g., `http://192.168.88.14:3142`)
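A value for `FLASK_SECRET` can be generated with Python's standard-library `secrets` module, for example:

```python
# Print a random hex string suitable for use as FLASK_SECRET
import secrets

print(secrets.token_hex(32))
```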
### Database Configuration

The database configuration is in `config/settings.json`:

```json
{
  "database": {
    "mysql": {
      "host": "mysql",
      "user": "jobs",
      "password": "jobdb",
      "database": "jobs",
      "port": 3306
    }
  }
}
```

## Useful Commands

```bash
# View logs
docker-compose logs -f

# Stop services
docker-compose down

# Restart services
docker-compose restart

# Rebuild and restart
docker-compose up --build

# View running containers
docker-compose ps

# Execute commands in the app container
docker-compose exec jobs-app bash

# Check database
docker-compose exec mysql mysql -u jobs -p jobs
```

## Production Considerations

1. **Security**:

   - Change default passwords in `docker-compose.yml`
   - Use environment variables for secrets
   - Configure proper firewall rules

2. **Scaling**:

   - Adjust Gunicorn workers in `gunicorn.conf.py`
   - Consider using a reverse proxy (nginx)
   - Implement proper logging and monitoring

3. **Database**:

   - Use external MySQL for production
   - Configure backups
   - Set up connection pooling

4. **Networking**:
   - Use proper domain names
   - Configure SSL/TLS
   - Set up load balancing if needed

## Troubleshooting

### Common Issues

1. **Port conflicts**: Change ports in `docker-compose.yml`
2. **Database connection**: Ensure MySQL container is healthy
3. **Memory issues**: Increase Docker memory limits
4. **Permission issues**: Check file permissions in mounted volumes

### Logs

```bash
# Application logs
docker-compose logs jobs-app

# Database logs
docker-compose logs mysql

# All logs
docker-compose logs
```

## Development

For development with hot reload:

```bash
# Run in development mode
docker-compose -f docker-compose.dev.yml up --build
```

Create `docker-compose.dev.yml`:

```yaml
version: "3.8"
services:
  jobs-app:
    build: .
    ports:
      - "8000:8000"
    environment:
      - FLASK_ENV=development
    volumes:
      - .:/app
    command: ["flask", "run", "--host", "0.0.0.0", "--port", "8000"]
```
## Coolify Deployment

This application can be deployed on [Coolify](https://coolify.io) using Docker Compose. Coolify provides additional features and management capabilities for containerized applications.

### Coolify Prerequisites

- Coolify instance (self-hosted or cloud)
- Git repository accessible to Coolify

### Coolify-Specific Configuration

#### 1. Environment Variables

Coolify automatically detects environment variables in your `docker-compose.yml` and provides a UI to manage them. Use the following syntax for better integration:

```yaml
services:
  jobs-app:
    environment:
      # Required variables (will show red border if empty)
      - FLASK_SECRET=${FLASK_SECRET:?}

      # Required with default (prefilled but editable)
      - FLASK_ENV=${FLASK_ENV:?production}

      # Optional with default
      - GUNICORN_WORKERS=${GUNICORN_WORKERS:-4}
```

#### 2. Coolify Magic Environment Variables

Leverage Coolify's dynamic environment variables:

```yaml
services:
  jobs-app:
    environment:
      # Generate FQDN for the application
      - SERVICE_FQDN_JOBS_APP

      # Generate secure password for admin user
      - ADMIN_PASSWORD=${SERVICE_PASSWORD_ADMIN:?M11ffpgm.}

      # Generate database credentials
      - DB_USER=${SERVICE_USER_DB:?jobs}
      - DB_PASSWORD=${SERVICE_PASSWORD_DB:?jobdb}
```

#### 3. Storage Configuration

Coolify supports advanced storage options:

```yaml
services:
  jobs-app:
    volumes:
      # Create empty directories
      - type: bind
        source: ./cache
        target: /app/cache
        is_directory: true

      - type: bind
        source: ./logs
        target: /app/logs
        is_directory: true

  mysql:
    volumes:
      # Persistent database storage
      - mysql_data:/var/lib/mysql
```

#### 4. Health Checks and Service Management

```yaml
services:
  jobs-app:
    # Exclude from health checks if needed
    exclude_from_hc: false

    # Labels for Coolify management and Traefik routing
    labels:
      - coolify.managed=true
      - traefik.enable=true
      - "traefik.http.routers.jobs-app.rule=Host(`${SERVICE_FQDN_JOBS_APP:-localhost}`)"
      - traefik.http.routers.jobs-app.entryPoints=https
      - "traefik.http.routers.jobs-app.middlewares=https-redirect"
      - "traefik.http.middlewares.https-redirect.redirectscheme.scheme=https"
```

#### 5. Database Configuration for Coolify

Update your `config/settings.json` to use Coolify environment variables:

```json
{
  "database": {
    "mysql": {
      "host": "mysql",
      "user": "${DB_USER:-jobs}",
      "password": "${DB_PASSWORD:-jobdb}",
      "database": "jobs",
      "port": 3306
    }
  }
}
```
### Complete Coolify docker-compose.yml

Here's a complete `docker-compose.yml` optimized for Coolify:

```yaml
version: "3.8"

services:
  jobs-app:
    build: .
    ports:
      - "8000:8000"
    environment:
      # Required environment variables
      - FLASK_SECRET=${FLASK_SECRET:?}
      - FLASK_ENV=${FLASK_ENV:?production}

      # Coolify magic variables
      - SERVICE_FQDN_JOBS_APP
      - ADMIN_PASSWORD=${SERVICE_PASSWORD_ADMIN:?M11ffpgm.}
      - DB_USER=${SERVICE_USER_DB:?jobs}
      - DB_PASSWORD=${SERVICE_PASSWORD_DB:?jobdb}

      # Optional configuration
      - GUNICORN_WORKERS=${GUNICORN_WORKERS:-4}
      - APT_CACHER_NG=${APT_CACHER_NG}
    volumes:
      - type: bind
        source: ./cache
        target: /app/cache
        is_directory: true
      - type: bind
        source: ./logs
        target: /app/logs
        is_directory: true
    depends_on:
      - mysql
    labels:
      - coolify.managed=true
      - traefik.enable=true
      - "traefik.http.routers.jobs-app.rule=Host(`${SERVICE_FQDN_JOBS_APP:-localhost}`)"
      - traefik.http.routers.jobs-app.entryPoints=https
      - "traefik.http.routers.jobs-app.middlewares=https-redirect"
      - "traefik.http.middlewares.https-redirect.redirectscheme.scheme=https"
    networks:
      - jobs-network
    restart: unless-stopped

  mysql:
    image: mysql:8.0
    environment:
      - MYSQL_ROOT_PASSWORD=${MYSQL_ROOT_PASSWORD:?rootpassword}
      - MYSQL_DATABASE=jobs
      - MYSQL_USER=${DB_USER:-jobs}
      - MYSQL_PASSWORD=${DB_PASSWORD:-jobdb}
    ports:
      - "3306:3306"
    volumes:
      - mysql_data:/var/lib/mysql
      - ./mysql-init:/docker-entrypoint-initdb.d
    networks:
      - jobs-network

volumes:
  mysql_data:

networks:
  jobs-network:
    driver: bridge
```

### Coolify Deployment Steps

1. **Connect Repository**: Link your Git repository to Coolify
2. **Create Service Stack**: Choose "Docker Compose" as the build pack
3. **Configure Environment Variables**: Set required variables in Coolify's UI:
   - `FLASK_SECRET`: Generate a secure random string
   - `FLASK_ENV`: Set to "production"
   - `MYSQL_ROOT_PASSWORD`: Set a secure password
4. **Deploy**: Coolify will automatically build and deploy your application
5. **Access**: Use the generated FQDN to access your application

### Coolify Benefits

- **Automatic SSL**: HTTPS certificates are automatically managed
- **Environment Management**: Easy variable management through UI
- **Monitoring**: Built-in logging and health monitoring
- **Scaling**: Easy horizontal scaling
- **Backups**: Automated backup capabilities
- **Security**: Isolated networks and secure defaults

### Troubleshooting Coolify Deployments

1. **Environment Variables**: Check that all required variables are set in Coolify's UI
2. **Build Logs**: Review build logs for any compilation errors
3. **Network Issues**: Ensure services can communicate within the stack
4. **Storage Permissions**: Verify volume permissions are correct
5. **FQDN Configuration**: Check that the generated FQDN is accessible

#### Common Coolify Errors

**Error: "The scheme `domain.sslip.io` isn't valid. It should be either `http`, `https`"**

- **Cause**: Incorrect Traefik router configuration using full URLs instead of hostnames
- **Solution**: Use `SERVICE_FQDN_*` variables (which contain just the domain) in `Host()` rules, not `SERVICE_URL_*` variables

**Example**:

```yaml
# Correct
- "traefik.http.routers.app.rule=Host(`${SERVICE_FQDN_APP:-localhost}`)"

# Incorrect
- "traefik.http.routers.app.rule=Host(`${SERVICE_URL_APP:-localhost}`)"
```

##### Container fails to start with permission denied

- **Cause**: Volume mount permissions in Coolify environment
- **Solution**: Ensure `is_directory: true` is set for bind mounts that need directory creation

For more information, visit the [Coolify Docker Compose documentation](https://coolify.io/docs/knowledge-base/docker/compose).
README-Traefik.md (new file, 87 lines)
@@ -0,0 +1,87 @@

# Setting Up Coolify Deployment Behind an Existing Traefik Proxy

This guide explains how to configure your existing Traefik instance (running at `192.168.88.10`) to proxy traffic to a Coolify-deployed jobs-app service running on `192.168.88.13:8001`.

## Prerequisites

- Traefik is running and accessible at `192.168.88.10`
- Your external IP is configured to point to Traefik for domain resolution
- The jobs-app is deployed via Coolify and running on `192.168.88.13:8001`
- You have access to Traefik's configuration files (assuming file-based provider)

## Step 1: Verify Jobs-App Accessibility

Ensure the jobs-app is running and accessible:

```bash
curl http://192.168.88.13:8001
```

You should receive a response from the Flask application.

## Step 2: Configure Traefik

Since Traefik is on a separate machine (`192.168.88.10`) and cannot directly watch the Docker containers on `192.168.88.13`, you'll need to manually configure the routing in Traefik's configuration.

### Option 1: Using Traefik's File Provider

Add the following configuration to your Traefik dynamic configuration file (e.g., `dynamic.yml`):

```yaml
http:
  routers:
    jobs-app:
      rule: "Host(`your-domain.com`)" # Replace with your actual domain
      service: jobs-app
      entryPoints:
        - https # Assuming Traefik handles SSL termination
      middlewares:
        - https-redirect # Optional: redirect HTTP to HTTPS

  services:
    jobs-app:
      loadBalancer:
        servers:
          - url: "http://192.168.88.13:8001"

  middlewares:
    https-redirect:
      redirectScheme:
        scheme: https
        permanent: true
```

### Option 2: Using Docker Labels (if Traefik can access the Docker socket)

If Traefik has access to the Docker socket on `192.168.88.13` (e.g., via network mount or API), the Docker labels in `docker-compose.yml` will automatically configure the routing. No additional configuration is needed.

## Step 3: Reload Traefik Configuration

After updating the configuration, reload Traefik:

```bash
# If using Docker
docker-compose restart traefik

# Or if running directly
systemctl reload traefik
```

## Step 4: Test the Setup

1. Ensure your DNS points `your-domain.com` to your external IP, which routes to Traefik.
2. Visit `https://your-domain.com` in your browser.
3. Traefik should proxy the request to `http://192.168.88.13:8001` and serve the jobs-app.

## Troubleshooting

- **Port not accessible**: Ensure firewall rules allow traffic from `192.168.88.10` to `192.168.88.13:8001`.
- **SSL issues**: If Traefik is not terminating SSL, adjust the `entryPoints` and remove HTTPS redirects.
- **Routing not working**: Check Traefik logs for errors in router/service configuration.
- **Domain mismatch**: Verify the `Host` rule matches your actual domain.

## Notes

- The jobs-app runs on port 8000 internally in the container, exposed on host port 8001.
- If you need to change the external port, update the `ports` mapping in `docker-compose.yml` and the Traefik service URL accordingly.
- For production, consider adding authentication, rate limiting, or other middlewares in Traefik.
README.md (231 changed lines)
@@ -9,11 +9,32 @@ job scraper

- Users can search for job listings by keywords and region
- Selection of job listings based on user preferences

-## Requirements
-- Database (MySQL/MariaDB)
-- Python 3.x
-- Required Python packages (see requirements.txt)

## Architecture Overview

The application is built as a modular Flask-based service with clear separation of concerns:

| Layer | Module | Responsibility |
|---|---|---|
| **Web UI** | `web/app.py` | Flask application that serves HTML pages, REST endpoints, and admin interfaces (users, taxonomy, health, email management). |
| **Orchestrator** | `web/craigslist.py` | Coordinates the scraping workflow: schedules runs, fetches listings, updates the DB, and triggers email alerts. |
| **Scraper** | `web/scraper.py` | Contains the low-level HTML parsing logic (`scrape_job_data`, `scrape_job_page`, `extract_contact_info`). |
| **Persistence** | `web/db.py` | SQLAlchemy ORM models (`User`, `JobListing`, `JobDescription`, `UserInteraction`, `Region`, `Keyword`, `EmailSubscription`, **`EmailTemplate`**) and helper functions for upserts, queries, and subscription management. |
| **Email Rendering** | `web/email_templates.py` | Renders job-alert emails using a pluggable template system. Supports default placeholders (`{count_label}`, `{scope}`, `{timestamp}`, `{jobs_section}`, `{jobs_message}`) and custom admin-defined templates. |
| **Email Delivery** | `web/email_service.py` | Sends rendered messages via SMTP, handling TLS/SSL, authentication, and graceful disabling. |
| **Configuration** | `config/settings.json` | Centralised JSON config for database, HTTP, scraper options, negative keywords, and email settings. |
| **Static Assets & Templates** | `web/static/`, `web/templates/` | Front-end resources (JS, CSS) and Jinja2 templates for the public UI and admin pages (including the new **Email Templates** management UI). |
| **Scheduler** | `schedule` (used in `web/craigslist.py`) | Runs the scraper automatically at configurable intervals (default hourly). |
| **Testing** | `tests/` | Pytest suite covering scheduler, scraper, DB helpers, email service, and the new admin UI for email subscriptions and templates. |

**Key architectural notes**

- **Email Subscriptions** are stored in the `email_subscriptions` table and managed via `/admin/emails`.
- **Email Templates** are persisted in the new `email_templates` table, editable through `/admin/email-templates`, and used by the alert system.
- The orchestrator (`fetch_listings`) returns a detailed result dict (`discovered`, `new`, `by_search`) that drives UI metrics and health checks.
- Contact information (`reply_url`, `contact_email`, `contact_phone`, `contact_name`) extracted by the scraper is saved in `job_descriptions`.
- Negative keyword filtering is applied early in the pipeline to prevent unwanted listings from reaching the DB or email alerts.

This layered design makes it straightforward to extend the scraper to new sources, swap out the email backend, or add additional admin features without impacting other components.

## Installation

@@ -22,3 +43,205 @@ job scraper

3. Install dependencies
4. Set up environment variables
5. Run the application

## Scheduler Configuration

The application includes an automated scheduler that runs the job scraping process every hour. The scheduler is implemented in `web/craigslist.py` and includes:

- **Automatic Scheduling**: Scraping runs every hour automatically
- **Failure Handling**: Retry logic with exponential backoff (up to 3 attempts)
- **Background Operation**: Runs in a separate daemon thread
- **Graceful Error Recovery**: Continues running even if individual scraping attempts fail

### Scheduler Features

- **Retry Mechanism**: Automatically retries failed scraping attempts
- **Logging**: Comprehensive logging of scheduler operations and failures
- **Testing**: Comprehensive test suite in `tests/test_scheduler.py`

To modify the scheduling interval, edit the `start_scheduler()` function in `web/craigslist.py`.
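A minimal sketch of how the hourly schedule and retry loop can be structured with the `schedule` library; the exact logic in `web/craigslist.py` may differ, and `run_scrape_with_retries` and its parameters are illustrative names:

```python
# Illustrative sketch only; the real scheduler lives in web/craigslist.py,
# where fetch_listings() is defined.
import logging
import time

import schedule  # the "schedule" package added to requirements.txt

logger = logging.getLogger(__name__)


def run_scrape_with_retries(max_attempts=3, base_delay=60):
    """Run one scrape, retrying with exponential backoff on failure."""
    for attempt in range(1, max_attempts + 1):
        try:
            fetch_listings()  # orchestrator entry point in web/craigslist.py
            return
        except Exception as exc:
            logger.warning("Scrape attempt %d failed: %s", attempt, exc)
            time.sleep(base_delay * 2 ** (attempt - 1))
    logger.error("All %d scrape attempts failed", max_attempts)


def start_scheduler():
    """Run the scrape every hour; intended to be called from a daemon thread."""
    schedule.every(1).hours.do(run_scrape_with_retries)
    while True:
        schedule.run_pending()
        time.sleep(60)
```

Changing the argument to `schedule.every(...)` is the kind of edit the paragraph above refers to.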
## Job Scraping Output

The `fetch_listings()` function in `web/craigslist.py` extends its output to provide detailed metrics about each scraping operation. It returns a dictionary containing:

- **discovered**: Total number of unique job URLs discovered across all region/keyword combinations
- **new**: Total number of newly added jobs (jobs not previously in the database)
- **by_search**: List of dictionaries, each containing:
  - **region**: The region name for this search
  - **keyword**: The keyword used for this search
  - **count**: Number of jobs fetched for this specific region/keyword combination

### Example Output

```python
{
    "discovered": 150,
    "new": 42,
    "by_search": [
        {"region": "sfbay", "keyword": "python", "count": 25},
        {"region": "sfbay", "keyword": "java", "count": 18},
        {"region": "losangeles", "keyword": "python", "count": 45},
        {"region": "losangeles", "keyword": "java", "count": 62}
    ]
}
```

This per-search breakdown allows for better monitoring and debugging of the scraping process, enabling identification of searches that may be failing or returning fewer results than expected.

## Contact Information Extraction

The scraper now automatically extracts contact information from job listing pages:

### Extracted Fields

When scraping individual job listings, the following contact information is extracted and stored:

- **contact_email**: Email address extracted from reply button or contact form links
- **contact_phone**: Phone number extracted from tel links or contact parameters
- **contact_name**: Contact person or department name if available
- **reply_url**: The full reply/contact URL from the job listing

### How Contact Information is Extracted

The `extract_contact_info()` function intelligently parses various types of reply URLs:

1. **Mailto Links**: `mailto:jobs@company.com?subject=...`

   - Extracts the email address directly

2. **Phone Links**: `tel:+1234567890`

   - Extracts the phone number

3. **URL Parameters**: `https://apply.company.com?email=hr@company.com&phone=555-1234&name=HR%20Team`

   - Searches for common parameter names: `email`, `phone`, `contact_name`, etc.

4. **Graceful Fallback**: If contact information cannot be extracted, the fields are set to `"N/A"`
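A simplified sketch of this parsing strategy using the standard-library `urllib.parse` helpers; the real `extract_contact_info()` in `web/scraper.py` may recognise more cases and parameter names:

```python
# Simplified sketch; the real implementation lives in web/scraper.py.
from urllib.parse import parse_qs, urlparse


def extract_contact_info(reply_url):
    """Best-effort extraction of contact details from a reply/contact URL."""
    info = {
        "reply_url": reply_url or "N/A",
        "contact_email": "N/A",
        "contact_phone": "N/A",
        "contact_name": "N/A",
    }
    if not reply_url:
        return info

    parsed = urlparse(reply_url)
    if parsed.scheme == "mailto":
        info["contact_email"] = parsed.path  # address before any ?subject=
    elif parsed.scheme == "tel":
        info["contact_phone"] = parsed.path
    else:
        # Look for common query parameters on contact/apply URLs
        params = parse_qs(parsed.query)
        for param, field in (("email", "contact_email"),
                             ("phone", "contact_phone"),
                             ("contact_name", "contact_name"),
                             ("name", "contact_name")):
            if params.get(param) and info[field] == "N/A":
                info[field] = params[param][0]
    return info
```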
### Database Storage

Contact information is stored in the `job_descriptions` table with the following columns:

- `reply_url` (VARCHAR(512)): The complete reply/contact URL
- `contact_email` (VARCHAR(255)): Extracted email address
- `contact_phone` (VARCHAR(255)): Extracted phone number
- `contact_name` (VARCHAR(255)): Extracted contact person/department name

### Example

For a job listing with reply button `mailto:hiring@acme.com?subject=Job%20Application`:

```python
{
    "reply_url": "mailto:hiring@acme.com?subject=Job%20Application",
    "contact_email": "hiring@acme.com",
    "contact_phone": "N/A",
    "contact_name": "N/A"
}
```

This contact information is automatically extracted during job page scraping and persisted to the database for easy access and filtering.

## Negative Keyword Filtering

The scraper inspects each job's title, company, location, and description for configurable "negative" keywords. When a keyword matches, the scraped result indicates the match so downstream workflows can skip or flag the job.

### Keyword Configuration

Define keywords in `config/settings.json` under `scraper.negative_keywords`. Keywords are matched case-insensitively and should be supplied without surrounding whitespace:

```json
{
  "scraper": {
    "negative_keywords": ["scam", "mlm", "unpaid"]
  }
}
```

### Scrape Output

Each `scrape_job_page` result contains three new fields:

- `is_negative_match`: `True` when any keyword matches
- `negative_keyword_match`: the keyword that triggered the match
- `negative_match_field`: which field (title, company, location, description) contained the keyword
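A sketch of how such a case-insensitive check can report both the keyword and the field that matched; the helper name and exact behaviour in `web/scraper.py` are assumptions:

```python
# Illustrative helper; the real matching logic lives in web/scraper.py.
def find_negative_match(job, negative_keywords):
    """Return (keyword, field) for the first case-insensitive match, else (None, None)."""
    for field in ("title", "company", "location", "description"):
        text = (job.get(field) or "").lower()
        for keyword in negative_keywords:
            if keyword.strip().lower() in text:
                return keyword, field
    return None, None


# Example: a listing titled "Exciting MLM opportunity" with
# negative_keywords ["scam", "mlm", "unpaid"] would yield ("mlm", "title").
```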
### Processing Behavior

- `process_job_url` stops when `is_negative_match` is `True`, yielding a log message and calling `remove_job` so stale results never remain in `job_listings`.
- `upsert_job_details` now returns immediately for negative matches, ensuring `job_descriptions` never stores filtered listings.
- Regression coverage lives in `tests/test_scraper.py::TestScraperPipelineNegativeFiltering` and `tests/test_db_negative_filtering.py::test_upsert_job_details_skips_negative_match`.

Together, these checks mean negative matches are dropped before any persistence and never shown in the UI.

### User-Specific Negative Keywords

In addition to the global negative keywords defined in `settings.json`, users can define their own personal negative keywords via the **Preferences** page (`/settings`).

- **Management**: Users can add new negative keywords and remove existing ones.
- **Filtering**: Jobs matching any of the user's negative keywords are filtered out from the job listings view (`/` and `/jobs`).
- **Validation**: The UI prevents adding duplicate keywords.
- **Storage**: User-specific negative keywords are stored in the database (`negative_keywords` and `user_negative_keywords` tables).

## Email Notifications

Optional job-alert emails are generated whenever the scraper discovers new listings.

### Configuration

Edit `config/settings.json` under the `email` section:

```json
{
  "email": {
    "enabled": true,
    "from_address": "jobs@example.com",
    "recipients": ["alerts@example.com"],
    "smtp": {
      "host": "smtp.example.com",
      "port": 587,
      "username": "smtp-user",
      "password": "secret",
      "use_tls": true,
      "use_ssl": false,
      "timeout": 30
    }
  }
}
```

- Leave `enabled` set to `false` for local development or when credentials are unavailable.
- Provide at least one recipient; otherwise alerts are skipped with a log message.
- Omit real credentials from source control; inject them via environment variables or a secrets manager in production.

### How Alerts Are Sent

- After `fetch_listings()` completes, the scraper gathers new listings and, when configured, renders a plaintext digest via `web.email_templates.render_job_alert_email`.
- Delivery is handled by `web.email_service.send_email`, which supports TLS/SSL SMTP connections and gracefully skips when disabled.
- Success or failure is streamed in the scraper log output (`Job alert email sent.` or the reason for skipping).
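A hedged sketch of that flow; the argument names accepted by `send_email` are assumptions, and in practice recipients come from the config and the subscription table rather than being passed explicitly:

```python
# Sketch of the alert flow; send_email's exact signature may differ.
from web.email_templates import render_job_alert_email
from web.email_service import send_email

new_jobs = [
    {
        "title": "Python Developer",
        "company": "Acme",
        "location": "Remote",
        "url": "https://example.com/job/123",
    }
]

rendered = render_job_alert_email(new_jobs)  # returns a dict with "subject" and "body"
send_email(subject=rendered["subject"], body=rendered["body"])
```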
### Managing Recipients

- Admin users can visit `/admin/emails` to add or deactivate subscription addresses through the web UI.
- Deactivated rows remain in the table so they can be reactivated later; the scraper only mails active recipients.
- The navigation bar exposes an **Email Alerts** link to the management screen after logging in as an admin user.

### Customising Templates

- Use the **Email Templates** admin page (`/admin/email-templates`) to create, edit, preview, or delete alert templates.
- Templates support placeholder tokens such as `{count_label}`, `{scope}`, `{timestamp}`, `{jobs_section}`, and `{jobs_message}`; the UI lists all available tokens.
- Preview renders the selected template with sample data so changes can be reviewed before saving.
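A small sketch of how these placeholder tokens could be filled with Python's `str.format`; the sample values and the use of `str.format` are assumptions, and the actual rendering is done by `web/email_templates.py`:

```python
# Illustrative rendering of placeholder tokens; the sample values are made up.
template_subject = "{count_label} on Craigslist ({scope})"
template_body = "Found {count_label} as of {timestamp}:\n\n{jobs_section}\n\n{jobs_message}"

context = {
    "count_label": "3 new jobs",
    "scope": "all searches",
    "timestamp": "2024-01-01 12:00",
    "jobs_section": "- Python Developer at Acme (Remote)\n  https://example.com/job/123",
    "jobs_message": "3 new jobs were found since the last run.",
}

subject = template_subject.format(**context)
body = template_body.format(**context)
print(subject)
print(body)
```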
### Tests

- `tests/test_email_templates.py` verifies the rendered subject/body for both populated and empty alerts.
- `tests/test_email_service.py` covers SMTP configuration, disabled mode, and login/send flows using fakes.
- `tests/test_admin_email.py` exercises the admin UI for listing, subscribing, and unsubscribing recipients.
- `tests/test_admin_email_templates.py` verifies CRUD operations and previews for template management.
- `tests/test_scraper.py::TestScraperEmailNotifications` ensures the scraping pipeline invokes the alert sender when new jobs are found.

## Docker Deployment

Please see [README-Docker.md](README-Docker.md) for instructions on deploying the application using Docker.
@@ -9,7 +9,7 @@
     }
   },
   "http": {
-    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:141.0) Gecko/20100101 Firefox/141.0",
+    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:145.0) Gecko/20100101 Firefox/145.0",
     "request_timeout": 30,
     "max_retries": 3,
     "backoff_factor": 2,
@@ -22,7 +22,22 @@
   },
   "scraper": {
     "base_url": "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel",
-    "config_dir": "config"
+    "config_dir": "config",
+    "negative_keywords": []
+  },
+  "email": {
+    "enabled": false,
+    "from_address": "jobs@example.com",
+    "recipients": [],
+    "smtp": {
+      "host": "smtp.example.com",
+      "port": 587,
+      "username": "",
+      "password": "",
+      "use_tls": true,
+      "use_ssl": false,
+      "timeout": 30
+    }
   },
   "users": [
     { "username": "anonymous", "is_admin": false, "password": "" },
deploy.sh (new file, 41 lines)
@@ -0,0 +1,41 @@

#!/bin/bash
# Deployment script for Jobs App

set -e

echo "🚀 Starting Jobs App deployment..."

# Check if Docker is installed
if ! command -v docker &> /dev/null; then
    echo "❌ Docker is not installed. Please install Docker first."
    exit 1
fi

# Check if docker-compose is installed
if ! command -v docker-compose &> /dev/null; then
    echo "❌ docker-compose is not installed. Please install docker-compose first."
    exit 1
fi

echo "📦 Building and starting services..."
docker-compose up --build -d

echo "⏳ Waiting for services to be ready..."
sleep 30

echo "🔍 Checking service health..."
if curl -f http://localhost:8000/ &> /dev/null; then
    echo "✅ Jobs App is running successfully!"
    echo "🌐 Access the application at: http://localhost:8000"
    echo "📊 Admin interface: http://localhost:8000/admin/users (login: admin/M11ffpgm.)"
    echo "🔄 Scraper interface: http://localhost:8000/scrape-page"
else
    echo "❌ Jobs App failed to start. Check logs with: docker-compose logs"
    exit 1
fi

echo "📋 Useful commands:"
echo "  View logs: docker-compose logs -f"
echo "  Stop services: docker-compose down"
echo "  Restart: docker-compose restart"
echo "  Rebuild: docker-compose up --build"
docker-compose-test.yml (new file, 40 lines)
@@ -0,0 +1,40 @@

version: "3.8"

services:
  jobs-app:
    build: .
    ports:
      - "8001:8000"
    environment:
      # Required environment variables
      - FLASK_SECRET="localtest8462851903856136136"
      - FLASK_ENV=production

      # Coolify magic variables
      - SERVICE_FQDN_JOBS_APP=https://jobs.allucanget.biz
      - ADMIN_PASSWORD=M11ffpgm.
      - DB_USER=jobs
      - DB_PASSWORD=jobdb

      # Optional configuration
      - GUNICORN_WORKERS=4
    volumes:
      - type: bind
        source: ./cache
        target: /app/cache
      - type: bind
        source: ./logs
        target: /app/logs
    labels:
      - coolify.managed=true
      - traefik.enable=true
      - "traefik.http.routers.jobs-app.rule=Host(`${SERVICE_FQDN_JOBS_APP:-localhost}`)"
      - traefik.http.routers.jobs-app.entryPoints=http
      - "traefik.http.services.jobs-app.loadbalancer.server.port=8000"
    networks:
      - jobs-network
    restart: unless-stopped

networks:
  jobs-network:
    driver: bridge
docker-compose.yml (new file, 61 lines)
@@ -0,0 +1,61 @@

version: "3.8"

services:
  jobs-app:
    build: .
    ports:
      - "8001:8000"
    environment:
      # Required environment variables
      - FLASK_SECRET=${FLASK_SECRET:?}
      - FLASK_ENV=${FLASK_ENV:?production}

      # Coolify magic variables
      - SERVICE_FQDN_JOBS_APP
      - ADMIN_PASSWORD=${SERVICE_PASSWORD_ADMIN:?M11ffpgm.}
      - DB_USER=${SERVICE_USER_DB:?jobs}
      - DB_PASSWORD=${SERVICE_PASSWORD_DB:?jobdb}

      # Optional configuration
      - GUNICORN_WORKERS=${GUNICORN_WORKERS:-4}
      - APT_CACHER_NG=${APT_CACHER_NG}
    volumes:
      - type: bind
        source: ./cache
        target: /app/cache
      - type: bind
        source: ./logs
        target: /app/logs
    depends_on:
      - mysql
    labels:
      - coolify.managed=true
      - traefik.enable=true
      - "traefik.http.routers.jobs-app.rule=Host(`${SERVICE_FQDN_JOBS_APP:-localhost}`)"
      - traefik.http.routers.jobs-app.entryPoints=http
      - "traefik.http.services.jobs-app.loadbalancer.server.port=8000"
    networks:
      - jobs-network
    restart: unless-stopped

  mysql:
    image: mysql:8.0
    environment:
      - MYSQL_ROOT_PASSWORD=${MYSQL_ROOT_PASSWORD:?rootpassword}
      - MYSQL_DATABASE=jobs
      - MYSQL_USER=${DB_USER:-jobs}
      - MYSQL_PASSWORD=${DB_PASSWORD:-jobdb}
    ports:
      - "3306:3306"
    volumes:
      - mysql_data:/var/lib/mysql
      - ./mysql-init:/docker-entrypoint-initdb.d
    networks:
      - jobs-network

volumes:
  mysql_data:

networks:
  jobs-network:
    driver: bridge
docker-entrypoint.sh (new file, 48 lines)
@@ -0,0 +1,48 @@

#!/bin/bash
# Docker entrypoint script for Jobs App

set -e

echo "🚀 Starting Jobs App container..."

# Wait for MySQL to be ready
echo "⏳ Waiting for MySQL to be ready..."
python -c "
import time
import pymysql
while True:
    try:
        conn = pymysql.connect(
            host='192.168.88.37',
            user='jobs',
            password='jobdb',
            database='jobs',
            connect_timeout=5
        )
        conn.close()
        print('✅ MySQL is ready!')
        break
    except pymysql.Error as e:
        print(f'MySQL is not ready: {e}, waiting...')
        time.sleep(2)
"

# Run database setup
echo "🗄️ Setting up database..."
python setup.py mysql-init

# Seed initial data if needed
echo "🌱 Seeding initial data..."
python -c "
from web.db import db_init
from web.utils import initialize_users_from_settings
db_init()
try:
    initialize_users_from_settings()
    print('✅ Users seeded successfully')
except Exception as e:
    print(f'⚠️ User seeding failed: {e}')
"

echo "🎯 Starting Gunicorn server..."
exec "$@"
gunicorn.conf.py (new file, 38 lines)
@@ -0,0 +1,38 @@

# Gunicorn configuration file
import multiprocessing

# Server socket
bind = "0.0.0.0:8000"
backlog = 2048

# Worker processes
workers = multiprocessing.cpu_count() * 2 + 1
worker_class = "sync"
worker_connections = 1000
max_requests = 1000
max_requests_jitter = 50
timeout = 30
keepalive = 2

# Logging
loglevel = "info"
accesslog = "-"
errorlog = "-"

# Process naming
proc_name = "jobs_app"

# Server mechanics
daemon = False
pidfile = "/tmp/gunicorn.pid"
user = None
group = None
tmp_upload_dir = None

# SSL (if needed)
keyfile = None
certfile = None

# Application
wsgi_module = "web.app:app"
callable = "app"
main.py (12 changed lines)
@@ -4,8 +4,20 @@ starts webserver
 """
 
 import web.app as app
+import threading
+from web.craigslist import start_scheduler
+
+
+def start_background_scheduler():
+    """Start the scheduler in a background thread."""
+    scheduler_thread = threading.Thread(target=start_scheduler, daemon=True)
+    scheduler_thread.start()
+    print("Background scheduler started")
 
 
 if __name__ == "__main__":
+    # Start scheduler in background thread
+    start_background_scheduler()
+
     # start web server
     app.main()
@@ -3,6 +3,7 @@ flask
 flask-wtf
 pytest
 requests
+schedule
 sqlalchemy
 pymysql
 gunicorn
setup.py (16 changed lines)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """
-MySQL utilities for Craigslist project.
+MySQL utility script for initializing database and showing row counts.
 
 Usage (PowerShell):
     # Ensure MySQL database and tables
@@ -36,15 +36,15 @@ try:
     engine = create_engine(url, future=True)
     with engine.begin() as conn:
         for table in [
-            "users",
-            "regions",
-            "keywords",
-            "user_regions",
-            "user_keywords",
-            "job_listings",
             "job_descriptions",
-            "cached_pages",
+            "job_listings",
+            "keywords",
+            "logs",
+            "regions",
+            "users",
             "user_interactions",
+            "user_keywords",
+            "user_regions",
         ]:
             try:
                 n = conn.execute(text(f"SELECT COUNT(*) FROM {table}"))
tests/test_admin_email.py (new file, 84 lines)
@@ -0,0 +1,84 @@

import pytest
from sqlalchemy import text

from web.app import app
from web.db import (
    db_init,
    create_or_update_user,
    subscribe_email,
    list_email_subscriptions,
    _ensure_session,
)


@pytest.fixture(scope="function", autouse=True)
def initialize_app():
    app.config.update(TESTING=True, WTF_CSRF_ENABLED=False)
    with app.app_context():
        db_init()
        create_or_update_user("admin", password="secret",
                              is_admin=True, is_active=True)
        # Clear subscriptions before and after each test to avoid leakage
        with _ensure_session() as session:
            session.execute(text("DELETE FROM email_subscriptions"))
            session.commit()
        yield
        with _ensure_session() as session:
            session.execute(text("DELETE FROM email_subscriptions"))
            session.commit()


@pytest.fixture
def client():
    with app.test_client() as test_client:
        with test_client.session_transaction() as sess:
            sess["username"] = "admin"
        yield test_client


@pytest.fixture
def anon_client():
    with app.test_client() as test_client:
        # Ensure no admin session present
        with test_client.session_transaction() as sess:
            sess.pop("username", None)
        yield test_client


def test_admin_emails_requires_admin(anon_client):
    response = anon_client.get("/admin/emails")
    assert response.status_code == 302
    assert "/login" in response.headers.get("Location", "")


def test_admin_emails_lists_subscriptions(client):
    subscribe_email("alice@example.com")
    response = client.get("/admin/emails")
    assert response.status_code == 200
    assert b"alice@example.com" in response.data


def test_admin_emails_can_subscribe(client):
    response = client.post(
        "/admin/emails",
        data={"action": "subscribe", "email": "bob@example.com"},
        follow_redirects=False,
    )
    assert response.status_code == 302
    emails = list_email_subscriptions()
    assert any(sub["email"] == "bob@example.com" and sub["is_active"]
               for sub in emails)


def test_admin_emails_can_unsubscribe(client):
    subscribe_email("carol@example.com")
    response = client.post(
        "/admin/emails",
        data={"action": "unsubscribe", "email": "carol@example.com"},
        follow_redirects=False,
    )
    assert response.status_code == 302
    emails = list_email_subscriptions()
    matching = [sub for sub in emails if sub["email"] == "carol@example.com"]
    assert matching
    assert matching[0]["is_active"] is False
tests/test_admin_email_templates.py (new file, 138 lines)
@@ -0,0 +1,138 @@

import pytest
from sqlalchemy import text

from web.app import app
from web.db import (
    db_init,
    create_or_update_user,
    list_email_templates,
    update_email_template,
    _ensure_session,
    ensure_default_email_template,
)
from web.email_templates import render_job_alert_email


@pytest.fixture(scope="function", autouse=True)
def setup_database():
    app.config.update(TESTING=True, WTF_CSRF_ENABLED=False)
    with app.app_context():
        db_init()
        create_or_update_user("admin", password="secret", is_admin=True, is_active=True)
        with _ensure_session() as session:
            session.execute(text("DELETE FROM email_templates"))
            session.commit()
        ensure_default_email_template()
        yield
        with _ensure_session() as session:
            session.execute(text("DELETE FROM email_templates"))
            session.commit()
        ensure_default_email_template()


@pytest.fixture
def client():
    with app.test_client() as test_client:
        with test_client.session_transaction() as sess:
            sess["username"] = "admin"
        yield test_client


@pytest.fixture
def anon_client():
    with app.test_client() as test_client:
        with test_client.session_transaction() as sess:
            sess.pop("username", None)
        yield test_client


def test_email_templates_requires_admin(anon_client):
    response = anon_client.get("/admin/email-templates")
    assert response.status_code == 302
    assert "/login" in response.headers.get("Location", "")


def test_email_templates_lists_default(client):
    response = client.get("/admin/email-templates")
    assert response.status_code == 200
    assert b"job-alert" in response.data


def test_email_templates_create_update_delete(client):
    # Create
    response = client.post(
        "/admin/email-templates",
        data={
            "action": "create",
            "name": "Daily Summary",
            "slug": "daily-summary",
            "subject": "Summary: {count_label}",
            "body": "Jobs:{jobs_section}",
            "is_active": "on",
        },
        follow_redirects=False,
    )
    assert response.status_code == 302
    templates = list_email_templates()
    assert any(t["slug"] == "daily-summary" for t in templates)

    # Update
    template_row = next(t for t in templates if t["slug"] == "daily-summary")
    response = client.post(
        "/admin/email-templates",
        data={
            "action": "update",
            "template_id": template_row["template_id"],
            "name": "Daily Summary",
            "slug": "daily-summary",
            "subject": "Updated: {count_label}",
            "body": "Updated body {jobs_section}",
        },
        follow_redirects=False,
    )
    assert response.status_code == 302
    updated = list_email_templates()
    updated_row = next(t for t in updated if t["slug"] == "daily-summary")
    assert "Updated:" in updated_row["subject"]

    # Delete
    response = client.post(
        "/admin/email-templates",
        data={
            "action": "delete",
            "template_id": updated_row["template_id"],
        },
        follow_redirects=False,
    )
    assert response.status_code == 302
    slugs = [t["slug"] for t in list_email_templates()]
    assert "daily-summary" not in slugs


def test_email_templates_preview(client):
    templates = list_email_templates()
    job_alert = next(t for t in templates if t["slug"] == "job-alert")
    response = client.get(f"/admin/email-templates?preview_id={job_alert['template_id']}")
    assert response.status_code == 200
    assert b"Preview" in response.data
    assert b"Subject" in response.data


def test_render_job_alert_email_uses_template_override(client):
    templates = list_email_templates()
    job_alert = next(t for t in templates if t["slug"] == "job-alert")
    update_email_template(
        job_alert["template_id"],
        subject="Custom Subject {count}",
        body="Body {jobs_message}",
    )
    rendered = render_job_alert_email([
        {
            "title": "Python Developer",
            "company": "Acme",
            "location": "Remote",
            "url": "https://example.com",
        }
    ])
    assert rendered["subject"].startswith("Custom Subject")
    assert "Python Developer" in rendered["body"]
@@ -1,66 +0,0 @@
import os
import tempfile
import pytest

import web.db as db
from web.utils import get_cache_dir


# Skip unless explicitly enabled (MySQL integration expected)
if not os.getenv("RUN_DB_TESTS"):
    pytest.skip("Set RUN_DB_TESTS=1 to run cache path integration tests",
                allow_module_level=True)


def test_db_sync_inserts_relative_paths(tmp_path, monkeypatch):
    # arrange: create a temporary cache dir and a fake html file
    cache_dir = tmp_path / "cache"
    cache_dir.mkdir()
    f = cache_dir / "example.org_path_to_page_123.html"
    f.write_text("<html>ok</html>")

    # point app at this cache dir
    monkeypatch.setenv("PYTEST_CACHE_DIR", str(cache_dir))
    # monkeypatch get_cache_dir used by db functions
    monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir))

    # ensure DB initialized
    db.db_init()

    # act
    db.db_sync_cached_pages(str(cache_dir))

    # assert: DB contains relative path, not absolute
    rows = db.db_get_all_cached_pages()
    assert any(r['file_path'] == os.path.relpath(
        str(f), start=str(cache_dir)) for r in rows)


def test_normalize_cached_page_paths_converts_absolute(tmp_path, monkeypatch):
    cache_dir = tmp_path / "cache"
    cache_dir.mkdir()
    # create an actual file
    f = cache_dir / "site_example_page_1.html"
    f.write_text("<html>ok</html>")

    monkeypatch.setattr('web.utils.get_cache_dir', lambda: str(cache_dir))
    db.db_init()

    abs_fp = str(f)
    rel_fp = os.path.relpath(abs_fp, start=str(cache_dir))

    # Insert an absolute path row directly (simulate legacy data)
    with db._ensure_session() as session:
        session.execute(
            db.text("INSERT INTO cached_pages(file_path, url_guess, last_modified, size_bytes, job_id) VALUES(:fp, :ug, :lm, :sz, :jid)"),
            {"fp": abs_fp, "ug": "https://example.org/page1.html",
             "lm": None, "sz": 10, "jid": None}
        )
        session.commit()

    # normalize should convert absolute to relative
    changed = db.normalize_cached_page_paths()
    assert changed >= 1

    rows = db.db_get_all_cached_pages()
    assert any(r['file_path'] == rel_fp for r in rows)
@@ -1,53 +0,0 @@
import os
import tempfile
from web.app import app


def test_cached_route_serves_file(monkeypatch):
    # Create a temporary file in the configured cache dir
    cache_dir = os.path.abspath(os.path.join(
        os.path.dirname(__file__), '..', 'cache'))
    os.makedirs(cache_dir, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(
        prefix='test_cached_', suffix='.html', dir=cache_dir)
    os.close(fd)
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.write('<html><body>cached</body></html>')

    # Fake job record returned by get_job_by_id
    fake_job = {
        'id': 'fake123',
        'job_id': 'fake123',
        'file_path': os.path.relpath(tmp_path, cache_dir),
        'file_path_abs': tmp_path,
    }

    def fake_get_job_by_id(jid):
        if str(jid) in ('fake123',):
            return fake_job
        return {}

    # Patch the symbol imported into web.app
    monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id)

    # Request route
    client = app.test_client()
    res = client.get('/cached/fake123')
    assert res.status_code == 200
    assert b'cached' in res.data

    # Cleanup
    try:
        os.remove(tmp_path)
    except Exception:
        pass


def test_cached_route_missing(monkeypatch):
    def fake_get_job_by_id(jid):
        return {}

    monkeypatch.setattr('web.app.get_job_by_id', fake_get_job_by_id)
    client = app.test_client()
    res = client.get('/cached/nope')
    assert res.status_code == 404
@@ -1,27 +0,0 @@
import os
from web.db import CachedPage
from web.utils import get_cache_dir


def test_cachedpage_abs_path(tmp_path, monkeypatch):
    # Create a fake cache dir and monkeypatch get_cache_dir
    fake_cache = tmp_path / 'cache'
    fake_cache.mkdir()
    monkeypatch.setenv('PYTHONIOENCODING', 'utf-8')

    # Patch the symbol used by CachedPage.abs_path (imported into web.db)
    monkeypatch.setattr('web.db.get_cache_dir', lambda: str(fake_cache))

    # Create a CachedPage instance and set file_path attribute
    cp = CachedPage()
    setattr(cp, 'file_path', 'subdir/test.html')

    # Ensure the computed absolute path joins the fake cache dir
    expected = os.path.join(os.path.abspath(
        str(fake_cache)), 'subdir/test.html')
    assert cp.abs_path == expected

    # When file_path is falsy, abs_path should be None
    cp2 = CachedPage()
    setattr(cp2, 'file_path', None)
    assert cp2.abs_path is None
@@ -95,39 +95,6 @@ def test_upsert_listing_details_and_urls(db_ready):
     pass
 
 
-def test_cached_page_upsert_and_get(db_ready):
-    jid_suffix = unique_suffix()
-    url = f"https://example.org/it/{jid_suffix}.html"
-    # Ensure a listing exists for FK relation if enforced
-    db.upsert_listing(
-        url=url,
-        region="it",
-        keyword="cache",
-        title=f"IT Cache {jid_suffix}",
-        pay="N/A",
-        location="Test City",
-        timestamp=now_iso(),
-    )
-    fp = f"/tmp/integration_{jid_suffix}.html"
-    db.upsert_cached_page(
-        file_path=fp,
-        url_guess=url,
-        last_modified=now_iso(),
-        size_bytes=123,
-        job_id=int(jid_suffix) if jid_suffix.isdigit() else None,
-    )
-    row = db.db_get_cache_url(url)
-    if row is not None:
-        assert row["url_guess"] == url
-    # Cleanup
-    try:
-        db.remove_cached_page(fp)
-        db.db_remove_cached_url(url)
-        db.db_delete_job(jid_suffix)
-    except Exception:
-        pass
-
-
 def test_user_interactions_mark_and_visit(db_ready):
     uname = f"it_user_{unique_suffix()}"
     db.create_or_update_user(uname, is_active=True)
21
tests/test_db_negative_filtering.py
Normal file
21
tests/test_db_negative_filtering.py
Normal file
@@ -0,0 +1,21 @@
import pytest
import web.db as db


def test_upsert_job_details_skips_negative_match(monkeypatch):
    def fail(*args, **kwargs):  # pragma: no cover - guard against unwanted calls
        raise AssertionError("should not reach database layers when negative")

    monkeypatch.setattr(db, "_ensure_session", fail)
    monkeypatch.setattr(db, "insert_log", fail)

    job_data = {
        "url": "https://example.com/job/neg",
        "id": "neg123",
        "is_negative_match": True,
        "negative_keyword_match": "scam",
        "negative_match_field": "title",
    }

    # Should return early without touching the database helpers.
    db.upsert_job_details(job_data)
106
tests/test_email_service.py
Normal file
106
tests/test_email_service.py
Normal file
@@ -0,0 +1,106 @@
import pytest

from web.email_service import (
    EmailConfigurationError,
    send_email,
)


def test_send_email_disabled(monkeypatch):
    called = {}

    def _fake_smtp(*args, **kwargs):  # pragma: no cover - should not be called
        called["used"] = True
        raise AssertionError(
            "SMTP should not be invoked when email is disabled")

    monkeypatch.setattr("web.email_service.smtplib.SMTP", _fake_smtp)
    monkeypatch.setattr("web.email_service.smtplib.SMTP_SSL", _fake_smtp)

    result = send_email(
        subject="Hi",
        body="Test",
        to="user@example.com",
        settings={"enabled": False},
    )
    assert result is False
    assert called == {}


def test_send_email_sends_message(monkeypatch):
    events = {"starttls": False, "login": None, "sent": None}

    class FakeSMTP:
        def __init__(self, *, host, port, timeout):
            self.host = host
            self.port = port
            self.timeout = timeout

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            return False

        def ehlo(self):
            events.setdefault("ehlo", 0)
            events["ehlo"] += 1

        def starttls(self):
            events["starttls"] = True

        def login(self, username, password):
            events["login"] = (username, password)

        def send_message(self, message, *, from_addr, to_addrs):
            events["sent"] = {
                "from": from_addr,
                "to": tuple(to_addrs),
                "subject": message["Subject"],
            }

    monkeypatch.setattr("web.email_service.smtplib.SMTP", FakeSMTP)
    monkeypatch.setattr("web.email_service.smtplib.SMTP_SSL", FakeSMTP)

    settings = {
        "enabled": True,
        "from_address": "jobs@example.com",
        "smtp": {
            "host": "smtp.example.com",
            "port": 2525,
            "timeout": 15,
            "username": "jobs",
            "password": "secret",
            "use_tls": True,
            "use_ssl": False,
        },
    }

    result = send_email(
        subject="New Jobs",
        body="You have new jobs waiting.",
        to=["a@example.com", "b@example.com"],
        cc="c@example.com",
        bcc=["d@example.com"],
        settings=settings,
    )

    assert result is True
    assert events["starttls"] is True
    assert events["login"] == ("jobs", "secret")
    assert events["sent"] == {
        "from": "jobs@example.com",
        "to": ("a@example.com", "b@example.com", "c@example.com", "d@example.com"),
        "subject": "New Jobs",
    }


def test_send_email_requires_host():
    settings = {
        "enabled": True,
        "from_address": "jobs@example.com",
        "smtp": {"host": "", "port": 587},
    }
    with pytest.raises(EmailConfigurationError):
        send_email(subject="Hi", body="Test",
                   to="user@example.com", settings=settings)
40
tests/test_email_templates.py
Normal file
40
tests/test_email_templates.py
Normal file
@@ -0,0 +1,40 @@
from datetime import datetime

from web.email_templates import render_job_alert_email


def test_render_job_alert_email_with_jobs():
    jobs = [
        {
            "title": "Python Developer",
            "company": "Acme",
            "location": "Remote",
            "url": "https://example.com/jobs/1",
        },
        {
            "title": "Data Engineer",
            "company": "Globex",
            "location": "NYC",
            "url": "https://example.com/jobs/2",
        },
    ]
    ts = datetime(2025, 11, 3, 12, 0)
    rendered = render_job_alert_email(
        jobs, region="sfbay", keyword="python", generated_at=ts)

    assert rendered["subject"] == "2 new jobs (region: sfbay, keyword: python)"
    assert "1. Python Developer" in rendered["body"]
    assert "Generated at 2025-11-03 12:00 UTC." in rendered["body"]
    assert rendered["context"]["count"] == 2
    assert rendered["context"]["jobs_section"].startswith(
        "\n1. Python Developer")


def test_render_job_alert_email_empty():
    ts = datetime(2025, 11, 3, 12, 0)
    rendered = render_job_alert_email([], generated_at=ts)

    assert rendered["subject"] == "No new jobs"
    assert "No jobs matched this alert." in rendered["body"]
    assert rendered["body"].count("Generated at") == 1
    assert rendered["context"]["count"] == 0
137
tests/test_scheduler.py
Normal file
137
tests/test_scheduler.py
Normal file
@@ -0,0 +1,137 @@
import pytest
import time
from unittest.mock import patch, MagicMock
from web.craigslist import scrape_jobs_with_retry, run_scheduled_scraping, fetch_listings


class TestScheduler:

    def test_scrape_jobs_with_retry_success(self):
        """Test that scrape_jobs_with_retry succeeds on first attempt."""
        with patch('web.craigslist.scraper') as mock_scrape:
            result = scrape_jobs_with_retry()
            assert result is True
            mock_scrape.assert_called_once()

    def test_scrape_jobs_with_retry_failure(self):
        """Test that scrape_jobs_with_retry handles failures properly."""
        with patch('web.craigslist.scraper', side_effect=Exception("Test error")) as mock_scrape:
            result = scrape_jobs_with_retry(max_retries=2)
            assert result is False
            assert mock_scrape.call_count == 2

    def test_run_scheduled_scraping(self):
        """Test the scheduled scraping wrapper function."""
        with patch('web.craigslist.scrape_jobs_with_retry') as mock_retry:
            mock_retry.return_value = True
            run_scheduled_scraping()
            mock_retry.assert_called_once()

    def test_scheduler_import(self):
        """Test that scheduler functions can be imported."""
        from web.craigslist import start_scheduler
        assert callable(start_scheduler)

    @patch('web.craigslist.schedule')
    def test_scheduler_setup(self, mock_schedule):
        """Test that scheduler setup works correctly."""
        # This is a basic test to ensure the scheduler can be set up
        from web.craigslist import schedule
        assert schedule is not None

    @patch('web.craigslist.db_get_all_job_urls')
    @patch('web.craigslist.seed_regions_keywords_from_listings')
    @patch('web.craigslist.get_all_regions')
    @patch('web.craigslist.get_all_keywords')
    @patch('web.craigslist.get_last_fetch_time')
    @patch('web.craigslist.process_region_keyword')
    @patch('web.craigslist.upsert_listing')
    @patch('web.craigslist.insert_log')
    def test_fetch_listings_return_structure(self, mock_log, mock_upsert, mock_process, mock_last_fetch,
                                             mock_keywords, mock_regions, mock_seed, mock_db_urls):
        """Test that fetch_listings returns the correct structure with per-search counts."""
        # Setup mocks
        mock_db_urls.return_value = []
        mock_regions.return_value = [{"name": "sfbay"}]
        mock_keywords.return_value = [{"name": "python"}]
        mock_last_fetch.return_value = None  # Never fetched before
        mock_process.return_value = [
            ("2025-11-03T10:00:00Z", "sfbay", "python", "Python Dev",
             "$100k", "San Francisco", "http://example.com/1"),
            ("2025-11-03T10:00:00Z", "sfbay", "python", "Python Dev",
             "$100k", "San Francisco", "http://example.com/2"),
        ]

        # Collect messages and get return value from generator
        gen = fetch_listings()
        messages = []
        result = None
        try:
            while True:
                messages.append(next(gen))
        except StopIteration as e:
            result = e.value

        # Verify return structure
        assert result is not None
        assert "discovered" in result
        assert "new" in result
        assert "by_search" in result
        assert isinstance(result.get("by_search"), list)
        assert result.get("discovered") == 2
        assert result.get("new") == 2

    @patch('web.craigslist.db_get_all_job_urls')
    @patch('web.craigslist.seed_regions_keywords_from_listings')
    @patch('web.craigslist.get_all_regions')
    @patch('web.craigslist.get_all_keywords')
    @patch('web.craigslist.get_last_fetch_time')
    @patch('web.craigslist.process_region_keyword')
    @patch('web.craigslist.upsert_listing')
    @patch('web.craigslist.insert_log')
    def test_fetch_listings_per_search_count(self, mock_log, mock_upsert, mock_process, mock_last_fetch,
                                             mock_keywords, mock_regions, mock_seed, mock_db_urls):
        """Test that fetch_listings correctly counts jobs per search."""
        # Setup mocks
        mock_db_urls.return_value = []
        mock_regions.return_value = [{"name": "sfbay"}, {"name": "losangeles"}]
        mock_keywords.return_value = [{"name": "python"}, {"name": "java"}]
        mock_last_fetch.return_value = None  # Never fetched before

        # Mock process_region_keyword to return different counts for each search
        def mock_process_impl(region, keyword, discovered_urls):
            # Use unique URLs per search to get the total discovered count
            base_url = f"http://example.com/{region}/{keyword}"
            counts = {
                ("sfbay", "python"): 3,
                ("sfbay", "java"): 2,
                ("losangeles", "python"): 4,
                ("losangeles", "java"): 1,
            }
            count = counts.get((region, keyword), 0)
            return [(f"2025-11-03T10:00:00Z", region, keyword, f"Job {i}", "$100k", region, f"{base_url}/{i}")
                    for i in range(count)]

        mock_process.side_effect = mock_process_impl

        # Collect result from generator
        gen = fetch_listings()
        messages = []
        result = None
        try:
            while True:
                messages.append(next(gen))
        except StopIteration as e:
            result = e.value

        # Verify per-search counts
        assert result is not None
        by_search = result.get("by_search", [])
        assert len(by_search) == 4

        search_data = {(r.get("region"), r.get("keyword")): r.get("count") for r in by_search}
        assert search_data.get(("sfbay", "python")) == 3
        assert search_data.get(("sfbay", "java")) == 2
        assert search_data.get(("losangeles", "python")) == 4
        assert search_data.get(("losangeles", "java")) == 1
        assert result.get("discovered") == 10  # Total unique jobs
384
tests/test_scraper.py
Normal file
384
tests/test_scraper.py
Normal file
@@ -0,0 +1,384 @@
import pytest
from web.scraper import scrape_job_page, extract_contact_info
from web.craigslist import process_job_url, scraper


def _make_negative_job(url: str) -> dict:
    return {
        "url": url,
        "title": "SCAM role",
        "company": "Test Co",
        "location": "Remote",
        "description": "This is a scam offer",
        "id": "job123",
        "posted_time": "",
        "reply_url": "N/A",
        "contact_email": "N/A",
        "contact_phone": "N/A",
        "contact_name": "N/A",
        "is_negative_match": True,
        "negative_keyword_match": "scam",
        "negative_match_field": "title",
    }


class TestExtractContactInfo:
    """Test suite for contact information extraction."""

    def test_extract_email_from_mailto_link(self):
        """Test extraction of email from mailto link."""
        reply_url = "mailto:contact@example.com?subject=Job%20Inquiry"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["email"] == "contact@example.com"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_extract_phone_from_tel_link(self):
        """Test extraction of phone from tel link."""
        reply_url = "tel:+1234567890"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "+1234567890"
        assert contact_info["contact_name"] == "N/A"

    def test_extract_email_from_url_parameter(self):
        """Test extraction of email from URL query parameters."""
        reply_url = "https://example.com/contact?email=jobs@company.com&name=John%20Doe"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["email"] == "jobs@company.com"
        assert contact_info["contact_name"] == "John Doe"

    def test_extract_phone_from_url_parameter(self):
        """Test extraction of phone from URL query parameters."""
        reply_url = "https://example.com/apply?phone=555-1234&email=contact@test.com"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["phone"] == "555-1234"
        assert contact_info["email"] == "contact@test.com"

    def test_extract_contact_name_from_url_parameter(self):
        """Test extraction of contact name from URL query parameters."""
        reply_url = "https://example.com/reply?name=Alice%20Smith&contact_name=Bob%20Jones"
        contact_info = extract_contact_info(reply_url)

        # Should prefer contact_name over name
        assert contact_info["contact_name"] == "Bob Jones"

    def test_extract_all_fields_from_url(self):
        """Test extraction of all fields from URL parameters."""
        reply_url = "https://example.com/contact?email=hr@company.com&phone=555-9876&contact_name=Jane%20Doe"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["email"] == "hr@company.com"
        assert contact_info["phone"] == "555-9876"
        assert contact_info["contact_name"] == "Jane Doe"

    def test_handle_empty_reply_url(self):
        """Test handling of empty reply URL."""
        contact_info = extract_contact_info("")

        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_handle_na_reply_url(self):
        """Test handling of N/A reply URL."""
        contact_info = extract_contact_info("N/A")

        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_handle_none_reply_url(self):
        """Test handling of None reply URL."""
        contact_info = extract_contact_info(None)

        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_handle_invalid_url(self):
        """Test handling of invalid URL (graceful fallback)."""
        reply_url = "not a valid url at all"
        contact_info = extract_contact_info(reply_url)

        # Should return all N/A values without crashing
        assert contact_info["email"] == "N/A"
        assert contact_info["phone"] == "N/A"
        assert contact_info["contact_name"] == "N/A"

    def test_multiple_parameter_variations(self):
        """Test that function finds email despite multiple parameter name variations."""
        reply_url = "https://example.com/reply?from_email=sender@test.com&other=value"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["email"] == "sender@test.com"

    def test_telephone_parameter_name(self):
        """Test extraction using 'telephone' parameter name."""
        reply_url = "https://example.com/contact?telephone=555-0000"
        contact_info = extract_contact_info(reply_url)

        assert contact_info["phone"] == "555-0000"


class TestScrapeJobPageContactInfo:
    """Test suite for scrape_job_page contact information extraction."""

    def test_scrape_job_page_includes_contact_fields(self):
        """Test that scrape_job_page includes contact information in return dict."""
        html_content = """
        <html>
        <h1 class="postingtitle">Software Engineer</h1>
        <h2 class="company-name">Tech Company</h2>
        <button class="reply-button" data-href="mailto:jobs@techco.com"></button>
        <div id="map" data-latitude="37.7749" data-longitude="-122.4194" data-accuracy="rooftop"></div>
        <section id="postingbody">
        <p>This is a test job description</p>
        </section>
        <div class="postinginfos">
        <p class="postinginfo">posting id: 12345abc</p>
        <time class="date timeago" datetime="2025-11-03T10:00:00"></time>
        </div>
        </html>
        """

        job_data = scrape_job_page(html_content, "https://example.com/job/123")

        # Verify all expected keys are present
        assert "contact_email" in job_data
        assert "contact_phone" in job_data
        assert "contact_name" in job_data
        assert "reply_url" in job_data

    def test_scrape_job_page_extracts_mailto_contact(self):
        """Test that scrape_job_page correctly extracts email from mailto link."""
        html_content = """
        <html>
        <h1 class="postingtitle">Job Title</h1>
        <h2 class="company-name">Company</h2>
        <button class="reply-button" data-href="mailto:hiring@company.com?subject=Application"></button>
        <div id="map"></div>
        <section id="postingbody"><p>Job desc</p></section>
        <div class="postinginfos">
        <p class="postinginfo">id: xyz</p>
        </div>
        </html>
        """

        job_data = scrape_job_page(html_content, "https://example.com/job/456")

        assert job_data["contact_email"] == "hiring@company.com"
        assert job_data["reply_url"] == "mailto:hiring@company.com?subject=Application"

    def test_scrape_job_page_no_reply_button(self):
        """Test scrape_job_page when no reply button is present."""
        html_content = """
        <html>
        <h1 class="postingtitle">Job Title</h1>
        <h2 class="company-name">Company</h2>
        <div id="map"></div>
        <section id="postingbody"><p>Job desc</p></section>
        <div class="postinginfos">
        <p class="postinginfo">id: xyz</p>
        </div>
        </html>
        """

        job_data = scrape_job_page(html_content, "https://example.com/job/789")

        # Should have N/A for all contact fields
        assert job_data["reply_url"] == "N/A"
        assert job_data["contact_email"] == "N/A"
        assert job_data["contact_phone"] == "N/A"
        assert job_data["contact_name"] == "N/A"

    def test_scrape_job_page_with_url_based_reply(self):
        """Test scrape_job_page with URL-based reply link containing contact info."""
        html_content = """
        <html>
        <h1 class="postingtitle">Manager Position</h1>
        <h2 class="company-name">BigCorp</h2>
        <button class="reply-button" data-href="https://apply.bigcorp.com?email=hr@bigcorp.com&name=HR%20Team"></button>
        <div id="map"></div>
        <section id="postingbody"><p>Apply now</p></section>
        <div class="postinginfos">
        <p class="postinginfo">id: manager123</p>
        </div>
        </html>
        """

        job_data = scrape_job_page(html_content, "https://example.com/job/999")

        assert job_data["contact_email"] == "hr@bigcorp.com"
        assert job_data["contact_name"] == "HR Team"

    def test_scrape_job_page_negative_keyword_match(self, monkeypatch):
        """Test that negative keyword detection flags matching jobs."""

        monkeypatch.setattr(
            "web.scraper.get_negative_keywords", lambda: ["scam"])

        html_content = """
        <html>
        <h1 class="postingtitle">Great Opportunity</h1>
        <h2 class="company-name">SCAM Corp</h2>
        <section id="postingbody"><p>This is a scam offer</p></section>
        </html>
        """

        job_data = scrape_job_page(
            html_content, "https://example.com/job/negative")

        assert job_data["is_negative_match"] is True
        assert job_data["negative_keyword_match"] == "scam"
        assert job_data["negative_match_field"] in {
            "title", "company", "description"}

    def test_scrape_job_page_no_negative_match(self, monkeypatch):
        """Test that jobs without matching keywords are not flagged."""

        monkeypatch.setattr(
            "web.scraper.get_negative_keywords", lambda: ["scam"])

        html_content = """
        <html>
        <h1 class="postingtitle">Legit Opportunity</h1>
        <h2 class="company-name">Honest Corp</h2>
        <section id="postingbody"><p>We pay well and on time.</p></section>
        </html>
        """

        job_data = scrape_job_page(
            html_content, "https://example.com/job/positive")

        assert job_data["is_negative_match"] is False
        assert job_data["negative_keyword_match"] is None
        assert job_data["negative_match_field"] is None


class TestProcessJobUrlNegativeFiltering:
    def test_process_job_url_skips_negative_match(self, monkeypatch):
        job_url = "https://example.com/job/negative"
        remove_calls = []
        upsert_calls = []

        monkeypatch.setattr(
            "web.craigslist.get_last_fetch_time", lambda url: None)
        monkeypatch.setattr(
            "web.craigslist.insert_log",
            lambda *args, **kwargs: None,
        )
        monkeypatch.setattr(
            "web.craigslist.make_request_with_retry",
            lambda url, attempts: "<html />",
        )
        monkeypatch.setattr(
            "web.craigslist.scrape_job_page",
            lambda content, url: _make_negative_job(url),
        )

        def fake_upsert(job_data, region="", keyword=""):
            upsert_calls.append(job_data)

        def fake_remove(url):
            remove_calls.append(url)

        monkeypatch.setattr("web.craigslist.upsert_job_details", fake_upsert)
        monkeypatch.setattr("web.craigslist.remove_job", fake_remove)

        messages = list(process_job_url(job_url, region="test", keyword="kw"))

        assert any("Skipping job" in message for message in messages)
        assert remove_calls == [job_url]
        assert upsert_calls == []


class TestScraperPipelineNegativeFiltering:
    def test_scraper_skips_negative_jobs(self, monkeypatch):
        job_url = "https://example.com/job/negative"
        remove_calls = []
        upsert_calls = []

        monkeypatch.setattr("web.craigslist.db_init", lambda: None)

        def fake_fetch_listings():
            yield "Fake listing fetch\n"
            return {"discovered": 0, "new": 0, "by_search": [], "new_jobs": []}

        monkeypatch.setattr("web.craigslist.fetch_listings",
                            fake_fetch_listings)
        monkeypatch.setattr(
            "web.craigslist.db_get_all_job_urls",
            lambda: [{"url": job_url, "region": "reg", "keyword": "kw"}],
        )
        monkeypatch.setattr(
            "web.craigslist.get_last_fetch_time", lambda url: None)
        monkeypatch.setattr("web.craigslist.insert_log",
                            lambda *args, **kwargs: None)
        monkeypatch.setattr(
            "web.craigslist.make_request_with_retry", lambda url, attempts: "<html />"
        )
        monkeypatch.setattr("web.craigslist.url_to_job_id",
                            lambda url: "job123")
        monkeypatch.setattr(
            "web.craigslist.scrape_job_page",
            lambda content, url: _make_negative_job(url),
        )

        def fake_upsert(job_data, region="", keyword=""):
            upsert_calls.append(job_data)

        def fake_remove(url):
            remove_calls.append(url)

        monkeypatch.setattr("web.craigslist.upsert_job_details", fake_upsert)
        monkeypatch.setattr("web.craigslist.remove_job", fake_remove)

        messages = list(scraper())

        assert any("Skipping job" in message for message in messages)
        assert remove_calls == [job_url]
        assert upsert_calls == []


class TestScraperEmailNotifications:
    def test_scraper_sends_email_for_new_jobs(self, monkeypatch):
        monkeypatch.setattr("web.craigslist.db_init", lambda: None)

        new_jobs = [
            {
                "title": "Python Developer",
                "company": "Acme",
                "location": "Remote",
                "url": "https://example.com/jobs/1",
            }
        ]

        def fake_fetch_listings():
            yield "Fake listing fetch\n"
            return {
                "discovered": 1,
                "new": 1,
                "by_search": [],
                "new_jobs": new_jobs,
            }

        monkeypatch.setattr("web.craigslist.fetch_listings", fake_fetch_listings)
        monkeypatch.setattr("web.craigslist.db_get_all_job_urls", lambda: [])

        calls = {}

        def fake_send_alert(jobs):
            calls["jobs"] = jobs
            return True, "sent"

        monkeypatch.setattr("web.craigslist._send_new_job_alert", fake_send_alert)

        messages = list(scraper())

        assert calls["jobs"] == new_jobs
        assert any("Job alert email sent." in message for message in messages)
148
tests/test_user_negative_keywords.py
Normal file
148
tests/test_user_negative_keywords.py
Normal file
@@ -0,0 +1,148 @@
import pytest
from web.db import (
    db_init,
    create_or_update_user,
    upsert_negative_keyword,
    set_user_negative_keywords,
    get_user_negative_keywords,
    upsert_listing,
    upsert_job_details,
    get_all_jobs,
    UserNegativeKeyword,
    NegativeKeyword
)
from web.app import app
from web.utils import filter_jobs


@pytest.fixture
def client():
    app.config['TESTING'] = True
    app.config['WTF_CSRF_ENABLED'] = False
    with app.test_client() as client:
        with app.app_context():
            db_init()
        yield client


def test_negative_keyword_db_ops():
    db_init()
    username = "test_neg_user"
    create_or_update_user(username, "password")

    # Test upsert
    kid = upsert_negative_keyword("scam")
    assert kid > 0
    kid2 = upsert_negative_keyword("scam")
    assert kid == kid2

    # Test set/get
    set_user_negative_keywords(username, ["scam", "unpaid"])
    nks = get_user_negative_keywords(username)
    assert len(nks) == 2
    assert "scam" in nks
    assert "unpaid" in nks

    # Test update
    set_user_negative_keywords(username, ["scam"])
    nks = get_user_negative_keywords(username)
    assert len(nks) == 1
    assert "scam" in nks
    assert "unpaid" not in nks

    # Test clear
    set_user_negative_keywords(username, [])
    nks = get_user_negative_keywords(username)
    assert len(nks) == 0


def test_settings_endpoint(client):
    username = "test_settings_user"
    create_or_update_user(username, "password")

    # Login
    client.post('/login', data={'username': username, 'password': 'password'})

    # Post settings
    resp = client.post('/settings', json={
        'regions': [],
        'keywords': [],
        'negative_keywords': ['spam', 'junk']
    })
    assert resp.status_code == 200

    # Verify DB
    nks = get_user_negative_keywords(username)
    assert "spam" in nks
    assert "junk" in nks


def test_job_filtering_with_negative_keywords():
    # Setup jobs
    jobs = [
        {"title": "Great Job", "description": "Good pay"},
        {"title": "Bad Job", "description": "This is a scam"},
        {"title": "Okay Job", "description": "Average pay"},
    ]

    # Filter
    filtered = filter_jobs(jobs, negative_keywords=["scam"])
    assert len(filtered) == 2
    assert "Bad Job" not in [j['title'] for j in filtered]

    filtered = filter_jobs(jobs, negative_keywords=["pay"])
    assert len(filtered) == 1
    assert "Bad Job" in [j['title']
                         for j in filtered]  # "scam" job doesn't have "pay"


def test_jobs_endpoint_filtering(client):
    username = "test_filter_user"
    create_or_update_user(username, "password")

    # Setup DB with jobs
    upsert_listing(
        url="http://example.com/1",
        region="sfbay",
        keyword="python",
        title="Good Python Job",
        pay="$100k",
        location="SF",
        timestamp="now"
    )
    upsert_job_details({
        "url": "http://example.com/1",
        "id": "1",
        "title": "Good Python Job",
        "description": "This is a legit job."
    })

    upsert_listing(
        url="http://example.com/2",
        region="sfbay",
        keyword="python",
        title="Bad Python Job",
        pay="$100k",
        location="SF",
        timestamp="now"
    )
    upsert_job_details({
        "url": "http://example.com/2",
        "id": "2",
        "title": "Bad Python Job",
        "description": "This is a scam job."
    })

    # Login
    client.post('/login', data={'username': username, 'password': 'password'})

    # Set negative keywords
    set_user_negative_keywords(username, ["scam"])

    # Fetch jobs
    resp = client.get('/jobs')
    data = resp.get_json()

    titles = [j['title'] for j in data]
    assert "Good Python Job" in titles
    assert "Bad Python Job" not in titles
@@ -16,3 +16,23 @@ def test_http_settings_helpers():
     assert isinstance(utils.get_backoff_factor(), int)
     assert isinstance(utils.get_min_delay(), int)
     assert isinstance(utils.get_max_delay(), int)
+
+
+def test_negative_keywords_helper():
+    keywords = utils.get_negative_keywords()
+    assert isinstance(keywords, list)
+    for kw in keywords:
+        assert isinstance(kw, str)
+        assert kw == kw.lower()
+
+
+def test_email_settings_helper():
+    settings = utils.get_email_settings()
+    assert isinstance(settings, dict)
+    assert 'enabled' in settings
+    assert 'from_address' in settings
+    smtp = settings.get('smtp')
+    assert isinstance(smtp, dict)
+    assert 'host' in smtp
+    assert isinstance(smtp.get('port'), int)
+    assert isinstance(settings.get('recipients'), list)
358
web/app.py
358
web/app.py
@@ -1,11 +1,13 @@
 import os
-from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, send_file
+from flask import Flask, request, jsonify, render_template, redirect, url_for, session, flash, Response
 from flask_wtf import CSRFProtect
 from typing import Dict, List
+from datetime import datetime, timezone
 
 from web.craigslist import scraper
 from web.db import (
     db_init,
+    delete_user_by_id,
     get_all_jobs,
     mark_favorite,
     record_visit,
@@ -13,12 +15,16 @@ from web.db import (
     create_or_update_user,
     verify_user_credentials,
     get_user,
+    get_user_by_id,
     get_user_regions,
     get_user_keywords,
+    get_user_negative_keywords,
     set_user_regions,
     set_user_keywords,
+    set_user_negative_keywords,
     get_all_regions,
     get_all_keywords,
+    stats_overview,
     upsert_region,
     upsert_keyword,
     list_regions_full,
@@ -26,15 +32,24 @@ from web.db import (
     rename_region,
     rename_keyword,
     change_region_color,
-    change_keyword_color
+    change_keyword_color,
+    subscribe_email,
+    unsubscribe_email,
+    list_email_subscriptions,
+    list_email_templates,
+    create_email_template,
+    update_email_template,
+    delete_email_template,
+    get_email_template,
 )
 from web.utils import (
     initialize_users_from_settings,
     filter_jobs,
     get_job_by_id,
-    get_cache_dir,
+    now_iso,
 )
 from web.db import get_all_regions, get_all_keywords
+from web.email_templates import render_job_alert_email
 
 app = Flask(__name__)
 app.secret_key = os.environ.get("FLASK_SECRET", "dev-secret-change-me")
@@ -105,24 +120,30 @@ def index():
     # Apply user preference filters if no explicit filters provided
     selected_region = request.args.get("region")
     selected_keyword = request.args.get("keyword")
-    if not selected_region and session.get('username'):
+    user_negative_keywords = []
+
+    if session.get('username'):
         try:
-            prefs = get_user_regions(session['username'])
-            if prefs:
-                # If user has region prefs, filter to them by default
-                all_jobs = [j for j in all_jobs if j.get(
-                    'region') in set(prefs)]
+            username = session['username']
+            if not selected_region:
+                prefs = get_user_regions(username)
+                if prefs:
+                    # If user has region prefs, filter to them by default
+                    all_jobs = [j for j in all_jobs if j.get(
+                        'region') in set(prefs)]
+            if not selected_keyword:
+                prefs = get_user_keywords(username)
+                if prefs:
+                    all_jobs = [j for j in all_jobs if j.get(
+                        'keyword') in set(prefs)]
+
+            # Always fetch negative keywords for logged-in users
+            user_negative_keywords = get_user_negative_keywords(username)
         except Exception:
             pass
-    if not selected_keyword and session.get('username'):
-        try:
-            prefs = get_user_keywords(session['username'])
-            if prefs:
-                all_jobs = [j for j in all_jobs if j.get(
-                    'keyword') in set(prefs)]
-        except Exception:
-            pass
-    filtered_jobs = filter_jobs(all_jobs, selected_region, selected_keyword)
+
+    filtered_jobs = filter_jobs(
+        all_jobs, selected_region, selected_keyword, negative_keywords=user_negative_keywords)
 
     return render_template(
         "index.html",
@@ -176,23 +197,26 @@ def jobs():
     # Respect user preferences when no explicit filters provided
     region = request.args.get("region")
     keyword = request.args.get("keyword")
-    if not region and session.get('username'):
+    user_negative_keywords = []
+
+    if session.get('username'):
         try:
-            prefs = get_user_regions(session['username'])
-            if prefs:
-                all_jobs = [j for j in all_jobs if j.get(
-                    'region') in set(prefs)]
+            username = session['username']
+            if not region:
+                prefs = get_user_regions(username)
+                if prefs:
+                    all_jobs = [j for j in all_jobs if j.get(
+                        'region') in set(prefs)]
+            if not keyword:
+                prefs = get_user_keywords(username)
+                if prefs:
+                    all_jobs = [j for j in all_jobs if j.get(
+                        'keyword') in set(prefs)]
+
+            user_negative_keywords = get_user_negative_keywords(username)
         except Exception:
             pass
-    if not keyword and session.get('username'):
-        try:
-            prefs = get_user_keywords(session['username'])
-            if prefs:
-                all_jobs = [j for j in all_jobs if j.get(
-                    'keyword') in set(prefs)]
-        except Exception:
-            pass
-    return jsonify(filter_jobs(all_jobs, region, keyword))
+    return jsonify(filter_jobs(all_jobs, region, keyword, negative_keywords=user_negative_keywords))
 
 
 @app.route('/job_details', methods=['GET'])
@@ -202,9 +226,9 @@ def job_details():
     if session.get('username'):
         try:
             r = set(get_user_regions(session['username']))
-            k = set(get_user_keywords(session['username']))
             if r:
                 jobs = [j for j in jobs if j.get('region') in r]
+            k = set(get_user_keywords(session['username']))
             if k:
                 jobs = [j for j in jobs if j.get('keyword') in k]
         except Exception:
@@ -230,39 +254,6 @@ def job_by_id(job_id):
     return jsonify({"error": "Job not found"}), 404
 
 
-@app.route('/cached/<job_id>', methods=['GET'])
-def serve_cached(job_id):
-    """Serve the cached HTML file for a job if available.
-
-    Uses the job record's `file_path_abs` when present, or resolves the DB `file_path` via helper.
-    Ensures the returned file is located under the configured cache directory to avoid path-traversal.
-    """
-    try:
-        from web.db import db_get_cached_abs_path
-        j = get_job_by_id(job_id)
-        if not j:
-            return "Job not found", 404
-
-        # Prefer file_path_abs, fall back to resolving the DB-stored file_path
-        abs_fp = j.get('file_path_abs') or None
-        if not abs_fp:
-            db_fp = j.get('file_path')
-            abs_fp = db_get_cached_abs_path(db_fp) if db_fp else None
-
-        if not abs_fp or not os.path.isfile(abs_fp):
-            return "Cached file not available", 404
-
-        cache_dir = os.path.abspath(get_cache_dir())
-        abs_fp = os.path.abspath(abs_fp)
-        # Ensure the file is inside the cache directory
-        if os.path.commonpath([cache_dir, abs_fp]) != cache_dir:
-            return "Forbidden", 403
-
-        return send_file(abs_fp)
-    except Exception:
-        return "Error serving cached file", 500
-
-
 @app.route('/jobs/<job_id>/favorite', methods=['POST'])
 def set_favorite(job_id):
     """Mark or unmark a job as favorite for a given user.
@@ -288,9 +279,21 @@ csrf.exempt(set_favorite)
 
 @app.route('/scrape', methods=['GET'])
 def scrape():
-    """Trigger the web scraping process."""
-    scraper()
-    return jsonify({"status": "Scraping completed"})
+    """Trigger the web scraping process with streaming output."""
+    def generate():
+        try:
+            for message in scraper():
+                yield message
+        except Exception as e:
+            yield f"Error during scraping: {str(e)}\n"
+
+    return Response(generate(), mimetype='text/plain')
+
+
+@app.route('/scrape-page', methods=['GET'])
+def scrape_page():
+    """Serve the scrape page with streaming output display."""
+    return render_template('scrape.html', title='Scrape Jobs')
 
 
 # ---------------- Auth & Admin UI ------------------------------------------
@@ -322,7 +325,7 @@ def admin_users():
     if request.method == 'POST':
         data = request.form
         username = (data.get('username') or '').strip()
-        password = data.get('password') or None
+        password = data.get('new_password') or None
         is_admin = bool(data.get('is_admin'))
         is_active = bool(data.get('is_active')) if data.get(
             'is_active') is not None else True
@@ -342,6 +345,163 @@ def admin_users():
|
|||||||
     return render_template('admin/users.html', users=users, title='Users')
+
+
+@app.route('/admin/user/<user_id>', methods=['GET', 'POST'])
+def admin_user(user_id):
+    if not require_admin():
+        return redirect(url_for('login'))
+    user = get_user_by_id(user_id)
+    if request.method == 'POST':
+        data = request.form
+        username = (data.get('username') or '').strip()
+        password = data.get('new_password')
+        is_admin = bool(data.get('is_admin'))
+        is_active = bool(data.get('is_active')) if data.get(
+            'is_active') is not None else True
+        try:
+            create_or_update_user(
+                username, password=password, is_admin=is_admin, is_active=is_active)
+            flash('User saved')
+        except Exception as e:
+            flash(f'Error: {e}')
+        return redirect(url_for('admin_users'))
+    return render_template('admin/user.html', user=user, title='User')
+
+
+@app.route('/admin/user/<user_id>/delete', methods=['POST'])
+def admin_user_delete(user_id):
+    if not require_admin():
+        return redirect(url_for('login'))
+    if delete_user_by_id(user_id):
+        flash('User deleted')
+    else:
+        flash('Error deleting user')
+    return redirect(url_for('admin_users'))
+
+
+@app.route('/admin/emails', methods=['GET', 'POST'])
+def admin_emails():
+    if not require_admin():
+        return redirect(url_for('login'))
+    if request.method == 'POST':
+        action = (request.form.get('action') or '').strip().lower()
+        email = (request.form.get('email') or '').strip()
+        try:
+            if action == 'subscribe':
+                subscribe_email(email)
+                flash('Subscription saved')
+            elif action == 'unsubscribe':
+                if unsubscribe_email(email):
+                    flash('Subscription deactivated')
+                else:
+                    flash('No matching subscription found')
+            elif action == 'reactivate':
+                subscribe_email(email)
+                flash('Subscription reactivated')
+            else:
+                flash('Unknown action')
+        except ValueError as exc:
+            flash(f'Error: {exc}')
+        except Exception as exc:
+            flash(f'Error: {exc}')
+        return redirect(url_for('admin_emails'))
+    subscriptions = list_email_subscriptions()
+
+    class Sub(dict):
+        __getattr__ = dict.get
+
+    subscription_rows = [Sub(s) for s in subscriptions]
+    active_count = sum(1 for s in subscription_rows if s.get('is_active'))
+    return render_template(
+        'admin/email.html',
+        title='Email Subscriptions',
+        subscriptions=subscription_rows,
+        total_active=active_count,
+        total=len(subscription_rows),
+    )
+
+
+@app.route('/admin/email-templates', methods=['GET', 'POST'])
+def admin_email_templates():
+    if not require_admin():
+        return redirect(url_for('login'))
+
+    if request.method == 'POST':
+        action = (request.form.get('action') or '').strip().lower()
+        template_id = request.form.get('template_id')
+        name = request.form.get('name') or ''
+        slug = request.form.get('slug') or ''
+        subject = request.form.get('subject') or ''
+        body = request.form.get('body') or ''
+        is_active = request.form.get('is_active') == 'on'
+        try:
+            if action == 'create':
+                create_email_template(
+                    name=name, slug=slug, subject=subject, body=body, is_active=is_active)
+                flash('Template created')
+            elif action == 'update':
+                update_email_template(
+                    int(template_id or 0),
+                    name=name,
+                    slug=slug or None,
+                    subject=subject,
+                    body=body,
+                    is_active=is_active,
+                )
+                flash('Template updated')
+            elif action == 'delete':
+                if delete_email_template(int(template_id or 0)):
+                    flash('Template deleted')
+                else:
+                    flash('Template not found')
+            else:
+                flash('Unknown action')
+        except ValueError as exc:
+            flash(f'Error: {exc}')
+        except Exception as exc:
+            flash(f'Error: {exc}')
+        return redirect(url_for('admin_email_templates'))
+
+    templates = list_email_templates(include_inactive=True)
+    edit_id = request.args.get('template_id', type=int)
+    editing = get_email_template(edit_id) if edit_id else None
+
+    preview_payload = None
+    preview_template = None
+    preview_id = request.args.get('preview_id', type=int)
+    if preview_id:
+        preview_template = get_email_template(preview_id)
+    if preview_template:
+        sample_jobs = [
+            {
+                'title': 'Senior Python Engineer',
+                'company': 'ACME Corp',
+                'location': 'Remote',
+                'url': 'https://example.com/jobs/1',
+            },
+            {
+                'title': 'Data Engineer',
+                'company': 'Globex',
+                'location': 'New York, NY',
+                'url': 'https://example.com/jobs/2',
+            },
+        ]
+        preview_payload = render_job_alert_email(
+            sample_jobs,
+            region='preview-region',
+            keyword='preview-keyword',
+            template_override=preview_template,
+        )
+
+    return render_template(
+        'admin/email_templates.html',
+        title='Email Templates',
+        templates=templates,
+        editing=editing,
+        preview=preview_payload,
+        preview_template=preview_template,
+    )

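Note on the Sub(dict) helper used in admin_emails() above: assigning __getattr__ = dict.get gives templates attribute-style access to plain dict rows (s.email instead of s['email']) and makes missing keys resolve to None instead of raising AttributeError. A standalone sketch of the same pattern (values here are illustrative):

    class Sub(dict):
        # Unknown attributes fall back to dict.get, so missing keys yield None.
        __getattr__ = dict.get

    row = Sub({"email": "user@example.com", "is_active": True})
    print(row.email)       # user@example.com
    print(row.created_at)  # None, not an AttributeError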

 # ---------------- User settings (regions/keywords) -------------------------

 @app.route('/settings', methods=['GET', 'POST'])
@@ -353,6 +513,8 @@ def user_settings():
     # Accept JSON or form posts. Normalize singular/plural names.
     sel_regions: list[str] = []
     sel_keywords: list[str] = []
+    sel_negative_keywords: list[str] = []
+
     if request.is_json:
         data = request.get_json(silent=True) or {}
         sel_regions = [
@@ -361,16 +523,25 @@ def user_settings():
         sel_keywords = [
             (v or '').strip() for v in (data.get('keywords') or []) if v and (v or '').strip()
         ]
+        sel_negative_keywords = [
+            (v or '').strip() for v in (data.get('negative_keywords') or []) if v and (v or '').strip()
+        ]
     else:
         # HTML form fallback: support names 'regions' or 'region', 'keywords' or 'keyword'
         r_vals = request.form.getlist(
             'regions') + request.form.getlist('region')
         k_vals = request.form.getlist(
             'keywords') + request.form.getlist('keyword')
+        nk_vals = request.form.getlist(
+            'negative_keywords') + request.form.getlist('negative_keyword')
+
         sel_regions = [(v or '').strip()
                        for v in r_vals if v and (v or '').strip()]
         sel_keywords = [(v or '').strip()
                         for v in k_vals if v and (v or '').strip()]
+        sel_negative_keywords = [(v or '').strip()
+                                 for v in nk_vals if v and (v or '').strip()]

     # Upsert any new values into master lists
     for r in sel_regions:
         try:
@@ -382,9 +553,14 @@ def user_settings():
             upsert_keyword(k)
         except Exception:
             pass
+    # Negative keywords are upserted inside set_user_negative_keywords implicitly if we wanted,
+    # but let's stick to the pattern. Actually set_user_negative_keywords calls upsert_negative_keyword.
+
     try:
         set_user_regions(username, sel_regions)
         set_user_keywords(username, sel_keywords)
+        set_user_negative_keywords(username, sel_negative_keywords)
+
         # For JSON callers, return 200 without redirect
         if request.is_json:
             return jsonify({"status": "ok"})
@@ -399,6 +575,8 @@ def user_settings():
     all_keywords = get_all_keywords()
     user_regions = get_user_regions(username)
     user_keywords = get_user_keywords(username)
+    user_negative_keywords = get_user_negative_keywords(username)
+
     return render_template(
         'user/settings.html',
         title='Your Preferences',
@@ -406,6 +584,7 @@ def user_settings():
         all_keywords=all_keywords,
         user_regions=user_regions,
         user_keywords=user_keywords,
+        user_negative_keywords=user_negative_keywords,
     )


@@ -474,6 +653,45 @@ def admin_taxonomy():
     return render_template('admin/taxonomy.html', title='Taxonomy', regions=regions, keywords=keywords)
+
+
+@app.route('/admin/stats', methods=['GET'])
+def admin_stats():
+    if not require_admin():
+        return redirect(url_for('login'))
+    # Optional filters via query params
+    keyword = request.args.get('keyword')
+    region = request.args.get('region')
+    try:
+        stats = stats_overview()
+        # For detailed jobs table, reuse get_all_jobs() and filter
+        jobs = get_all_jobs()
+        if keyword:
+            jobs = [j for j in jobs if (j.get('keyword') or '') == keyword]
+        if region:
+            jobs = [j for j in jobs if (j.get('region') or '') == region]
+    except Exception as e:
+        flash(f'Error computing stats: {e}')
+        stats = {
+            'total_jobs': 0,
+            'total_keywords': 0,
+            'total_regions': 0,
+            'jobs_per_keyword': [],
+            'jobs_per_region': []
+        }
+        jobs = []
+    return render_template('admin/stats.html', title='Statistics', stats=stats, jobs=jobs, regions=get_all_regions(), keywords=get_all_keywords())
+
+
+@app.route('/health', methods=['GET'])
+def health_check():
+    """Health check endpoint for monitoring application status."""
+    return jsonify({
+        "status": "healthy",
+        "timestamp": now_iso(),
+        "service": "jobs-scraper",
+        "version": "1.0.0"
+    }), 200
+
+
 def init():
     """Main function to run the Flask app."""
     # Ensure DB is initialized

@@ -2,42 +2,83 @@ from datetime import datetime, timezone
 from web.scraper import process_region_keyword, scrape_job_page
 from web.db import (
     db_init,
-    upsert_cached_page,
     upsert_listing,
     upsert_job_details,
     url_to_job_id,
     upsert_user_interaction,
-    db_remove_cached_url,
-    db_sync_cached_pages,
     db_get_all_job_urls,
-    db_get_cache_url,
     db_delete_job,
     remove_job,
-    normalize_cached_page_paths,
+    insert_log,
+    get_last_fetch_time,
 )
+import schedule
+import time
 # Import utility functions
 from web.utils import (
-    get_cache_dir,
+    get_base_url,
     make_request_with_retry,
-    now_iso,
-    get_cache_path,
-    cache_page,
-    is_cache_stale,
-    delete_cached_page,
-    get_cached_content,
-    ensure_cache_dir
+    get_email_settings,
 )
 from web.db import get_all_regions, get_all_keywords, seed_regions_keywords_from_listings
+from web.email_templates import render_job_alert_email
+from web.email_service import send_email
+
+
+def _negative_match_details(job_data: dict) -> tuple[str, str] | None:
+    """Return (keyword, field) when job_data indicates a negative match."""
+    if not job_data or not job_data.get("is_negative_match"):
+        return None
+    keyword = (job_data.get("negative_keyword_match") or "").strip()
+    field = (job_data.get("negative_match_field")
+             or "unknown").strip() or "unknown"
+    if not keyword:
+        keyword = "unknown keyword"
+    return keyword, field
+
+
+def _send_new_job_alert(new_jobs: list[dict]) -> tuple[bool, str]:
+    """Send an email alert for newly discovered jobs.
+
+    Returns (sent, message) where message explains why mail was skipped.
+    """
+
+    settings = get_email_settings()
+    if not settings.get("enabled"):
+        return False, "email alerts disabled"
+
+    recipients = settings.get("recipients", []) or []
+    if not recipients:
+        return False, "no recipients configured"
+
+    payload = render_job_alert_email(new_jobs)
+    send_email(
+        subject=payload.get("subject", "New jobs available"),
+        body=payload.get("body", ""),
+        to=recipients,
+        settings=settings,
+    )
+    return True, "sent"
+
+
 def fetch_listings():
-    """Fetch job listings from all regions and keywords."""
+    """Fetch job listings from all regions and keywords.
+
+    Yields progress messages and returns a dict with:
+    - discovered: total number of unique job URLs discovered
+    - new: total number of new jobs added to the database
+    - by_search: list of dicts, each containing:
+        - region: region name
+        - keyword: keyword name
+        - count: number of jobs fetched for this search
+    """
     # We'll collect URLs discovered in this run and then remove any DB listings
     # not present in this set (treat DB as reflecting current search results).
-    existing_db_urls = set(db_get_all_job_urls())
+    existing_db_urls = set(row['url'] for row in db_get_all_job_urls())
     discovered_urls = set()
     new_rows = []
+    new_jobs = []
+    search_results = []  # Track count per search
+
     # Ensure regions/keywords master lists exist
     try:
@@ -45,20 +86,64 @@ def fetch_listings():
     except Exception:
         pass
+
+    yield "Initializing database and seeding regions/keywords...\n"
+
     # Fetch listings for each region/keyword from DB
-    for region in get_all_regions():
+    regions = get_all_regions()
+    keywords = get_all_keywords()
+    total_combinations = len(regions) * len(keywords)
+    processed = 0
+
+    yield f"Found {len(regions)} regions and {len(keywords)} keywords. Processing {total_combinations} combinations...\n"
+
+    for region in regions:
         region_name = region.get("name")
         if not region_name:
             continue
-        for keyword in get_all_keywords():
+        for keyword in keywords:
             keyword_name = keyword.get("name")
             if not keyword_name:
                 continue
+            # Build a canonical search identifier for this region+keyword combination.
+            url = get_base_url().format(region=region, keyword=keyword_name.replace(" ", "+"))
+            search_page_id = f"search:{region_name}:{keyword_name}"
+            search_count = 0  # Count jobs for this search
+            try:
+                last = get_last_fetch_time(url)
+                if last is not None:
+                    # skip if fetched within the last hour
+                    age = datetime.now(
+                        timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
+                    if age.total_seconds() < 1 * 3600:
+                        yield f"Skipping {region_name} + {keyword_name} (fetched {age.seconds//3600}h ago)...\n"
+                        processed += 1
+                        continue
+            except Exception:
+                # if logging lookup fails, proceed with fetch
+                pass
+            processed += 1
+            yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
+            # record that we're fetching this search page now
+            try:
+                insert_log(url, region=region_name,
+                           keyword=keyword_name, fetched_at=datetime.now(timezone.utc))
+            except Exception:
+                pass
             for row in process_region_keyword(region_name, keyword_name, discovered_urls):
                 timestamp, region, keyword, title, pay, location, url = row
                 discovered_urls.add(url)
+                search_count += 1
                 if url not in existing_db_urls:
                     new_rows.append(row)
+                    new_jobs.append({
+                        "timestamp": timestamp,
+                        "region": region,
+                        "keyword": keyword,
+                        "title": title,
+                        "pay": pay,
+                        "location": location,
+                        "url": url,
+                    })
                 # Upsert or update listing to reflect current search result
                 upsert_listing(
                     url=url,
@@ -68,80 +153,154 @@ def fetch_listings():
                     pay=pay,
                     location=location,
                     timestamp=timestamp,
+                    fetched_from=search_page_id,
+                    fetched_at=datetime.now(timezone.utc),
                 )
+            # Record per-search count
+            search_results.append({
+                "region": region_name,
+                "keyword": keyword_name,
+                "count": search_count
+            })

-    # Remove stale listings: those present in DB but not discovered now.
-    stale_urls = existing_db_urls - discovered_urls
-    for url in stale_urls:
-        try:
-            jid = url_to_job_id(url)
-            db_delete_job(jid)
-            # Also try to remove cached file and its metadata
-            delete_cached_page(url)
-            db_remove_cached_url(url)
-        except Exception:
-            pass
-
-    return {"discovered": len(discovered_urls), "new": len(new_rows), "stale": len(stale_urls)}
+    yield f"Listing fetch complete: {len(discovered_urls)} discovered, {len(new_rows)} new,\n"
+    return {
+        "discovered": len(discovered_urls),
+        "new": len(new_rows),
+        "by_search": search_results,
+        "new_jobs": new_jobs,
+    }


-def process_job_url(job_url: str):
+def process_job_url(job_url: str, region: str = "", keyword: str = ""):
+    last = get_last_fetch_time(job_url)
+    if last is not None:
+        # skip if fetched within the last 24 hours
+        age = datetime.now(
+            timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
+        if age.total_seconds() < 24 * 3600:
+            yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n"
+            return None
+
     try:
         job_id = url_to_job_id(job_url)
-        content = None
-        cached_page = db_get_cache_url(job_url)
-        if cached_page:
-            last_modified = cached_page.get("last_modified")
-            if last_modified and not is_cache_stale(last_modified):
-                content = get_cached_content(job_url)
-            else:
-                content = make_request_with_retry(job_url, 1)
-        else:
-            content = make_request_with_retry(job_url, 1)
+        yield f"Fetching job page: {job_url}\n"
+        content = make_request_with_retry(job_url, 1)

         if content is None:
+            yield f"Failed to fetch content for {job_url}, removing from database\n"
             remove_job(job_url)
             return None

-        # refresh cache and details
-        cache_page(job_url, content)
-        upsert_cached_page(
-            file_path=get_cache_path(job_url),
-            url_guess=job_url,
-            last_modified=now_iso(),
-            size_bytes=len(content),
-            job_id=job_id
-        )
+        yield f"Scraping job data from {job_url}\n"
         job_data = scrape_job_page(content, job_url)
         if job_data:
-            upsert_job_details(job_data)
-            upsert_user_interaction(
-                job_id, seen_at=datetime.now(timezone.utc).isoformat())
+            negative_info = _negative_match_details(job_data)
+            if negative_info:
+                keyword, field = negative_info
+                yield (
+                    f"Skipping job {job_id} due to negative keyword "
+                    f"'{keyword}' in {field}\n"
+                )
+                remove_job(job_url)
+                return None
+            yield f"Upserting job details for {job_id}\n"
+            upsert_job_details(job_data, region=region, keyword=keyword)
+            yield f"Successfully processed job {job_id}: {job_data.get('title', 'Unknown')}\n"
             return job_data
+        else:
+            yield f"Failed to scrape job data from {job_url}\n"
         return None
-    except Exception:
+    except Exception as e:
+        yield f"Error processing {job_url}: {str(e)}\n"
        return None

 def scraper():
     """Main function to run the scraper."""
-    ensure_cache_dir()
+    yield "Starting scraper...\n"
     db_init()
+    yield "Database initialized\n"
+
     # First, fetch current listings from search pages and make DB reflect them.
-    jl = fetch_listings()
+    yield "Fetching listings...\n"
+    listing_summary: dict | None = None
+    fetch_iter = fetch_listings()
+    try:
+        while True:
+            message = next(fetch_iter)
+            yield message
+    except StopIteration as stop:
+        listing_summary = stop.value if isinstance(stop.value, dict) else {}

-    # Sync any cached files we have on disk into the cached_pages table.
-    db_sync_cached_pages(get_cache_dir())
-
-    # Normalize any relative cached file paths to absolute paths in DB
-    normalized = normalize_cached_page_paths()
-    if normalized:
-        pass
+    new_jobs = []
+    if listing_summary:
+        new_jobs = listing_summary.get("new_jobs", []) or []
+
+    if new_jobs:
+        yield f"Preparing email alert for {len(new_jobs)} new jobs...\n"
+        try:
+            sent, info = _send_new_job_alert(new_jobs)
+            if sent:
+                yield "Job alert email sent.\n"
+            else:
+                yield f"Skipping email alert: {info}\n"
+        except Exception as exc:
+            yield f"Failed to send job alert email: {exc}\n"

     # Finally, fetch and refresh individual job pages for current listings
-    for url in db_get_all_job_urls():
-        process_job_url(url)
+    job_urls = db_get_all_job_urls()
+    yield f"Processing {len(job_urls)} job pages...\n"
+
+    for i, url_dict in enumerate(job_urls, start=1):
+        url = url_dict.get("url")
+        region = url_dict.get("region", "")
+        keyword = url_dict.get("keyword", "")
+        if not url:
+            continue
+        yield f"\n--- Processing job {i}/{len(job_urls)} ---\n"
+        for message in process_job_url(job_url=url, region=region, keyword=keyword):
+            yield message
+
+    yield "\nScraping completed successfully!\n"
+
+
+def scrape_jobs_with_retry(max_retries=3):
+    """Run the scraping process with retry logic for failures."""
+    for attempt in range(max_retries):
+        try:
+            scraper()
+            return True
+        except Exception as e:
+            if attempt < max_retries - 1:
+                time.sleep(2 ** attempt * 10)  # Exponential backoff
+    return False
+
+
+def start_scheduler():
+    """Start the scheduler to run scraping every hour."""
+    # Clear any existing jobs
+    schedule.clear()
+
+    # Schedule scraping every hour
+    schedule.every().hour.do(scrape_jobs_with_retry)
+
+    # Run the scheduler in a loop
+    while True:
+        schedule.run_pending()
+        time.sleep(60)  # Check every minute
+
+
+def run_scheduled_scraping():
+    """Run the scheduled scraping process."""
+    try:
+        scrape_jobs_with_retry()
+    except Exception as e:
+        pass
+
+
+# Initialize scheduler when module is imported
+schedule.every().hour.do(run_scheduled_scraping)
+
 if __name__ == "__main__":
     scraper()
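Note: scraper() drains fetch_listings() with next() inside a while loop so it can capture the generator's return value from StopIteration.value; a plain for loop would silently discard that summary dict. A standalone sketch of the same pattern (function names are illustrative, not from this diff):

    # Sketch: reading a generator's return value via StopIteration.value.
    def produce():
        yield "step 1\n"
        yield "step 2\n"
        return {"discovered": 2, "new": 1}

    def consume():
        gen = produce()
        summary = None
        try:
            while True:
                print(next(gen), end="")
        except StopIteration as stop:
            summary = stop.value if isinstance(stop.value, dict) else {}
        return summary

    print(consume())  # {'discovered': 2, 'new': 1}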
759 web/db.py
@@ -4,27 +4,24 @@ from __future__ import annotations

 Tables:
 - users(user_id PK, username UNIQUE, created_at)
-- cached_pages(file_path PK, url_guess, last_modified, size_bytes, job_id)
 - job_listings(job_id PK, url UNIQUE, region, keyword, title, pay, location, timestamp)
-- job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url)
+- job_descriptions(job_id PK FK -> job_listings, title, company, location, description, posted_time, url, reply_url)
 - user_interactions(job_id PK FK -> job_listings, user_id FK -> users, seen_at, url_visited, is_user_favorite)
 - regions(region_id PK, name UNIQUE)
 - keywords(keyword_id PK, name UNIQUE)
 - user_regions(user_id FK -> users, region_id FK -> regions, composite PK)
 - user_keywords(user_id FK -> users, keyword_id FK -> keywords, composite PK)
+- logs(id PK, page_url, region, keyword, fetched_at)
 """

 from datetime import datetime, UTC
-import os
 from typing import Optional, Dict, Any, List
+import re
 from web.utils import (
-    get_url_from_filename,
     get_color_from_string,
     url_to_job_id,
     normalize_job_id,
     now_iso,
-    get_cache_path,
-    get_cache_dir,
     get_mysql_config,
 )

@@ -86,8 +83,6 @@ class JobListing(Base):

     description = relationship(
         "JobDescription", back_populates="listing", uselist=False, cascade="all, delete-orphan")
-    cached_pages = relationship(
-        "CachedPage", back_populates="listing", cascade="all, delete-orphan")
     interactions = relationship(
         "UserInteraction", back_populates="listing", cascade="all, delete-orphan")

@@ -102,32 +97,277 @@ class JobDescription(Base):
     description = Column(Text)
     posted_time = Column(String(TIME_LEN))
     url = Column(String(URL_LEN))
+    reply_url = Column(String(URL_LEN))
+    contact_email = Column(String(SHORT_LEN))
+    contact_phone = Column(String(SHORT_LEN))
+    contact_name = Column(String(SHORT_LEN))

     listing = relationship("JobListing", back_populates="description")


-class CachedPage(Base):
-    __tablename__ = "cached_pages"
-    file_path = Column(String(FILE_PATH_LEN), primary_key=True)
-    url_guess = Column(String(URL_LEN))
-    last_modified = Column(String(TIME_LEN))
-    size_bytes = Column(Integer)
-    job_id = Column(String(JOB_ID_LEN), ForeignKey(
-        "job_listings.job_id", ondelete="CASCADE"))
-
-    listing = relationship("JobListing", back_populates="cached_pages")
-
-    @property
-    def abs_path(self) -> Optional[str]:
-        """Return the absolute filesystem path for this cached page.
-
-        The DB stores `file_path` relative to the configured cache dir. This
-        helper centralizes resolution so callers can use `cached_page.abs_path`.
-        """
-        fp = getattr(self, 'file_path', None)
-        if not fp:
-            return None
-        return os.path.join(os.path.abspath(get_cache_dir()), fp)
+def _normalize_email(value: Optional[str]) -> str:
+    if not value or not isinstance(value, str):
+        return ""
+    return value.strip().lower()
+
+
+def subscribe_email(email: str) -> bool:
+    """Add or reactivate an email subscription."""
+    address = _normalize_email(email)
+    if not address:
+        raise ValueError("email address required")
+    with _ensure_session() as session:
+        existing = session.execute(
+            text(
+                "SELECT subscription_id, is_active FROM email_subscriptions WHERE email = :e"
+            ),
+            {"e": address},
+        ).fetchone()
+        now = datetime.now(UTC)
+        if existing:
+            session.execute(
+                text(
+                    "UPDATE email_subscriptions SET is_active = 1, updated_at = :u WHERE subscription_id = :sid"
+                ),
+                {"u": now, "sid": existing[0]},
+            )
+        else:
+            session.execute(
+                text(
+                    "INSERT INTO email_subscriptions(email, is_active, created_at, updated_at) "
+                    "VALUES(:e, 1, :u, :u)"
+                ),
+                {"e": address, "u": now},
+            )
+        session.commit()
+    return True
+
+
+def unsubscribe_email(email: str) -> bool:
+    """Deactivate an email subscription."""
+    address = _normalize_email(email)
+    if not address:
+        raise ValueError("email address required")
+    with _ensure_session() as session:
+        now = datetime.now(UTC)
+        result = session.execute(
+            text(
+                "UPDATE email_subscriptions SET is_active = 0, updated_at = :u WHERE email = :e"
+            ),
+            {"u": now, "e": address},
+        )
+        session.commit()
+        rowcount = getattr(result, "rowcount", None)
+        if rowcount is None:
+            return False
+        return rowcount > 0
+
+
+def list_email_subscriptions(*, active_only: bool = False) -> List[Dict[str, Any]]:
+    """Return subscription rows as dicts."""
+    query = "SELECT subscription_id, email, is_active, created_at, updated_at FROM email_subscriptions"
+    params: Dict[str, Any] = {}
+    if active_only:
+        query += " WHERE is_active = 1"
+    query += " ORDER BY email"
+    with _ensure_session() as session:
+        rows = session.execute(text(query), params).fetchall()
+    result: List[Dict[str, Any]] = []
+    for row in rows:
+        result.append(
+            {
+                "subscription_id": row[0],
+                "email": row[1],
+                "is_active": bool(row[2]),
+                "created_at": row[3],
+                "updated_at": row[4],
+            }
+        )
+    return result
+
+
+def get_active_email_recipients() -> List[str]:
+    """Return list of active subscription email addresses."""
+    return [s["email"] for s in list_email_subscriptions(active_only=True)]
+
+
+def _normalize_slug(value: Optional[str]) -> str:
+    if not value:
+        return ""
+    slug = re.sub(r"[^a-zA-Z0-9-]+", "-", value.strip().lower())
+    slug = re.sub(r"-+", "-", slug).strip("-")
+    return slug
+
+
+def _template_to_dict(template: EmailTemplate) -> Dict[str, Any]:
+    created = getattr(template, "created_at", None)
+    updated = getattr(template, "updated_at", None)
+    return {
+        "template_id": template.template_id,
+        "slug": template.slug,
+        "name": template.name,
+        "subject": template.subject,
+        "body": template.body,
+        "is_active": bool(template.is_active),
+        "created_at": created.isoformat() if isinstance(created, datetime) else created,
+        "updated_at": updated.isoformat() if isinstance(updated, datetime) else updated,
+    }
+
+
+def list_email_templates(*, include_inactive: bool = True) -> List[Dict[str, Any]]:
+    with _ensure_session() as session:
+        query = session.query(EmailTemplate)
+        if not include_inactive:
+            query = query.filter(EmailTemplate.is_active.is_(True))
+        items = query.order_by(EmailTemplate.name.asc()).all()
+        return [_template_to_dict(obj) for obj in items]
+
+
+def get_email_template(template_id: int) -> Optional[Dict[str, Any]]:
+    if not template_id:
+        return None
+    with _ensure_session() as session:
+        obj = session.get(EmailTemplate, int(template_id))
+        return _template_to_dict(obj) if obj else None
+
+
+def get_email_template_by_slug(slug: str) -> Optional[Dict[str, Any]]:
+    normalized = _normalize_slug(slug)
+    if not normalized:
+        return None
+    with _ensure_session() as session:
+        obj = session.query(EmailTemplate).filter(
+            EmailTemplate.slug == normalized).one_or_none()
+        return _template_to_dict(obj) if obj else None
+
+
+def create_email_template(
+    *,
+    name: str,
+    subject: str,
+    body: str,
+    slug: Optional[str] = None,
+    is_active: bool = True,
+) -> Dict[str, Any]:
+    name_clean = (name or "").strip()
+    if not name_clean:
+        raise ValueError("Template name is required")
+    subject_clean = (subject or "").strip()
+    if not subject_clean:
+        raise ValueError("Template subject is required")
+    body_clean = (body or "").strip()
+    if not body_clean:
+        raise ValueError("Template body is required")
+
+    slug_clean = _normalize_slug(slug or name_clean)
+    if not slug_clean:
+        raise ValueError("Template slug is required")
+
+    with _ensure_session() as session:
+        existing = session.query(EmailTemplate).filter(
+            EmailTemplate.slug == slug_clean).one_or_none()
+        if existing:
+            raise ValueError("A template with this slug already exists")
+        template = EmailTemplate(
+            name=name_clean,
+            slug=slug_clean,
+            subject=subject_clean,
+            body=body_clean,
+            is_active=bool(is_active),
+        )
+        session.add(template)
+        session.commit()
+        session.refresh(template)
+        return _template_to_dict(template)
+
+
+def update_email_template(
+    template_id: int,
+    *,
+    name: Optional[str] = None,
+    subject: Optional[str] = None,
+    body: Optional[str] = None,
+    slug: Optional[str] = None,
+    is_active: Optional[bool] = None,
+) -> Dict[str, Any]:
+    if not template_id:
+        raise ValueError("template_id is required")
+    with _ensure_session() as session:
+        template = session.get(EmailTemplate, int(template_id))
+        if template is None:
+            raise ValueError("Template not found")
+        if name is not None:
+            name_clean = name.strip()
+            if not name_clean:
+                raise ValueError("Template name is required")
+            setattr(template, "name", name_clean)
+        if subject is not None:
+            subject_clean = subject.strip()
+            if not subject_clean:
+                raise ValueError("Template subject is required")
+            setattr(template, "subject", subject_clean)
+        if body is not None:
+            body_clean = body.strip()
+            if not body_clean:
+                raise ValueError("Template body is required")
+            setattr(template, "body", body_clean)
+        if slug is not None:
+            slug_clean = _normalize_slug(slug)
+            if not slug_clean:
+                raise ValueError("Template slug is required")
+            existing = (
+                session.query(EmailTemplate)
+                .filter(EmailTemplate.slug == slug_clean, EmailTemplate.template_id != template.template_id)
+                .one_or_none()
+            )
+            if existing:
+                raise ValueError("A template with this slug already exists")
+            setattr(template, "slug", slug_clean)
+        if is_active is not None:
+            setattr(template, "is_active", bool(is_active))
+        template.touch()
+        session.commit()
+        session.refresh(template)
+        return _template_to_dict(template)
+
+
+def delete_email_template(template_id: int) -> bool:
+    if not template_id:
+        return False
+    with _ensure_session() as session:
+        template = session.get(EmailTemplate, int(template_id))
+        if template is None:
+            return False
+        session.delete(template)
+        session.commit()
+        return True
+
+
+def ensure_default_email_template() -> None:
+    try:
+        from web.email_templates import DEFAULT_JOB_ALERT_SUBJECT, DEFAULT_JOB_ALERT_BODY
+    except Exception:
+        DEFAULT_JOB_ALERT_SUBJECT = "{count_label}{scope}"
+        DEFAULT_JOB_ALERT_BODY = (
+            "Hi,\n\n{intro_line}\n{jobs_message}\n\nGenerated at {timestamp} UTC.\n"
+            "You are receiving this message because job alerts are enabled.\n"
+        )
+    try:
+        with _ensure_session() as session:
+            existing = session.query(EmailTemplate).filter(
+                EmailTemplate.slug == "job-alert").one_or_none()
+            if existing is None:
+                template = EmailTemplate(
+                    name="Job Alert",
+                    slug="job-alert",
+                    subject=DEFAULT_JOB_ALERT_SUBJECT,
+                    body=DEFAULT_JOB_ALERT_BODY,
+                    is_active=True,
+                )
+                session.add(template)
+                session.commit()
+    except Exception:
+        pass
+
+
 class UserInteraction(Base):

@@ -176,6 +416,58 @@ class UserKeyword(Base):
"keywords.keyword_id", ondelete="CASCADE"), primary_key=True)
|
"keywords.keyword_id", ondelete="CASCADE"), primary_key=True)
|
||||||
|
|
||||||
|
|
||||||
|
class NegativeKeyword(Base):
|
||||||
|
__tablename__ = "negative_keywords"
|
||||||
|
keyword_id = Column(Integer, primary_key=True, autoincrement=True)
|
||||||
|
name = Column(String(SHORT_LEN), unique=True, nullable=False)
|
||||||
|
|
||||||
|
|
||||||
|
class UserNegativeKeyword(Base):
|
||||||
|
__tablename__ = "user_negative_keywords"
|
||||||
|
user_id = Column(Integer, ForeignKey(
|
||||||
|
"users.user_id", ondelete="CASCADE"), primary_key=True)
|
||||||
|
keyword_id = Column(Integer, ForeignKey(
|
||||||
|
"negative_keywords.keyword_id", ondelete="CASCADE"), primary_key=True)
|
||||||
|
|
||||||
|
|
||||||
|
class Log(Base):
|
||||||
|
__tablename__ = "logs"
|
||||||
|
id = Column(Integer, primary_key=True, autoincrement=True)
|
||||||
|
page_url = Column(String(URL_LEN))
|
||||||
|
region = Column(String(SHORT_LEN))
|
||||||
|
keyword = Column(String(SHORT_LEN))
|
||||||
|
fetched_at = Column(DateTime)
|
||||||
|
|
||||||
|
|
||||||
|
class EmailSubscription(Base):
|
||||||
|
__tablename__ = "email_subscriptions"
|
||||||
|
subscription_id = Column(Integer, primary_key=True, autoincrement=True)
|
||||||
|
email = Column(String(SHORT_LEN), unique=True, nullable=False)
|
||||||
|
is_active = Column(Boolean, default=True, nullable=False)
|
||||||
|
created_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||||
|
updated_at = Column(DateTime, default=datetime.utcnow, nullable=False)
|
||||||
|
|
||||||
|
def touch(self):
|
||||||
|
setattr(self, "updated_at", datetime.utcnow())
|
||||||
|
|
||||||
|
|
||||||
|
class EmailTemplate(Base):
|
||||||
|
__tablename__ = "email_templates"
|
||||||
|
template_id = Column(Integer, primary_key=True, autoincrement=True)
|
||||||
|
slug = Column(String(SHORT_LEN), unique=True, nullable=False)
|
||||||
|
name = Column(String(SHORT_LEN), nullable=False)
|
||||||
|
subject = Column(Text, nullable=False)
|
||||||
|
body = Column(Text, nullable=False)
|
||||||
|
is_active = Column(Boolean, default=True, nullable=False)
|
||||||
|
created_at = Column(
|
||||||
|
DateTime, default=lambda: datetime.now(UTC), nullable=False)
|
||||||
|
updated_at = Column(
|
||||||
|
DateTime, default=lambda: datetime.now(UTC), nullable=False)
|
||||||
|
|
||||||
|
def touch(self):
|
||||||
|
setattr(self, "updated_at", datetime.now(UTC))
|
||||||
|
|
||||||
|
|
||||||
def _ensure_session() -> Session:
|
def _ensure_session() -> Session:
|
||||||
global engine, SessionLocal
|
global engine, SessionLocal
|
||||||
if engine is None or SessionLocal is None:
|
if engine is None or SessionLocal is None:
|
||||||
@@ -223,6 +515,31 @@ def db_init():
|
|||||||
text("ALTER TABLE users ADD COLUMN IF NOT EXISTS last_login DATETIME NULL"))
|
text("ALTER TABLE users ADD COLUMN IF NOT EXISTS last_login DATETIME NULL"))
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
try:
|
||||||
|
conn.execute(text(
|
||||||
|
"ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS reply_url VARCHAR(512) NULL"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
conn.execute(text(
|
||||||
|
"ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS contact_email VARCHAR(255) NULL"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
conn.execute(text(
|
||||||
|
"ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS contact_phone VARCHAR(255) NULL"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
conn.execute(text(
|
||||||
|
"ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS contact_name VARCHAR(255) NULL"))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
try:
|
||||||
|
ensure_default_email_template()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
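Note: the try/except blocks added to db_init() above lean on MariaDB/MySQL's ADD COLUMN IF NOT EXISTS so initialization can be re-run safely; on engines that reject the clause the statement fails and the except swallows it. A compact sketch of the same idempotent-migration idea (the connection URL and column list are illustrative):

    # Sketch: idempotent column additions along the lines of db_init().
    from sqlalchemy import create_engine, text

    def add_columns_if_missing(engine_url: str) -> None:
        engine = create_engine(engine_url)
        statements = [
            "ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS reply_url VARCHAR(512) NULL",
            "ALTER TABLE job_descriptions ADD COLUMN IF NOT EXISTS contact_email VARCHAR(255) NULL",
        ]
        with engine.begin() as conn:
            for stmt in statements:
                try:
                    conn.execute(text(stmt))
                except Exception:
                    # The statement may fail on engines without IF NOT EXISTS;
                    # ignore so re-running initialization stays safe.
                    pass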
 def upsert_user_interaction(job_id: str | int, *, user_id: Optional[int] = None, seen_at: Optional[str] = None, url_visited: Optional[str] = None, is_user_favorite: Optional[bool] = None):
@@ -247,7 +564,7 @@ def upsert_user_interaction(job_id: str | int, *, user_id: Optional[int] = None,
         session.commit()


-def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str, location: str, timestamp: str):
+def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str, location: str, timestamp: str, fetched_from: str | None = None, fetched_at: Optional[datetime] = None):
     """Insert or update a job listing row based on job_id derived from URL."""
     job_id = str(url_to_job_id(url))
     with _ensure_session() as session:
@@ -263,19 +580,74 @@ def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str,
setattr(obj, "location", location)
|
setattr(obj, "location", location)
|
||||||
setattr(obj, "timestamp", timestamp)
|
setattr(obj, "timestamp", timestamp)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
# Optionally record a fetch log for the listing if source provided
|
||||||
|
if fetched_from:
|
||||||
|
try:
|
||||||
|
insert_log(fetched_from, region=region, keyword=keyword,
|
||||||
|
fetched_at=fetched_at or datetime.now())
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def upsert_job_details(job_data: Dict[str, Any]):
|
def insert_log(page_url: str, region: str | None = None, keyword: str | None = None, fetched_at: Optional[datetime] = None):
|
||||||
"""Upsert into job_descriptions table using scraped job details dict."""
|
"""Insert a log row for a fetched page."""
|
||||||
|
fetched_at = fetched_at or datetime.now()
|
||||||
|
with _ensure_session() as session:
|
||||||
|
l = Log(page_url=page_url, region=region or '',
|
||||||
|
keyword=keyword or '', fetched_at=fetched_at)
|
||||||
|
session.add(l)
|
||||||
|
session.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def get_last_fetch_time(page_url: str) -> Optional[datetime]:
|
||||||
|
"""Return the latest fetched_at for a given page_url, or None if never fetched."""
|
||||||
|
with _ensure_session() as session:
|
||||||
|
row = session.execute(text("SELECT fetched_at FROM logs WHERE page_url = :u ORDER BY fetched_at DESC LIMIT 1"), {
|
||||||
|
"u": page_url}).fetchone()
|
||||||
|
if row and row[0]:
|
||||||
|
return row[0]
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def upsert_job_details(job_data: Dict[str, Any], region: str = "", keyword: str = ""):
|
||||||
|
"""Upsert into job_descriptions table using scraped job details dict.
|
||||||
|
|
||||||
|
Behavior additions:
|
||||||
|
- If the provided job `url` has a log entry with fetched_at less than 24 hours ago,
|
||||||
|
the function will skip updating to avoid unnecessary work.
|
||||||
|
- On successful upsert, a log entry is recorded with `insert_log(url, ...)`.
|
||||||
|
"""
|
||||||
|
if not job_data or job_data.get("is_negative_match"):
|
||||||
|
return
|
||||||
|
|
||||||
url = job_data.get("url")
|
url = job_data.get("url")
|
||||||
job_id = normalize_job_id(job_data.get("id"), url)
|
job_id = normalize_job_id(job_data.get("id"), url)
|
||||||
if not job_id:
|
if not job_id:
|
||||||
return
|
return
|
||||||
|
# Skip if job page was fetched recently (24 hours)
|
||||||
|
try:
|
||||||
|
if isinstance(url, str) and url:
|
||||||
|
last = get_last_fetch_time(url)
|
||||||
|
if last is not None:
|
||||||
|
# normalize tz-awareness
|
||||||
|
from datetime import timezone as _tz
|
||||||
|
now = datetime.now(_tz.utc)
|
||||||
|
last_dt = last if getattr(
|
||||||
|
last, 'tzinfo', None) is not None else last.replace(tzinfo=_tz.utc)
|
||||||
|
if (now - last_dt).total_seconds() < 24 * 3600:
|
||||||
|
return
|
||||||
|
except Exception:
|
||||||
|
# if log lookup fails, proceed normally
|
||||||
|
pass
|
||||||
title = job_data.get("title") or None
|
title = job_data.get("title") or None
|
||||||
company = job_data.get("company") or None
|
company = job_data.get("company") or None
|
||||||
location = job_data.get("location") or None
|
location = job_data.get("location") or None
|
||||||
description = job_data.get("description") or None
|
description = job_data.get("description") or None
|
||||||
posted_time = job_data.get("posted_time") or None
|
posted_time = job_data.get("posted_time") or None
|
||||||
|
reply_url = job_data.get("reply_url") or None
|
||||||
|
contact_email = job_data.get("contact_email") or None
|
||||||
|
contact_phone = job_data.get("contact_phone") or None
|
||||||
|
contact_name = job_data.get("contact_name") or None
|
||||||
|
|
||||||
job_id = str(job_id)
|
job_id = str(job_id)
|
||||||
with _ensure_session() as session:
|
with _ensure_session() as session:
|
||||||
@@ -289,142 +661,20 @@ def upsert_job_details(job_data: Dict[str, Any]):
|
|||||||
setattr(obj, "description", description)
|
setattr(obj, "description", description)
|
||||||
setattr(obj, "posted_time", posted_time)
|
setattr(obj, "posted_time", posted_time)
|
||||||
setattr(obj, "url", url)
|
setattr(obj, "url", url)
|
||||||
|
setattr(obj, "reply_url", reply_url)
|
||||||
|
setattr(obj, "contact_email", contact_email)
|
||||||
|
setattr(obj, "contact_phone", contact_phone)
|
||||||
|
setattr(obj, "contact_name", contact_name)
|
||||||
session.commit()
|
session.commit()
|
||||||
|
# Record that we fetched/updated this job page
|
||||||
|
|
||||||
def upsert_cached_page(*, file_path: str, url_guess: Optional[str], last_modified: Optional[str], size_bytes: Optional[int], job_id: Optional[int]):
|
|
||||||
# Store file paths relative to the cache directory (keeps DB portable)
|
|
||||||
cache_dir = os.path.abspath(get_cache_dir())
|
|
||||||
abs_fp = os.path.abspath(file_path)
|
|
||||||
rel_fp = os.path.relpath(abs_fp, start=cache_dir)
|
|
||||||
with _ensure_session() as session:
|
|
||||||
obj = session.get(CachedPage, rel_fp)
|
|
||||||
if obj is None:
|
|
||||||
obj = CachedPage(file_path=rel_fp)
|
|
||||||
session.add(obj)
|
|
||||||
setattr(obj, "url_guess", url_guess)
|
|
||||||
setattr(obj, "last_modified", last_modified)
|
|
||||||
setattr(obj, "size_bytes", size_bytes)
|
|
||||||
setattr(obj, "job_id", str(job_id) if job_id else None)
|
|
||||||
session.commit()
|
|
||||||
|
|
||||||
|
|
||||||
def remove_cached_page(file_path: str):
|
|
||||||
# Accept absolute or relative input but DB keys are stored relative to cache dir
|
|
||||||
cache_dir = os.path.abspath(get_cache_dir())
|
|
||||||
abs_fp = os.path.abspath(file_path)
|
|
||||||
rel_fp = os.path.relpath(abs_fp, start=cache_dir)
|
|
||||||
-    with _ensure_session() as session:
-        obj = session.get(CachedPage, rel_fp)
-        if obj:
-            session.delete(obj)
-            session.commit()


-def db_remove_cached_url(url: str):
-    """Remove a cached page by URL."""
-    # Compute absolute path for the URL and delegate to remove_cached_page
-    abs_fp = os.path.abspath(get_cache_path(url))
    try:
-        remove_cached_page(abs_fp)
+        if isinstance(url, str) and url:
+            insert_log(url, region=None, keyword=None,
+                       fetched_at=datetime.now())
    except Exception:
        pass


-def db_get_all_cached_pages() -> List[Dict[str, Any]]:
-    with _ensure_session() as session:
-        rows = session.execute(text(
-            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
-        out = []
-        for row in rows:
-            fp = row[0]
-            out.append({
-                "file_path": fp,
-                "file_path_abs": db_get_cached_abs_path(fp) if fp else None,
-                "url_guess": row[1],
-                "last_modified": row[2],
-                "size_bytes": row[3],
-                "job_id": row[4],
-            })
-        return out


-def db_get_cache_url(url: str):
-    """Return the data for a specific URL from cached_pages.

-    Arguments:
-    url -- The URL to look up in the cache.
-    """
-    with _ensure_session() as session:
-        row = session.execute(text(
-            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages WHERE url_guess = :u"), {"u": url}).fetchone()
-        if not row:
-            return None
-        fp = row[0]
-        return {
-            "file_path": fp,
-            "file_path_abs": db_get_cached_abs_path(fp) if fp else None,
-            "url_guess": row[1],
-            "last_modified": row[2],
-            "size_bytes": row[3],
-            "job_id": row[4],
-        }


-def db_sync_cached_pages(cache_dir: str):
-    """Scan cache_dir and upsert page metadata into cached_pages table."""
-    if not os.path.isdir(cache_dir):
-        return
-    abs_cache = os.path.abspath(cache_dir)
-    # read existing DB keys once for quick membership tests
-    db_cache_paths = {c["file_path"] for c in db_get_all_cached_pages()}
-    for root, _, files in os.walk(abs_cache):
-        for name in files:
-            if not name.lower().endswith(".html"):
-                continue
-            fp = os.path.abspath(os.path.join(root, name))
-            rel_fp = os.path.relpath(fp, start=abs_cache)
-            if rel_fp in db_cache_paths:
-                continue
-            try:
-                stat = os.stat(fp)
-                mtime = datetime.fromtimestamp(stat.st_mtime).isoformat()
-                size = stat.st_size
-            except OSError:
-                mtime = None
-                size = None
-            url_guess = get_url_from_filename(name)
-            job_id = url_to_job_id(url_guess)
-            upsert_cached_page(file_path=rel_fp, url_guess=url_guess,
-                               last_modified=mtime, size_bytes=size, job_id=job_id)


-def normalize_cached_page_paths() -> int:
-    """Ensure all cached_pages.file_path values are absolute. Returns number of rows updated/normalized."""
-    # Convert any absolute paths in DB to relative paths (relative to cache dir)
-    changed = 0
-    abs_cache = os.path.abspath(get_cache_dir())
-    with _ensure_session() as session:
-        rows = session.execute(text(
-            "SELECT file_path, url_guess, last_modified, size_bytes, job_id FROM cached_pages")).fetchall()
-    for (fp, url_guess, last_modified, size_bytes, job_id) in rows:
-        if os.path.isabs(fp):
-            rel_fp = os.path.relpath(fp, start=abs_cache)
-            upsert_cached_page(
-                file_path=rel_fp,
-                url_guess=url_guess,
-                last_modified=last_modified,
-                size_bytes=size_bytes,
-                job_id=job_id,
-            )
-            with _ensure_session() as session:
-                session.execute(
-                    text("DELETE FROM cached_pages WHERE file_path = :fp"), {"fp": fp})
-                session.commit()
-            changed += 1
-    return changed


def db_get_keywords() -> List[str]:
    """Return a list of all unique keywords from job listings."""
    with _ensure_session() as session:
@@ -453,15 +703,10 @@ SELECT l.job_id
,l.timestamp
,d.posted_time
,l.url
-,c.file_path
-,c.last_modified
-,c.url_guess
-,CASE WHEN c.url_guess != l.url THEN 1 ELSE 0 END AS url_guess_stale
FROM job_listings AS l
INNER JOIN job_descriptions AS d
ON l.job_id = d.job_id
AND l.url = d.url
-LEFT JOIN cached_pages AS c ON l.job_id = c.job_id
ORDER BY d.posted_time DESC
"""
    with _ensure_session() as session:
@@ -479,32 +724,21 @@ ORDER BY d.posted_time DESC
                "timestamp": row[7],
                "posted_time": row[8],
                "url": row[9],
-                # file_path is stored relative to cache dir; provide both forms
-                "file_path": row[10],
-                "file_path_abs": os.path.join(os.path.abspath(get_cache_dir()), row[10]) if row[10] else None,
-                "last_modified": row[11],
-                "url_guess": row[12],
-                "url_guess_stale": row[13],
            }
            jobs.append(job)
        return jobs


-def db_get_cached_abs_path(db_file_path: Optional[str]) -> Optional[str]:
-    """Return absolute cache file path given a DB-stored (relative) file_path.
+def db_get_all_job_urls() -> List[dict]:
+    """Return list of job URLs from job_listings.

-    Returns None if input is falsy.
+    Returns:
+    - List of dicts with keys: url, region, keyword
    """
-    if not db_file_path:
-        return None
-    return os.path.join(os.path.abspath(get_cache_dir()), db_file_path)
-
-
-def db_get_all_job_urls() -> List[str]:
-    """Return list of job URLs from job_listings."""
    with _ensure_session() as session:
-        rows = session.execute(text("SELECT url FROM job_listings")).fetchall()
-        return [r[0] for r in rows]
+        rows = session.execute(
+            text("SELECT url, region, keyword FROM job_listings")).fetchall()
+        return [{"url": r[0], "region": r[1], "keyword": r[2]} for r in rows]


def db_delete_job(job_id: str | int):
@@ -522,10 +756,6 @@ def remove_job(url):
    try:
        jid = url_to_job_id(url)
        db_delete_job(jid)
-        cache_fp = get_cache_path(url)
-        remove_cached_page(os.path.abspath(cache_fp))
-        if os.path.exists(cache_fp):
-            os.remove(cache_fp)
    except Exception:
        pass

@@ -534,7 +764,8 @@ def remove_job(url):

def get_or_create_user(username: str) -> int:
    """Return user_id for username, creating if missing."""
-    created_at = datetime.now(UTC).isoformat()
+    # 2025-08-30T16:04:29.660245+00:00 is wrong. should be 2025-08-30T16:04:29
+    created_at = datetime.now(UTC).isoformat().split('.')[0]
    with _ensure_session() as session:
        row = session.execute(
            text("SELECT user_id FROM users WHERE username = :u"), {
@@ -654,22 +885,54 @@ def get_user(username: str) -> Optional[Dict[str, Any]]:
    """Return single user dict or None."""
    with _ensure_session() as session:
        row = session.execute(text(
-            "SELECT user_id, username, is_admin, is_active, password_hash, last_login, created_at FROM users WHERE username = :u"
+            "SELECT user_id, username, created_at, is_admin, is_active, last_login, (password_hash IS NOT NULL) AS has_pw FROM users WHERE username = :u"
        ), {"u": username}).fetchone()
        if not row:
            return None
        return {
            "user_id": int(row[0]),
            "username": row[1],
-            "is_admin": bool(row[2]),
-            "is_active": bool(row[3]),
-            "password_hash": row[4],
+            "created_at": row[2].isoformat() if isinstance(row[2], datetime) else (row[2] or None),
+            "is_admin": bool(row[3]),
+            "is_active": bool(row[4]),
            "last_login": row[5].isoformat() if row[5] else None,
-            "created_at": row[6].isoformat() if isinstance(row[6], datetime) else (row[6] or None),
+            "has_password": bool(row[6]),
        }


+def get_user_by_id(user_id: int) -> Optional[Dict[str, Any]]:
+    """Return single user dict or None."""
+    with _ensure_session() as session:
+        row = session.execute(text(
+            "SELECT user_id, username, created_at, is_admin, is_active, last_login, (password_hash IS NOT NULL) AS has_pw FROM users WHERE user_id = :u"
+        ), {"u": user_id}).fetchone()
+        if not row:
+            return None
+        return {
+            "user_id": int(row[0]),
+            "username": row[1],
+            "created_at": row[2].isoformat() if isinstance(row[2], datetime) else (row[2] or None),
+            "is_admin": bool(row[3]),
+            "is_active": bool(row[4]),
+            "last_login": row[5].isoformat() if row[5] else None,
+            "has_password": bool(row[6]),
+        }
+
+
+def delete_user_by_id(user_id: int) -> bool:
+    with _ensure_session() as session:
+        result = session.execute(
+            text("DELETE FROM users WHERE user_id = :u"), {"u": user_id})
+        session.commit()
+        rc = getattr(result, 'rowcount', None)
+        if rc is None:
+            # Unable to determine rowcount; assume success if no exception
+            return True
+        return rc > 0

# ---------------- Regions/Keywords helpers ---------------------------------


def upsert_region(name: str) -> int:
    """Get or create a region by name; return region_id."""
    name = (name or "").strip()
@@ -713,6 +976,27 @@ def upsert_keyword(name: str) -> int:
    return upsert_keyword(name)


+def upsert_negative_keyword(name: str) -> int:
+    """Get or create a negative keyword by name; return keyword_id."""
+    name = (name or "").strip().lower()
+    if not name:
+        raise ValueError("Negative keyword cannot be empty")
+    with _ensure_session() as session:
+        row = session.execute(text("SELECT keyword_id FROM negative_keywords WHERE name = :n"), {
+            "n": name}).fetchone()
+        if row:
+            return int(row[0])
+        session.execute(
+            text("INSERT INTO negative_keywords(name) VALUES (:n)"), {"n": name})
+        session.commit()
+    with _ensure_session() as session:
+        row2 = session.execute(text("SELECT keyword_id FROM negative_keywords WHERE name = :n"), {
+            "n": name}).fetchone()
+        if row2:
+            return int(row2[0])
+    return upsert_negative_keyword(name)
+
+
def set_user_regions(username: str, region_names: List[str]) -> None:
    """Replace user's preferred regions with given names."""
    user_id = get_or_create_user(username)
@@ -771,6 +1055,34 @@ def set_user_keywords(username: str, keyword_names: List[str]) -> None:
        session.commit()


+def set_user_negative_keywords(username: str, keyword_names: List[str]) -> None:
+    """Replace user's negative keywords with given names."""
+    user_id = get_or_create_user(username)
+    names = sorted({(n or "").strip().lower()
+                    for n in keyword_names if (n or "").strip()})
+    keyword_ids: List[int] = [upsert_negative_keyword(n) for n in names]
+    if not keyword_ids and not names:
+        with _ensure_session() as session:
+            session.execute(
+                text("DELETE FROM user_negative_keywords WHERE user_id = :u"), {"u": user_id})
+            session.commit()
+        return
+    desired = set(keyword_ids)
+    with _ensure_session() as session:
+        rows = session.execute(text("SELECT keyword_id FROM user_negative_keywords WHERE user_id = :u"), {
+            "u": user_id}).fetchall()
+        current = set(int(r[0]) for r in rows)
+        to_add = desired - current
+        to_remove = current - desired
+        for kid in to_remove:
+            session.execute(text("DELETE FROM user_negative_keywords WHERE user_id = :u AND keyword_id = :k"), {
+                "u": user_id, "k": int(kid)})
+        for kid in to_add:
+            session.execute(text("INSERT INTO user_negative_keywords(user_id, keyword_id) VALUES(:u, :k)"), {
+                "u": user_id, "k": int(kid)})
+        session.commit()
+
+
def get_user_regions(username: str) -> List[Dict[str, str]]:
    """Return preferred region names for a user (empty if none)."""
    with _ensure_session() as session:
@@ -811,6 +1123,26 @@ def get_user_keywords(username: str) -> List[Dict[str, str]]:
        return [{"name": r[0], "color": r[1]} for r in rows]


+def get_user_negative_keywords(username: str) -> List[str]:
+    """Return negative keyword names for a user (empty if none)."""
+    with _ensure_session() as session:
+        row = session.execute(text("SELECT user_id FROM users WHERE username = :u"), {
+            "u": username}).fetchone()
+        if not row:
+            return []
+        user_id = int(row[0])
+        rows = session.execute(text(
+            """
+            SELECT k.name
+            FROM negative_keywords k
+            INNER JOIN user_negative_keywords uk ON uk.keyword_id = k.keyword_id
+            WHERE uk.user_id = :u
+            ORDER BY k.name ASC
+            """
+        ), {"u": user_id}).fetchall()
+        return [r[0] for r in rows]
+
+
def get_all_regions() -> List[Dict[str, str]]:
    """Return all region names from regions table (sorted)."""
    with _ensure_session() as session:
@@ -941,3 +1273,44 @@ def change_keyword_color(keyword_id: int, new_color: str) -> bool:
        except Exception:
            session.rollback()
            return False
+
+
+def stats_overview() -> Dict[str, Any]:
+    """Return an overview of job DB statistics.
+
+    Returns a dict with keys:
+    - total_jobs: int
+    - total_keywords: int (distinct keywords in listings)
+    - total_regions: int (distinct regions in listings)
+    - jobs_per_keyword: List[{"keyword": str, "count": int}]
+    - jobs_per_region: List[{"region": str, "count": int}]
+    """
+    with _ensure_session() as session:
+        total_jobs = session.execute(text(
+            "SELECT COUNT(*) FROM job_listings l INNER JOIN job_descriptions d ON l.job_id = d.job_id AND l.url = d.url"
+        )).scalar_one()
+        total_keywords = session.execute(text(
+            "SELECT COUNT(DISTINCT keyword) FROM job_listings WHERE keyword IS NOT NULL AND keyword != ''"
+        )).scalar_one()
+        total_regions = session.execute(text(
+            "SELECT COUNT(DISTINCT region) FROM job_listings WHERE region IS NOT NULL AND region != ''"
+        )).scalar_one()
+
+        rows = session.execute(text(
+            "SELECT COALESCE(keyword, '') AS keyword, COUNT(*) as cnt FROM job_listings l INNER JOIN job_descriptions d ON l.job_id = d.job_id AND l.url = d.url GROUP BY keyword ORDER BY cnt DESC"
+        )).fetchall()
+        jobs_per_keyword = [
+            {"keyword": r[0], "count": int(r[1])} for r in rows]
+
+        rows = session.execute(text(
+            "SELECT COALESCE(region, '') AS region, COUNT(*) as cnt FROM job_listings l INNER JOIN job_descriptions d ON l.job_id = d.job_id AND l.url = d.url GROUP BY region ORDER BY cnt DESC"
+        )).fetchall()
+        jobs_per_region = [{"region": r[0], "count": int(r[1])} for r in rows]
+
+        return {
+            "total_jobs": int(total_jobs or 0),
+            "total_keywords": int(total_keywords or 0),
+            "total_regions": int(total_regions or 0),
+            "jobs_per_keyword": jobs_per_keyword,
+            "jobs_per_region": jobs_per_region,
+        }
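A minimal sketch of how the new stats_overview() helper could back the statistics page added below. Only the helper and the stats keys come from this diff; the Flask route body and the app object are assumptions.

from flask import render_template
from web.db import stats_overview


@app.route("/admin/stats")  # assumed route registration
def admin_stats():
    # stats_overview() returns total_jobs, total_keywords, total_regions,
    # jobs_per_keyword and jobs_per_region, which admin/stats.html renders.
    return render_template("admin/stats.html", stats=stats_overview())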
130
web/email_service.py
Normal file
@@ -0,0 +1,130 @@
"""Email sending utilities for the jobs scraper."""

from __future__ import annotations

from email.message import EmailMessage
from typing import Iterable, Sequence
import smtplib

from web.utils import get_email_settings


class EmailConfigurationError(RuntimeError):
    """Raised when email settings are missing or invalid."""


class EmailDeliveryError(RuntimeError):
    """Raised when an email fails to send."""


def _normalize_addresses(addresses: Sequence[str] | str | None) -> list[str]:
    if not addresses:
        return []
    if isinstance(addresses, str):
        items = [addresses]
    else:
        items = list(addresses)
    cleaned: list[str] = []
    seen: set[str] = set()
    for raw in items:
        if not isinstance(raw, str):
            continue
        addr = raw.strip()
        if not addr:
            continue
        lower = addr.lower()
        if lower in seen:
            continue
        seen.add(lower)
        cleaned.append(addr)
    return cleaned


def _ensure_recipients(*recipient_groups: Iterable[str]) -> list[str]:
    merged: list[str] = []
    seen: set[str] = set()
    for group in recipient_groups:
        for addr in group:
            lower = addr.lower()
            if lower in seen:
                continue
            seen.add(lower)
            merged.append(addr)
    if not merged:
        raise EmailConfigurationError(
            "At least one recipient address is required")
    return merged


def send_email(
    *,
    subject: str,
    body: str,
    to: Sequence[str] | str,
    cc: Sequence[str] | str | None = None,
    bcc: Sequence[str] | str | None = None,
    reply_to: Sequence[str] | str | None = None,
    settings: dict | None = None,
) -> bool:
    """Send an email using configured SMTP settings.

    Returns True when a message is sent, False when email is disabled.
    Raises EmailConfigurationError for invalid config and EmailDeliveryError for SMTP failures.
    """

    config = settings or get_email_settings()
    if not config.get("enabled"):
        return False

    smtp_cfg = config.get("smtp", {})
    host = (smtp_cfg.get("host") or "").strip()
    if not host:
        raise EmailConfigurationError("SMTP host is not configured")

    port = int(smtp_cfg.get("port", 587) or 587)
    timeout = int(smtp_cfg.get("timeout", 30) or 30)
    use_ssl = bool(smtp_cfg.get("use_ssl", False))
    use_tls = bool(smtp_cfg.get("use_tls", True))

    from_address = (config.get("from_address")
                    or smtp_cfg.get("username") or "").strip()
    if not from_address:
        raise EmailConfigurationError("From address is not configured")

    to_list = _normalize_addresses(to)
    cc_list = _normalize_addresses(cc)
    bcc_list = _normalize_addresses(bcc)
    reply_to_list = _normalize_addresses(reply_to)
    all_recipients = _ensure_recipients(to_list, cc_list, bcc_list)

    message = EmailMessage()
    message["Subject"] = subject
    message["From"] = from_address
    message["To"] = ", ".join(to_list)
    if cc_list:
        message["Cc"] = ", ".join(cc_list)
    if reply_to_list:
        message["Reply-To"] = ", ".join(reply_to_list)
    message.set_content(body)

    username = (smtp_cfg.get("username") or "").strip()
    password = smtp_cfg.get("password") or ""

    client_cls = smtplib.SMTP_SSL if use_ssl else smtplib.SMTP

    try:
        with client_cls(host=host, port=port, timeout=timeout) as client:
            client.ehlo()
            if use_tls and not use_ssl:
                client.starttls()
                client.ehlo()
            if username:
                client.login(username, password)
            client.send_message(message, from_addr=from_address,
                                to_addrs=all_recipients)
    except EmailConfigurationError:
        raise
    except Exception as exc:  # pragma: no cover - network errors depend on env
        raise EmailDeliveryError(str(exc)) from exc

    return True
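A short usage sketch for the helper above. The settings dict mirrors the keys send_email() reads; every address and credential here is a placeholder, not a project default.

from web.email_service import (
    send_email,
    EmailConfigurationError,
    EmailDeliveryError,
)

settings = {
    "enabled": True,
    "from_address": "alerts@example.com",  # placeholder sender
    "smtp": {
        "host": "smtp.example.com",
        "port": 587,
        "use_tls": True,
        "username": "alerts@example.com",
        "password": "app-password",  # placeholder credential
    },
}

try:
    sent = send_email(subject="3 new jobs", body="...",
                      to="me@example.com", settings=settings)
except EmailConfigurationError as exc:
    print(f"email misconfigured: {exc}")
except EmailDeliveryError as exc:
    print(f"SMTP delivery failed: {exc}")
else:
    print("sent" if sent else "email disabled")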
106
web/email_templates.py
Normal file
@@ -0,0 +1,106 @@
"""Email templates for job notifications."""

from __future__ import annotations

from datetime import datetime, UTC
from typing import Iterable, Mapping, Dict, Any

DEFAULT_DATETIME_FORMAT = "%Y-%m-%d %H:%M"
DEFAULT_JOB_ALERT_SUBJECT = "{count_label}{scope}"
DEFAULT_JOB_ALERT_BODY = (
    "Hi,\n\n{intro_line}{jobs_section}\n\nGenerated at {timestamp} UTC.\n"
    "You are receiving this message because job alerts are enabled.\n"
)


class _SafeDict(dict):
    def __missing__(self, key: str) -> str:
        return ""


def _format_template(template: str, context: Dict[str, Any]) -> str:
    safe_context = _SafeDict(
        {k: ("\n".join(str(v) for v in context[k]) if isinstance(
            context[k], list) else context[k]) for k in context}
    )
    return template.format_map(safe_context)


def render_job_alert_email(
    jobs: Iterable[Mapping[str, object]],
    *,
    region: str | None = None,
    keyword: str | None = None,
    generated_at: datetime | None = None,
    template_override: Mapping[str, str] | None = None,
) -> dict[str, Any]:
    """Render the subject/body for a job alert email.

    Returns a dict with subject/body strings and the context used to render them.
    """

    job_list = list(jobs)
    generated_at = generated_at or datetime.now(UTC)
    timestamp = generated_at.strftime(DEFAULT_DATETIME_FORMAT)

    scope_parts = []
    if region:
        scope_parts.append(f"region: {region}")
    if keyword:
        scope_parts.append(f"keyword: {keyword}")
    scope = " (" + ", ".join(scope_parts) + ")" if scope_parts else ""

    job_lines: list[str] = []
    for index, job in enumerate(job_list, start=1):
        title = str(job.get("title", "Untitled"))
        company = str(job.get("company", "Unknown company"))
        location = str(job.get("location", "N/A"))
        url = str(job.get("url", ""))
        line = f"{index}. {title} — {company} ({location})"
        job_lines.append(line)
        if url:
            job_lines.append(f" {url}")

    if job_lines:
        jobs_section = "\n" + "\n".join(job_lines)
    else:
        jobs_section = "\nNo jobs matched this alert."
    jobs_message = jobs_section.strip()
    context: Dict[str, Any] = {
        "count": len(job_list),
        "count_label": "No new jobs" if not job_list else f"{len(job_list)} new jobs",
        "scope": scope,
        "region": region or "",
        "keyword": keyword or "",
        "timestamp": timestamp,
        "generated_at": generated_at,
        "intro_line": "Here are the latest jobs discovered by the scraper:",
        "jobs_message": jobs_message,
        "jobs_section": jobs_section,
        "jobs_lines": job_lines,
        "has_jobs": bool(job_list),
    }

    template = template_override
    if template is None:
        try:
            from web.db import get_email_template_by_slug

            template = get_email_template_by_slug("job-alert")
        except Exception:
            template = None

    template_subject = (template or {}).get(
        "subject") or DEFAULT_JOB_ALERT_SUBJECT
    template_body = (template or {}).get("body") or DEFAULT_JOB_ALERT_BODY

    subject = _format_template(template_subject, context)
    body = _format_template(template_body, context)

    result = {
        "subject": subject,
        "body": body,
        "context": context,
        "template_slug": (template or {}).get("slug", "job-alert"),
    }
    return result
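Render-then-send is the intended flow: web/email_templates.py builds the subject and body, web/email_service.py delivers them. A rough sketch of wiring the two together; the job record and recipient are illustrative, only the title/company/location/url keys read above are assumed.

from web.email_templates import render_job_alert_email
from web.email_service import send_email

jobs = [
    {"title": "Backend Developer", "company": "Acme", "location": "Berlin",
     "url": "https://example.com/jobs/123"},  # illustrative job record
]
rendered = render_job_alert_email(jobs, region="berlin", keyword="python")
send_email(subject=rendered["subject"], body=rendered["body"],
           to=["alerts@example.com"])  # placeholder recipient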
133
web/scraper.py
@@ -1,7 +1,82 @@
from datetime import datetime, UTC
from bs4 import BeautifulSoup
from typing import List, Dict, Set
-from web.utils import get_base_url, cache_page, safe_get_text, safe_get_attr, is_cached, get_cached_content, make_request_with_retry
+from urllib.parse import urlparse, parse_qs
+import re
+from web.utils import (
+    get_base_url,
+    safe_get_text,
+    safe_get_attr,
+    make_request_with_retry,
+    get_negative_keywords,
+)
+
+
+def extract_contact_info(reply_url) -> Dict[str, str]:
+    """Extract contact information from reply URL.
+
+    Parses mailto links, phone links, and contact form URLs to extract:
+    - email: Email address (from mailto links)
+    - phone: Phone number (from tel links or URL parameters)
+    - contact_name: Contact person name (if available in URL parameters)
+
+    Returns a dict with email, phone, and contact_name keys (values may be "N/A").
+    """
+    contact_info = {
+        "email": "N/A",
+        "phone": "N/A",
+        "contact_name": "N/A"
+    }
+
+    # Handle None or empty cases
+    if not reply_url or reply_url == "N/A":
+        return contact_info
+
+    reply_url = str(reply_url).strip()
+    if not reply_url or reply_url == "N/A":
+        return contact_info
+
+    try:
+        # Check for mailto links
+        if reply_url.startswith("mailto:"):
+            email_part = reply_url.replace("mailto:", "")
+            # Extract email (may contain ?subject=...)
+            email = email_part.split("?")[0]
+            contact_info["email"] = email
+            return contact_info
+
+        # Check for tel links
+        if reply_url.startswith("tel:"):
+            phone = reply_url.replace("tel:", "")
+            contact_info["phone"] = phone
+            return contact_info
+
+        # Parse as URL
+        if reply_url.startswith("http"):
+            parsed = urlparse(reply_url)
+            params = parse_qs(parsed.query)
+
+            # Try to extract email from parameters
+            for key in ["email", "from_email", "sender_email", "contact_email"]:
+                if key in params:
+                    contact_info["email"] = params[key][0]
+                    break
+
+            # Try to extract phone from parameters
+            for key in ["phone", "tel", "telephone"]:
+                if key in params:
+                    contact_info["phone"] = params[key][0]
+                    break
+
+            # Try to extract contact name from parameters
+            for key in ["contact_name", "from_name", "name"]:
+                if key in params:
+                    contact_info["contact_name"] = params[key][0]
+                    break
+    except Exception:
+        pass
+
+    return contact_info


def scrape_listings_page(listing, region: str, keyword: str, seen_urls: Set[str]) -> List:
@@ -40,6 +115,16 @@ def scrape_job_page(content: str, url: str) -> Dict:
    """Scrape job details from a job listing page."""
    soup = BeautifulSoup(content, "html.parser")

+    # Extract reply button
+    reply_button = soup.find("button", class_="reply-button")
+    if reply_button:
+        reply_url = safe_get_attr(reply_button, "data-href")
+    else:
+        reply_url = "N/A"
+
+    # Extract contact information from reply URL
+    contact_info = extract_contact_info(reply_url)
+
    # Extract each field
    title = safe_get_text(soup.find("h1", class_="postingtitle"))
    company = safe_get_text(soup.find("h2", class_="company-name"))
@@ -80,6 +165,30 @@ def scrape_job_page(content: str, url: str) -> Dict:
        job_id = ""
        posted_time = ""

+    # Negative keyword detection
+    negative_keyword_match = None
+    negative_match_field = None
+    negative_keywords = get_negative_keywords()
+    if negative_keywords:
+        fields_to_check = {
+            "title": title or "",
+            "company": company or "",
+            "location": location or "",
+            "description": description or "",
+        }
+        for keyword in negative_keywords:
+            if not keyword:
+                continue
+            pattern = re.compile(
+                r"\b" + re.escape(keyword) + r"\b", re.IGNORECASE)
+            for field_name, field_value in fields_to_check.items():
+                if field_value and pattern.search(field_value):
+                    negative_keyword_match = keyword
+                    negative_match_field = field_name
+                    break
+            if negative_keyword_match:
+                break
+
    return {
        "url": url,
        "title": title,
@@ -87,7 +196,14 @@ def scrape_job_page(content: str, url: str) -> Dict:
        "location": location,
        "description": description,
        "id": job_id,
-        "posted_time": posted_time
+        "posted_time": posted_time,
+        "reply_url": reply_url,
+        "contact_email": contact_info["email"],
+        "contact_phone": contact_info["phone"],
+        "contact_name": contact_info["contact_name"],
+        "negative_keyword_match": negative_keyword_match,
+        "negative_match_field": negative_match_field,
+        "is_negative_match": bool(negative_keyword_match),
    }


@@ -108,14 +224,7 @@ def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]
def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
    """Process a single region and keyword."""
    url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
-    if is_cached(url):
-        content = get_cached_content(url)
-        cache_status = "CACHED"
-    else:
-        content = make_request_with_retry(url, 3)
-        if content is None:
-            return []
-        cache_page(url, content)
-        cache_status = "FETCHED"
-    _ = cache_status  # no-op to silence unused var
+    content = make_request_with_retry(url, 1)
+    if content is None:
+        return []
    return scrape_job_data(content, region, keyword, seen_urls)
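For reference, a small sketch of what the new extract_contact_info() returns for typical reply links; the URLs are made up, and the expected results simply follow the code above.

from web.scraper import extract_contact_info

# mailto: links yield the address before any ?subject=... suffix
print(extract_contact_info("mailto:hr@example.com?subject=Job"))
# {'email': 'hr@example.com', 'phone': 'N/A', 'contact_name': 'N/A'}

# contact-form URLs are mined for known query parameters
print(extract_contact_info("https://example.com/reply?phone=555-0100&from_name=Sam"))
# {'email': 'N/A', 'phone': '555-0100', 'contact_name': 'Sam'}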
@@ -41,12 +41,16 @@ function scrape(event) {
  event.preventDefault(); // Prevent the default form submission
  updateScrapeInfo("Scraping in progress...", "blue");
  fetch("/scrape")
-    .then((response) => response.json())
+    // expect HTML response containing "Scraping completed successfully!"
+    .then((response) => response.text())
    .then((data) => {
-      if (data.status) {
-        updateScrapeInfo(data.status, "green");
+      if (data.includes("Scraping completed successfully!")) {
+        updateScrapeInfo("Scraping completed successfully!", "green");
      } else {
-        updateScrapeInfo("Scraping failed. Please try again.", "red");
+        updateScrapeInfo(
+          "Scraping failed or timed out. Please try again.",
+          "red"
+        );
      }
    })
    .catch((error) => console.error("Error:", error));
44
web/static/scrape.js
Normal file
@@ -0,0 +1,44 @@
function startScrape() {
  const output = document.getElementById("output");
  const startButton = document.getElementById("start-scrape");

  output.textContent = "Starting scrape...\n";
  startButton.disabled = true;
  startButton.textContent = "Scraping...";

  fetch("/scrape")
    .then((response) => {
      const reader = response.body.getReader();
      const decoder = new TextDecoder();

      function readStream() {
        reader
          .read()
          .then(({ done, value }) => {
            if (done) {
              output.textContent += "\nScraping completed!";
              startButton.disabled = false;
              startButton.textContent = "Start Scraping";
              return;
            }

            const chunk = decoder.decode(value, { stream: true });
            output.textContent += chunk;
            output.scrollTop = output.scrollHeight;
            readStream();
          })
          .catch((error) => {
            output.textContent += `\nError: ${error.message}`;
            startButton.disabled = false;
            startButton.textContent = "Start Scraping";
          });
      }

      readStream();
    })
    .catch((error) => {
      output.textContent = `Error starting scrape: ${error.message}`;
      startButton.disabled = false;
      startButton.textContent = "Start Scraping";
    });
}
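scrape.js consumes the /scrape response as a stream, so progress only appears if the server writes its output incrementally. A hedged sketch of what such a Flask route could look like; none of this route code is in the diff, and pairs_to_scrape() is a hypothetical helper.

from flask import Response, stream_with_context


@app.route("/scrape")  # assumed registration on the existing app object
def scrape_stream():
    def progress():
        yield "Starting scrape...\n"
        for region, keyword in pairs_to_scrape():  # hypothetical helper
            yield f"Scraping {region} / {keyword}\n"
        # the form handler above checks the response text for this phrase
        yield "Scraping completed successfully!\n"

    return Response(stream_with_context(progress()), mimetype="text/plain")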
@@ -1,4 +1,22 @@
/* javascript form handling */
+document.addEventListener("DOMContentLoaded", function () {
+  const newNkInput = document.getElementById("new-negative-keyword");
+  if (newNkInput) {
+    newNkInput.addEventListener("input", function () {
+      const val = this.value.trim();
+      const existing = Array.from(
+        document.querySelectorAll('input[name="negative_keyword"]')
+      ).map((el) => el.value);
+      if (existing.includes(val)) {
+        this.setCustomValidity("Keyword already exists");
+        this.reportValidity();
+      } else {
+        this.setCustomValidity("");
+      }
+    });
+  }
+});
+
document
  .getElementById("user-settings-form")
  .addEventListener("submit", function (event) {
@@ -10,11 +28,15 @@ document
    // Collect selected regions and keywords
    const selectedRegions = [];
    const selectedKeywords = [];
+    const selectedNegativeKeywords = [];
+
    formData.forEach((value, key) => {
      if (key === "region") {
        selectedRegions.push(value);
      } else if (key === "keyword") {
        selectedKeywords.push(value);
+      } else if (key === "negative_keyword") {
+        selectedNegativeKeywords.push(value);
      }
    });

@@ -30,10 +52,21 @@ document
      selectedKeywords.push(newKeyword);
    }

+    // Add new negative keyword if provided
+    const newNegativeKeyword = formData.get("new-negative-keyword").trim();
+    if (newNegativeKeyword) {
+      if (selectedNegativeKeywords.includes(newNegativeKeyword)) {
+        alert("Negative keyword already exists!");
+        return;
+      }
+      selectedNegativeKeywords.push(newNegativeKeyword);
+    }
+
    // Prepare data to send
    const dataToSend = {
      regions: selectedRegions,
      keywords: selectedKeywords,
+      negative_keywords: selectedNegativeKeywords,
      csrf_token: formData.get("csrf_token"),
    };

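The settings form now posts a negative_keywords list alongside regions and keywords. A rough sketch of matching server-side handling that reuses the db helpers added earlier; the route name, session-based auth lookup, and request shape are assumptions.

from flask import request, jsonify, session
from web.db import (
    set_user_regions,
    set_user_keywords,
    set_user_negative_keywords,
)


@app.route("/user/settings", methods=["POST"])  # assumed route
def save_user_settings():
    data = request.get_json() or {}
    username = session["username"]  # assumed auth lookup
    set_user_regions(username, data.get("regions", []))
    set_user_keywords(username, data.get("keywords", []))
    set_user_negative_keywords(username, data.get("negative_keywords", []))
    return jsonify({"status": "ok"})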
62
web/templates/admin/email.html
Normal file
@@ -0,0 +1,62 @@
{% extends 'base.html' %} {% block content %}
<h2>Email Subscriptions</h2>
<section>
  <h3>Add Subscription</h3>
  <form method="post">
    <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
    <input type="hidden" name="action" value="subscribe" />
    <label for="email">Email address</label>
    <input
      type="email"
      id="email"
      name="email"
      placeholder="alerts@example.com"
      required
    />
    <button type="submit">Subscribe</button>
  </form>
</section>
<section>
  <h3>Current Recipients</h3>
  {% if not subscriptions %}
  <p>No subscriptions yet. Add one above to start sending alerts.</p>
  <p>You can customize alert content from the <a href="{{ url_for('admin_email_templates') }}">Email Templates</a> page.</p>
  {% else %}
  <p>{{ total_active }} active of {{ total }} total.</p>
  <table>
    <thead>
      <tr>
        <th>Email</th>
        <th>Status</th>
        <th>Created</th>
        <th>Updated</th>
        <th>Action</th>
      </tr>
    </thead>
    <tbody>
      {% for sub in subscriptions %}
      <tr>
        <td>{{ sub.email }}</td>
        <td>{{ 'Active' if sub.is_active else 'Inactive' }}</td>
        <td>{{ sub.created_at }}</td>
        <td>{{ sub.updated_at }}</td>
        <td>
          <form method="post" style="display: inline-flex; gap: 0.5rem">
            <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
            <input type="hidden" name="email" value="{{ sub.email }}" />
            {% if sub.is_active %}
            <input type="hidden" name="action" value="unsubscribe" />
            <button type="submit">Deactivate</button>
            {% else %}
            <input type="hidden" name="action" value="reactivate" />
            <button type="submit">Reactivate</button>
            {% endif %}
          </form>
        </td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
  {% endif %}
</section>
{% endblock %}
102
web/templates/admin/email_templates.html
Normal file
@@ -0,0 +1,102 @@
{% extends 'base.html' %}
{% block content %}
<h2>Email Templates</h2>
<section>
  <h3>Available Templates</h3>
  {% if not templates %}
  <p>No templates found. Create one below to get started.</p>
  {% else %}
  <table>
    <thead>
      <tr>
        <th>Name</th>
        <th>Slug</th>
        <th>Status</th>
        <th>Updated</th>
        <th>Actions</th>
      </tr>
    </thead>
    <tbody>
      {% for template in templates %}
      <tr>
        <td>{{ template.name }}</td>
        <td>{{ template.slug }}</td>
        <td>{{ 'Active' if template.is_active else 'Inactive' }}</td>
        <td>{{ template.updated_at or template.created_at or '' }}</td>
        <td style="display: flex; gap: 0.5rem;">
          <a class="button" href="{{ url_for('admin_email_templates', template_id=template.template_id) }}">Edit</a>
          <a class="button" href="{{ url_for('admin_email_templates', preview_id=template.template_id) }}">Preview</a>
          <form method="post" onsubmit="return confirm('Delete template {{ template.name }}?');">
            <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
            <input type="hidden" name="action" value="delete" />
            <input type="hidden" name="template_id" value="{{ template.template_id }}" />
            <button type="submit">Delete</button>
          </form>
        </td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
  {% endif %}
</section>
<section>
  <h3>{{ 'Edit Template' if editing else 'Create Template' }}</h3>
  <form method="post">
    <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
    <input type="hidden" name="action" value="{{ 'update' if editing else 'create' }}" />
    {% if editing %}
    <input type="hidden" name="template_id" value="{{ editing.template_id }}" />
    {% endif %}
    <div>
      <label for="name">Name</label>
      <input type="text" id="name" name="name" value="{{ editing.name if editing else '' }}" required />
    </div>
    <div>
      <label for="slug">Slug</label>
      <input type="text" id="slug" name="slug" placeholder="job-alert" value="{{ editing.slug if editing else '' }}" />
      <small>Leave blank to reuse the name. Slug must be URL friendly (letters, numbers, dashes).</small>
    </div>
    <div>
      <label for="subject">Subject Template</label>
      <input type="text" id="subject" name="subject" value="{{ editing.subject if editing else '' }}" required />
    </div>
    <div>
      <label for="body">Body Template</label>
      <textarea id="body" name="body" rows="12" required>{{ editing.body if editing else '' }}</textarea>
    </div>
    <div>
      <label>
        <input type="checkbox" name="is_active" {% if editing is none or editing.is_active %}checked{% endif %} />
        Active
      </label>
    </div>
    <button type="submit">{{ 'Update Template' if editing else 'Create Template' }}</button>
    {% if editing %}
    <a class="button" href="{{ url_for('admin_email_templates') }}">Cancel</a>
    {% endif %}
  </form>
  <aside>
    <h4>Available placeholders</h4>
    <ul>
      <li><code>{count}</code> – number of jobs in the alert</li>
      <li><code>{count_label}</code> – "No new jobs" or "X new jobs"</li>
      <li><code>{scope}</code> – formatted region/keyword context</li>
      <li><code>{region}</code>, <code>{keyword}</code></li>
      <li><code>{timestamp}</code> – formatted timestamp</li>
      <li><code>{jobs_section}</code> – newline-prefixed block of job entries</li>
      <li><code>{jobs_message}</code> – jobs block without leading newline</li>
    </ul>
  </aside>
</section>
{% if preview %}
<section>
  <h3>Preview: {{ preview_template.name if preview_template else 'Job Alert' }}</h3>
  <article>
    <h4>Subject</h4>
    <pre>{{ preview.subject }}</pre>
    <h4>Body</h4>
    <pre>{{ preview.body }}</pre>
  </article>
</section>
{% endif %}
{% endblock %}
46
web/templates/admin/stats.html
Normal file
@@ -0,0 +1,46 @@
{% extends 'base.html' %} {% block content %}
<div id="admin-stats">
  <h2>Database Statistics</h2>
  <div class="stats-summary">
    <p><strong>Total jobs:</strong> {{ stats.total_jobs }}</p>
    <p><strong>Total keywords:</strong> {{ stats.total_keywords }}</p>
    <p><strong>Total regions:</strong> {{ stats.total_regions }}</p>
  </div>

  <h3>Jobs per keyword</h3>
  <table>
    <thead>
      <tr>
        <th>Keyword</th>
        <th>Count</th>
      </tr>
    </thead>
    <tbody>
      {% for row in stats.jobs_per_keyword %}
      <tr>
        <td>{{ row.keyword or '(empty)' }}</td>
        <td>{{ row.count }}</td>
      </tr>
      {% endfor %}
    </tbody>
  </table>

  <h3>Jobs per region</h3>
  <table>
    <thead>
      <tr>
        <th>Region</th>
        <th>Count</th>
      </tr>
    </thead>
    <tbody>
      {% for row in stats.jobs_per_region %}
      <tr>
        <td>{{ row.region or '(empty)' }}</td>
        <td>{{ row.count }}</td>
      </tr>
      {% endfor %}
    </tbody>
  </table>
</div>
{% endblock %}
114
web/templates/admin/user.html
Normal file
@@ -0,0 +1,114 @@
{% extends 'base.html' %} {% block content %}
<div id="user-details">
  {% if not user %}
  <h2>Create new user</h2>
  <form id="new-user-form" method="post" action="{{ url_for('admin_users') }}">
    <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
    <div id="user-info">
      <p>
        <strong>Username:</strong>
        <input type="text" name="username" required />
      </p>
      <p>
        <strong>Password:</strong>
        <input type="password" name="password" required />
      </p>
      <p>
        <strong>Admin:</strong>
        <input type="checkbox" name="is_admin" />
      </p>
      <p>
        <strong>Active:</strong>
        <input type="checkbox" name="is_active" />
      </p>
      <button type="submit">Create User</button>
    </div>
  </form>
  {% else %}
  <h2>User {{ user.username }}</h2>
  <form
    id="user-form"
    method="post"
    action="{{ url_for('admin_user', user_id=user.user_id) }}"
  >
    <input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
    <input type="hidden" name="user_id" value="{{ user.user_id }}" />
    <input type="hidden" name="username" value="{{ user.username }}" />
    <div id="user-info">
      <p><strong>ID:</strong> {{ user.user_id }}</p>
      <p><strong>Username:</strong> {{ user.username }}</p>
      <p><strong>Created At:</strong> {{ user.created_at }}</p>
      <p><strong>Last Login:</strong> {{ user.last_login }}</p>
      <p>
        <strong>Admin:</strong>
        <input type="checkbox" name="is_admin" {{ 'checked' if user.is_admin
        else '' }} />
      </p>
      <p>
        <strong>Active:</strong>
        <input type="checkbox" name="is_active" {{ 'checked' if user.is_active
        else '' }} />
      </p>
      <p>
        <strong>Has Password:</strong> {{ '✅' if user.has_password else '❌' }}
      </p>
      <p>
        <strong>New Password:</strong>
        <input type="password" id="new_password" name="new_password" />
      </p>
      <button type="submit">Save</button>
    </div>
  </form>
</div>
<script>
  const userForm = document.getElementById("user-form");

  userForm.addEventListener("submit", function (event) {
    const userId = document.getElementById("user_id").value;
    event.preventDefault(); // Prevent the default form submission
    updateUser(userId);
  });

  function updateUser(userId) {
    const passwordInput = document.getElementById("new_password");
    const formData = userForm.elements;
    const username = formData.username.value;
    const password = passwordInput.value;
    const isAdmin = formData.is_admin.checked;
    const isActive = formData.is_active.checked;
    const hasPassword = passwordInput.value.trim() !== "";

    fetch("/admin/user/" + userId, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "X-CSRF-Token": formData.csrf_token.value,
      },
      body: JSON.stringify({
        user_id: userId,
        password: password,
        username: username,
        is_admin: isAdmin,
        is_active: isActive,
      }),
    })
      .then((response) => {
        if (response.ok) {
          alert("User updated successfully");
          // Clear the password field after successful update
          passwordInput.value = "";
          // Set 'has_password' indicator
          userForm.querySelector('input[name="has_password"]').value =
            hasPassword ? "✅" : "❌";
        } else {
          alert("Error updating user");
        }
      })
      .catch((error) => {
        console.error("Error:", error);
        alert("Error updating user");
      });
  }
</script>

{% endif %} {% endblock %} {% block footer_scripts %} {% endblock %}
@@ -1,139 +1,100 @@
|
|||||||
{% extends 'base.html' %} {% block content %}
|
{% extends 'base.html' %} {% block content %}
|
||||||
<div id="users">
|
<div id="users">
|
||||||
<h2>Users</h2>
|
<h2>Users</h2>
|
||||||
<form id="user-form" method="post" action="{{ url_for('admin_users') }}">
|
|
||||||
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
|
||||||
<table>
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th>ID</th>
|
|
||||||
<th>Username</th>
|
|
||||||
<th>Admin</th>
|
|
||||||
<th>Active</th>
|
|
||||||
<th colspan="2">Password</th>
|
|
||||||
<th>Created</th>
|
|
||||||
<th>Last Login</th>
|
|
||||||
<th></th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody>
|
|
||||||
{% for u in users %}
|
|
||||||
<tr class="user-row" data-user-id="{{ u.user_id }}">
|
|
||||||
<td>
|
|
||||||
{{ u.user_id }}<input
|
|
||||||
type="hidden"
|
|
||||||
name="user_id"
|
|
||||||
value="{{ u.user_id }}"
|
|
||||||
/>
|
|
||||||
</td>
|
|
||||||
<td>
|
|
||||||
<input
|
|
||||||
type="text"
|
|
||||||
name="username"
|
|
||||||
value="{{ u.username }}"
|
|
||||||
required
|
|
||||||
/>
|
|
||||||
</td>
|
|
||||||
<td>
|
|
||||||
<input type="checkbox" name="is_admin" {{ 'checked' if u.is_admin
|
|
||||||
else '' }} />
|
|
||||||
</td>
|
|
||||||
<td>
|
|
||||||
<input type="checkbox" name="is_active" {{ 'checked' if u.is_active
|
|
||||||
else '' }} />
|
|
||||||
</td>
|
|
||||||
<td>{{ '✅' if u.has_password else '❌' }}</td>
|
|
||||||
<td><input type="password" name="password" /></td>
|
|
||||||
<td>{{ u.created_at }}</td>
|
|
||||||
<td>{{ u.last_login or 'never' }}</td>
|
|
||||||
<td>
|
|
||||||
<button type="submit" data-user-id="{{ u.user_id }}">Save</button>
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
{% endfor %}
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
</form>
|
|
||||||
</div>
|
|
||||||
<h3>Create / Update User</h3>
|
|
||||||
<form
|
|
||||||
id="create-update-user-form"
|
|
||||||
method="post"
|
|
||||||
action="{{ url_for('admin_users') }}"
|
|
||||||
>
|
|
||||||
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
<input type="hidden" name="csrf_token" value="{{ csrf_token() }}" />
|
||||||
<label>Username <input type="text" name="username" required /></label>
|
<table>
|
||||||
<label>Password <input type="password" name="password" /></label>
|
<thead>
|
||||||
<label>Admin <input type="checkbox" name="is_admin" value="1" /></label>
|
<tr>
|
||||||
<label
|
<th>ID</th>
|
||||||
>Active <input type="checkbox" name="is_active" value="1" checked
|
<th>Username</th>
|
||||||
/></label>
|
<th>Admin</th>
|
||||||
<button type="submit">Save</button>
|
<th>Active</th>
|
||||||
</form>
|
<th>Password</th>
|
||||||
|
<th>Created</th>
|
||||||
|
<th>Last Login</th>
|
||||||
|
|
||||||
|
<th>Edit</th>
|
||||||
|
<th>Delete</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
{% for u in users %}
|
||||||
|
<tr class="user-row" data-user-id="{{ u.user_id }}">
|
||||||
|
<td>{{ u.user_id }}</td>
|
||||||
|
<td>
|
||||||
|
<a href="{{ url_for('admin_user', user_id=u.user_id) }}"
|
||||||
|
>{{ u.username }}</a
|
||||||
|
>
|
||||||
|
</td>
|
||||||
|
<td>{{ '✅' if u.is_admin else '❌' }}</td>
|
||||||
|
<td>{{ '✅' if u.is_active else '❌' }}</td>
|
||||||
|
<td>{{ '✅' if u.has_password else '❌' }}</td>
|
||||||
|
<td>{{ u.created_at }}</td>
|
||||||
|
<td>{{ u.last_login or 'never' }}</td>
|
||||||
|
<td>
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
class="edit-user"
|
||||||
|
data-user-id="{{ u.user_id }}"
|
||||||
|
onclick="editUser({{ u.user_id }})"
|
||||||
|
>
|
||||||
|
Edit
|
||||||
|
</button>
|
||||||
|
</td>
|
||||||
|
<td>
|
||||||
|
<button
|
||||||
|
type="button"
|
||||||
|
class="delete-user"
|
||||||
|
data-user-id="{{ u.user_id }}"
|
||||||
|
onclick="deleteUser({{ u.user_id }})"
|
||||||
|
>
|
||||||
|
Delete
|
||||||
|
</button>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
{% endfor %}
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<h2>Create New User</h2>
|
||||||
|
<a href="{{ url_for('admin_user', user_id='new') }}">Create User</a>
|
||||||
 {% endblock %} {% block footer_scripts %}
 <script>
-  function updateUser(userId) {
-    const row = document.querySelector(`.user-row[data-user-id="${userId}"]`);
-    const passwordInput = row.querySelector('input[name="password"]');
-    const hasPassword =
-      row.querySelector("td:nth-child(5)").textContent.trim() === "✅";
-    const formData = row.querySelector("form").elements;
-    const username = formData.username.value;
-    const password = hasPassword ? passwordInput.value : undefined;
-    const isAdmin = formData.is_admin.checked;
-    const isActive = formData.is_active.checked;
-    fetch("/admin/users", {
-      method: "POST",
-      headers: {
-        "Content-Type": "application/json",
-      },
-      body: JSON.stringify({
-        user_id: userId,
-        password: password,
-        username: username,
-        is_admin: isAdmin,
-        is_active: isActive,
-        csrf_token: formData.csrf_token.value,
-      }),
-    })
-      .then((response) => {
-        if (response.ok) {
-          alert("User updated successfully");
-          // Clear the password field after successful update
-          passwordInput.value = "";
-        } else {
-          alert("Error updating user");
-        }
-      })
-      .catch((error) => {
-        console.error("Error:", error);
-        alert("Error updating user");
-      });
-  }
-
-  function initUserForm() {
-    const form = document.getElementById("user-form");
-    const createUpdateForm = document.getElementById("create-update-user-form");
-
-    form.addEventListener("submit", function (event) {
-      const userId = event.target.querySelector('input[name="user_id"]').value;
-      event.preventDefault(); // Prevent the default form submission
-      updateUser(userId);
-    });
-
-    form.addEventListener("click", function (event) {
-      const userId = event.target.closest(".user-row").dataset.userId;
-      updateUser(userId);
-    });
-
-    createUpdateForm.addEventListener("submit", function (event) {
-      const passwordInput = createUpdateForm.querySelector(
-        'input[name="password"]'
-      );
-    });
-  }
-
-  initUserForm();
+  function editUser(userId) {
+    window.location.href = `/admin/user/${userId}`;
+  }
+  function deleteUser(userId) {
+    if (
+      confirm(
+        "Are you sure you want to delete this user? This action cannot be undone."
+      )
+    ) {
+      fetch(`/admin/user/${userId}/delete`, {
+        method: "POST",
+        headers: {
+          "Content-Type": "application/json",
+          "X-CSRFToken": document.querySelector('input[name="csrf_token"]')
+            .value,
+        },
+      })
+        .then((response) => {
+          if (response.ok) {
+            // Remove the user row from the table
+            const row = document.querySelector(
+              `.user-row[data-user-id="${userId}"]`
+            );
+            if (row) {
+              row.remove();
+            }
+          } else {
+            alert("Error deleting user.");
+          }
+        })
+        .catch((error) => {
+          console.error("Error:", error);
+          alert("Error deleting user.");
+        });
+    }
+  }
 </script>
 {% endblock %}
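Note: the rewritten deleteUser() above posts to /admin/user/<user_id>/delete with an X-CSRFToken header, but the matching server-side route is not part of this comparison. A minimal Flask sketch of such a handler, assuming the route path from the script, a session-stored CSRF token, and a hypothetical delete_user() data-layer helper, could look like this:

# Hypothetical sketch only -- this route is NOT shown in the diff.
# The names delete_user(), the session keys, and the secret key are assumptions.
from flask import Flask, abort, jsonify, request, session

app = Flask(__name__)
app.secret_key = "change-me"


def delete_user(user_id: int) -> None:
    """Placeholder for the real data-layer delete (assumed helper)."""


@app.post("/admin/user/<int:user_id>/delete")
def admin_delete_user(user_id: int):
    if not session.get("is_admin"):
        abort(403)
    # deleteUser() sends the page's hidden csrf_token value in this header.
    token = request.headers.get("X-CSRFToken", "")
    if not token or token != session.get("csrf_token"):
        abort(400)
    delete_user(user_id)
    return jsonify({"ok": True})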
@@ -16,15 +16,21 @@
 <header>
   <h1><a href="/">{{ title or 'Admin' }}</a></h1>
   <nav>
-    {% if username %}<span>Hi, {{ username }}</span> | {% endif %}
-    <a href="{{ url_for('index') }}">Home</a> |
-    <a href="{{ url_for('user_settings') }}">Preferences</a>
-    {% if current_user and current_user.is_admin %} |
-    <a href="{{ url_for('admin_taxonomy') }}">Taxonomy</a> |
-    <a href="{{ url_for('admin_users') }}">Users</a> {% endif %} {% if
-    session.get('username') %} |
-    <a href="{{ url_for('logout') }}">Logout</a> {% else %} |
-    <a href="{{ url_for('login') }}">Login</a>{% endif %}
+    <div id="navigation">
+      {% if username %}<span>Hi, {{ username }}</span> | {% endif %}
+      <a href="{{ url_for('index') }}">Home</a> |
+      <a href="{{ url_for('user_settings') }}">Preferences</a>
+      {% if current_user and current_user.is_admin %} |
+      <a href="{{ url_for('scrape_page') }}">Scrape Jobs</a> |
+      <a href="{{ url_for('admin_taxonomy') }}">Taxonomy</a> |
+      <a href="{{ url_for('admin_stats') }}">Statistics</a> |
+      <a href="{{ url_for('admin_emails') }}">Email Alerts</a> |
+      <a href="{{ url_for('admin_email_templates') }}">Email Templates</a> |
+      <a href="{{ url_for('admin_users') }}">Users</a> {% endif %} {% if
+      session.get('username') %} |
+      <a href="{{ url_for('logout') }}">Logout</a> {% else %} |
+      <a href="{{ url_for('login') }}">Login</a>{% endif %}
+    </div>
   </nav>
   {% with messages = get_flashed_messages() %} {% if messages %}
   <ul>
@@ -44,7 +44,8 @@
 <div id="jobs">
   {% for job in jobs %}
   <div class="job">
-    <h3><a href="{{ job['url'] }}" target="_blank">{{ job['title'] }}</a></h3>
+    <!--<h3><a href="{{ job['url'] }}" target="_blank">{{ job['title'] }}</a></h3>-->
+    <h3><a href="{{ url_for('job_by_id', job_id=job['id']) }}" target="_blank">{{ job['title'] }}</a></h3>
     <p class="job-posted-time">{{ job['posted_time'] }}</p>
     <span class="job-region region-{{ job['region'] }}">{{ job['region'] }}</span>
     <span class="job-keyword keyword-{{ job['keyword']|replace(' ', '')|lower }}">{{ job['keyword'] }}</span>
@@ -23,13 +23,5 @@ styles %}{% endblock %} {% block content %}
     >{{ job.title }}</a
   >
   </p>
-  {% if job.file_path_abs or job.file_path %}
-  <p>
-    <strong>Cached copy:</strong>
-    <a href="{{ url_for('serve_cached', job_id=job.id) }}" target="_blank"
-      >View cached copy</a
-    >
-  </p>
-  {% endif %}
 </div>
 {% endblock %}
22 web/templates/scrape.html Normal file
@@ -0,0 +1,22 @@
+{% extends "base.html" %} {% block title %}Scrape Jobs{% endblock %} {% block
+content %}
+<div id="scrape-container">
+  <h2>Job Scraping Progress</h2>
+  <button id="start-scrape" onclick="startScrape()">Start Scraping</button>
+  <div
+    id="output"
+    style="
+      margin-top: 20px;
+      padding: 10px;
+      border: 1px solid #ccc;
+      height: 400px;
+      overflow-y: auto;
+      background-color: #f9f9f9;
+      font-family: monospace;
+      white-space: pre-wrap;
+    "
+  ></div>
+</div>
+{% endblock %} {% block scripts %}
+<script src="{{ url_for('static', filename='scrape.js') }}"></script>
+{% endblock %}
@@ -77,6 +77,29 @@ block content %}
   <p>No keywords available. Ask an admin to add some.</p>
   {% endif %}
 </fieldset>
+<fieldset>
+  <legend>Negative Keywords</legend>
+  <p>
+    <small>Add new Negative Keyword:</small>
+    <input
+      type="text"
+      name="new-negative-keyword"
+      id="new-negative-keyword"
+      value=""
+      placeholder="Type a keyword and save to add"
+      size="30"
+    />
+  </p>
+  {% if user_negative_keywords %} {% for nk in user_negative_keywords %}
+  <label style="display: block">
+    <input type="checkbox" name="negative_keyword" value="{{ nk }}" checked />
+    {{ nk }}
+  </label>
+  {% endfor %} {% else %}
+  <p>No negative keywords set.</p>
+  {% endif %}
+  <p><small>Uncheck to remove.</small></p>
+</fieldset>
 <button type="submit">Save</button>
 </form>
 {% endblock %} {% block footer_scripts %}
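Note: this hunk only adds the form fields (the new-negative-keyword text input and the negative_keyword checkboxes, where unchecking removes a keyword); the handler that reads them is not shown in this comparison. A rough sketch, assuming a Flask request context and treating checked boxes as the keywords to keep, might be:

# Hypothetical sketch only -- the settings handler is not part of this diff.
from flask import request


def read_negative_keywords_from_form() -> list[str]:
    # Checked boxes are the negative keywords the user wants to keep.
    kept = [k.strip().lower()
            for k in request.form.getlist("negative_keyword") if k.strip()]
    # The free-text input contributes at most one new keyword per save.
    new_kw = (request.form.get("new-negative-keyword") or "").strip().lower()
    if new_kw and new_kw not in kept:
        kept.append(new_kw)
    return kept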
153 web/utils.py
@@ -2,7 +2,7 @@
 Utility functions for the Craigslist scraper.
 """

-from typing import Any, Optional as _Optional
+from typing import Any, Optional, List, Dict
 from datetime import datetime, UTC
 import json
 import os
@@ -10,7 +10,6 @@ import random
 import re
 import requests
 import time
-from typing import Optional, List, Dict


 def get_config_file() -> str:
@@ -94,10 +93,6 @@ def get_paths() -> dict:
     return get_config().get('paths', {})


-def get_cache_dir() -> str:
-    return get_paths().get('cache_dir', 'cache')
-
-
 def get_logs_dir() -> str:
     return get_paths().get('logs_dir', 'logs')

@@ -130,9 +125,64 @@ def get_base_url() -> str:
     return get_config().get('scraper', {}).get('base_url', "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel")


-def ensure_cache_dir():
-    """Ensure cache directory exists."""
-    os.makedirs(get_cache_dir(), exist_ok=True)
+def get_negative_keywords() -> List[str]:
+    """Return normalized list of negative keywords from config."""
+    raw = get_config().get('scraper', {}).get('negative_keywords', [])
+    if not isinstance(raw, list):
+        return []
+    cleaned: List[str] = []
+    for item in raw:
+        if not isinstance(item, str):
+            continue
+        val = item.strip()
+        if not val:
+            continue
+        cleaned.append(val.lower())
+    return cleaned
+
+
+def get_email_settings() -> Dict[str, Any]:
+    """Return normalized email settings from config."""
+    cfg = get_config().get('email', {})
+    if not isinstance(cfg, dict):
+        cfg = {}
+    raw_smtp = cfg.get('smtp', {}) if isinstance(cfg.get('smtp'), dict) else {}
+    raw_recipients = cfg.get('recipients', [])
+
+    def _to_int(value, default):
+        try:
+            return int(value)
+        except (TypeError, ValueError):
+            return default
+
+    recipients: List[str] = []
+    if isinstance(raw_recipients, list):
+        for item in raw_recipients:
+            if isinstance(item, str):
+                addr = item.strip()
+                if addr:
+                    recipients.append(addr)
+
+    smtp = {
+        'host': (raw_smtp.get('host') or '').strip(),
+        'port': _to_int(raw_smtp.get('port', 587), 587),
+        'username': (raw_smtp.get('username') or '').strip(),
+        'password': raw_smtp.get('password') or '',
+        'use_tls': bool(raw_smtp.get('use_tls', True)),
+        'use_ssl': bool(raw_smtp.get('use_ssl', False)),
+        'timeout': _to_int(raw_smtp.get('timeout', 30), 30),
+    }
+    if smtp['port'] <= 0:
+        smtp['port'] = 587
+    if smtp['timeout'] <= 0:
+        smtp['timeout'] = 30
+
+    return {
+        'enabled': bool(cfg.get('enabled', False)),
+        'from_address': (cfg.get('from_address') or '').strip(),
+        'smtp': smtp,
+        'recipients': recipients,
+    }


 def now_iso() -> str:
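Note: the two new helpers read config keys that are not themselves shown in this comparison. An illustrative config shape they would accept (key names taken only from the code above; the real config file may contain more) is:

# Illustrative only: a dict that get_negative_keywords() and
# get_email_settings() would accept once loaded by get_config().
example_config = {
    "scraper": {
        "negative_keywords": ["unpaid", "commission only", "  MLM  "],
    },
    "email": {
        "enabled": True,
        "from_address": "alerts@example.com",
        "recipients": ["me@example.com"],
        "smtp": {
            "host": "smtp.example.com",
            "port": "587",        # strings are coerced by _to_int()
            "username": "alerts",
            "password": "secret",
            "use_tls": True,
        },
    },
}
# With this config, get_negative_keywords() returns
# ["unpaid", "commission only", "mlm"], and get_email_settings() fills in
# the defaults for use_ssl (False) and timeout (30).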
@@ -174,12 +224,6 @@ def get_url_from_filename(name: str) -> str:
     return url_guess


-def get_cached_content(url: str) -> str:
-    """Get cached content for URL."""
-    with open(get_cache_path(url), "r", encoding="utf-8") as f:
-        return f.read()
-
-
 def safe_get_text(element, default="N/A"):
     """Safely extract text from BeautifulSoup element."""
     return element.get_text(strip=True) if element else default
@@ -195,53 +239,6 @@ def get_random_delay(min_delay: int = get_min_delay(), max_delay: int = get_max_
     return random.uniform(min_delay, max_delay)


-def get_cache_path(url: str) -> str:
-    """Get cache file path for URL."""
-    return os.path.join(get_cache_dir(), f"{get_filename_from_url(url)}.html")
-
-
-def cache_page(url: str, content: str):
-    """Cache the page content with a timestamp."""
-    cache_path = get_cache_path(url)
-    with open(cache_path, "w", encoding="utf-8") as f:
-        f.write(content)
-    # Update the file's modification time to the current time
-    os.utime(cache_path, None)
-
-
-def is_cached(url: str) -> bool:
-    """Check if the page is cached and not older than 24 hours."""
-    cache_path = get_cache_path(url)
-    if not os.path.isfile(cache_path):
-        return False
-
-    # Check the file's age if it's a search result page
-    if 'search' in url:
-        file_age = time.time() - os.path.getmtime(cache_path)
-        if file_age > 24 * 3600:  # 24 hours in seconds
-            return False
-
-    return True
-
-
-def is_cache_stale(last_modified: str, days: int = 1) -> bool:
-    """Check if the cached page is stale (older than 24 hours)."""
-    if not last_modified:
-        return True
-    last_datetime = datetime.fromisoformat(last_modified)
-    file_age = time.time() - last_datetime.timestamp()
-    return file_age > days * 24 * 3600  # days in seconds
-
-
-def delete_cached_page(url: str):
-    cache_fp = get_cache_path(url)
-    if os.path.exists(cache_fp):
-        try:
-            os.remove(cache_fp)
-        except Exception:
-            pass
-
-
 def get_color_from_string(s: str) -> str:
     """Generate a color code from a string."""
     hash_code = hash(s)
@@ -264,15 +261,41 @@

 def filter_jobs(
     jobs: List[Dict[str, Any]],
-    region: _Optional[str] = None,
-    keyword: _Optional[str] = None,
+    region: Optional[str] = None,
+    keyword: Optional[str] = None,
+    negative_keywords: Optional[List[str]] = None,
 ) -> List[Dict[str, Any]]:
-    """Filter jobs by optional region and keyword."""
+    """Filter jobs by optional region, keyword, and negative keywords."""
     filtered = jobs
     if region:
         filtered = [j for j in filtered if j.get("region") == region]
     if keyword:
         filtered = [j for j in filtered if j.get("keyword") == keyword]
+    if negative_keywords:
+        # Pre-compile regexes or just check substring?
+        # Scraper uses substring check. Let's do the same for consistency.
+        # Fields to check: title, company, location, description
+        # Note: description might contain HTML or be long.
+
+        # Normalize negative keywords
+        nks = [nk.lower() for nk in negative_keywords if nk]
+
+        def is_clean(job):
+            # Check all fields
+            text_blob = " ".join([
+                str(job.get("title") or ""),
+                str(job.get("company") or ""),
+                str(job.get("location") or ""),
+                str(job.get("description") or "")
+            ]).lower()
+
+            for nk in nks:
+                if nk in text_blob:
+                    return False
+            return True
+
+        filtered = [j for j in filtered if is_clean(j)]
+
     return filtered


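Note: an illustrative call of the extended filter_jobs(), with made-up job dicts, showing the effect of the new negative_keywords argument:

# Illustrative only; the import path is an assumption -- adjust to the project layout.
from web.utils import filter_jobs

jobs = [
    {"title": "Python Developer", "region": "sfbay", "keyword": "python"},
    {"title": "Sales - commission only", "region": "sfbay", "keyword": "python"},
]
visible = filter_jobs(jobs, region="sfbay", negative_keywords=["commission only"])
# visible keeps only the "Python Developer" posting: the second job's title
# contains a negative keyword (case-insensitive substring match, as above).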