Compare commits

...

3 Commits

SHA1        Message                                        Date
5cbb760005  separate javascript for scrape page            2025-09-17 17:12:29 +02:00
2ae1e2058d  reorganize imports (removing unused imports)   2025-09-17 17:12:16 +02:00
e549fae3f6  fix table setup                                2025-09-17 17:11:45 +02:00
5 changed files with 65 additions and 57 deletions

View File

@@ -36,15 +36,15 @@ try:
     engine = create_engine(url, future=True)
     with engine.begin() as conn:
         for table in [
-            "users",
-            "regions",
-            "keywords",
-            "user_regions",
-            "user_keywords",
-            "job_listings",
             "job_descriptions",
-            "cached_pages",
+            "job_listings",
+            "keywords",
+            "logs",
+            "regions",
+            "users",
             "user_interactions",
+            "user_keywords",
+            "user_regions",
         ]:
             try:
                 n = conn.execute(text(f"SELECT COUNT(*) FROM {table}"))
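The hunk above only reorders the allow-list of tables that a row-count sanity check iterates over (and swaps cached_pages for logs). For context, a minimal standalone sketch of that check, assuming SQLAlchemy 2.x and a DATABASE_URL environment variable; the count_rows wrapper and the fallback value are illustrative, not the project's actual code:

# Illustrative sketch only: the count_rows helper and DATABASE_URL fallback are
# assumptions; the table list matches the new side of the diff.
import os
from sqlalchemy import create_engine, text

def count_rows(url: str) -> dict:
    counts = {}
    engine = create_engine(url, future=True)
    with engine.begin() as conn:
        for table in [
            "job_descriptions",
            "job_listings",
            "keywords",
            "logs",
            "regions",
            "users",
            "user_interactions",
            "user_keywords",
            "user_regions",
        ]:
            try:
                # Table names come from a fixed allow-list, so the f-string is safe here.
                counts[table] = conn.execute(
                    text(f"SELECT COUNT(*) FROM {table}")
                ).scalar_one()
            except Exception:
                counts[table] = -1  # table missing or not yet created
    return counts

if __name__ == "__main__":
    print(count_rows(os.environ.get("DATABASE_URL", "sqlite:///app.db")))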

View File

@@ -15,6 +15,7 @@ from web.db import (
 # Import utility functions
 from web.utils import (
+    get_base_url,
     make_request_with_retry,
     now_iso,
 )
@@ -54,9 +55,10 @@ def fetch_listings():
             if not keyword_name:
                 continue
             # Build a canonical search identifier for this region+keyword combination.
+            url = get_base_url().format(region=region, keyword=keyword_name.replace(" ", "+"))
             search_page_id = f"search:{region_name}:{keyword_name}"
             try:
-                last = get_last_fetch_time(search_page_id)
+                last = get_last_fetch_time(url)
                 if last is not None:
                     # skip if fetched within the last 24 hours
                     age = datetime.now(
@@ -72,7 +74,7 @@ def fetch_listings():
yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n" yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
# record that we're fetching this search page now # record that we're fetching this search page now
try: try:
insert_log(search_page_id, region=region_name, insert_log(url, region=region_name,
keyword=keyword_name, fetched_at=datetime.now(timezone.utc)) keyword=keyword_name, fetched_at=datetime.now(timezone.utc))
except Exception: except Exception:
pass pass
@@ -99,6 +101,15 @@ def fetch_listings():
 def process_job_url(job_url: str, region: str = "", keyword: str = ""):
+    last = get_last_fetch_time(job_url)
+    if last is not None:
+        # skip if fetched within the last 24 hours
+        age = datetime.now(
+            timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
+        if age.total_seconds() < 24 * 3600:
+            yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n"
+            return None
     try:
         job_id = url_to_job_id(job_url)
         yield f"Fetching job page: {job_url}\n"

View File

@@ -14,10 +14,8 @@ Tables:
""" """
from datetime import datetime, UTC from datetime import datetime, UTC
import os
from typing import Optional, Dict, Any, List from typing import Optional, Dict, Any, List
from web.utils import ( from web.utils import (
get_url_from_filename,
get_color_from_string, get_color_from_string,
url_to_job_id, url_to_job_id,
normalize_job_id, normalize_job_id,

web/static/scrape.js  Normal file  (+44 lines)
View File

@@ -0,0 +1,44 @@
function startScrape() {
  const output = document.getElementById("output");
  const startButton = document.getElementById("start-scrape");
  output.textContent = "Starting scrape...\n";
  startButton.disabled = true;
  startButton.textContent = "Scraping...";
  fetch("/scrape")
    .then((response) => {
      const reader = response.body.getReader();
      const decoder = new TextDecoder();
      function readStream() {
        reader
          .read()
          .then(({ done, value }) => {
            if (done) {
              output.textContent += "\nScraping completed!";
              startButton.disabled = false;
              startButton.textContent = "Start Scraping";
              return;
            }
            const chunk = decoder.decode(value, { stream: true });
            output.textContent += chunk;
            output.scrollTop = output.scrollHeight;
            readStream();
          })
          .catch((error) => {
            output.textContent += `\nError: ${error.message}`;
            startButton.disabled = false;
            startButton.textContent = "Start Scraping";
          });
      }
      readStream();
    })
    .catch((error) => {
      output.textContent = `Error starting scrape: ${error.message}`;
      startButton.disabled = false;
      startButton.textContent = "Start Scraping";
    });
}
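scrape.js consumes GET /scrape as an incrementally streamed text response via response.body.getReader(). A minimal sketch of the kind of endpoint that implies, assuming Flask (the template below uses url_for); the route body and generator here are placeholders, not the repository's actual handler:

# Assumed server shape only: Flask, a /scrape route, and a generator that
# yields progress lines (mirroring the yields in fetch_listings above).
from flask import Flask, Response, stream_with_context

app = Flask(__name__)

def fake_scrape():
    # Placeholder generator; the real code yields progress strings while scraping.
    yield "Processing region + keyword (1/3)...\n"
    yield "Fetching job page: https://example.invalid/job/123\n"

@app.route("/scrape")
def scrape():
    # stream_with_context flushes each yielded chunk to the browser as it is
    # produced; scrape.js appends each chunk to the #output element.
    return Response(stream_with_context(fake_scrape()), mimetype="text/plain")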

View File

@@ -18,50 +18,5 @@ content %}
   ></div>
 </div>
 {% endblock %} {% block scripts %}
-<script>
-  function startScrape() {
-    const output = document.getElementById("output");
-    const startButton = document.getElementById("start-scrape");
-    output.textContent = "Starting scrape...\n";
-    startButton.disabled = true;
-    startButton.textContent = "Scraping...";
-    fetch("/scrape")
-      .then((response) => {
-        const reader = response.body.getReader();
-        const decoder = new TextDecoder();
-        function readStream() {
-          reader
-            .read()
-            .then(({ done, value }) => {
-              if (done) {
-                output.textContent += "\nScraping completed!";
-                startButton.disabled = false;
-                startButton.textContent = "Start Scraping";
-                return;
-              }
-              const chunk = decoder.decode(value, { stream: true });
-              output.textContent += chunk;
-              output.scrollTop = output.scrollHeight;
-              readStream();
-            })
-            .catch((error) => {
-              output.textContent += `\nError: ${error.message}`;
-              startButton.disabled = false;
-              startButton.textContent = "Start Scraping";
-            });
-        }
-        readStream();
-      })
-      .catch((error) => {
-        output.textContent = `Error starting scrape: ${error.message}`;
-        startButton.disabled = false;
-        startButton.textContent = "Start Scraping";
-      });
-  }
-</script>
+<script src="{{ url_for('static', filename='scrape.js') }}"></script>
 {% endblock %}