Compare commits
3 Commits
c4761c257c...5cbb760005

| Author | SHA1 | Date |
|---|---|---|
| | 5cbb760005 | |
| | 2ae1e2058d | |
| | e549fae3f6 | |
setup.py (14 changes)
```diff
@@ -36,15 +36,15 @@ try:
     engine = create_engine(url, future=True)
     with engine.begin() as conn:
         for table in [
-            "users",
-            "regions",
-            "keywords",
-            "user_regions",
-            "user_keywords",
-            "job_listings",
-            "job_descriptions",
-            "cached_pages",
+            "job_listings",
+            "keywords",
+            "logs",
+            "regions",
+            "users",
+            "user_interactions",
+            "user_keywords",
+            "user_regions",
        ]:
             try:
                 n = conn.execute(text(f"SELECT COUNT(*) FROM {table}"))
```
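The loop above appears to be a post-setup sanity check that prints a row count for every expected table; the replacement list is alphabetized and swaps `job_descriptions`/`cached_pages` for `logs`/`user_interactions`. A self-contained sketch of the same pattern, assuming SQLAlchemy 1.4+ (the engine URL and table names here are illustrative, and note that `.scalar()` is needed to unwrap the single `COUNT(*)` value from the `Result`):

```python
# Minimal sketch of the table sanity check; the engine URL and table
# names are illustrative assumptions, not the project's own config.
from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///:memory:", future=True)

with engine.begin() as conn:
    conn.execute(text("CREATE TABLE users (id INTEGER PRIMARY KEY)"))
    for table in ["users", "regions", "keywords"]:
        try:
            # .scalar() unwraps the single COUNT(*) value from the Result
            n = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar()
            print(f"{table}: {n} rows")
        except Exception as exc:
            print(f"{table}: unavailable ({exc})")
```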
```diff
@@ -15,6 +15,7 @@ from web.db import (

 # Import utility functions
 from web.utils import (
+    get_base_url,
     make_request_with_retry,
     now_iso,
 )
```
```diff
@@ -54,9 +55,10 @@ def fetch_listings():
             if not keyword_name:
                 continue
+            # Build a canonical search identifier for this region+keyword combination.
             url = get_base_url().format(region=region, keyword=keyword_name.replace(" ", "+"))
             search_page_id = f"search:{region_name}:{keyword_name}"
             try:
-                last = get_last_fetch_time(search_page_id)
+                last = get_last_fetch_time(url)
                 if last is not None:
                     # skip if fetched within the last 24 hours
                     age = datetime.now(
```
```diff
@@ -72,7 +74,7 @@ def fetch_listings():
             yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
             # record that we're fetching this search page now
             try:
-                insert_log(search_page_id, region=region_name,
+                insert_log(url, region=region_name,
                            keyword=keyword_name, fetched_at=datetime.now(timezone.utc))
             except Exception:
                 pass
```
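Taken together, the two hunks above re-key the fetch log on the request URL instead of the synthetic `search:{region}:{keyword}` identifier, so the freshness check and the log insert agree on a single key. The project's `get_last_fetch_time` and `insert_log` are not shown in this compare; a hypothetical sketch of the shape such helpers might take (schema, signatures, and the in-memory database are all assumptions):

```python
# Hypothetical URL-keyed fetch log; the real web.db helpers may differ.
import sqlite3
from datetime import datetime, timezone
from typing import Optional

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE IF NOT EXISTS logs (key TEXT, fetched_at TEXT)")

def insert_log(key: str, **meta) -> None:
    # The real call also passes region/keyword/fetched_at; ignored for brevity.
    conn.execute("INSERT INTO logs VALUES (?, ?)",
                 (key, datetime.now(timezone.utc).isoformat()))

def get_last_fetch_time(key: str) -> Optional[datetime]:
    # ISO-8601 UTC strings sort lexicographically, so MAX() is the latest.
    row = conn.execute("SELECT MAX(fetched_at) FROM logs WHERE key = ?",
                       (key,)).fetchone()
    return datetime.fromisoformat(row[0]) if row and row[0] else None
```

With this shape, `insert_log(url, ...)` and `get_last_fetch_time(url)` hit the same row no matter how the search was labeled.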
```diff
@@ -99,6 +101,15 @@ def fetch_listings():


 def process_job_url(job_url: str, region: str = "", keyword: str = ""):
+    last = get_last_fetch_time(job_url)
+    if last is not None:
+        # skip if fetched within the last 24 hours
+        age = datetime.now(
+            timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
+        if age.total_seconds() < 24 * 3600:
+            yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n"
+            return None
+
     try:
         job_id = url_to_job_id(job_url)
         yield f"Fetching job page: {job_url}\n"
```
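The new guard in `process_job_url` normalizes naive timestamps to UTC before subtracting, since aware-minus-naive datetime arithmetic raises `TypeError`. One detail worth flagging: the skip message uses `age.seconds//3600`, which excludes whole days, so the reported hours can under-state the true age even though the comparison itself correctly uses `total_seconds()`. A standalone illustration of the pattern (names here are illustrative):

```python
from datetime import datetime, timedelta, timezone

def is_fresh(last: datetime, max_age_hours: int = 24) -> bool:
    # Treat a naive timestamp as UTC so aware - naive never raises TypeError.
    if last.tzinfo is None:
        last = last.replace(tzinfo=timezone.utc)
    age = datetime.now(timezone.utc) - last
    return age < timedelta(hours=max_age_hours)

print(is_fresh(datetime(2020, 1, 1)))        # False: far older than 24h
print(is_fresh(datetime.now(timezone.utc)))  # True: just fetched
```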
```diff
@@ -14,10 +14,8 @@ Tables:
 """

 from datetime import datetime, UTC
 import os
 from typing import Optional, Dict, Any, List
 from web.utils import (
     get_url_from_filename,
     get_color_from_string,
     url_to_job_id,
     normalize_job_id,
```
web/static/scrape.js (new file, 44 lines)
```diff
@@ -0,0 +1,44 @@
+function startScrape() {
+  const output = document.getElementById("output");
+  const startButton = document.getElementById("start-scrape");
+
+  output.textContent = "Starting scrape...\n";
+  startButton.disabled = true;
+  startButton.textContent = "Scraping...";
+
+  fetch("/scrape")
+    .then((response) => {
+      const reader = response.body.getReader();
+      const decoder = new TextDecoder();
+
+      function readStream() {
+        reader
+          .read()
+          .then(({ done, value }) => {
+            if (done) {
+              output.textContent += "\nScraping completed!";
+              startButton.disabled = false;
+              startButton.textContent = "Start Scraping";
+              return;
+            }
+
+            const chunk = decoder.decode(value, { stream: true });
+            output.textContent += chunk;
+            output.scrollTop = output.scrollHeight;
+            readStream();
+          })
+          .catch((error) => {
+            output.textContent += `\nError: ${error.message}`;
+            startButton.disabled = false;
+            startButton.textContent = "Start Scraping";
+          });
+      }
+
+      readStream();
+    })
+    .catch((error) => {
+      output.textContent = `Error starting scrape: ${error.message}`;
+      startButton.disabled = false;
+      startButton.textContent = "Start Scraping";
+    });
+}
```
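The client reads the response body incrementally with `ReadableStream.getReader()`, which only shows progress live if `/scrape` streams its output instead of buffering it. The server route is not part of this compare, but since `fetch_listings` is a generator yielding progress strings, it is plausibly wrapped in a streaming response; a minimal Flask sketch (only the `/scrape` path comes from the JS above, everything else is an assumption):

```python
# Hypothetical server side of the stream; only "/scrape" is confirmed.
from flask import Flask, Response, stream_with_context

app = Flask(__name__)

def fetch_listings():
    # Stand-in for the real generator, which yields progress lines.
    yield "Processing region + keyword (1/1)...\n"
    yield "Fetching job page: https://example.com/job/1\n"

@app.route("/scrape")
def scrape():
    # stream_with_context keeps the request context alive while the
    # generator is consumed, so chunks reach the client as they're yielded.
    return Response(stream_with_context(fetch_listings()),
                    mimetype="text/plain")
```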
```diff
@@ -18,50 +18,5 @@ content %}
   ></div>
 </div>
 {% endblock %} {% block scripts %}
-<script>
-  function startScrape() {
-    const output = document.getElementById("output");
-    const startButton = document.getElementById("start-scrape");
-
-    output.textContent = "Starting scrape...\n";
-    startButton.disabled = true;
-    startButton.textContent = "Scraping...";
-
-    fetch("/scrape")
-      .then((response) => {
-        const reader = response.body.getReader();
-        const decoder = new TextDecoder();
-
-        function readStream() {
-          reader
-            .read()
-            .then(({ done, value }) => {
-              if (done) {
-                output.textContent += "\nScraping completed!";
-                startButton.disabled = false;
-                startButton.textContent = "Start Scraping";
-                return;
-              }
-
-              const chunk = decoder.decode(value, { stream: true });
-              output.textContent += chunk;
-              output.scrollTop = output.scrollHeight;
-              readStream();
-            })
-            .catch((error) => {
-              output.textContent += `\nError: ${error.message}`;
-              startButton.disabled = false;
-              startButton.textContent = "Start Scraping";
-            });
-        }
-
-        readStream();
-      })
-      .catch((error) => {
-        output.textContent = `Error starting scrape: ${error.message}`;
-        startButton.disabled = false;
-        startButton.textContent = "Start Scraping";
-      });
-  }
-</script>
+<script src="{{ url_for('static', filename='scrape.js') }}"></script>
 {% endblock %}
```