Compare commits

...

3 Commits

SHA1        Message                                        Date
5cbb760005  separate javascript for scrape page            2025-09-17 17:12:29 +02:00
2ae1e2058d  reorganize imports (removing unused imports)   2025-09-17 17:12:16 +02:00
e549fae3f6  fix table setup                                2025-09-17 17:11:45 +02:00
5 changed files with 65 additions and 57 deletions

View File

@@ -36,15 +36,15 @@ try:
     engine = create_engine(url, future=True)
     with engine.begin() as conn:
         for table in [
-            "users",
-            "regions",
-            "keywords",
-            "user_regions",
-            "user_keywords",
-            "job_listings",
             "job_descriptions",
-            "cached_pages",
+            "job_listings",
+            "keywords",
+            "logs",
+            "regions",
+            "users",
             "user_interactions",
+            "user_keywords",
+            "user_regions",
         ]:
             try:
                 n = conn.execute(text(f"SELECT COUNT(*) FROM {table}"))
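The hunk above only reorders the allow-list of tables that a row-count sanity check iterates over (and swaps cached_pages for logs). For context, a minimal standalone sketch of that check, assuming SQLAlchemy 2.x and a DATABASE_URL environment variable; the count_rows wrapper and the fallback value are illustrative, not the project's actual code:

# Illustrative sketch only: the count_rows helper and DATABASE_URL fallback are
# assumptions; the table list matches the new side of the diff.
import os
from sqlalchemy import create_engine, text

def count_rows(url: str) -> dict:
    counts = {}
    engine = create_engine(url, future=True)
    with engine.begin() as conn:
        for table in [
            "job_descriptions",
            "job_listings",
            "keywords",
            "logs",
            "regions",
            "users",
            "user_interactions",
            "user_keywords",
            "user_regions",
        ]:
            try:
                # Table names come from a fixed allow-list, so the f-string is safe here.
                counts[table] = conn.execute(
                    text(f"SELECT COUNT(*) FROM {table}")
                ).scalar_one()
            except Exception:
                counts[table] = -1  # table missing or not yet created
    return counts

if __name__ == "__main__":
    print(count_rows(os.environ.get("DATABASE_URL", "sqlite:///app.db")))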

View File

@@ -15,6 +15,7 @@ from web.db import (
 # Import utility functions
 from web.utils import (
+    get_base_url,
     make_request_with_retry,
     now_iso,
 )
@@ -54,9 +55,10 @@ def fetch_listings():
             if not keyword_name:
                 continue
             # Build a canonical search identifier for this region+keyword combination.
+            url = get_base_url().format(region=region, keyword=keyword_name.replace(" ", "+"))
             search_page_id = f"search:{region_name}:{keyword_name}"
             try:
-                last = get_last_fetch_time(search_page_id)
+                last = get_last_fetch_time(url)
                 if last is not None:
                     # skip if fetched within the last 24 hours
                     age = datetime.now(
@@ -72,7 +74,7 @@ def fetch_listings():
yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n" yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
# record that we're fetching this search page now # record that we're fetching this search page now
try: try:
insert_log(search_page_id, region=region_name, insert_log(url, region=region_name,
keyword=keyword_name, fetched_at=datetime.now(timezone.utc)) keyword=keyword_name, fetched_at=datetime.now(timezone.utc))
except Exception: except Exception:
pass pass
@@ -99,6 +101,15 @@ def fetch_listings():
 def process_job_url(job_url: str, region: str = "", keyword: str = ""):
+    last = get_last_fetch_time(job_url)
+    if last is not None:
+        # skip if fetched within the last 24 hours
+        age = datetime.now(
+            timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
+        if age.total_seconds() < 24 * 3600:
+            yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n"
+            return None
     try:
         job_id = url_to_job_id(job_url)
         yield f"Fetching job page: {job_url}\n"

View File

@@ -14,10 +14,8 @@ Tables:
""" """
from datetime import datetime, UTC from datetime import datetime, UTC
import os
from typing import Optional, Dict, Any, List from typing import Optional, Dict, Any, List
from web.utils import ( from web.utils import (
get_url_from_filename,
get_color_from_string, get_color_from_string,
url_to_job_id, url_to_job_id,
normalize_job_id, normalize_job_id,

web/static/scrape.js  Normal file  (+44 lines)
View File

@@ -0,0 +1,44 @@
function startScrape() {
  const output = document.getElementById("output");
  const startButton = document.getElementById("start-scrape");
  output.textContent = "Starting scrape...\n";
  startButton.disabled = true;
  startButton.textContent = "Scraping...";
  fetch("/scrape")
    .then((response) => {
      const reader = response.body.getReader();
      const decoder = new TextDecoder();
      function readStream() {
        reader
          .read()
          .then(({ done, value }) => {
            if (done) {
              output.textContent += "\nScraping completed!";
              startButton.disabled = false;
              startButton.textContent = "Start Scraping";
              return;
            }
            const chunk = decoder.decode(value, { stream: true });
            output.textContent += chunk;
            output.scrollTop = output.scrollHeight;
            readStream();
          })
          .catch((error) => {
            output.textContent += `\nError: ${error.message}`;
            startButton.disabled = false;
            startButton.textContent = "Start Scraping";
          });
      }
      readStream();
    })
    .catch((error) => {
      output.textContent = `Error starting scrape: ${error.message}`;
      startButton.disabled = false;
      startButton.textContent = "Start Scraping";
    });
}
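scrape.js consumes GET /scrape as an incrementally streamed text response via response.body.getReader(). A minimal sketch of the kind of endpoint that implies, assuming Flask (the template below uses url_for); the route body and generator here are placeholders, not the repository's actual handler:

# Assumed server shape only: Flask, a /scrape route, and a generator that
# yields progress lines (mirroring the yields in fetch_listings above).
from flask import Flask, Response, stream_with_context

app = Flask(__name__)

def fake_scrape():
    # Placeholder generator; the real code yields progress strings while scraping.
    yield "Processing region + keyword (1/3)...\n"
    yield "Fetching job page: https://example.invalid/job/123\n"

@app.route("/scrape")
def scrape():
    # stream_with_context flushes each yielded chunk to the browser as it is
    # produced; scrape.js appends each chunk to the #output element.
    return Response(stream_with_context(fake_scrape()), mimetype="text/plain")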

View File

@@ -18,50 +18,5 @@ content %}
   ></div>
 </div>
 {% endblock %} {% block scripts %}
-<script>
-  function startScrape() {
-    const output = document.getElementById("output");
-    const startButton = document.getElementById("start-scrape");
-    output.textContent = "Starting scrape...\n";
-    startButton.disabled = true;
-    startButton.textContent = "Scraping...";
-    fetch("/scrape")
-      .then((response) => {
-        const reader = response.body.getReader();
-        const decoder = new TextDecoder();
-        function readStream() {
-          reader
-            .read()
-            .then(({ done, value }) => {
-              if (done) {
-                output.textContent += "\nScraping completed!";
-                startButton.disabled = false;
-                startButton.textContent = "Start Scraping";
-                return;
-              }
-              const chunk = decoder.decode(value, { stream: true });
-              output.textContent += chunk;
-              output.scrollTop = output.scrollHeight;
-              readStream();
-            })
-            .catch((error) => {
-              output.textContent += `\nError: ${error.message}`;
-              startButton.disabled = false;
-              startButton.textContent = "Start Scraping";
-            });
-        }
-        readStream();
-      })
-      .catch((error) => {
-        output.textContent = `Error starting scrape: ${error.message}`;
-        startButton.disabled = false;
-        startButton.textContent = "Start Scraping";
-      });
-  }
-</script>
+<script src="{{ url_for('static', filename='scrape.js') }}"></script>
 {% endblock %}