Compare commits
3 Commits
c4761c257c...5cbb760005

| Author | SHA1 | Date |
|---|---|---|
| | 5cbb760005 | |
| | 2ae1e2058d | |
| | e549fae3f6 | |
setup.py (14 changes)
```diff
@@ -36,15 +36,15 @@ try:
     engine = create_engine(url, future=True)
     with engine.begin() as conn:
         for table in [
-            "users",
-            "regions",
-            "keywords",
-            "user_regions",
-            "user_keywords",
-            "job_listings",
-            "job_descriptions",
-            "cached_pages",
+            "job_listings",
+            "keywords",
+            "logs",
+            "regions",
+            "users",
+            "user_interactions",
+            "user_keywords",
+            "user_regions",
        ]:
             try:
                 n = conn.execute(text(f"SELECT COUNT(*) FROM {table}"))
```
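The loop above appears to be a post-setup sanity check that prints a row count for every expected table; the replacement list is alphabetized and swaps `job_descriptions`/`cached_pages` for `logs`/`user_interactions`. A self-contained sketch of the same pattern, assuming SQLAlchemy 1.4+ (the engine URL and table names here are illustrative, and note that `.scalar()` is needed to unwrap the single `COUNT(*)` value from the `Result`):

```python
# Minimal sketch of the table sanity check; the engine URL and table
# names are illustrative assumptions, not the project's own config.
from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///:memory:", future=True)

with engine.begin() as conn:
    conn.execute(text("CREATE TABLE users (id INTEGER PRIMARY KEY)"))
    for table in ["users", "regions", "keywords"]:
        try:
            # .scalar() unwraps the single COUNT(*) value from the Result
            n = conn.execute(text(f"SELECT COUNT(*) FROM {table}")).scalar()
            print(f"{table}: {n} rows")
        except Exception as exc:
            print(f"{table}: unavailable ({exc})")
```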
```diff
@@ -15,6 +15,7 @@ from web.db import (

 # Import utility functions
 from web.utils import (
+    get_base_url,
     make_request_with_retry,
     now_iso,
 )
```
```diff
@@ -54,9 +55,10 @@ def fetch_listings():
             if not keyword_name:
                 continue
+            # Build a canonical search identifier for this region+keyword combination.
             url = get_base_url().format(region=region, keyword=keyword_name.replace(" ", "+"))
             search_page_id = f"search:{region_name}:{keyword_name}"
             try:
-                last = get_last_fetch_time(search_page_id)
+                last = get_last_fetch_time(url)
                 if last is not None:
                     # skip if fetched within the last 24 hours
                     age = datetime.now(
```
```diff
@@ -72,7 +74,7 @@ def fetch_listings():
             yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
             # record that we're fetching this search page now
             try:
-                insert_log(search_page_id, region=region_name,
+                insert_log(url, region=region_name,
                            keyword=keyword_name, fetched_at=datetime.now(timezone.utc))
             except Exception:
                 pass
```
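Taken together, the two hunks above re-key the fetch log on the request URL instead of the synthetic `search:{region}:{keyword}` identifier, so the freshness check and the log insert agree on a single key. The project's `get_last_fetch_time` and `insert_log` are not shown in this compare; a hypothetical sketch of the shape such helpers might take (schema, signatures, and the in-memory database are all assumptions):

```python
# Hypothetical URL-keyed fetch log; the real web.db helpers may differ.
import sqlite3
from datetime import datetime, timezone
from typing import Optional

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE IF NOT EXISTS logs (key TEXT, fetched_at TEXT)")

def insert_log(key: str, **meta) -> None:
    # The real call also passes region/keyword/fetched_at; ignored for brevity.
    conn.execute("INSERT INTO logs VALUES (?, ?)",
                 (key, datetime.now(timezone.utc).isoformat()))

def get_last_fetch_time(key: str) -> Optional[datetime]:
    # ISO-8601 UTC strings sort lexicographically, so MAX() is the latest.
    row = conn.execute("SELECT MAX(fetched_at) FROM logs WHERE key = ?",
                       (key,)).fetchone()
    return datetime.fromisoformat(row[0]) if row and row[0] else None
```

With this shape, `insert_log(url, ...)` and `get_last_fetch_time(url)` hit the same row no matter how the search was labeled.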
```diff
@@ -99,6 +101,15 @@ def fetch_listings():


 def process_job_url(job_url: str, region: str = "", keyword: str = ""):
+    last = get_last_fetch_time(job_url)
+    if last is not None:
+        # skip if fetched within the last 24 hours
+        age = datetime.now(
+            timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
+        if age.total_seconds() < 24 * 3600:
+            yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n"
+            return None
+
     try:
         job_id = url_to_job_id(job_url)
         yield f"Fetching job page: {job_url}\n"
```
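The new guard in `process_job_url` normalizes naive timestamps to UTC before subtracting, since aware-minus-naive datetime arithmetic raises `TypeError`. One detail worth flagging: the skip message uses `age.seconds//3600`, which excludes whole days, so the reported hours can under-state the true age even though the comparison itself correctly uses `total_seconds()`. A standalone illustration of the pattern (names here are illustrative):

```python
from datetime import datetime, timedelta, timezone

def is_fresh(last: datetime, max_age_hours: int = 24) -> bool:
    # Treat a naive timestamp as UTC so aware - naive never raises TypeError.
    if last.tzinfo is None:
        last = last.replace(tzinfo=timezone.utc)
    age = datetime.now(timezone.utc) - last
    return age < timedelta(hours=max_age_hours)

print(is_fresh(datetime(2020, 1, 1)))        # False: far older than 24h
print(is_fresh(datetime.now(timezone.utc)))  # True: just fetched
```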
```diff
@@ -14,10 +14,8 @@ Tables:
 """

 from datetime import datetime, UTC
 import os
 from typing import Optional, Dict, Any, List
 from web.utils import (
     get_url_from_filename,
     get_color_from_string,
     url_to_job_id,
     normalize_job_id,
```
web/static/scrape.js (new file, 44 lines)
```diff
@@ -0,0 +1,44 @@
+function startScrape() {
+  const output = document.getElementById("output");
+  const startButton = document.getElementById("start-scrape");
+
+  output.textContent = "Starting scrape...\n";
+  startButton.disabled = true;
+  startButton.textContent = "Scraping...";
+
+  fetch("/scrape")
+    .then((response) => {
+      const reader = response.body.getReader();
+      const decoder = new TextDecoder();
+
+      function readStream() {
+        reader
+          .read()
+          .then(({ done, value }) => {
+            if (done) {
+              output.textContent += "\nScraping completed!";
+              startButton.disabled = false;
+              startButton.textContent = "Start Scraping";
+              return;
+            }
+
+            const chunk = decoder.decode(value, { stream: true });
+            output.textContent += chunk;
+            output.scrollTop = output.scrollHeight;
+            readStream();
+          })
+          .catch((error) => {
+            output.textContent += `\nError: ${error.message}`;
+            startButton.disabled = false;
+            startButton.textContent = "Start Scraping";
+          });
+      }
+
+      readStream();
+    })
+    .catch((error) => {
+      output.textContent = `Error starting scrape: ${error.message}`;
+      startButton.disabled = false;
+      startButton.textContent = "Start Scraping";
+    });
+}
```
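The client reads the response body incrementally with `ReadableStream.getReader()`, which only shows progress live if `/scrape` streams its output instead of buffering it. The server route is not part of this compare, but since `fetch_listings` is a generator yielding progress strings, it is plausibly wrapped in a streaming response; a minimal Flask sketch (only the `/scrape` path comes from the JS above, everything else is an assumption):

```python
# Hypothetical server side of the stream; only "/scrape" is confirmed.
from flask import Flask, Response, stream_with_context

app = Flask(__name__)

def fetch_listings():
    # Stand-in for the real generator, which yields progress lines.
    yield "Processing region + keyword (1/1)...\n"
    yield "Fetching job page: https://example.com/job/1\n"

@app.route("/scrape")
def scrape():
    # stream_with_context keeps the request context alive while the
    # generator is consumed, so chunks reach the client as they're yielded.
    return Response(stream_with_context(fetch_listings()),
                    mimetype="text/plain")
```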
```diff
@@ -18,50 +18,5 @@ content %}
   ></div>
 </div>
 {% endblock %} {% block scripts %}
-<script>
-  function startScrape() {
-    const output = document.getElementById("output");
-    const startButton = document.getElementById("start-scrape");
-
-    output.textContent = "Starting scrape...\n";
-    startButton.disabled = true;
-    startButton.textContent = "Scraping...";
-
-    fetch("/scrape")
-      .then((response) => {
-        const reader = response.body.getReader();
-        const decoder = new TextDecoder();
-
-        function readStream() {
-          reader
-            .read()
-            .then(({ done, value }) => {
-              if (done) {
-                output.textContent += "\nScraping completed!";
-                startButton.disabled = false;
-                startButton.textContent = "Start Scraping";
-                return;
-              }
-
-              const chunk = decoder.decode(value, { stream: true });
-              output.textContent += chunk;
-              output.scrollTop = output.scrollHeight;
-              readStream();
-            })
-            .catch((error) => {
-              output.textContent += `\nError: ${error.message}`;
-              startButton.disabled = false;
-              startButton.textContent = "Start Scraping";
-            });
-        }
-
-        readStream();
-      })
-      .catch((error) => {
-        output.textContent = `Error starting scrape: ${error.message}`;
-        startButton.disabled = false;
-        startButton.textContent = "Start Scraping";
-      });
-  }
-</script>
+<script src="{{ url_for('static', filename='scrape.js') }}"></script>
 {% endblock %}
```