Compare commits
3 commits: c4761c257c ... 5cbb760005

| SHA1 |
|---|
| 5cbb760005 |
| 2ae1e2058d |
| e549fae3f6 |
setup.py (14 changed lines)
@@ -36,15 +36,15 @@ try:
     engine = create_engine(url, future=True)
     with engine.begin() as conn:
         for table in [
-            "users",
-            "regions",
-            "keywords",
-            "user_regions",
-            "user_keywords",
-            "job_listings",
             "job_descriptions",
-            "cached_pages",
+            "job_listings",
+            "keywords",
+            "logs",
+            "regions",
+            "users",
             "user_interactions",
+            "user_keywords",
+            "user_regions",
         ]:
             try:
                 n = conn.execute(text(f"SELECT COUNT(*) FROM {table}"))
@@ -15,6 +15,7 @@ from web.db import (
 
 # Import utility functions
 from web.utils import (
+    get_base_url,
     make_request_with_retry,
     now_iso,
 )
@@ -54,9 +55,10 @@ def fetch_listings():
         if not keyword_name:
             continue
         # Build a canonical search identifier for this region+keyword combination.
+        url = get_base_url().format(region=region, keyword=keyword_name.replace(" ", "+"))
         search_page_id = f"search:{region_name}:{keyword_name}"
         try:
-            last = get_last_fetch_time(search_page_id)
+            last = get_last_fetch_time(url)
             if last is not None:
                 # skip if fetched within the last 24 hours
                 age = datetime.now(
@@ -72,7 +74,7 @@ def fetch_listings():
         yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
         # record that we're fetching this search page now
         try:
-            insert_log(search_page_id, region=region_name,
+            insert_log(url, region=region_name,
                        keyword=keyword_name, fetched_at=datetime.now(timezone.utc))
         except Exception:
             pass
@@ -99,6 +101,15 @@ def fetch_listings():
 
 
 def process_job_url(job_url: str, region: str = "", keyword: str = ""):
+    last = get_last_fetch_time(job_url)
+    if last is not None:
+        # skip if fetched within the last 24 hours
+        age = datetime.now(
+            timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
+        if age.total_seconds() < 24 * 3600:
+            yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n"
+            return None
+
     try:
         job_id = url_to_job_id(job_url)
         yield f"Fetching job page: {job_url}\n"
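The block added to process_job_url above treats a timezone-naive timestamp as UTC before comparing it against a 24-hour window. Below is a minimal standalone sketch of that check, assuming get_last_fetch_time returns a datetime or None as the surrounding code implies; is_fresh is a hypothetical helper, not part of this diff.

```python
from datetime import datetime, timezone

def is_fresh(last, max_age_hours=24):
    # Hypothetical helper mirroring the staleness check added in the diff.
    if last is None:
        return False
    # Treat a naive timestamp as UTC, as the diff does, before subtracting.
    if last.tzinfo is None:
        last = last.replace(tzinfo=timezone.utc)
    age = datetime.now(timezone.utc) - last
    return age.total_seconds() < max_age_hours * 3600
```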
@@ -14,10 +14,8 @@ Tables:
 """
 
 from datetime import datetime, UTC
-import os
 from typing import Optional, Dict, Any, List
 from web.utils import (
-    get_url_from_filename,
     get_color_from_string,
     url_to_job_id,
     normalize_job_id,
web/static/scrape.js (new file, 44 lines)
@@ -0,0 +1,44 @@
+function startScrape() {
+  const output = document.getElementById("output");
+  const startButton = document.getElementById("start-scrape");
+
+  output.textContent = "Starting scrape...\n";
+  startButton.disabled = true;
+  startButton.textContent = "Scraping...";
+
+  fetch("/scrape")
+    .then((response) => {
+      const reader = response.body.getReader();
+      const decoder = new TextDecoder();
+
+      function readStream() {
+        reader
+          .read()
+          .then(({ done, value }) => {
+            if (done) {
+              output.textContent += "\nScraping completed!";
+              startButton.disabled = false;
+              startButton.textContent = "Start Scraping";
+              return;
+            }
+
+            const chunk = decoder.decode(value, { stream: true });
+            output.textContent += chunk;
+            output.scrollTop = output.scrollHeight;
+            readStream();
+          })
+          .catch((error) => {
+            output.textContent += `\nError: ${error.message}`;
+            startButton.disabled = false;
+            startButton.textContent = "Start Scraping";
+          });
+      }
+
+      readStream();
+    })
+    .catch((error) => {
+      output.textContent = `Error starting scrape: ${error.message}`;
+      startButton.disabled = false;
+      startButton.textContent = "Start Scraping";
+    });
+}
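The reader loop above consumes a streamed response from /scrape, and the generators in this compare (fetch_listings, process_job_url) yield progress strings. The server route itself is not part of the diff; below is a minimal sketch of how such an endpoint could stream those strings, assuming Flask (suggested by the url_for/Jinja template below) and using a stand-in generator.

```python
from flask import Flask, Response

app = Flask(__name__)

def fetch_listings():
    # Stand-in for the real generator in the diff, which yields progress lines.
    yield "Processing region + keyword (1/1)...\n"

@app.route("/scrape")
def scrape():
    # Streaming the generator lets the browser's reader render chunks as they arrive.
    return Response(fetch_listings(), mimetype="text/plain")
```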
@@ -18,50 +18,5 @@ content %}
 ></div>
 </div>
 {% endblock %} {% block scripts %}
-<script>
-  function startScrape() {
-    const output = document.getElementById("output");
-    const startButton = document.getElementById("start-scrape");
-
-    output.textContent = "Starting scrape...\n";
-    startButton.disabled = true;
-    startButton.textContent = "Scraping...";
-
-    fetch("/scrape")
-      .then((response) => {
-        const reader = response.body.getReader();
-        const decoder = new TextDecoder();
-
-        function readStream() {
-          reader
-            .read()
-            .then(({ done, value }) => {
-              if (done) {
-                output.textContent += "\nScraping completed!";
-                startButton.disabled = false;
-                startButton.textContent = "Start Scraping";
-                return;
-              }
-
-              const chunk = decoder.decode(value, { stream: true });
-              output.textContent += chunk;
-              output.scrollTop = output.scrollHeight;
-              readStream();
-            })
-            .catch((error) => {
-              output.textContent += `\nError: ${error.message}`;
-              startButton.disabled = false;
-              startButton.textContent = "Start Scraping";
-            });
-        }
-
-        readStream();
-      })
-      .catch((error) => {
-        output.textContent = `Error starting scrape: ${error.message}`;
-        startButton.disabled = false;
-        startButton.textContent = "Start Scraping";
-      });
-  }
-</script>
+<script src="{{ url_for('static', filename='scrape.js') }}"></script>
 {% endblock %}