fix: update fetch logic to skip jobs fetched within the last 24 hours and adjust retry attempts in scraper
Some checks failed
CI/CD Pipeline / test (push) Failing after 20s
CI/CD Pipeline / build-image (push) Has been skipped

This commit is contained in:
2025-11-28 20:54:39 +01:00
parent e0bc295936
commit 02e3e77f78
3 changed files with 4 additions and 3 deletions

1
.gitignore vendored
View File

@@ -166,3 +166,4 @@ cython_debug/
 docs/online.md
 .github/copilot*
 .github/TODO.md
+.vscode/launch.json

View File

@@ -175,10 +175,10 @@ def fetch_listings():
 def process_job_url(job_url: str, region: str = "", keyword: str = ""):
     last = get_last_fetch_time(job_url)
     if last is not None:
-        # skip if fetched within the last hour
+        # skip if fetched within the last 24 hours
         age = datetime.now(
             timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
-        if age.total_seconds() < 1 * 3600:
+        if age.total_seconds() < 24 * 3600:
             yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n"
             return None

View File

@@ -224,7 +224,7 @@ def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]
 def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
     """Process a single region and keyword."""
     url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
-    content = make_request_with_retry(url, 3)
+    content = make_request_with_retry(url, 1)
     if content is None:
         return []
     return scrape_job_data(content, region, keyword, seen_urls)