fix: update fetch logic to skip jobs fetched within the last 24 hours (previously 1 hour) and lower the scraper's make_request_with_retry argument from 3 to 1
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -166,3 +166,4 @@ cython_debug/
|
|||||||
docs/online.md
|
docs/online.md
|
||||||
.github/copilot*
|
.github/copilot*
|
||||||
.github/TODO.md
|
.github/TODO.md
|
||||||
|
.vscode/launch.json
|
||||||
|
|||||||
@@ -175,10 +175,10 @@ def fetch_listings():
|
|||||||
def process_job_url(job_url: str, region: str = "", keyword: str = ""):
|
def process_job_url(job_url: str, region: str = "", keyword: str = ""):
|
||||||
last = get_last_fetch_time(job_url)
|
last = get_last_fetch_time(job_url)
|
||||||
if last is not None:
|
if last is not None:
|
||||||
# skip if fetched within the last hour
|
# skip if fetched within the last 24 hours
|
||||||
age = datetime.now(
|
age = datetime.now(
|
||||||
timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
|
timezone.utc) - (last if last.tzinfo is not None else last.replace(tzinfo=timezone.utc))
|
||||||
if age.total_seconds() < 1 * 3600:
|
if age.total_seconds() < 24 * 3600:
|
||||||
yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n"
|
yield f"Skipping job {job_url} (fetched {age.seconds//3600}h ago)...\n"
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@@ -224,7 +224,7 @@ def scrape_job_data(content: str, region: str, keyword: str, seen_urls: Set[str]
|
|||||||
def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
|
def process_region_keyword(region: str, keyword: str, seen_urls: Set[str]) -> List[List]:
|
||||||
"""Process a single region and keyword."""
|
"""Process a single region and keyword."""
|
||||||
url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
|
url = get_base_url().format(region=region, keyword=keyword.replace(" ", "+"))
|
||||||
content = make_request_with_retry(url, 3)
|
content = make_request_with_retry(url, 1)
|
||||||
if content is None:
|
if content is None:
|
||||||
return []
|
return []
|
||||||
return scrape_job_data(content, region, keyword, seen_urls)
|
return scrape_job_data(content, region, keyword, seen_urls)
|
||||||
|
|||||||
Reference in New Issue
Block a user