remove caching

This commit is contained in:
georg.sinn-schirwitz
2025-09-08 14:44:46 +02:00
parent f8e23d0fba
commit 042a196718
13 changed files with 144 additions and 525 deletions

View File

@@ -93,8 +93,7 @@ def get_paths() -> dict:
return get_config().get('paths', {})
def get_cache_dir() -> str:
    """Return the cache directory from the paths config, defaulting to ``cache``."""
    paths_cfg = get_paths()
    return paths_cfg.get('cache_dir', 'cache')
def get_logs_dir() -> str:
@@ -129,9 +128,7 @@ def get_base_url() -> str:
return get_config().get('scraper', {}).get('base_url', "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel")
def ensure_cache_dir():
    """Create the cache directory if it is missing; do nothing if it exists."""
    cache_dir = get_cache_dir()
    os.makedirs(cache_dir, exist_ok=True)
def now_iso() -> str:
@@ -173,10 +170,7 @@ def get_url_from_filename(name: str) -> str:
return url_guess
def get_cached_content(url: str) -> str:
    """Return the UTF-8 text stored in the cache file for ``url``.

    The file is opened unconditionally, so any error from ``open`` (for
    example when nothing has been cached for ``url``) propagates to the caller.
    """
    cache_file = get_cache_path(url)
    with open(cache_file, "r", encoding="utf-8") as handle:
        content = handle.read()
    return content
def safe_get_text(element, default="N/A"):
@@ -194,51 +188,19 @@ def get_random_delay(min_delay: int = get_min_delay(), max_delay: int = get_max_
return random.uniform(min_delay, max_delay)
def get_cache_path(url: str) -> str:
    """Build the path of the ``.html`` cache file that corresponds to ``url``."""
    cache_dir = get_cache_dir()
    file_name = "{}.html".format(get_filename_from_url(url))
    return os.path.join(cache_dir, file_name)
def cache_page(url: str, content: str):
    """Write ``content`` to the cache file for ``url`` and refresh its timestamps."""
    target = get_cache_path(url)
    with open(target, "w", encoding="utf-8") as out:
        out.write(content)
    # With no explicit times, utime stamps the file with the current time.
    os.utime(target)
def is_cached(url: str) -> bool:
    """Report whether a usable cache file exists for ``url``.

    A missing cache file is never usable. For URLs containing the substring
    ``search`` the file must additionally be no older than 24 hours (judged by
    its modification time); every other URL is usable as soon as the file exists.
    """
    cache_file = get_cache_path(url)
    if os.path.isfile(cache_file):
        if 'search' not in url:
            return True
        max_age_seconds = 24 * 3600
        age_seconds = time.time() - os.path.getmtime(cache_file)
        return age_seconds <= max_age_seconds
    return False
def is_cache_stale(last_modified: str, days: int = 1) -> bool:
    """Return True when a cached page should be considered stale.

    Args:
        last_modified: Timestamp in a format accepted by
            ``datetime.fromisoformat``. An empty or ``None`` value counts as stale.
        days: Maximum accepted age in days (defaults to 1, i.e. 24 hours).

    Returns:
        True if ``last_modified`` is missing, cannot be parsed, or lies more
        than ``days`` days in the past; False otherwise.
    """
    if not last_modified:
        return True
    try:
        last_datetime = datetime.fromisoformat(last_modified)
    except (TypeError, ValueError):
        # An unparseable timestamp gives no evidence of freshness, so report
        # the entry as stale rather than crashing the caller.
        return True
    age_seconds = time.time() - last_datetime.timestamp()
    return age_seconds > days * 24 * 3600
def delete_cached_page(url: str):
    """Remove the cache file for ``url`` on a best-effort basis.

    A missing file, or any other filesystem error raised while removing it,
    is ignored, so callers can invoke this without checking first.
    """
    cache_fp = get_cache_path(url)
    try:
        # Attempt the removal directly instead of checking existence first,
        # which would leave a window for the file to vanish in between.
        os.remove(cache_fp)
    except OSError:
        # Deliberately best-effort: covers a missing file, permission
        # problems, and a path that is a directory.
        pass
def get_color_from_string(s: str) -> str: