Update DB logic and add fetch logging

2025-09-17 15:24:59 +02:00
parent c4a5ed56b5
commit e26dc9c164
3 changed files with 62 additions and 25 deletions

View File

@@ -9,6 +9,8 @@ from web.db import (
     db_get_all_job_urls,
     db_delete_job,
     remove_job,
+    insert_log,
+    get_last_fetch_time,
 )
 # Import utility functions
@@ -51,8 +53,29 @@ def fetch_listings():
             keyword_name = keyword.get("name")
             if not keyword_name:
                 continue
+            # Build a canonical search identifier for this region+keyword combination.
+            search_page_id = f"search:{region_name}:{keyword_name}"
+            try:
+                last = get_last_fetch_time(search_page_id)
+                if last is not None:
+                    # Skip if this page was fetched within the last 24 hours;
+                    # treat naive timestamps from the DB as UTC before comparing.
+                    age = datetime.now(timezone.utc) - (last if last.tzinfo is not None
+                                                        else last.replace(tzinfo=timezone.utc))
+                    if age.total_seconds() < 24 * 3600:
+                        yield f"Skipping {region_name} + {keyword_name} (fetched {int(age.total_seconds() // 3600)}h ago)...\n"
+                        processed += 1
+                        continue
+            except Exception:
+                # If the log lookup fails, fall through and fetch anyway.
+                pass
             processed += 1
             yield f"Processing {region_name} + {keyword_name} ({processed}/{total_combinations})...\n"
+            # Record that we are fetching this search page now.
+            try:
+                insert_log(search_page_id, region=region_name,
+                           keyword=keyword_name, fetched_at=datetime.now(timezone.utc))
+            except Exception:
+                pass
             for row in process_region_keyword(region_name, keyword_name, discovered_urls):
                 timestamp, region, keyword, title, pay, location, url = row
                 discovered_urls.add(url)
@@ -67,6 +90,8 @@ def fetch_listings():
                     pay=pay,
                     location=location,
                     timestamp=timestamp,
+                    fetched_from=search_page_id,
+                    fetched_at=datetime.now(timezone.utc),
                 )
         # Remove stale listings: those present in DB but not discovered now.
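
Taken together, the hunks above implement a simple 24-hour fetch throttle: look up the latest log row for a search page and refetch only when it is stale. A minimal standalone sketch of that gate, where should_refetch is a hypothetical helper name (not part of this commit) but the get_last_fetch_time contract is the one added below:

from datetime import datetime, timedelta, timezone
from typing import Optional

FETCH_TTL = timedelta(hours=24)

def should_refetch(last: Optional[datetime], now: Optional[datetime] = None) -> bool:
    """True if a page whose latest log timestamp is `last` is due for a refetch."""
    if last is None:
        return True  # never fetched before
    if last.tzinfo is None:
        last = last.replace(tzinfo=timezone.utc)  # treat naive DB values as UTC
    now = now or datetime.now(timezone.utc)
    return now - last >= FETCH_TTL

In the loop above this would read: skip the combination when should_refetch(get_last_fetch_time(search_page_id)) is False.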

View File

@@ -147,6 +147,15 @@ class UserKeyword(Base):
         "keywords.keyword_id", ondelete="CASCADE"), primary_key=True)

+class Log(Base):
+    __tablename__ = "logs"
+
+    id = Column(Integer, primary_key=True, autoincrement=True)
+    # Indexed: get_last_fetch_time filters on this column.
+    page_url = Column(String(URL_LEN), index=True)
+    region = Column(String(SHORT_LEN))
+    keyword = Column(String(SHORT_LEN))
+    fetched_at = Column(DateTime)
+
 def _ensure_session() -> Session:
     global engine, SessionLocal
     if engine is None or SessionLocal is None:
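
One deployment note: the new logs table only comes into existence when the schema is created. Assuming the module wires up its engine the usual declarative way, an idempotent create_all() call is enough; a minimal sketch with a hypothetical SQLite URL:

from sqlalchemy import create_engine

engine = create_engine("sqlite:///jobs.db")  # hypothetical database URL
Base.metadata.create_all(engine)  # creates "logs" if missing; existing tables untouched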
@@ -218,7 +227,7 @@ def upsert_user_interaction(job_id: str | int, *, user_id: Optional[int] = None,
     session.commit()

-def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str, location: str, timestamp: str):
+def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str, location: str, timestamp: str, fetched_from: str | None = None, fetched_at: Optional[datetime] = None):
     """Insert or update a job listing row based on job_id derived from URL."""
     job_id = str(url_to_job_id(url))
     with _ensure_session() as session:
@@ -234,6 +243,33 @@ def upsert_listing(*, url: str, region: str, keyword: str, title: str, pay: str,
         setattr(obj, "location", location)
         setattr(obj, "timestamp", timestamp)
         session.commit()
+    # Optionally record a fetch log for the listing if a source page was provided.
+    if fetched_from:
+        try:
+            insert_log(fetched_from, region=region, keyword=keyword,
+                       fetched_at=fetched_at or datetime.now(timezone.utc))
+        except Exception:
+            pass
+
+
+def insert_log(page_url: str, region: str | None = None, keyword: str | None = None, fetched_at: Optional[datetime] = None):
+    """Insert a log row for a fetched page."""
+    fetched_at = fetched_at or datetime.now(timezone.utc)
+    with _ensure_session() as session:
+        log_row = Log(page_url=page_url, region=region or '',
+                      keyword=keyword or '', fetched_at=fetched_at)
+        session.add(log_row)
+        session.commit()
+
+
+def get_last_fetch_time(page_url: str) -> Optional[datetime]:
+    """Return the latest fetched_at for a given page_url, or None if never fetched."""
+    # Query through the ORM so the DateTime column is coerced back to a
+    # datetime on every backend (a raw text() SELECT returns a plain
+    # string on SQLite).
+    with _ensure_session() as session:
+        row = (session.query(Log.fetched_at)
+               .filter(Log.page_url == page_url)
+               .order_by(Log.fetched_at.desc())
+               .first())
+        return row[0] if row else None
+
+
 def upsert_job_details(job_data: Dict[str, Any]):
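
The two helpers compose into a read-after-write round trip. A minimal sketch, assuming the module's engine and session are already configured (as _ensure_session does lazily on first use) and using hypothetical region/keyword values:

from datetime import datetime, timezone

page = "search:newyork:python"  # hypothetical search-page id
insert_log(page, region="newyork", keyword="python",
           fetched_at=datetime.now(timezone.utc))

last = get_last_fetch_time(page)
assert last is not None
print("last fetched:", last.isoformat())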

View File

@@ -93,9 +93,6 @@ def get_paths() -> dict:
     return get_config().get('paths', {})

 def get_logs_dir() -> str:
     return get_paths().get('logs_dir', 'logs')
@@ -128,9 +125,6 @@ def get_base_url() -> str:
     return get_config().get('scraper', {}).get('base_url', "https://{region}.craigslist.org/search/jjj?query={keyword}&sort=rel")

 def now_iso() -> str:
     """Get the current time in ISO format."""
     return datetime.now(UTC).isoformat()
@@ -170,9 +164,6 @@ def get_url_from_filename(name: str) -> str:
     return url_guess

 def safe_get_text(element, default="N/A"):
     """Safely extract text from a BeautifulSoup element."""
     return element.get_text(strip=True) if element else default
@@ -188,21 +179,6 @@ def get_random_delay(min_delay: int = get_min_delay(), max_delay: int = get_max_
     return random.uniform(min_delay, max_delay)

 def get_color_from_string(s: str) -> str:
     """Generate a color code from a string."""
     hash_code = hash(s)
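
A side note on the trailing context: get_color_from_string seeds the color from the built-in hash(), which Python salts per process for strings (PYTHONHASHSEED), so the same keyword can get a different color on every run. A deterministic sketch, with stable_color_from_string as a hypothetical name not part of this commit:

import hashlib

def stable_color_from_string(s: str) -> str:
    """Derive a stable #rrggbb color from a string via a fixed hash."""
    r, g, b = hashlib.md5(s.encode("utf-8")).digest()[:3]
    return f"#{r:02x}{g:02x}{b:02x}"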