# calminer/services/importers.py
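"""Import ingestion for project and scenario uploads.

Parses CSV/Excel files into validated rows, stages preview results behind
one-time tokens, and commits staged rows inside a unit of work, recording
audit logs and metrics along the way.
"""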
from __future__ import annotations
import logging
import time
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Any, BinaryIO, Callable, Generic, Iterable, Mapping, TypeVar, cast
from uuid import uuid4
from types import MappingProxyType
import pandas as pd
from pandas import DataFrame
from pydantic import BaseModel, ValidationError
from models import Project, Scenario
from schemas.imports import ProjectImportRow, ScenarioImportRow
from services.unit_of_work import UnitOfWork
from models.import_export_log import ImportExportLog
from monitoring.metrics import observe_import
logger = logging.getLogger(__name__)
TImportRow = TypeVar("TImportRow", bound=BaseModel)
PROJECT_COLUMNS: tuple[str, ...] = (
"name",
"location",
"operation_type",
"description",
"created_at",
"updated_at",
)
SCENARIO_COLUMNS: tuple[str, ...] = (
"project_name",
"name",
"status",
"start_date",
"end_date",
"discount_rate",
"currency",
"primary_resource",
"description",
"created_at",
"updated_at",
)
@dataclass(slots=True)
class ImportRowError:
row_number: int
field: str | None
message: str
@dataclass(slots=True)
class ParsedImportRow(Generic[TImportRow]):
row_number: int
data: TImportRow
@dataclass(slots=True)
class ImportResult(Generic[TImportRow]):
rows: list[ParsedImportRow[TImportRow]]
errors: list[ImportRowError]
class UnsupportedImportFormat(ValueError):
    """Raised when an uploaded file is not CSV or Excel."""
class ImportPreviewState(str, Enum):
NEW = "new"
UPDATE = "update"
SKIP = "skip"
ERROR = "error"
@dataclass(slots=True)
class ImportPreviewRow(Generic[TImportRow]):
row_number: int
data: TImportRow
state: ImportPreviewState
issues: list[str]
context: dict[str, Any] | None = None
@dataclass(slots=True)
class ImportPreviewSummary:
total_rows: int
accepted: int
skipped: int
errored: int
@dataclass(slots=True)
class ImportPreview(Generic[TImportRow]):
rows: list[ImportPreviewRow[TImportRow]]
summary: ImportPreviewSummary
row_issues: list["ImportPreviewRowIssues"]
parser_errors: list[ImportRowError]
stage_token: str | None
@dataclass(slots=True)
class StagedRow(Generic[TImportRow]):
parsed: ParsedImportRow[TImportRow]
context: dict[str, Any]
@dataclass(slots=True)
class ImportPreviewRowIssue:
message: str
field: str | None = None
@dataclass(slots=True)
class ImportPreviewRowIssues:
row_number: int
state: ImportPreviewState | None
issues: list[ImportPreviewRowIssue]
@dataclass(slots=True)
class StagedImport(Generic[TImportRow]):
token: str
rows: list[StagedRow[TImportRow]]
@dataclass(slots=True, frozen=True)
class StagedRowView(Generic[TImportRow]):
row_number: int
data: TImportRow
context: Mapping[str, Any]
@dataclass(slots=True, frozen=True)
class StagedImportView(Generic[TImportRow]):
token: str
rows: tuple[StagedRowView[TImportRow], ...]
@dataclass(slots=True, frozen=True)
class ImportCommitSummary:
created: int
updated: int
@dataclass(slots=True, frozen=True)
class ImportCommitResult(Generic[TImportRow]):
token: str
rows: tuple[StagedRowView[TImportRow], ...]
summary: ImportCommitSummary
UnitOfWorkFactory = Callable[[], UnitOfWork]
class ImportIngestionService:
"""Coordinates parsing, validation, and preview staging for imports."""
def __init__(self, uow_factory: UnitOfWorkFactory) -> None:
self._uow_factory = uow_factory
self._project_stage: dict[str, StagedImport[ProjectImportRow]] = {}
self._scenario_stage: dict[str, StagedImport[ScenarioImportRow]] = {}
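        # Stages live in process memory only: tokens do not survive a restart
        # and are removed on commit or clear.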
def preview_projects(
self,
stream: BinaryIO,
filename: str,
) -> ImportPreview[ProjectImportRow]:
start = time.perf_counter()
result = load_project_imports(stream, filename)
status = "success" if not result.errors else "partial"
self._record_audit_log(
action="preview",
dataset="projects",
status=status,
filename=filename,
row_count=len(result.rows),
detail=f"accepted={len(result.rows)} parser_errors={len(result.errors)}",
)
observe_import(
action="preview",
dataset="projects",
status=status,
seconds=time.perf_counter() - start,
)
logger.info(
"import.preview",
extra={
"event": "import.preview",
"dataset": "projects",
"status": status,
"filename": filename,
"row_count": len(result.rows),
"error_count": len(result.errors),
},
)
parser_errors = result.errors
preview_rows: list[ImportPreviewRow[ProjectImportRow]] = []
staged_rows: list[StagedRow[ProjectImportRow]] = []
accepted = skipped = errored = 0
seen_names: set[str] = set()
existing_by_name: dict[str, Project] = {}
if result.rows:
with self._uow_factory() as uow:
if not uow.projects:
raise RuntimeError("Project repository is unavailable")
existing_by_name = dict(
uow.projects.find_by_names(
parsed.data.name for parsed in result.rows
)
)
for parsed in result.rows:
name_key = _normalise_key(parsed.data.name)
issues: list[str] = []
context: dict[str, Any] | None = None
state = ImportPreviewState.NEW
if name_key in seen_names:
state = ImportPreviewState.SKIP
issues.append(
"Duplicate project name within upload; row skipped.")
else:
seen_names.add(name_key)
existing = existing_by_name.get(name_key)
if existing:
state = ImportPreviewState.UPDATE
context = {
"mode": "update",
"project_id": existing.id,
}
issues.append("Existing project will be updated.")
else:
context = {"mode": "create"}
preview_rows.append(
ImportPreviewRow(
row_number=parsed.row_number,
data=parsed.data,
state=state,
issues=issues,
context=context,
)
)
if state in {ImportPreviewState.NEW, ImportPreviewState.UPDATE}:
accepted += 1
                staged_rows.append(
                    StagedRow(parsed=parsed, context=context or {"mode": "create"})
                )
elif state == ImportPreviewState.SKIP:
skipped += 1
else:
errored += 1
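        # Several field errors can share a row; count each failed row once.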
parser_error_rows = {error.row_number for error in parser_errors}
errored += len(parser_error_rows)
total_rows = len(preview_rows) + len(parser_error_rows)
summary = ImportPreviewSummary(
total_rows=total_rows,
accepted=accepted,
skipped=skipped,
errored=errored,
)
row_issues = _compile_row_issues(preview_rows, parser_errors)
stage_token: str | None = None
if staged_rows:
stage_token = self._store_project_stage(staged_rows)
return ImportPreview(
rows=preview_rows,
summary=summary,
row_issues=row_issues,
parser_errors=parser_errors,
stage_token=stage_token,
)
def preview_scenarios(
self,
stream: BinaryIO,
filename: str,
) -> ImportPreview[ScenarioImportRow]:
start = time.perf_counter()
result = load_scenario_imports(stream, filename)
status = "success" if not result.errors else "partial"
self._record_audit_log(
action="preview",
dataset="scenarios",
status=status,
filename=filename,
row_count=len(result.rows),
detail=f"accepted={len(result.rows)} parser_errors={len(result.errors)}",
)
observe_import(
action="preview",
dataset="scenarios",
status=status,
seconds=time.perf_counter() - start,
)
logger.info(
"import.preview",
extra={
"event": "import.preview",
"dataset": "scenarios",
"status": status,
"filename": filename,
"row_count": len(result.rows),
"error_count": len(result.errors),
},
)
parser_errors = result.errors
preview_rows: list[ImportPreviewRow[ScenarioImportRow]] = []
staged_rows: list[StagedRow[ScenarioImportRow]] = []
accepted = skipped = errored = 0
seen_pairs: set[tuple[str, str]] = set()
existing_projects: dict[str, Project] = {}
existing_scenarios: dict[tuple[int, str], Scenario] = {}
if result.rows:
with self._uow_factory() as uow:
if not uow.projects or not uow.scenarios:
raise RuntimeError("Repositories are unavailable")
existing_projects = dict(
uow.projects.find_by_names(
parsed.data.project_name for parsed in result.rows
)
)
names_by_project: dict[int, set[str]] = {}
for parsed in result.rows:
project = existing_projects.get(
_normalise_key(parsed.data.project_name)
)
if not project:
continue
names_by_project.setdefault(project.id, set()).add(
_normalise_key(parsed.data.name)
)
for project_id, names in names_by_project.items():
                matches = uow.scenarios.find_by_project_and_names(project_id, names)
for name_key, scenario in matches.items():
existing_scenarios[(project_id, name_key)] = scenario
for parsed in result.rows:
project_key = _normalise_key(parsed.data.project_name)
scenario_key = _normalise_key(parsed.data.name)
issues: list[str] = []
context: dict[str, Any] | None = None
state = ImportPreviewState.NEW
if (project_key, scenario_key) in seen_pairs:
state = ImportPreviewState.SKIP
issues.append(
"Duplicate scenario for project within upload; row skipped."
)
else:
seen_pairs.add((project_key, scenario_key))
project = existing_projects.get(project_key)
if not project:
state = ImportPreviewState.ERROR
issues.append(
f"Project '{parsed.data.project_name}' does not exist."
)
else:
context = {"mode": "create", "project_id": project.id}
                    existing = existing_scenarios.get((project.id, scenario_key))
if existing:
state = ImportPreviewState.UPDATE
context = {
"mode": "update",
"project_id": project.id,
"scenario_id": existing.id,
}
issues.append("Existing scenario will be updated.")
preview_rows.append(
ImportPreviewRow(
row_number=parsed.row_number,
data=parsed.data,
state=state,
issues=issues,
context=context,
)
)
if state in {ImportPreviewState.NEW, ImportPreviewState.UPDATE}:
accepted += 1
                staged_rows.append(
                    StagedRow(parsed=parsed, context=context or {"mode": "create"})
                )
elif state == ImportPreviewState.SKIP:
skipped += 1
else:
errored += 1
parser_error_rows = {error.row_number for error in parser_errors}
errored += len(parser_error_rows)
total_rows = len(preview_rows) + len(parser_error_rows)
summary = ImportPreviewSummary(
total_rows=total_rows,
accepted=accepted,
skipped=skipped,
errored=errored,
)
row_issues = _compile_row_issues(preview_rows, parser_errors)
stage_token: str | None = None
if staged_rows:
stage_token = self._store_scenario_stage(staged_rows)
return ImportPreview(
rows=preview_rows,
summary=summary,
row_issues=row_issues,
parser_errors=parser_errors,
stage_token=stage_token,
)
def get_staged_projects(
self, token: str
) -> StagedImportView[ProjectImportRow] | None:
staged = self._project_stage.get(token)
if not staged:
return None
return _build_staged_view(staged)
def get_staged_scenarios(
self, token: str
) -> StagedImportView[ScenarioImportRow] | None:
staged = self._scenario_stage.get(token)
if not staged:
return None
return _build_staged_view(staged)
def consume_staged_projects(
self, token: str
) -> StagedImportView[ProjectImportRow] | None:
staged = self._project_stage.pop(token, None)
if not staged:
return None
return _build_staged_view(staged)
def consume_staged_scenarios(
self, token: str
) -> StagedImportView[ScenarioImportRow] | None:
staged = self._scenario_stage.pop(token, None)
if not staged:
return None
return _build_staged_view(staged)
def clear_staged_projects(self, token: str) -> bool:
return self._project_stage.pop(token, None) is not None
def clear_staged_scenarios(self, token: str) -> bool:
return self._scenario_stage.pop(token, None) is not None
def commit_project_import(self, token: str) -> ImportCommitResult[ProjectImportRow]:
staged = self._project_stage.get(token)
if not staged:
raise ValueError(f"Unknown project import token: {token}")
staged_view = _build_staged_view(staged)
created = updated = 0
start = time.perf_counter()
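        # The stage entry is popped only after a successful commit, so a failed
        # attempt leaves the token valid for a retry.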
try:
with self._uow_factory() as uow:
if not uow.projects:
raise RuntimeError("Project repository is unavailable")
for row in staged.rows:
mode = row.context.get("mode")
data = row.parsed.data
if mode == "create":
project = Project(
name=data.name,
location=data.location,
operation_type=data.operation_type,
description=data.description,
)
if data.created_at:
project.created_at = data.created_at
if data.updated_at:
project.updated_at = data.updated_at
uow.projects.create(project)
created += 1
elif mode == "update":
project_id = row.context.get("project_id")
                        if project_id is None:
raise ValueError(
"Staged project update is missing project_id context"
)
project = uow.projects.get(project_id)
project.name = data.name
project.location = data.location
project.operation_type = data.operation_type
project.description = data.description
if data.created_at:
project.created_at = data.created_at
if data.updated_at:
project.updated_at = data.updated_at
updated += 1
else:
raise ValueError(
f"Unsupported staged project mode: {mode!r}")
except Exception as exc:
self._record_audit_log(
action="commit",
dataset="projects",
status="failure",
filename=None,
row_count=len(staged.rows),
detail=f"error={type(exc).__name__}: {exc}",
)
observe_import(
action="commit",
dataset="projects",
status="failure",
seconds=time.perf_counter() - start,
)
logger.exception(
"import.commit.failed",
extra={
"event": "import.commit",
"dataset": "projects",
"status": "failure",
"row_count": len(staged.rows),
"token": token,
},
)
raise
else:
self._record_audit_log(
action="commit",
dataset="projects",
status="success",
filename=None,
row_count=len(staged.rows),
detail=f"created={created} updated={updated}",
)
observe_import(
action="commit",
dataset="projects",
status="success",
seconds=time.perf_counter() - start,
)
logger.info(
"import.commit",
extra={
"event": "import.commit",
"dataset": "projects",
"status": "success",
"row_count": len(staged.rows),
"created": created,
"updated": updated,
"token": token,
},
)
self._project_stage.pop(token, None)
return ImportCommitResult(
token=token,
rows=staged_view.rows,
summary=ImportCommitSummary(created=created, updated=updated),
)
def commit_scenario_import(self, token: str) -> ImportCommitResult[ScenarioImportRow]:
staged = self._scenario_stage.get(token)
if not staged:
raise ValueError(f"Unknown scenario import token: {token}")
staged_view = _build_staged_view(staged)
created = updated = 0
start = time.perf_counter()
try:
with self._uow_factory() as uow:
if not uow.scenarios or not uow.projects:
raise RuntimeError("Scenario repositories are unavailable")
for row in staged.rows:
mode = row.context.get("mode")
data = row.parsed.data
project_id = row.context.get("project_id")
                    if project_id is None:
raise ValueError(
"Staged scenario row is missing project_id context"
)
project = uow.projects.get(project_id)
if mode == "create":
scenario = Scenario(
project_id=project.id,
name=data.name,
status=data.status,
start_date=data.start_date,
end_date=data.end_date,
discount_rate=data.discount_rate,
currency=data.currency,
primary_resource=data.primary_resource,
description=data.description,
)
if data.created_at:
scenario.created_at = data.created_at
if data.updated_at:
scenario.updated_at = data.updated_at
uow.scenarios.create(scenario)
created += 1
elif mode == "update":
scenario_id = row.context.get("scenario_id")
                        if scenario_id is None:
raise ValueError(
"Staged scenario update is missing scenario_id context"
)
scenario = uow.scenarios.get(scenario_id)
scenario.project_id = project.id
scenario.name = data.name
scenario.status = data.status
scenario.start_date = data.start_date
scenario.end_date = data.end_date
scenario.discount_rate = data.discount_rate
scenario.currency = data.currency
scenario.primary_resource = data.primary_resource
scenario.description = data.description
if data.created_at:
scenario.created_at = data.created_at
if data.updated_at:
scenario.updated_at = data.updated_at
updated += 1
else:
raise ValueError(
f"Unsupported staged scenario mode: {mode!r}")
except Exception as exc:
self._record_audit_log(
action="commit",
dataset="scenarios",
status="failure",
filename=None,
row_count=len(staged.rows),
detail=f"error={type(exc).__name__}: {exc}",
)
observe_import(
action="commit",
dataset="scenarios",
status="failure",
seconds=time.perf_counter() - start,
)
logger.exception(
"import.commit.failed",
extra={
"event": "import.commit",
"dataset": "scenarios",
"status": "failure",
"row_count": len(staged.rows),
"token": token,
},
)
raise
else:
self._record_audit_log(
action="commit",
dataset="scenarios",
status="success",
filename=None,
row_count=len(staged.rows),
detail=f"created={created} updated={updated}",
)
observe_import(
action="commit",
dataset="scenarios",
status="success",
seconds=time.perf_counter() - start,
)
logger.info(
"import.commit",
extra={
"event": "import.commit",
"dataset": "scenarios",
"status": "success",
"row_count": len(staged.rows),
"created": created,
"updated": updated,
"token": token,
},
)
self._scenario_stage.pop(token, None)
return ImportCommitResult(
token=token,
rows=staged_view.rows,
summary=ImportCommitSummary(created=created, updated=updated),
)
def _record_audit_log(
self,
*,
action: str,
dataset: str,
status: str,
row_count: int,
        detail: str | None,
        filename: str | None,
) -> None:
try:
with self._uow_factory() as uow:
if uow.session is None:
return
log = ImportExportLog(
action=action,
dataset=dataset,
status=status,
filename=filename,
row_count=row_count,
detail=detail,
)
uow.session.add(log)
uow.commit()
except Exception:
# Audit logging must not break core workflows
pass
def _store_project_stage(
self, rows: list[StagedRow[ProjectImportRow]]
) -> str:
token = str(uuid4())
self._project_stage[token] = StagedImport(token=token, rows=rows)
return token
def _store_scenario_stage(
self, rows: list[StagedRow[ScenarioImportRow]]
) -> str:
token = str(uuid4())
self._scenario_stage[token] = StagedImport(token=token, rows=rows)
return token
def load_project_imports(stream: BinaryIO, filename: str) -> ImportResult[ProjectImportRow]:
df = _load_dataframe(stream, filename)
return _parse_dataframe(df, ProjectImportRow, PROJECT_COLUMNS)
def load_scenario_imports(stream: BinaryIO, filename: str) -> ImportResult[ScenarioImportRow]:
df = _load_dataframe(stream, filename)
return _parse_dataframe(df, ScenarioImportRow, SCENARIO_COLUMNS)
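# Illustrative loader usage (assumes a local "projects.csv"):
#   with open("projects.csv", "rb") as fh:
#       result = load_project_imports(fh, fh.name)
#   valid_rows = [parsed.data for parsed in result.rows]
#   for error in result.errors:
#       logger.warning("row %s: %s", error.row_number, error.message)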
def _load_dataframe(stream: BinaryIO, filename: str) -> DataFrame:
stream.seek(0)
suffix = Path(filename).suffix.lower()
if suffix == ".csv":
df = pd.read_csv(stream, dtype=str,
keep_default_na=False, encoding="utf-8")
elif suffix in {".xls", ".xlsx"}:
df = pd.read_excel(stream, dtype=str, engine="openpyxl")
else:
raise UnsupportedImportFormat(
f"Unsupported file type: {suffix or 'unknown'}")
df.columns = [str(col).strip().lower() for col in df.columns]
return df
def _parse_dataframe(
df: DataFrame,
model: type[TImportRow],
expected_columns: Iterable[str],
) -> ImportResult[TImportRow]:
rows: list[ParsedImportRow[TImportRow]] = []
errors: list[ImportRowError] = []
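    # Row numbers are reported 1-based with the header as row 1, so data rows
    # start at 2 (matching what users see in a spreadsheet).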
for index, raw in enumerate(df.to_dict(orient="records"), start=2):
        payload = _prepare_payload(cast(dict[str, object], raw), expected_columns)
try:
rows.append(
ParsedImportRow(row_number=index, data=model(**payload))
)
except ValidationError as exc: # pragma: no cover - exercised via tests
for detail in exc.errors():
loc = ".".join(str(part)
for part in detail.get("loc", [])) or None
errors.append(
ImportRowError(
row_number=index,
field=loc,
message=detail.get("msg", "Invalid value"),
)
)
return ImportResult(rows=rows, errors=errors)
def _prepare_payload(
raw: dict[str, object], expected_columns: Iterable[str]
) -> dict[str, object | None]:
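    """Trim strings and map blanks/NaN to None, keeping only expected columns."""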
payload: dict[str, object | None] = {}
for column in expected_columns:
if column not in raw:
continue
value = raw.get(column)
if isinstance(value, str):
value = value.strip()
if value == "":
value = None
if value is not None and pd.isna(cast(Any, value)):
value = None
payload[column] = value
return payload
def _normalise_key(value: str) -> str:
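    """Normalise a name for case- and whitespace-insensitive matching."""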
return value.strip().lower()
def _build_staged_view(
staged: StagedImport[TImportRow],
) -> StagedImportView[TImportRow]:
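    """Snapshot staged rows as deep-copied models with read-only contexts."""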
rows = tuple(
StagedRowView(
row_number=row.parsed.row_number,
data=cast(TImportRow, _deep_copy_model(row.parsed.data)),
context=MappingProxyType(dict(row.context)),
)
for row in staged.rows
)
return StagedImportView(token=staged.token, rows=rows)
def _deep_copy_model(model: BaseModel) -> BaseModel:
copy_method = getattr(model, "model_copy", None)
if callable(copy_method): # pydantic v2
return cast(BaseModel, copy_method(deep=True))
return model.copy(deep=True) # type: ignore[attr-defined]
def _compile_row_issues(
preview_rows: Iterable[ImportPreviewRow[Any]],
parser_errors: Iterable[ImportRowError],
) -> list[ImportPreviewRowIssues]:
issue_map: dict[int, ImportPreviewRowIssues] = {}
def ensure_bundle(
row_number: int,
state: ImportPreviewState | None,
) -> ImportPreviewRowIssues:
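        """Fetch or create the bundle for a row, keeping the most severe state."""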
bundle = issue_map.get(row_number)
if bundle is None:
bundle = ImportPreviewRowIssues(
row_number=row_number,
state=state,
issues=[],
)
issue_map[row_number] = bundle
else:
if _state_priority(state) > _state_priority(bundle.state):
bundle.state = state
return bundle
for row in preview_rows:
if not row.issues:
continue
bundle = ensure_bundle(row.row_number, row.state)
for message in row.issues:
bundle.issues.append(ImportPreviewRowIssue(message=message))
for error in parser_errors:
bundle = ensure_bundle(error.row_number, ImportPreviewState.ERROR)
bundle.issues.append(
ImportPreviewRowIssue(message=error.message, field=error.field)
)
return sorted(issue_map.values(), key=lambda item: item.row_number)
def _state_priority(state: ImportPreviewState | None) -> int:
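    """Rank states so the most severe one wins when a row collects several."""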
if state is None:
return -1
if state == ImportPreviewState.ERROR:
return 3
if state == ImportPreviewState.SKIP:
return 2
if state == ImportPreviewState.UPDATE:
return 1
return 0