from __future__ import annotations

import logging
import time
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from types import MappingProxyType
from typing import (
    Any,
    BinaryIO,
    Callable,
    Generic,
    Iterable,
    Mapping,
    Optional,
    TypeVar,
    cast,
)
from uuid import uuid4

import pandas as pd
from pandas import DataFrame
from pydantic import BaseModel, ValidationError

from models import Project, Scenario
from models.import_export_log import ImportExportLog
from monitoring.metrics import observe_import
from schemas.imports import ProjectImportRow, ScenarioImportRow
from services.unit_of_work import UnitOfWork

logger = logging.getLogger(__name__)

TImportRow = TypeVar("TImportRow", bound=BaseModel)

PROJECT_COLUMNS: tuple[str, ...] = (
    "name",
    "location",
    "operation_type",
    "description",
    "created_at",
    "updated_at",
)

SCENARIO_COLUMNS: tuple[str, ...] = (
    "project_name",
    "name",
    "status",
    "start_date",
    "end_date",
    "discount_rate",
    "currency",
    "primary_resource",
    "description",
    "created_at",
    "updated_at",
)


@dataclass(slots=True)
class ImportRowError:
    row_number: int
    field: str | None
    message: str


@dataclass(slots=True)
class ParsedImportRow(Generic[TImportRow]):
    row_number: int
    data: TImportRow


@dataclass(slots=True)
class ImportResult(Generic[TImportRow]):
    rows: list[ParsedImportRow[TImportRow]]
    errors: list[ImportRowError]


class UnsupportedImportFormat(ValueError):
    pass


class ImportPreviewState(str, Enum):
    NEW = "new"
    UPDATE = "update"
    SKIP = "skip"
    ERROR = "error"


@dataclass(slots=True)
class ImportPreviewRow(Generic[TImportRow]):
    row_number: int
    data: TImportRow
    state: ImportPreviewState
    issues: list[str]
    context: dict[str, Any] | None = None


@dataclass(slots=True)
class ImportPreviewSummary:
    total_rows: int
    accepted: int
    skipped: int
    errored: int


@dataclass(slots=True)
class ImportPreview(Generic[TImportRow]):
    rows: list[ImportPreviewRow[TImportRow]]
    summary: ImportPreviewSummary
    row_issues: list["ImportPreviewRowIssues"]
    parser_errors: list[ImportRowError]
    stage_token: str | None


@dataclass(slots=True)
class StagedRow(Generic[TImportRow]):
    parsed: ParsedImportRow[TImportRow]
    context: dict[str, Any]


@dataclass(slots=True)
class ImportPreviewRowIssue:
    message: str
    field: str | None = None


@dataclass(slots=True)
class ImportPreviewRowIssues:
    row_number: int
    state: ImportPreviewState | None
    issues: list[ImportPreviewRowIssue]


@dataclass(slots=True)
class StagedImport(Generic[TImportRow]):
    token: str
    rows: list[StagedRow[TImportRow]]


@dataclass(slots=True, frozen=True)
class StagedRowView(Generic[TImportRow]):
    row_number: int
    data: TImportRow
    context: Mapping[str, Any]


@dataclass(slots=True, frozen=True)
class StagedImportView(Generic[TImportRow]):
    token: str
    rows: tuple[StagedRowView[TImportRow], ...]


@dataclass(slots=True, frozen=True)
class ImportCommitSummary:
    created: int
    updated: int


@dataclass(slots=True, frozen=True)
class ImportCommitResult(Generic[TImportRow]):
    token: str
    rows: tuple[StagedRowView[TImportRow], ...]
    summary: ImportCommitSummary
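# A minimal sketch (illustrative only, not used by the service) of the header
# row an upload is expected to carry. _load_dataframe lower-cases and strips
# incoming column names, so these tuples double as the canonical CSV headers:
#
#     >>> ",".join(PROJECT_COLUMNS)
#     'name,location,operation_type,description,created_at,updated_at'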
UnitOfWorkFactory = Callable[[], UnitOfWork]


class ImportIngestionService:
    """Coordinates parsing, validation, and preview staging for imports."""

    def __init__(self, uow_factory: UnitOfWorkFactory) -> None:
        self._uow_factory = uow_factory
        self._project_stage: dict[str, StagedImport[ProjectImportRow]] = {}
        self._scenario_stage: dict[str, StagedImport[ScenarioImportRow]] = {}

    def preview_projects(
        self,
        stream: BinaryIO,
        filename: str,
    ) -> ImportPreview[ProjectImportRow]:
        start = time.perf_counter()
        result = load_project_imports(stream, filename)
        status = "success" if not result.errors else "partial"
        self._record_audit_log(
            action="preview",
            dataset="projects",
            status=status,
            filename=filename,
            row_count=len(result.rows),
            detail=f"accepted={len(result.rows)} parser_errors={len(result.errors)}",
        )
        observe_import(
            action="preview",
            dataset="projects",
            status=status,
            seconds=time.perf_counter() - start,
        )
        logger.info(
            "import.preview",
            extra={
                "event": "import.preview",
                "dataset": "projects",
                "status": status,
                "filename": filename,
                "row_count": len(result.rows),
                "error_count": len(result.errors),
            },
        )

        parser_errors = result.errors
        preview_rows: list[ImportPreviewRow[ProjectImportRow]] = []
        staged_rows: list[StagedRow[ProjectImportRow]] = []
        accepted = skipped = errored = 0
        seen_names: set[str] = set()
        existing_by_name: dict[str, Project] = {}

        if result.rows:
            with self._uow_factory() as uow:
                if not uow.projects:
                    raise RuntimeError("Project repository is unavailable")
                existing_by_name = dict(
                    uow.projects.find_by_names(
                        parsed.data.name for parsed in result.rows
                    )
                )

        for parsed in result.rows:
            name_key = _normalise_key(parsed.data.name)
            issues: list[str] = []
            context: dict[str, Any] | None = None
            state = ImportPreviewState.NEW
            if name_key in seen_names:
                state = ImportPreviewState.SKIP
                issues.append("Duplicate project name within upload; row skipped.")
            else:
                seen_names.add(name_key)
                existing = existing_by_name.get(name_key)
                if existing:
                    state = ImportPreviewState.UPDATE
                    context = {
                        "mode": "update",
                        "project_id": existing.id,
                    }
                    issues.append("Existing project will be updated.")
                else:
                    context = {"mode": "create"}
            preview_rows.append(
                ImportPreviewRow(
                    row_number=parsed.row_number,
                    data=parsed.data,
                    state=state,
                    issues=issues,
                    context=context,
                )
            )
            if state in {ImportPreviewState.NEW, ImportPreviewState.UPDATE}:
                accepted += 1
                staged_rows.append(
                    StagedRow(parsed=parsed, context=context or {"mode": "create"})
                )
            elif state == ImportPreviewState.SKIP:
                skipped += 1
            else:
                errored += 1

        parser_error_rows = {error.row_number for error in parser_errors}
        errored += len(parser_error_rows)
        total_rows = len(preview_rows) + len(parser_error_rows)
        summary = ImportPreviewSummary(
            total_rows=total_rows,
            accepted=accepted,
            skipped=skipped,
            errored=errored,
        )
        row_issues = _compile_row_issues(preview_rows, parser_errors)
        stage_token: str | None = None
        if staged_rows:
            stage_token = self._store_project_stage(staged_rows)
        return ImportPreview(
            rows=preview_rows,
            summary=summary,
            row_issues=row_issues,
            parser_errors=parser_errors,
            stage_token=stage_token,
        )
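    # Preview states, as assigned above (mirrored for scenarios below):
    #   NEW    -> name unseen in this upload and absent from the repository
    #   UPDATE -> name matches an existing record; row staged as an update
    #   SKIP   -> duplicate key within the same upload
    #   ERROR  -> row failed a referential check (used by the scenario
    #             preview; parser errors are tallied separately)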
    def preview_scenarios(
        self,
        stream: BinaryIO,
        filename: str,
    ) -> ImportPreview[ScenarioImportRow]:
        start = time.perf_counter()
        result = load_scenario_imports(stream, filename)
        status = "success" if not result.errors else "partial"
        self._record_audit_log(
            action="preview",
            dataset="scenarios",
            status=status,
            filename=filename,
            row_count=len(result.rows),
            detail=f"accepted={len(result.rows)} parser_errors={len(result.errors)}",
        )
        observe_import(
            action="preview",
            dataset="scenarios",
            status=status,
            seconds=time.perf_counter() - start,
        )
        logger.info(
            "import.preview",
            extra={
                "event": "import.preview",
                "dataset": "scenarios",
                "status": status,
                "filename": filename,
                "row_count": len(result.rows),
                "error_count": len(result.errors),
            },
        )

        parser_errors = result.errors
        preview_rows: list[ImportPreviewRow[ScenarioImportRow]] = []
        staged_rows: list[StagedRow[ScenarioImportRow]] = []
        accepted = skipped = errored = 0
        seen_pairs: set[tuple[str, str]] = set()
        existing_projects: dict[str, Project] = {}
        existing_scenarios: dict[tuple[int, str], Scenario] = {}

        if result.rows:
            with self._uow_factory() as uow:
                if not uow.projects or not uow.scenarios:
                    raise RuntimeError("Repositories are unavailable")
                existing_projects = dict(
                    uow.projects.find_by_names(
                        parsed.data.project_name for parsed in result.rows
                    )
                )
                names_by_project: dict[int, set[str]] = {}
                for parsed in result.rows:
                    project = existing_projects.get(
                        _normalise_key(parsed.data.project_name)
                    )
                    if not project:
                        continue
                    names_by_project.setdefault(project.id, set()).add(
                        _normalise_key(parsed.data.name)
                    )
                for project_id, names in names_by_project.items():
                    matches = uow.scenarios.find_by_project_and_names(
                        project_id, names
                    )
                    for name_key, scenario in matches.items():
                        existing_scenarios[(project_id, name_key)] = scenario

        for parsed in result.rows:
            project_key = _normalise_key(parsed.data.project_name)
            scenario_key = _normalise_key(parsed.data.name)
            issues: list[str] = []
            context: dict[str, Any] | None = None
            state = ImportPreviewState.NEW
            if (project_key, scenario_key) in seen_pairs:
                state = ImportPreviewState.SKIP
                issues.append(
                    "Duplicate scenario for project within upload; row skipped."
                )
            else:
                seen_pairs.add((project_key, scenario_key))
                project = existing_projects.get(project_key)
                if not project:
                    state = ImportPreviewState.ERROR
                    issues.append(
                        f"Project '{parsed.data.project_name}' does not exist."
                    )
                else:
                    context = {"mode": "create", "project_id": project.id}
                    existing = existing_scenarios.get((project.id, scenario_key))
                    if existing:
                        state = ImportPreviewState.UPDATE
                        context = {
                            "mode": "update",
                            "project_id": project.id,
                            "scenario_id": existing.id,
                        }
                        issues.append("Existing scenario will be updated.")
            preview_rows.append(
                ImportPreviewRow(
                    row_number=parsed.row_number,
                    data=parsed.data,
                    state=state,
                    issues=issues,
                    context=context,
                )
            )
            if state in {ImportPreviewState.NEW, ImportPreviewState.UPDATE}:
                accepted += 1
                staged_rows.append(
                    StagedRow(parsed=parsed, context=context or {"mode": "create"})
                )
            elif state == ImportPreviewState.SKIP:
                skipped += 1
            else:
                errored += 1

        parser_error_rows = {error.row_number for error in parser_errors}
        errored += len(parser_error_rows)
        total_rows = len(preview_rows) + len(parser_error_rows)
        summary = ImportPreviewSummary(
            total_rows=total_rows,
            accepted=accepted,
            skipped=skipped,
            errored=errored,
        )
        row_issues = _compile_row_issues(preview_rows, parser_errors)
        stage_token: str | None = None
        if staged_rows:
            stage_token = self._store_scenario_stage(staged_rows)
        return ImportPreview(
            rows=preview_rows,
            summary=summary,
            row_issues=row_issues,
            parser_errors=parser_errors,
            stage_token=stage_token,
        )

    def get_staged_projects(
        self, token: str
    ) -> StagedImportView[ProjectImportRow] | None:
        staged = self._project_stage.get(token)
        if not staged:
            return None
        return _build_staged_view(staged)

    def get_staged_scenarios(
        self, token: str
    ) -> StagedImportView[ScenarioImportRow] | None:
        staged = self._scenario_stage.get(token)
        if not staged:
            return None
        return _build_staged_view(staged)

    def consume_staged_projects(
        self, token: str
    ) -> StagedImportView[ProjectImportRow] | None:
        staged = self._project_stage.pop(token, None)
        if not staged:
            return None
        return _build_staged_view(staged)

    def consume_staged_scenarios(
        self, token: str
    ) -> StagedImportView[ScenarioImportRow] | None:
        staged = self._scenario_stage.pop(token, None)
        if not staged:
            return None
        return _build_staged_view(staged)

    def clear_staged_projects(self, token: str) -> bool:
        return self._project_stage.pop(token, None) is not None

    def clear_staged_scenarios(self, token: str) -> bool:
        return self._scenario_stage.pop(token, None) is not None

    def commit_project_import(self, token: str) -> ImportCommitResult[ProjectImportRow]:
        staged = self._project_stage.get(token)
        if not staged:
            raise ValueError(f"Unknown project import token: {token}")
        staged_view = _build_staged_view(staged)
        created = updated = 0
        start = time.perf_counter()
        try:
            with self._uow_factory() as uow:
                if not uow.projects:
                    raise RuntimeError("Project repository is unavailable")
                for row in staged.rows:
                    mode = row.context.get("mode")
                    data = row.parsed.data
                    if mode == "create":
                        project = Project(
                            name=data.name,
                            location=data.location,
                            operation_type=data.operation_type,
                            description=data.description,
                        )
                        if data.created_at:
                            project.created_at = data.created_at
                        if data.updated_at:
                            project.updated_at = data.updated_at
                        uow.projects.create(project)
                        created += 1
                    elif mode == "update":
                        project_id = row.context.get("project_id")
                        if not project_id:
                            raise ValueError(
                                "Staged project update is missing project_id context"
                            )
                        project = uow.projects.get(project_id)
                        project.name = data.name
                        project.location = data.location
                        project.operation_type = data.operation_type
                        project.description = data.description
                        if data.created_at:
                            project.created_at = data.created_at
                        if data.updated_at:
                            project.updated_at = data.updated_at
                        updated += 1
                    else:
                        raise ValueError(
                            f"Unsupported staged project mode: {mode!r}"
                        )
        except Exception as exc:
            self._record_audit_log(
                action="commit",
                dataset="projects",
                status="failure",
                filename=None,
                row_count=len(staged.rows),
                detail=f"error={type(exc).__name__}: {exc}",
            )
            observe_import(
                action="commit",
                dataset="projects",
                status="failure",
                seconds=time.perf_counter() - start,
            )
            logger.exception(
                "import.commit.failed",
                extra={
                    "event": "import.commit",
                    "dataset": "projects",
                    "status": "failure",
                    "row_count": len(staged.rows),
                    "token": token,
                },
            )
            raise
        else:
            self._record_audit_log(
                action="commit",
                dataset="projects",
                status="success",
                filename=None,
                row_count=len(staged.rows),
                detail=f"created={created} updated={updated}",
            )
            observe_import(
                action="commit",
                dataset="projects",
                status="success",
                seconds=time.perf_counter() - start,
            )
            logger.info(
                "import.commit",
                extra={
                    "event": "import.commit",
                    "dataset": "projects",
                    "status": "success",
                    "row_count": len(staged.rows),
                    "created": created,
                    "updated": updated,
                    "token": token,
                },
            )
        self._project_stage.pop(token, None)
        return ImportCommitResult(
            token=token,
            rows=staged_view.rows,
            summary=ImportCommitSummary(created=created, updated=updated),
        )

    def commit_scenario_import(self, token: str) -> ImportCommitResult[ScenarioImportRow]:
        staged = self._scenario_stage.get(token)
        if not staged:
            raise ValueError(f"Unknown scenario import token: {token}")
        staged_view = _build_staged_view(staged)
        created = updated = 0
        start = time.perf_counter()
        try:
            with self._uow_factory() as uow:
                if not uow.scenarios or not uow.projects:
                    raise RuntimeError("Scenario repositories are unavailable")
                for row in staged.rows:
                    mode = row.context.get("mode")
                    data = row.parsed.data
                    project_id = row.context.get("project_id")
                    if not project_id:
                        raise ValueError(
                            "Staged scenario row is missing project_id context"
                        )
                    project = uow.projects.get(project_id)
                    if mode == "create":
                        scenario = Scenario(
                            project_id=project.id,
                            name=data.name,
                            status=data.status,
                            start_date=data.start_date,
                            end_date=data.end_date,
                            discount_rate=data.discount_rate,
                            currency=data.currency,
                            primary_resource=data.primary_resource,
                            description=data.description,
                        )
                        if data.created_at:
                            scenario.created_at = data.created_at
                        if data.updated_at:
                            scenario.updated_at = data.updated_at
                        uow.scenarios.create(scenario)
                        created += 1
                    elif mode == "update":
                        scenario_id = row.context.get("scenario_id")
                        if not scenario_id:
                            raise ValueError(
                                "Staged scenario update is missing scenario_id context"
                            )
                        scenario = uow.scenarios.get(scenario_id)
                        scenario.project_id = project.id
                        scenario.name = data.name
                        scenario.status = data.status
                        scenario.start_date = data.start_date
                        scenario.end_date = data.end_date
                        scenario.discount_rate = data.discount_rate
                        scenario.currency = data.currency
                        scenario.primary_resource = data.primary_resource
                        scenario.description = data.description
                        if data.created_at:
                            scenario.created_at = data.created_at
                        if data.updated_at:
                            scenario.updated_at = data.updated_at
                        updated += 1
                    else:
                        raise ValueError(
                            f"Unsupported staged scenario mode: {mode!r}"
                        )
        except Exception as exc:
            self._record_audit_log(
                action="commit",
                dataset="scenarios",
                status="failure",
                filename=None,
                row_count=len(staged.rows),
                detail=f"error={type(exc).__name__}: {exc}",
            )
            observe_import(
                action="commit",
                dataset="scenarios",
                status="failure",
                seconds=time.perf_counter() - start,
            )
            logger.exception(
                "import.commit.failed",
                extra={
                    "event": "import.commit",
                    "dataset": "scenarios",
                    "status": "failure",
                    "row_count": len(staged.rows),
                    "token": token,
                },
            )
            raise
        else:
action="commit", dataset="scenarios", status="success", filename=None, row_count=len(staged.rows), detail=f"created={created} updated={updated}", ) observe_import( action="commit", dataset="scenarios", status="success", seconds=time.perf_counter() - start, ) logger.info( "import.commit", extra={ "event": "import.commit", "dataset": "scenarios", "status": "success", "row_count": len(staged.rows), "created": created, "updated": updated, "token": token, }, ) self._scenario_stage.pop(token, None) return ImportCommitResult( token=token, rows=staged_view.rows, summary=ImportCommitSummary(created=created, updated=updated), ) def _record_audit_log( self, *, action: str, dataset: str, status: str, row_count: int, detail: Optional[str], filename: Optional[str], ) -> None: try: with self._uow_factory() as uow: if uow.session is None: return log = ImportExportLog( action=action, dataset=dataset, status=status, filename=filename, row_count=row_count, detail=detail, ) uow.session.add(log) uow.commit() except Exception: # Audit logging must not break core workflows pass def _store_project_stage( self, rows: list[StagedRow[ProjectImportRow]] ) -> str: token = str(uuid4()) self._project_stage[token] = StagedImport(token=token, rows=rows) return token def _store_scenario_stage( self, rows: list[StagedRow[ScenarioImportRow]] ) -> str: token = str(uuid4()) self._scenario_stage[token] = StagedImport(token=token, rows=rows) return token def load_project_imports(stream: BinaryIO, filename: str) -> ImportResult[ProjectImportRow]: df = _load_dataframe(stream, filename) return _parse_dataframe(df, ProjectImportRow, PROJECT_COLUMNS) def load_scenario_imports(stream: BinaryIO, filename: str) -> ImportResult[ScenarioImportRow]: df = _load_dataframe(stream, filename) return _parse_dataframe(df, ScenarioImportRow, SCENARIO_COLUMNS) def _load_dataframe(stream: BinaryIO, filename: str) -> DataFrame: stream.seek(0) suffix = Path(filename).suffix.lower() if suffix == ".csv": df = pd.read_csv(stream, dtype=str, keep_default_na=False, encoding="utf-8") elif suffix in {".xls", ".xlsx"}: df = pd.read_excel(stream, dtype=str, engine="openpyxl") else: raise UnsupportedImportFormat( f"Unsupported file type: {suffix or 'unknown'}") df.columns = [str(col).strip().lower() for col in df.columns] return df def _parse_dataframe( df: DataFrame, model: type[TImportRow], expected_columns: Iterable[str], ) -> ImportResult[TImportRow]: rows: list[ParsedImportRow[TImportRow]] = [] errors: list[ImportRowError] = [] for index, raw in enumerate(df.to_dict(orient="records"), start=2): payload = _prepare_payload( cast(dict[str, object], raw), expected_columns) try: rows.append( ParsedImportRow(row_number=index, data=model(**payload)) ) except ValidationError as exc: # pragma: no cover - exercised via tests for detail in exc.errors(): loc = ".".join(str(part) for part in detail.get("loc", [])) or None errors.append( ImportRowError( row_number=index, field=loc, message=detail.get("msg", "Invalid value"), ) ) return ImportResult(rows=rows, errors=errors) def _prepare_payload( raw: dict[str, object], expected_columns: Iterable[str] ) -> dict[str, object | None]: payload: dict[str, object | None] = {} for column in expected_columns: if column not in raw: continue value = raw.get(column) if isinstance(value, str): value = value.strip() if value == "": value = None if value is not None and pd.isna(cast(Any, value)): value = None payload[column] = value return payload def _normalise_key(value: str) -> str: return value.strip().lower() def 
def _build_staged_view(
    staged: StagedImport[TImportRow],
) -> StagedImportView[TImportRow]:
    rows = tuple(
        StagedRowView(
            row_number=row.parsed.row_number,
            data=cast(TImportRow, _deep_copy_model(row.parsed.data)),
            context=MappingProxyType(dict(row.context)),
        )
        for row in staged.rows
    )
    return StagedImportView(token=staged.token, rows=rows)


def _deep_copy_model(model: BaseModel) -> BaseModel:
    copy_method = getattr(model, "model_copy", None)
    if callable(copy_method):  # pydantic v2
        return cast(BaseModel, copy_method(deep=True))
    return model.copy(deep=True)  # type: ignore[attr-defined]


def _compile_row_issues(
    preview_rows: Iterable[ImportPreviewRow[Any]],
    parser_errors: Iterable[ImportRowError],
) -> list[ImportPreviewRowIssues]:
    issue_map: dict[int, ImportPreviewRowIssues] = {}

    def ensure_bundle(
        row_number: int,
        state: ImportPreviewState | None,
    ) -> ImportPreviewRowIssues:
        bundle = issue_map.get(row_number)
        if bundle is None:
            bundle = ImportPreviewRowIssues(
                row_number=row_number,
                state=state,
                issues=[],
            )
            issue_map[row_number] = bundle
        else:
            if _state_priority(state) > _state_priority(bundle.state):
                bundle.state = state
        return bundle

    for row in preview_rows:
        if not row.issues:
            continue
        bundle = ensure_bundle(row.row_number, row.state)
        for message in row.issues:
            bundle.issues.append(ImportPreviewRowIssue(message=message))

    for error in parser_errors:
        bundle = ensure_bundle(error.row_number, ImportPreviewState.ERROR)
        bundle.issues.append(
            ImportPreviewRowIssue(message=error.message, field=error.field)
        )

    return sorted(issue_map.values(), key=lambda item: item.row_number)


def _state_priority(state: ImportPreviewState | None) -> int:
    # ERROR outranks SKIP, which outranks UPDATE, which outranks NEW; None sorts last.
    if state is None:
        return -1
    if state == ImportPreviewState.ERROR:
        return 3
    if state == ImportPreviewState.SKIP:
        return 2
    if state == ImportPreviewState.UPDATE:
        return 1
    return 0
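# A minimal end-to-end sketch of the preview -> commit flow. `make_uow` is a
# hypothetical UnitOfWork factory standing in for whatever the application
# wires up; it is not defined in this module.
#
#     import io
#
#     service = ImportIngestionService(make_uow)
#     upload = io.BytesIO(
#         b"name,location,operation_type,description,created_at,updated_at\n"
#         b"Mine A,NSW,open_pit,Example project,,\n"
#     )
#     preview = service.preview_projects(upload, "projects.csv")
#     if preview.stage_token and not preview.summary.errored:
#         result = service.commit_project_import(preview.stage_token)
#         # result.summary -> ImportCommitSummary(created=1, updated=0)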