From f612c8533a08295119789493b2be2d2c883b0697 Mon Sep 17 00:00:00 2001
From: zwitschi <zwitschi82@gmail.com>
Date: Tue, 2 Jun 2026 08:44:10 +0200
Subject: [PATCH] feat: Add backtesting parameter sweep support and related
 functionality

---
 CHANGELOG.md                          |   3 +
 README.md                             |  22 +-
 scripts/backtest_sweep.py             | 151 ++++++++++
 src/arbitrade/backtesting/__init__.py |  18 ++
 src/arbitrade/backtesting/sweep.py    | 396 ++++++++++++++++++++++++++
 tests/unit/test_backtesting_sweep.py  | 102 +++++++
 6 files changed, 685 insertions(+), 7 deletions(-)
 create mode 100644 scripts/backtest_sweep.py
 create mode 100644 src/arbitrade/backtesting/sweep.py
 create mode 100644 tests/unit/test_backtesting_sweep.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index be09aa9..4298045 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,9 @@
 - Added synthetic latency profiler scenarios and CLI scripts for baseline generation and regression checks.
 - Added latency baseline/threshold artifacts and CI latency guardrail enforcement.
 - Added deterministic replay backtesting engine, CLI script, and unit coverage for JSONL event replay.
+- Added backtesting parameter sweep support (`scripts/backtest_sweep.py`) for theta, trade-capital, pair-universe, and staleness-threshold grid search.
+- Added persisted sweep artifacts with ranked in-sample/out-of-sample results and promotion-ready candidate reporting.
+- Added out-of-sample overfit guards via train/test time-window split and generalization-gap checks.
 - Added dashboard controls for tradable pair universe selection and strategy mode/parameter configuration.
 
 ### Changed
diff --git a/README.md b/README.md
index 2286834..9a31440 100644
--- a/README.md
+++ b/README.md
@@ -104,7 +104,7 @@ Minimum `.env` values:
 ```env
 APP_ENV=dev
 APP_HOST=0.0.0.0
-APP_PORT=8000
+APP_PORT=9090
 LOG_LEVEL=INFO
 LOG_JSON=true
 DUCKDB_PATH=./data/arbitrade.duckdb
@@ -132,8 +132,8 @@ python -m arbitrade.main
 
 Health endpoints:
 
-- HTML: `http://localhost:8000/`
-- JSON: `http://localhost:8000/health`
+- HTML: `http://localhost:9090/`
+- JSON: `http://localhost:9090/health`
 
 ## Database
 
@@ -283,12 +283,12 @@ Set these in Coolify application settings:
 - Build Command: leave empty.
 - Install Command: leave empty.
 - Start Command: leave empty unless you explicitly want to override the image default.
-- Port: `8000`
+- Port: `9090` (Coolify uses `8000` internally)
 
 ### 3) Configure health check and networking
 
 - Health Check Path: `/health`
-- Exposed Port: `8000`
+- Exposed Port: `9090`
 - Use Coolify-generated domain or attach your own domain.
 
 ### 4) Configure persistent storage
@@ -305,7 +305,7 @@ Add runtime environment variables in Coolify (UI: Environment Variables):
 
 - `APP_ENV=prod`
 - `APP_HOST=0.0.0.0`
-- `APP_PORT=8000`
+- `APP_PORT=9090`
 - `DUCKDB_PATH=/app/data/arbitrade.duckdb`
 - `LOG_LEVEL=INFO`
 - `LOG_JSON=true`
@@ -431,6 +431,12 @@ Run a deterministic replay backtest from a JSONL event stream:
 python scripts/backtest_replay.py --events path\to\replay.jsonl --starting-balances USD=1000.0
 ```
 
+Run a parameter sweep with a train/test split and promotion scoring:
+
+```powershell
+python scripts/backtest_sweep.py --events path\to\replay.jsonl --starting-balances USD=1000.0 --output ops/backtesting/parameter_sweep_results.json
+```
+
 Replay event format:
 
 ```json
@@ -447,7 +453,9 @@ Notes:
 - Events are replayed in timestamp order.
 - The replay engine reuses the production detector, pre-trade validation, trade limits, and execution sequencer.
 - The simulated execution path applies configurable slippage and execution latency so reports include deterministic trade/miss statistics.
-  Latency baseline and threshold artifacts:
+- Parameter sweep splits replay data into in-sample and out-of-sample windows, ranks configurations by out-of-sample score, and flags overfit via train/test generalization-gap checks.
+- Sweep output persists the ranked combinations, each flagged with its promotion readiness and any failed checks, to support paper-trading canary promotion decisions.
+- Latency baseline and threshold artifacts:
 
 - `ops/performance/latency_baseline.json`
 - `ops/performance/latency_thresholds.json`
diff --git a/scripts/backtest_sweep.py b/scripts/backtest_sweep.py
new file mode 100644
index 0000000..e7376a8
--- /dev/null
+++ b/scripts/backtest_sweep.py
@@ -0,0 +1,151 @@
+from __future__ import annotations
+
+import argparse
+from collections.abc import Mapping, Sequence
+from pathlib import Path
+
+from arbitrade.backtesting import load_replay_events
+from arbitrade.backtesting.sweep import (
+    PromotionCriteria,
+    SweepResult,
+    build_parameter_grid,
+    persist_sweep_results,
+    run_parameter_search,
+)
+from arbitrade.detection.graph import CurrencyGraph, TriangularCycle
+
+
+def _parse_balances(raw: str) -> Mapping[str, float]:
+    """Parse comma-separated ``ASSET=amount`` entries; raise ValueError on malformed ones."""
+    balances: dict[str, float] = {}
+    for entry in filter(None, map(str.strip, raw.split(","))):
+        asset, separator, amount = entry.partition("=")
+        if not separator or not asset.strip():
+            raise ValueError(f"invalid balance entry {entry!r}; expected ASSET=amount")
+        balances[asset.strip().upper()] = float(amount)
+    return balances
+
+
+def _parse_float_list(raw: str) -> list[float]:
+    """Parse a comma-separated string into floats, ignoring blank items."""
+    if not (tokens := [tok for tok in map(str.strip, raw.split(",")) if tok]):
+        raise ValueError("expected at least one numeric value")
+    return list(map(float, tokens))
+
+
+def _parse_pair_universes(raw: str) -> list[tuple[str, ...]]:
+    """Parse ``A|B;C|D`` into one upper-cased symbol tuple per non-empty universe."""
+    candidates = (
+        tuple(name.strip().upper() for name in group.split("|") if name.strip())
+        for group in raw.split(";")
+    )
+    parsed = [universe for universe in candidates if universe]
+    if not parsed:
+        raise ValueError("at least one pair universe must be provided")
+    return parsed
+
+
+def _build_graph_from_symbols(symbols: Sequence[str]) -> dict[str, list[TriangularCycle]]:
+    """Build a CurrencyGraph from ``BASE/QUOTE`` symbols and return its cycle index."""
+    graph = CurrencyGraph()
+    for pair_name in map(str.upper, symbols):
+        # A symbol without "/" cannot be split into two assets, so it is ignored.
+        base, slash, quote = pair_name.partition("/")
+        if slash:
+            graph.add_pair(base, quote, pair_name)
+
+    # Cycles are enumerated once and handed straight to the graph's indexer.
+    return graph.index_cycles_by_pair(graph.triangular_cycles())
+
+
+def _print_top_results(results: Sequence[SweepResult], *, limit: int = 5) -> None:
+    """Print a header and one summary line for each of the first ``limit`` results."""
+    print(f"Top {min(limit, len(results))} result(s) by out-of-sample score:")
+    for rank, entry in enumerate(results[:limit], start=1):
+        fields = (
+            f"theta={entry.parameters.min_profit_threshold:.6f}",
+            f"capital={entry.parameters.trade_capital:.2f}",
+            f"pairs={','.join(entry.parameters.pair_universe)}",
+            f"staleness={entry.parameters.staleness_threshold_seconds:.2f}s",
+            f"test_score={entry.test_score:.4f}",
+            f"promotion_ready={entry.promotion_ready}",
+        )
+        print(f"- #{rank} " + ", ".join(fields))
+
+
+def main() -> int:
+    """Run the sweep CLI end to end and return exit code 0; exits early if no cycles exist."""
+    parser = argparse.ArgumentParser(
+        description="Run backtesting parameter sweep with train/test split.")
+    parser.add_argument("--events", type=Path, required=True)
+    parser.add_argument("--starting-balances", type=str, default="USD=1000.0")
+    parser.add_argument("--theta-values", type=str,
+                        default="0.0003,0.0005,0.0008")
+    parser.add_argument("--trade-capital-values",
+                        type=str, default="50,100,150")
+    parser.add_argument(
+        "--pair-universes",
+        type=str,
+        default="BTC/USD|ETH/BTC|ETH/USD",
+        help="Semicolon-separated universes, each with | delimited pairs",
+    )
+    parser.add_argument("--staleness-threshold-values",
+                        type=str, default="3,5,8")
+    parser.add_argument("--train-ratio", type=float, default=0.7)
+    parser.add_argument("--output", type=Path,
+                        default=Path("ops/backtesting/parameter_sweep_results.json"))
+    # Promotion gates; these defaults mirror the PromotionCriteria field defaults.
+    parser.add_argument("--min-test-realized-pnl-usd", type=float, default=0.0)
+    parser.add_argument("--min-test-win-rate", type=float, default=0.5)
+    parser.add_argument("--min-test-fill-rate", type=float, default=0.9)
+    parser.add_argument("--max-test-drawdown-usd", type=float, default=25.0)
+    parser.add_argument("--max-generalization-gap-ratio",
+                        type=float, default=0.5)
+
+    args = parser.parse_args()
+    events = load_replay_events(args.events)
+    symbols = sorted({event.symbol.upper() for event in events})
+    cycles_by_pair = _build_graph_from_symbols(symbols)
+    if not cycles_by_pair:
+        raise SystemExit(
+            "No triangular cycles found in supplied replay events")
+
+    grid = build_parameter_grid(
+        theta_values=_parse_float_list(args.theta_values),
+        trade_capital_values=_parse_float_list(args.trade_capital_values),
+        pair_universes=_parse_pair_universes(args.pair_universes),
+        staleness_threshold_values=_parse_float_list(
+            args.staleness_threshold_values),
+    )
+
+    artifacts = run_parameter_search(
+        events=events,
+        cycles_by_pair=cycles_by_pair,
+        parameter_grid=grid,
+        starting_balances=_parse_balances(args.starting_balances),
+        train_ratio=args.train_ratio,
+        promotion_criteria=PromotionCriteria(
+            min_test_realized_pnl_usd=args.min_test_realized_pnl_usd,
+            min_test_win_rate=args.min_test_win_rate,
+            min_test_fill_rate=args.min_test_fill_rate,
+            max_test_drawdown_usd=args.max_test_drawdown_usd,
+            max_generalization_gap_ratio=args.max_generalization_gap_ratio,
+        ),
+    )
+
+    persist_sweep_results(args.output, artifacts)
+
+    print(f"Completed sweep combinations: {len(artifacts.results)}")
+    print(f"Promotion-ready combinations: {len(artifacts.promoted)}")
+    print(f"Results written: {args.output}")
+    # Both tuples arrive sorted by descending test score, so the first rows rank highest.
+    _print_top_results(artifacts.results)
+    if artifacts.promoted:
+        print("Promotion candidates (paper-trading canary):")
+        _print_top_results(artifacts.promoted)
+
+    return 0
+
+
+if __name__ == "__main__":  # script entry point
+    raise SystemExit(main())  # main()'s return value becomes the process exit code
diff --git a/src/arbitrade/backtesting/__init__.py b/src/arbitrade/backtesting/__init__.py
index a657f6c..e6cbc60 100644
--- a/src/arbitrade/backtesting/__init__.py
+++ b/src/arbitrade/backtesting/__init__.py
@@ -6,6 +6,16 @@ from arbitrade.backtesting.replay import (
     ReplayClock,
     load_replay_events,
 )
+from arbitrade.backtesting.sweep import (
+    PromotionCriteria,
+    SweepArtifacts,
+    SweepParameters,
+    SweepResult,
+    build_parameter_grid,
+    persist_sweep_results,
+    run_parameter_search,
+    split_events_time_windows,
+)
 
 __all__ = [
     "ReplayClock",
@@ -14,4 +24,12 @@ __all__ = [
     "BacktestReport",
     "BacktestReplayEngine",
     "load_replay_events",
+    "SweepParameters",
+    "SweepResult",
+    "SweepArtifacts",
+    "PromotionCriteria",
+    "split_events_time_windows",
+    "build_parameter_grid",
+    "run_parameter_search",
+    "persist_sweep_results",
 ]
diff --git a/src/arbitrade/backtesting/sweep.py b/src/arbitrade/backtesting/sweep.py
new file mode 100644
index 0000000..67c44a7
--- /dev/null
+++ b/src/arbitrade/backtesting/sweep.py
@@ -0,0 +1,396 @@
+from __future__ import annotations
+
+import asyncio
+from collections.abc import Mapping, Sequence
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from pathlib import Path
+
+import orjson
+
+from arbitrade.backtesting.replay import (
+    BacktestConfig,
+    BacktestReplayEngine,
+    BacktestReport,
+    ReplayBookEvent,
+)
+from arbitrade.detection.graph import TriangularCycle
+
+
+@dataclass(frozen=True, slots=True)
+class SweepParameters:  # One point of the parameter grid evaluated by the sweep.
+    min_profit_threshold: float  # forwarded to BacktestConfig.min_profit_threshold ("theta")
+    trade_capital: float  # forwarded to BacktestConfig.trade_capital
+    pair_universe: tuple[str, ...]  # symbols whose events and cycles are kept
+    staleness_threshold_seconds: float  # max gap between kept same-symbol events; must be > 0
+
+
+@dataclass(frozen=True, slots=True)
+class PromotionCriteria:  # Gates a sweep result must pass to count as promotion-ready.
+    min_test_realized_pnl_usd: float = 0.0  # fail if test realized PnL is below this
+    min_test_win_rate: float = 0.5  # fail if test win rate (None counts as 0.0) is below this
+    min_test_fill_rate: float = 0.9  # fail if test fill rate (None counts as 0.0) is below this
+    max_test_drawdown_usd: float = 25.0  # fail if test max drawdown exceeds this
+    max_generalization_gap_ratio: float = 0.5  # fail if the generalization gap ratio exceeds this
+
+
+@dataclass(frozen=True, slots=True)
+class SweepResult:  # Outcome of one parameter set on the train and test windows.
+    parameters: SweepParameters  # the grid point that produced this result
+    train_report: BacktestReport  # backtest over the filtered train events
+    test_report: BacktestReport  # backtest over the filtered test events
+    train_score: float  # _score_report(train_report)
+    test_score: float  # _score_report(test_report); results are ranked by this
+    generalization_gap_ratio: float  # train-to-test score drop as a ratio of |train score|
+    overfit_detected: bool  # gap ratio exceeds criteria.max_generalization_gap_ratio
+    promotion_ready: bool  # True when no promotion check failed
+    promotion_reasons: tuple[str, ...]  # names of the promotion checks that failed
+    train_event_count: int  # number of train events left after filtering
+    test_event_count: int  # number of test events left after filtering
+
+
+@dataclass(frozen=True, slots=True)
+class SweepArtifacts:  # Everything run_parameter_search returns.
+    results: tuple[SweepResult, ...]  # all evaluated results, highest test score first
+    promoted: tuple[SweepResult, ...]  # the promotion-ready results, highest test score first
+    train_window: tuple[datetime, datetime] | None  # (first, last) train event timestamps
+    test_window: tuple[datetime, datetime] | None  # (first, last) test event timestamps
+
+
+def split_events_time_windows(
+    events: Sequence[ReplayBookEvent], *, train_ratio: float
+) -> tuple[list[ReplayBookEvent], list[ReplayBookEvent]]:
+    """Split events chronologically into non-empty (train, test) lists; else ValueError."""
+    if not 0.0 < train_ratio < 1.0:
+        raise ValueError("train_ratio must be between 0 and 1")
+    if len(events) < 2:
+        raise ValueError("at least two events are required for time split")
+    # Stable sort: a no-op for ordered input; stops later events leaking into train.
+    ordered = sorted(events, key=lambda event: event.occurred_at)
+    cut = max(1, min(len(ordered) - 1, int(len(ordered) * train_ratio)))
+    return ordered[:cut], ordered[cut:]
+
+
+def build_parameter_grid(
+    *,
+    theta_values: Sequence[float],
+    trade_capital_values: Sequence[float],
+    pair_universes: Sequence[Sequence[str]],
+    staleness_threshold_values: Sequence[float],
+) -> list[SweepParameters]:
+    """Expand the four parameter axes into their full cartesian product.
+
+    Pair universes are upper-cased, de-duplicated and sorted. Raises ValueError
+    when any axis is empty.
+    """
+    for axis_name, axis_values in (
+        ("theta_values", theta_values),
+        ("trade_capital_values", trade_capital_values),
+        ("pair_universes", pair_universes),
+        ("staleness_threshold_values", staleness_threshold_values),
+    ):
+        if not axis_values:
+            raise ValueError(f"{axis_name} must not be empty")
+    universes = [tuple(sorted({pair.upper() for pair in group})) for group in pair_universes]
+    return [
+        SweepParameters(
+            min_profit_threshold=float(theta),
+            trade_capital=float(capital),
+            pair_universe=universe,
+            staleness_threshold_seconds=float(staleness),
+        )
+        for theta in theta_values
+        for capital in trade_capital_values
+        for universe in universes
+        for staleness in staleness_threshold_values
+    ]
+
+
+def _filter_events_for_parameters(
+    events: Sequence[ReplayBookEvent],
+    *,
+    pair_universe: set[str],
+    staleness_threshold_seconds: float,
+) -> list[ReplayBookEvent]:
+    """Keep in-universe events that arrive within the staleness threshold.
+
+    An event is kept when it is the first seen for its (upper-cased) symbol or
+    when at most ``staleness_threshold_seconds`` elapsed since the previous
+    event of that symbol, whether or not that one was kept. Raises ValueError
+    when the threshold is not positive.
+    """
+    if staleness_threshold_seconds <= 0.0:
+        raise ValueError("staleness_threshold_seconds must be > 0")
+
+    kept: list[ReplayBookEvent] = []
+    latest_by_symbol: dict[str, datetime] = {}
+    for event in events:
+        key = event.symbol.upper()
+        if key not in pair_universe:
+            continue
+        # Record this timestamp even when the event itself ends up dropped.
+        prior, latest_by_symbol[key] = latest_by_symbol.get(key), event.occurred_at
+        elapsed = None if prior is None else (event.occurred_at - prior).total_seconds()
+        if elapsed is None or elapsed <= staleness_threshold_seconds:
+            kept.append(event)
+    return kept
+
+
+def _restrict_cycles_by_pair(
+    cycles_by_pair: Mapping[str, list[TriangularCycle]],
+    *,
+    pair_universe: set[str],
+) -> dict[str, list[TriangularCycle]]:
+    """Keep only keys and cycles whose pairs all lie inside ``pair_universe``."""
+    def fully_inside(cycle: TriangularCycle) -> bool:
+        return all(leg.upper() in pair_universe for leg in cycle.pairs)
+
+    narrowed: dict[str, list[TriangularCycle]] = {}
+    for raw_symbol, candidates in cycles_by_pair.items():
+        symbol = raw_symbol.upper()
+        # A key survives only if at least one of its cycles is fully in-universe.
+        if symbol in pair_universe and (survivors := list(filter(fully_inside, candidates))):
+            narrowed[symbol] = survivors
+    return narrowed
+
+
+def _score_report(report: BacktestReport) -> float:
+    """Score = realized PnL + 100 * win rate + 50 * fill rate - max drawdown."""
+    win_bonus, fill_bonus = (report.win_rate or 0.0) * 100.0, (report.fill_rate or 0.0) * 50.0
+    return report.realized_pnl_usd + win_bonus + fill_bonus - report.max_drawdown_usd
+
+
+def _safe_ratio(numerator: float, denominator: float) -> float:
+    """Clamp numerator/denominator at 0; a non-positive denominator yields 0.0 or 1.0."""
+    fallback = 0.0 if numerator <= 0.0 else 1.0
+    return fallback if denominator <= 0.0 else max(0.0, numerator / denominator)
+
+
+def _evaluate_promotion(
+    *,
+    result: SweepResult,
+    criteria: PromotionCriteria,
+) -> tuple[bool, tuple[str, ...]]:
+    """Check a sweep result's test report and gap ratio against the promotion criteria.
+
+    Returns ``(ready, reasons)``: the names of all failed checks, and whether none failed.
+    """
+    test = result.test_report
+    win_rate, fill_rate = test.win_rate or 0.0, test.fill_rate or 0.0
+    checks = (
+        (test.realized_pnl_usd < criteria.min_test_realized_pnl_usd,
+         "test_realized_pnl_below_threshold"),
+        (win_rate < criteria.min_test_win_rate, "test_win_rate_below_threshold"),
+        (fill_rate < criteria.min_test_fill_rate, "test_fill_rate_below_threshold"),
+        (test.max_drawdown_usd > criteria.max_test_drawdown_usd, "test_drawdown_above_threshold"),
+        (result.generalization_gap_ratio > criteria.max_generalization_gap_ratio,
+         "generalization_gap_above_threshold"),
+    )
+    failures = tuple(reason for failed, reason in checks if failed)
+    return not failures, failures
+
+
+def _run_backtest(
+    *,
+    events: Sequence[ReplayBookEvent],
+    cycles_by_pair: Mapping[str, list[TriangularCycle]],
+    available_pairs: Sequence[str],
+    config: BacktestConfig,
+    starting_balances: Mapping[str, float],
+) -> BacktestReport:
+    """Replay ``events`` on a fresh engine via ``asyncio.run`` and return its report."""
+    # Without events there is no first timestamp, so started_at falls back to UTC now.
+    first_timestamp = events[0].occurred_at if events else datetime.now(UTC)
+    replay = BacktestReplayEngine(
+        cycles_by_pair=cycles_by_pair, available_pairs=available_pairs,
+        config=config, started_at=first_timestamp,
+    )
+    return asyncio.run(replay.run(events, starting_balances=starting_balances))
+
+
+def run_parameter_search(
+    *,
+    events: Sequence[ReplayBookEvent],
+    cycles_by_pair: Mapping[str, list[TriangularCycle]],
+    parameter_grid: Sequence[SweepParameters],
+    starting_balances: Mapping[str, float],
+    train_ratio: float,
+    promotion_criteria: PromotionCriteria | None = None,
+    max_concurrent_trades: int = 1,
+    max_depth_levels: int = 10,
+    quote_asset: str = "USD",
+) -> SweepArtifacts:
+    """Backtest every grid point on a train/test split and rank the outcomes.
+
+    The events are split once with ``split_events_time_windows``. For each
+    parameter set, both halves are filtered by pair universe and staleness
+    threshold, the cycle index is restricted to the universe, and one backtest
+    is run per half with the same configuration and starting balances.
+
+    A parameter set is skipped, and appears nowhere in the output, when either
+    filtered half is empty or when no cycle survives the restriction.
+
+    Args:
+        events: Replay events to split into train and test windows.
+        cycles_by_pair: Cycle index; restricted per parameter set to its universe.
+        parameter_grid: Parameter sets to evaluate.
+        starting_balances: Balances handed to every backtest run.
+        train_ratio: Fraction of the events assigned to the train window.
+        promotion_criteria: Promotion gates; ``PromotionCriteria()`` when omitted.
+        max_concurrent_trades, max_depth_levels, quote_asset: Forwarded to ``BacktestConfig``.
+
+    Returns:
+        ``SweepArtifacts`` with ``results`` and ``promoted`` sorted by descending
+        test score, plus the first and last event timestamp of each window.
+
+    Raises:
+        ValueError: If the split inputs or a staleness threshold are invalid.
+    """
+    # Imported here so this block carries its own dependency.
+    from dataclasses import replace
+
+    criteria = promotion_criteria or PromotionCriteria()
+    train_events, test_events = split_events_time_windows(
+        events, train_ratio=train_ratio)
+
+    results: list[SweepResult] = []
+    for parameters in parameter_grid:
+        allowed_pairs = set(parameters.pair_universe)
+        filtered_train = _filter_events_for_parameters(
+            train_events,
+            pair_universe=allowed_pairs,
+            staleness_threshold_seconds=parameters.staleness_threshold_seconds,
+        )
+        filtered_test = _filter_events_for_parameters(
+            test_events,
+            pair_universe=allowed_pairs,
+            staleness_threshold_seconds=parameters.staleness_threshold_seconds,
+        )
+        if not filtered_train or not filtered_test:
+            continue
+
+        restricted_cycles = _restrict_cycles_by_pair(cycles_by_pair, pair_universe=allowed_pairs)
+        if not restricted_cycles:
+            continue
+
+        config = BacktestConfig(
+            min_profit_threshold=parameters.min_profit_threshold,
+            trade_capital=parameters.trade_capital,
+            max_concurrent_trades=max_concurrent_trades,
+            max_depth_levels=max_depth_levels,
+            quote_asset=quote_asset,
+        )
+        train_report = _run_backtest(
+            events=filtered_train,
+            cycles_by_pair=restricted_cycles,
+            available_pairs=sorted(allowed_pairs),
+            config=config,
+            starting_balances=starting_balances,
+        )
+        test_report = _run_backtest(
+            events=filtered_test,
+            cycles_by_pair=restricted_cycles,
+            available_pairs=sorted(allowed_pairs),
+            config=config,
+            starting_balances=starting_balances,
+        )
+
+        train_score = _score_report(train_report)
+        test_score = _score_report(test_report)
+        # Only a drop from train to test counts as a gap; an improvement is 0.
+        score_drop = max(0.0, train_score - test_score)
+        gap_ratio = _safe_ratio(score_drop, abs(train_score))
+
+        # Promotion is judged on a provisional result, then recorded on a copy.
+        provisional = SweepResult(
+            parameters=parameters,
+            train_report=train_report,
+            test_report=test_report,
+            train_score=train_score,
+            test_score=test_score,
+            generalization_gap_ratio=gap_ratio,
+            overfit_detected=gap_ratio > criteria.max_generalization_gap_ratio,
+            promotion_ready=False,
+            promotion_reasons=(),
+            train_event_count=len(filtered_train),
+            test_event_count=len(filtered_test),
+        )
+        ready, reasons = _evaluate_promotion(result=provisional, criteria=criteria)
+        results.append(replace(provisional, promotion_ready=ready, promotion_reasons=reasons))
+
+    # list.sort is stable, so filtering the ranked list keeps the ranking order.
+    results.sort(key=lambda item: item.test_score, reverse=True)
+    promoted = [item for item in results if item.promotion_ready]
+
+    # The split above raises unless both halves are non-empty, so indexing is safe.
+    return SweepArtifacts(
+        results=tuple(results),
+        promoted=tuple(promoted),
+        train_window=(train_events[0].occurred_at, train_events[-1].occurred_at),
+        test_window=(test_events[0].occurred_at, test_events[-1].occurred_at),
+    )
+
+
+def _report_to_dict(report: BacktestReport) -> dict[str, object]:
+    """Convert a report to JSON-friendly values (ISO timestamps, plain-dict miss reasons)."""
+    counters = ("processed_events", "opportunities_seen", "trades_executed")
+    metrics = ("win_rate", "fill_rate", "realized_pnl_usd", "max_drawdown_usd")
+    latencies = (
+        "execution_latency_p50_ms", "execution_latency_p95_ms", "execution_latency_p99_ms",
+    )
+
+    row: dict[str, object] = {
+        "started_at": report.started_at.isoformat(),
+        "finished_at": report.finished_at.isoformat(),
+    }
+    row.update((name, getattr(report, name)) for name in counters + metrics)
+    row["miss_reasons"] = dict(report.miss_reasons)
+    row.update((name, getattr(report, name)) for name in latencies)
+    return row
+
+
+def persist_sweep_results(path: Path, artifacts: SweepArtifacts) -> None:
+    """Write the sweep artifacts to ``path`` as indented, key-sorted JSON.
+
+    The payload holds a generation timestamp, the train/test windows (or null)
+    and one entry per result, in ``artifacts.results`` order. Missing parent
+    directories are created before the file is written.
+    """
+
+    def window_payload(window: tuple[datetime, datetime] | None) -> dict[str, str] | None:
+        if window is None:
+            return None
+        return {"started_at": window[0].isoformat(), "finished_at": window[1].isoformat()}
+
+    def result_payload(result: SweepResult) -> dict[str, object]:
+        parameters = result.parameters
+        return {
+            "parameters": {
+                "min_profit_threshold": parameters.min_profit_threshold,
+                "trade_capital": parameters.trade_capital,
+                "pair_universe": list(parameters.pair_universe),
+                "staleness_threshold_seconds": parameters.staleness_threshold_seconds,
+            },
+            "train_report": _report_to_dict(result.train_report),
+            "test_report": _report_to_dict(result.test_report),
+            "train_score": result.train_score,
+            "test_score": result.test_score,
+            "generalization_gap_ratio": result.generalization_gap_ratio,
+            "overfit_detected": result.overfit_detected,
+            "promotion_ready": result.promotion_ready,
+            "promotion_reasons": list(result.promotion_reasons),
+            "train_event_count": result.train_event_count,
+            "test_event_count": result.test_event_count,
+        }
+
+    payload = {
+        "generated_at": datetime.now(UTC).isoformat(),
+        "train_window": window_payload(artifacts.train_window),
+        "test_window": window_payload(artifacts.test_window),
+        "results": [result_payload(result) for result in artifacts.results],
+    }
+
+    # Create any missing parent directories, then write the encoded payload.
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_bytes(orjson.dumps(
+        payload, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS))
diff --git a/tests/unit/test_backtesting_sweep.py b/tests/unit/test_backtesting_sweep.py
new file mode 100644
index 0000000..974e5c0
--- /dev/null
+++ b/tests/unit/test_backtesting_sweep.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+from datetime import UTC, datetime, timedelta
+
+from arbitrade.backtesting.replay import ReplayBookEvent
+from arbitrade.backtesting.sweep import (
+    PromotionCriteria,
+    SweepResult,
+    build_parameter_grid,
+    run_parameter_search,
+    split_events_time_windows,
+)
+from arbitrade.detection.graph import CurrencyGraph
+from arbitrade.exchange.models import BookLevel
+
+
+def _build_cycles() -> dict[str, list]:
+    graph = CurrencyGraph()  # holds the three pairs BTC/USD, ETH/BTC and ETH/USD
+    edges = (("USD", "BTC", "BTC/USD"), ("BTC", "ETH", "ETH/BTC"), ("ETH", "USD", "ETH/USD"))
+    for left, right, symbol in edges:  # NOTE(review): base/quote order varies per edge - confirm
+        graph.add_pair(left, right, symbol)
+    return graph.index_cycles_by_pair(graph.triangular_cycles())
+
+
+def _events() -> list[ReplayBookEvent]:
+    """Build 12 one-second ticks with one single-level book event per pair.
+
+    ETH/BTC is quoted with its bid (0.051) above its ask (0.050).
+    """
+    base_time = datetime(2026, 6, 1, 12, 0, tzinfo=UTC)
+    # (symbol, bid price, ask price); every level has a volume of 10.0.
+    quotes = (
+        ("BTC/USD", 99.5, 100.0),
+        ("ETH/BTC", 0.051, 0.050),
+        ("ETH/USD", 110.0, 110.5),
+    )
+
+    # One event per pair on every tick, all sharing the tick's timestamp.
+    rows: list[ReplayBookEvent] = []
+    for offset in range(12):
+        tick = base_time + timedelta(seconds=offset)
+        rows += [
+            ReplayBookEvent(
+                occurred_at=tick,
+                symbol=symbol,
+                bids=(BookLevel(price=bid, volume=10.0),),
+                asks=(BookLevel(price=ask, volume=10.0),),
+            )
+            for symbol, bid, ask in quotes
+        ]
+
+    return rows
+
+
+def test_split_events_time_windows_returns_non_empty_train_and_test() -> None:
+    """A 70/30 split yields two non-empty windows in chronological order."""
+    train, test = split_events_time_windows(_events(), train_ratio=0.7)
+    assert train and test
+    # The last training event must not be later than the first test event.
+    assert train[-1].occurred_at <= test[0].occurred_at
+
+
+def test_build_parameter_grid_expands_combinations() -> None:
+    """Two thetas x one capital x one universe x two thresholds gives four points."""
+    axes = {
+        "theta_values": [0.0005, 0.001],
+        "trade_capital_values": [100.0],
+        "pair_universes": [["BTC/USD", "ETH/BTC", "ETH/USD"]],
+        "staleness_threshold_values": [3.0, 5.0],
+    }
+    assert len(build_parameter_grid(**axes)) == 4
+
+
+def test_run_parameter_search_produces_ranked_results_with_overfit_guard() -> None:
+    """A 2x2 sweep under permissive gates yields ranked, well-formed results."""
+    lenient = PromotionCriteria(
+        min_test_realized_pnl_usd=-1000.0,
+        min_test_win_rate=0.0,
+        min_test_fill_rate=0.0,
+        max_test_drawdown_usd=1_000_000.0,
+        max_generalization_gap_ratio=0.9,
+    )
+    grid = build_parameter_grid(
+        theta_values=[0.0005, 0.001],
+        trade_capital_values=[75.0, 100.0],
+        pair_universes=[["BTC/USD", "ETH/BTC", "ETH/USD"]],
+        staleness_threshold_values=[5.0],
+    )
+    artifacts = run_parameter_search(
+        events=_events(),
+        cycles_by_pair=_build_cycles(),
+        parameter_grid=grid,
+        starting_balances={"USD": 2000.0},
+        train_ratio=0.7,
+        promotion_criteria=lenient,
+    )
+    assert artifacts.results
+    best: SweepResult = artifacts.results[0]
+    assert best.test_score >= artifacts.results[-1].test_score
+    assert best.train_event_count > 0 and best.test_event_count > 0
+    assert best.generalization_gap_ratio >= 0.0
+    assert isinstance(best.promotion_ready, bool)