feat: Add backtesting parameter sweep support and related functionality

This commit is contained in:
2026-06-02 08:44:10 +02:00
parent 8ef8dc801d
commit f612c8533a
6 changed files with 685 additions and 7 deletions
+3
View File
@@ -18,6 +18,9 @@
- Added synthetic latency profiler scenarios and CLI scripts for baseline generation and regression checks.
- Added latency baseline/threshold artifacts and CI latency guardrail enforcement.
- Added deterministic replay backtesting engine, CLI script, and unit coverage for JSONL event replay.
- Added backtesting parameter sweep support (`scripts/backtest_sweep.py`) for theta, trade-capital, pair-universe, and staleness-threshold grid search.
- Added persisted sweep artifacts with ranked in-sample/out-of-sample results and promotion-ready candidate reporting.
- Added out-of-sample overfit guards via train/test time-window split and generalization-gap checks.
- Added dashboard controls for tradable pair universe selection and strategy mode/parameter configuration.
### Changed
+15 -7
View File
@@ -104,7 +104,7 @@ Minimum `.env` values:
```env
APP_ENV=dev
APP_HOST=0.0.0.0
APP_PORT=8000
APP_PORT=9090
LOG_LEVEL=INFO
LOG_JSON=true
DUCKDB_PATH=./data/arbitrade.duckdb
@@ -132,8 +132,8 @@ python -m arbitrade.main
Health endpoints:
- HTML: `http://localhost:8000/`
- JSON: `http://localhost:8000/health`
- HTML: `http://localhost:9090/`
- JSON: `http://localhost:9090/health`
## Database
@@ -283,12 +283,12 @@ Set these in Coolify application settings:
- Build Command: leave empty.
- Install Command: leave empty.
- Start Command: leave empty unless you explicitly want to override the image default.
- Port: `8000`
- Port: `9090` (Coolify uses `8000` internally)
### 3) Configure health check and networking
- Health Check Path: `/health`
- Exposed Port: `8000`
- Exposed Port: `9090`
- Use Coolify-generated domain or attach your own domain.
### 4) Configure persistent storage
@@ -305,7 +305,7 @@ Add runtime environment variables in Coolify (UI: Environment Variables):
- `APP_ENV=prod`
- `APP_HOST=0.0.0.0`
- `APP_PORT=8000`
- `APP_PORT=9090`
- `DUCKDB_PATH=/app/data/arbitrade.duckdb`
- `LOG_LEVEL=INFO`
- `LOG_JSON=true`
@@ -431,6 +431,12 @@ Run a deterministic replay backtest from a JSONL event stream:
python scripts/backtest_replay.py --events path\to\replay.jsonl --starting-balances USD=1000.0
```
Run a parameter sweep with a train/test split and promotion scoring:
```powershell
python scripts/backtest_sweep.py --events path\to\replay.jsonl --starting-balances USD=1000.0 --output ops/backtesting/parameter_sweep_results.json
```
Replay event format:
```json
@@ -447,7 +453,9 @@ Notes:
- Events are replayed in timestamp order.
- The replay engine reuses the production detector, pre-trade validation, trade limits, and execution sequencer.
- The simulated execution path applies configurable slippage and execution latency so reports include deterministic trade/miss statistics.
Latency baseline and threshold artifacts:
- The parameter sweep splits replay data into in-sample and out-of-sample windows, ranks configurations by out-of-sample score, and flags overfitting via train/test generalization-gap checks.
- Sweep output persists ranked combinations and promotion-ready candidates for paper-trading canary promotion decisions.
- Latency baseline and threshold artifacts:
- `ops/performance/latency_baseline.json`
- `ops/performance/latency_thresholds.json`
+151
View File
@@ -0,0 +1,151 @@
from __future__ import annotations
import argparse
from collections.abc import Mapping, Sequence
from pathlib import Path
from arbitrade.backtesting import load_replay_events
from arbitrade.backtesting.sweep import (
PromotionCriteria,
SweepResult,
build_parameter_grid,
persist_sweep_results,
run_parameter_search,
)
from arbitrade.detection.graph import CurrencyGraph, TriangularCycle
def _parse_balances(raw: str) -> Mapping[str, float]:
balances: dict[str, float] = {}
for entry in raw.split(","):
stripped = entry.strip()
if not stripped:
continue
asset, value = stripped.split("=", 1)
balances[asset.strip().upper()] = float(value)
return balances
def _parse_float_list(raw: str) -> list[float]:
values = [item.strip() for item in raw.split(",") if item.strip()]
if not values:
raise ValueError("expected at least one numeric value")
return [float(value) for value in values]
def _parse_pair_universes(raw: str) -> list[tuple[str, ...]]:
universes: list[tuple[str, ...]] = []
for chunk in raw.split(";"):
symbols = tuple(item.strip().upper()
for item in chunk.split("|") if item.strip())
if symbols:
universes.append(symbols)
if not universes:
raise ValueError("at least one pair universe must be provided")
return universes
def _build_graph_from_symbols(symbols: Sequence[str]) -> dict[str, list[TriangularCycle]]:
    """Build the pair -> triangular-cycles index for the given symbols.

    Each symbol is upper-cased and split on its first ``/`` into the two
    assets passed to ``CurrencyGraph.add_pair``. Symbols without a ``/`` are
    skipped silently.

    Returns:
        Whatever ``CurrencyGraph.index_cycles_by_pair`` produces for the
        graph's triangular cycles.
    """
    graph = CurrencyGraph()
    for pair_symbol in (name.upper() for name in symbols):
        base, separator, quote = pair_symbol.partition("/")
        if not separator:
            # Not in BASE/QUOTE form; it cannot contribute an edge.
            continue
        graph.add_pair(base, quote, pair_symbol)
    return graph.index_cycles_by_pair(graph.triangular_cycles())
def _print_top_results(results: Sequence[SweepResult], *, limit: int = 5) -> None:
print(f"Top {min(limit, len(results))} result(s) by out-of-sample score:")
for index, result in enumerate(results[:limit], start=1):
print(
"- "
f"#{index} "
f"theta={result.parameters.min_profit_threshold:.6f}, "
f"capital={result.parameters.trade_capital:.2f}, "
f"pairs={','.join(result.parameters.pair_universe)}, "
f"staleness={result.parameters.staleness_threshold_seconds:.2f}s, "
f"test_score={result.test_score:.4f}, "
f"promotion_ready={result.promotion_ready}"
)
def main(argv: Sequence[str] | None = None) -> int:
    """Run the parameter sweep CLI and return the process exit status.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``, so the script
            entry point is unchanged while tests and other callers can pass
            an explicit list.

    Returns:
        ``0`` once the sweep has run and its results have been written.

    Raises:
        SystemExit: With status 2 (via ``parser.error``) for an invalid
            ``--train-ratio`` or a malformed list/balance option, and with a
            message when the replay events contain no triangular cycle.
    """
    parser = argparse.ArgumentParser(
        description="Run backtesting parameter sweep with train/test split.")
    parser.add_argument("--events", type=Path, required=True)
    parser.add_argument("--starting-balances", type=str, default="USD=1000.0")
    parser.add_argument("--theta-values", type=str,
                        default="0.0003,0.0005,0.0008")
    parser.add_argument("--trade-capital-values",
                        type=str, default="50,100,150")
    parser.add_argument(
        "--pair-universes",
        type=str,
        default="BTC/USD|ETH/BTC|ETH/USD",
        help="Semicolon-separated universes, each with | delimited pairs",
    )
    parser.add_argument("--staleness-threshold-values",
                        type=str, default="3,5,8")
    parser.add_argument("--train-ratio", type=float, default=0.7)
    parser.add_argument("--output", type=Path,
                        default=Path("ops/backtesting/parameter_sweep_results.json"))
    parser.add_argument("--min-test-realized-pnl-usd", type=float, default=0.0)
    parser.add_argument("--min-test-win-rate", type=float, default=0.5)
    parser.add_argument("--min-test-fill-rate", type=float, default=0.9)
    parser.add_argument("--max-test-drawdown-usd", type=float, default=25.0)
    parser.add_argument("--max-generalization-gap-ratio",
                        type=float, default=0.5)
    args = parser.parse_args(argv)

    # Validate the cheap inputs first so a typo is reported as a usage error
    # before the (potentially large) event file is read.
    if not 0.0 < args.train_ratio < 1.0:
        parser.error("--train-ratio must be between 0 and 1 (exclusive)")

    def parse_option(flag, converter, raw):
        # Turn a converter's ValueError into an argparse usage error that
        # names the offending option; parser.error() never returns.
        try:
            return converter(raw)
        except ValueError as exc:
            parser.error(f"{flag}: {exc}")

    theta_values = parse_option(
        "--theta-values", _parse_float_list, args.theta_values)
    trade_capital_values = parse_option(
        "--trade-capital-values", _parse_float_list, args.trade_capital_values)
    pair_universes = parse_option(
        "--pair-universes", _parse_pair_universes, args.pair_universes)
    staleness_threshold_values = parse_option(
        "--staleness-threshold-values", _parse_float_list,
        args.staleness_threshold_values)
    starting_balances = parse_option(
        "--starting-balances", _parse_balances, args.starting_balances)

    events = load_replay_events(args.events)
    # The cycle index is derived from the symbols present in the replay data,
    # not from --pair-universes; each universe later restricts this index.
    symbols = sorted({event.symbol.upper() for event in events})
    cycles_by_pair = _build_graph_from_symbols(symbols)
    if not cycles_by_pair:
        raise SystemExit(
            "No triangular cycles found in supplied replay events")

    grid = build_parameter_grid(
        theta_values=theta_values,
        trade_capital_values=trade_capital_values,
        pair_universes=pair_universes,
        staleness_threshold_values=staleness_threshold_values,
    )

    artifacts = run_parameter_search(
        events=events,
        cycles_by_pair=cycles_by_pair,
        parameter_grid=grid,
        starting_balances=starting_balances,
        train_ratio=args.train_ratio,
        promotion_criteria=PromotionCriteria(
            min_test_realized_pnl_usd=args.min_test_realized_pnl_usd,
            min_test_win_rate=args.min_test_win_rate,
            min_test_fill_rate=args.min_test_fill_rate,
            max_test_drawdown_usd=args.max_test_drawdown_usd,
            max_generalization_gap_ratio=args.max_generalization_gap_ratio,
        ),
    )
    persist_sweep_results(args.output, artifacts)

    print(f"Completed sweep combinations: {len(artifacts.results)}")
    print(f"Promotion-ready combinations: {len(artifacts.promoted)}")
    print(f"Results written: {args.output}")
    _print_top_results(artifacts.results)
    if artifacts.promoted:
        print("Promotion candidates (paper-trading canary):")
        _print_top_results(artifacts.promoted)
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
+18
View File
@@ -6,6 +6,16 @@ from arbitrade.backtesting.replay import (
ReplayClock,
load_replay_events,
)
from arbitrade.backtesting.sweep import (
PromotionCriteria,
SweepArtifacts,
SweepParameters,
SweepResult,
build_parameter_grid,
persist_sweep_results,
run_parameter_search,
split_events_time_windows,
)
__all__ = [
"ReplayClock",
@@ -14,4 +24,12 @@ __all__ = [
"BacktestReport",
"BacktestReplayEngine",
"load_replay_events",
"SweepParameters",
"SweepResult",
"SweepArtifacts",
"PromotionCriteria",
"split_events_time_windows",
"build_parameter_grid",
"run_parameter_search",
"persist_sweep_results",
]
+396
View File
@@ -0,0 +1,396 @@
from __future__ import annotations
import asyncio
from collections.abc import Mapping, Sequence
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path
import orjson
from arbitrade.backtesting.replay import (
BacktestConfig,
BacktestReplayEngine,
BacktestReport,
ReplayBookEvent,
)
from arbitrade.detection.graph import TriangularCycle
@dataclass(frozen=True, slots=True)
class SweepParameters:
min_profit_threshold: float
trade_capital: float
pair_universe: tuple[str, ...]
staleness_threshold_seconds: float
@dataclass(frozen=True, slots=True)
class PromotionCriteria:
min_test_realized_pnl_usd: float = 0.0
min_test_win_rate: float = 0.5
min_test_fill_rate: float = 0.9
max_test_drawdown_usd: float = 25.0
max_generalization_gap_ratio: float = 0.5
@dataclass(frozen=True, slots=True)
class SweepResult:
parameters: SweepParameters
train_report: BacktestReport
test_report: BacktestReport
train_score: float
test_score: float
generalization_gap_ratio: float
overfit_detected: bool
promotion_ready: bool
promotion_reasons: tuple[str, ...]
train_event_count: int
test_event_count: int
@dataclass(frozen=True, slots=True)
class SweepArtifacts:
results: tuple[SweepResult, ...]
promoted: tuple[SweepResult, ...]
train_window: tuple[datetime, datetime] | None
test_window: tuple[datetime, datetime] | None
def split_events_time_windows(
    events: Sequence[ReplayBookEvent],
    *,
    train_ratio: float,
) -> tuple[list[ReplayBookEvent], list[ReplayBookEvent]]:
    """Split events chronologically into train and test lists.

    The events are ordered by ``occurred_at`` before splitting, so the train
    list never contains an event that is later than any test event even when
    the input arrives unsorted. The cut is made by event count
    (``int(len(events) * train_ratio)``), not by elapsed time, and is clamped
    so that both lists hold at least one event.

    Args:
        events: Replay events; each must expose ``occurred_at``.
        train_ratio: Fraction of events assigned to the train list; must lie
            strictly between 0 and 1.

    Returns:
        ``(train_events, test_events)`` as new lists.

    Raises:
        ValueError: If ``train_ratio`` is outside (0, 1) or NaN, or fewer
            than two events are supplied.
    """
    # The chained comparison is False for NaN, so NaN gets the same message
    # as any other out-of-range ratio.
    if not 0.0 < train_ratio < 1.0:
        raise ValueError("train_ratio must be between 0 and 1")
    if len(events) < 2:
        raise ValueError("at least two events are required for time split")

    # sorted() is stable: events sharing a timestamp keep their input order,
    # and already-ordered input is returned unchanged.
    ordered = sorted(events, key=lambda event: event.occurred_at)
    split_index = max(1, min(len(ordered) - 1, int(len(ordered) * train_ratio)))
    return ordered[:split_index], ordered[split_index:]
def build_parameter_grid(
    *,
    theta_values: Sequence[float],
    trade_capital_values: Sequence[float],
    pair_universes: Sequence[Sequence[str]],
    staleness_threshold_values: Sequence[float],
) -> list[SweepParameters]:
    """Expand the four value lists into their full Cartesian product.

    Each pair universe is normalised to a sorted tuple of unique upper-cased
    symbols. Combinations vary fastest over staleness thresholds, then pair
    universes, then trade capital, then theta.

    Raises:
        ValueError: If any of the four inputs is empty.
    """
    required_inputs = (
        ("theta_values", theta_values),
        ("trade_capital_values", trade_capital_values),
        ("pair_universes", pair_universes),
        ("staleness_threshold_values", staleness_threshold_values),
    )
    for label, values in required_inputs:
        if not values:
            raise ValueError(f"{label} must not be empty")

    universes = [
        tuple(sorted({symbol.upper() for symbol in universe}))
        for universe in pair_universes
    ]
    return [
        SweepParameters(
            min_profit_threshold=float(theta),
            trade_capital=float(capital),
            pair_universe=universe,
            staleness_threshold_seconds=float(staleness),
        )
        for theta in theta_values
        for capital in trade_capital_values
        for universe in universes
        for staleness in staleness_threshold_values
    ]
def _filter_events_for_parameters(
events: Sequence[ReplayBookEvent],
*,
pair_universe: set[str],
staleness_threshold_seconds: float,
) -> list[ReplayBookEvent]:
if staleness_threshold_seconds <= 0.0:
raise ValueError("staleness_threshold_seconds must be > 0")
filtered: list[ReplayBookEvent] = []
last_seen_by_symbol: dict[str, datetime] = {}
for event in events:
symbol = event.symbol.upper()
if symbol not in pair_universe:
continue
previous = last_seen_by_symbol.get(symbol)
last_seen_by_symbol[symbol] = event.occurred_at
if previous is None:
filtered.append(event)
continue
gap_seconds = (event.occurred_at - previous).total_seconds()
if gap_seconds <= staleness_threshold_seconds:
filtered.append(event)
return filtered
def _restrict_cycles_by_pair(
cycles_by_pair: Mapping[str, list[TriangularCycle]],
*,
pair_universe: set[str],
) -> dict[str, list[TriangularCycle]]:
restricted: dict[str, list[TriangularCycle]] = {}
for pair_symbol, cycles in cycles_by_pair.items():
normalized_pair = pair_symbol.upper()
if normalized_pair not in pair_universe:
continue
kept = [cycle for cycle in cycles if all(
pair.upper() in pair_universe for pair in cycle.pairs)]
if kept:
restricted[normalized_pair] = kept
return restricted
def _score_report(report: BacktestReport) -> float:
win_rate_bonus = (report.win_rate or 0.0) * 100.0
fill_rate_bonus = (report.fill_rate or 0.0) * 50.0
return report.realized_pnl_usd + win_rate_bonus + fill_rate_bonus - report.max_drawdown_usd
def _safe_ratio(numerator: float, denominator: float) -> float:
if denominator <= 0.0:
return 0.0 if numerator <= 0.0 else 1.0
return max(0.0, numerator / denominator)
def _evaluate_promotion(
*,
result: SweepResult,
criteria: PromotionCriteria,
) -> tuple[bool, tuple[str, ...]]:
reasons: list[str] = []
test = result.test_report
if test.realized_pnl_usd < criteria.min_test_realized_pnl_usd:
reasons.append(
"test_realized_pnl_below_threshold"
)
if (test.win_rate or 0.0) < criteria.min_test_win_rate:
reasons.append("test_win_rate_below_threshold")
if (test.fill_rate or 0.0) < criteria.min_test_fill_rate:
reasons.append("test_fill_rate_below_threshold")
if test.max_drawdown_usd > criteria.max_test_drawdown_usd:
reasons.append("test_drawdown_above_threshold")
if result.generalization_gap_ratio > criteria.max_generalization_gap_ratio:
reasons.append("generalization_gap_above_threshold")
return (not reasons), tuple(reasons)
def _run_backtest(
    *,
    events: Sequence[ReplayBookEvent],
    cycles_by_pair: Mapping[str, list[TriangularCycle]],
    available_pairs: Sequence[str],
    config: BacktestConfig,
    starting_balances: Mapping[str, float],
) -> BacktestReport:
    """Run one replay backtest synchronously and return its report.

    The engine's start time is the first event's ``occurred_at``, or the
    current UTC time when *events* is empty. The engine coroutine is driven
    with ``asyncio.run``, so this must not be called from inside an already
    running event loop.
    """
    if events:
        clock_start = events[0].occurred_at
    else:
        clock_start = datetime.now(UTC)
    engine = BacktestReplayEngine(
        cycles_by_pair=cycles_by_pair,
        available_pairs=available_pairs,
        config=config,
        started_at=clock_start,
    )
    replay = engine.run(events, starting_balances=starting_balances)
    return asyncio.run(replay)
def _event_window(
    events: Sequence[ReplayBookEvent],
) -> tuple[datetime, datetime] | None:
    """Return the first and last ``occurred_at`` of *events*, or None if empty."""
    if not events:
        return None
    return events[0].occurred_at, events[-1].occurred_at


def _evaluate_parameters(
    parameters: SweepParameters,
    *,
    train_events: Sequence[ReplayBookEvent],
    test_events: Sequence[ReplayBookEvent],
    cycles_by_pair: Mapping[str, list[TriangularCycle]],
    starting_balances: Mapping[str, float],
    criteria: PromotionCriteria,
    max_concurrent_trades: int,
    max_depth_levels: int,
    quote_asset: str,
) -> SweepResult | None:
    """Backtest one combination on both windows; None if it cannot be evaluated."""
    # Function-scope import keeps this block self-contained; the module only
    # imports ``dataclass`` from dataclasses at file level.
    from dataclasses import replace

    allowed_pairs = set(parameters.pair_universe)
    filtered_train = _filter_events_for_parameters(
        train_events,
        pair_universe=allowed_pairs,
        staleness_threshold_seconds=parameters.staleness_threshold_seconds,
    )
    filtered_test = _filter_events_for_parameters(
        test_events,
        pair_universe=allowed_pairs,
        staleness_threshold_seconds=parameters.staleness_threshold_seconds,
    )
    if not filtered_train or not filtered_test:
        return None

    restricted_cycles = _restrict_cycles_by_pair(
        cycles_by_pair,
        pair_universe=allowed_pairs,
    )
    if not restricted_cycles:
        return None

    # NOTE(review): the staleness threshold only shapes the event stream
    # above; it is not forwarded to BacktestConfig. Confirm that is intended.
    config = BacktestConfig(
        min_profit_threshold=parameters.min_profit_threshold,
        trade_capital=parameters.trade_capital,
        max_concurrent_trades=max_concurrent_trades,
        max_depth_levels=max_depth_levels,
        quote_asset=quote_asset,
    )
    available_pairs = sorted(allowed_pairs)
    train_report = _run_backtest(
        events=filtered_train,
        cycles_by_pair=restricted_cycles,
        available_pairs=available_pairs,
        config=config,
        starting_balances=starting_balances,
    )
    test_report = _run_backtest(
        events=filtered_test,
        cycles_by_pair=restricted_cycles,
        available_pairs=available_pairs,
        config=config,
        starting_balances=starting_balances,
    )

    train_score = _score_report(train_report)
    test_score = _score_report(test_report)
    # Only a drop from train to test counts as a gap; a test score that
    # beats the train score yields a ratio of 0.
    score_drop = max(0.0, train_score - test_score)
    generalization_gap_ratio = _safe_ratio(score_drop, abs(train_score))

    provisional = SweepResult(
        parameters=parameters,
        train_report=train_report,
        test_report=test_report,
        train_score=train_score,
        test_score=test_score,
        generalization_gap_ratio=generalization_gap_ratio,
        overfit_detected=generalization_gap_ratio > criteria.max_generalization_gap_ratio,
        promotion_ready=False,
        promotion_reasons=(),
        train_event_count=len(filtered_train),
        test_event_count=len(filtered_test),
    )
    promotion_ready, promotion_reasons = _evaluate_promotion(
        result=provisional, criteria=criteria)
    # replace() copies every other field, so adding a field to SweepResult
    # needs no second constructor call here.
    return replace(
        provisional,
        promotion_ready=promotion_ready,
        promotion_reasons=promotion_reasons,
    )


def run_parameter_search(
    *,
    events: Sequence[ReplayBookEvent],
    cycles_by_pair: Mapping[str, list[TriangularCycle]],
    parameter_grid: Sequence[SweepParameters],
    starting_balances: Mapping[str, float],
    train_ratio: float,
    promotion_criteria: PromotionCriteria | None = None,
    max_concurrent_trades: int = 1,
    max_depth_levels: int = 10,
    quote_asset: str = "USD",
) -> SweepArtifacts:
    """Backtest every grid combination on a train/test split and rank them.

    Events are split once with ``split_events_time_windows``. Each
    combination is then replayed on its filtered train and test events,
    scored, and checked against the promotion criteria. Combinations that
    leave either window without events, or the universe without cycles, are
    skipped and do not appear in the output.

    Args:
        events: Replay events to split into train and test windows.
        cycles_by_pair: Pair -> triangular cycles index for the full data set.
        parameter_grid: Combinations to evaluate.
        starting_balances: Balances handed to every backtest run.
        train_ratio: Fraction of events used for the train window.
        promotion_criteria: Promotion thresholds; defaults to
            ``PromotionCriteria()``.
        max_concurrent_trades: Forwarded to ``BacktestConfig``.
        max_depth_levels: Forwarded to ``BacktestConfig``.
        quote_asset: Forwarded to ``BacktestConfig``.

    Returns:
        ``SweepArtifacts`` with results and the promotion-ready subset, both
        ordered by descending test score, plus the first/last event
        timestamps of the unfiltered train and test windows.

    Raises:
        ValueError: Propagated from the split (bad ratio, too few events)
            or from event filtering (non-positive staleness threshold).
    """
    criteria = PromotionCriteria() if promotion_criteria is None else promotion_criteria
    train_events, test_events = split_events_time_windows(
        events, train_ratio=train_ratio)

    results: list[SweepResult] = []
    for parameters in parameter_grid:
        outcome = _evaluate_parameters(
            parameters,
            train_events=train_events,
            test_events=test_events,
            cycles_by_pair=cycles_by_pair,
            starting_balances=starting_balances,
            criteria=criteria,
            max_concurrent_trades=max_concurrent_trades,
            max_depth_levels=max_depth_levels,
            quote_asset=quote_asset,
        )
        if outcome is not None:
            results.append(outcome)

    results.sort(key=lambda item: item.test_score, reverse=True)
    # Filtering the ranked list keeps the same order a separate stable sort
    # of the promoted subset would produce.
    promoted = [item for item in results if item.promotion_ready]

    return SweepArtifacts(
        results=tuple(results),
        promoted=tuple(promoted),
        train_window=_event_window(train_events),
        test_window=_event_window(test_events),
    )
def _report_to_dict(report: BacktestReport) -> dict[str, object]:
return {
"started_at": report.started_at.isoformat(),
"finished_at": report.finished_at.isoformat(),
"processed_events": report.processed_events,
"opportunities_seen": report.opportunities_seen,
"trades_executed": report.trades_executed,
"win_rate": report.win_rate,
"fill_rate": report.fill_rate,
"realized_pnl_usd": report.realized_pnl_usd,
"max_drawdown_usd": report.max_drawdown_usd,
"miss_reasons": dict(report.miss_reasons),
"execution_latency_p50_ms": report.execution_latency_p50_ms,
"execution_latency_p95_ms": report.execution_latency_p95_ms,
"execution_latency_p99_ms": report.execution_latency_p99_ms,
}
def persist_sweep_results(path: Path, artifacts: SweepArtifacts) -> None:
    """Write the sweep artifacts to *path* as indented, key-sorted JSON.

    The document holds a generation timestamp, the train/test windows (or
    null) and one entry per item in ``artifacts.results``. The ``promoted``
    tuple is not written separately; promoted entries are identifiable by
    their ``promotion_ready`` flag. Missing parent directories are created.
    """

    def window_payload(window):
        # A window is a (first, last) datetime pair, or None when unknown.
        if window is None:
            return None
        return {
            "started_at": window[0].isoformat(),
            "finished_at": window[1].isoformat(),
        }

    def result_payload(result):
        params = result.parameters
        return {
            "parameters": {
                "min_profit_threshold": params.min_profit_threshold,
                "trade_capital": params.trade_capital,
                "pair_universe": list(params.pair_universe),
                "staleness_threshold_seconds": params.staleness_threshold_seconds,
            },
            "train_report": _report_to_dict(result.train_report),
            "test_report": _report_to_dict(result.test_report),
            "train_score": result.train_score,
            "test_score": result.test_score,
            "generalization_gap_ratio": result.generalization_gap_ratio,
            "overfit_detected": result.overfit_detected,
            "promotion_ready": result.promotion_ready,
            "promotion_reasons": list(result.promotion_reasons),
            "train_event_count": result.train_event_count,
            "test_event_count": result.test_event_count,
        }

    document = {
        "generated_at": datetime.now(UTC).isoformat(),
        "train_window": window_payload(artifacts.train_window),
        "test_window": window_payload(artifacts.test_window),
        "results": [result_payload(result) for result in artifacts.results],
    }
    path.parent.mkdir(parents=True, exist_ok=True)
    encoded = orjson.dumps(
        document, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)
    path.write_bytes(encoded)
+102
View File
@@ -0,0 +1,102 @@
from __future__ import annotations
from datetime import UTC, datetime, timedelta
from arbitrade.backtesting.replay import ReplayBookEvent
from arbitrade.backtesting.sweep import (
PromotionCriteria,
SweepResult,
build_parameter_grid,
run_parameter_search,
split_events_time_windows,
)
from arbitrade.detection.graph import CurrencyGraph
from arbitrade.exchange.models import BookLevel
def _build_cycles() -> dict[str, list]:
    """Build the cycle index for the USD/BTC/ETH triangle used by these tests."""
    # NOTE(review): the asset order passed to add_pair here (e.g. "USD",
    # "BTC" for "BTC/USD") differs from scripts/backtest_sweep.py, which
    # passes base then quote - confirm add_pair is order-insensitive.
    legs = (
        ("USD", "BTC", "BTC/USD"),
        ("BTC", "ETH", "ETH/BTC"),
        ("ETH", "USD", "ETH/USD"),
    )
    graph = CurrencyGraph()
    for first_asset, second_asset, symbol in legs:
        graph.add_pair(first_asset, second_asset, symbol)
    cycles = graph.triangular_cycles()
    return graph.index_cycles_by_pair(cycles)
def _events() -> list[ReplayBookEvent]:
    """Return 36 book events: 12 one-second ticks for each of three pairs.

    Every tick carries the same single-level book per symbol, emitted in the
    order BTC/USD, ETH/BTC, ETH/USD.
    """
    base_time = datetime(2026, 6, 1, 12, 0, tzinfo=UTC)
    # (symbol, best bid, best ask); the ETH/BTC bid sits above its ask -
    # presumably to create an opportunity for the detector, TODO confirm.
    quotes = (
        ("BTC/USD", 99.5, 100.0),
        ("ETH/BTC", 0.051, 0.050),
        ("ETH/USD", 110.0, 110.5),
    )
    rows: list[ReplayBookEvent] = []
    for offset in range(12):
        tick = base_time + timedelta(seconds=offset)
        for symbol, bid_price, ask_price in quotes:
            rows.append(
                ReplayBookEvent(
                    occurred_at=tick,
                    symbol=symbol,
                    bids=(BookLevel(price=bid_price, volume=10.0),),
                    asks=(BookLevel(price=ask_price, volume=10.0),),
                )
            )
    return rows
def test_split_events_time_windows_returns_non_empty_train_and_test() -> None:
    """Both windows are non-empty and train does not end after test begins."""
    in_sample, out_of_sample = split_events_time_windows(
        _events(), train_ratio=0.7)

    assert in_sample
    assert out_of_sample
    last_train_time = in_sample[-1].occurred_at
    first_test_time = out_of_sample[0].occurred_at
    assert last_train_time <= first_test_time
def test_build_parameter_grid_expands_combinations() -> None:
    """Two thetas x one capital x one universe x two thresholds gives four entries."""
    universe = ["BTC/USD", "ETH/BTC", "ETH/USD"]
    combinations = build_parameter_grid(
        theta_values=[0.0005, 0.001],
        trade_capital_values=[100.0],
        pair_universes=[universe],
        staleness_threshold_values=[3.0, 5.0],
    )

    assert len(combinations) == 4
def test_run_parameter_search_produces_ranked_results_with_overfit_guard() -> None:
    """The sweep yields results ranked by test score with populated metadata."""
    # Deliberately permissive thresholds so promotion never depends on the
    # exact PnL the synthetic books produce.
    lenient_criteria = PromotionCriteria(
        min_test_realized_pnl_usd=-1000.0,
        min_test_win_rate=0.0,
        min_test_fill_rate=0.0,
        max_test_drawdown_usd=1_000_000.0,
        max_generalization_gap_ratio=0.9,
    )
    grid = build_parameter_grid(
        theta_values=[0.0005, 0.001],
        trade_capital_values=[75.0, 100.0],
        pair_universes=[["BTC/USD", "ETH/BTC", "ETH/USD"]],
        staleness_threshold_values=[5.0],
    )

    artifacts = run_parameter_search(
        events=_events(),
        cycles_by_pair=_build_cycles(),
        parameter_grid=grid,
        starting_balances={"USD": 2000.0},
        train_ratio=0.7,
        promotion_criteria=lenient_criteria,
    )

    assert artifacts.results
    best, worst = artifacts.results[0], artifacts.results[-1]
    assert best.test_score >= worst.test_score

    first: SweepResult = best
    assert first.train_event_count > 0
    assert first.test_event_count > 0
    assert first.generalization_gap_ratio >= 0.0
    assert isinstance(first.promotion_ready, bool)