"""
Hyperparameter Tuner: Optuna/TPE-based optimization of training hyperparameters.
This module provides meta-optimization for training hyperparameters using
walk-forward evaluation as the objective. Instead of optimizing for in-sample
performance (which leads to overfitting), we optimize for OOS metrics like:
- Mean OOS Sharpe
- Walk-Forward Efficiency (WFE)
- Rademacher-adjusted Sharpe
Architecture::

    Level 3: HyperparamTuner (this module)
        | tries different (lr, bs, bout_offset, ...)
    Level 2: TrainingEvaluator
        | runs walk-forward cycles, computes WFE/Rademacher
    Level 1: Trainer (train_on_historic_data, multi_period_sgd)
        | optimizes strategy params (lamb, k, weights)
    Level 0: Forward pass

Usage:

.. code-block:: python

    from quantammsim.runners.hyperparam_tuner import HyperparamTuner

    # Basic usage - tune training hyperparameters
    tuner = HyperparamTuner(
        runner_name="train_on_historic_data",
        n_trials=50,
        n_wfa_cycles=3,  # WFA cycles per trial
    )
    result = tuner.tune(run_fingerprint)

    # Use best params for final training
    run_fingerprint["optimisation_settings"].update(result.best_params)

    # Multi-objective: optimize OOS Sharpe AND WFE
    tuner = HyperparamTuner(
        runner_name="multi_period_sgd",
        objective="multi",  # Pareto front of OOS Sharpe vs WFE
        n_trials=30,
    )
"""
import numpy as np
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner, PercentilePruner, HyperbandPruner, SuccessiveHalvingPruner
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any, Tuple, Callable, Union
from copy import deepcopy
from datetime import datetime
import json
import warnings
import traceback
from quantammsim.runners.training_evaluator import (
TrainingEvaluator,
EvaluationResult,
ExistingRunnerWrapper,
)
from quantammsim.core_simulator.param_utils import recursive_default_set
from quantammsim.runners.default_run_fingerprint import run_fingerprint_defaults
from quantammsim.runners.metric_extraction import extract_cycle_metric
# =============================================================================
# Constants
# =============================================================================
# Translation table: outer (Optuna-level) objective name -> inner training
# metric name (the value written to ``return_val`` / ``early_stopping_metric``).
#
# Consumers in this module:
#   * HyperparamSpace.create() - decides whether offering a
#     ``training_objective`` choice is meaningful for the outer objective.
#   * create_objective() - resolves ``training_objective == "aligned"`` to a
#     concrete metric, and picks the metric used for WFE.
#
# Inner metrics listed as valid by calculate_period_metrics / forward_pass.py:
#   sharpe, return, returns_over_hodl, returns_over_uniform_hodl, calmar,
#   sterling, ulcer
# (``daily_log_sharpe`` is also used as a target below.)
# All metrics are normalized so that higher = better.
#
# Every OOS metric exists in a "mean" and a "worst" flavour that share the same
# inner metric, so the table is generated pair-wise; key order is
# mean_oos_<m>, worst_oos_<m> per metric, followed by the two WFE entries.
OUTER_TO_INNER_METRIC = {
    f"{aggregate}_oos_{outer_suffix}": inner_metric
    for outer_suffix, inner_metric in (
        ("sharpe", "sharpe"),
        ("daily_log_sharpe", "daily_log_sharpe"),
        ("calmar", "calmar"),
        ("sterling", "sterling"),
        ("ulcer", "ulcer"),
        # The OOS "returns_over_hodl" objectives train against the
        # uniform-HODL benchmark variant.
        ("returns_over_hodl", "returns_over_uniform_hodl"),
    )
    for aggregate in ("mean", "worst")
}
# WFE uses sharpe internally.
OUTER_TO_INNER_METRIC.update({"mean_wfe": "sharpe", "worst_wfe": "sharpe"})
# =============================================================================
# Result Data Classes
# =============================================================================
@dataclass
class TuningResult:
    """Outcome of a hyperparameter tuning run.

    Bundles the winning trial, study-wide counters and, for multi-objective
    studies, the Pareto front.

    Attributes
    ----------
    best_params : Dict[str, Any]
        Hyperparameters of the best trial.
    best_value : float
        Objective value reached by the best trial.
    best_evaluation : Optional[EvaluationResult]
        Walk-forward evaluation of the best trial; ``None`` when the
        evaluation was skipped.
    n_trials : int
        How many trials were launched in total.
    n_completed : int
        How many trials ran to completion.
    n_pruned : int
        How many trials were stopped early by the Optuna pruner.
    n_failed : int
        How many trials ended with an exception (defaults to 0).
    all_trials : List[Dict[str, Any]]
        One summary dict per trial (params, value, state) for post-hoc
        analysis. Defaults to a fresh empty list per instance.
    pareto_front : Optional[List[Dict[str, Any]]]
        Non-dominated trial summaries for multi-objective studies; ``None``
        for single-objective studies.
    total_time_seconds : float
        Wall-clock duration of the whole tuning run (defaults to 0.0).
    """

    # --- Best trial -------------------------------------------------------
    best_params: Dict[str, Any]
    best_value: float
    best_evaluation: Optional[EvaluationResult]

    # --- Study counters (the first three are required) ---------------------
    n_trials: int
    n_completed: int
    n_pruned: int
    n_failed: int = 0

    # --- Per-trial records, kept for later analysis -------------------------
    all_trials: List[Dict[str, Any]] = field(default_factory=list)

    # --- Only populated by multi-objective studies --------------------------
    pareto_front: Optional[List[Dict[str, Any]]] = None

    # --- Timing --------------------------------------------------------------
    total_time_seconds: float = 0.0
@dataclass
class HyperparamSpace:
    """Hyperparameter search space with support for conditional sampling.

    ``params`` maps each hyperparameter name to a spec dict of one of these
    shapes:

    - float range: ``{"low": 0.001, "high": 1.0, "log": True}``
    - int range: ``{"low": 1, "high": 100, "log": False, "type": "int"}``
    - categorical: ``{"choices": ["adam", "sgd"]}``
    - conditional: any of the above plus ``"conditional_on": <parent>`` and
      either ``"conditional_value"`` (sample when the parent equals it) or
      ``"conditional_value_not"`` (sample when the parent differs from it).

    Conditional parameters emitted by :meth:`create`:

    - ``softmin_temperature``: only when ``aggregation == "softmin"``
    - ``weight_decay``: only when ``use_weight_decay`` is True
    - ``lr_decay_ratio``: only when ``lr_schedule_type != "constant"``
    - ``warmup_fraction``: only when ``lr_schedule_type == "warmup_cosine"``
    - ``early_stopping_patience`` / ``val_fraction``: only when
      ``use_early_stopping`` is True

    Note on ``bout_offset``: the space tunes ``bout_offset_days`` (whole
    days, up to ~90% of a cycle); ``bout_offset`` itself is expressed in
    minutes, i.e. days * 1440.
    """

    params: Dict[str, Dict[str, Any]] = field(default_factory=dict)

    @classmethod
    def create(
        cls,
        runner: str = "train_on_historic_data",
        cycle_days: int = 180,
        optimizer: str = "adam",
        include_lr_schedule: bool = True,
        include_early_stopping: bool = True,
        include_weight_decay: bool = True,
        minimal: bool = False,
        objective_metric: str = "mean_oos_sharpe",
    ) -> "HyperparamSpace":
        """Build a search space; single entry point behind all the presets.

        Parameters
        ----------
        runner : str
            ``"multi_period_sgd"`` selects the multi-period parameter set; any
            other value selects the ``train_on_historic_data`` set.
        cycle_days : int
            Approximate length of one WFA cycle in days; caps
            ``bout_offset_days`` at ``max(1, int(cycle_days * 0.9))``.
        optimizer : str
            ``"sgd"`` selects the wider/higher learning-rate range; any other
            value (e.g. ``"adam"``, ``"adamw"``) selects the Adam range.
        include_lr_schedule : bool
            Add ``lr_schedule_type`` plus its conditional children
            ``lr_decay_ratio`` and ``warmup_fraction``.
        include_early_stopping : bool
            Add ``use_early_stopping`` plus its conditional children
            ``early_stopping_patience`` and ``val_fraction``.
        include_weight_decay : bool
            Add ``use_weight_decay`` plus conditional ``weight_decay``.
        minimal : bool
            If True, short-circuit and return only ``base_lr`` and
            ``n_iterations``; every other argument is ignored.
        objective_metric : str
            Outer Optuna objective (e.g. ``"mean_oos_sharpe"``). Determines
            whether a ``training_objective`` choice is worth offering.

        Returns
        -------
        HyperparamSpace
            The configured search space.

        Example
        -------
        >>> space = HyperparamSpace.create(cycle_days=180, optimizer="adam")
        >>> space = HyperparamSpace.create(runner="multi_period_sgd", cycle_days=90)
        >>> space = HyperparamSpace.create(minimal=True)  # Quick tuning
        """

        # Tiny spec builders; ``when`` carries the optional conditional_* keys.
        def float_range(low, high, log=True, **when):
            return {"low": low, "high": high, "log": log, **when}

        def int_range(low, high, log=True, **when):
            return {"low": low, "high": high, "log": log, "type": "int", **when}

        def one_of(*options):
            return {"choices": list(options)}

        if minimal:
            quick_params = {
                "base_lr": float_range(0.01, 0.5),
                "n_iterations": int_range(50, 200),
            }
            return cls(params=quick_params)

        # bout_offset may cover up to ~90% of a cycle, but never under 1 day.
        max_bout_days = max(1, int(cycle_days * 0.9))

        # Learning-rate range depends on the optimizer family:
        #   SGD        -> typically needs a higher LR (1e-3 .. 1.0)
        #   Adam/AdamW -> typically needs a lower LR (1e-5 .. 1e-1)
        if optimizer == "sgd":
            lr_range = float_range(1e-3, 1.0)
        else:
            lr_range = float_range(1e-5, 1e-1)

        if runner != "multi_period_sgd":
            # Prefer a 7-day floor, but shrink it for short cycles so that
            # low <= high always holds.
            bout_offset_low = min(7, max_bout_days)
            params = {
                "base_lr": lr_range,
                "batch_size": int_range(8, 64),
                "n_iterations": int_range(50, 5000),
                "bout_offset_days": int_range(bout_offset_low, max_bout_days),
                "clip_norm": float_range(0.5, 50.0),
            }
        else:
            params = {
                "base_lr": lr_range,
                "n_periods": int_range(2, 8, log=False),
                "max_epochs": int_range(50, 300),
                "aggregation": one_of("mean", "worst", "softmin"),
                "softmin_temperature": float_range(
                    0.1, 10.0,
                    conditional_on="aggregation", conditional_value="softmin",
                ),
                "bout_offset_days": int_range(1, max_bout_days),
            }

        if include_weight_decay:
            params.update({
                "use_weight_decay": one_of(True, False),
                "weight_decay": float_range(
                    0.0001, 0.1,
                    conditional_on="use_weight_decay", conditional_value=True,
                ),
            })

        if include_lr_schedule:
            params.update({
                # Schedules available in backpropagation._create_lr_schedule.
                "lr_schedule_type": one_of(
                    "constant", "cosine", "warmup_cosine", "exponential"
                ),
                # min_lr = base_lr / lr_decay_ratio; meaningless for "constant".
                "lr_decay_ratio": float_range(
                    10, 10000,
                    conditional_on="lr_schedule_type",
                    conditional_value_not="constant",
                ),
                # Sampled as a fraction of n_iterations (turned into
                # warmup_steps later) so warmup can never exceed the run.
                "warmup_fraction": float_range(
                    0.05, 0.3, log=False,
                    conditional_on="lr_schedule_type",
                    conditional_value="warmup_cosine",
                ),
            })

        if include_early_stopping:
            params.update({
                "use_early_stopping": one_of(True, False),
                "early_stopping_patience": int_range(
                    30, 300,
                    conditional_on="use_early_stopping", conditional_value=True,
                ),
                # Share of training data held out for early stopping: larger
                # gives a sturdier validation signal but less training data.
                "val_fraction": float_range(
                    0.15, 0.4, log=False,
                    conditional_on="use_early_stopping", conditional_value=True,
                ),
            })

        # training_objective drives BOTH return_val (what gradients optimize)
        # and early_stopping_metric:
        #   "aligned"                   -> follow the outer Optuna objective
        #   "returns_over_uniform_hodl" -> always use this robust proxy
        # The choice is offered only when the two options would actually
        # differ for the requested outer objective.
        aligned_metric = OUTER_TO_INNER_METRIC.get(
            objective_metric, "returns_over_uniform_hodl"
        )
        if aligned_metric != "returns_over_uniform_hodl":
            params["training_objective"] = one_of(
                "aligned", "returns_over_uniform_hodl"
            )

        params.update({
            # Initialization diversity; only relevant when
            # n_parameter_sets > 1 (set in run_fingerprint).
            "noise_scale": float_range(0.01, 0.5),
            # Max weight change per time step: from very constrained (1e-5)
            # to effectively unconstrained (2.0).
            "maximum_change": float_range(1e-5, 2.0),
            # Loss penalty on weight turnover.
            "turnover_penalty": float_range(1e-4, 1.0),
            # Multiplicative price noise used as training-time augmentation.
            "price_noise_sigma": float_range(0.0001, 0.01),
            # How training windows are drawn.
            "sample_method": one_of("uniform", "stratified"),
            # How parameter sets beyond the first are initialized.
            "parameter_init_method": one_of("gaussian", "sobol", "lhs"),
        })
        return cls(params=params)

    @classmethod
    def default_sgd_space(cls, cycle_days: int = 180) -> "HyperparamSpace":
        """Preset: full space with the SGD learning-rate range.

        Parameters
        ----------
        cycle_days : int
            Cycle length in days; scales the ``bout_offset_days`` range.

        Returns
        -------
        HyperparamSpace
            Result of ``create(cycle_days=cycle_days, optimizer="sgd")``.
        """
        return cls.create(optimizer="sgd", cycle_days=cycle_days)

    @classmethod
    def default_adam_space(cls, cycle_days: int = 180) -> "HyperparamSpace":
        """Preset: full space with the Adam learning-rate range.

        Parameters
        ----------
        cycle_days : int
            Cycle length in days; scales the ``bout_offset_days`` range.

        Returns
        -------
        HyperparamSpace
            Result of ``create(cycle_days=cycle_days, optimizer="adam")``.
        """
        return cls.create(optimizer="adam", cycle_days=cycle_days)

    @classmethod
    def default_multi_period_space(cls, cycle_days: int = 180) -> "HyperparamSpace":
        """Preset: space for the ``multi_period_sgd`` runner.

        Adds period-count and aggregation-method parameters.

        Parameters
        ----------
        cycle_days : int
            Cycle length in days.

        Returns
        -------
        HyperparamSpace
        """
        return cls.create(cycle_days=cycle_days, runner="multi_period_sgd")

    @classmethod
    def minimal_space(cls) -> "HyperparamSpace":
        """Preset: two-parameter space for smoke-testing the tuning pipeline.

        Contains only ``base_lr`` and ``n_iterations``.

        Returns
        -------
        HyperparamSpace
        """
        return cls.create(minimal=True)

    @classmethod
    def for_cycle_duration(
        cls,
        cycle_days: int,
        runner: str = "train_on_historic_data",
        include_lr_schedule: bool = True,
        include_early_stopping: bool = True,
        include_weight_decay: bool = True,
        **kwargs,
    ) -> "HyperparamSpace":
        """Build a space whose ``bout_offset_days`` range fits ``cycle_days``.

        Thin wrapper over :meth:`create` with ``cycle_days`` as a required
        argument; everything is forwarded unchanged.

        Parameters
        ----------
        cycle_days : int
            Cycle length in days.
        runner : str
            ``"train_on_historic_data"`` or ``"multi_period_sgd"``.
        include_lr_schedule : bool
            Include learning-rate schedule parameters.
        include_early_stopping : bool
            Include early-stopping parameters.
        include_weight_decay : bool
            Include weight-decay parameters.
        **kwargs
            Any further keyword arguments accepted by :meth:`create`.

        Returns
        -------
        HyperparamSpace
        """
        return cls.create(
            cycle_days=cycle_days,
            runner=runner,
            include_weight_decay=include_weight_decay,
            include_lr_schedule=include_lr_schedule,
            include_early_stopping=include_early_stopping,
            **kwargs,
        )

    def suggest(self, trial: optuna.Trial) -> Dict[str, Any]:
        """Draw one value per active parameter from ``trial``.

        Unconditional parameters are drawn first, in ``params`` order. Each
        conditional parameter is then drawn only if its parent's value
        satisfies the condition, so the sampler sees the conditional
        structure of the space:

        - ``conditional_value``: drawn when ``parent == value``
        - ``conditional_value_not``: drawn when ``parent != value``

        Parameters whose condition is not met are absent from the result.

        Returns
        -------
        Dict[str, Any]
            Mapping of parameter name to sampled value.
        """
        # Pass 1: everything without a parent.
        drawn = {
            name: self._suggest_param(trial, name, spec)
            for name, spec in self.params.items()
            if "conditional_on" not in spec
        }

        # Pass 2: children, gated on the value their parent received.
        for name, spec in self.params.items():
            if "conditional_on" not in spec:
                continue
            parent_value = drawn.get(spec["conditional_on"])
            if "conditional_value" in spec:
                is_active = parent_value == spec["conditional_value"]
            elif "conditional_value_not" in spec:
                is_active = parent_value != spec["conditional_value_not"]
            else:
                # A parent but no condition: never sampled.
                is_active = False
            if is_active:
                drawn[name] = self._suggest_param(trial, name, spec)
        return drawn

    def _suggest_param(self, trial: optuna.Trial, name: str, spec: Dict[str, Any]) -> Any:
        """Draw a single parameter from ``trial`` according to ``spec``.

        Parameters
        ----------
        trial : optuna.Trial
            Active Optuna trial.
        name : str
            Parameter name, used as the Optuna distribution name.
        spec : Dict[str, Any]
            ``{"choices": [...]}`` for a categorical draw; otherwise
            ``"low"``/``"high"`` bounds, with ``"type": "int"`` selecting an
            integer draw and optional ``"log": True`` selecting log scale.

        Returns
        -------
        Any
            The sampled value.
        """
        if "choices" in spec:
            return trial.suggest_categorical(name, spec["choices"])

        draw = trial.suggest_int if spec.get("type") == "int" else trial.suggest_float
        return draw(name, spec["low"], spec["high"], log=spec.get("log", False))
# =============================================================================
# Objective Functions
# =============================================================================
# Outer objectives for which a larger value is better (ulcer is negated, so
# higher = less pain). Consulted only to choose the sentinel returned when a
# trial dies with an unexpected exception; anything not listed here
# (e.g. is_oos_gap) is treated as "lower is better".
_MAXIMIZED_OBJECTIVE_METRICS = frozenset({
    "mean_oos_sharpe", "worst_oos_sharpe",
    "mean_oos_daily_log_sharpe", "worst_oos_daily_log_sharpe",
    "mean_wfe", "worst_wfe",
    "adjusted_mean_oos_sharpe",
    "mean_oos_calmar", "worst_oos_calmar",
    "mean_oos_sterling", "worst_oos_sterling",
    "mean_oos_returns", "worst_oos_returns",
    "mean_oos_returns_over_hodl", "worst_oos_returns_over_hodl",
    "mean_oos_ulcer", "worst_oos_ulcer",
})


def _is_non_positive_or_nan(value: Any) -> bool:
    """Return True when ``value`` is None, NaN, or <= 0.

    ``not (value > 0)`` is used instead of an ``isinstance(value, float)``
    NaN test so that NaNs held in non-``float`` numeric types (for example
    ``np.float32``) are caught as well: every comparison with NaN is False.
    """
    if value is None:
        return True
    return not (value > 0)


def _build_trial_fingerprint(
    run_fingerprint: dict,
    suggested: Dict[str, Any],
    objective_metric: str,
) -> dict:
    """Return a defaults-filled copy of ``run_fingerprint`` with ``suggested`` applied.

    The caller's ``run_fingerprint`` is never mutated.
    """
    fp = deepcopy(run_fingerprint)
    recursive_default_set(fp, run_fingerprint_defaults)
    opt_settings = fp["optimisation_settings"]

    # Weight decay: a sampled weight_decay switches the optimiser to AdamW;
    # otherwise weight decay is forced to zero.
    if suggested.get("use_weight_decay", False) and "weight_decay" in suggested:
        opt_settings["optimiser"] = "adamw"
        opt_settings["weight_decay"] = suggested["weight_decay"]
    else:
        opt_settings["weight_decay"] = 0.0

    # Validation holdout / early stopping.
    #   method == "optuna": val_fraction always applies when sampled.
    #   otherwise (SGD):    val_fraction is tied to early stopping.
    use_early_stopping = suggested.get("use_early_stopping", True)
    if opt_settings.get("method") == "optuna":
        if "val_fraction" in suggested:
            opt_settings["val_fraction"] = suggested["val_fraction"]
    else:
        opt_settings["early_stopping"] = use_early_stopping
        if use_early_stopping:
            if "val_fraction" in suggested:
                opt_settings["val_fraction"] = suggested["val_fraction"]
        else:
            # Patience so large it effectively never triggers, and no holdout.
            opt_settings["early_stopping_patience"] = 999999
            opt_settings["val_fraction"] = 0.0

    # training_objective controls BOTH return_val and early_stopping_metric.
    if suggested.get("training_objective", "returns_over_uniform_hodl") == "aligned":
        inner_metric = OUTER_TO_INNER_METRIC.get(
            objective_metric, "returns_over_uniform_hodl"
        )
    else:
        inner_metric = "returns_over_uniform_hodl"
    fp["return_val"] = inner_metric
    opt_settings["early_stopping_metric"] = inner_metric

    # Keys copied verbatim into optimisation_settings.
    opt_settings_keys = {
        "base_lr", "batch_size", "n_iterations",
        "clip_norm", "n_cycles", "lr_schedule_type", "lr_decay_ratio",
        "early_stopping_patience", "noise_scale",
        "sample_method", "parameter_init_method",
    }
    # Keys copied verbatim into the fingerprint root.
    fingerprint_root_keys = {
        # Initial strategy params (tunable)
        "initial_memory_length",
        "initial_k_per_day", "initial_log_amplitude",
        "initial_raw_width", "initial_raw_exponents",
        "initial_pre_exp_scaling",
        # Strategy constraints
        "maximum_change",
        "minimum_weight",
        # Training loss modifiers
        "turnover_penalty",
        # Data augmentation
        "price_noise_sigma",
    }
    # Inner Optuna settings (for method="optuna"):
    # suggested key -> (key inside optuna_settings, cast or None).
    inner_optuna_keys = {
        "optuna_overfitting_penalty": ("overfitting_penalty", None),
        "optuna_n_startup_trials": ("n_startup_trials", int),
        "optuna_n_trials": ("n_trials", int),
    }

    for key, value in suggested.items():
        if key in opt_settings_keys:
            opt_settings[key] = value
        elif key in fingerprint_root_keys:
            fp[key] = value
        elif key == "bout_offset_days":
            # bout_offset is in minutes.
            fp["bout_offset"] = value * 1440
        elif key == "bout_offset":
            # Legacy: value already in minutes.
            fp["bout_offset"] = value
        elif key == "warmup_fraction":
            # Fraction of n_iterations -> absolute number of warmup steps.
            n_iterations = suggested.get(
                "n_iterations", opt_settings.get("n_iterations", 1000)
            )
            opt_settings["warmup_steps"] = int(value * n_iterations)
        elif key in inner_optuna_keys:
            target_key, cast = inner_optuna_keys[key]
            inner_settings = opt_settings.setdefault("optuna_settings", {})
            inner_settings[target_key] = value if cast is None else cast(value)
        # Remaining keys are either control params already handled above
        # (use_weight_decay, weight_decay, use_early_stopping, val_fraction,
        # training_objective) or runner kwargs applied by the caller.
    return fp


def _cycle_summary(c: Any) -> Dict[str, Any]:
    """Flatten one walk-forward cycle evaluation into a plain dict."""
    return {
        "cycle": c.cycle_number,
        # Date ranges
        "train_start_date": c.train_start_date,
        "train_end_date": c.train_end_date,
        "test_start_date": c.test_start_date,
        "test_end_date": c.test_end_date,
        # Metrics
        "is_sharpe": c.is_sharpe,
        "oos_sharpe": c.oos_sharpe,
        "is_calmar": c.is_calmar,
        "oos_calmar": c.oos_calmar,
        "is_sterling": c.is_sterling,
        "oos_sterling": c.oos_sterling,
        "is_ulcer": c.is_ulcer,
        "oos_ulcer": c.oos_ulcer,
        "is_returns_over_hodl": c.is_returns_over_hodl,
        "oos_returns_over_hodl": c.oos_returns_over_hodl,
        "is_daily_log_sharpe": c.is_daily_log_sharpe,
        "oos_daily_log_sharpe": c.oos_daily_log_sharpe,
        "wfe": c.walk_forward_efficiency,
        "is_oos_gap": c.is_oos_gap,
        # Trained strategy parameters
        "trained_params": c.trained_params,
        # Provenance: for debugging and linking to output files
        "run_location": c.run_location,
        "run_fingerprint": c.run_fingerprint,
    }


def create_objective(
    run_fingerprint: dict,
    runner_name: str,
    runner_kwargs: Dict[str, Any],
    hyperparam_space: HyperparamSpace,
    n_wfa_cycles: int,
    objective_metric: str,
    verbose: bool,
    enable_pruning: bool = True,
    root: Optional[str] = None,
) -> Callable[[optuna.Trial], float]:
    """Create an Optuna objective function with pruning support.

    Each call of the returned objective samples hyperparameters from
    ``hyperparam_space``, applies them to a copy of ``run_fingerprint``, runs
    a ``TrainingEvaluator`` cycle by cycle and returns ``objective_metric``
    computed over all cycles.

    Parameters
    ----------
    run_fingerprint : dict
        Base run configuration; deep-copied per trial, never mutated.
    runner_name : str
        Which runner to use (passed to ``TrainingEvaluator.from_runner``).
    runner_kwargs : dict
        Extra kwargs for the runner. Sampled ``n_periods``, ``max_epochs``,
        ``aggregation`` and ``softmin_temperature`` override entries here.
    hyperparam_space : HyperparamSpace
        Search space to sample from.
    n_wfa_cycles : int
        Number of WFA cycles per trial.
    objective_metric : str
        Metric to optimize.
    verbose : bool
        Print progress and failure details.
    enable_pruning : bool
        If True (default), after each cycle the running metric is reported to
        Optuna, the trial is pruned when that cycle's OOS returns-over-hodl
        is missing, NaN or non-positive, and the study's pruner is consulted.
        If False, all cycles run without any pruning checks.
    root : Optional[str]
        Forwarded to ``TrainingEvaluator.from_runner``.

    Returns
    -------
    Callable[[optuna.Trial], float]
        The objective. It raises ``optuna.TrialPruned`` when pruned,
        re-raises ``ValueError`` (e.g. NaN detection) so the trial is marked
        failed, and for any other exception during evaluation returns
        ``-inf`` (maximized metrics) or ``+inf`` (everything else).

    Notes
    -----
    Optuna's ``Trial.report`` / ``Trial.should_prune`` raise
    ``NotImplementedError`` in multi-objective studies. Since this objective
    is also reused for multi-objective tuning, that error is caught and
    pruner-based pruning is switched off for the trial; the manual
    returns-over-hodl check keeps working.
    """

    def objective(trial: optuna.Trial) -> float:
        # Suggest hyperparameters (with conditional sampling) and bake them
        # into a per-trial fingerprint.
        suggested = hyperparam_space.suggest(trial)
        fp = _build_trial_fingerprint(run_fingerprint, suggested, objective_metric)

        # Runner kwargs: only override what was actually sampled
        # (conditional params may be absent).
        local_runner_kwargs = deepcopy(runner_kwargs)
        for key in ("n_periods", "max_epochs", "aggregation", "softmin_temperature"):
            if key in suggested:
                local_runner_kwargs[key] = suggested[key]

        # WFE metric follows the outer objective (e.g. "mean_oos_calmar" -> "calmar").
        wfe_metric = OUTER_TO_INNER_METRIC.get(objective_metric, "sharpe")

        evaluator = TrainingEvaluator.from_runner(
            runner_name,
            n_cycles=n_wfa_cycles,
            verbose=verbose,
            root=root,
            wfe_metric=wfe_metric,
            **local_runner_kwargs,
        )

        # Flipped to False if the study turns out not to support
        # report()/should_prune() (multi-objective studies).
        pruner_available = enable_pruning

        try:
            cycle_evals = []
            gen = evaluator.evaluate_iter(fp)
            # Drive the generator by hand: its return value is only reachable
            # through StopIteration.value, which a for-loop would discard.
            result = None
            while True:
                try:
                    cycle_eval = next(gen)
                except StopIteration as stop:
                    result = stop.value
                    break
                cycle_evals.append(cycle_eval)

                # Running metric over the cycles so far; plain Python float
                # for Optuna storage compatibility.
                intermediate_value = float(
                    extract_cycle_metric(cycle_evals, objective_metric)
                )
                if not enable_pruning:
                    continue

                # Report BEFORE any pruning decision so pruned trials keep
                # their intermediate values for later analysis.
                if pruner_available:
                    try:
                        trial.report(intermediate_value, step=cycle_eval.cycle_number)
                    except NotImplementedError:
                        pruner_available = False
                        if verbose:
                            print(f"Trial {trial.number}: intermediate reporting is not "
                                  f"supported by this study; Optuna pruner disabled")

                # Aggressive pruning: catches obviously broken training early
                # without waiting for Optuna's pruner.
                oos_roh = cycle_eval.oos_returns_over_hodl
                if _is_non_positive_or_nan(oos_roh):
                    if verbose:
                        # Guard the format spec: a None sharpe must not turn
                        # this prune into a generic failure.
                        oos_sharpe = cycle_eval.oos_sharpe
                        sharpe_txt = "None" if oos_sharpe is None else f"{oos_sharpe:.4f}"
                        print(f"Trial {trial.number} pruned at cycle {cycle_eval.cycle_number}: "
                              f"non-positive OOS metrics (sharpe={sharpe_txt}, "
                              f"returns_over_hodl={oos_roh}, intermediate={intermediate_value:.4f})")
                    raise optuna.TrialPruned()

                # Optuna's own pruner (percentile/median/...).
                if pruner_available and trial.should_prune():
                    if verbose:
                        print(f"Trial {trial.number} pruned at cycle {cycle_eval.cycle_number} "
                              f"by Optuna pruner (intermediate={intermediate_value:.4f})")
                    raise optuna.TrialPruned()

        except optuna.TrialPruned:
            raise  # Let Optuna record the trial as pruned.
        except ValueError as e:
            # ValueErrors (including NaN detection) must FAIL the trial rather
            # than be recorded as a completed trial with a sentinel value.
            if verbose:
                print(f"Trial {trial.number} failed with ValueError: {e}")
                traceback.print_exc()
            raise
        except Exception as e:
            if verbose:
                print(f"Trial {trial.number} failed: {e}")
                traceback.print_exc()
            # Other failures (e.g. data loading issues): return the worst
            # possible value for the direction of this metric.
            if objective_metric in _MAXIMIZED_OBJECTIVE_METRICS:
                return float("-inf")
            return float("inf")

        # Keep the full result, including per-cycle metrics, on the trial.
        per_cycle_metrics = [_cycle_summary(c) for c in result.cycles]
        try:
            trial.set_user_attr("evaluation_result", {
                "mean_oos_sharpe": result.mean_oos_sharpe,
                "mean_wfe": result.mean_wfe,
                "worst_oos_sharpe": result.worst_oos_sharpe,
                "mean_is_oos_gap": result.mean_is_oos_gap,
                "aggregate_rademacher": result.aggregate_rademacher,
                "adjusted_mean_oos_sharpe": result.adjusted_mean_oos_sharpe,
                "is_effective": result.is_effective,
                "cycles": per_cycle_metrics,
            })
        except Exception as e:
            # Best effort: failing to store diagnostics must not fail the trial.
            if verbose:
                print(f"Warning: Failed to store evaluation_result for trial {trial.number}: {e}")

        # Final metric over all cycles, as a plain Python float.
        final_value = float(extract_cycle_metric(result.cycles, objective_metric))
        if verbose:
            print(f"Trial {trial.number} returning final value: {final_value}")
        return final_value

    return objective
def _multi_objective_value(eval_result: Dict[str, Any], metric: str) -> float:
    """Read one objective value out of a stored ``evaluation_result`` dict.

    Missing or ``None`` entries map to ``-inf`` (worst value, since every
    supported objective is maximized). Everything else is converted to a
    plain Python ``float``; NaN is passed through unchanged.
    """
    worst = float("-inf")

    if metric == "neg_is_oos_gap":
        # Negated so that a smaller IS/OOS gap scores higher.
        gap = eval_result.get("mean_is_oos_gap")
        return worst if gap is None else -float(gap)

    if metric == "adjusted_mean_oos_sharpe":
        value = eval_result.get("adjusted_mean_oos_sharpe")
        if value is None:
            # No adjusted value stored: fall back to the raw mean.
            value = eval_result.get("mean_oos_sharpe")
    elif metric in ("mean_oos_sharpe", "mean_wfe", "worst_oos_sharpe"):
        value = eval_result.get(metric)
    else:
        warnings.warn(
            f"Unsupported multi-objective metric {metric!r}; using -inf. "
            "Supported: mean_oos_sharpe, mean_wfe, worst_oos_sharpe, "
            "neg_is_oos_gap, adjusted_mean_oos_sharpe."
        )
        return worst

    return worst if value is None else float(value)


def create_multi_objective(
    run_fingerprint: dict,
    runner_name: str,
    runner_kwargs: Dict[str, Any],
    hyperparam_space: HyperparamSpace,
    n_wfa_cycles: int,
    objectives: List[str],
    verbose: bool,
    enable_pruning: bool = True,
    root: Optional[str] = None,
) -> Callable[[optuna.Trial], Tuple[float, ...]]:
    """Create a multi-objective function for Pareto optimization.

    The evaluation is run once per trial through the single-objective
    function built by ``create_objective`` for ``objectives[0]``; all
    objective values are then read from the ``evaluation_result`` user
    attribute that run stores on the trial.

    Supported entries in ``objectives``: ``"mean_oos_sharpe"``,
    ``"mean_wfe"``, ``"worst_oos_sharpe"``, ``"neg_is_oos_gap"`` (negated mean
    IS/OOS gap) and ``"adjusted_mean_oos_sharpe"`` (falls back to
    ``mean_oos_sharpe`` when no adjusted value is stored). Any other name
    yields ``-inf`` for that objective and emits a warning.

    Common combinations:

    - ``["mean_oos_sharpe", "mean_wfe"]``: maximize both OOS performance and
      efficiency
    - ``["mean_oos_sharpe", "neg_is_oos_gap"]``: maximize OOS while
      minimizing overfitting

    Parameters
    ----------
    run_fingerprint, runner_name, runner_kwargs, hyperparam_space,
    n_wfa_cycles, verbose, enable_pruning, root
        Forwarded unchanged to ``create_objective``.
    objectives : List[str]
        Metrics to optimize jointly; must be non-empty (``objectives[0]`` is
        read when the objective is built).

    Returns
    -------
    Callable[[optuna.Trial], Tuple[float, ...]]
        Objective returning one value per entry of ``objectives``. It
        re-raises ``optuna.TrialPruned`` and ``ValueError`` and returns
        all ``-inf`` when the evaluation fails otherwise or stored nothing.

    Notes
    -----
    Pruning in multi-objective is based on the first objective only.
    NOTE(review): Optuna documents ``Trial.report`` / ``Trial.should_prune``
    as unsupported in multi-objective studies - confirm that pruner-based
    pruning is either disabled by the caller or tolerated by the
    single-objective function when ``enable_pruning`` is True.
    """
    single_objective = create_objective(
        run_fingerprint, runner_name, runner_kwargs,
        hyperparam_space, n_wfa_cycles, objectives[0], verbose,
        enable_pruning=enable_pruning,
        root=root,
    )

    def worst_for_all() -> Tuple[float, ...]:
        return tuple(float("-inf") for _ in objectives)

    def multi_objective(trial: optuna.Trial) -> Tuple[float, ...]:
        # Run the evaluation once; only its side effect (the stored
        # evaluation_result) is needed, not its return value.
        try:
            single_objective(trial)
        except (optuna.TrialPruned, ValueError):
            # Pruning must reach Optuna; ValueError (e.g. NaN detection)
            # must fail the trial.
            raise
        except Exception as e:
            if verbose:
                print(f"Trial {trial.number} multi-objective failed: {e}")
            return worst_for_all()

        eval_result = trial.user_attrs.get("evaluation_result", {})
        if not eval_result:
            # Happens when the single objective returned a failure sentinel
            # or could not store its result.
            if verbose:
                print(f"Trial {trial.number}: evaluation_result is empty after single_objective succeeded")
            return worst_for_all()

        return tuple(_multi_objective_value(eval_result, metric) for metric in objectives)

    return multi_objective
# =============================================================================
# Main Tuner Class
# =============================================================================
[docs]
class HyperparamTuner:
    """
    Tunes training hyperparameters using Optuna/TPE.

    Uses walk-forward evaluation as the objective, optimizing for
    OOS performance rather than in-sample fit.

    Parameters
    ----------
    runner_name : str
        Which runner to tune: "train_on_historic_data" or "multi_period_sgd"
    n_trials : int
        Number of Optuna trials to run
    n_wfa_cycles : int
        Number of walk-forward cycles per evaluation (more = more robust but slower)
    objective : str
        What to optimize:
        - "mean_oos_sharpe": Maximize average OOS Sharpe ratio
        - "mean_wfe": Maximize Walk-Forward Efficiency
        - "worst_oos_sharpe": Maximize worst-case OOS Sharpe
        - "adjusted_mean_oos_sharpe": Maximize Rademacher-adjusted Sharpe
        - "multi": Multi-objective (returns Pareto front)
    multi_objectives : List[str]
        If objective="multi", which metrics to jointly optimize.
        Defaults to ``["mean_oos_sharpe", "mean_wfe"]`` when omitted or empty.
    hyperparam_space : HyperparamSpace
        Search space. When omitted, one is built with ``HyperparamSpace.create``
        for the selected runner.
    sampler : optuna.samplers.BaseSampler
        Optuna sampler (defaults to a multivariate TPE sampler)
    pruner : str, optuna.pruners.BasePruner or None
        Pruner used to stop unpromising trials early. Accepted values:
        - "default" / "percentile": PercentilePruner keeping the top 75%
          (i.e. pruning only the bottom quartile at each step)
        - "median": MedianPruner
        - "hyperband": HyperbandPruner
        - "successive_halving": SuccessiveHalvingPruner
        - "none" or None: no pruning (NopPruner)
        - any pruner instance: used as-is
        Any other string raises ``ValueError``.
    enable_pruning : bool
        Whether to enable intermediate value reporting and pruning (default True).
        When False, the ``pruner`` argument is ignored and a NopPruner is used.
    timeout_per_trial : Optional[float]
        Maximum seconds per trial (default None). The value is stored on the
        instance, but ``tune`` does not enforce it: only ``total_timeout`` is
        passed to ``study.optimize``.
    total_timeout : Optional[float]
        Maximum total seconds for all trials. If None, no total timeout (default None).
    verbose : bool
        Print progress
    runner_kwargs : dict
        Extra kwargs passed to the runner
    study_name : Optional[str]
        Name of the Optuna study. A timestamped name is generated when omitted.
    storage : Optional[str]
        Passed to ``optuna.create_study`` as ``storage``. Studies are created
        with ``load_if_exists=True``, so an existing study of the same name
        in that storage is resumed.
    root : Optional[str]
        Forwarded unchanged to the objective factory
        (presumably a data root directory -- TODO confirm against the factory).

    Raises
    ------
    ValueError
        If ``pruner`` is a string that is not one of the recognised names
        (and pruning is enabled).

    Example
    -------
    >>> tuner = HyperparamTuner(
    ...     runner_name="train_on_historic_data",
    ...     n_trials=30,
    ...     objective="mean_oos_sharpe",
    ...     enable_pruning=True,  # Prune slow/bad trials early
    ...     total_timeout=3600,  # Stop after 1 hour
    ... )
    >>> result = tuner.tune(run_fingerprint)
    >>> print(f"Best LR: {result.best_params['base_lr']}")
    >>> print(f"Best OOS Sharpe: {result.best_value}")
    """

    def __init__(
        self,
        runner_name: str = "train_on_historic_data",
        n_trials: int = 50,
        n_wfa_cycles: int = 3,
        objective: str = "mean_oos_sharpe",
        multi_objectives: Optional[List[str]] = None,
        hyperparam_space: Optional[HyperparamSpace] = None,
        sampler: Optional[optuna.samplers.BaseSampler] = None,
        pruner: Union[str, optuna.pruners.BasePruner, None] = "default",
        enable_pruning: bool = True,
        timeout_per_trial: Optional[float] = None,
        total_timeout: Optional[float] = None,
        verbose: bool = True,
        runner_kwargs: Optional[Dict[str, Any]] = None,
        study_name: Optional[str] = None,
        storage: Optional[str] = None,
        root: Optional[str] = None,
    ):
        self.runner_name = runner_name
        self.n_trials = n_trials
        self.n_wfa_cycles = n_wfa_cycles
        self.objective = objective
        self.multi_objectives = multi_objectives or ["mean_oos_sharpe", "mean_wfe"]
        self.enable_pruning = enable_pruning
        # Stored for callers; not consumed by tune() (see class docstring).
        self.timeout_per_trial = timeout_per_trial
        self.total_timeout = total_timeout
        self.verbose = verbose
        self.runner_kwargs = runner_kwargs or {}
        self.study_name = study_name
        self.storage = storage
        self.root = root

        # Set default search space based on runner
        # IMPORTANT: Pass objective so training_objective is conditionally included correctly
        if hyperparam_space is not None:
            self.hyperparam_space = hyperparam_space
        elif runner_name == "multi_period_sgd":
            self.hyperparam_space = HyperparamSpace.create(
                runner="multi_period_sgd",
                objective_metric=objective,
            )
        else:
            self.hyperparam_space = HyperparamSpace.create(
                optimizer="adam",
                objective_metric=objective,
            )

        # Set sampler (TPE is good for expensive evaluations)
        if sampler is not None:
            self.sampler = sampler
        else:
            self.sampler = TPESampler(
                n_startup_trials=min(10, n_trials // 3),
                multivariate=True,
            )

        self.pruner = self._resolve_pruner(pruner, enable_pruning, n_trials, n_wfa_cycles)

    @staticmethod
    def _resolve_pruner(pruner, enable_pruning, n_trials, n_wfa_cycles):
        """Translate the ``pruner`` constructor argument into an Optuna pruner instance.

        Note: WFA cycles are NOT true multi-fidelity (cycle 1 doesn't predict
        cycles 2-4, they're different market regimes). So Hyperband/ASHA are
        overkill - their logic assumes correlation between fidelities we don't
        have. A percentile pruner that only filters obvious disasters is the
        default for that reason.
        """
        if not enable_pruning or pruner is None or pruner == "none":
            return optuna.pruners.NopPruner()

        if not isinstance(pruner, str):
            # Custom pruner instance
            return pruner

        if pruner in ("default", "percentile"):
            # Optuna's ``percentile`` is the share of trials that is KEPT, so
            # 75.0 prunes only the bottom 25% at each step. (The previous
            # value of 25.0 kept just the top quartile, i.e. pruned 75%.)
            return PercentilePruner(
                percentile=75.0,
                n_startup_trials=max(5, n_trials // 5),
                n_warmup_steps=0,
                interval_steps=1,
            )
        if pruner == "median":
            # MedianPruner: prune if below median of completed trials at same step
            return MedianPruner(
                n_startup_trials=max(3, n_trials // 5),
                n_warmup_steps=0,
                interval_steps=1,
            )
        if pruner == "hyperband":
            # HyperbandPruner: structured successive halving with multiple brackets.
            # Designed for true multi-fidelity where cheap evals predict expensive
            # ones; use cautiously with WFA.
            return HyperbandPruner(
                min_resource=1,
                max_resource=n_wfa_cycles,
                reduction_factor=3,
            )
        if pruner == "successive_halving":
            # SuccessiveHalvingPruner: single bracket successive halving
            return SuccessiveHalvingPruner(
                min_resource=1,
                reduction_factor=3,
            )

        # An unrecognised name used to be stored as-is and only blew up inside
        # each trial, where study.optimize(catch=...) hid the error.
        raise ValueError(
            f"Unknown pruner {pruner!r}; expected one of 'default', 'percentile', "
            "'median', 'hyperband', 'successive_halving', 'none', None, "
            "or an optuna pruner instance"
        )

    def tune(self, run_fingerprint: dict) -> TuningResult:
        """
        Run hyperparameter tuning.

        Creates (or resumes, via ``load_if_exists=True``) an Optuna study,
        optimizes the walk-forward objective for up to ``n_trials`` trials or
        ``total_timeout`` seconds, and summarizes the outcome. Exceptions
        raised by individual trials are caught by Optuna so the remaining
        trials still run.

        Parameters
        ----------
        run_fingerprint : dict
            Base run configuration. Hyperparameters will be varied around this.

        Returns
        -------
        TuningResult
            Contains best parameters, best value, and all trial data.
            ``best_params`` is ``{}`` and ``best_value`` is ``-inf`` when no
            trial completed. ``pareto_front`` is populated only for
            ``objective="multi"``.
        """
        start_time = datetime.now()
        is_multi = self.objective == "multi"

        study, objective_fn = self._create_study_and_objective(
            run_fingerprint, start_time, is_multi
        )

        if self.verbose:
            self._print_banner(is_multi)

        # Suppress Optuna's verbose logging unless we want it
        if not self.verbose:
            optuna.logging.set_verbosity(optuna.logging.WARNING)

        study.optimize(
            objective_fn,
            n_trials=self.n_trials,
            timeout=self.total_timeout,
            show_progress_bar=self.verbose,
            catch=(Exception,),  # Catch exceptions and continue with other trials
        )

        total_time = (datetime.now() - start_time).total_seconds()

        all_trials = [
            {
                "number": trial.number,
                "params": trial.params,
                "value": trial.values if is_multi else trial.value,
                "state": str(trial.state),
                "evaluation_result": trial.user_attrs.get("evaluation_result"),
            }
            for trial in study.trials
        ]

        states = [trial.state for trial in study.trials]
        n_completed = states.count(optuna.trial.TrialState.COMPLETE)
        n_pruned = states.count(optuna.trial.TrialState.PRUNED)
        n_failed = states.count(optuna.trial.TrialState.FAIL)

        best_params: Dict[str, Any] = {}
        best_value = float("-inf")
        pareto_front = None

        if is_multi:
            # Multi-objective: return Pareto front
            pareto_trials = study.best_trials
            pareto_front = [
                {
                    "params": t.params,
                    "values": t.values,
                    "evaluation_result": t.user_attrs.get("evaluation_result"),
                }
                for t in pareto_trials
            ]
            # Pick one "best" for convenience (highest first objective)
            if pareto_trials:
                best_trial = max(pareto_trials, key=lambda t: t.values[0])
                best_params = best_trial.params
                best_value = best_trial.values[0]
        elif n_completed > 0:
            # study.best_trial raises when nothing completed, hence the guard.
            best_trial = study.best_trial
            best_params = best_trial.params
            best_value = best_trial.value

        result = TuningResult(
            best_params=best_params,
            best_value=best_value,
            best_evaluation=None,  # Would need to re-run to get full result
            n_trials=self.n_trials,
            n_completed=n_completed,
            n_pruned=n_pruned,
            n_failed=n_failed,
            all_trials=all_trials,
            pareto_front=pareto_front,
            total_time_seconds=total_time,
        )

        if self.verbose:
            self._print_report(result, study)
        return result

    def _create_study_and_objective(self, run_fingerprint, start_time, is_multi):
        """Build the Optuna study and the matching objective callable."""
        study_name = (
            self.study_name
            or f"hyperparam_tune_{start_time.strftime('%Y%m%d_%H%M%S')}"
        )

        if is_multi:
            # Multi-objective optimization. No pruner is attached and the
            # objective is built with enable_pruning=False.
            study = optuna.create_study(
                study_name=study_name,
                storage=self.storage,
                directions=["maximize"] * len(self.multi_objectives),
                sampler=self.sampler,
                load_if_exists=True,
            )
            objective_fn = create_multi_objective(
                run_fingerprint,
                self.runner_name,
                self.runner_kwargs,
                self.hyperparam_space,
                self.n_wfa_cycles,
                self.multi_objectives,
                self.verbose,
                enable_pruning=False,  # Multi-objective doesn't support pruning
                root=self.root,
            )
            return study, objective_fn

        # Single objective optimization with pruning support
        study = optuna.create_study(
            study_name=study_name,
            storage=self.storage,
            direction="maximize",
            sampler=self.sampler,
            pruner=self.pruner,
            load_if_exists=True,
        )
        objective_fn = create_objective(
            run_fingerprint,
            self.runner_name,
            self.runner_kwargs,
            self.hyperparam_space,
            self.n_wfa_cycles,
            self.objective,
            self.verbose,
            enable_pruning=self.enable_pruning,
            root=self.root,
        )
        return study, objective_fn

    def _print_banner(self, is_multi):
        """Print the configuration header shown before optimization starts."""
        # A NopPruner never prunes, so do not advertise pruning as enabled.
        pruning_active = (
            self.enable_pruning
            and not is_multi
            and not isinstance(self.pruner, optuna.pruners.NopPruner)
        )
        print("=" * 70)
        print(f"HYPERPARAMETER TUNING: {self.runner_name}")
        print("=" * 70)
        print(f"Objective: {self.objective}")
        print(f"Trials: {self.n_trials}")
        print(f"WFA cycles per trial: {self.n_wfa_cycles}")
        print(f"Search space: {list(self.hyperparam_space.params.keys())}")
        print(f"Pruning: {'enabled' if pruning_active else 'disabled'}")
        if self.total_timeout:
            print(f"Total timeout: {self.total_timeout}s")
        print("=" * 70)

    def _print_report(self, result: TuningResult, study: optuna.Study):
        """Print a human-readable summary of the tuning run.

        Includes trial counts (completed/pruned/failed), best trial
        parameters and value, timing, and (for multi-objective studies)
        the first five entries of the Pareto front.

        Parameters
        ----------
        result : TuningResult
            Completed tuning result.
        study : optuna.Study
            Underlying Optuna study (used to query trial states).
        """
        print("\n" + "=" * 70)
        print("TUNING COMPLETE")
        print("=" * 70)

        # Count trial states
        states = [t.state for t in study.trials]
        n_failed = states.count(optuna.trial.TrialState.FAIL)
        n_running = states.count(optuna.trial.TrialState.RUNNING)

        print(f"Trials: {len(study.trials)} total")
        print(f" Completed: {result.n_completed}")
        print(f" Pruned: {result.n_pruned}")
        if n_failed > 0:
            print(f" Failed: {n_failed}")
        if n_running > 0:
            print(f" Running: {n_running}")

        print(f"\nTotal time: {result.total_time_seconds:.1f}s")
        if result.n_completed > 0:
            print(f"Time per completed trial: {result.total_time_seconds / result.n_completed:.1f}s")
        if result.n_pruned > 0:
            # Rough heuristic: assume each pruned trial saved half of the
            # average wall time of a completed-or-pruned trial.
            avg_completed_time = result.total_time_seconds / max(1, result.n_completed + result.n_pruned)
            print(f"Estimated time saved by pruning: {result.n_pruned * avg_completed_time * 0.5:.1f}s")

        print("\n--- Best Parameters ---")
        for key, value in result.best_params.items():
            print(f" {key}: {value}")
        print(f"\n--- Best {self.objective}: {result.best_value:.4f} ---")

        if result.pareto_front:
            print(f"\n--- Pareto Front ({len(result.pareto_front)} solutions) ---")
            for i, sol in enumerate(result.pareto_front[:5]):  # Show top 5
                values_str = ", ".join(f"{v:.3f}" for v in sol["values"])
                print(f" {i+1}. [{values_str}]")
                print(f" params: {sol['params']}")
        print("=" * 70)
# =============================================================================
# Convenience Functions
# =============================================================================
def quick_tune(
    run_fingerprint: dict,
    runner_name: str = "train_on_historic_data",
    n_trials: int = 20,
) -> Dict[str, Any]:
    """
    Run a fast, low-configuration tuning pass and return the best hyperparameters.

    Builds a verbose ``HyperparamTuner`` with 2 walk-forward cycles per trial
    and the search space from ``HyperparamSpace.minimal_space()``, then
    returns only ``best_params`` from the tuning result.

    Parameters
    ----------
    run_fingerprint : dict
        Base run configuration handed to ``HyperparamTuner.tune``.
    runner_name : str
        Runner to tune (forwarded to ``HyperparamTuner``).
    n_trials : int
        Number of trials to run.

    Returns
    -------
    Dict[str, Any]
        Best hyperparameters found.

    Example
    -------
    >>> best_params = quick_tune(run_fingerprint, n_trials=20)
    >>> run_fingerprint["optimisation_settings"]["base_lr"] = best_params["base_lr"]
    """
    tuner_config = dict(
        runner_name=runner_name,
        n_trials=n_trials,
        n_wfa_cycles=2,  # Fast evaluation
        hyperparam_space=HyperparamSpace.minimal_space(),
        verbose=True,
    )
    return HyperparamTuner(**tuner_config).tune(run_fingerprint).best_params
def tune_for_robustness(
    run_fingerprint: dict,
    runner_name: str = "train_on_historic_data",
    n_trials: int = 50,
) -> TuningResult:
    """
    Tune hyperparameters with emphasis on robustness (WFE + OOS Sharpe).

    Runs a verbose multi-objective study over ``"mean_oos_sharpe"`` and
    ``"mean_wfe"`` with 4 walk-forward cycles per trial, so the result
    carries the Pareto front of OOS performance vs walk-forward efficiency.

    Parameters
    ----------
    run_fingerprint : dict
        Base run configuration handed to ``HyperparamTuner.tune``.
    runner_name : str
        Runner to tune (forwarded to ``HyperparamTuner``).
    n_trials : int
        Number of trials to run.

    Returns
    -------
    TuningResult
        Full tuning result as returned by ``HyperparamTuner.tune``.
    """
    robustness_metrics = ["mean_oos_sharpe", "mean_wfe"]
    return HyperparamTuner(
        runner_name=runner_name,
        n_trials=n_trials,
        n_wfa_cycles=4,  # More cycles for robust estimate
        objective="multi",
        multi_objectives=robustness_metrics,
        verbose=True,
    ).tune(run_fingerprint)
# =============================================================================
# Example
# =============================================================================
if __name__ == "__main__":
    # Demonstration only: tune a two-token momentum configuration with the
    # quick-tune helper and print the winning hyperparameters.
    optimisation_settings = {
        "training_data_kind": "historic",
        "optimiser": "adam",
    }
    run_fingerprint = {
        "tokens": ["BTC", "ETH"],
        "rule": "momentum",
        "startDateString": "2021-01-01 00:00:00",
        "endDateString": "2023-06-01 00:00:00",
        "endTestDateString": "2024-01-01 00:00:00",
        "chunk_period": 1440,
        "weight_interpolation_period": 1440,
        "initial_pool_value": 1000000.0,
        "fees": 0.003,
        "optimisation_settings": optimisation_settings,
    }

    # Ten trials keeps the example short.
    best_params = quick_tune(run_fingerprint, n_trials=10)
    print(f"\nBest params: {best_params}")