Source code for quantammsim.runners.hyperparam_tuner

"""
Hyperparameter Tuner: Optuna/TPE-based optimization of training hyperparameters.

This module provides meta-optimization for training hyperparameters using
walk-forward evaluation as the objective. Instead of optimizing for in-sample
performance (which leads to overfitting), we optimize for OOS metrics like:

- Mean OOS Sharpe
- Walk-Forward Efficiency (WFE)
- Rademacher-adjusted Sharpe

Architecture::

    Level 3: HyperparamTuner (this module)
        | tries different (lr, bs, bout_offset, ...)
    Level 2: TrainingEvaluator
        | runs walk-forward cycles, computes WFE/Rademacher
    Level 1: Trainer (train_on_historic_data, multi_period_sgd)
        | optimizes strategy params (lamb, k, weights)
    Level 0: Forward pass

Usage:

.. code-block:: python

    from quantammsim.runners.hyperparam_tuner import HyperparamTuner

    # Basic usage - tune training hyperparameters
    tuner = HyperparamTuner(
        runner_name="train_on_historic_data",
        n_trials=50,
        n_wfa_cycles=3,  # WFA cycles per trial
    )
    result = tuner.tune(run_fingerprint)

    # Use best params for final training
    run_fingerprint["optimisation_settings"].update(result.best_params)

    # Multi-objective: optimize OOS Sharpe AND WFE
    tuner = HyperparamTuner(
        runner_name="multi_period_sgd",
        objective="multi",  # Pareto front of OOS Sharpe vs WFE
        n_trials=30,
    )
"""

import numpy as np
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner, PercentilePruner, HyperbandPruner, SuccessiveHalvingPruner
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Any, Tuple, Callable, Union
from copy import deepcopy
from datetime import datetime
import json
import warnings
import traceback

from quantammsim.runners.training_evaluator import (
    TrainingEvaluator,
    EvaluationResult,
    ExistingRunnerWrapper,
)
from quantammsim.core_simulator.param_utils import recursive_default_set
from quantammsim.runners.default_run_fingerprint import run_fingerprint_defaults
from quantammsim.runners.metric_extraction import extract_cycle_metric


# =============================================================================
# Constants
# =============================================================================

# Maps outer Optuna objective to inner training metric (return_val / early_stopping_metric)
# Used in HyperparamSpace.create() to decide if training_objective choice is meaningful,
# and in create_objective() to resolve "aligned" to the actual metric. create_objective()
# additionally uses it to choose the wfe_metric handed to the evaluator.
#
# Valid inner metrics (from calculate_period_metrics / forward_pass.py):
#   sharpe, return, returns_over_hodl, returns_over_uniform_hodl, calmar, sterling, ulcer
# NOTE(review): "daily_log_sharpe" appears as a value below but is not in the list
# above - confirm it is a supported inner metric and update the list accordingly.
# All metrics are normalized so higher = better.
#
# Outer objectives that are absent from this table are looked up with .get() and fall
# back to a call-site default ("returns_over_uniform_hodl" for the training metric,
# "sharpe" for the WFE metric).
OUTER_TO_INNER_METRIC = {
    "mean_oos_sharpe": "sharpe",
    "worst_oos_sharpe": "sharpe",
    "mean_oos_daily_log_sharpe": "daily_log_sharpe",
    "worst_oos_daily_log_sharpe": "daily_log_sharpe",
    "mean_oos_calmar": "calmar",
    "worst_oos_calmar": "calmar",
    "mean_oos_sterling": "sterling",
    "worst_oos_sterling": "sterling",
    "mean_oos_ulcer": "ulcer",
    "worst_oos_ulcer": "ulcer",
    # Both *_returns_over_hodl objectives train on the uniform-HODL variant.
    "mean_oos_returns_over_hodl": "returns_over_uniform_hodl",
    "worst_oos_returns_over_hodl": "returns_over_uniform_hodl",
    "mean_wfe": "sharpe",  # WFE uses sharpe internally
    "worst_wfe": "sharpe",
}


# =============================================================================
# Result Data Classes
# =============================================================================


[docs]
@dataclass
class TuningResult:
    """Results from hyperparameter tuning.

    Captures the best trial, study-level statistics, and (optionally) the
    full Pareto front for multi-objective optimisation.

    The first six fields (``best_params`` through ``n_pruned``) have no
    default and must be supplied, in that order when passed positionally;
    every later field has a default.

    Attributes
    ----------
    best_params : Dict[str, Any]
        Hyperparameter values from the best trial.
    best_value : float
        Objective value achieved by the best trial.
    best_evaluation : Optional[EvaluationResult]
        Full walk-forward evaluation result for the best trial, or ``None``
        if evaluation was skipped.
    n_trials : int
        Total number of trials launched.
    n_completed : int
        Number of trials that completed successfully.
    n_pruned : int
        Number of trials pruned by the Optuna pruner.
    n_failed : int
        Number of trials that raised exceptions. Defaults to ``0``.
    all_trials : List[Dict[str, Any]]
        Per-trial summary dicts (params, value, state) for post-hoc analysis.
        Defaults to a fresh empty list per instance.
    pareto_front : Optional[List[Dict[str, Any]]]
        For multi-objective studies: list of non-dominated trial summaries.
        ``None`` for single-objective studies (the default).
    total_time_seconds : float
        Wall-clock time for the entire tuning run. Defaults to ``0.0``.
    """
    # --- Required: best trial ---
    best_params: Dict[str, Any]
    best_value: float
    best_evaluation: Optional[EvaluationResult]

    # Study metadata
    n_trials: int
    n_completed: int
    n_pruned: int
    n_failed: int = 0

    # All trials for analysis (default_factory avoids sharing one list across instances)
    all_trials: List[Dict[str, Any]] = field(default_factory=list)

    # Multi-objective results (if applicable)
    pareto_front: Optional[List[Dict[str, Any]]] = None

    # Timing
    total_time_seconds: float = 0.0




[docs]
@dataclass
class HyperparamSpace:
    """
    Defines the hyperparameter search space with conditional sampling support.

    ``params`` maps a parameter name to a spec dict. Each parameter can be:
    - float range: {"low": 0.001, "high": 1.0, "log": True}
    - int range: {"low": 1, "high": 100, "log": False, "type": "int"}
    - categorical: {"choices": ["adam", "sgd"]}
    - conditional: any of the above plus {"conditional_on": "parent_param"} and
      either "conditional_value" (sample only when parent == value) or
      "conditional_value_not" (sample only when parent != value)

    A spec that has "conditional_on" but neither "conditional_value" nor
    "conditional_value_not" is never sampled.

    Conditional parameters in the spaces built by :meth:`create`:

    - softmin_temperature: only sampled when aggregation="softmin"
    - weight_decay: only sampled when use_weight_decay=True (and triggers adamw)
    - lr_decay_ratio: only sampled when lr_schedule_type != "constant"
    - warmup_fraction: only sampled when lr_schedule_type == "warmup_cosine",
      converted to ``warmup_steps = warmup_fraction * n_iterations``
    - early_stopping_patience, val_fraction: only sampled when
      use_early_stopping=True


    Note on bout_offset:
    - bout_offset is in MINUTES, always multiples of 1440 (whole days)
    - Internally we tune bout_offset_days (1 to ~90% of cycle in days)
    - Then multiply by 1440 to get minutes
    """
    # Parameter name -> spec dict. Iteration order is the sampling order
    # within each of the two passes performed by suggest().
    params: Dict[str, Dict[str, Any]] = field(default_factory=dict)


[docs]
    @classmethod
    def create(
        cls,
        runner: str = "train_on_historic_data",
        cycle_days: int = 180,
        optimizer: str = "adam",
        include_lr_schedule: bool = True,
        include_early_stopping: bool = True,
        include_weight_decay: bool = True,
        minimal: bool = False,
        objective_metric: str = "mean_oos_sharpe",
    ) -> "HyperparamSpace":
        """
        Unified factory method for creating hyperparameter search spaces.

        Parameters
        ----------
        runner : str
            Which runner to create space for: "train_on_historic_data" or "multi_period_sgd"
        cycle_days : int
            Approximate duration of one WFA cycle in days. Used to set bout_offset upper bound.
        optimizer : str
            Optimizer type: "adam", "adamw", or "sgd". Affects learning rate ranges.
        include_lr_schedule : bool
            Include lr_schedule_type and warmup_fraction (conditional).
        include_early_stopping : bool
            Include early_stopping_patience.
        include_weight_decay : bool
            Include use_weight_decay and weight_decay (conditional).
        minimal : bool
            If True, return minimal space with just lr and iterations.
        objective_metric : str
            Outer Optuna objective (e.g., "mean_oos_sharpe", "mean_oos_calmar").
            Used to determine if training_objective choice is meaningful.

        Returns
        -------
        HyperparamSpace
            Configured search space.

        Example
        -------
        >>> space = HyperparamSpace.create(cycle_days=180, optimizer="adam")
        >>> space = HyperparamSpace.create(runner="multi_period_sgd", cycle_days=90)
        >>> space = HyperparamSpace.create(minimal=True)  # Quick tuning
        """
        if minimal:
            return cls(params={
                "base_lr": {"low": 0.01, "high": 0.5, "log": True},
                "n_iterations": {"low": 50, "high": 200, "log": True, "type": "int"},
            })

        max_bout_days = max(1, int(cycle_days * 0.9))  # Ensure at least 1 day
        # LR ranges calibrated for each optimizer:
        # - SGD: typically needs higher LR (1e-3 to 1.0)
        # - Adam/AdamW: typically needs lower LR (1e-5 to 1e-1), with 3e-4 being common default
        lr_range = (
            {"low": 1e-3, "high": 1.0, "log": True}
            if optimizer == "sgd"
            else {"low": 1e-5, "high": 1e-1, "log": True}
        )

        if runner == "multi_period_sgd":
            params = {
                "base_lr": lr_range,
                "n_periods": {"low": 2, "high": 8, "log": False, "type": "int"},
                "max_epochs": {"low": 50, "high": 300, "log": True, "type": "int"},
                "aggregation": {"choices": ["mean", "worst", "softmin"]},
                "softmin_temperature": {
                    "low": 0.1, "high": 10.0, "log": True,
                    "conditional_on": "aggregation", "conditional_value": "softmin"
                },
                "bout_offset_days": {"low": 1, "high": max_bout_days, "log": True, "type": "int"},
            }
        else:
            # Ensure bout_offset_days bounds are valid (low <= high)
            # For short cycles, allow smaller bout offsets
            bout_offset_low = min(7, max_bout_days)  # Use 7 or max if max is smaller
            params = {
                "base_lr": lr_range,
                "batch_size": {"low": 8, "high": 64, "log": True, "type": "int"},
                "n_iterations": {"low": 50, "high": 5000, "log": True, "type": "int"},
                "bout_offset_days": {"low": bout_offset_low, "high": max_bout_days, "log": True, "type": "int"},
                "clip_norm": {"low": 0.5, "high": 50.0, "log": True},
            }

        if include_weight_decay:
            params["use_weight_decay"] = {"choices": [True, False]}
            params["weight_decay"] = {
                "low": 0.0001, "high": 0.1, "log": True,
                "conditional_on": "use_weight_decay", "conditional_value": True
            }

        if include_lr_schedule:
            # Available schedules in backpropagation._create_lr_schedule:
            # constant, cosine, exponential, warmup_cosine
            params["lr_schedule_type"] = {"choices": ["constant", "cosine", "warmup_cosine", "exponential"]}
            # lr_decay_ratio: min_lr = base_lr / lr_decay_ratio (only for decay schedules)
            params["lr_decay_ratio"] = {
                "low": 10, "high": 10000, "log": True,
                "conditional_on": "lr_schedule_type", "conditional_value_not": "constant"
            }
            # Only warmup_cosine uses warmup_fraction (converted to warmup_steps later)
            # Sample as fraction of n_iterations to avoid warmup_steps > n_iterations
            params["warmup_fraction"] = {
                "low": 0.05, "high": 0.3, "log": False,
                "conditional_on": "lr_schedule_type", "conditional_value": "warmup_cosine"
            }

        if include_early_stopping:
            params["use_early_stopping"] = {"choices": [True, False]}
            params["early_stopping_patience"] = {
                "low": 30, "high": 300, "log": True, "type": "int",
                "conditional_on": "use_early_stopping", "conditional_value": True
            }
            # Validation fraction - how much of training to hold out for early stopping
            # Larger = more robust validation signal but less training data
            params["val_fraction"] = {
                "low": 0.15, "high": 0.4, "log": False,
                "conditional_on": "use_early_stopping", "conditional_value": True
            }

        # Training objective: controls BOTH return_val (what gradients optimize) AND
        # early_stopping_metric (what decides when to stop / which params to select)
        # - "aligned": match the outer Optuna objective (sharpe→sharpe, calmar→calmar, etc.)
        # - "returns_over_uniform_hodl": always use this robust proxy metric
        # If "aligned" performs poorly (e.g., calmar has bad gradients), Optuna will learn
        # to favor "returns_over_uniform_hodl" instead.
        #
        # Only include this choice if it's meaningful (i.e., aligned would differ from
        # returns_over_uniform_hodl). If outer objective already maps to returns_over_hodl,
        # both choices would be identical, so we skip it.
        aligned_metric = OUTER_TO_INNER_METRIC.get(objective_metric, "returns_over_uniform_hodl")
        if aligned_metric != "returns_over_uniform_hodl":
            # Choice is meaningful - include it
            params["training_objective"] = {"choices": ["aligned", "returns_over_uniform_hodl"]}

        # noise_scale: controls initialization diversity for n_parameter_sets > 1
        # Larger noise = more diverse initializations = better exploration but more variance
        # Only relevant when n_parameter_sets > 1 (set in run_fingerprint)
        params["noise_scale"] = {"low": 0.01, "high": 0.5, "log": True}

        # maximum_change: max weight change per time step (controls trading speed limit)
        # Lower = more constrained/slower rebalancing, higher = more aggressive
        # Default is 3e-4, range from very constrained (1e-5) to effectively unconstrained (2.0)
        params["maximum_change"] = {"low": 1e-5, "high": 2.0, "log": True}

        # turnover_penalty: penalize weight turnover in loss function
        # Higher values discourage frequent rebalancing, improving out-of-sample robustness
        params["turnover_penalty"] = {"low": 1e-4, "high": 1.0, "log": True}

        # price_noise_sigma: multiplicative noise on prices during training
        # Acts as data augmentation to improve out-of-sample robustness
        params["price_noise_sigma"] = {"low": 0.0001, "high": 0.01, "log": True}

        # sample_method: how training windows are sampled
        # "uniform" = random, "stratified" = one sample per time segment (better coverage)
        params["sample_method"] = {"choices": ["uniform", "stratified"]}

        # parameter_init_method: how parameter sets 1+ are initialized
        # "gaussian" = random noise, "sobol"/"lhs" = low-discrepancy (better coverage)
        params["parameter_init_method"] = {"choices": ["gaussian", "sobol", "lhs"]}

        return cls(params=params)



[docs]
    @classmethod
    def default_sgd_space(cls, cycle_days: int = 180) -> "HyperparamSpace":
        """Return the full default space tuned for plain SGD.

        Thin alias for ``create(cycle_days=..., optimizer="sgd")``; every
        other ``create`` argument keeps its default.

        Parameters
        ----------
        cycle_days : int
            Training cycle length in days (scales the ``bout_offset_days``
            range).

        Returns
        -------
        HyperparamSpace
            Space using the SGD learning-rate range.
        """
        sgd_space = cls.create(optimizer="sgd", cycle_days=cycle_days)
        return sgd_space



[docs]
    @classmethod
    def default_adam_space(cls, cycle_days: int = 180) -> "HyperparamSpace":
        """Return the full default space tuned for Adam.

        Thin alias for ``create(cycle_days=..., optimizer="adam")``; every
        other ``create`` argument keeps its default.

        Parameters
        ----------
        cycle_days : int
            Training cycle length in days (scales the ``bout_offset_days``
            range).

        Returns
        -------
        HyperparamSpace
            Space using the Adam learning-rate range.
        """
        adam_space = cls.create(optimizer="adam", cycle_days=cycle_days)
        return adam_space



[docs]
    @classmethod
    def default_multi_period_space(cls, cycle_days: int = 180) -> "HyperparamSpace":
        """Return the default space for the ``multi_period_sgd`` runner.

        Thin alias for ``create(runner="multi_period_sgd", cycle_days=...)``.
        Compared with the single-period space it carries the period count,
        epoch budget and aggregation-method parameters.

        Parameters
        ----------
        cycle_days : int
            Training cycle length in days.

        Returns
        -------
        HyperparamSpace
        """
        multi_period_space = cls.create(cycle_days=cycle_days, runner="multi_period_sgd")
        return multi_period_space



[docs]
    @classmethod
    def minimal_space(cls) -> "HyperparamSpace":
        """Minimal search space for quick smoke-test tuning.

        Contains only learning rate and epoch count — useful for
        verifying the tuning pipeline before a full run.

        Returns
        -------
        HyperparamSpace
        """
        return cls.create(minimal=True)



[docs]
    @classmethod
    def for_cycle_duration(
        cls,
        cycle_days: int,
        runner: str = "train_on_historic_data",
        include_lr_schedule: bool = True,
        include_early_stopping: bool = True,
        include_weight_decay: bool = True,
        **kwargs,
    ) -> "HyperparamSpace":
        """Create search space with ``bout_offset`` scaled to cycle duration.

        A convenience wrapper around :meth:`create` that forwards all
        keyword arguments and sets the ``cycle_days`` accordingly.

        Parameters
        ----------
        cycle_days : int
            Training cycle length in days.
        runner : str
            Runner name (``"train_on_historic_data"`` or ``"multi_period_sgd"``).
        include_lr_schedule : bool
            Include learning rate schedule parameters.
        include_early_stopping : bool
            Include early stopping parameters.
        include_weight_decay : bool
            Include weight decay parameter.

        Returns
        -------
        HyperparamSpace
        """
        return cls.create(
            runner=runner,
            cycle_days=cycle_days,
            include_lr_schedule=include_lr_schedule,
            include_early_stopping=include_early_stopping,
            include_weight_decay=include_weight_decay,
            **kwargs,
        )



[docs]
    def suggest(self, trial: optuna.Trial) -> Dict[str, Any]:
        """
        Suggest hyperparameters for a trial with conditional sampling.

        Conditional parameters are only sampled when their parent condition is met.
        This allows Optuna's TPE sampler to properly model the conditional structure.

        Supported conditionals:
        - conditional_on + conditional_value: sample only when parent == value
        - conditional_on + conditional_value_not: sample only when parent != value
        """
        suggested = {}

        # First pass: sample all non-conditional params
        for name, spec in self.params.items():
            if "conditional_on" in spec:
                continue  # Handle in second pass
            suggested[name] = self._suggest_param(trial, name, spec)

        # Second pass: sample conditional params based on parent values
        for name, spec in self.params.items():
            if "conditional_on" not in spec:
                continue

            parent_name = spec["conditional_on"]
            parent_value = suggested.get(parent_name)

            # Check if condition is met
            should_sample = False
            if "conditional_value" in spec:
                should_sample = (parent_value == spec["conditional_value"])
            elif "conditional_value_not" in spec:
                should_sample = (parent_value != spec["conditional_value_not"])

            if should_sample:
                suggested[name] = self._suggest_param(trial, name, spec)
            # If condition not met, param is not suggested (not in dict)

        return suggested


    def _suggest_param(self, trial: optuna.Trial, name: str, spec: Dict[str, Any]) -> Any:
        """Suggest a single parameter value from an Optuna trial.

        Dispatches to ``trial.suggest_categorical``, ``trial.suggest_int``,
        or ``trial.suggest_float`` depending on the spec format.

        Parameters
        ----------
        trial : optuna.Trial
            Active Optuna trial.
        name : str
            Parameter name (used as Optuna distribution name).
        spec : Dict[str, Any]
            Parameter specification with keys ``"choices"`` (categorical),
            ``"type": "int"`` (integer), or ``"low"``/``"high"`` (float).
            Optional ``"log": True`` for log-uniform sampling.

        Returns
        -------
        Any
            Sampled parameter value.
        """
        if "choices" in spec:
            return trial.suggest_categorical(name, spec["choices"])
        elif spec.get("type") == "int":
            return trial.suggest_int(
                name, spec["low"], spec["high"], log=spec.get("log", False)
            )
        else:
            return trial.suggest_float(
                name, spec["low"], spec["high"], log=spec.get("log", False)
            )



# =============================================================================
# Objective Functions
# =============================================================================


[docs]
def create_objective(
    run_fingerprint: dict,
    runner_name: str,
    runner_kwargs: Dict[str, Any],
    hyperparam_space: HyperparamSpace,
    n_wfa_cycles: int,
    objective_metric: str,
    verbose: bool,
    enable_pruning: bool = True,
    root: Optional[str] = None,
) -> Callable[[optuna.Trial], float]:
    """
    Create an Optuna objective function with pruning support.

    The objective runs TrainingEvaluator with suggested hyperparameters
    and returns the specified metric. Reports intermediate values after
    each WFA cycle to enable early pruning of unpromising trials.

    Parameters
    ----------
    run_fingerprint : dict
        Base run configuration. It is deep-copied for every trial, so the
        caller's dict is never modified.
    runner_name : str
        Which runner to use
    runner_kwargs : dict
        Extra kwargs for the runner. Deep-copied per trial; sampled values of
        ``n_periods``, ``max_epochs``, ``aggregation`` and
        ``softmin_temperature`` override entries of the same name.
    hyperparam_space : HyperparamSpace
        Search space
    n_wfa_cycles : int
        Number of WFA cycles per trial
    objective_metric : str
        Metric to optimize
    verbose : bool
        Print progress
    enable_pruning : bool
        If True, report intermediate values and check for pruning after each cycle.
        If False, run all cycles without pruning checks (default True).
    root : Optional[str]
        Forwarded unchanged to ``TrainingEvaluator.from_runner``.

    Returns
    -------
    Callable[[optuna.Trial], float]
        Objective function. For a given trial it:

        - raises ``optuna.TrialPruned`` when pruning is enabled and either a
          cycle's OOS returns-over-hodl is missing, NaN or non-positive, or
          the study's pruner asks for the trial to stop;
        - re-raises ``ValueError`` from the evaluation so the trial is
          recorded as failed;
        - returns ``-inf`` (maximised metrics) or ``+inf`` (any other metric)
          when the evaluation raises any other exception;
        - otherwise returns the final metric as a plain Python ``float``.
    """
    # The lookup tables below do not depend on the trial, so they are built
    # once here instead of on every objective call.

    # Sampled keys that are written into fp["optimisation_settings"].
    opt_settings_keys = frozenset({
        "base_lr", "batch_size", "n_iterations",
        "clip_norm", "n_cycles", "lr_schedule_type", "lr_decay_ratio",
        "early_stopping_patience", "noise_scale",
        "sample_method", "parameter_init_method",
    })

    # Sampled keys that are written directly at the root of the fingerprint.
    fingerprint_root_keys = frozenset({
        # Initial strategy params (tunable)
        "initial_memory_length",
        "initial_k_per_day", "initial_log_amplitude",
        "initial_raw_width", "initial_raw_exponents",
        "initial_pre_exp_scaling",
        # Strategy constraints
        "maximum_change",
        "minimum_weight",
        # Training loss modifiers
        "turnover_penalty",
        # Data augmentation
        "price_noise_sigma",
    })

    # Metrics we MAXIMIZE (higher is better): sharpe, wfe, calmar, sterling, returns, ulcer
    # Note: ulcer is negated (higher = less pain), so we maximize
    # Metrics we MINIMIZE (lower is better): is_oos_gap
    maximize_metrics = frozenset({
        "mean_oos_sharpe", "worst_oos_sharpe",
        "mean_oos_daily_log_sharpe", "worst_oos_daily_log_sharpe",
        "mean_wfe", "worst_wfe",
        "adjusted_mean_oos_sharpe",
        "mean_oos_calmar", "worst_oos_calmar",
        "mean_oos_sterling", "worst_oos_sterling",
        "mean_oos_returns", "worst_oos_returns",
        "mean_oos_returns_over_hodl", "worst_oos_returns_over_hodl",
        "mean_oos_ulcer", "worst_oos_ulcer",
    })

    def objective(trial: optuna.Trial) -> float:
        # Suggest hyperparameters (with conditional sampling)
        suggested = hyperparam_space.suggest(trial)

        # Build fingerprint with suggested params
        fp = deepcopy(run_fingerprint)
        recursive_default_set(fp, run_fingerprint_defaults)
        opt_settings = fp["optimisation_settings"]

        # Handle weight_decay conditional logic:
        # If use_weight_decay=True and weight_decay was sampled, use AdamW
        # If use_weight_decay=False or not present, use Adam (no weight decay)
        use_weight_decay = suggested.get("use_weight_decay", False)
        if use_weight_decay and "weight_decay" in suggested:
            opt_settings["optimiser"] = "adamw"
            opt_settings["weight_decay"] = suggested["weight_decay"]
        else:
            # Ensure no weight decay is applied
            opt_settings["weight_decay"] = 0.0

        # Handle val_fraction and early_stopping
        # For Optuna: val_fraction always applies (controls validation holdout)
        # For SGD: val_fraction is tied to early_stopping
        is_optuna = opt_settings.get("method") == "optuna"
        use_early_stopping = suggested.get("use_early_stopping", True)

        if is_optuna:
            # Optuna always uses val_fraction (not tied to early_stopping)
            if "val_fraction" in suggested:
                opt_settings["val_fraction"] = suggested["val_fraction"]
        else:
            # SGD: val_fraction tied to early_stopping
            opt_settings["early_stopping"] = use_early_stopping
            if use_early_stopping:
                if "val_fraction" in suggested:
                    opt_settings["val_fraction"] = suggested["val_fraction"]
            else:
                # Set a very high patience so it effectively never triggers
                opt_settings["early_stopping_patience"] = 999999
                # Set val_fraction to 0 when early stopping is disabled
                opt_settings["val_fraction"] = 0.0

        # Handle training_objective: controls BOTH return_val AND early_stopping_metric
        training_obj = suggested.get("training_objective", "returns_over_uniform_hodl")
        if training_obj == "aligned":
            # Align with outer objective
            inner_metric = OUTER_TO_INNER_METRIC.get(objective_metric, "returns_over_uniform_hodl")
        else:
            # Use robust proxy
            inner_metric = "returns_over_uniform_hodl"
        fp["return_val"] = inner_metric
        opt_settings["early_stopping_metric"] = inner_metric

        # Apply suggested hyperparameters
        for key, value in suggested.items():
            if key in opt_settings_keys:
                opt_settings[key] = value
            elif key in fingerprint_root_keys:
                # Initial strategy params go in fingerprint root
                fp[key] = value
            elif key == "bout_offset_days":
                # Convert days to minutes (bout_offset is in minutes)
                fp["bout_offset"] = value * 1440
            elif key == "bout_offset":
                # Legacy: direct minutes value
                fp["bout_offset"] = value
            elif key == "warmup_fraction":
                # Convert warmup_fraction to warmup_steps based on n_iterations
                # (sampled value first, then the fingerprint's, then 1000).
                n_iterations = suggested.get("n_iterations", opt_settings.get("n_iterations", 1000))
                opt_settings["warmup_steps"] = int(value * n_iterations)
            # Inner Optuna settings (for method="optuna")
            elif key == "optuna_overfitting_penalty":
                opt_settings.setdefault("optuna_settings", {})["overfitting_penalty"] = value
            elif key == "optuna_n_startup_trials":
                opt_settings.setdefault("optuna_settings", {})["n_startup_trials"] = int(value)
            elif key == "optuna_n_trials":
                opt_settings.setdefault("optuna_settings", {})["n_trials"] = int(value)
            # Skip control params that aren't real hyperparams (handled above)
            elif key in ["use_weight_decay", "weight_decay", "use_early_stopping",
                         "val_fraction", "training_objective"]:
                pass  # Already handled above
            # multi_period_sgd specific params handled in runner_kwargs

        # Build runner kwargs with suggested params
        # Only include params that were actually sampled (conditional params may be absent)
        local_runner_kwargs = deepcopy(runner_kwargs)
        for key in ["n_periods", "max_epochs", "aggregation", "softmin_temperature"]:
            if key in suggested:
                local_runner_kwargs[key] = suggested[key]

        # Determine WFE metric from outer objective (e.g., "mean_oos_calmar" → "calmar")
        wfe_metric = OUTER_TO_INNER_METRIC.get(objective_metric, "sharpe")

        # Create evaluator
        evaluator = TrainingEvaluator.from_runner(
            runner_name,
            n_cycles=n_wfa_cycles,
            verbose=verbose,
            root=root,
            wfe_metric=wfe_metric,
            **local_runner_kwargs,
        )

        # Run evaluation with pruning support
        try:
            cycle_evals = []
            gen = evaluator.evaluate_iter(fp)

            # Manually iterate to capture the return value from StopIteration
            # (for loops consume StopIteration without giving access to .value)
            result = None
            while True:
                try:
                    cycle_eval = next(gen)
                except StopIteration as e:
                    result = e.value
                    break

                cycle_evals.append(cycle_eval)

                # Compute running metric for intermediate reporting using unified extraction
                # Ensure Python float (not np.float64) for Optuna storage compatibility
                intermediate_value = float(extract_cycle_metric(cycle_evals, objective_metric))

                # Report intermediate value BEFORE pruning checks
                # This ensures all pruned trials have their intermediate values stored for analysis
                if enable_pruning:
                    trial.report(intermediate_value, step=cycle_eval.cycle_number)

                    # Aggressive pruning: prune if oos_returns_over_hodl is non-positive or NaN
                    # This catches obviously broken training early without waiting for Optuna's pruner
                    oos_roh = cycle_eval.oos_returns_over_hodl
                    # `not (x > 0)` is True for NaN as well as for x <= 0. Unlike an
                    # isinstance(x, float) + isnan test it also catches NaN carried by
                    # scalar types that do not subclass float (e.g. np.float32).
                    if oos_roh is None or not (oos_roh > 0):
                        if verbose:
                            print(f"Trial {trial.number} pruned at cycle {cycle_eval.cycle_number}: "
                                  f"non-positive OOS metrics (sharpe={cycle_eval.oos_sharpe:.4f}, "
                                  f"returns_over_hodl={oos_roh}, intermediate={intermediate_value:.4f})")
                        raise optuna.TrialPruned()

                    # Check if trial should be pruned (Optuna's percentile/median pruner)
                    if trial.should_prune():
                        if verbose:
                            print(f"Trial {trial.number} pruned at cycle {cycle_eval.cycle_number} "
                                  f"by Optuna pruner (intermediate={intermediate_value:.4f})")
                        raise optuna.TrialPruned()

        except optuna.TrialPruned:
            raise  # Re-raise pruning exception
        except ValueError as e:
            # Re-raise ValueErrors (including NaN detection) - these should FAIL the trial
            # not silently return -inf. NaN metrics indicate training collapsed and
            # Optuna should mark this as a failed trial, not a completed one.
            if verbose:
                print(f"Trial {trial.number} failed with ValueError: {e}")
                traceback.print_exc()
            raise
        except Exception as e:
            if verbose:
                print(f"Trial {trial.number} failed: {e}")
                traceback.print_exc()
            # Return bad value for other failures (e.g., data loading issues)
            if objective_metric in maximize_metrics:
                return float("-inf")  # Worst possible for maximization
            else:
                return float("inf")  # Worst possible for minimization

        # Store full result for later analysis
        # Include per-cycle metrics for detailed inspection
        per_cycle_metrics = []
        for c in result.cycles:
            per_cycle_metrics.append({
                "cycle": c.cycle_number,
                # Date ranges
                "train_start_date": c.train_start_date,
                "train_end_date": c.train_end_date,
                "test_start_date": c.test_start_date,
                "test_end_date": c.test_end_date,
                # Metrics
                "is_sharpe": c.is_sharpe,
                "oos_sharpe": c.oos_sharpe,
                "is_calmar": c.is_calmar,
                "oos_calmar": c.oos_calmar,
                "is_sterling": c.is_sterling,
                "oos_sterling": c.oos_sterling,
                "is_ulcer": c.is_ulcer,
                "oos_ulcer": c.oos_ulcer,
                "is_returns_over_hodl": c.is_returns_over_hodl,
                "oos_returns_over_hodl": c.oos_returns_over_hodl,
                "is_daily_log_sharpe": c.is_daily_log_sharpe,
                "oos_daily_log_sharpe": c.oos_daily_log_sharpe,
                "wfe": c.walk_forward_efficiency,
                "is_oos_gap": c.is_oos_gap,
                # Trained strategy parameters
                "trained_params": c.trained_params,
                # Provenance: for debugging and linking to output files
                "run_location": c.run_location,
                "run_fingerprint": c.run_fingerprint,
            })

        # Best effort: a failure to attach the summary (e.g. a storage backend
        # rejecting the payload) must not discard an otherwise valid trial.
        try:
            trial.set_user_attr("evaluation_result", {
                "mean_oos_sharpe": result.mean_oos_sharpe,
                "mean_wfe": result.mean_wfe,
                "worst_oos_sharpe": result.worst_oos_sharpe,
                "mean_is_oos_gap": result.mean_is_oos_gap,
                "aggregate_rademacher": result.aggregate_rademacher,
                "adjusted_mean_oos_sharpe": result.adjusted_mean_oos_sharpe,
                "is_effective": result.is_effective,
                "cycles": per_cycle_metrics,
            })
        except Exception as e:
            if verbose:
                print(f"Warning: Failed to store evaluation_result for trial {trial.number}: {e}")

        # Return requested metric using unified extraction from cycles
        # Ensure Python float (not np.float64) for Optuna storage compatibility
        final_value = extract_cycle_metric(result.cycles, objective_metric)
        final_value = float(final_value)  # Convert np.float64 -> Python float

        if verbose:
            print(f"Trial {trial.number} returning final value: {final_value}")

        return final_value

    return objective




[docs]
def create_multi_objective(
    run_fingerprint: dict,
    runner_name: str,
    runner_kwargs: Dict[str, Any],
    hyperparam_space: HyperparamSpace,
    n_wfa_cycles: int,
    objectives: List[str],
    verbose: bool,
    enable_pruning: bool = True,
    root: str = None,
) -> Callable[[optuna.Trial], Tuple[float, ...]]:
    """
    Create a multi-objective function for Pareto optimization.

    The walk-forward evaluation is run once per trial through the
    single-objective function built by ``create_objective`` (using
    ``objectives[0]`` as its metric). The individual objective values are then
    read back from the ``"evaluation_result"`` user attribute that the
    single-objective function stores on the trial.

    Supported metric names:

    - ``"mean_oos_sharpe"``
    - ``"mean_wfe"``
    - ``"worst_oos_sharpe"``
    - ``"neg_is_oos_gap"`` (negated ``mean_is_oos_gap`` so that maximizing
      the objective minimizes the gap)
    - ``"adjusted_mean_oos_sharpe"`` (falls back to ``mean_oos_sharpe`` when
      the adjusted value is missing)

    Common combinations:

    - ["mean_oos_sharpe", "mean_wfe"]: Maximize both OOS performance and efficiency
    - ["mean_oos_sharpe", "neg_is_oos_gap"]: Maximize OOS while minimizing overfitting

    Parameters
    ----------
    run_fingerprint, runner_name, runner_kwargs, hyperparam_space, n_wfa_cycles, root
        Forwarded unchanged to ``create_objective``.
    objectives : List[str]
        Metric names, one per study direction. Must be non-empty.
    verbose : bool
        Print diagnostics when a trial fails or stores no result.
    enable_pruning : bool
        Forwarded to ``create_objective``; any pruning is driven by the first
        objective only.
        NOTE(review): Optuna documents ``Trial.report``/``Trial.should_prune``
        as unsupported for multi-objective studies - confirm before enabling
        this against a multi-objective study (``HyperparamTuner.tune`` passes
        ``False``).

    Returns
    -------
    Callable[[optuna.Trial], Tuple[float, ...]]
        Objective returning one Python ``float`` per entry in ``objectives``.
        Missing, ``None`` or unrecognised metrics yield ``-inf``.

    Raises
    ------
    ValueError
        If ``objectives`` is empty.
    """
    if not objectives:
        raise ValueError("objectives must contain at least one metric name")

    worst = float("-inf")

    # metric name -> (key in stored evaluation_result, fallback key, negate?)
    metric_sources = {
        "mean_oos_sharpe": ("mean_oos_sharpe", None, False),
        "mean_wfe": ("mean_wfe", None, False),
        "worst_oos_sharpe": ("worst_oos_sharpe", None, False),
        "neg_is_oos_gap": ("mean_is_oos_gap", None, True),
        "adjusted_mean_oos_sharpe": ("adjusted_mean_oos_sharpe", "mean_oos_sharpe", False),
    }

    # An unrecognised name would otherwise silently pin that objective to -inf
    # for every trial; surface it once, up front.
    unknown = [m for m in objectives if m not in metric_sources]
    if unknown:
        warnings.warn(
            f"Unknown multi-objective metric(s) {unknown}; "
            f"they will always evaluate to -inf. Supported: {sorted(metric_sources)}",
            stacklevel=2,
        )

    single_objective = create_objective(
        run_fingerprint, runner_name, runner_kwargs,
        hyperparam_space, n_wfa_cycles, objectives[0], verbose,
        enable_pruning=enable_pruning,
        root=root,
    )

    def _worst_values() -> Tuple[float, ...]:
        return tuple(worst for _ in objectives)

    def _metric_value(eval_result: Dict[str, Any], metric: str) -> float:
        """Look up one objective value, coerced to a plain Python float."""
        source = metric_sources.get(metric)
        if source is None:
            return worst
        key, fallback_key, negate = source
        raw = eval_result.get(key)
        if raw is None and fallback_key is not None:
            raw = eval_result.get(fallback_key)
        if raw is None:
            # Missing or explicitly-None metric: treat as worst case instead of
            # handing None to Optuna or crashing on negation.
            return worst
        value = float(raw)  # np.float64 -> Python float for Optuna storage
        return -value if negate else value

    def multi_objective(trial: optuna.Trial) -> Tuple[float, ...]:
        # Run the evaluation once; its return value is not needed because all
        # metrics are read back from the trial's user attributes.
        try:
            single_objective(trial)
        except (optuna.TrialPruned, ValueError):
            # Pruning must propagate; ValueError (e.g., NaN detection) must
            # fail the trial rather than be scored.
            raise
        except Exception as e:
            # For other exceptions, log and return worst values for all objectives
            if verbose:
                print(f"Trial {trial.number} multi-objective failed: {e}")
            return _worst_values()

        eval_result = trial.user_attrs.get("evaluation_result", {})

        # Empty result shouldn't happen if single_objective succeeded
        if not eval_result:
            if verbose:
                print(f"Trial {trial.number}: evaluation_result is empty after single_objective succeeded")
            return _worst_values()

        return tuple(_metric_value(eval_result, metric) for metric in objectives)

    return multi_objective



# =============================================================================
# Main Tuner Class
# =============================================================================


[docs]
class HyperparamTuner:
    """
    Tunes training hyperparameters using Optuna/TPE.

    Uses walk-forward evaluation as the objective, optimizing for
    OOS performance rather than in-sample fit.

    Parameters
    ----------
    runner_name : str
        Which runner to tune: "train_on_historic_data" or "multi_period_sgd"
    n_trials : int
        Number of Optuna trials to run
    n_wfa_cycles : int
        Number of walk-forward cycles per evaluation (more = more robust but slower)
    objective : str
        What to optimize:
        - "mean_oos_sharpe": Maximize average OOS Sharpe ratio
        - "mean_wfe": Maximize Walk-Forward Efficiency
        - "worst_oos_sharpe": Maximize worst-case OOS Sharpe
        - "adjusted_mean_oos_sharpe": Maximize Rademacher-adjusted Sharpe
        - "multi": Multi-objective (returns Pareto front)
    multi_objectives : List[str]
        If objective="multi", which metrics to jointly optimize.
        Defaults to ["mean_oos_sharpe", "mean_wfe"].
    hyperparam_space : HyperparamSpace
        Search space. If not provided, one is built with
        ``HyperparamSpace.create`` based on ``runner_name`` and ``objective``.
    sampler : optuna.samplers.BaseSampler
        Optuna sampler. Defaults to a multivariate ``TPESampler`` with
        ``n_startup_trials=min(10, n_trials // 3)``.
    pruner : str, optuna.pruners.BasePruner or None
        Pruner for early stopping unpromising trials. Either an Optuna pruner
        instance or one of the names:
        - "default" / "percentile": ``PercentilePruner`` (percentile=25)
        - "median": ``MedianPruner``
        - "hyperband": ``HyperbandPruner`` (max_resource=n_wfa_cycles)
        - "successive_halving": ``SuccessiveHalvingPruner``
        - "none": no pruning
        Defaults to "default" (PercentilePruner). ``None`` also disables pruning.
    enable_pruning : bool
        Whether to enable intermediate value reporting and pruning (default True).
        When False a ``NopPruner`` is used regardless of ``pruner``.
    timeout_per_trial : Optional[float]
        Intended maximum seconds per trial (default None).
        Note: the value is only stored on the instance; ``tune`` passes just
        ``total_timeout`` to ``study.optimize``, so no per-trial limit is
        applied.
    total_timeout : Optional[float]
        Maximum total seconds for all trials. If None, no total timeout (default None).
    verbose : bool
        Print progress
    runner_kwargs : dict
        Extra kwargs passed to the runner
    study_name : Optional[str]
        Name of the Optuna study. Defaults to ``hyperparam_tune_<timestamp>``.
    storage : Optional[str]
        Passed to ``optuna.create_study`` as ``storage``; studies are created
        with ``load_if_exists=True``.
    root : str
        Forwarded as ``root`` to the objective factories
        (presumably a base path for run outputs - TODO confirm).

    Example
    -------
    >>> tuner = HyperparamTuner(
    ...     runner_name="train_on_historic_data",
    ...     n_trials=30,
    ...     objective="mean_oos_sharpe",
    ...     enable_pruning=True,  # Prune slow/bad trials early
    ...     total_timeout=3600,   # Stop after 1 hour
    ... )
    >>> result = tuner.tune(run_fingerprint)
    >>> print(f"Best LR: {result.best_params['base_lr']}")
    >>> print(f"Best OOS Sharpe: {result.best_value}")
    """


[docs]
    def __init__(
        self,
        runner_name: str = "train_on_historic_data",
        n_trials: int = 50,
        n_wfa_cycles: int = 3,
        objective: str = "mean_oos_sharpe",
        multi_objectives: Optional[List[str]] = None,
        hyperparam_space: Optional[HyperparamSpace] = None,
        sampler: Optional[optuna.samplers.BaseSampler] = None,
        pruner: Union[str, optuna.pruners.BasePruner, None] = "default",
        enable_pruning: bool = True,
        timeout_per_trial: Optional[float] = None,
        total_timeout: Optional[float] = None,
        verbose: bool = True,
        runner_kwargs: Optional[Dict[str, Any]] = None,
        study_name: Optional[str] = None,
        storage: Optional[str] = None,
        root: str = None,
    ):
        """
        Store the tuning configuration and resolve search space, sampler and pruner.

        ``pruner`` may be an Optuna pruner instance, ``None``, or one of the
        names ``"default"``, ``"percentile"``, ``"median"``, ``"hyperband"``,
        ``"successive_halving"``, ``"none"``. When ``enable_pruning`` is False,
        or ``pruner`` is ``None``/``"none"``, a ``NopPruner`` is installed.

        Raises
        ------
        ValueError
            If pruning is enabled and ``pruner`` is a string that is not one of
            the recognised names.
        """
        self.runner_name = runner_name
        self.n_trials = n_trials
        self.n_wfa_cycles = n_wfa_cycles
        self.objective = objective
        self.multi_objectives = multi_objectives or ["mean_oos_sharpe", "mean_wfe"]
        self.enable_pruning = enable_pruning
        self.timeout_per_trial = timeout_per_trial
        self.total_timeout = total_timeout
        self.verbose = verbose
        self.runner_kwargs = runner_kwargs or {}
        self.study_name = study_name
        self.storage = storage
        self.root = root

        # Set default search space based on runner
        # IMPORTANT: Pass objective so training_objective is conditionally included correctly
        if hyperparam_space is not None:
            self.hyperparam_space = hyperparam_space
        elif runner_name == "multi_period_sgd":
            self.hyperparam_space = HyperparamSpace.create(
                runner="multi_period_sgd",
                objective_metric=objective,
            )
        else:
            self.hyperparam_space = HyperparamSpace.create(
                optimizer="adam",
                objective_metric=objective,
            )

        # Set sampler (TPE is good for expensive evaluations).
        # Explicit None check so a caller-supplied sampler is never replaced
        # because of its truthiness.
        if sampler is not None:
            self.sampler = sampler
        else:
            self.sampler = TPESampler(
                n_startup_trials=min(10, n_trials // 3),
                multivariate=True,
            )

        # Set pruner for early stopping unpromising trials
        # Note: WFA cycles are NOT true multi-fidelity (cycle 1 doesn't predict cycles 2-4,
        # they're different market regimes). So Hyperband/ASHA are overkill - their
        # sophisticated logic assumes correlation between fidelities we don't have.
        # PercentilePruner is better: just filter obvious disasters without predicting.
        if not enable_pruning or pruner is None or pruner == "none":
            self.pruner = optuna.pruners.NopPruner()
        elif isinstance(pruner, str):
            if pruner in ("default", "percentile"):
                # PercentilePruner with 25%: prune bottom 25% after each cycle.
                # This is appropriate for WFA where cycles are independent regimes.
                # We're not predicting future cycles, just filtering disasters.
                self.pruner = PercentilePruner(
                    percentile=25.0,
                    n_startup_trials=max(5, n_trials // 5),
                    n_warmup_steps=0,
                    interval_steps=1,
                )
            elif pruner == "median":
                # MedianPruner: prune if below median of completed trials at same step
                self.pruner = MedianPruner(
                    n_startup_trials=max(3, n_trials // 5),
                    n_warmup_steps=0,
                    interval_steps=1,
                )
            elif pruner == "hyperband":
                # HyperbandPruner: structured successive halving with multiple brackets
                # Note: Designed for true multi-fidelity where cheap evals predict expensive ones.
                # Use cautiously with WFA - cycles are different regimes, not fidelity levels.
                self.pruner = HyperbandPruner(
                    min_resource=1,
                    max_resource=n_wfa_cycles,
                    reduction_factor=3,
                )
            elif pruner == "successive_halving":
                # SuccessiveHalvingPruner: single bracket successive halving
                self.pruner = SuccessiveHalvingPruner(
                    min_resource=1,
                    reduction_factor=3,
                )
            else:
                # Fail fast: a misspelled name must not be stored as if it were
                # a pruner object.
                raise ValueError(
                    f"Unknown pruner {pruner!r}; expected one of 'default', "
                    "'percentile', 'median', 'hyperband', 'successive_halving', "
                    "'none', None, or an optuna.pruners.BasePruner instance"
                )
        else:
            # Custom pruner instance
            self.pruner = pruner



[docs]
    def tune(self, run_fingerprint: dict) -> TuningResult:
        """
        Run hyperparameter tuning.

        Creates (or, with ``storage``, loads) an Optuna study, runs
        ``n_trials`` trials bounded by ``total_timeout``, and summarises the
        outcome. Exceptions raised inside a trial are caught by Optuna so the
        remaining trials still run.

        Parameters
        ----------
        run_fingerprint : dict
            Base run configuration. Hyperparameters will be varied around this.

        Returns
        -------
        TuningResult
            Contains best parameters, best value, and all trial data. For
            ``objective="multi"`` it also carries the Pareto front, and the
            "best" trial is the Pareto trial with the highest first objective.
            When no trial qualifies, ``best_params`` is ``{}`` and
            ``best_value`` is ``-inf``.

        Warns
        -----
        UserWarning
            If ``timeout_per_trial`` is set: only ``total_timeout`` is handed
            to ``study.optimize``, so no per-trial limit is applied.
        """
        start_time = datetime.now()
        is_multi = self.objective == "multi"

        if self.timeout_per_trial is not None:
            # Don't let callers believe individual trials are time-boxed.
            warnings.warn(
                "timeout_per_trial is set but is not enforced; only total_timeout "
                "is passed to study.optimize().",
                stacklevel=2,
            )

        # Create study
        study_name = self.study_name or f"hyperparam_tune_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        if is_multi:
            # Multi-objective optimization: the study is created without a
            # pruner and the objective is built with pruning disabled.
            study = optuna.create_study(
                study_name=study_name,
                storage=self.storage,
                directions=["maximize"] * len(self.multi_objectives),
                sampler=self.sampler,
                load_if_exists=True,
            )

            objective_fn = create_multi_objective(
                run_fingerprint,
                self.runner_name,
                self.runner_kwargs,
                self.hyperparam_space,
                self.n_wfa_cycles,
                self.multi_objectives,
                self.verbose,
                enable_pruning=False,  # Multi-objective doesn't support pruning
                root=self.root,
            )
        else:
            # Single objective optimization with pruning support
            study = optuna.create_study(
                study_name=study_name,
                storage=self.storage,
                direction="maximize",
                sampler=self.sampler,
                pruner=self.pruner,
                load_if_exists=True,
            )

            objective_fn = create_objective(
                run_fingerprint,
                self.runner_name,
                self.runner_kwargs,
                self.hyperparam_space,
                self.n_wfa_cycles,
                self.objective,
                self.verbose,
                enable_pruning=self.enable_pruning,
                root=self.root,
            )

        # Run optimization
        if self.verbose:
            # Pruning is only really active when a non-Nop pruner is attached
            # to a single-objective study.
            pruning_active = (
                self.enable_pruning
                and not is_multi
                and not isinstance(self.pruner, optuna.pruners.NopPruner)
            )
            print("=" * 70)
            print(f"HYPERPARAMETER TUNING: {self.runner_name}")
            print("=" * 70)
            print(f"Objective: {self.objective}")
            print(f"Trials: {self.n_trials}")
            print(f"WFA cycles per trial: {self.n_wfa_cycles}")
            print(f"Search space: {list(self.hyperparam_space.params.keys())}")
            print(f"Pruning: {'enabled' if pruning_active else 'disabled'}")
            if self.total_timeout:
                print(f"Total timeout: {self.total_timeout}s")
            print("=" * 70)

        # Suppress Optuna's verbose logging unless we want it
        if not self.verbose:
            optuna.logging.set_verbosity(optuna.logging.WARNING)

        study.optimize(
            objective_fn,
            n_trials=self.n_trials,
            timeout=self.total_timeout,
            show_progress_bar=self.verbose,
            catch=(Exception,),  # Catch exceptions and continue with other trials
        )

        # Collect results
        total_time = (datetime.now() - start_time).total_seconds()

        # Fetch the trial list once and reuse it for every summary below.
        trials = study.trials

        all_trials = [
            {
                "number": t.number,
                "params": t.params,
                # `.value` is only valid on single-objective trials
                "value": t.values if is_multi else t.value,
                "state": str(t.state),
                "evaluation_result": t.user_attrs.get("evaluation_result"),
            }
            for t in trials
        ]

        # Count trial states
        trial_state = optuna.trial.TrialState
        n_completed = sum(t.state == trial_state.COMPLETE for t in trials)
        n_pruned = sum(t.state == trial_state.PRUNED for t in trials)
        n_failed = sum(t.state == trial_state.FAIL for t in trials)

        # Defaults used when there is no usable trial
        best_params = {}
        best_value = float("-inf")
        pareto_front = None

        if is_multi:
            # Multi-objective: return Pareto front
            pareto_trials = study.best_trials
            pareto_front = [
                {
                    "params": t.params,
                    "values": t.values,
                    "evaluation_result": t.user_attrs.get("evaluation_result"),
                }
                for t in pareto_trials
            ]

            # Pick one "best" for convenience (highest first objective)
            if pareto_trials:
                best_trial = max(pareto_trials, key=lambda t: t.values[0])
                best_params = best_trial.params
                best_value = best_trial.values[0]
        elif n_completed > 0:
            # study.best_trial is only queried once a trial has completed
            best_trial = study.best_trial
            best_params = best_trial.params
            best_value = best_trial.value

        result = TuningResult(
            best_params=best_params,
            best_value=best_value,
            best_evaluation=None,  # Would need to re-run to get full result
            n_trials=self.n_trials,
            n_completed=n_completed,
            n_pruned=n_pruned,
            n_failed=n_failed,
            all_trials=all_trials,
            pareto_front=pareto_front,
            total_time_seconds=total_time,
        )

        if self.verbose:
            self._print_report(result, study)

        return result


    def _print_report(self, result: TuningResult, study: optuna.Study):
        """Write a plain-text summary of a finished tuning run to stdout.

        The report lists trial counts by state, wall-clock timing (with a
        rough estimate of time saved by pruning), the best parameters and
        value, and up to five Pareto-front entries when a front is present.

        Parameters
        ----------
        result : TuningResult
            Completed tuning result.
        study : optuna.Study
            Underlying Optuna study (used to query trial states).
        """
        rule = "=" * 70
        trials = study.trials

        # Failed / still-running counts are derived from the study itself
        failed_count = sum(
            1 for t in trials if t.state == optuna.trial.TrialState.FAIL
        )
        running_count = sum(
            1 for t in trials if t.state == optuna.trial.TrialState.RUNNING
        )

        print("\n" + rule)
        print("TUNING COMPLETE")
        print(rule)

        print(f"Trials: {len(trials)} total")
        print(f"  Completed: {result.n_completed}")
        print(f"  Pruned:    {result.n_pruned}")
        if failed_count > 0:
            print(f"  Failed:    {failed_count}")
        if running_count > 0:
            print(f"  Running:   {running_count}")

        elapsed = result.total_time_seconds
        print(f"\nTotal time: {elapsed:.1f}s")
        if result.n_completed > 0:
            print(f"Time per completed trial: {elapsed / result.n_completed:.1f}s")
        if result.n_pruned > 0:
            # Heuristic: assume each pruned trial saved half an average trial
            finished = max(1, result.n_completed + result.n_pruned)
            avg_trial_time = elapsed / finished
            print(f"Estimated time saved by pruning: {result.n_pruned * avg_trial_time * 0.5:.1f}s")

        print("\n--- Best Parameters ---")
        for name, setting in result.best_params.items():
            print(f"  {name}: {setting}")

        print(f"\n--- Best {self.objective}: {result.best_value:.4f} ---")

        front = result.pareto_front
        if front:
            print(f"\n--- Pareto Front ({len(front)} solutions) ---")
            # Only the first five solutions are shown
            for rank, solution in enumerate(front[:5], start=1):
                formatted = ", ".join(f"{v:.3f}" for v in solution["values"])
                print(f"  {rank}. [{formatted}]")
                print(f"     params: {solution['params']}")

        print(rule)



# =============================================================================
# Convenience Functions
# =============================================================================


[docs]
def quick_tune(
    run_fingerprint: dict,
    runner_name: str = "train_on_historic_data",
    n_trials: int = 20,
) -> Dict[str, Any]:
    """
    Run a fast, low-configuration tuning pass and return the winning params.

    Builds a verbose ``HyperparamTuner`` with two walk-forward cycles per
    trial and the ``HyperparamSpace.minimal_space()`` search space, runs it
    on ``run_fingerprint`` and returns ``best_params`` from the result.

    Example
    -------
    >>> best_params = quick_tune(run_fingerprint, n_trials=20)
    >>> run_fingerprint["optimisation_settings"]["base_lr"] = best_params["base_lr"]
    """
    tuner_config = dict(
        runner_name=runner_name,
        n_trials=n_trials,
        n_wfa_cycles=2,  # Fast evaluation
        hyperparam_space=HyperparamSpace.minimal_space(),
        verbose=True,
    )
    outcome = HyperparamTuner(**tuner_config).tune(run_fingerprint)
    return outcome.best_params




[docs]
def tune_for_robustness(
    run_fingerprint: dict,
    runner_name: str = "train_on_historic_data",
    n_trials: int = 50,
) -> TuningResult:
    """
    Run a multi-objective tuning pass aimed at robustness.

    Jointly optimizes ``mean_oos_sharpe`` and ``mean_wfe`` with four
    walk-forward cycles per trial and returns the full ``TuningResult``,
    including the Pareto front of OOS performance vs walk-forward efficiency.
    """
    robust_objectives = ["mean_oos_sharpe", "mean_wfe"]
    tuner_config = dict(
        runner_name=runner_name,
        n_trials=n_trials,
        n_wfa_cycles=4,  # More cycles for robust estimate
        objective="multi",
        multi_objectives=robust_objectives,
        verbose=True,
    )
    return HyperparamTuner(**tuner_config).tune(run_fingerprint)



# =============================================================================
# Example
# =============================================================================

if __name__ == "__main__":
    # Demo: tune a two-token momentum pool with the quick preset.
    optimisation_settings = dict(
        training_data_kind="historic",
        optimiser="adam",
    )
    run_fingerprint = dict(
        tokens=["BTC", "ETH"],
        rule="momentum",
        startDateString="2021-01-01 00:00:00",
        endDateString="2023-06-01 00:00:00",
        endTestDateString="2024-01-01 00:00:00",
        chunk_period=1440,
        weight_interpolation_period=1440,
        initial_pool_value=1000000.0,
        fees=0.003,
        optimisation_settings=optimisation_settings,
    )

    # Ten trials keeps the example short
    best_params = quick_tune(run_fingerprint, n_trials=10)
    print(f"\nBest params: {best_params}")