Source code for boost_loss.regression.sklearn

from __future__ import annotations

from typing import Any, Literal, Sequence, overload

import numpy as np
from joblib import Parallel, delayed
from numpy.random import RandomState
from numpy.typing import NDArray
from sklearn.base import BaseEstimator
from typing_extensions import Self

from ..base import LossBase
from ..sklearn import apply_custom_loss
from .asymmetric import AsymmetricLoss


def _create_random_random_state(random_state: int | RandomState | None) -> RandomState:
    """Derive a fresh, independently seeded ``RandomState`` from ``random_state``.

    Parameters
    ----------
    random_state : int | RandomState | None
        Seed source. An ``int`` or ``None`` is used to seed a temporary
        generator. A ``RandomState`` instance is used directly, which advances
        its internal state, so successive calls with the same instance return
        differently seeded generators.

    Returns
    -------
    RandomState
        A new generator seeded with a single int32-range draw from the source.
    """
    # ``RandomState(<RandomState instance>)`` raises TypeError (numpy cannot
    # interpret the object as a seed), so an existing generator is reused.
    if isinstance(random_state, RandomState):
        source = random_state
    else:
        source = RandomState(random_state)
    seed = source.randint(0, np.iinfo(np.int32).max)
    return RandomState(seed)


def _recursively_set_random_state(
    estimator: BaseEstimator, random_state: int | RandomState | None
) -> None:
    """Re-seed every already-set random-state parameter of ``estimator`` in place.

    All parameters returned by ``estimator.get_params(deep=True)`` are scanned.
    Each parameter named ``random_state`` (the estimator's own) or ending with
    ``_random_state`` (e.g. nested ``step__random_state``) whose current value
    is not ``None`` is replaced with a new ``RandomState`` derived from
    ``random_state``. Parameters that are ``None`` are left untouched.

    Parameters
    ----------
    estimator : BaseEstimator
        Estimator whose parameters are updated through ``set_params``.
    random_state : int | RandomState | None
        Seed source for the generated random states. A ``RandomState``
        instance is used directly and advanced.
    """
    # ``RandomState(<RandomState instance>)`` raises TypeError, so reuse it.
    if isinstance(random_state, RandomState):
        state = random_state
    else:
        state = RandomState(random_state)
    max_seed = np.iinfo(np.int32).max
    for k, v in estimator.get_params(deep=True).items():
        if not isinstance(k, str) or v is None:
            continue
        # The top-level parameter is plain "random_state"; it does not end
        # with "_random_state" and was previously skipped.
        if k == "random_state" or k.endswith("_random_state"):
            # Pass an integer seed (not the generator itself) so every
            # parameter receives its own independently seeded RandomState.
            estimator.set_params(
                **{k: _create_random_random_state(state.randint(0, max_seed))}
            )


class VarianceEstimator(BaseEstimator):
    """Estimator that estimates the distribution by simply using
    multiple estimators with different `t`.

    Compared to [NGBoost](https://stanfordmlgroup.github.io/projects/ngboost/)
    or [CatBoost's Uncertainty](https://catboost.ai/en/docs/references/uncertainty),
    this estimator is much slower and does not support "natural gradient",
    but does not require any assumption on the distribution.
    Note that NGBoost supports
    [any user-defined distribution](https://stanfordmlgroup.github.io/ngboost/5-dev.html) # noqa
    but it has to be defined beforehand.

    NGBoost requires mean estimator and log standard deviation estimator
    to be trained simultaneously, which is very difficult to implement
    in sklearn / lightgbm / xgboost. (Need to start and stop fitting per iteration)
    Consider change `Base` parameter in NGBoost.
    (See https://github.com/stanfordmlgroup/ngboost/issues/250)
    """

    # Values of `t` actually used; set by `fit`.
    ts_: Sequence[float]
    m_type: Literal["mean", "median"]
    var_type: Literal["var", "std", "range", "mae", "mse"]

    # Every aggregation name accepted by `predict` / `predict_var`.
    _PREDICT_TYPES = ("mean", "median", "var", "std", "range", "mae", "mse")

    def __init__(
        self,
        estimator: Any,
        loss: LossBase,
        *,
        ts: int | Sequence[float],
        n_jobs: int | None = 1,
        verbose: int = 0,
        random_state: int | None = None,
        m_type: Literal["mean", "median"] = "mean",
        var_type: Literal["var", "std", "range", "mae", "mse"] = "std",
        apply_objective: bool = True,
        apply_eval_metric: bool = True,
        target_transformer: BaseEstimator | Any | None = None,
        recursive: bool = True,
        recursive_strict: bool = False,
    ) -> None:
        """Store the configuration; no fitting state is created here.

        Parameters
        ----------
        estimator : Any
            The base estimator to use for fitting the data.
        loss : LossBase
            The loss function to use for fitting the data.
            Generally, `loss` should not be `AsymmetricLoss`.
        ts : int | Sequence[float]
            The list of `t` to use for fitting the data or the number of `t` to use.
            If `ts` is an integer,
            `np.linspace(1 / (ts * 2), 1 - 1 / (ts * 2), ts)` is used.
        n_jobs : int | None, optional
            The number of jobs to run in parallel for `fit`. `None` means 1.
        verbose : int, optional
            The verbosity level.
        random_state : int | None, optional
            The random state to use for fitting the data.
            If `None`, the random state is not set.
            If not `None`, new random state generated from `random_state`
            is set to each estimator.
        m_type : Literal["mean", "median"], optional
            M-statistics type to return from `predict` by default,
            by default "mean"
        var_type : Literal["var", "std", "range", "mae", "mse"], optional
            Variance type to return from `predict` by default,
            by default "std"
        apply_objective : bool, optional
            Whether to apply the custom loss to the estimator's objective,
            by default True
        apply_eval_metric : bool, optional
            Whether to apply the custom loss to the estimator's eval_metric,
            by default True
        target_transformer : BaseEstimator | Any | None, optional
            The transformer to use for transforming the target, by default None
            If `None`, no `TransformedTargetRegressor` is used.
        recursive : bool, optional
            Whether to recursively patch the estimator, by default True
        recursive_strict : bool, optional
            Whether to recursively patch the estimator's attributes,
            lists, tuples, sets, and frozensets as well, by default False

        Raises
        ------
        TypeError
            Raises if `estimator` does not have `fit` method or `predict` method.
        """
        if not hasattr(estimator, "fit"):
            raise TypeError(f"{estimator} does not have fit method")
        if not hasattr(estimator, "predict"):
            raise TypeError(f"{estimator} does not have predict method")
        self.estimator = estimator
        self.loss = loss
        self.ts = ts
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.random_state = random_state
        self.m_type = m_type
        self.var_type = var_type
        self.apply_objective = apply_objective
        self.apply_eval_metric = apply_eval_metric
        self.target_transformer = target_transformer
        self.recursive = recursive
        self.recursive_strict = recursive_strict
        # NOTE: `random_state_` is intentionally created in `fit`, not here.
        # Trailing-underscore attributes mark a fitted estimator in sklearn,
        # and building the generator here would ignore later
        # `set_params(random_state=...)` calls and make repeated fits differ.

    def fit(self, X: Any, y: Any, **fit_params: Any) -> Self:
        """Fit each estimator with different `t`.

        Parameters
        ----------
        X : Any
            The training input samples.
        y : Any
            The target values.
        **fit_params : Any
            The parameters to be passed to `fit` method of each estimator.

        Returns
        -------
        Self
            Fitted estimator.

        Raises
        ------
        RuntimeError
            Raises if joblib fails to return the results.
        """
        ts = self.ts
        if isinstance(ts, int):
            # Midpoints of `ts` equal-width bins of the open interval (0, 1).
            ts = np.linspace(1 / (ts * 2), 1 - 1 / (ts * 2), ts)
        self.ts_ = ts  # type: ignore

        estimators_ = [
            apply_custom_loss(
                self.estimator,
                AsymmetricLoss(self.loss, t=t),
                apply_objective=self.apply_objective,
                apply_eval_metric=self.apply_eval_metric,
                target_transformer=self.target_transformer,
                recursive=self.recursive,
                recursive_strict=self.recursive_strict,
            )
            for t in self.ts_
        ]

        # Build the generator from the current `random_state` parameter on
        # every fit so that fitting is reproducible; `None` means
        # "leave the estimators' random states untouched".
        self.random_state_ = (
            None if self.random_state is None else RandomState(self.random_state)
        )
        if self.random_state_ is not None:
            # set different random state for each estimator
            # otherwise, estimators will be identical
            for estimator in estimators_:
                _recursively_set_random_state(
                    estimator, self.random_state_.randint(0, np.iinfo(np.int32).max)
                )

        parallel_result = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            [delayed(estimator.fit)(X, y, **fit_params) for estimator in estimators_]
        )
        if parallel_result is None:
            raise RuntimeError("joblib.Parallel returned None")
        # Keep what `fit` returned rather than `estimators_`: with a
        # process-based joblib backend the local objects are not fitted in place.
        self.estimators_ = parallel_result
        return self

    def predict_raw(self, X: Any, **predict_params: Any) -> NDArray[Any]:
        """Returns raw predictions of each estimator.

        Parameters
        ----------
        X : Any
            X
        **predict_params : Any
            The parameters to be passed to `predict` method of each estimator.

        Returns
        -------
        NDArray[Any]
            Raw predictions of each estimator
            with shape (n_estimators, n_samples)
        """
        return np.array(
            [estimator.predict(X, **predict_params) for estimator in self.estimators_]
        )

    @staticmethod
    def _aggregate(raw: NDArray[Any], type_: str) -> NDArray[Any]:
        """Reduce raw per-estimator predictions over the estimator axis (axis 0)."""
        if type_ == "mean":
            return raw.mean(axis=0)
        if type_ == "median":
            return np.median(raw, axis=0)
        if type_ == "var":
            return raw.var(axis=0)
        if type_ == "std":
            return raw.std(axis=0)
        if type_ == "range":
            return raw.max(axis=0) - raw.min(axis=0)
        if type_ in ("mae", "mse"):
            deviation = raw - raw.mean(axis=0)
            if type_ == "mae":
                return np.abs(deviation).mean(axis=0)
            return (deviation**2).mean(axis=0)
        raise ValueError(f"Unknown type_: {type_}")

    @overload
    def predict(
        self,
        X: Any,
        type_: Literal["mean", "median", "var", "std", "range", "mae", "mse"]
        | None = None,
        return_std: Literal[False] = False,
        **predict_params: Any,
    ) -> NDArray[Any]:
        ...

    @overload
    def predict(
        self,
        X: Any,
        type_: tuple[
            Literal["mean", "median"], Literal["var", "std", "range", "mae", "mse"]
        ]
        | None = None,
        return_std: Literal[True] = ...,
        **predict_params: Any,
    ) -> tuple[NDArray[Any], NDArray[Any]]:
        ...

    def predict(
        self,
        X: Any,
        type_: Literal["mean", "median", "var", "std", "range", "mae", "mse"]
        | tuple[Literal["mean", "median"], Literal["var", "std", "range", "mae", "mse"]]
        | None = None,
        return_std: bool = False,
        **predict_params: Any,
    ) -> NDArray[Any] | tuple[NDArray[Any], NDArray[Any]]:
        """Returns predictions of the ensemble.

        Parameters
        ----------
        X : Any
            X
        type_ : Literal['mean', 'median', 'var', 'std', 'range', 'mae', 'mse'] or tuple, optional
            Type of the prediction, by default None
            If None, self.m_type is used.
            If a tuple `(m_type, var_type)`, a tuple of both aggregations
            is returned.
        return_std : bool, optional
            Whether to return a tuple of (predictions, standard deviation),
            by default False. The second element uses `self.var_type`
            unless `type_` is a tuple.
        **predict_params : Any
            The parameters to be passed to `predict` method of each estimator.

        Returns
        -------
        NDArray[Any] | tuple[NDArray[Any], NDArray[Any]]
            Predictions of the ensemble with shape (n_samples,), or a pair of
            such arrays when `return_std` is true or `type_` is a tuple.

        Raises
        ------
        ValueError
            When type_ is not supported.
        """
        want_pair = return_std or isinstance(type_, tuple)
        if not want_pair:
            requested = (type_ or self.m_type,)
        elif isinstance(type_, str):
            requested = (type_, self.var_type)
        elif type_ is None:
            requested = (self.m_type, self.var_type)
        else:
            # Missing (falsy) tuple entries fall back to the configured defaults.
            requested = (type_[0] or self.m_type, type_[1] or self.var_type)

        # Validate before predicting so a bad `type_` fails without running
        # every underlying estimator.
        for name in requested:
            if name not in self._PREDICT_TYPES:
                raise ValueError(f"Unknown type_: {name}")

        # Run the underlying estimators exactly once and derive every
        # requested statistic from the same raw predictions.
        raw = self.predict_raw(X, **predict_params)
        if want_pair:
            return self._aggregate(raw, requested[0]), self._aggregate(
                raw, requested[1]
            )
        return self._aggregate(raw, requested[0])

    def predict_var(
        self,
        X: Any,
        type_: Literal["var", "std", "range", "mae", "mse"] | None = None,
        **predict_params: Any,
    ) -> NDArray[Any]:
        """Returns variance of the ensemble.

        Parameters
        ----------
        X : Any
            X
        type_ : Literal['var', 'std', 'range', 'mae', 'mse'], optional
            Type of the variance, by default None
            If None, self.var_type is used.
        **predict_params : Any
            The parameters to be passed to `predict` method of each estimator.

        Returns
        -------
        NDArray[Any]
            Variance of the ensemble with shape (n_samples,)
        """
        type_ = type_ or self.var_type
        return self.predict(X, type_=type_, **predict_params)