mlsauce

  1try:
  2    from .adaopt import AdaOpt
  3except ImportError as e:
  4    print(f"Could not import some modules: {e}")
  5
  6try:
  7    from .booster import (
  8        LSBoostClassifier,
  9        LSBoostRegressor,
 10        GenericBoostingClassifier,
 11        GenericBoostingRegressor,
 12    )
 13except ImportError as e:
 14    print(f"Could not import some modules: {e}")
 15
 16try:
 17    from .lazybooster import (
 18        LazyBoostingClassifier,
 19        LazyBoostingRegressor,
 20        LazyBoostingMTS,
 21    )
 22except ImportError as e:
 23    print(f"Could not import some modules: {e}")
 24
 25try:
 26    from .multitaskregressor import MultiTaskRegressor
 27except ImportError as e:
 28    print(f"Could not import some modules: {e}")
 29
 30try:
 31    from .datasets import download
 32except ImportError as e:
 33    print(f"Could not import some modules: {e}")
 34
 35try:
 36    from .elasticnet import ElasticNetRegressor
 37except ImportError as e:
 38    print(f"Could not import some modules: {e}")
 39
 40try:
 41    from .kernelridge import KRLSRegressor
 42except ImportError as e:
 43    print(f"Could not import some modules: {e}")
 44
 45try:
 46    from .lasso import LassoRegressor
 47except ImportError as e:
 48    print(f"Could not import some modules: {e}")
 49
 50try:
 51    from .ridge import RidgeRegressor
 52except ImportError as e:
 53    print(f"Could not import some modules: {e}")
 54
 55try:
 56    from .stump import StumpClassifier
 57except ImportError as e:
 58    print(f"Could not import some modules: {e}")
 59
 60try:
 61    from .isotonicregression import IsotonicRegressor
 62except ImportError as e:
 63    print(f"Could not import some modules: {e}")
 64
 65try:
 66    from .fpca import GenericFunctionalForecaster
 67except ImportError as e:
 68    print(f"Could not import some modules: {e}")
 69
 70try:
 71    from .generators import (
 72        make_diverse_classification,
 73        HealthcareTimeSeriesGenerator,
 74        generate_synthetic_returns,
 75        plot_synthetic_returns,
 76    )
 77except ImportError as e:
 78    print(f"Could not import generators: {e}")
 79
 80try:
 81    from .catencoder import RankTargetEncoder
 82except ImportError as e:
 83    print(f"Could not import RankTargetEncoder: {e}")
 84
 85try:
 86    from .rollingoriginregression import RollingOriginForecaster
 87except ImportError as e:
 88    print(f"Could not import RollingOriginForecaster: {e}")
 89# from .encoders import corrtarget_encoder
 90
 91try:
 92    from .penalizedcv import penalized_cross_val_score
 93except ImportError as e:
 94    print(f"Could not import penalized_cross_val_score: {e}")
 95
 96try:
 97    from .conformalbayesian import ConformalBayesianRegressor
 98except ImportError as e:
 99    print(f"Could not import ConformalBayesianRegressor: {e}")
100
101try:
102    from .conformalbayesian import ConformalBayesianClassifier
103except ImportError as e:
104    print(f"Could not import ConformalBayesianClassifier: {e}")
105
106try:
107    from .contextawaretheta import ContextAwareThetaForecaster
108except ImportError as e:
109    print(f"Could not import ContextAwareThetaForecaster: {e}")
110
111
112__all__ = [
113    "AdaOpt",
114    "ConformalBayesianRegressor",
115    "ConformalBayesianClassifier",
116    "ContextAwareThetaForecaster",
117    "LSBoostClassifier",
118    "GenericBoostingClassifier",
119    "GenericBoostingRegressor",
120    "StumpClassifier",
121    "ElasticNetRegressor",
122    "KRLSRegressor",
123    "LassoRegressor",
124    "LSBoostRegressor",
125    "LSTMRegressor",
126    "RidgeRegressor",
127    "LazyBoostingClassifier",
128    "LazyBoostingMTS",
129    "LazyBoostingRegressor",
130    "MultiTaskRegressor",
131    "IsotonicRegressor",
132    "GenericFunctionalForecaster",
133    "RankTargetEncoder",
134    "RollingOriginForecaster",
135    # Other imports
136    # "corrtarget_encoder",
137    "download",
138    # Non-modules:
139    "get_config",
140    "set_config",
141    "config_context",
142    "penalized_cross_val_score",
143    "make_diverse_classification",
144    "HealthcareTimeSeriesGenerator",
145    "generate_synthetic_returns",
146    "plot_synthetic_returns",
147]
class AdaOpt(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
 19class AdaOpt(BaseEstimator, ClassifierMixin):
 20    """AdaOpt classifier.
 21
 22    Attributes:
 23
 24        n_iterations: int
 25            number of iterations of the optimizer at training time.
 26
 27        learning_rate: float
 28            controls the speed of the optimizer at training time.
 29
 30        reg_lambda: float
 31            L2 regularization parameter for successive errors in the optimizer
 32            (at training time).
 33
 34        reg_alpha: float
 35            L1 regularization parameter for successive errors in the optimizer
 36            (at training time).
 37
 38        eta: float
 39            controls the slope in gradient descent (at training time).
 40
 41        gamma: float
 42            controls the step size in gradient descent (at training time).
 43
 44        k: int
 45            number of nearest neighbors selected at test time for classification.
 46
 47        tolerance: float
 48            controls early stopping in gradient descent (at training time).
 49
 50        n_clusters: int
 51            number of clusters, if MiniBatch k-means is used at test time
 52            (for faster prediction).
 53
 54        batch_size: int
 55            size of the batch, if MiniBatch k-means is used at test time
 56            (for faster prediction).
 57
 58        row_sample: float
 59            percentage of rows chosen from training set (by stratified subsampling,
 60            for faster prediction).
 61
 62        type_dist: str
 63            distance used for finding the nearest neighbors; currently `euclidean-f`
 64            (euclidean distances calculated as whole), `euclidean` (euclidean distances
 65            calculated row by row), `cosine` (cosine distance).
 66
 67        n_jobs: int
 68            number of cpus for parallel processing (default: None)
 69
 70        verbose: int
 71            progress bar for parallel processing (yes = 1) or not (no = 0)
 72
 73        cache: boolean
 74            if the nearest neighbors are cached or not, for faster retrieval in
 75            subsequent calls.
 76
 77        n_clusters_input: int
 78            number of clusters (a priori) for clustering the features
 79
 80        clustering_method: str
 81            clustering method: currently 'kmeans', 'gmm'
 82
 83        cluster_scaling: str
 84            scaling method for clustering: currently 'standard', 'robust', 'minmax'
 85
 86        backend: str
 87            backend for parallel processing: "cpu" or "gpu" or "tpu"
 88
 89        seed: int
 90            reproducibility seed for nodes_sim=='uniform', clustering and dropout.
 91
 92    """
 93
 94    def __init__(
 95        self,
 96        n_iterations=50,
 97        learning_rate=0.3,
 98        reg_lambda=0.1,
 99        reg_alpha=0.5,
100        eta=0.01,
101        gamma=0.01,
102        k=3,
103        tolerance=0,
104        n_clusters=0,
105        batch_size=100,
106        row_sample=0.8,
107        type_dist="euclidean-f",
108        n_jobs=None,
109        verbose=0,
110        cache=True,
111        n_clusters_input=0,
112        clustering_method="kmeans",
113        cluster_scaling="standard",
114        backend="cpu",
115        seed=123,
116    ):
117        if n_clusters_input > 0:
118            assert clustering_method in (
119                "kmeans",
120                "gmm",
121            ), "`clustering_method` must be in ('kmeans', 'gmm')"
122            assert cluster_scaling in (
123                "standard",
124                "robust",
125                "minmax",
126            ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')"
127
128        assert type_dist in (
129            "euclidean",
130            "manhattan",
131            "euclidean-f",
132            "cosine",
133        ), "must have: `type_dist` in ('euclidean', 'manhattan', 'euclidean-f', 'cosine') "
134
135        self.n_iterations = n_iterations
136        self.learning_rate = learning_rate
137        self.reg_lambda = reg_lambda
138        self.reg_alpha = reg_alpha
139        self.eta = eta
140        self.gamma = gamma
141        self.k = k
142        self.tolerance = tolerance
143        self.n_clusters = n_clusters
144        self.batch_size = batch_size
145        self.row_sample = row_sample
146        self.type_dist = type_dist
147        self.n_jobs = n_jobs
148        self.cache = cache
149        self.verbose = verbose
150        self.n_clusters_input = n_clusters_input
151        self.clustering_method = clustering_method
152        self.cluster_scaling = cluster_scaling
153        self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None
154        self.backend = backend
155        self.seed = seed
156
157    def fit(self, X, y, **kwargs):
158        """Fit AdaOpt to training data (X, y)
159
160        Args:
161
162            X: {array-like}, shape = [n_samples, n_features]
163                Training vectors, where n_samples is the number
164                of samples and n_features is the number of features.
165
166            y: array-like, shape = [n_samples]
167                Target values.
168
169            **kwargs: additional parameters to be passed to self.cook_training_set.
170
171        Returns:
172
173            self: object.
174
175        """
176
177        if self.n_clusters_input > 0:
178            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
179                cluster(
180                    X,
181                    n_clusters=self.n_clusters_input,
182                    method=self.clustering_method,
183                    type_scaling=self.cluster_scaling,
184                    training=True,
185                    seed=self.seed,
186                )
187            )
188            X = np.column_stack((X.copy(), clustered_X))
189
190        if self.row_sample < 1:
191            index_subsample = subsample(
192                y, row_sample=self.row_sample, seed=self.seed
193            )
194            y_ = y[index_subsample]
195            X_ = X[index_subsample, :]
196        else:
197            y_ = deepcopy(y)
198            X_ = deepcopy(X)
199
200        n, p = X_.shape
201
202        n_classes = len(np.unique(y_))
203
204        assert n == len(y_), "must have X.shape[0] == len(y)"
205
206        res = adaoptc.fit_adaopt(
207            X=np.asarray(X_).astype(np.float64),
208            y=np.asarray(y_).astype(np.int64),
209            n_iterations=self.n_iterations,
210            n_X=n,
211            p_X=p,
212            n_classes=n_classes,
213            learning_rate=self.learning_rate,
214            reg_lambda=self.reg_lambda,
215            reg_alpha=self.reg_alpha,
216            eta=self.eta,
217            gamma=self.gamma,
218            tolerance=self.tolerance,
219            backend=self.backend,
220        )
221
222        self.probs_training = res["probs"]
223        self.training_accuracy = res["training_accuracy"]
224        self.alphas = res["alphas"]
225        self.n_iterations = res["n_iterations"]
226        self.scaled_X_train = np.array(res["scaled_X_train"], dtype=np.float64)
227        self.n_classes_ = len(np.unique(y))  # for compatibility with sklearn
228        return self
229
230    def predict(self, X, **kwargs):
231        """Predict test data X.
232
233        Args:
234
235            X: {array-like}, shape = [n_samples, n_features]
236                Training vectors, where n_samples is the number
237                of samples and n_features is the number of features.
238
239            **kwargs: additional parameters to be passed to `predict_proba`
240
241        Returns:
242
243            model predictions: {array-like}
244
245        """
246
247        return np.argmax(self.predict_proba(X, **kwargs), axis=1)
248
249    def predict_proba(self, X, **kwargs):
250        """Predict probabilities for test data X.
251
252        Args:
253
254            X: {array-like}, shape = [n_samples, n_features]
255                Training vectors, where n_samples is the number
256                of samples and n_features is the number of features.
257
258            **kwargs: additional parameters to be passed to
259                self.cook_test_set
260
261        Returns:
262
263            probability estimates for test data: {array-like}
264
265        """
266
267        n_train, p_train = self.scaled_X_train.shape
268
269        if self.n_clusters_input > 0:
270            X = np.column_stack(
271                (
272                    X.copy(),
273                    cluster(
274                        X,
275                        training=False,
276                        scaler=self.scaler_,
277                        label_encoder=self.label_encoder_,
278                        clusterer=self.clusterer_,
279                        seed=self.seed,
280                    ),
281                )
282            )
283
284        n_test = X.shape[0]
285
286        if self.n_jobs is None:
287            return adaoptc.predict_proba_adaopt(
288                X_test=np.asarray(X, order="C").astype(np.float64),
289                scaled_X_train=np.asarray(
290                    self.scaled_X_train, order="C"
291                ).astype(np.float64),
292                n_test=n_test,
293                n_train=n_train,
294                probs_train=self.probs_training,
295                k=self.k,
296                n_clusters=self.n_clusters,
297                batch_size=self.batch_size,
298                type_dist=self.type_dist,
299                cache=self.cache,
300                seed=self.seed,
301                backend=self.backend,
302            )
303
304        # parallel: self.n_jobs is not None
305        assert self.type_dist in (
306            "euclidean",
307            "manhattan",
308            "cosine",
309        ), "must have: `self.type_dist` in ('euclidean', 'manhattan', 'cosine') "
310
311        scaled_X_test = X / norm(X, ord=2, axis=1)[:, None]
312
313        if self.type_dist == "euclidean":
314
315            @delayed
316            @wrap_non_picklable_objects
317            def multiproc_func(i):
318                dists_test_i = adaoptc.distance_to_mat_euclidean2(
319                    np.asarray(scaled_X_test.astype(np.float64), order="C")[
320                        i, :
321                    ],
322                    np.asarray(
323                        self.scaled_X_train.astype(np.float64), order="C"
324                    ),
325                    np.zeros(n_train),
326                    n_train,
327                    p_train,
328                )
329
330                kmin_test_i = adaoptc.find_kmin_x(
331                    dists_test_i, n_x=n_train, k=self.k, cache=self.cache
332                )
333
334                weights_test_i = adaoptc.calculate_weights(kmin_test_i[0])
335
336                probs_test_i = adaoptc.calculate_probs(
337                    kmin_test_i[1], self.probs_training
338                )
339
340                return adaoptc.average_probs(
341                    probs=probs_test_i, weights=weights_test_i
342                )
343
344        if self.type_dist == "manhattan":
345
346            @delayed
347            @wrap_non_picklable_objects
348            def multiproc_func(i):
349                dists_test_i = adaoptc.distance_to_mat_manhattan2(
350                    np.asarray(scaled_X_test.astype(np.float64), order="C")[
351                        i, :
352                    ],
353                    np.asarray(
354                        self.scaled_X_train.astype(np.float64), order="C"
355                    ),
356                    np.zeros(n_train),
357                    n_train,
358                    p_train,
359                )
360
361                kmin_test_i = adaoptc.find_kmin_x(
362                    dists_test_i, n_x=n_train, k=self.k, cache=self.cache
363                )
364
365                weights_test_i = adaoptc.calculate_weights(kmin_test_i[0])
366
367                probs_test_i = adaoptc.calculate_probs(
368                    kmin_test_i[1], self.probs_training
369                )
370
371                return adaoptc.average_probs(
372                    probs=probs_test_i, weights=weights_test_i
373                )
374
375        if self.type_dist == "cosine":
376
377            @delayed
378            @wrap_non_picklable_objects
379            def multiproc_func(i, *args):
380                dists_test_i = adaoptc.distance_to_mat_cosine2(
381                    np.asarray(scaled_X_test.astype(np.float64), order="C")[
382                        i, :
383                    ],
384                    np.asarray(
385                        self.scaled_X_train.astype(np.float64), order="C"
386                    ),
387                    np.zeros(n_train),
388                    n_train,
389                    p_train,
390                )
391
392                kmin_test_i = adaoptc.find_kmin_x(
393                    dists_test_i, n_x=n_train, k=self.k, cache=self.cache
394                )
395
396                weights_test_i = adaoptc.calculate_weights(kmin_test_i[0])
397
398                probs_test_i = adaoptc.calculate_probs(
399                    kmin_test_i[1], self.probs_training
400                )
401
402                return adaoptc.average_probs(
403                    probs=probs_test_i, weights=weights_test_i
404                )
405
406        if self.verbose == 1:
407            res = Parallel(n_jobs=self.n_jobs, prefer="threads")(
408                (multiproc_func)(m) for m in tqdm(range(n_test))
409            )
410
411        else:
412            res = Parallel(n_jobs=self.n_jobs, prefer="threads")(
413                (multiproc_func)(m) for m in range(n_test)
414            )
415
416        return np.asarray(res)

AdaOpt classifier.

Attributes:

n_iterations: int
    number of iterations of the optimizer at training time.

learning_rate: float
    controls the speed of the optimizer at training time.

reg_lambda: float
    L2 regularization parameter for successive errors in the optimizer
    (at training time).

reg_alpha: float
    L1 regularization parameter for successive errors in the optimizer
    (at training time).

eta: float
    controls the slope in gradient descent (at training time).

gamma: float
    controls the step size in gradient descent (at training time).

k: int
    number of nearest neighbors selected at test time for classification.

tolerance: float
    controls early stopping in gradient descent (at training time).

n_clusters: int
    number of clusters, if MiniBatch k-means is used at test time
    (for faster prediction).

batch_size: int
    size of the batch, if MiniBatch k-means is used at test time
    (for faster prediction).

row_sample: float
    percentage of rows chosen from training set (by stratified subsampling,
    for faster prediction).

type_dist: str
    distance used for finding the nearest neighbors; currently `euclidean-f`
    (euclidean distances calculated as whole), `euclidean` (euclidean distances
    calculated row by row), `manhattan` (manhattan distance), `cosine` (cosine distance).

n_jobs: int
    number of cpus for parallel processing (default: None)

verbose: int
    progress bar for parallel processing (yes = 1) or not (no = 0)

cache: boolean
    if the nearest neighbors are cached or not, for faster retrieval in
    subsequent calls.

n_clusters_input: int
    number of clusters (a priori) for clustering the features

clustering_method: str
    clustering method: currently 'kmeans', 'gmm'

cluster_scaling: str
    scaling method for clustering: currently 'standard', 'robust', 'minmax'

backend: str
    backend for parallel processing: "cpu" or "gpu" or "tpu"

seed: int
    reproducibility seed, passed to the input clustering, the row subsampling and the prediction routine.
def fit(self, X, y, **kwargs):
157    def fit(self, X, y, **kwargs):
158        """Fit AdaOpt to training data (X, y)
159
160        Args:
161
162            X: {array-like}, shape = [n_samples, n_features]
163                Training vectors, where n_samples is the number
164                of samples and n_features is the number of features.
165
166            y: array-like, shape = [n_samples]
167                Target values.
168
169            **kwargs: additional parameters to be passed to self.cook_training_set.
170
171        Returns:
172
173            self: object.
174
175        """
176
177        if self.n_clusters_input > 0:
178            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
179                cluster(
180                    X,
181                    n_clusters=self.n_clusters_input,
182                    method=self.clustering_method,
183                    type_scaling=self.cluster_scaling,
184                    training=True,
185                    seed=self.seed,
186                )
187            )
188            X = np.column_stack((X.copy(), clustered_X))
189
190        if self.row_sample < 1:
191            index_subsample = subsample(
192                y, row_sample=self.row_sample, seed=self.seed
193            )
194            y_ = y[index_subsample]
195            X_ = X[index_subsample, :]
196        else:
197            y_ = deepcopy(y)
198            X_ = deepcopy(X)
199
200        n, p = X_.shape
201
202        n_classes = len(np.unique(y_))
203
204        assert n == len(y_), "must have X.shape[0] == len(y)"
205
206        res = adaoptc.fit_adaopt(
207            X=np.asarray(X_).astype(np.float64),
208            y=np.asarray(y_).astype(np.int64),
209            n_iterations=self.n_iterations,
210            n_X=n,
211            p_X=p,
212            n_classes=n_classes,
213            learning_rate=self.learning_rate,
214            reg_lambda=self.reg_lambda,
215            reg_alpha=self.reg_alpha,
216            eta=self.eta,
217            gamma=self.gamma,
218            tolerance=self.tolerance,
219            backend=self.backend,
220        )
221
222        self.probs_training = res["probs"]
223        self.training_accuracy = res["training_accuracy"]
224        self.alphas = res["alphas"]
225        self.n_iterations = res["n_iterations"]
226        self.scaled_X_train = np.array(res["scaled_X_train"], dtype=np.float64)
227        self.n_classes_ = len(np.unique(y))  # for compatibility with sklearn
228        return self

Fit AdaOpt to training data (X, y)

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

y: array-like, shape = [n_samples]
    Target values.

**kwargs: additional keyword arguments; accepted but not used by this method.

Returns:

self: object.
def predict(self, X, **kwargs):
230    def predict(self, X, **kwargs):
231        """Predict test data X.
232
233        Args:
234
235            X: {array-like}, shape = [n_samples, n_features]
236                Training vectors, where n_samples is the number
237                of samples and n_features is the number of features.
238
239            **kwargs: additional parameters to be passed to `predict_proba`
240
241        Returns:
242
243            model predictions: {array-like}
244
245        """
246
247        return np.argmax(self.predict_proba(X, **kwargs), axis=1)

Predict test data X.

Args:

X: {array-like}, shape = [n_samples, n_features]
    Test vectors, where n_samples is the number
    of samples and n_features is the number of features.

**kwargs: additional parameters to be passed to `predict_proba`

Returns:

model predictions: {array-like}
def predict_proba(self, X, **kwargs):
249    def predict_proba(self, X, **kwargs):
250        """Predict probabilities for test data X.
251
252        Args:
253
254            X: {array-like}, shape = [n_samples, n_features]
255                Training vectors, where n_samples is the number
256                of samples and n_features is the number of features.
257
258            **kwargs: additional parameters to be passed to
259                self.cook_test_set
260
261        Returns:
262
263            probability estimates for test data: {array-like}
264
265        """
266
267        n_train, p_train = self.scaled_X_train.shape
268
269        if self.n_clusters_input > 0:
270            X = np.column_stack(
271                (
272                    X.copy(),
273                    cluster(
274                        X,
275                        training=False,
276                        scaler=self.scaler_,
277                        label_encoder=self.label_encoder_,
278                        clusterer=self.clusterer_,
279                        seed=self.seed,
280                    ),
281                )
282            )
283
284        n_test = X.shape[0]
285
286        if self.n_jobs is None:
287            return adaoptc.predict_proba_adaopt(
288                X_test=np.asarray(X, order="C").astype(np.float64),
289                scaled_X_train=np.asarray(
290                    self.scaled_X_train, order="C"
291                ).astype(np.float64),
292                n_test=n_test,
293                n_train=n_train,
294                probs_train=self.probs_training,
295                k=self.k,
296                n_clusters=self.n_clusters,
297                batch_size=self.batch_size,
298                type_dist=self.type_dist,
299                cache=self.cache,
300                seed=self.seed,
301                backend=self.backend,
302            )
303
304        # parallel: self.n_jobs is not None
305        assert self.type_dist in (
306            "euclidean",
307            "manhattan",
308            "cosine",
309        ), "must have: `self.type_dist` in ('euclidean', 'manhattan', 'cosine') "
310
311        scaled_X_test = X / norm(X, ord=2, axis=1)[:, None]
312
313        if self.type_dist == "euclidean":
314
315            @delayed
316            @wrap_non_picklable_objects
317            def multiproc_func(i):
318                dists_test_i = adaoptc.distance_to_mat_euclidean2(
319                    np.asarray(scaled_X_test.astype(np.float64), order="C")[
320                        i, :
321                    ],
322                    np.asarray(
323                        self.scaled_X_train.astype(np.float64), order="C"
324                    ),
325                    np.zeros(n_train),
326                    n_train,
327                    p_train,
328                )
329
330                kmin_test_i = adaoptc.find_kmin_x(
331                    dists_test_i, n_x=n_train, k=self.k, cache=self.cache
332                )
333
334                weights_test_i = adaoptc.calculate_weights(kmin_test_i[0])
335
336                probs_test_i = adaoptc.calculate_probs(
337                    kmin_test_i[1], self.probs_training
338                )
339
340                return adaoptc.average_probs(
341                    probs=probs_test_i, weights=weights_test_i
342                )
343
344        if self.type_dist == "manhattan":
345
346            @delayed
347            @wrap_non_picklable_objects
348            def multiproc_func(i):
349                dists_test_i = adaoptc.distance_to_mat_manhattan2(
350                    np.asarray(scaled_X_test.astype(np.float64), order="C")[
351                        i, :
352                    ],
353                    np.asarray(
354                        self.scaled_X_train.astype(np.float64), order="C"
355                    ),
356                    np.zeros(n_train),
357                    n_train,
358                    p_train,
359                )
360
361                kmin_test_i = adaoptc.find_kmin_x(
362                    dists_test_i, n_x=n_train, k=self.k, cache=self.cache
363                )
364
365                weights_test_i = adaoptc.calculate_weights(kmin_test_i[0])
366
367                probs_test_i = adaoptc.calculate_probs(
368                    kmin_test_i[1], self.probs_training
369                )
370
371                return adaoptc.average_probs(
372                    probs=probs_test_i, weights=weights_test_i
373                )
374
375        if self.type_dist == "cosine":
376
377            @delayed
378            @wrap_non_picklable_objects
379            def multiproc_func(i, *args):
380                dists_test_i = adaoptc.distance_to_mat_cosine2(
381                    np.asarray(scaled_X_test.astype(np.float64), order="C")[
382                        i, :
383                    ],
384                    np.asarray(
385                        self.scaled_X_train.astype(np.float64), order="C"
386                    ),
387                    np.zeros(n_train),
388                    n_train,
389                    p_train,
390                )
391
392                kmin_test_i = adaoptc.find_kmin_x(
393                    dists_test_i, n_x=n_train, k=self.k, cache=self.cache
394                )
395
396                weights_test_i = adaoptc.calculate_weights(kmin_test_i[0])
397
398                probs_test_i = adaoptc.calculate_probs(
399                    kmin_test_i[1], self.probs_training
400                )
401
402                return adaoptc.average_probs(
403                    probs=probs_test_i, weights=weights_test_i
404                )
405
406        if self.verbose == 1:
407            res = Parallel(n_jobs=self.n_jobs, prefer="threads")(
408                (multiproc_func)(m) for m in tqdm(range(n_test))
409            )
410
411        else:
412            res = Parallel(n_jobs=self.n_jobs, prefer="threads")(
413                (multiproc_func)(m) for m in range(n_test)
414            )
415
416        return np.asarray(res)

Predict probabilities for test data X.

Args:

X: {array-like}, shape = [n_samples, n_features]
    Test vectors, where n_samples is the number
    of samples and n_features is the number of features.

**kwargs: additional keyword arguments; accepted but not used by this method.

Returns:

probability estimates for test data: {array-like}
class ConformalBayesianRegressor(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
 19class ConformalBayesianRegressor(BaseEstimator, RegressorMixin):
 20    def __init__(
 21        self,
 22        obj=Ridge,
 23        level=95,
 24        hyperparameter_bounds=None,
 25        n_samples=20,
 26        calibration_fraction=0.2,
 27        scaling_method="standard",
 28        random_state=None,
 29        show_progress=False,
 30        verbose=True,
 31        n_jobs=-1,
 32    ):
 33        self.obj = obj
 34        self.level = level
 35        self.alpha_ = 1 - self.level / 100
 36        self.hyperparameter_bounds = hyperparameter_bounds
 37        self.n_samples = n_samples
 38        self.calibration_fraction = calibration_fraction
 39        self.scaling_method = scaling_method
 40        self.random_state = random_state
 41        self.verbose = verbose
 42        self.show_progress = show_progress
 43        self.n_jobs = n_jobs
 44
 45        self.is_fitted_ = False
 46
 47    def _sample_hyperparameters(self):
 48        # Simple uniform sampling or use fixed bounds
 49        configs = []
 50        for _ in range(self.n_samples):
 51            if self.hyperparameter_bounds:
 52                cfg = {}
 53                for k, v in self.hyperparameter_bounds.items():
 54                    if isinstance(v, list) and len(v) == 2:
 55                        # Always sample as float first
 56                        sampled_value = np.random.uniform(v[0], v[1])
 57
 58                        # If both bounds are integers, assume integer parameter
 59                        if isinstance(v[0], (int, np.integer)) and isinstance(
 60                            v[1], (int, np.integer)
 61                        ):
 62                            cfg[k] = int(sampled_value)
 63                        else:
 64                            cfg[k] = sampled_value
 65                    else:
 66                        # Assume fixed value
 67                        cfg[k] = v
 68            else:
 69                cfg = {}
 70            configs.append(cfg)
 71        return configs
 72
 73    def _train_models_parallel(self, X, y, configs):
 74        def train_model(cfg):
 75            self.obj.set_params(**cfg)
 76            self.obj.fit(X, y)
 77            return self.obj
 78
 79        if self.show_progress == False:
 80            models = joblib.Parallel(n_jobs=self.n_jobs)(
 81                joblib.delayed(train_model)(cfg)
 82                for cfg in tqdm(configs, disable=not self.verbose)
 83            )
 84        else:
 85            models = joblib.Parallel(n_jobs=self.n_jobs)(
 86                joblib.delayed(train_model)(cfg) for cfg in configs
 87            )
 88        return models
 89
 90    def _predict_models(self, models, X):
 91        if self.show_progress == False:
 92            preds = [m.predict(X) for m in models]
 93        else:
 94            preds = [
 95                m.predict(X) for m in tqdm(models, disable=not self.verbose)
 96            ]
 97        return np.column_stack(preds)
 98
 99    def fit(self, X, y):
100        if hasattr(X, "values"):  # keep DataFrame if possible
101            X = X.copy()
102        else:
103            X = np.asarray(X)
104        X = np.asarray(X)
105        y = np.asarray(y)
106        rng = check_random_state(self.random_state)
107        X, y = shuffle(X, y, random_state=rng)
108        # 1. Cluster with GMM (full covariance)
109        n_clusters = min(10, X.shape[0] // 30)
110        gmm = GaussianMixture(
111            n_components=n_clusters,
112            covariance_type="full",
113            random_state=self.random_state,
114        )
115        clusters = gmm.fit_predict(X)
116        # 2. Stratified train/calibration split
117        X_train, X_calib, y_train, y_calib = train_test_split(
118            X,
119            y,
120            test_size=self.calibration_fraction,
121            random_state=self.random_state,
122            stratify=clusters,
123        )
124        # 3. Scale features
125        if self.scaling_method == "standard":
126            scaler = StandardScaler().fit(X_train)
127        else:
128            raise ValueError("Scaling method must be 'standard'")
129        self.scaler_ = scaler
130        X_train_s = scaler.transform(X_train)
131        X_calib_s = scaler.transform(X_calib)
132        # 4. Train ensemble
133        configs = self._sample_hyperparameters()
134        self.models_ = self._train_models_parallel(X_train_s, y_train, configs)
135        # 5. Calibration residuals
136        preds_calib = self._predict_models(self.models_, X_calib_s)
137        self.calibration_residuals_ = y_calib - np.median(preds_calib, axis=1)
138        self.is_fitted_ = True
139        return self
140
141    def predict(self, X, return_pi=False):
142        """Obtain predictions and prediction intervals
143
144        Args:
145
146            X: array-like, shape = [n_samples, n_features];
147                Testing set vectors, where n_samples is the number
148                of samples and n_features is the number of features.
149
150            return_pi: boolean
151                Whether the prediction interval is returned or not.
152                Default is False, for compatibility with other _estimators_.
153                If True, a tuple containing the predictions + lower and upper
154                bounds is returned.
155
156        """
157        if not self.is_fitted_:
158            raise RuntimeError("Fit the model first")
159        X_s = self.scaler_.transform(X)
160        preds = self._predict_models(self.models_, X_s)
161        self.mean_ = np.median(preds, axis=1)
162        if return_pi == False:
163            return self.mean_
164        DescribeResult = namedtuple(
165            "DescribeResult", ("mean", "lower", "upper")
166        )
167        q = np.quantile(self.calibration_residuals_, q=self.alpha_ / 200)
168        return DescribeResult(self.mean_, self.mean_ + q, self.mean_ - q)
169
170    def get_coverage(self, y_true, lower, upper):
171        return np.mean((y_true >= lower) & (y_true <= upper))

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

  • setting and getting parameters used by GridSearchCV and friends;
  • textual and HTML representation displayed in terminals and IDEs;
  • estimator serialization;
  • parameters validation;
  • data validation;
  • feature names validation.

Read more in the :ref:`User Guide <rolling_your_own_estimator>`.

Notes

All estimators should specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs).

Examples

>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])
def fit(self, X, y):
 99    def fit(self, X, y):
100        if hasattr(X, "values"):  # keep DataFrame if possible
101            X = X.copy()
102        else:
103            X = np.asarray(X)
104        X = np.asarray(X)
105        y = np.asarray(y)
106        rng = check_random_state(self.random_state)
107        X, y = shuffle(X, y, random_state=rng)
108        # 1. Cluster with GMM (full covariance)
109        n_clusters = min(10, X.shape[0] // 30)
110        gmm = GaussianMixture(
111            n_components=n_clusters,
112            covariance_type="full",
113            random_state=self.random_state,
114        )
115        clusters = gmm.fit_predict(X)
116        # 2. Stratified train/calibration split
117        X_train, X_calib, y_train, y_calib = train_test_split(
118            X,
119            y,
120            test_size=self.calibration_fraction,
121            random_state=self.random_state,
122            stratify=clusters,
123        )
124        # 3. Scale features
125        if self.scaling_method == "standard":
126            scaler = StandardScaler().fit(X_train)
127        else:
128            raise ValueError("Scaling method must be 'standard'")
129        self.scaler_ = scaler
130        X_train_s = scaler.transform(X_train)
131        X_calib_s = scaler.transform(X_calib)
132        # 4. Train ensemble
133        configs = self._sample_hyperparameters()
134        self.models_ = self._train_models_parallel(X_train_s, y_train, configs)
135        # 5. Calibration residuals
136        preds_calib = self._predict_models(self.models_, X_calib_s)
137        self.calibration_residuals_ = y_calib - np.median(preds_calib, axis=1)
138        self.is_fitted_ = True
139        return self
def predict(self, X, return_pi=False):
141    def predict(self, X, return_pi=False):
142        """Obtain predictions and prediction intervals
143
144        Args:
145
146            X: array-like, shape = [n_samples, n_features];
147                Testing set vectors, where n_samples is the number
148                of samples and n_features is the number of features.
149
150            return_pi: boolean
151                Whether the prediction interval is returned or not.
152                Default is False, for compatibility with other _estimators_.
153                If True, a tuple containing the predictions + lower and upper
154                bounds is returned.
155
156        """
157        if not self.is_fitted_:
158            raise RuntimeError("Fit the model first")
159        X_s = self.scaler_.transform(X)
160        preds = self._predict_models(self.models_, X_s)
161        self.mean_ = np.median(preds, axis=1)
162        if return_pi == False:
163            return self.mean_
164        DescribeResult = namedtuple(
165            "DescribeResult", ("mean", "lower", "upper")
166        )
167        q = np.quantile(self.calibration_residuals_, q=self.alpha_ / 200)
168        return DescribeResult(self.mean_, self.mean_ + q, self.mean_ - q)

Obtain predictions and prediction intervals

Args:

X: array-like, shape = [n_samples, n_features];
    Testing set vectors, where n_samples is the number
    of samples and n_features is the number of features.

return_pi: boolean
    Whether the prediction interval is returned or not.
    Default is False, for compatibility with other _estimators_.
    If True, a tuple containing the predictions + lower and upper
    bounds is returned.
class ConformalBayesianClassifier(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
 21class ConformalBayesianClassifier(BaseEstimator, ClassifierMixin):
 22
 23    # construct the object -----
 24    _estimator_type = "classifier"
 25
 26    def __init__(
 27        self,
 28        obj=Ridge(),
 29        hyperparameter_bounds=None,
 30        n_samples=20,
 31        calibration_fraction=0.2,
 32        scaling_method="standard",
 33        random_state=None,
 34        verbose=True,
 35        n_jobs=-1,
 36        calibrated=False,
 37        calibration_method="sigmoid",
 38        calibration_cv=3,
 39        use_classifier=True,
 40    ):
 41        """
 42        Conformal Bayesian Classifier with optional probability calibration.
 43
 44        Parameters
 45        ----------
 46        obj : estimator object
 47            Base estimator (default: Ridge() for regression-based, or classifier if use_classifier=True)
 48        hyperparameter_bounds : dict, optional
 49            Bounds for hyperparameter sampling
 50        n_samples : int
 51            Number of ensemble models to train
 52        calibration_fraction : float
 53            Fraction of data to use for conformal calibration
 54        scaling_method : str
 55            Feature scaling method (default: "standard")
 56        random_state : int, optional
 57            Random seed for reproducibility
 58        verbose : bool
 59            Whether to show progress bars
 60        n_jobs : int
 61            Number of parallel jobs (-1 for all cores)
 62        calibrated : bool
 63            Whether to apply probability calibration (default: False)
 64        calibration_method : str
 65            Method for calibration: "sigmoid" or "isotonic" (default: "sigmoid")
 66        calibration_cv : int
 67            Number of CV folds for calibration (default: 3)
 68        use_classifier : bool
 69            If True, use sklearn-like classifier with probability averaging.
 70            If False, use regression-based approach with SimpleMultitaskClassifier (default: False)
 71        """
 72        self.obj_base = obj
 73        self.hyperparameter_bounds = hyperparameter_bounds
 74        self.n_samples = n_samples
 75        self.calibration_fraction = calibration_fraction
 76        self.scaling_method = scaling_method
 77        self.random_state = random_state
 78        self.verbose = verbose
 79        self.n_jobs = n_jobs
 80        self.calibrated = calibrated
 81        self.calibration_method = calibration_method
 82        self.calibration_cv = calibration_cv
 83        self.use_classifier = use_classifier
 84
 85        self.is_fitted_ = False
 86
 87    def _sample_hyperparameters(self):
 88        """Sample hyperparameters for ensemble members"""
 89        configs = []
 90
 91        # Get parameter constraints from the model if available
 92        param_constraints = {}
 93        if hasattr(self.obj_base, "_parameter_constraints"):
 94            param_constraints = self.obj_base._parameter_constraints
 95
 96        for _ in range(self.n_samples):
 97            if self.hyperparameter_bounds:
 98                cfg = {}
 99                for k, v in self.hyperparameter_bounds.items():
100                    if isinstance(v, list) and len(v) == 2:
101                        # If both bounds are integers, assume integer parameter
102                        if isinstance(v[0], (int, np.integer)) and isinstance(
103                            v[1], (int, np.integer)
104                        ):
105                            cfg[k] = np.random.randint(v[0], v[1] + 1)
106                        else:
107                            cfg[k] = np.random.uniform(v[0], v[1])
108                    else:
109                        # Assume fixed value
110                        cfg[k] = v
111            else:
112                cfg = {}
113            configs.append(cfg)
114        return configs
115
116    def _train_classifiers_parallel(self, X, y, configs):
117        """Train ensemble of classifiers in parallel"""
118
119        def train_classifier(cfg):
120            clf = clone(self.obj_base)
121            clf.set_params(**cfg)
122            clf.fit(X, y)
123            return clf
124
125        if self.verbose:
126            models = joblib.Parallel(n_jobs=self.n_jobs)(
127                joblib.delayed(train_classifier)(cfg)
128                for cfg in tqdm(configs, desc="Training classifiers")
129            )
130        else:
131            models = joblib.Parallel(n_jobs=self.n_jobs)(
132                joblib.delayed(train_classifier)(cfg) for cfg in configs
133            )
134        return models
135
136    def fit(self, X, y):
137        """
138        Fit the conformal Bayesian classifier.
139
140        Parameters
141        ----------
142        X : array-like of shape (n_samples, n_features)
143            Training data
144        y : array-like of shape (n_samples,)
145            Target values
146
147        Returns
148        -------
149        self : object
150            Fitted classifier
151        """
152        X = np.asarray(X)
153        y = np.asarray(y)
154
155        # Store classes
156        self.classes_ = np.unique(y)
157        self.n_classes_ = len(self.classes_)
158
159        if self.use_classifier:
160            # Simpler approach: use sklearn-like classifier with probability averaging
161            if not is_classifier(self.obj_base):
162                raise ValueError(
163                    "use_classifier=True requires obj to be a classifier "
164                    "(e.g., LogisticRegression, RandomForestClassifier)"
165                )
166
167            rng = check_random_state(self.random_state)
168            X, y = shuffle(X, y, random_state=rng)
169
170            # Stratified train/calibration split
171            X_train, X_calib, y_train, y_calib = train_test_split(
172                X,
173                y,
174                test_size=self.calibration_fraction,
175                random_state=self.random_state,
176                stratify=y,
177            )
178
179            # Scale features
180            if self.scaling_method == "standard":
181                self.scaler_ = StandardScaler().fit(X_train)
182            else:
183                raise ValueError("Scaling method must be 'standard'")
184
185            X_train_s = self.scaler_.transform(X_train)
186            X_calib_s = self.scaler_.transform(X_calib)
187
188            # Train ensemble of classifiers
189            configs = self._sample_hyperparameters()
190            self.classifiers_ = self._train_classifiers_parallel(
191                X_train_s, y_train, configs
192            )
193
194            # Store calibration data for potential conformal prediction
195            self.X_calib_ = X_calib_s
196            self.y_calib_ = y_calib
197
198            if self.calibrated:
199                # Calibrate each classifier
200                if self.verbose:
201                    print("Calibrating classifiers...")
202
203                self.calibrated_classifiers_ = []
204                for clf in tqdm(
205                    self.classifiers_,
206                    disable=not self.verbose,
207                    desc="Calibrating",
208                ):
209                    cal_clf = CalibratedClassifierCV(
210                        clf,
211                        method=self.calibration_method,
212                        cv="prefit",
213                        ensemble=False,
214                    )
215                    cal_clf.fit(X_calib_s, y_calib)
216                    self.calibrated_classifiers_.append(cal_clf)
217
218        else:
219            # Original approach: use regression-based with SimpleMultitaskClassifier
220            base_regressor = ConformalBayesianRegressor(
221                self.obj_base,
222                hyperparameter_bounds=self.hyperparameter_bounds,
223                n_samples=self.n_samples,
224                calibration_fraction=self.calibration_fraction,
225                scaling_method=self.scaling_method,
226                random_state=self.random_state,
227                verbose=self.verbose,
228                n_jobs=self.n_jobs,
229            )
230
231            # Wrap in multitask classifier
232            self.obj = ns.SimpleMultitaskClassifier(base_regressor)
233
234            if self.calibrated:
235                # Fit the base classifier first
236                self.obj.fit(X, y)
237
238                # Then wrap with calibration
239                self.calibrated_obj_ = CalibratedClassifierCV(
240                    self.obj,
241                    method=self.calibration_method,
242                    cv="prefit",
243                    ensemble=False,
244                )
245
246                # Fit calibration on the same data
247                self.calibrated_obj_.fit(X, y)
248            else:
249                self.obj.fit(X, y)
250
251        self.is_fitted_ = True
252        return self
253
254    def predict_proba(self, X):
255        """
256        Predict class probabilities.
257
258        Parameters
259        ----------
260        X : array-like of shape (n_samples, n_features)
261            Test data
262
263        Returns
264        -------
265        proba : array of shape (n_samples, n_classes)
266            Predicted probabilities for each class
267        """
268        if not self.is_fitted_:
269            raise RuntimeError("Fit the model first")
270
271        X = np.asarray(X)
272
273        if self.use_classifier:
274            # Average probabilities from ensemble
275            X_s = self.scaler_.transform(X)
276
277            if self.calibrated:
278                # Use calibrated classifiers
279                probas = np.array(
280                    [
281                        clf.predict_proba(X_s)
282                        for clf in self.calibrated_classifiers_
283                    ]
284                )
285            else:
286                # Use uncalibrated classifiers
287                probas = np.array(
288                    [clf.predict_proba(X_s) for clf in self.classifiers_]
289                )
290
291            # Average probabilities across ensemble
292            mean_proba = np.mean(probas, axis=0)
293
294            return mean_proba
295        else:
296            # Use regression-based approach
297            if self.calibrated:
298                return self.calibrated_obj_.predict_proba(X)
299            else:
300                return self.obj.predict_proba(X)
301
302    def predict(self, X):
303        """
304        Predict class labels.
305
306        Parameters
307        ----------
308        X : array-like of shape (n_samples, n_features)
309            Test data
310
311        Returns
312        -------
313        y_pred : array of shape (n_samples,)
314            Predicted class labels
315        """
316        proba = self.predict_proba(X)
317        return self.classes_[np.argmax(proba, axis=1)]
318
319    @property
320    def _estimator_type(self):
321        return "classifier"

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

  • setting and getting parameters used by GridSearchCV and friends;
  • textual and HTML representation displayed in terminals and IDEs;
  • estimator serialization;
  • parameters validation;
  • data validation;
  • feature names validation.

Read more in the :ref:`User Guide <rolling_your_own_estimator>`.

Notes

All estimators should specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs).

Examples

>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])
def fit(self, X, y):
136    def fit(self, X, y):
137        """
138        Fit the conformal Bayesian classifier.
139
140        Parameters
141        ----------
142        X : array-like of shape (n_samples, n_features)
143            Training data
144        y : array-like of shape (n_samples,)
145            Target values
146
147        Returns
148        -------
149        self : object
150            Fitted classifier
151        """
152        X = np.asarray(X)
153        y = np.asarray(y)
154
155        # Store classes
156        self.classes_ = np.unique(y)
157        self.n_classes_ = len(self.classes_)
158
159        if self.use_classifier:
160            # Simpler approach: use sklearn-like classifier with probability averaging
161            if not is_classifier(self.obj_base):
162                raise ValueError(
163                    "use_classifier=True requires obj to be a classifier "
164                    "(e.g., LogisticRegression, RandomForestClassifier)"
165                )
166
167            rng = check_random_state(self.random_state)
168            X, y = shuffle(X, y, random_state=rng)
169
170            # Stratified train/calibration split
171            X_train, X_calib, y_train, y_calib = train_test_split(
172                X,
173                y,
174                test_size=self.calibration_fraction,
175                random_state=self.random_state,
176                stratify=y,
177            )
178
179            # Scale features
180            if self.scaling_method == "standard":
181                self.scaler_ = StandardScaler().fit(X_train)
182            else:
183                raise ValueError("Scaling method must be 'standard'")
184
185            X_train_s = self.scaler_.transform(X_train)
186            X_calib_s = self.scaler_.transform(X_calib)
187
188            # Train ensemble of classifiers
189            configs = self._sample_hyperparameters()
190            self.classifiers_ = self._train_classifiers_parallel(
191                X_train_s, y_train, configs
192            )
193
194            # Store calibration data for potential conformal prediction
195            self.X_calib_ = X_calib_s
196            self.y_calib_ = y_calib
197
198            if self.calibrated:
199                # Calibrate each classifier
200                if self.verbose:
201                    print("Calibrating classifiers...")
202
203                self.calibrated_classifiers_ = []
204                for clf in tqdm(
205                    self.classifiers_,
206                    disable=not self.verbose,
207                    desc="Calibrating",
208                ):
209                    cal_clf = CalibratedClassifierCV(
210                        clf,
211                        method=self.calibration_method,
212                        cv="prefit",
213                        ensemble=False,
214                    )
215                    cal_clf.fit(X_calib_s, y_calib)
216                    self.calibrated_classifiers_.append(cal_clf)
217
218        else:
219            # Original approach: use regression-based with SimpleMultitaskClassifier
220            base_regressor = ConformalBayesianRegressor(
221                self.obj_base,
222                hyperparameter_bounds=self.hyperparameter_bounds,
223                n_samples=self.n_samples,
224                calibration_fraction=self.calibration_fraction,
225                scaling_method=self.scaling_method,
226                random_state=self.random_state,
227                verbose=self.verbose,
228                n_jobs=self.n_jobs,
229            )
230
231            # Wrap in multitask classifier
232            self.obj = ns.SimpleMultitaskClassifier(base_regressor)
233
234            if self.calibrated:
235                # Fit the base classifier first
236                self.obj.fit(X, y)
237
238                # Then wrap with calibration
239                self.calibrated_obj_ = CalibratedClassifierCV(
240                    self.obj,
241                    method=self.calibration_method,
242                    cv="prefit",
243                    ensemble=False,
244                )
245
246                # Fit calibration on the same data
247                self.calibrated_obj_.fit(X, y)
248            else:
249                self.obj.fit(X, y)
250
251        self.is_fitted_ = True
252        return self

Fit the conformal Bayesian classifier.

Parameters

X : array-like of shape (n_samples, n_features)
    Training data

y : array-like of shape (n_samples,)
    Target values

Returns

self : object Fitted classifier

def predict_proba(self, X):
254    def predict_proba(self, X):
255        """
256        Predict class probabilities.
257
258        Parameters
259        ----------
260        X : array-like of shape (n_samples, n_features)
261            Test data
262
263        Returns
264        -------
265        proba : array of shape (n_samples, n_classes)
266            Predicted probabilities for each class
267        """
268        if not self.is_fitted_:
269            raise RuntimeError("Fit the model first")
270
271        X = np.asarray(X)
272
273        if self.use_classifier:
274            # Average probabilities from ensemble
275            X_s = self.scaler_.transform(X)
276
277            if self.calibrated:
278                # Use calibrated classifiers
279                probas = np.array(
280                    [
281                        clf.predict_proba(X_s)
282                        for clf in self.calibrated_classifiers_
283                    ]
284                )
285            else:
286                # Use uncalibrated classifiers
287                probas = np.array(
288                    [clf.predict_proba(X_s) for clf in self.classifiers_]
289                )
290
291            # Average probabilities across ensemble
292            mean_proba = np.mean(probas, axis=0)
293
294            return mean_proba
295        else:
296            # Use regression-based approach
297            if self.calibrated:
298                return self.calibrated_obj_.predict_proba(X)
299            else:
300                return self.obj.predict_proba(X)

Predict class probabilities.

Parameters

X : array-like of shape (n_samples, n_features) Test data

Returns

proba : array of shape (n_samples, n_classes) Predicted probabilities for each class

def predict(self, X):
302    def predict(self, X):
303        """
304        Predict class labels.
305
306        Parameters
307        ----------
308        X : array-like of shape (n_samples, n_features)
309            Test data
310
311        Returns
312        -------
313        y_pred : array of shape (n_samples,)
314            Predicted class labels
315        """
316        proba = self.predict_proba(X)
317        return self.classes_[np.argmax(proba, axis=1)]

Predict class labels.

Parameters

X : array-like of shape (n_samples, n_features) Test data

Returns

y_pred : array of shape (n_samples,) Predicted class labels

class ContextAwareThetaForecaster:
 16class ContextAwareThetaForecaster:
 17    """
 18    Unified Theta Method with multiple estimation modes.
 19
 20    Variants:
 21    - 'classic': Standard Theta (SES + drift)
 22    - 'cox': Context-aware with Cox partial likelihood
 23    - 'ridge': Context-aware with Ridge regression
 24    - 'ml': ML-enhanced with sklearn estimator
 25    - 'rslope': R-style slopes via numerical differentiation (context-free)
 26
 27    Parameters
 28    ----------
 29    mode : {'classic', 'cox', 'ridge', 'ml', 'rslope'}
 30        Estimation mode
 31    theta : float, default=0.5
 32        Drift intensity (0=no drift, 0.5=classical, 1=full)
 33    estimator : sklearn estimator, optional
 34        For 'ml' and 'rslope' modes
 35    tau : float, default=12
 36        Temporal attention decay
 37    sigma_val : float, optional
 38        Value-based kernel bandwidth
 39    kernel : {'temporal', 'value', 'hybrid'}
 40        Attention kernel type
 41    seasonal_period : int, optional
 42        Seasonal period (auto-detected if None)
 43    risk_set_size : int, default=15
 44        Risk set size for Cox PL
 45    stability_factor : float, default=0.8
 46        Gamma clipping safety factor (0,1]
 47    random_state : int, optional
 48        Random seed for ML mode
 49    """
 50
 51    def __init__(
 52        self,
 53        mode: Literal["classic", "cox", "ridge", "ml", "rslope"] = "cox",
 54        theta: float = 0.5,
 55        estimator: Optional[Any] = None,
 56        tau: float = 12.0,
 57        sigma_val: Optional[float] = None,
 58        kernel: Literal["temporal", "value", "hybrid"] = "temporal",
 59        seasonal_period: Optional[int] = None,
 60        risk_set_size: int = 15,
 61        stability_factor: float = 0.8,
 62        random_state: Optional[int] = None,
 63    ):
 64        self.mode = mode
 65        self.theta = theta
 66        self.estimator = estimator
 67        self.tau = tau
 68        self.sigma_val = sigma_val
 69        self.kernel = kernel
 70        self.seasonal_period = seasonal_period
 71        self.risk_set_size = risk_set_size
 72        self.stability_factor = stability_factor
 73        self.random_state = random_state
 74
 75        # Fitted params
 76        self.alpha_ = None
 77        self.l_n_ = None
 78        self.b0_ = None
 79        self.gamma_ = None
 80        self.mu_z_ = None
 81        self.sigma_z_ = None
 82        self.sigma2_ = None
 83        self.seasonal_indices_ = None
 84        self._y_train = None
 85        self._fitted = False
 86
 87    # ============ SEASONAL DECOMPOSITION ============
 88    def _decompose(self, y: np.ndarray, period: int):
 89        """Multiplicative seasonal decomposition"""
 90        n = len(y)
 91        if n < 2 * period:
 92            return y, np.ones(n), y
 93
 94        # Centered MA trend
 95        trend = np.full(n, np.nan)
 96        half = period // 2
 97        for i in range(half, n - half):
 98            window = y[i - half : i + half + (period % 2)]
 99            trend[i] = np.mean(window)
100
101        # Fill edges
102        valid = np.where(~np.isnan(trend))[0]
103        if len(valid) > 0:
104            trend[: valid[0]] = trend[valid[0]]
105            trend[valid[-1] + 1 :] = trend[valid[-1]]
106
107        # Seasonal indices
108        detrended = y / (trend + 1e-10)
109        seasonal = np.zeros(period)
110        for i in range(period):
111            seasonal[i] = np.nanmean(detrended[i::period])
112        seasonal /= seasonal.mean() + 1e-10
113
114        seasonal_full = np.tile(seasonal, n // period + 1)[:n]
115        adjusted = y / (seasonal_full + 1e-10)
116
117        return adjusted, seasonal_full, trend
118
119    # ============ SES ============
120    def _ses_level(self, y: np.ndarray, alpha: float) -> np.ndarray:
121        """Compute SES level array"""
122        level = np.zeros(len(y))
123        level[0] = y[0]
124        for t in range(1, len(y)):
125            level[t] = alpha * y[t] + (1 - alpha) * level[t - 1]
126        return level
127
128    def _ses_nll(self, alpha: float, y: np.ndarray) -> float:
129        """SES negative log-likelihood"""
130        if alpha <= 0 or alpha >= 1:
131            return 1e10
132        level = self._ses_level(y, alpha)
133        resid = y[1:] - level[:-1]
134        sigma2 = np.var(resid) + 1e-10
135        return 0.5 * len(resid) * (np.log(2 * np.pi * sigma2) + 1)
136
137    def _fit_ses(self, y: np.ndarray):
138        """Estimate alpha via MLE"""
139        res = minimize(
140            lambda a: self._ses_nll(a[0], y),
141            [0.3],
142            bounds=[(0.01, 0.99)],
143            method="L-BFGS-B",
144        )
145        alpha = res.x[0]
146        level_array = self._ses_level(y, alpha)
147        return alpha, level_array[-1], level_array
148
149    # ============ DRIFT ============
150    def _estimate_drift(self, y: np.ndarray) -> float:
151        """Baseline drift: b0 = beta_OLS / 2"""
152        t = np.arange(len(y))
153        beta = np.sum((t - t.mean()) * (y - y.mean())) / (
154            np.sum((t - t.mean()) ** 2) + 1e-10
155        )
156        return beta / 2.0
157
158    # ============ ATTENTION CONTEXT ============
159    def _attention_kernel(self, Xj: float, Xt: float, j: int, t: int) -> float:
160        """Compute kernel weight"""
161        if self.kernel == "temporal":
162            return np.exp(-(t - j) / (self.tau + 1e-12))
163        elif self.kernel == "value":
164            sigma = self.sigma_val if self.sigma_val else 1.0
165            return np.exp(-((Xj - Xt) ** 2) / (2 * sigma**2 + 1e-12))
166        else:  # hybrid
167            sigma = self.sigma_val if self.sigma_val else 1.0
168            return np.exp(
169                -(t - j) / (self.tau + 1e-12)
170                - ((Xj - Xt) ** 2) / (2 * sigma**2 + 1e-12)
171            )
172
173    def _compute_context(self, y: np.ndarray) -> np.ndarray:
174        """Compute attention-weighted context z_t"""
175        n = len(y)
176        z = np.zeros(n)
177        for t in range(n):
178            weights = np.array(
179                [self._attention_kernel(y[j], y[t], j, t) for j in range(t + 1)]
180            )
181            weights /= weights.sum() + 1e-12
182            z[t] = np.dot(weights, y[: t + 1])
183        return z
184
185    # ============ GAMMA ESTIMATION ============
186    def _partial_nll(self, gamma: float, z_star: np.ndarray) -> float:
187        """Cox partial negative log-likelihood (stable)"""
188        n = len(z_star)
189        k = min(self.risk_set_size, n // 2)
190        nll = 0.0
191        for t in range(k, n):
192            z_risk = z_star[max(0, t - k) : t + 1]
193            nll -= gamma * z_star[t] - logsumexp(gamma * z_risk)
194        return nll
195
196    def _estimate_gamma_cox(self, z_star: np.ndarray) -> float:
197        """Estimate gamma via Cox PL"""
198        res = minimize(
199            lambda g: self._partial_nll(g[0], z_star), [0.0], method="BFGS"
200        )
201        return res.x[0]
202
203    def _estimate_gamma_ridge(
204        self,
205        y: np.ndarray,
206        z_star: np.ndarray,
207        level_array: np.ndarray,
208        alpha: float,
209        b0: float,
210    ) -> float:
211        """Estimate gamma via Ridge regression"""
212        try:
213            from sklearn.linear_model import Ridge
214        except ImportError:
215            warnings.warn("sklearn unavailable, using Cox method")
216            return self._estimate_gamma_cox(z_star)
217
218        n = len(y)
219        residuals = y - level_array
220
221        # Build design matrix
222        X_design = []
223        y_resid = []
224        for t in range(20, n):
225            D_h = self._D_n(1, alpha, t)
226            x_t = 0.5 * b0 * z_star[t - 1] * D_h
227            X_design.append(x_t)
228            y_resid.append(residuals[t])
229
230        X_design = np.array(X_design).reshape(-1, 1)
231        y_resid = np.array(y_resid)
232
233        ridge = Ridge(alpha=10.0, fit_intercept=False)
234        ridge.fit(X_design, y_resid)
235        return ridge.coef_[0]
236
237    def _estimate_gamma_ml(self, y: np.ndarray, h: int) -> float:
238        """Estimate gamma via ML numerical differentiation"""
239        if self.estimator is None:
240            from sklearn.linear_model import LinearRegression
241
242            self.estimator = LinearRegression()
243
244        n = len(y)
245        if self.random_state is not None:
246            np.random.seed(self.random_state)
247
248        # Features
249        time_idx = np.arange(n + h)
250        time_norm = time_idx / n
251        n_random = 3
252        random_cov = np.random.randn(n + h, n_random)
253        X_all = np.column_stack([time_norm, random_cov])
254
255        # Scale
256        from sklearn.preprocessing import StandardScaler
257
258        scaler = StandardScaler()
259        X_train = scaler.fit_transform(X_all[:n])
260        X_all = scaler.transform(X_all)
261
262        # Fit
263        self.estimator.fit(X_train, y)
264
265        # Numerical differentiation
266        eps = 1e-4 ** (1 / 3)
267        h_eps = np.maximum(eps * np.abs(time_norm), eps / n)
268
269        t_plus = np.clip(time_norm + h_eps, 0, 2.0)
270        t_minus = np.clip(time_norm - h_eps, 0, 2.0)
271
272        X_plus = scaler.transform(np.column_stack([t_plus, random_cov]))
273        X_minus = scaler.transform(np.column_stack([t_minus, random_cov]))
274
275        fx_plus = self.estimator.predict(X_plus)
276        fx_minus = self.estimator.predict(X_minus)
277
278        slopes = (fx_plus - fx_minus) / (2 * h_eps) / n
279
280        # Approximate gamma from slopes
281        return self.theta * slopes.mean() / (0.5 * self.b0_ + 1e-12)
282
283    def _estimate_slopes_rslope(self, y: np.ndarray, h: int) -> np.ndarray:
284        """
285        R-style slope estimation via numerical differentiation.
286        Returns slopes for forecast horizon (not gamma).
287        Similar to estimate_theta_slope() in R code.
288        """
289        if self.estimator is None:
290            from sklearn.linear_model import LinearRegression
291
292            self.estimator = LinearRegression()
293
294        n = len(y)
295        if self.random_state is not None:
296            np.random.seed(self.random_state)
297
298        # Create features: time + random noise
299        time_idx = np.arange(n + h)
300        time_norm = time_idx / n
301        n_random = 3
302        random_cov = np.random.randn(n + h, n_random)
303        X_all = np.column_stack([time_norm, random_cov])
304
305        # Scale features
306        from sklearn.preprocessing import StandardScaler
307
308        scaler = StandardScaler()
309        X_train = scaler.fit_transform(X_all[:n])
310        X_all_scaled = scaler.transform(X_all)
311
312        # Fit model
313        self.estimator.fit(X_train, y)
314
315        # Numerical differentiation for ALL points (historical + forecast)
316        eps = 1e-4 ** (1 / 3)
317        h_eps = np.maximum(eps * np.abs(time_norm), eps / n)
318
319        t_plus = np.clip(time_norm + h_eps, 0, 2.0)
320        t_minus = np.clip(time_norm - h_eps, 0, 2.0)
321
322        X_plus = scaler.transform(np.column_stack([t_plus, random_cov]))
323        X_minus = scaler.transform(np.column_stack([t_minus, random_cov]))
324
325        fx_plus = self.estimator.predict(X_plus)
326        fx_minus = self.estimator.predict(X_minus)
327
328        # Slopes at each time point
329        slopes = (fx_plus - fx_minus) / (2 * h_eps) / n
330
331        # Return ONLY the forecast horizon slopes (last h values)
332        return slopes[-h:] * self.theta
333
334    # ============ DRIFT MULTIPLIER ============
335    def _D_n(self, h: int, alpha: float, n: int) -> float:
336        """Drift multiplier D_n(h)"""
337        return (h - 1) + (1 - (1 - alpha) ** n) / (alpha + 1e-12)
338
339    # ============ FIT ============
340    def fit(self, y: np.ndarray):
341        """Fit the model"""
342        y = np.asarray(y, dtype=float).ravel()
343        self._y_train = y.copy()
344        n = len(y)
345
346        # Detect seasonality
347        period = self.seasonal_period
348        if period is None:
349            period = 12 if n >= 24 else 1
350
351        # Decompose
352        if period > 1 and n >= 2 * period:
353            y_adj, seasonal_full, _ = self._decompose(y, period)
354            self.seasonal_indices_ = seasonal_full[-period:]
355        else:
356            y_adj = y.copy()
357            self.seasonal_indices_ = None
358
359        # SES
360        self.alpha_, self.l_n_, level_array = self._fit_ses(y_adj)
361
362        # Drift
363        self.b0_ = self._estimate_drift(y_adj)
364
365        # Context & Gamma
366        if self.mode == "classic":
367            self.gamma_ = 0.0
368            self.mu_z_ = 0.0
369            self.sigma_z_ = 1.0
370        elif self.mode == "rslope":
371            # R-style: no gamma, slopes computed during prediction
372            self.gamma_ = 0.0
373            self.mu_z_ = 0.0
374            self.sigma_z_ = 1.0
375        else:
376            # Compute context
377            z_raw = self._compute_context(y_adj)
378            self.mu_z_ = z_raw.mean()
379            self.sigma_z_ = z_raw.std() + 1e-12
380            z_star = (z_raw - self.mu_z_) / self.sigma_z_
381
382            # Estimate gamma
383            if self.mode == "cox":
384                gamma_raw = self._estimate_gamma_cox(z_star)
385            elif self.mode == "ridge":
386                gamma_raw = self._estimate_gamma_ridge(
387                    y_adj, z_star, level_array, self.alpha_, self.b0_
388                )
389            else:  # ml
390                gamma_raw = self._estimate_gamma_ml(y_adj, 12)
391
392            # Stability constraint
393            D_max = self._D_n(36, self.alpha_, n)
394            stability_bound = 2.0 / (abs(self.b0_) * D_max + 1e-12)
395            self.gamma_ = np.clip(
396                gamma_raw,
397                -self.stability_factor * stability_bound,
398                self.stability_factor * stability_bound,
399            )
400
401        # Innovation variance
402        residuals = y_adj[1:] - level_array[:-1]
403        self.sigma2_ = np.var(residuals, ddof=1)
404
405        self._fitted = True
406        return self
407
408    # ============ PREDICT ============
409    def predict(
410        self, h: int, return_pi: bool = True, alpha_ci: float = 0.05
411    ) -> Dict[str, np.ndarray]:
412        """Generate forecasts"""
413        if not self._fitted:
414            raise RuntimeError("Call fit() first")
415
416        n = len(self._y_train)
417
418        # For rslope mode, compute slopes once
419        if self.mode == "rslope":
420            y_adj = (
421                self._y_train
422                if self.seasonal_indices_ is None
423                else self._y_train
424                / np.tile(
425                    self.seasonal_indices_, n // len(self.seasonal_indices_) + 1
426                )[:n]
427            )
428            rslope_slopes = self._estimate_slopes_rslope(y_adj, h)
429
430        # Deseasonalized forecast (recursive for non-rslope, direct for rslope)
431        if self.mode == "rslope":
432            # R-style: direct application of slopes
433            fc = np.zeros(h)
434            for step in range(h):
435                D_h = self._D_n(step + 1, self.alpha_, n)
436                fc[step] = self.l_n_ + rslope_slopes[step] * D_h
437        else:
438            # Original recursive logic
439            fc = []
440            history = list(
441                self._y_train
442                if self.seasonal_indices_ is None
443                else self._y_train
444                / np.tile(
445                    self.seasonal_indices_, n // len(self.seasonal_indices_) + 1
446                )[:n]
447            )
448
449            for step in range(1, h + 1):
450                # Recompute context
451                if self.mode != "classic":
452                    t_now = len(history) - 1
453                    weights = np.array(
454                        [
455                            self._attention_kernel(
456                                history[j], history[t_now], j, t_now
457                            )
458                            for j in range(t_now + 1)
459                        ]
460                    )
461                    weights /= weights.sum() + 1e-12
462                    z_h = np.dot(weights, history)
463                    z_h_star = (z_h - self.mu_z_) / self.sigma_z_
464                else:
465                    z_h_star = 0.0
466
467                # Forecast
468                D_h = self._D_n(step, self.alpha_, n)
469                context_factor = 1.0 + self.gamma_ * z_h_star
470                fc_val = (
471                    self.l_n_
472                    + 0.5 * self.b0_ * self.theta * context_factor * D_h
473                )
474
475                fc.append(fc_val)
476                history.append(fc_val)
477
478            fc = np.array(fc)
479
480        # Reseasonalize
481        if self.seasonal_indices_ is not None:
482            seasonal_fc = np.tile(
483                self.seasonal_indices_, (h // len(self.seasonal_indices_)) + 1
484            )[:h]
485            fc *= seasonal_fc
486
487        result = {"mean": fc}
488
489        # Prediction intervals
490        if return_pi:
491            z_score = norm.ppf(1 - alpha_ci / 2)
492            lower = []
493            upper = []
494
495            for step in range(1, h + 1):
496                D_h = self._D_n(step, self.alpha_, n)
497                var_ses = self.sigma2_ * ((step - 1) * self.alpha_**2 + 1)
498                var_ctx = (
499                    (0.5 * self.gamma_ * self.b0_ * self.sigma_z_ * D_h) ** 2
500                ) / n
501                se = np.sqrt(var_ses + var_ctx)
502
503                lower.append(fc[step - 1] - z_score * se)
504                upper.append(fc[step - 1] + z_score * se)
505
506            result["lower"] = np.array(lower)
507            result["upper"] = np.array(upper)
508
509        return result
510
511    # ============ UTILITIES ============
512    def get_params(self) -> Dict[str, Any]:
513        """Get fitted parameters"""
514        return {
515            "mode": self.mode,
516            "alpha": self.alpha_,
517            "b0": self.b0_,
518            "gamma": self.gamma_,
519            "l_n": self.l_n_,
520            "theta": self.theta,
521            "sigma2": self.sigma2_,
522            "seasonal": self.seasonal_indices_ is not None,
523        }
524
525    def plot(self, forecast: Dict[str, np.ndarray], title: str = None):
526        """Rich visualization of forecasts"""
527        import matplotlib.pyplot as plt
528
529        n = len(self._y_train)
530        h = len(forecast["mean"])
531
532        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
533        fig.suptitle(
534            title or f"Unified Theta: {self.mode.upper()} Mode",
535            fontsize=14,
536            fontweight="bold",
537        )
538
539        # Plot 1: Forecasts with PI
540        ax1 = axes[0, 0]
541        train_idx = np.arange(n)
542        fc_idx = np.arange(n, n + h)
543
544        ax1.plot(
545            train_idx,
546            self._y_train,
547            "o-",
548            color="black",
549            label="Train",
550            linewidth=1.5,
551            markersize=3,
552            alpha=0.7,
553        )
554        ax1.plot(
555            fc_idx,
556            forecast["mean"],
557            "s-",
558            color="steelblue",
559            label="Forecast",
560            linewidth=2.5,
561            markersize=5,
562        )
563        if "lower" in forecast:
564            ax1.fill_between(
565                fc_idx,
566                forecast["lower"],
567                forecast["upper"],
568                color="lightblue",
569                alpha=0.3,
570                label="95% PI",
571            )
572        ax1.axvline(
573            n - 0.5, color="red", linestyle="--", alpha=0.5, linewidth=2
574        )
575        ax1.set_title("Forecasts with Prediction Intervals", fontweight="bold")
576        ax1.set_xlabel("Time")
577        ax1.set_ylabel("Value")
578        ax1.legend(loc="upper left")
579        ax1.grid(alpha=0.3)
580
581        # Plot 2: Context variable (if not classic/rslope)
582        ax2 = axes[0, 1]
583        if self.mode not in ["classic", "rslope"]:
584            y_adj = (
585                self._y_train
586                if self.seasonal_indices_ is None
587                else self._y_train
588                / np.tile(
589                    self.seasonal_indices_, n // len(self.seasonal_indices_) + 1
590                )[:n]
591            )
592            z = self._compute_context(y_adj)
593            z_star = (z - self.mu_z_) / self.sigma_z_
594
595            ax2.plot(
596                train_idx,
597                z_star,
598                color="purple",
599                linewidth=2,
600                label="z* (standardized)",
601            )
602            ax2.axhline(0, color="black", linestyle="--", alpha=0.5)
603            ax2.fill_between(
604                train_idx,
605                0,
606                z_star,
607                where=(z_star > 0),
608                color="green",
609                alpha=0.2,
610                label="Above trend",
611            )
612            ax2.fill_between(
613                train_idx,
614                0,
615                z_star,
616                where=(z_star < 0),
617                color="red",
618                alpha=0.2,
619                label="Below trend",
620            )
621            ax2.set_title(
622                f"Context Signal (γ={self.gamma_:.4f})", fontweight="bold"
623            )
624            ax2.set_xlabel("Time")
625            ax2.set_ylabel("z* (std dev)")
626            ax2.legend()
627        elif self.mode == "rslope":
628            ax2.text(
629                0.5,
630                0.5,
631                "R-Slope Mode\n(Direct ML Slopes)",
632                ha="center",
633                va="center",
634                transform=ax2.transAxes,
635                fontsize=14,
636                bbox=dict(boxstyle="round", facecolor="lightblue"),
637            )
638            ax2.axis("off")
639        else:
640            ax2.text(
641                0.5,
642                0.5,
643                "Classic Mode\n(No Context)",
644                ha="center",
645                va="center",
646                transform=ax2.transAxes,
647                fontsize=14,
648                bbox=dict(boxstyle="round", facecolor="wheat"),
649            )
650            ax2.axis("off")
651        ax2.grid(alpha=0.3)
652
653        # Plot 3: Residuals
654        ax3 = axes[1, 0]
655        y_adj = (
656            self._y_train
657            if self.seasonal_indices_ is None
658            else self._y_train
659            / np.tile(
660                self.seasonal_indices_, n // len(self.seasonal_indices_) + 1
661            )[:n]
662        )
663        level_array = self._ses_level(y_adj, self.alpha_)
664        residuals = y_adj[1:] - level_array[:-1]
665
666        ax3.scatter(train_idx[1:], residuals, alpha=0.6, s=20, color="coral")
667        ax3.axhline(0, color="black", linestyle="--", linewidth=1.5)
668        ax3.set_title(f"Residuals (σ²={self.sigma2_:.3f})", fontweight="bold")
669        ax3.set_xlabel("Time")
670        ax3.set_ylabel("Residual")
671        ax3.grid(alpha=0.3)
672
673        # Histogram
674        ax3_inset = ax3.inset_axes([0.65, 0.65, 0.3, 0.3])
675        ax3_inset.hist(
676            residuals, bins=15, color="coral", alpha=0.7, edgecolor="black"
677        )
678        ax3_inset.axvline(0, color="black", linestyle="--", linewidth=1)
679        ax3_inset.set_title("Distribution", fontsize=8)
680        ax3_inset.tick_params(labelsize=7)
681
682        # Plot 4: Summary table
683        ax4 = axes[1, 1]
684        ax4.axis("off")
685
686        params = self.get_params()
687        summary = f"""
688╔═══════════════════════════════════════╗
689║        MODEL PARAMETERS               ║
690╠═══════════════════════════════════════╣
691║                                       ║
692║  Mode:          {self.mode.upper():<20}
693║  Theta:         {self.theta:<20.4f}
694║  Alpha (α):     {params['alpha']:<20.4f}
695║  Drift (b₀):    {params['b0']:<20.4f}
696║  Gamma (γ):     {params['gamma']:<20.6f}
697║  Level (ℓₙ):    {params['l_n']:<20.2f}
698║  σ²:            {params['sigma2']:<20.4f}
699║  Seasonal:      {str(params['seasonal']):<20}
700║                                       ║
701╠═══════════════════════════════════════╣
702║        FORECAST SUMMARY               ║
703╠═══════════════════════════════════════╣
704║                                       ║
705║  Horizon:       {h:<20} steps     ║
706║  Final FC:      {forecast['mean'][-1]:<20.2f}
707"""
708        if "lower" in forecast:
709            summary += f"║  95% PI:        [{forecast['lower'][-1]:>6.2f}, {forecast['upper'][-1]:>6.2f}]   ║\n"
710
711        summary += "║                                       ║\n"
712        summary += "╚═══════════════════════════════════════╝"
713
714        ax4.text(
715            0.05,
716            0.95,
717            summary,
718            fontsize=9,
719            family="monospace",
720            verticalalignment="top",
721            transform=ax4.transAxes,
722            bbox=dict(
723                boxstyle="round",
724                facecolor="lightyellow",
725                alpha=0.8,
726                edgecolor="black",
727                linewidth=1.5,
728            ),
729        )
730
731        plt.tight_layout()
732        return fig

Unified Theta Method with multiple estimation modes.

Variants:

  • 'classic': Standard Theta (SES + drift)
  • 'cox': Context-aware with Cox partial likelihood
  • 'ridge': Context-aware with Ridge regression
  • 'ml': ML-enhanced with sklearn estimator
  • 'rslope': R-style slopes via numerical differentiation (context-free)

Parameters

mode : {'classic', 'cox', 'ridge', 'ml', 'rslope'} — Estimation mode

theta : float, default=0.5 — Drift intensity (0=no drift, 0.5=classical, 1=full)

estimator : sklearn estimator, optional — For 'ml' and 'rslope' modes

tau : float, default=12 — Temporal attention decay

sigma_val : float, optional — Value-based kernel bandwidth

kernel : {'temporal', 'value', 'hybrid'} — Attention kernel type

seasonal_period : int, optional — Seasonal period (auto-detected if None)

risk_set_size : int, default=15 — Risk set size for Cox PL

stability_factor : float, default=0.8 — Gamma clipping safety factor (0,1]

random_state : int, optional — Random seed for ML mode

def fit(self, y: numpy.ndarray):
340    def fit(self, y: np.ndarray):
341        """Fit the model"""
342        y = np.asarray(y, dtype=float).ravel()
343        self._y_train = y.copy()
344        n = len(y)
345
346        # Detect seasonality
347        period = self.seasonal_period
348        if period is None:
349            period = 12 if n >= 24 else 1
350
351        # Decompose
352        if period > 1 and n >= 2 * period:
353            y_adj, seasonal_full, _ = self._decompose(y, period)
354            self.seasonal_indices_ = seasonal_full[-period:]
355        else:
356            y_adj = y.copy()
357            self.seasonal_indices_ = None
358
359        # SES
360        self.alpha_, self.l_n_, level_array = self._fit_ses(y_adj)
361
362        # Drift
363        self.b0_ = self._estimate_drift(y_adj)
364
365        # Context & Gamma
366        if self.mode == "classic":
367            self.gamma_ = 0.0
368            self.mu_z_ = 0.0
369            self.sigma_z_ = 1.0
370        elif self.mode == "rslope":
371            # R-style: no gamma, slopes computed during prediction
372            self.gamma_ = 0.0
373            self.mu_z_ = 0.0
374            self.sigma_z_ = 1.0
375        else:
376            # Compute context
377            z_raw = self._compute_context(y_adj)
378            self.mu_z_ = z_raw.mean()
379            self.sigma_z_ = z_raw.std() + 1e-12
380            z_star = (z_raw - self.mu_z_) / self.sigma_z_
381
382            # Estimate gamma
383            if self.mode == "cox":
384                gamma_raw = self._estimate_gamma_cox(z_star)
385            elif self.mode == "ridge":
386                gamma_raw = self._estimate_gamma_ridge(
387                    y_adj, z_star, level_array, self.alpha_, self.b0_
388                )
389            else:  # ml
390                gamma_raw = self._estimate_gamma_ml(y_adj, 12)
391
392            # Stability constraint
393            D_max = self._D_n(36, self.alpha_, n)
394            stability_bound = 2.0 / (abs(self.b0_) * D_max + 1e-12)
395            self.gamma_ = np.clip(
396                gamma_raw,
397                -self.stability_factor * stability_bound,
398                self.stability_factor * stability_bound,
399            )
400
401        # Innovation variance
402        residuals = y_adj[1:] - level_array[:-1]
403        self.sigma2_ = np.var(residuals, ddof=1)
404
405        self._fitted = True
406        return self

Fit the model

def predict( self, h: int, return_pi: bool = True, alpha_ci: float = 0.05) -> Dict[str, numpy.ndarray]:
409    def predict(
410        self, h: int, return_pi: bool = True, alpha_ci: float = 0.05
411    ) -> Dict[str, np.ndarray]:
412        """Generate forecasts"""
413        if not self._fitted:
414            raise RuntimeError("Call fit() first")
415
416        n = len(self._y_train)
417
418        # For rslope mode, compute slopes once
419        if self.mode == "rslope":
420            y_adj = (
421                self._y_train
422                if self.seasonal_indices_ is None
423                else self._y_train
424                / np.tile(
425                    self.seasonal_indices_, n // len(self.seasonal_indices_) + 1
426                )[:n]
427            )
428            rslope_slopes = self._estimate_slopes_rslope(y_adj, h)
429
430        # Deseasonalized forecast (recursive for non-rslope, direct for rslope)
431        if self.mode == "rslope":
432            # R-style: direct application of slopes
433            fc = np.zeros(h)
434            for step in range(h):
435                D_h = self._D_n(step + 1, self.alpha_, n)
436                fc[step] = self.l_n_ + rslope_slopes[step] * D_h
437        else:
438            # Original recursive logic
439            fc = []
440            history = list(
441                self._y_train
442                if self.seasonal_indices_ is None
443                else self._y_train
444                / np.tile(
445                    self.seasonal_indices_, n // len(self.seasonal_indices_) + 1
446                )[:n]
447            )
448
449            for step in range(1, h + 1):
450                # Recompute context
451                if self.mode != "classic":
452                    t_now = len(history) - 1
453                    weights = np.array(
454                        [
455                            self._attention_kernel(
456                                history[j], history[t_now], j, t_now
457                            )
458                            for j in range(t_now + 1)
459                        ]
460                    )
461                    weights /= weights.sum() + 1e-12
462                    z_h = np.dot(weights, history)
463                    z_h_star = (z_h - self.mu_z_) / self.sigma_z_
464                else:
465                    z_h_star = 0.0
466
467                # Forecast
468                D_h = self._D_n(step, self.alpha_, n)
469                context_factor = 1.0 + self.gamma_ * z_h_star
470                fc_val = (
471                    self.l_n_
472                    + 0.5 * self.b0_ * self.theta * context_factor * D_h
473                )
474
475                fc.append(fc_val)
476                history.append(fc_val)
477
478            fc = np.array(fc)
479
480        # Reseasonalize
481        if self.seasonal_indices_ is not None:
482            seasonal_fc = np.tile(
483                self.seasonal_indices_, (h // len(self.seasonal_indices_)) + 1
484            )[:h]
485            fc *= seasonal_fc
486
487        result = {"mean": fc}
488
489        # Prediction intervals
490        if return_pi:
491            z_score = norm.ppf(1 - alpha_ci / 2)
492            lower = []
493            upper = []
494
495            for step in range(1, h + 1):
496                D_h = self._D_n(step, self.alpha_, n)
497                var_ses = self.sigma2_ * ((step - 1) * self.alpha_**2 + 1)
498                var_ctx = (
499                    (0.5 * self.gamma_ * self.b0_ * self.sigma_z_ * D_h) ** 2
500                ) / n
501                se = np.sqrt(var_ses + var_ctx)
502
503                lower.append(fc[step - 1] - z_score * se)
504                upper.append(fc[step - 1] + z_score * se)
505
506            result["lower"] = np.array(lower)
507            result["upper"] = np.array(upper)
508
509        return result

Generate forecasts

class LSBoostClassifier(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
 18class LSBoostClassifier(BaseEstimator, ClassifierMixin):
 19    """LSBoost classifier.
 20
 21    Attributes:
 22
 23        n_estimators: int
 24            number of boosting iterations.
 25
 26        learning_rate: float
 27            controls the learning speed at training time.
 28
 29        n_hidden_features: int
 30            number of nodes in successive hidden layers.
 31
 32        reg_lambda: float
 33            L2 regularization parameter for successive errors in the optimizer
 34            (at training time).
 35
 36        alpha: float
 37            compromise between L1 and L2 regularization (must be in [0, 1]),
 38            for `solver` == 'enet'.
 39
 40        row_sample: float
 41            percentage of rows chosen from the training set.
 42
 43        col_sample: float
 44            percentage of columns chosen from the training set.
 45
 46        dropout: float
 47            percentage of nodes dropped from the training set.
 48
 49        tolerance: float
 50            controls early stopping in gradient descent (at training time).
 51
 52        direct_link: bool
 53            indicates whether the original features are included (True) in model's
 54            fitting or not (False).
 55
 56        verbose: int
 57            progress bar (yes = 1) or not (no = 0) (currently).
 58
 59        seed: int
 60            reproducibility seed for nodes_sim=='uniform', clustering and dropout.
 61
 62        backend: str
 63            type of backend; must be in ('cpu', 'gpu', 'tpu')
 64
 65        solver: str
 66            type of 'weak' learner; currently in ('ridge', 'lasso', 'enet').
 67            'enet' is a combination of 'ridge' and 'lasso' called Elastic Net.
 68
 69        activation: str
 70            activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
 71
 72        n_clusters: int
 73            number of clusters for clustering the features
 74
 75        clustering_method: str
 76            clustering method: currently 'kmeans', 'gmm'
 77
 78        cluster_scaling: str
 79            scaling method for clustering: currently 'standard', 'robust', 'minmax'
 80
 81        degree: int
 82            degree of features interactions to include in the model
 83
 84        weights_distr: str
 85            distribution of weights for constructing the model's hidden layer;
 86            currently 'uniform', 'gaussian'
 87
 88        hist: bool
 89            indicates whether histogram features are used or not (default is False)
 90
 91        bins: int or str
 92            number of bins for histogram features (same as numpy.histogram, default is 'auto')
 93
 94    Examples:
 95
 96        ```python
 97        import numpy as np
 98        from sklearn.datasets import load_digits, load_breast_cancer, load_wine, load_iris
 99        from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
100        from sklearn.tree import DecisionTreeRegressor
101        from sklearn.kernel_ridge import KernelRidge
102        from time import time
103        from os import chdir
104        from sklearn import metrics
105        import os
106
107        import mlsauce as ms
108
109        print("\n")
110        print("GenericBoosting Decision tree -----")
111        print("\n")
112
113        print("\n")
114        print("breast_cancer data -----")
115
116        # data 1
117        breast_cancer = load_breast_cancer()
118        X = breast_cancer.data
119        y = breast_cancer.target
120        # split data into training test and test set
121        np.random.seed(15029)
122        X_train, X_test, y_train, y_test = train_test_split(X, y,
123                                                            test_size=0.2)
124
125        clf = DecisionTreeRegressor()
126        clf2 = KernelRidge()
127
128        obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2)
129        print(obj.get_params())
130        start = time()
131        obj.fit(X_train, y_train)
132        print(time()-start)
133        start = time()
134        print(obj.score(X_test, y_test))
135        print(time()-start)
136
137        print(obj.obj['loss'])
138
139        obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2, n_clusters=2)
140        print(obj.get_params())
141        start = time()
142        obj.fit(X_train, y_train)
143        print(time()-start)
144        start = time()
145        print(obj.score(X_test, y_test))
146        print(time()-start)
147
148        print(obj.obj['loss'])
149
150
151        # data 2
152        print("\n")
153        print("wine data -----")
154
155        wine = load_wine()
156        Z = wine.data
157        t = wine.target
158        np.random.seed(879423)
159        X_train, X_test, y_train, y_test = train_test_split(Z, t,
160                                                            test_size=0.2)
161
162        obj = ms.GenericBoostingClassifier(clf)
163        print(obj.get_params())
164        start = time()
165        obj.fit(X_train, y_train)
166        print(time()-start)
167        start = time()
168        print(obj.score(X_test, y_test))
169        print(time()-start)
170
171        print(obj.obj['loss'])
172
173        obj = ms.GenericBoostingClassifier(clf, n_clusters=3)
174        print(obj.get_params())
175        start = time()
176        obj.fit(X_train, y_train)
177        print(time()-start)
178        start = time()
179        print(obj.score(X_test, y_test))
180        print(time()-start)
181
182        print(obj.obj['loss'])
183
184        # data 3
185        print("\n")
186        print("iris data -----")
187
188        iris = load_iris()
189        Z = iris.data
190        t = iris.target
191        np.random.seed(734563)
192        X_train, X_test, y_train, y_test = train_test_split(Z, t,
193                                                            test_size=0.2)
194
195
196        obj = ms.GenericBoostingClassifier(clf)
197        print(obj.get_params())
198        start = time()
199        obj.fit(X_train, y_train)
200        print(time()-start)
201        start = time()
202        print(obj.score(X_test, y_test))
203        print(time()-start)
204
205        print(obj.obj['loss'])
206
207
208        print("\n")
209        print("GenericBoosting  KRR -----")
210        print("\n")
211
212        obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2)
213        print(obj.get_params())
214        start = time()
215        obj.fit(X_train, y_train)
216        print(time()-start)
217        start = time()
218        print(obj.score(X_test, y_test))
219        print(time()-start)
220
221        print(obj.obj['loss'])
222
223        obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2, n_clusters=2)
224        print(obj.get_params())
225        start = time()
226        obj.fit(X_train, y_train)
227        print(time()-start)
228        start = time()
229        print(obj.score(X_test, y_test))
230        print(time()-start)
231
232        print(obj.obj['loss'])
233
234
235        # data 2
236        print("\n")
237        print("wine data -----")
238
239        wine = load_wine()
240        Z = wine.data
241        t = wine.target
242        np.random.seed(879423)
243        X_train, X_test, y_train, y_test = train_test_split(Z, t,
244                                                            test_size=0.2)
245
246        obj = ms.GenericBoostingClassifier(clf2)
247        print(obj.get_params())
248        start = time()
249        obj.fit(X_train, y_train)
250        print(time()-start)
251        start = time()
252        print(obj.score(X_test, y_test))
253        print(time()-start)
254
255        print(obj.obj['loss'])
256
257        obj = ms.GenericBoostingClassifier(clf2, n_clusters=3)
258        print(obj.get_params())
259        start = time()
260        obj.fit(X_train, y_train)
261        print(time()-start)
262        start = time()
263        print(obj.score(X_test, y_test))
264        print(time()-start)
265
266        print(obj.obj['loss'])
267
268        # data 3
269        print("\n")
270        print("iris data -----")
271
272        iris = load_iris()
273        Z = iris.data
274        t = iris.target
275        np.random.seed(734563)
276        X_train, X_test, y_train, y_test = train_test_split(Z, t,
277                                                            test_size=0.2)
278
279
280        obj = ms.GenericBoostingClassifier(clf2)
281        print(obj.get_params())
282        start = time()
283        obj.fit(X_train, y_train)
284        print(time()-start)
285        start = time()
286        print(obj.score(X_test, y_test))
287        print(time()-start)
288
289        print(obj.obj['loss'])
290    ```
291
292    """
293
294    def __init__(
295        self,
296        n_estimators=100,
297        learning_rate=0.1,
298        n_hidden_features=5,
299        reg_lambda=0.1,
300        alpha=0.5,
301        row_sample=1,
302        col_sample=1,
303        dropout=0,
304        tolerance=1e-4,
305        direct_link=1,
306        verbose=1,
307        seed=123,
308        backend="cpu",
309        solver="ridge",
310        activation="relu",
311        n_clusters=0,
312        clustering_method="kmeans",
313        cluster_scaling="standard",
314        degree=None,
315        weights_distr="uniform",
316        base_model=None,
317        hist=False,
318        bins="auto",
319    ):
320
321        self.base_model = base_model
322        self.hist = hist
323        self.bins = bins
324        self.hist_bins_ = None
325
326        if n_clusters > 0:
327            assert clustering_method in (
328                "kmeans",
329                "gmm",
330            ), "`clustering_method` must be in ('kmeans', 'gmm')"
331            assert cluster_scaling in (
332                "standard",
333                "robust",
334                "minmax",
335            ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')"
336
337        assert backend in (
338            "cpu",
339            "gpu",
340            "tpu",
341        ), "`backend` must be in ('cpu', 'gpu', 'tpu')"
342
343        assert solver in (
344            "ridge",
345            "lasso",
346            "enet",
347        ), "`solver` must be in ('ridge', 'lasso', 'enet')"
348
349        sys_platform = platform.system()
350
351        if (sys_platform == "Windows") and (backend in ("gpu", "tpu")):
352            warnings.warn(
353                "No GPU/TPU computing on Windows yet, backend set to 'cpu'"
354            )
355            backend = "cpu"
356
357        self.n_estimators = n_estimators
358        self.learning_rate = learning_rate
359        self.n_hidden_features = n_hidden_features
360        self.reg_lambda = reg_lambda
361        assert alpha >= 0 and alpha <= 1, "`alpha` must be in [0, 1]"
362        self.alpha = alpha
363        self.row_sample = row_sample
364        self.col_sample = col_sample
365        self.dropout = dropout
366        self.tolerance = tolerance
367        self.direct_link = direct_link
368        self.verbose = verbose
369        self.seed = seed
370        self.backend = backend
371        self.obj = None
372        self.solver = solver
373        self.activation = activation
374        self.n_clusters = n_clusters
375        self.clustering_method = clustering_method
376        self.cluster_scaling = cluster_scaling
377        self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None
378        self.degree = degree
379        self.poly_ = None
380        self.weights_distr = weights_distr
381        if self.backend in ("gpu", "tpu"):
382            check_and_install("jax")
383            check_and_install("jaxlib")
384
385    def fit(self, X, y, **kwargs):
386        """Fit Booster (classifier) to training data (X, y)
387
388        Args:
389
390            X: {array-like}, shape = [n_samples, n_features]
391                Training vectors, where n_samples is the number
392                of samples and n_features is the number of features.
393
394            y: array-like, shape = [n_samples]
395                Target values.
396
397            **kwargs: additional parameters to be passed to self.cook_training_set.
398
399        Returns:
400
401            self: object.
402        """
403
404        if isinstance(X, pd.DataFrame):
405            X = X.values
406
407        if self.hist == True:
408            X, self.hist_bins_ = get_histo_features(X)
409
410        if isinstance(y, pd.Series):
411            y = y.values.ravel()
412        else:
413            y = np.asarray(y).ravel()
414
415        if self.degree is not None:
416            assert isinstance(self.degree, int), "`degree` must be an integer"
417            self.poly_ = PolynomialFeatures(
418                degree=self.degree, interaction_only=True, include_bias=False
419            )
420            X = self.poly_.fit_transform(X)
421
422        if self.n_clusters > 0:
423            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
424                cluster(
425                    X,
426                    n_clusters=self.n_clusters,
427                    method=self.clustering_method,
428                    type_scaling=self.cluster_scaling,
429                    training=True,
430                    seed=self.seed,
431                )
432            )
433            X = np.column_stack((X, clustered_X))
434
435        self.obj = boosterc.fit_booster_classifier(
436            np.asarray(X, order="C", dtype=np.float64),
437            np.asarray(y, order="C", dtype=np.int64),
438            n_estimators=self.n_estimators,
439            learning_rate=self.learning_rate,
440            n_hidden_features=self.n_hidden_features,
441            reg_lambda=self.reg_lambda,
442            alpha=self.alpha,
443            row_sample=self.row_sample,
444            col_sample=self.col_sample,
445            dropout=self.dropout,
446            tolerance=self.tolerance,
447            direct_link=self.direct_link,
448            verbose=self.verbose,
449            seed=self.seed,
450            backend=self.backend,
451            solver=self.solver,
452            activation=self.activation,
453            obj=self.base_model,
454        )
455
456        self.classes_ = np.unique(y)  # for compatibility with sklearn
457        self.n_classes_ = len(self.classes_)  # for compatibility with sklearn
458        self.n_estimators = self.obj["n_estimators"]
459        return self
460
461    def predict(self, X, **kwargs):
462        """Predict test data X.
463
464        Args:
465
466            X: {array-like}, shape = [n_samples, n_features]
467                Training vectors, where n_samples is the number
468                of samples and n_features is the number of features.
469
470            **kwargs: additional parameters to be passed to `predict_proba`
471
472
473        Returns:
474
475            model predictions: {array-like}
476        """
477
478        return np.argmax(self.predict_proba(X, **kwargs), axis=1)
479
480    def predict_proba(self, X, **kwargs):
481        """Predict probabilities for test data X.
482
483        Args:
484
485            X: {array-like}, shape = [n_samples, n_features]
486                Training vectors, where n_samples is the number
487                of samples and n_features is the number of features.
488
489            **kwargs: additional parameters to be passed to
490                self.cook_test_set
491
492        Returns:
493
494            probability estimates for test data: {array-like}
495        """
496
497        if isinstance(X, pd.DataFrame):
498            X = X.values
499
500        if self.hist == True:
501            X = get_histo_features(X, bins=self.hist_bins_)
502
503        if self.degree is not None:
504            X = self.poly_.transform(X)
505
506        if self.n_clusters > 0:
507            X = np.column_stack(
508                (
509                    X,
510                    cluster(
511                        X,
512                        training=False,
513                        scaler=self.scaler_,
514                        label_encoder=self.label_encoder_,
515                        clusterer=self.clusterer_,
516                        seed=self.seed,
517                    ),
518                )
519            )
520        try:
521            return boosterc.predict_proba_booster_classifier(
522                self.obj, np.asarray(X, order="C")
523            )
524        except ValueError:
525            pass
526
527    def update(self, X, y, eta=0.9):
528        """Update model with new data.
529
530        Args:
531
532            X: {array-like}, shape = [n_samples=1, n_features]
533                Training vectors, where n_samples is the number
534                of samples and n_features is the number of features.
535
536            y: float = [n_samples=1]
537               Target value.
538
539            eta: float
540                Inverse power applied to number of observations
541                (defines a learning rate).
542
543        Returns:
544
545            self: object.
546        """
547
548        if isinstance(X, pd.DataFrame):
549            X = X.values
550
551        if self.degree is not None:
552            X = self.poly_.transform(X)
553
554        if self.n_clusters > 0:
555            X = np.column_stack(
556                (
557                    X,
558                    cluster(
559                        X,
560                        training=False,
561                        scaler=self.scaler_,
562                        label_encoder=self.label_encoder_,
563                        clusterer=self.clusterer_,
564                        seed=self.seed,
565                    ),
566                )
567            )
568
569        self.obj = boosterc.update_booster(
570            self.obj,
571            np.asarray(X, order="C"),
572            np.asarray(y, order="C").ravel(),
573            eta,
574        )
575
576        return self

LSBoost classifier.

Attributes:

    n_estimators: int
        number of boosting iterations.

    learning_rate: float
        controls the learning speed at training time.

    n_hidden_features: int
        number of nodes in successive hidden layers.

    reg_lambda: float
        L2 regularization parameter for successive errors in the optimizer
        (at training time).

    alpha: float
        compromise between L1 and L2 regularization (must be in [0, 1]),
        for `solver` == 'enet'.

    row_sample: float
        percentage of rows chosen from the training set.

    col_sample: float
        percentage of columns chosen from the training set.

    dropout: float
        percentage of nodes dropped from the training set.

    tolerance: float
        controls early stopping in gradient descent (at training time).

    direct_link: bool
        indicates whether the original features are included (True) in model's
        fitting or not (False).

    verbose: int
        progress bar (yes = 1) or not (no = 0) (currently).

    seed: int
        reproducibility seed for nodes_sim=='uniform', clustering and dropout.

    backend: str
        type of backend; must be in ('cpu', 'gpu', 'tpu')

    solver: str
        type of 'weak' learner; currently in ('ridge', 'lasso', 'enet').
        'enet' is a combination of 'ridge' and 'lasso' called Elastic Net.

    activation: str
        activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'

    n_clusters: int
        number of clusters for clustering the features

    clustering_method: str
        clustering method: currently 'kmeans', 'gmm'

    cluster_scaling: str
        scaling method for clustering: currently 'standard', 'robust', 'minmax'

    degree: int
        degree of features interactions to include in the model

    weights_distr: str
        distribution of weights for constructing the model's hidden layer;
        currently 'uniform', 'gaussian'

    hist: bool
        indicates whether histogram features are used or not (default is False)

    bins: int or str
        number of bins for histogram features (same as numpy.histogram, default is 'auto')

Examples:

    ```python
    import numpy as np
    from sklearn.datasets import load_digits, load_breast_cancer, load_wine, load_iris
    from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.kernel_ridge import KernelRidge
    from time import time
    from os import chdir
    from sklearn import metrics
    import os

    import mlsauce as ms

    print("\n")
    print("GenericBoosting Decision tree -----")
    print("\n")

    print("\n")
    print("breast_cancer data -----")

    # data 1
    breast_cancer = load_breast_cancer()
    X = breast_cancer.data
    y = breast_cancer.target
    # split data into training test and test set
    np.random.seed(15029)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2)

    clf = DecisionTreeRegressor()
    clf2 = KernelRidge()

    obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2)
    print(obj.get_params())
    start = time()
    obj.fit(X_train, y_train)
    print(time()-start)
    start = time()
    print(obj.score(X_test, y_test))
    print(time()-start)

    print(obj.obj['loss'])

    obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2, n_clusters=2)
    print(obj.get_params())
    start = time()
    obj.fit(X_train, y_train)
    print(time()-start)
    start = time()
    print(obj.score(X_test, y_test))
    print(time()-start)

    print(obj.obj['loss'])


    # data 2
    print("\n")
    print("wine data -----")

    wine = load_wine()
    Z = wine.data
    t = wine.target
    np.random.seed(879423)
    X_train, X_test, y_train, y_test = train_test_split(Z, t,
                                                        test_size=0.2)

    obj = ms.GenericBoostingClassifier(clf)
    print(obj.get_params())
    start = time()
    obj.fit(X_train, y_train)
    print(time()-start)
    start = time()
    print(obj.score(X_test, y_test))
    print(time()-start)

    print(obj.obj['loss'])

    obj = ms.GenericBoostingClassifier(clf, n_clusters=3)
    print(obj.get_params())
    start = time()
    obj.fit(X_train, y_train)
    print(time()-start)
    start = time()
    print(obj.score(X_test, y_test))
    print(time()-start)

    print(obj.obj['loss'])

    # data 3
    print("\n")
    print("iris data -----")

    iris = load_iris()
    Z = iris.data
    t = iris.target
    np.random.seed(734563)
    X_train, X_test, y_train, y_test = train_test_split(Z, t,
                                                        test_size=0.2)


    obj = ms.GenericBoostingClassifier(clf)
    print(obj.get_params())
    start = time()
    obj.fit(X_train, y_train)
    print(time()-start)
    start = time()
    print(obj.score(X_test, y_test))
    print(time()-start)

    print(obj.obj['loss'])


    print("\n")
    print("GenericBoosting  KRR -----")
    print("\n")

    obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2)
    print(obj.get_params())
    start = time()
    obj.fit(X_train, y_train)
    print(time()-start)
    start = time()
    print(obj.score(X_test, y_test))
    print(time()-start)

    print(obj.obj['loss'])

    obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2, n_clusters=2)
    print(obj.get_params())
    start = time()
    obj.fit(X_train, y_train)
    print(time()-start)
    start = time()
    print(obj.score(X_test, y_test))
    print(time()-start)

    print(obj.obj['loss'])


    # data 2
    print("\n")
    print("wine data -----")

    wine = load_wine()
    Z = wine.data
    t = wine.target
    np.random.seed(879423)
    X_train, X_test, y_train, y_test = train_test_split(Z, t,
                                                        test_size=0.2)

    obj = ms.GenericBoostingClassifier(clf2)
    print(obj.get_params())
    start = time()
    obj.fit(X_train, y_train)
    print(time()-start)
    start = time()
    print(obj.score(X_test, y_test))
    print(time()-start)

    print(obj.obj['loss'])

    obj = ms.GenericBoostingClassifier(clf2, n_clusters=3)
    print(obj.get_params())
    start = time()
    obj.fit(X_train, y_train)
    print(time()-start)
    start = time()
    print(obj.score(X_test, y_test))
    print(time()-start)

    print(obj.obj['loss'])

    # data 3
    print("\n")
    print("iris data -----")

    iris = load_iris()
    Z = iris.data
    t = iris.target
    np.random.seed(734563)
    X_train, X_test, y_train, y_test = train_test_split(Z, t,
                                                        test_size=0.2)


    obj = ms.GenericBoostingClassifier(clf2)
    print(obj.get_params())
    start = time()
    obj.fit(X_train, y_train)
    print(time()-start)
    start = time()
    print(obj.score(X_test, y_test))
    print(time()-start)

    print(obj.obj['loss'])
```
def fit(self, X, y, **kwargs):
385    def fit(self, X, y, **kwargs):
386        """Fit Booster (classifier) to training data (X, y)
387
388        Args:
389
390            X: {array-like}, shape = [n_samples, n_features]
391                Training vectors, where n_samples is the number
392                of samples and n_features is the number of features.
393
394            y: array-like, shape = [n_samples]
395                Target values.
396
397            **kwargs: additional parameters to be passed to self.cook_training_set.
398
399        Returns:
400
401            self: object.
402        """
403
404        if isinstance(X, pd.DataFrame):
405            X = X.values
406
407        if self.hist == True:
408            X, self.hist_bins_ = get_histo_features(X)
409
410        if isinstance(y, pd.Series):
411            y = y.values.ravel()
412        else:
413            y = np.asarray(y).ravel()
414
415        if self.degree is not None:
416            assert isinstance(self.degree, int), "`degree` must be an integer"
417            self.poly_ = PolynomialFeatures(
418                degree=self.degree, interaction_only=True, include_bias=False
419            )
420            X = self.poly_.fit_transform(X)
421
422        if self.n_clusters > 0:
423            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
424                cluster(
425                    X,
426                    n_clusters=self.n_clusters,
427                    method=self.clustering_method,
428                    type_scaling=self.cluster_scaling,
429                    training=True,
430                    seed=self.seed,
431                )
432            )
433            X = np.column_stack((X, clustered_X))
434
435        self.obj = boosterc.fit_booster_classifier(
436            np.asarray(X, order="C", dtype=np.float64),
437            np.asarray(y, order="C", dtype=np.int64),
438            n_estimators=self.n_estimators,
439            learning_rate=self.learning_rate,
440            n_hidden_features=self.n_hidden_features,
441            reg_lambda=self.reg_lambda,
442            alpha=self.alpha,
443            row_sample=self.row_sample,
444            col_sample=self.col_sample,
445            dropout=self.dropout,
446            tolerance=self.tolerance,
447            direct_link=self.direct_link,
448            verbose=self.verbose,
449            seed=self.seed,
450            backend=self.backend,
451            solver=self.solver,
452            activation=self.activation,
453            obj=self.base_model,
454        )
455
456        self.classes_ = np.unique(y)  # for compatibility with sklearn
457        self.n_classes_ = len(self.classes_)  # for compatibility with sklearn
458        self.n_estimators = self.obj["n_estimators"]
459        return self

Fit Booster (classifier) to training data (X, y)

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

y: array-like, shape = [n_samples]
    Target values.

**kwargs: additional parameters to be passed to self.cook_training_set.

Returns:

self: object.
def predict(self, X, **kwargs):
461    def predict(self, X, **kwargs):
462        """Predict test data X.
463
464        Args:
465
466            X: {array-like}, shape = [n_samples, n_features]
467                Training vectors, where n_samples is the number
468                of samples and n_features is the number of features.
469
470            **kwargs: additional parameters to be passed to `predict_proba`
471
472
473        Returns:
474
475            model predictions: {array-like}
476        """
477
478        return np.argmax(self.predict_proba(X, **kwargs), axis=1)

Predict test data X.

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

**kwargs: additional parameters to be passed to `predict_proba`

Returns:

model predictions: {array-like}
def predict_proba(self, X, **kwargs):
480    def predict_proba(self, X, **kwargs):
481        """Predict probabilities for test data X.
482
483        Args:
484
485            X: {array-like}, shape = [n_samples, n_features]
486                Training vectors, where n_samples is the number
487                of samples and n_features is the number of features.
488
489            **kwargs: additional parameters to be passed to
490                self.cook_test_set
491
492        Returns:
493
494            probability estimates for test data: {array-like}
495        """
496
497        if isinstance(X, pd.DataFrame):
498            X = X.values
499
500        if self.hist == True:
501            X = get_histo_features(X, bins=self.hist_bins_)
502
503        if self.degree is not None:
504            X = self.poly_.transform(X)
505
506        if self.n_clusters > 0:
507            X = np.column_stack(
508                (
509                    X,
510                    cluster(
511                        X,
512                        training=False,
513                        scaler=self.scaler_,
514                        label_encoder=self.label_encoder_,
515                        clusterer=self.clusterer_,
516                        seed=self.seed,
517                    ),
518                )
519            )
520        try:
521            return boosterc.predict_proba_booster_classifier(
522                self.obj, np.asarray(X, order="C")
523            )
524        except ValueError:
525            pass

Predict probabilities for test data X.

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

**kwargs: additional parameters to be passed to
    self.cook_test_set

Returns:

probability estimates for test data: {array-like}
class GenericBoostingClassifier(mlsauce.LSBoostClassifier):
class GenericBoostingClassifier(LSBoostClassifier):
    """Generic Boosting classifier (using any classifier as base learner).

    Thin subclass of `LSBoostClassifier`: it stores `base_model`, `hist`
    and `bins` on the instance and forwards every other setting (plus
    `base_model`) to the parent constructor.

    Attributes:

        base_model: object
            base learner (default is ExtraTreeRegressor) to be boosted.

        n_estimators: int
            number of boosting iterations.

        learning_rate: float
            controls the learning speed at training time.

        n_hidden_features: int
            number of nodes in successive hidden layers.

        row_sample: float
            percentage of rows chosen from the training set.

        col_sample: float
            percentage of columns chosen from the training set.

        dropout: float
            percentage of nodes dropped from the training set.

        tolerance: float
            controls early stopping in gradient descent (at training time).

        direct_link: bool
            indicates whether the original features are included (True) in
            model's fitting or not (False).

        verbose: int
            progress bar (yes = 1) or not (no = 0) (currently).

        backend: str
            forwarded unchanged to `LSBoostClassifier` (default is "cpu").

        seed: int
            reproducibility seed for nodes_sim=='uniform', clustering and
            dropout.

        activation: str
            activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'

        n_clusters: int
            number of clusters for clustering the features

        clustering_method: str
            clustering method: currently 'kmeans', 'gmm'

        cluster_scaling: str
            scaling method for clustering: currently 'standard', 'robust',
            'minmax'

        degree: int
            degree of features interactions to include in the model

        weights_distr: str
            distribution of weights for constructing the model's hidden layer;
            currently 'uniform', 'gaussian'

        hist: bool
            indicates whether histogram features are used or not (default is
            False)

        bins: int or str
            number of bins for histogram features (same as numpy.histogram,
            default is 'auto')

    """

    def __init__(
        self,
        base_model=ExtraTreeRegressor(),
        n_estimators=100,
        learning_rate=0.1,
        n_hidden_features=5,
        row_sample=1,
        col_sample=1,
        dropout=0,
        tolerance=1e-4,
        direct_link=1,
        verbose=1,
        backend="cpu",
        seed=123,
        activation="relu",
        n_clusters=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        degree=None,
        weights_distr="uniform",
        hist=False,
        bins="auto",
    ):
        # Settings kept by this subclass are stored before delegating.
        # NOTE(review): the default `base_model` instance is created once,
        # when the class is defined, and is therefore shared by every
        # estimator built without an explicit `base_model`.
        self.base_model = base_model
        self.hist = hist
        self.bins = bins
        self.hist_bins_ = None

        # Everything else is handled by the parent constructor.
        parent_settings = dict(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            n_hidden_features=n_hidden_features,
            row_sample=row_sample,
            col_sample=col_sample,
            dropout=dropout,
            tolerance=tolerance,
            direct_link=direct_link,
            verbose=verbose,
            backend=backend,
            seed=seed,
            activation=activation,
            n_clusters=n_clusters,
            clustering_method=clustering_method,
            cluster_scaling=cluster_scaling,
            degree=degree,
            weights_distr=weights_distr,
            base_model=self.base_model,
        )
        super().__init__(**parent_settings)

Generic Boosting classifier (using any classifier as base learner).

Attributes:

base_model: object
    base learner (default is ExtraTreeRegressor) to be boosted.

n_estimators: int
    number of boosting iterations.

learning_rate: float
    controls the learning speed at training time.

n_hidden_features: int
    number of nodes in successive hidden layers.

row_sample: float
    percentage of rows chosen from the training set.

col_sample: float
    percentage of columns chosen from the training set.

dropout: float
    percentage of nodes dropped from the training set.

tolerance: float
    controls early stopping in gradient descent (at training time).

direct_link: bool
    indicates whether the original features are included (True) in model's
    fitting or not (False).

verbose: int
    progress bar (yes = 1) or not (no = 0) (currently).

seed: int
    reproducibility seed for nodes_sim=='uniform', clustering and dropout.

activation: str
    activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'

n_clusters: int
    number of clusters for clustering the features

clustering_method: str
    clustering method: currently 'kmeans', 'gmm'

cluster_scaling: str
    scaling method for clustering: currently 'standard', 'robust', 'minmax'

degree: int
    degree of features interactions to include in the model

weights_distr: str
    distribution of weights for constructing the model's hidden layer;
    currently 'uniform', 'gaussian'

hist: bool
    indicates whether histogram features are used or not (default is False)

bins: int or str
    number of bins for histogram features (same as numpy.histogram, default is 'auto')
class GenericBoostingRegressor(mlsauce.LSBoostRegressor):
class GenericBoostingRegressor(LSBoostRegressor):
    """Generic Boosting regressor.

    Thin subclass of `LSBoostRegressor`: it stores `base_model`, `hist`
    and `bins` on the instance and forwards every other setting (plus
    `base_model`) to the parent constructor.

    Attributes:

        base_model: object
            base learner (default is ExtraTreeRegressor) to be boosted.

        n_estimators: int
            number of boosting iterations.

        learning_rate: float
            controls the learning speed at training time.

        n_hidden_features: int
            number of nodes in successive hidden layers.

        row_sample: float
            percentage of rows chosen from the training set.

        col_sample: float
            percentage of columns chosen from the training set.

        dropout: float
            percentage of nodes dropped from the training set.

        tolerance: float
            controls early stopping in gradient descent (at training time).

        direct_link: bool
            indicates whether the original features are included (True) in
            model's fitting or not (False).

        verbose: int
            progress bar (yes = 1) or not (no = 0) (currently).

        backend: str
            forwarded unchanged to `LSBoostRegressor` (default is "cpu").

        seed: int
            reproducibility seed for nodes_sim=='uniform', clustering and
            dropout.

        activation: str
            activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'

        type_pi: str.
            type of prediction interval; currently "kde" (default) or
            "bootstrap". Used only in `self.predict`, for
            `self.replications` > 0 and `self.kernel` in
            ('gaussian', 'tophat'). Default is `None`.

        replications: int.
            number of replications (if needed) for predictive simulation.
            Used only in `self.predict`, for `self.kernel` in ('gaussian',
            'tophat') and `self.type_pi = 'kde'`. Default is `None`.

        kernel: str
            forwarded unchanged to `LSBoostRegressor`; see `type_pi` and
            `replications` for how it is referred to. Default is `None`.

        n_clusters: int
            number of clusters for clustering the features

        clustering_method: str
            clustering method: currently 'kmeans', 'gmm'

        cluster_scaling: str
            scaling method for clustering: currently 'standard', 'robust',
            'minmax'

        degree: int
            degree of features interactions to include in the model

        weights_distr: str
            distribution of weights for constructing the model's hidden layer;
            either 'uniform' or 'gaussian'

        hist: bool
            whether to use histogram features or not

        bins: int or str
            number of bins for histogram features (same as numpy.histogram,
            default is 'auto')

    """

    def __init__(
        self,
        base_model=ExtraTreeRegressor(),
        n_estimators=100,
        learning_rate=0.1,
        n_hidden_features=5,
        row_sample=1,
        col_sample=1,
        dropout=0,
        tolerance=1e-4,
        direct_link=1,
        verbose=1,
        backend="cpu",
        seed=123,
        activation="relu",
        type_pi=None,
        replications=None,
        kernel=None,
        n_clusters=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        degree=None,
        weights_distr="uniform",
        hist=False,
        bins="auto",
    ):
        # Settings kept by this subclass are stored before delegating.
        # NOTE(review): the default `base_model` instance is created once,
        # when the class is defined, and is therefore shared by every
        # estimator built without an explicit `base_model`.
        self.base_model = base_model
        self.hist = hist
        self.bins = bins
        self.hist_bins_ = None

        # Everything else is handled by the parent constructor.
        parent_settings = dict(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            n_hidden_features=n_hidden_features,
            row_sample=row_sample,
            col_sample=col_sample,
            dropout=dropout,
            tolerance=tolerance,
            direct_link=direct_link,
            verbose=verbose,
            backend=backend,
            seed=seed,
            activation=activation,
            type_pi=type_pi,
            replications=replications,
            kernel=kernel,
            n_clusters=n_clusters,
            clustering_method=clustering_method,
            cluster_scaling=cluster_scaling,
            degree=degree,
            weights_distr=weights_distr,
            base_model=self.base_model,
        )
        super().__init__(**parent_settings)

Generic Boosting regressor.

Attributes:

base_model: object
    base learner (default is ExtraTreeRegressor) to be boosted.

n_estimators: int
    number of boosting iterations.

learning_rate: float
    controls the learning speed at training time.

n_hidden_features: int
    number of nodes in successive hidden layers.

row_sample: float
    percentage of rows chosen from the training set.

col_sample: float
    percentage of columns chosen from the training set.

dropout: float
    percentage of nodes dropped from the training set.

tolerance: float
    controls early stopping in gradient descent (at training time).

direct_link: bool
    indicates whether the original features are included (True) in model's
    fitting or not (False).

verbose: int
    progress bar (yes = 1) or not (no = 0) (currently).

seed: int
    reproducibility seed for nodes_sim=='uniform', clustering and dropout.

activation: str
    activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'

type_pi: str.
    type of prediction interval; currently "kde" (default) or "bootstrap".
    Used only in `self.predict`, for `self.replications` > 0 and `self.kernel`
    in ('gaussian', 'tophat'). Default is `None`.

replications: int.
    number of replications (if needed) for predictive simulation.
    Used only in `self.predict`, for `self.kernel` in ('gaussian',
    'tophat') and `self.type_pi = 'kde'`. Default is `None`.

n_clusters: int
    number of clusters for clustering the features

clustering_method: str
    clustering method: currently 'kmeans', 'gmm'

cluster_scaling: str
    scaling method for clustering: currently 'standard', 'robust', 'minmax'

degree: int
    degree of features interactions to include in the model

weights_distr: str
    distribution of weights for constructing the model's hidden layer;
    either 'uniform' or 'gaussian'

hist: bool
    whether to use histogram features or not

bins: int or str
    number of bins for histogram features (same as numpy.histogram, default is 'auto')
class StumpClassifier(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
 12class StumpClassifier(BaseEstimator, ClassifierMixin):
 13    """Stump classifier.
 14
 15    Attributes:
 16
 17        bins: int
 18            Number of histogram bins; as in numpy.histogram.
 19    """
 20
 21    def __init__(self, bins="auto"):
 22        self.bins = bins
 23        self.obj = None
 24
 25    def fit(self, X, y, sample_weight=None, **kwargs):
 26        """Fit Stump to training data (X, y)
 27
 28        Args:
 29
 30            X: {array-like}, shape = [n_samples, n_features]
 31                Training vectors, where n_samples is the number
 32                of samples and n_features is the number of features.
 33
 34            y: array-like, shape = [n_samples]
 35                Target values.
 36
 37            sample_weight: array_like, shape = [n_samples]
 38                Observations weights.
 39
 40        Returns:
 41
 42            self: object.
 43        """
 44
 45        if sample_weight is None:
 46            self.obj = stumpc.fit_stump_classifier(
 47                X=np.asarray(X, order="C"),
 48                y=np.asarray(y, order="C"),
 49                bins=self.bins,
 50            )
 51
 52            return self
 53
 54        self.obj = stumpc.fit_stump_classifier(
 55            X=np.asarray(X, order="C"),
 56            y=np.asarray(y, order="C"),
 57            sample_weight=np.ravel(sample_weight, order="C"),
 58            bins=self.bins,
 59        )
 60        self.n_classes_ = len(np.unique(y))  # for compatibility with sklearn
 61        return self
 62
 63    def predict(self, X, **kwargs):
 64        """Predict test data X.
 65
 66        Args:
 67
 68            X: {array-like}, shape = [n_samples, n_features]
 69                Training vectors, where n_samples is the number
 70                of samples and n_features is the number of features.
 71
 72            **kwargs: additional parameters to be passed to `predict_proba`
 73
 74
 75        Returns:
 76
 77            model predictions: {array-like}
 78        """
 79
 80        return np.argmax(self.predict_proba(X, **kwargs), axis=1)
 81
 82    def predict_proba(self, X, **kwargs):
 83        """Predict probabilities for test data X.
 84
 85        Args:
 86
 87            X: {array-like}, shape = [n_samples, n_features]
 88                Training vectors, where n_samples is the number
 89                of samples and n_features is the number of features.
 90
 91            **kwargs: additional parameters to be passed to
 92                self.cook_test_set
 93
 94        Returns:
 95
 96            probability estimates for test data: {array-like}
 97        """
 98
 99        return stumpc.predict_proba_stump_classifier(
100            self.obj, np.asarray(X, order="C")
101        )

Stump classifier.

Attributes:

bins: int or str
    Number of histogram bins; as in numpy.histogram (default is 'auto').
def fit(self, X, y, sample_weight=None, **kwargs):
25    def fit(self, X, y, sample_weight=None, **kwargs):
26        """Fit Stump to training data (X, y)
27
28        Args:
29
30            X: {array-like}, shape = [n_samples, n_features]
31                Training vectors, where n_samples is the number
32                of samples and n_features is the number of features.
33
34            y: array-like, shape = [n_samples]
35                Target values.
36
37            sample_weight: array_like, shape = [n_samples]
38                Observations weights.
39
40        Returns:
41
42            self: object.
43        """
44
45        if sample_weight is None:
46            self.obj = stumpc.fit_stump_classifier(
47                X=np.asarray(X, order="C"),
48                y=np.asarray(y, order="C"),
49                bins=self.bins,
50            )
51
52            return self
53
54        self.obj = stumpc.fit_stump_classifier(
55            X=np.asarray(X, order="C"),
56            y=np.asarray(y, order="C"),
57            sample_weight=np.ravel(sample_weight, order="C"),
58            bins=self.bins,
59        )
60        self.n_classes_ = len(np.unique(y))  # for compatibility with sklearn
61        return self

Fit Stump to training data (X, y)

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

y: array-like, shape = [n_samples]
    Target values.

sample_weight: array_like, shape = [n_samples]
    Observations weights.

Returns:

self: object.
def predict(self, X, **kwargs):
63    def predict(self, X, **kwargs):
64        """Predict test data X.
65
66        Args:
67
68            X: {array-like}, shape = [n_samples, n_features]
69                Training vectors, where n_samples is the number
70                of samples and n_features is the number of features.
71
72            **kwargs: additional parameters to be passed to `predict_proba`
73
74
75        Returns:
76
77            model predictions: {array-like}
78        """
79
80        return np.argmax(self.predict_proba(X, **kwargs), axis=1)

Predict test data X.

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

**kwargs: additional parameters to be passed to `predict_proba`

Returns:

model predictions: {array-like}
def predict_proba(self, X, **kwargs):
 82    def predict_proba(self, X, **kwargs):
 83        """Predict probabilities for test data X.
 84
 85        Args:
 86
 87            X: {array-like}, shape = [n_samples, n_features]
 88                Training vectors, where n_samples is the number
 89                of samples and n_features is the number of features.
 90
 91            **kwargs: additional parameters to be passed to
 92                self.cook_test_set
 93
 94        Returns:
 95
 96            probability estimates for test data: {array-like}
 97        """
 98
 99        return stumpc.predict_proba_stump_classifier(
100            self.obj, np.asarray(X, order="C")
101        )

Predict probabilities for test data X.

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

**kwargs: additional parameters (accepted for API compatibility;
    not used by this method)

Returns:

probability estimates for test data: {array-like}
class ElasticNetRegressor(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
 19class ElasticNetRegressor(BaseEstimator, RegressorMixin):
 20    """Elasticnet.
 21
 22    Attributes:
 23
 24        reg_lambda: float
 25            regularization parameter.
 26
 27        alpha: float
 28            compromise between L1 and L2 regularization (must be in [0, 1]),
 29            for `solver` == 'enet'.
 30
 31        backend: str
 32            type of backend; must be in ('cpu', 'gpu', 'tpu')
 33
 34    """
 35
 36    def __init__(self, reg_lambda=0.1, alpha=0.5, backend="cpu"):
 37        assert backend in (
 38            "cpu",
 39            "gpu",
 40            "tpu",
 41        ), "`backend` must be in ('cpu', 'gpu', 'tpu')"
 42
 43        sys_platform = platform.system()
 44
 45        if (sys_platform == "Windows") and (backend in ("gpu", "tpu")):
 46            warnings.warn(
 47                "No GPU/TPU computing on Windows yet, backend set to 'cpu'"
 48            )
 49            backend = "cpu"
 50
 51        self.reg_lambda = reg_lambda
 52        self.alpha = alpha
 53        self.backend = backend
 54        if self.backend in ("gpu", "tpu"):
 55            check_and_install("jax")
 56            check_and_install("jaxlib")
 57
 58    def fit(self, X, y, **kwargs):
 59        """Fit matrixops (classifier) to training data (X, y)
 60
 61        Args:
 62
 63            X: {array-like}, shape = [n_samples, n_features]
 64                Training vectors, where n_samples is the number
 65                of samples and n_features is the number of features.
 66
 67            y: array-like, shape = [n_samples]
 68                Target values.
 69
 70            **kwargs: additional parameters to be passed to self.cook_training_set.
 71
 72        Returns:
 73
 74            self: object.
 75
 76        """
 77        fit_result = fit_elasticnet(
 78            X, y, lam=self.reg_lambda, alpha=self.alpha, backend=self.backend
 79        )
 80        self.coef_ = fit_result.coef_
 81        self.y_train_mean = fit_result.y_train_mean
 82        self.scaler = fit_result.scaler
 83        self.converged = fit_result.converged
 84        return self
 85
 86    def predict(self, X, **kwargs):
 87        """Predict test data X.
 88
 89        Args:
 90
 91            X: {array-like}, shape = [n_samples, n_features]
 92                Training vectors, where n_samples is the number
 93                of samples and n_features is the number of features.
 94
 95            **kwargs: additional parameters to be passed to `predict_proba`
 96
 97        Returns:
 98
 99            model predictions: {array-like}
100
101        """
102        return predict_elasticnet(X, self, backend=self.backend)

Elasticnet.

Attributes:

reg_lambda: float
    regularization parameter.

alpha: float
    compromise between L1 and L2 regularization (must be in [0, 1]),
    for `solver` == 'enet'.

backend: str
    type of backend; must be in ('cpu', 'gpu', 'tpu')
def fit(self, X, y, **kwargs):
58    def fit(self, X, y, **kwargs):
59        """Fit matrixops (classifier) to training data (X, y)
60
61        Args:
62
63            X: {array-like}, shape = [n_samples, n_features]
64                Training vectors, where n_samples is the number
65                of samples and n_features is the number of features.
66
67            y: array-like, shape = [n_samples]
68                Target values.
69
70            **kwargs: additional parameters to be passed to self.cook_training_set.
71
72        Returns:
73
74            self: object.
75
76        """
77        fit_result = fit_elasticnet(
78            X, y, lam=self.reg_lambda, alpha=self.alpha, backend=self.backend
79        )
80        self.coef_ = fit_result.coef_
81        self.y_train_mean = fit_result.y_train_mean
82        self.scaler = fit_result.scaler
83        self.converged = fit_result.converged
84        return self

Fit the elastic net regressor to training data (X, y)

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

y: array-like, shape = [n_samples]
    Target values.

**kwargs: additional parameters to be passed to self.cook_training_set.

Returns:

self: object.
def predict(self, X, **kwargs):
 86    def predict(self, X, **kwargs):
 87        """Predict test data X.
 88
 89        Args:
 90
 91            X: {array-like}, shape = [n_samples, n_features]
 92                Training vectors, where n_samples is the number
 93                of samples and n_features is the number of features.
 94
 95            **kwargs: additional parameters to be passed to `predict_proba`
 96
 97        Returns:
 98
 99            model predictions: {array-like}
100
101        """
102        return predict_elasticnet(X, self, backend=self.backend)

Predict test data X.

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

**kwargs: additional parameters (accepted for API compatibility; not used by this method)

Returns:

model predictions: {array-like}
class KRLSRegressor(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
class KRLSRegressor(BaseEstimator, RegressorMixin):
    """Kernel-based regularized least squares regressor.

    Attributes:

        regularization: float
            value multiplied by the identity matrix and added to the
            kernel matrix before solving for the coefficients.

        kernel: callable
            function of two vectors `(x, y)`; when None, a default based
            on the Euclidean distance between `x` and `y` is used.

        backend: str
            "cpu" uses numpy; any other value uses jax.numpy.
    """

    def __init__(self, regularization=0.1, kernel=None, backend="cpu"):

        if kernel is None:
            if backend != "cpu":

                def kernel(x, y):
                    # NOTE(review): the values returned by `device_put`
                    # are discarded here.
                    device_put(x)
                    device_put(y)
                    diff = x - y
                    return jnp.sqrt(jnp.sum(jnp.square(diff)))

            else:

                def kernel(x, y):
                    diff = x - y
                    return np.sqrt(np.sum(np.square(diff)))

        self.backend = backend
        self.kernel = kernel
        self.regularization = regularization
        # State filled in by `fit`.
        self.ym_ = None
        self.scaler_ = StandardScaler()
        self.X_ = None
        self.coef_ = None

    def fit(self, X, y):
        """Fit the model to training data (X, y).

        The response is centered, X is standardized with `self.scaler_`,
        and the coefficients solve the linear system formed by the kernel
        matrix plus `regularization` times the identity.

        Returns:

            self: object.
        """
        self.ym_ = np.mean(y)
        y_centered = y - self.ym_
        X_scaled = self.scaler_.fit_transform(X)
        n_obs = X_scaled.shape[0]

        on_cpu = self.backend == "cpu"
        if not on_cpu:
            device_put(X_scaled)
            device_put(y_centered)
        # numpy on CPU, jax.numpy otherwise (only looked up when needed).
        xp = np if on_cpu else jnp

        gram = compute_kernel_matrix(X_scaled, self.kernel)
        K = gram + self.regularization * xp.eye(n_obs)
        self.coef_ = xp.linalg.solve(K, y_centered)
        self.X_ = X_scaled
        return self

    def predict(self, X):
        """Predict test data X.

        X is standardized with the fitted scaler; the kernel matrix between
        the stored training data and X is multiplied by the coefficients,
        and the training response mean is added back.
        """
        X_scaled = self.scaler_.transform(X)
        if self.backend != "cpu":
            for value in (X_scaled, self.X_, self.coef_, self.ym_):
                device_put(value)
        cross_kernel = compute_kernel_matrix(self.X_, self.kernel, X_scaled)
        return cross_kernel @ self.coef_ + self.ym_

Base class for all estimators in scikit-learn.

Inheriting from this class provides default implementations of:

  • setting and getting parameters used by GridSearchCV and friends;
  • textual and HTML representation displayed in terminals and IDEs;
  • estimator serialization;
  • parameters validation;
  • data validation;
  • feature names validation.

Read more in the :ref:`User Guide <rolling_your_own_estimator>`.

Notes

All estimators should specify all the parameters that can be set at the class level in their __init__ as explicit keyword arguments (no *args or **kwargs).

Examples

>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
...     def __init__(self, *, param=1):
...         self.param = param
...     def fit(self, X, y=None):
...         self.is_fitted_ = True
...         return self
...     def predict(self, X):
...         return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])
def fit(self, X, y):
61    def fit(self, X, y):
62        self.ym_ = np.mean(y)
63        centered_y = y - self.ym_
64        X_ = self.scaler_.fit_transform(X)
65        if self.backend == "cpu":
66            K = compute_kernel_matrix(
67                X_, self.kernel
68            ) + self.regularization * np.eye(X_.shape[0])
69            self.coef_ = np.linalg.solve(K, centered_y)
70        else:
71            device_put(X_)
72            device_put(centered_y)
73            K = compute_kernel_matrix(
74                X_, self.kernel
75            ) + self.regularization * jnp.eye(X_.shape[0])
76            self.coef_ = jnp.linalg.solve(K, centered_y)
77        self.X_ = X_
78        return self
def predict(self, X):
80    def predict(self, X):
81        X_ = self.scaler_.transform(X)
82        if self.backend != "cpu":
83            device_put(X_)
84            device_put(self.X_)
85            device_put(self.coef_)
86            device_put(self.ym_)
87        return (
88            compute_kernel_matrix(self.X_, self.kernel, X_) @ self.coef_
89            + self.ym_
90        )
class LassoRegressor(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
 24class LassoRegressor(BaseEstimator, RegressorMixin):
 25    """Lasso.
 26
 27    Attributes:
 28
 29        reg_lambda: float
 30            L1 regularization parameter.
 31
 32        max_iter: int
 33            number of iterations of lasso shooting algorithm.
 34
 35        tol: float
 36            tolerance for convergence of lasso shooting algorithm.
 37
 38        backend: str
 39            type of backend; must be in ('cpu', 'gpu', 'tpu').
 40
 41    """
 42
 43    def __init__(self, reg_lambda=0.1, max_iter=10, tol=1e-3, backend="cpu"):
 44        assert backend in (
 45            "cpu",
 46            "gpu",
 47            "tpu",
 48        ), "`backend` must be in ('cpu', 'gpu', 'tpu')"
 49
 50        sys_platform = platform.system()
 51
 52        if (sys_platform == "Windows") and (backend in ("gpu", "tpu")):
 53            warnings.warn(
 54                "No GPU/TPU computing on Windows yet, backend set to 'cpu'"
 55            )
 56            backend = "cpu"
 57
 58        self.reg_lambda = reg_lambda
 59        self.max_iter = max_iter
 60        self.tol = tol
 61        self.backend = backend
 62        if self.backend in ("gpu", "tpu"):
 63            check_and_install("jax")
 64            check_and_install("jaxlib")
 65
 66    def fit(self, X, y, **kwargs):
 67        """Fit matrixops (classifier) to training data (X, y)
 68
 69        Args:
 70
 71            X: {array-like}, shape = [n_samples, n_features]
 72                Training vectors, where n_samples is the number
 73                of samples and n_features is the number of features.
 74
 75            y: array-like, shape = [n_samples]
 76                Target values.
 77
 78            **kwargs: additional parameters to be passed to self.cook_training_set.
 79
 80        Returns:
 81
 82            self: object.
 83
 84        """
 85
 86        self.ym, centered_y = mo.center_response(y)
 87        self.xm = X.mean(axis=0)
 88        self.xsd = X.std(axis=0)
 89        self.xsd[self.xsd == 0] = 1
 90        X_ = (X - self.xm[None, :]) / self.xsd[None, :]
 91        XX = mo.crossprod(X_, backend=self.backend)
 92        Xy = mo.crossprod(X_, centered_y, backend=self.backend)
 93        XX2 = 2 * XX
 94        Xy2 = 2 * Xy
 95
 96        if self.backend == "cpu":
 97            # beta0, _, _, _ = np.linalg.lstsq(X_, centered_y, rcond=None)
 98            beta0 = get_beta(X_, centered_y)
 99            if len(np.asarray(y).shape) == 1:
100                res = mo.get_beta_1D(
101                    beta0=np.asarray(beta0),
102                    XX2=np.asarray(XX2),
103                    Xy2=np.asarray(Xy2),
104                    reg_lambda=self.reg_lambda,
105                    max_iter=self.max_iter,
106                    tol=self.tol,
107                )
108                self.beta = res[0]
109                return self
110
111            res = mo.get_beta_2D(
112                beta0=np.asarray(beta0),
113                XX2=np.asarray(XX2),
114                Xy2=np.asarray(Xy2),
115                reg_lambda=self.reg_lambda,
116                max_iter=self.max_iter,
117                tol=self.tol,
118            )
119            self.beta = res[0]
120            return self
121
122        invXX = jinv(XX + self.reg_lambda * jnp.eye(X_.shape[1]))
123        beta0 = mo.safe_sparse_dot(invXX, Xy, backend=self.backend)
124        if len(np.asarray(y).shape) == 1:
125            res = mo.get_beta_1D(
126                beta0=np.asarray(beta0),
127                XX2=np.asarray(XX2),
128                Xy2=np.asarray(Xy2),
129                reg_lambda=self.reg_lambda,
130                max_iter=self.max_iter,
131                tol=self.tol,
132            )
133            self.beta = res[0]
134            return self
135
136        res = mo.get_beta_2D(
137            beta0=np.asarray(beta0),
138            XX2=np.asarray(XX2),
139            Xy2=np.asarray(Xy2),
140            reg_lambda=self.reg_lambda,
141            max_iter=self.max_iter,
142            tol=self.tol,
143        )
144        self.beta = res[0]
145        return self
146
147    def predict(self, X, **kwargs):
148        """Predict test data X.
149
150        Args:
151
152            X: {array-like}, shape = [n_samples, n_features]
153                Training vectors, where n_samples is the number
154                of samples and n_features is the number of features.
155
156            **kwargs: additional parameters to be passed to `predict_proba`
157
158
159        Returns:
160
161            model predictions: {array-like}
162
163        """
164        X_ = (X - self.xm[None, :]) / self.xsd[None, :]
165
166        if self.backend == "cpu":
167            if isinstance(self.ym, float):
168                return self.ym + mo.safe_sparse_dot(X_, self.beta)
169            return self.ym[None, :] + mo.safe_sparse_dot(X_, self.beta)
170
171        # if self.backend in ("gpu", "tpu"):
172        if isinstance(self.ym, float):
173            return self.ym + mo.safe_sparse_dot(
174                X_, self.beta, backend=self.backend
175            )
176        return self.ym[None, :] + mo.safe_sparse_dot(
177            X_, self.beta, backend=self.backend
178        )

Lasso.

Attributes:

reg_lambda: float
    L1 regularization parameter.

max_iter: int
    number of iterations of lasso shooting algorithm.

tol: float
    tolerance for convergence of lasso shooting algorithm.

backend: str
    type of backend; must be in ('cpu', 'gpu', 'tpu').
def fit(self, X, y, **kwargs):
 66    def fit(self, X, y, **kwargs):
 67        """Fit matrixops (classifier) to training data (X, y)
 68
 69        Args:
 70
 71            X: {array-like}, shape = [n_samples, n_features]
 72                Training vectors, where n_samples is the number
 73                of samples and n_features is the number of features.
 74
 75            y: array-like, shape = [n_samples]
 76                Target values.
 77
 78            **kwargs: additional parameters to be passed to self.cook_training_set.
 79
 80        Returns:
 81
 82            self: object.
 83
 84        """
 85
 86        self.ym, centered_y = mo.center_response(y)
 87        self.xm = X.mean(axis=0)
 88        self.xsd = X.std(axis=0)
 89        self.xsd[self.xsd == 0] = 1
 90        X_ = (X - self.xm[None, :]) / self.xsd[None, :]
 91        XX = mo.crossprod(X_, backend=self.backend)
 92        Xy = mo.crossprod(X_, centered_y, backend=self.backend)
 93        XX2 = 2 * XX
 94        Xy2 = 2 * Xy
 95
 96        if self.backend == "cpu":
 97            # beta0, _, _, _ = np.linalg.lstsq(X_, centered_y, rcond=None)
 98            beta0 = get_beta(X_, centered_y)
 99            if len(np.asarray(y).shape) == 1:
100                res = mo.get_beta_1D(
101                    beta0=np.asarray(beta0),
102                    XX2=np.asarray(XX2),
103                    Xy2=np.asarray(Xy2),
104                    reg_lambda=self.reg_lambda,
105                    max_iter=self.max_iter,
106                    tol=self.tol,
107                )
108                self.beta = res[0]
109                return self
110
111            res = mo.get_beta_2D(
112                beta0=np.asarray(beta0),
113                XX2=np.asarray(XX2),
114                Xy2=np.asarray(Xy2),
115                reg_lambda=self.reg_lambda,
116                max_iter=self.max_iter,
117                tol=self.tol,
118            )
119            self.beta = res[0]
120            return self
121
122        invXX = jinv(XX + self.reg_lambda * jnp.eye(X_.shape[1]))
123        beta0 = mo.safe_sparse_dot(invXX, Xy, backend=self.backend)
124        if len(np.asarray(y).shape) == 1:
125            res = mo.get_beta_1D(
126                beta0=np.asarray(beta0),
127                XX2=np.asarray(XX2),
128                Xy2=np.asarray(Xy2),
129                reg_lambda=self.reg_lambda,
130                max_iter=self.max_iter,
131                tol=self.tol,
132            )
133            self.beta = res[0]
134            return self
135
136        res = mo.get_beta_2D(
137            beta0=np.asarray(beta0),
138            XX2=np.asarray(XX2),
139            Xy2=np.asarray(Xy2),
140            reg_lambda=self.reg_lambda,
141            max_iter=self.max_iter,
142            tol=self.tol,
143        )
144        self.beta = res[0]
145        return self

Fit the Lasso regressor to training data (X, y)

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

y: array-like, shape = [n_samples]
    Target values.

**kwargs: additional parameters to be passed to self.cook_training_set.

Returns:

self: object.
def predict(self, X, **kwargs):
147    def predict(self, X, **kwargs):
148        """Predict test data X.
149
150        Args:
151
152            X: {array-like}, shape = [n_samples, n_features]
153                Training vectors, where n_samples is the number
154                of samples and n_features is the number of features.
155
156            **kwargs: additional parameters to be passed to `predict_proba`
157
158
159        Returns:
160
161            model predictions: {array-like}
162
163        """
164        X_ = (X - self.xm[None, :]) / self.xsd[None, :]
165
166        if self.backend == "cpu":
167            if isinstance(self.ym, float):
168                return self.ym + mo.safe_sparse_dot(X_, self.beta)
169            return self.ym[None, :] + mo.safe_sparse_dot(X_, self.beta)
170
171        # if self.backend in ("gpu", "tpu"):
172        if isinstance(self.ym, float):
173            return self.ym + mo.safe_sparse_dot(
174                X_, self.beta, backend=self.backend
175            )
176        return self.ym[None, :] + mo.safe_sparse_dot(
177            X_, self.beta, backend=self.backend
178        )

Predict test data X.

Args:

X: {array-like}, shape = [n_samples, n_features]
    Test vectors, where n_samples is the number
    of samples and n_features is the number of features.

**kwargs: additional parameters to be passed to `predict_proba`

Returns:

model predictions: {array-like}
class LSBoostRegressor(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
 19class LSBoostRegressor(BaseEstimator, RegressorMixin):
 20    """LSBoost regressor.
 21
 22    Attributes:
 23
 24        n_estimators: int
 25            number of boosting iterations.
 26
 27        learning_rate: float
 28            controls the learning speed at training time.
 29
 30        n_hidden_features: int
 31            number of nodes in successive hidden layers.
 32
 33        reg_lambda: float
 34            L2 regularization parameter for successive errors in the optimizer
 35            (at training time).
 36
 37        alpha: float
 38            compromise between L1 and L2 regularization (must be in [0, 1]),
 39            for `solver` == 'enet'
 40
 41        row_sample: float
 42            percentage of rows chosen from the training set.
 43
 44        col_sample: float
 45            percentage of columns chosen from the training set.
 46
 47        dropout: float
 48            percentage of nodes dropped from the training set.
 49
 50        tolerance: float
 51            controls early stopping in gradient descent (at training time).
 52
 53        direct_link: bool
 54            indicates whether the original features are included (True) in model's
 55            fitting or not (False).
 56
 57        verbose: int
 58            progress bar (yes = 1) or not (no = 0) (currently).
 59
 60        seed: int
 61            reproducibility seed for nodes_sim=='uniform', clustering and dropout.
 62
 63        backend: str
 64            type of backend; must be in ('cpu', 'gpu', 'tpu')
 65
 66        solver: str
 67            type of 'weak' learner; currently in ('ridge', 'lasso')
 68
 69        activation: str
 70            activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
 71
 72        type_pi: str.
 73            type of prediction interval; currently "kde" (default) or "bootstrap".
 74            Used only in `self.predict`, for `self.replications` > 0 and `self.kernel`
 75            in ('gaussian', 'tophat'). Default is `None`.
 76
 77        replications: int.
 78            number of replications (if needed) for predictive simulation.
 79            Used only in `self.predict`, for `self.kernel` in ('gaussian',
 80            'tophat') and `self.type_pi = 'kde'`. Default is `None`.
 81
 82        n_clusters: int
 83            number of clusters for clustering the features
 84
 85        clustering_method: str
 86            clustering method: currently 'kmeans', 'gmm'
 87
 88        cluster_scaling: str
 89            scaling method for clustering: currently 'standard', 'robust', 'minmax'
 90
 91        degree: int
 92            degree of features interactions to include in the model
 93
 94        weights_distr: str
 95            distribution of weights for constructing the model's hidden layer;
 96            either 'uniform' or 'gaussian'
 97
 98        hist: bool
 99            whether to use histogram features or not
100
101        bins: int or str
102            number of bins for histogram features (same as numpy.histogram, default is 'auto')
103
104    Examples:
105
106        ```python
107        import subprocess
108        import sys
109        import os
110
111        import mlsauce as ms
112        import numpy as np
113        import matplotlib.pyplot as plt
114        from sklearn.datasets import load_diabetes
115        from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
116        from sklearn.tree import DecisionTreeRegressor
117        from time import time
118        from os import chdir
119        from sklearn import metrics
120
121        regr = DecisionTreeRegressor()
122
123        diabetes = load_diabetes()
124        X = diabetes.data
125        y = diabetes.target
126        # split data into training test and test set
127        np.random.seed(15029)
128        X_train, X_test, y_train, y_test = train_test_split(X, y,
129                                                            test_size=0.2)
130
131        obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9)
132        print(obj.get_params())
133        start = time()
134        obj.fit(X_train, y_train)
135        print(time()-start)
136        start = time()
137        print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test))))
138        print(time()-start)
139
140        print(obj.obj['loss'])
141
142        obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9, n_clusters=2)
143        print(obj.get_params())
144        start = time()
145        obj.fit(X_train, y_train)
146        print(time()-start)
147        start = time()
148        print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test))))
149        print(time()-start)
150
151        print(obj.obj['loss'])
152        ```
153
154    """
155
156    def __init__(
157        self,
158        n_estimators=100,
159        learning_rate=0.1,
160        n_hidden_features=5,
161        reg_lambda=0.1,
162        alpha=0.5,
163        row_sample=1,
164        col_sample=1,
165        dropout=0,
166        tolerance=1e-4,
167        direct_link=1,
168        verbose=1,
169        seed=123,
170        backend="cpu",
171        solver="ridge",
172        activation="relu",
173        type_pi=None,
174        replications=None,
175        kernel=None,
176        n_clusters=0,
177        clustering_method="kmeans",
178        cluster_scaling="standard",
179        degree=None,
180        weights_distr="uniform",
181        base_model=None,
182        hist=False,
183        bins="auto",
184    ):
185
186        self.base_model = base_model
187        self.hist = hist
188        self.bins = bins
189        self.hist_bins_ = None
190
191        if n_clusters > 0:
192            assert clustering_method in (
193                "kmeans",
194                "gmm",
195            ), "`clustering_method` must be in ('kmeans', 'gmm')"
196            assert cluster_scaling in (
197                "standard",
198                "robust",
199                "minmax",
200            ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')"
201
202        assert backend in (
203            "cpu",
204            "gpu",
205            "tpu",
206        ), "`backend` must be in ('cpu', 'gpu', 'tpu')"
207
208        assert solver in (
209            "ridge",
210            "lasso",
211            "enet",
212        ), "`solver` must be in ('ridge', 'lasso', 'enet')"
213
214        sys_platform = platform.system()
215
216        if (sys_platform == "Windows") and (backend in ("gpu", "tpu")):
217            warnings.warn(
218                "No GPU/TPU computing on Windows yet, backend set to 'cpu'"
219            )
220            backend = "cpu"
221
222        self.n_estimators = n_estimators
223        self.learning_rate = learning_rate
224        self.n_hidden_features = n_hidden_features
225        self.reg_lambda = reg_lambda
226        assert alpha >= 0 and alpha <= 1, "`alpha` must be in [0, 1]"
227        self.alpha = alpha
228        self.row_sample = row_sample
229        self.col_sample = col_sample
230        self.dropout = dropout
231        self.tolerance = tolerance
232        self.direct_link = direct_link
233        self.verbose = verbose
234        self.seed = seed
235        self.backend = backend
236        self.obj = None
237        self.solver = solver
238        self.activation = activation
239        self.type_pi = type_pi
240        self.replications = replications
241        self.kernel = kernel
242        self.n_clusters = n_clusters
243        self.clustering_method = clustering_method
244        self.cluster_scaling = cluster_scaling
245        self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None
246        self.degree = degree
247        self.poly_ = None
248        self.weights_distr = weights_distr
249        if self.backend in ("gpu", "tpu"):
250            check_and_install("jax")
251            check_and_install("jaxlib")
252
253    def fit(self, X, y, **kwargs):
254        """Fit Booster (regressor) to training data (X, y)
255
256        Args:
257
258            X: {array-like}, shape = [n_samples, n_features]
259                Training vectors, where n_samples is the number
260                of samples and n_features is the number of features.
261
262            y: array-like, shape = [n_samples]
263               Target values.
264
265            **kwargs: additional parameters to be passed to self.cook_training_set.
266
267        Returns:
268
269            self: object.
270        """
271
272        if isinstance(X, pd.DataFrame):
273            X = X.values
274
275        if self.hist == True:
276            X, self.hist_bins_ = get_histo_features(X)
277
278        if isinstance(y, pd.Series):
279            y = y.values.ravel()
280        else:
281            y = np.asarray(y).ravel()
282
283        if self.degree is not None:
284            assert isinstance(self.degree, int), "`degree` must be an integer"
285            self.poly_ = PolynomialFeatures(
286                degree=self.degree, interaction_only=True, include_bias=False
287            )
288            X = self.poly_.fit_transform(X)
289
290        if self.n_clusters > 0:
291            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
292                cluster(
293                    X,
294                    n_clusters=self.n_clusters,
295                    method=self.clustering_method,
296                    type_scaling=self.cluster_scaling,
297                    training=True,
298                    seed=self.seed,
299                )
300            )
301            X = np.column_stack((X, clustered_X))
302
303        self.obj = boosterc.fit_booster_regressor(
304            X=np.asarray(X, order="C", dtype=np.float64),
305            y=np.asarray(y, order="C", dtype=np.float64),
306            n_estimators=self.n_estimators,
307            learning_rate=self.learning_rate,
308            n_hidden_features=self.n_hidden_features,
309            reg_lambda=self.reg_lambda,
310            alpha=self.alpha,
311            row_sample=self.row_sample,
312            col_sample=self.col_sample,
313            dropout=self.dropout,
314            tolerance=self.tolerance,
315            direct_link=self.direct_link,
316            verbose=self.verbose,
317            seed=self.seed,
318            backend=self.backend,
319            solver=self.solver,
320            activation=self.activation,
321            obj=self.base_model,
322        )
323
324        self.n_estimators = self.obj["n_estimators"]
325
326        self.X_ = X
327
328        self.y_ = y
329
330        return self
331
332    def predict(self, X, level=95, method=None, histo=False, **kwargs):
333        """Predict values for test data X.
334
335        Args:
336
337            X: {array-like}, shape = [n_samples, n_features]
338                Training vectors, where n_samples is the number
339                of samples and n_features is the number of features.
340
341            level: int
342                Level of confidence (default = 95)
343
344            method: str
345                `None`, or 'splitconformal', 'localconformal'
346                prediction (if you specify `return_pi = True`)
347
348            histo: bool
349                whether to use histogram features or not
350
351            **kwargs: additional parameters to be passed to
352                self.cook_test_set
353
354        Returns:
355
356            predicted values estimates for test data: {array-like}
357        """
358
359        if isinstance(X, pd.DataFrame):
360            X = X.values
361
362        if self.hist == True:
363            X = get_histo_features(X, bins=self.hist_bins_)
364
365        if self.degree is not None:
366            X = self.poly_.transform(X)
367
368        if self.n_clusters > 0:
369            X = np.column_stack(
370                (
371                    X,
372                    cluster(
373                        X,
374                        training=False,
375                        scaler=self.scaler_,
376                        label_encoder=self.label_encoder_,
377                        clusterer=self.clusterer_,
378                        seed=self.seed,
379                    ),
380                )
381            )
382        if "return_pi" in kwargs:
383            assert method in (
384                "splitconformal",
385                "localconformal",
386            ), "method must be in ('splitconformal', 'localconformal')"
387            self.pi = PredictionInterval(
388                obj=self,
389                method=method,
390                level=level,
391                type_pi=self.type_pi,
392                replications=self.replications,
393                kernel=self.kernel,
394            )
395            self.pi.fit(self.X_, self.y_)
396            self.X_ = None
397            self.y_ = None
398            preds = self.pi.predict(X, return_pi=True)
399            return preds
400        # print(f"\n in predict self: {self} \n")
401        # print(f"\n in predict self.obj: {self.obj} \n")
402        # try:
403        return boosterc.predict_booster_regressor(
404            self.obj,
405            np.asarray(X, order="C"),
406            backend=self.backend,
407        )
408        # except ValueError:
409        #    pass
410
411    def update(self, X, y, eta=0.9):
412        """Update model with new data.
413
414        Args:
415
416            X: {array-like}, shape = [n_samples=1, n_features]
417                Training vectors, where n_samples is the number
418                of samples and n_features is the number of features.
419
420            y: float = [n_samples=1]
421               Target value.
422
423            eta: float
424                Inverse power applied to number of observations
425                (defines a learning rate).
426
427        Returns:
428
429            self: object.
430        """
431
432        if isinstance(X, pd.DataFrame):
433            X = X.values
434
435        if self.degree is not None:
436            X = self.poly_.transform(X)
437
438        if self.n_clusters > 0:
439            X = np.column_stack(
440                (
441                    X,
442                    cluster(
443                        X,
444                        training=False,
445                        scaler=self.scaler_,
446                        label_encoder=self.label_encoder_,
447                        clusterer=self.clusterer_,
448                        seed=self.seed,
449                    ),
450                )
451            )
452
453        self.obj = boosterc.update_booster(
454            self.obj, np.asarray(X, order="C"), np.asarray(y, order="C"), eta
455        )
456
457        return self

LSBoost regressor.

Attributes:

n_estimators: int
    number of boosting iterations.

learning_rate: float
    controls the learning speed at training time.

n_hidden_features: int
    number of nodes in successive hidden layers.

reg_lambda: float
    L2 regularization parameter for successive errors in the optimizer
    (at training time).

alpha: float
    compromise between L1 and L2 regularization (must be in [0, 1]),
    for `solver` == 'enet'

row_sample: float
    percentage of rows chosen from the training set.

col_sample: float
    percentage of columns chosen from the training set.

dropout: float
    percentage of nodes dropped from the training set.

tolerance: float
    controls early stopping in gradient descent (at training time).

direct_link: bool
    indicates whether the original features are included (True) in model's
    fitting or not (False).

verbose: int
    progress bar (yes = 1) or not (no = 0) (currently).

seed: int
    reproducibility seed for nodes_sim=='uniform', clustering and dropout.

backend: str
    type of backend; must be in ('cpu', 'gpu', 'tpu')

solver: str
    type of 'weak' learner; must be in ('ridge', 'lasso', 'enet')

activation: str
    activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'

type_pi: str.
    type of prediction interval; currently "kde" (default) or "bootstrap".
    Used only in `self.predict`, for `self.replications` > 0 and `self.kernel`
    in ('gaussian', 'tophat'). Default is `None`.

replications: int.
    number of replications (if needed) for predictive simulation.
    Used only in `self.predict`, for `self.kernel` in ('gaussian',
    'tophat') and `self.type_pi = 'kde'`. Default is `None`.

n_clusters: int
    number of clusters for clustering the features

clustering_method: str
    clustering method: currently 'kmeans', 'gmm'

cluster_scaling: str
    scaling method for clustering: currently 'standard', 'robust', 'minmax'

degree: int
    degree of features interactions to include in the model

weights_distr: str
    distribution of weights for constructing the model's hidden layer;
    either 'uniform' or 'gaussian'

hist: bool
    whether to use histogram features or not

bins: int or str
    number of bins for histogram features (same as numpy.histogram, default is 'auto')

Examples:

import subprocess
import sys
import os

import mlsauce as ms
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from time import time
from os import chdir
from sklearn import metrics

regr = DecisionTreeRegressor()

diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target
# split data into training test and test set
np.random.seed(15029)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2)

obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test))))
print(time()-start)

print(obj.obj['loss'])

obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9, n_clusters=2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test))))
print(time()-start)

print(obj.obj['loss'])
def fit(self, X, y, **kwargs):
253    def fit(self, X, y, **kwargs):
254        """Fit Booster (regressor) to training data (X, y)
255
256        Args:
257
258            X: {array-like}, shape = [n_samples, n_features]
259                Training vectors, where n_samples is the number
260                of samples and n_features is the number of features.
261
262            y: array-like, shape = [n_samples]
263               Target values.
264
265            **kwargs: additional parameters to be passed to self.cook_training_set.
266
267        Returns:
268
269            self: object.
270        """
271
272        if isinstance(X, pd.DataFrame):
273            X = X.values
274
275        if self.hist == True:
276            X, self.hist_bins_ = get_histo_features(X)
277
278        if isinstance(y, pd.Series):
279            y = y.values.ravel()
280        else:
281            y = np.asarray(y).ravel()
282
283        if self.degree is not None:
284            assert isinstance(self.degree, int), "`degree` must be an integer"
285            self.poly_ = PolynomialFeatures(
286                degree=self.degree, interaction_only=True, include_bias=False
287            )
288            X = self.poly_.fit_transform(X)
289
290        if self.n_clusters > 0:
291            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
292                cluster(
293                    X,
294                    n_clusters=self.n_clusters,
295                    method=self.clustering_method,
296                    type_scaling=self.cluster_scaling,
297                    training=True,
298                    seed=self.seed,
299                )
300            )
301            X = np.column_stack((X, clustered_X))
302
303        self.obj = boosterc.fit_booster_regressor(
304            X=np.asarray(X, order="C", dtype=np.float64),
305            y=np.asarray(y, order="C", dtype=np.float64),
306            n_estimators=self.n_estimators,
307            learning_rate=self.learning_rate,
308            n_hidden_features=self.n_hidden_features,
309            reg_lambda=self.reg_lambda,
310            alpha=self.alpha,
311            row_sample=self.row_sample,
312            col_sample=self.col_sample,
313            dropout=self.dropout,
314            tolerance=self.tolerance,
315            direct_link=self.direct_link,
316            verbose=self.verbose,
317            seed=self.seed,
318            backend=self.backend,
319            solver=self.solver,
320            activation=self.activation,
321            obj=self.base_model,
322        )
323
324        self.n_estimators = self.obj["n_estimators"]
325
326        self.X_ = X
327
328        self.y_ = y
329
330        return self

Fit Booster (regressor) to training data (X, y)

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

y: array-like, shape = [n_samples]
   Target values.

**kwargs: additional parameters to be passed to self.cook_training_set.

Returns:

self: object.
def predict(self, X, level=95, method=None, histo=False, **kwargs):
332    def predict(self, X, level=95, method=None, histo=False, **kwargs):
333        """Predict values for test data X.
334
335        Args:
336
337            X: {array-like}, shape = [n_samples, n_features]
338                Training vectors, where n_samples is the number
339                of samples and n_features is the number of features.
340
341            level: int
342                Level of confidence (default = 95)
343
344            method: str
345                `None`, or 'splitconformal', 'localconformal'
346                prediction (if you specify `return_pi = True`)
347
348            histo: bool
349                whether to use histogram features or not
350
351            **kwargs: additional parameters to be passed to
352                self.cook_test_set
353
354        Returns:
355
356            predicted values estimates for test data: {array-like}
357        """
358
359        if isinstance(X, pd.DataFrame):
360            X = X.values
361
362        if self.hist == True:
363            X = get_histo_features(X, bins=self.hist_bins_)
364
365        if self.degree is not None:
366            X = self.poly_.transform(X)
367
368        if self.n_clusters > 0:
369            X = np.column_stack(
370                (
371                    X,
372                    cluster(
373                        X,
374                        training=False,
375                        scaler=self.scaler_,
376                        label_encoder=self.label_encoder_,
377                        clusterer=self.clusterer_,
378                        seed=self.seed,
379                    ),
380                )
381            )
382        if "return_pi" in kwargs:
383            assert method in (
384                "splitconformal",
385                "localconformal",
386            ), "method must be in ('splitconformal', 'localconformal')"
387            self.pi = PredictionInterval(
388                obj=self,
389                method=method,
390                level=level,
391                type_pi=self.type_pi,
392                replications=self.replications,
393                kernel=self.kernel,
394            )
395            self.pi.fit(self.X_, self.y_)
396            self.X_ = None
397            self.y_ = None
398            preds = self.pi.predict(X, return_pi=True)
399            return preds
400        # print(f"\n in predict self: {self} \n")
401        # print(f"\n in predict self.obj: {self.obj} \n")
402        # try:
403        return boosterc.predict_booster_regressor(
404            self.obj,
405            np.asarray(X, order="C"),
406            backend=self.backend,
407        )
408        # except ValueError:
409        #    pass

Predict values for test data X.

Args:

X: {array-like}, shape = [n_samples, n_features]
    Test vectors, where n_samples is the number
    of samples and n_features is the number of features.

level: int
    Level of confidence (default = 95)

method: str
    `None`, or 'splitconformal', 'localconformal'
    prediction (if you specify `return_pi = True`)

histo: bool
    whether to use histogram features or not

**kwargs: additional parameters to be passed to
    self.cook_test_set

Returns:

predicted values estimates for test data: {array-like}
LSTMRegressor
class RidgeRegressor(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
 23class RidgeRegressor(BaseEstimator, RegressorMixin):
 24    """Ridge.
 25
 26    Attributes:
 27
 28        reg_lambda: float
 29            regularization parameter.
 30
 31        backend: str
 32            type of backend; must be in ('cpu', 'gpu', 'tpu')
 33
 34    """
 35
 36    def __init__(self, reg_lambda=0.1, backend="cpu"):
 37        assert backend in (
 38            "cpu",
 39            "gpu",
 40            "tpu",
 41        ), "`backend` must be in ('cpu', 'gpu', 'tpu')"
 42
 43        sys_platform = platform.system()
 44
 45        if (sys_platform == "Windows") and (backend in ("gpu", "tpu")):
 46            warnings.warn(
 47                "No GPU/TPU computing on Windows yet, backend set to 'cpu'"
 48            )
 49            backend = "cpu"
 50
 51        self.reg_lambda = reg_lambda
 52        self.backend = backend
 53        if self.backend in ("gpu", "tpu"):
 54            check_and_install("jax")
 55            check_and_install("jaxlib")
 56
 57    def fit(self, X, y, **kwargs):
 58        """Fit matrixops (classifier) to training data (X, y)
 59
 60        Args:
 61
 62            X: {array-like}, shape = [n_samples, n_features]
 63                Training vectors, where n_samples is the number
 64                of samples and n_features is the number of features.
 65
 66            y: array-like, shape = [n_samples]
 67                Target values.
 68
 69            **kwargs: additional parameters to be passed to self.cook_training_set.
 70
 71        Returns:
 72
 73            self: object.
 74
 75        """
 76        self.ym, centered_y = mo.center_response(y)
 77        self.xm = X.mean(axis=0)
 78        self.xsd = X.std(axis=0)
 79        self.xsd[self.xsd == 0] = 1  # avoid division by zero
 80        X_ = (X - self.xm[None, :]) / self.xsd[None, :]
 81
 82        if self.backend == "cpu":
 83            if len(centered_y.shape) <= 1:
 84                eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1])
 85                X_ = np.row_stack((X_, eye_term))
 86                y_ = np.concatenate((centered_y, np.zeros(X.shape[1])))
 87                # self.beta, _, _, _ = np.linalg.lstsq(X_, y_, rcond=None)
 88                self.beta = get_beta(X_, y_)
 89            else:
 90                try:
 91                    eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1])
 92                    X_ = np.row_stack((X_, eye_term))
 93                    y_ = np.row_stack(
 94                        (
 95                            centered_y,
 96                            np.zeros((eye_term.shape[0], centered_y.shape[1])),
 97                        )
 98                    )
 99                    # self.beta, _, _, _ = np.linalg.lstsq(X_, y_, rcond=None)
100                    self.beta = get_beta(X_, y_)
101                except Exception:
102                    x = inv(
103                        mo.crossprod(X_) + self.reg_lambda * np.eye(X_.shape[1])
104                    )
105                    hat_matrix = mo.tcrossprod(x, X_)
106                    self.beta = mo.safe_sparse_dot(hat_matrix, centered_y)
107            return self
108
109        x = jinv(
110            mo.crossprod(X_, backend=self.backend)
111            + self.reg_lambda * jnp.eye(X_.shape[1])
112        )
113        hat_matrix = mo.tcrossprod(x, X_, backend=self.backend)
114        self.beta = mo.safe_sparse_dot(
115            hat_matrix, centered_y, backend=self.backend
116        )
117        return self
118
119    def predict(self, X, **kwargs):
120        """Predict test data X.
121
122        Args:
123
124            X: {array-like}, shape = [n_samples, n_features]
125                Training vectors, where n_samples is the number
126                of samples and n_features is the number of features.
127
128            **kwargs: additional parameters to be passed to `predict_proba`
129
130        Returns:
131
132            model predictions: {array-like}
133
134        """
135        X_ = (X - self.xm[None, :]) / self.xsd[None, :]
136
137        if self.backend == "cpu":
138            if isinstance(self.ym, float):
139                return self.ym + mo.safe_sparse_dot(X_, self.beta)
140            return self.ym[None, :] + mo.safe_sparse_dot(X_, self.beta)
141
142        # if self.backend in ("gpu", "tpu"):
143        if isinstance(self.ym, float):
144            return self.ym + mo.safe_sparse_dot(
145                X_, self.beta, backend=self.backend
146            )
147        return self.ym[None, :] + mo.safe_sparse_dot(
148            X_, self.beta, backend=self.backend
149        )

Ridge.

Attributes:

reg_lambda: float
    regularization parameter.

backend: str
    type of backend; must be in ('cpu', 'gpu', 'tpu')
def fit(self, X, y, **kwargs):
 57    def fit(self, X, y, **kwargs):
 58        """Fit matrixops (classifier) to training data (X, y)
 59
 60        Args:
 61
 62            X: {array-like}, shape = [n_samples, n_features]
 63                Training vectors, where n_samples is the number
 64                of samples and n_features is the number of features.
 65
 66            y: array-like, shape = [n_samples]
 67                Target values.
 68
 69            **kwargs: additional parameters to be passed to self.cook_training_set.
 70
 71        Returns:
 72
 73            self: object.
 74
 75        """
 76        self.ym, centered_y = mo.center_response(y)
 77        self.xm = X.mean(axis=0)
 78        self.xsd = X.std(axis=0)
 79        self.xsd[self.xsd == 0] = 1  # avoid division by zero
 80        X_ = (X - self.xm[None, :]) / self.xsd[None, :]
 81
 82        if self.backend == "cpu":
 83            if len(centered_y.shape) <= 1:
 84                eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1])
 85                X_ = np.row_stack((X_, eye_term))
 86                y_ = np.concatenate((centered_y, np.zeros(X.shape[1])))
 87                # self.beta, _, _, _ = np.linalg.lstsq(X_, y_, rcond=None)
 88                self.beta = get_beta(X_, y_)
 89            else:
 90                try:
 91                    eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1])
 92                    X_ = np.row_stack((X_, eye_term))
 93                    y_ = np.row_stack(
 94                        (
 95                            centered_y,
 96                            np.zeros((eye_term.shape[0], centered_y.shape[1])),
 97                        )
 98                    )
 99                    # self.beta, _, _, _ = np.linalg.lstsq(X_, y_, rcond=None)
100                    self.beta = get_beta(X_, y_)
101                except Exception:
102                    x = inv(
103                        mo.crossprod(X_) + self.reg_lambda * np.eye(X_.shape[1])
104                    )
105                    hat_matrix = mo.tcrossprod(x, X_)
106                    self.beta = mo.safe_sparse_dot(hat_matrix, centered_y)
107            return self
108
109        x = jinv(
110            mo.crossprod(X_, backend=self.backend)
111            + self.reg_lambda * jnp.eye(X_.shape[1])
112        )
113        hat_matrix = mo.tcrossprod(x, X_, backend=self.backend)
114        self.beta = mo.safe_sparse_dot(
115            hat_matrix, centered_y, backend=self.backend
116        )
117        return self

Fit the ridge regression model to training data (X, y)

Args:

X: {array-like}, shape = [n_samples, n_features]
    Training vectors, where n_samples is the number
    of samples and n_features is the number of features.

y: array-like, shape = [n_samples]
    Target values.

**kwargs: additional parameters (accepted but not used by this method).

Returns:

self: object.
def predict(self, X, **kwargs):
119    def predict(self, X, **kwargs):
120        """Predict test data X.
121
122        Args:
123
124            X: {array-like}, shape = [n_samples, n_features]
125                Training vectors, where n_samples is the number
126                of samples and n_features is the number of features.
127
128            **kwargs: additional parameters to be passed to `predict_proba`
129
130        Returns:
131
132            model predictions: {array-like}
133
134        """
135        X_ = (X - self.xm[None, :]) / self.xsd[None, :]
136
137        if self.backend == "cpu":
138            if isinstance(self.ym, float):
139                return self.ym + mo.safe_sparse_dot(X_, self.beta)
140            return self.ym[None, :] + mo.safe_sparse_dot(X_, self.beta)
141
142        # if self.backend in ("gpu", "tpu"):
143        if isinstance(self.ym, float):
144            return self.ym + mo.safe_sparse_dot(
145                X_, self.beta, backend=self.backend
146            )
147        return self.ym[None, :] + mo.safe_sparse_dot(
148            X_, self.beta, backend=self.backend
149        )

Predict test data X.

Args:

X: {array-like}, shape = [n_samples, n_features]
    Test vectors, where n_samples is the number
    of samples and n_features is the number of features.

**kwargs: additional parameters (accepted but not used by this method).

Returns:

model predictions: {array-like}
class LazyBoostingClassifier(sklearn.base.ClassifierMixin):
 89class LazyBoostingClassifier(ClassifierMixin):
 90    """
 91
 92    Fitting -- almost -- all the classification algorithms
 93    and returning their scores.
 94
 95    Parameters:
 96
 97        verbose: int, optional (default=0)
 98            Any positive number for verbosity.
 99
100        ignore_warnings: bool, optional (default=True)
101            When set to True, the warning related to algorithms that are not
102            able to run are ignored.
103
104        custom_metric: function, optional (default=None)
105            When function is provided, models are evaluated based on the custom
106              evaluation metric provided.
107
108        predictions: bool, optional (default=False)
109            When set to True, the predictions of all the models are
110            returned as data frame.
111
112        sort_by: string, optional (default='Accuracy')
113            Sort models by a metric. Available options are 'Accuracy',
114            'Balanced Accuracy', 'ROC AUC', 'F1 Score' or a custom metric
115            identified by its name and provided by custom_metric.
116
117        random_state: int, optional (default=42)
118            Reproducibility seed.
119
120        estimators: list, optional (default='all')
121            list of Estimators names or just 'all' for > 90 classifiers
122            (default='all')
123
124        preprocess: bool, preprocessing is done when set to True
125
126        n_jobs: int, when possible, run in parallel
127            For now, only used by individual models that support it.
128
129        n_layers: int, optional (default=3)
130            Number of layers of GenericBoostingClassifiers to be used.
131
132        All the other parameters are the same as GenericBoostingClassifier's.
133
134    Attributes:
135
136        models_: dict-object
137            Returns a dictionary with each model pipeline as value
138            with key as name of models.
139
140        best_model_: object
141            Returns the best model pipeline.
142
143    Examples
144
145        ```python
146        import os
147        import mlsauce as ms
148        from sklearn.datasets import load_breast_cancer, load_iris, load_wine, load_digits
149        from sklearn.model_selection import train_test_split
150        from time import time
151
152        load_models = [load_breast_cancer, load_iris, load_wine]
153
154        for model in load_models:
155
156            data = model()
157            X = data.data
158            y= data.target
159
160            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 13)
161
162            clf = ms.LazyBoostingClassifier(verbose=1, ignore_warnings=False,
163                                            custom_metric=None, preprocess=False)
164
165            start = time()
166            models, predictioms = clf.fit(X_train, X_test, y_train, y_test)
167            print(f"\nElapsed: {time() - start} seconds\n")
168
169            print(models)
170        ```
171
172    """
173
174    def __init__(
175        self,
176        verbose=0,
177        ignore_warnings=True,
178        custom_metric=None,
179        predictions=False,
180        sort_by="Accuracy",
181        random_state=42,
182        estimators="all",
183        preprocess=False,
184        n_jobs=None,
185    ):
186        self.verbose = verbose
187        self.ignore_warnings = ignore_warnings
188        self.custom_metric = custom_metric
189        self.predictions = predictions
190        self.sort_by = sort_by
191        self.models_ = {}
192        self.best_model_ = None
193        self.random_state = random_state
194        self.estimators = estimators
195        self.preprocess = preprocess
196        self.n_jobs = n_jobs
197
198    def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs):
199        """Fit classifiers to X_train and y_train, predict and score on X_test,
200        y_test.
201
202        Parameters:
203
204            X_train: array-like,
205                Training vectors, where rows is the number of samples
206                and columns is the number of features.
207
208            X_test: array-like,
209                Testing vectors, where rows is the number of samples
210                and columns is the number of features.
211
212            y_train: array-like,
213                Training vectors, where rows is the number of samples
214                and columns is the number of features.
215
216            y_test: array-like,
217                Testing vectors, where rows is the number of samples
218                and columns is the number of features.
219
220            hist: bool, optional (default=False)
221                When set to True, the model is a GenericBoostingClassifier.
222
223            **kwargs: dict,
224                Additional arguments to be passed to the fit GenericBoostingClassifier.
225
226        Returns:
227
228            scores: Pandas DataFrame
229                Returns metrics of all the models in a Pandas DataFrame.
230
231            predictions: Pandas DataFrame
232                Returns predictions of all the models in a Pandas DataFrame.
233        """
234        Accuracy = []
235        B_Accuracy = []
236        ROC_AUC = []
237        F1 = []
238        names = []
239        TIME = []
240        predictions = {}
241
242        if self.custom_metric is not None:
243            CUSTOM_METRIC = []
244
245        if isinstance(X_train, np.ndarray):
246            X_train = pd.DataFrame(X_train)
247            X_test = pd.DataFrame(X_test)
248
249        numeric_features = X_train.select_dtypes(include=[np.number]).columns
250        categorical_features = X_train.select_dtypes(include=["object"]).columns
251
252        categorical_low, categorical_high = get_card_split(
253            X_train, categorical_features
254        )
255
256        if self.preprocess is True:
257            preprocessor = ColumnTransformer(
258                transformers=[
259                    ("numeric", numeric_transformer, numeric_features),
260                    (
261                        "categorical_low",
262                        categorical_transformer_low,
263                        categorical_low,
264                    ),
265                    (
266                        "categorical_high",
267                        categorical_transformer_high,
268                        categorical_high,
269                    ),
270                ]
271            )
272
273        # baseline models
274        try:
275            baseline_names = ["RandomForestClassifier", "XGBClassifier"]
276            baseline_models = [RandomForestClassifier(), xgb.XGBClassifier()]
277        except Exception as exception:
278            baseline_names = ["RandomForestClassifier"]
279            baseline_models = [RandomForestClassifier()]
280
281        if self.verbose > 0:
282            print("\n Fitting baseline models...")
283        for name, model in tqdm(zip(baseline_names, baseline_models)):
284            start = time.time()
285            try:
286                model.fit(X_train, y_train)
287                self.models_[name] = model
288                y_pred = model.predict(X_test)
289                accuracy = accuracy_score(y_test, y_pred, normalize=True)
290                b_accuracy = balanced_accuracy_score(y_test, y_pred)
291                f1 = f1_score(y_test, y_pred, average="weighted")
292                try:
293                    roc_auc = roc_auc_score(y_test, y_pred)
294                except Exception as exception:
295                    roc_auc = None
296                    if self.ignore_warnings is False:
297                        print("ROC AUC couldn't be calculated for " + name)
298                        print(exception)
299                names.append(name)
300                Accuracy.append(accuracy)
301                B_Accuracy.append(b_accuracy)
302                ROC_AUC.append(roc_auc)
303                F1.append(f1)
304                TIME.append(time.time() - start)
305                if self.custom_metric is not None:
306                    custom_metric = self.custom_metric(y_test, y_pred)
307                    CUSTOM_METRIC.append(custom_metric)
308                if self.verbose > 0:
309                    if self.custom_metric is not None:
310                        print(
311                            {
312                                "Model": name,
313                                "Accuracy": accuracy,
314                                "Balanced Accuracy": b_accuracy,
315                                "ROC AUC": roc_auc,
316                                "F1 Score": f1,
317                                self.custom_metric.__name__: custom_metric,
318                                "Time taken": time.time() - start,
319                            }
320                        )
321                    else:
322                        print(
323                            {
324                                "Model": name,
325                                "Accuracy": accuracy,
326                                "Balanced Accuracy": b_accuracy,
327                                "ROC AUC": roc_auc,
328                                "F1 Score": f1,
329                                "Time taken": time.time() - start,
330                            }
331                        )
332                if self.predictions:
333                    predictions[name] = y_pred
334            except Exception as exception:
335                if self.ignore_warnings is False:
336                    print(name + " model failed to execute")
337                    print(exception)
338
339        if self.estimators == "all":
340            self.classifiers = REGRESSORS + MTASKREGRESSORS
341        else:
342            self.classifiers = [
343                ("GBoostClassifier(" + est[0] + ")", est[1]())
344                for est in all_estimators()
345                if (
346                    issubclass(est[1], RegressorMixin)
347                    and (est[0] in self.estimators)
348                )
349            ] + [
350                (
351                    "GBoostClassifier(MultiTask(" + est[0] + "))",
352                    partial(MultiTaskRegressor, regr=est[1]()),
353                )
354                for est in all_estimators()
355                if (
356                    issubclass(est[1], RegressorMixin)
357                    and (est[0] in self.estimators)
358                )
359            ]
360
361        if self.preprocess is True:
362
363            if self.n_jobs is None:
364
365                for name, model in tqdm(self.classifiers):  # do parallel exec
366
367                    other_args = (
368                        {}
369                    )  # use this trick for `random_state` too --> refactor
370                    try:
371                        if (
372                            "n_jobs" in model().get_params().keys()
373                            and name.find("LogisticRegression") == -1
374                        ):
375                            other_args["n_jobs"] = self.n_jobs
376                    except Exception:
377                        pass
378
379                    start = time.time()
380
381                    try:
382                        if "random_state" in model().get_params().keys():
383                            if hist is False:
384                                fitted_clf = GenericBoostingClassifier(
385                                    {**other_args, **kwargs},
386                                    verbose=self.verbose,
387                                    base_model=model(
388                                        random_state=self.random_state
389                                    ),
390                                )
391                            else:
392                                fitted_clf = GenericBoostingClassifier(
393                                    {**other_args, **kwargs},
394                                    verbose=self.verbose,
395                                    base_model=model(
396                                        random_state=self.random_state
397                                    ),
398                                    hist=True,
399                                )
400
401                        else:
402                            if hist is False:
403                                fitted_clf = GenericBoostingClassifier(
404                                    base_model=model(**kwargs),
405                                    verbose=self.verbose,
406                                )
407                            else:
408                                fitted_clf = GenericBoostingClassifier(
409                                    base_model=model(**kwargs),
410                                    verbose=self.verbose,
411                                    hist=True,
412                                )
413
414                        if self.verbose > 0:
415                            print("\n Fitting boosted " + name + " model...")
416                        fitted_clf.fit(X_train, y_train)
417
418                        pipe = Pipeline(
419                            [
420                                ("preprocessor", preprocessor),
421                                ("classifier", fitted_clf),
422                            ]
423                        )
424
425                        if self.verbose > 0:
426                            print("\n Fitting boosted " + name + " model...")
427                        pipe.fit(X_train, y_train)
428                        self.models_[name] = pipe
429                        y_pred = pipe.predict(X_test)
430                        accuracy = accuracy_score(
431                            y_test, y_pred, normalize=True
432                        )
433                        b_accuracy = balanced_accuracy_score(y_test, y_pred)
434                        f1 = f1_score(y_test, y_pred, average="weighted")
435                        try:
436                            roc_auc = roc_auc_score(y_test, y_pred)
437                        except Exception as exception:
438                            roc_auc = None
439                            if self.ignore_warnings is False:
440                                print(
441                                    "ROC AUC couldn't be calculated for " + name
442                                )
443                                print(exception)
444                        names.append(name)
445                        Accuracy.append(accuracy)
446                        B_Accuracy.append(b_accuracy)
447                        ROC_AUC.append(roc_auc)
448                        F1.append(f1)
449                        TIME.append(time.time() - start)
450                        if self.custom_metric is not None:
451                            custom_metric = self.custom_metric(y_test, y_pred)
452                            CUSTOM_METRIC.append(custom_metric)
453                        if self.verbose > 0:
454                            if self.custom_metric is not None:
455                                print(
456                                    {
457                                        "Model": name,
458                                        "Accuracy": accuracy,
459                                        "Balanced Accuracy": b_accuracy,
460                                        "ROC AUC": roc_auc,
461                                        "F1 Score": f1,
462                                        self.custom_metric.__name__: custom_metric,
463                                        "Time taken": time.time() - start,
464                                    }
465                                )
466                            else:
467                                print(
468                                    {
469                                        "Model": name,
470                                        "Accuracy": accuracy,
471                                        "Balanced Accuracy": b_accuracy,
472                                        "ROC AUC": roc_auc,
473                                        "F1 Score": f1,
474                                        "Time taken": time.time() - start,
475                                    }
476                                )
477                        if self.predictions:
478                            predictions[name] = y_pred
479                    except Exception as exception:
480                        if self.ignore_warnings is False:
481                            print(name + " model failed to execute")
482                            print(exception)
483
484            else:
485
486                # train_model(self, name, model, X_train, y_train, X_test, y_test,
487                # use_preprocessing=False, preprocessor=None,
488                #    **kwargs):
489                results = Parallel(n_jobs=self.n_jobs)(
490                    delayed(self.train_model)(
491                        name,
492                        model,
493                        X_train,
494                        y_train,
495                        X_test,
496                        y_test,
497                        use_preprocessing=True,
498                        preprocessor=preprocessor,
499                        **kwargs,
500                    )
501                    for name, model in tqdm(self.classifiers)
502                )
503                Accuracy = [res["accuracy"] for res in results]
504                B_Accuracy = [res["balanced_accuracy"] for res in results]
505                ROC_AUC = [res["roc_auc"] for res in results]
506                F1 = [res["f1"] for res in results]
507                names = [res["name"] for res in results]
508                TIME = [res["time"] for res in results]
509                if self.custom_metric is not None:
510                    CUSTOM_METRIC = [res["custom_metric"] for res in results]
511                if self.predictions:
512                    predictions = {
513                        res["name"]: res["predictions"] for res in results
514                    }
515
516        else:  # no preprocessing
517
518            if self.n_jobs is None:
519
520                for name, model in tqdm(self.classifiers):  # do parallel exec
521                    start = time.time()
522                    try:
523                        if "random_state" in model().get_params().keys():
524                            if hist is False:
525                                fitted_clf = GenericBoostingClassifier(
526                                    base_model=model(
527                                        random_state=self.random_state
528                                    ),
529                                    verbose=self.verbose,
530                                    **kwargs,
531                                )
532                            else:
533                                fitted_clf = GenericBoostingClassifier(
534                                    base_model=model(
535                                        random_state=self.random_state
536                                    ),
537                                    verbose=self.verbose,
538                                    hist=True,
539                                    **kwargs,
540                                )
541
542                        else:
543                            if hist is False:
544                                fitted_clf = GenericBoostingClassifier(
545                                    base_model=model(),
546                                    verbose=self.verbose,
547                                    **kwargs,
548                                )
549                            else:
550                                fitted_clf = GenericBoostingClassifier(
551                                    base_model=model(),
552                                    verbose=self.verbose,
553                                    hist=True,
554                                    **kwargs,
555                                )
556
557                        fitted_clf.fit(X_train, y_train)
558
559                        self.models_[name] = fitted_clf
560                        y_pred = fitted_clf.predict(X_test)
561                        accuracy = accuracy_score(
562                            y_test, y_pred, normalize=True
563                        )
564                        b_accuracy = balanced_accuracy_score(y_test, y_pred)
565                        f1 = f1_score(y_test, y_pred, average="weighted")
566                        try:
567                            roc_auc = roc_auc_score(y_test, y_pred)
568                        except Exception as exception:
569                            roc_auc = None
570                            if self.ignore_warnings is False:
571                                print(
572                                    "ROC AUC couldn't be calculated for " + name
573                                )
574                                print(exception)
575                        names.append(name)
576                        Accuracy.append(accuracy)
577                        B_Accuracy.append(b_accuracy)
578                        ROC_AUC.append(roc_auc)
579                        F1.append(f1)
580                        TIME.append(time.time() - start)
581                        if self.custom_metric is not None:
582                            custom_metric = self.custom_metric(y_test, y_pred)
583                            CUSTOM_METRIC.append(custom_metric)
584                        if self.verbose > 0:
585                            if self.custom_metric is not None:
586                                print(
587                                    {
588                                        "Model": name,
589                                        "Accuracy": accuracy,
590                                        "Balanced Accuracy": b_accuracy,
591                                        "ROC AUC": roc_auc,
592                                        "F1 Score": f1,
593                                        self.custom_metric.__name__: custom_metric,
594                                        "Time taken": time.time() - start,
595                                    }
596                                )
597                            else:
598                                print(
599                                    {
600                                        "Model": name,
601                                        "Accuracy": accuracy,
602                                        "Balanced Accuracy": b_accuracy,
603                                        "ROC AUC": roc_auc,
604                                        "F1 Score": f1,
605                                        "Time taken": time.time() - start,
606                                    }
607                                )
608                        if self.predictions:
609                            predictions[name] = y_pred
610                    except Exception as exception:
611                        if self.ignore_warnings is False:
612                            print(name + " model failed to execute")
613                            print(exception)
614
615            else:
616
617                results = Parallel(n_jobs=self.n_jobs)(
618                    delayed(self.train_model)(
619                        name,
620                        model,
621                        X_train,
622                        y_train,
623                        X_test,
624                        y_test,
625                        use_preprocessing=False,
626                        **kwargs,
627                    )
628                    for name, model in tqdm(self.classifiers)
629                )
630                Accuracy = [res["accuracy"] for res in results]
631                B_Accuracy = [res["balanced_accuracy"] for res in results]
632                ROC_AUC = [res["roc_auc"] for res in results]
633                F1 = [res["f1"] for res in results]
634                names = [res["name"] for res in results]
635                TIME = [res["time"] for res in results]
636                if self.custom_metric is not None:
637                    CUSTOM_METRIC = [res["custom_metric"] for res in results]
638                if self.predictions:
639                    predictions = {
640                        res["name"]: res["predictions"] for res in results
641                    }
642
643        if self.custom_metric is None:
644            scores = pd.DataFrame(
645                {
646                    "Model": names,
647                    "Accuracy": Accuracy,
648                    "Balanced Accuracy": B_Accuracy,
649                    "ROC AUC": ROC_AUC,
650                    "F1 Score": F1,
651                    "Time Taken": TIME,
652                }
653            )
654        else:
655            scores = pd.DataFrame(
656                {
657                    "Model": names,
658                    "Accuracy": Accuracy,
659                    "Balanced Accuracy": B_Accuracy,
660                    "ROC AUC": ROC_AUC,
661                    "F1 Score": F1,
662                    "Custom metric": CUSTOM_METRIC,
663                    "Time Taken": TIME,
664                }
665            )
666        scores = scores.sort_values(by=self.sort_by, ascending=False).set_index(
667            "Model"
668        )
669
670        self.best_model_ = self.models_[scores.index[0]]
671
672        if self.predictions:
673            predictions_df = pd.DataFrame.from_dict(predictions)
674        return scores, predictions_df if self.predictions is True else scores
675
676    def get_best_model(self):
677        """
678        This function returns the best model pipeline based on the sort_by metric.
679
680        Returns:
681
682            best_model: object,
683                Returns the best model pipeline based on the sort_by metric.
684
685        """
686        return self.best_model_
687
688    def provide_models(self, X_train, X_test, y_train, y_test):
689        """Returns all the model objects trained. If fit hasn't been called yet,
690        then it's called to return the models.
691
692        Parameters:
693
694        X_train: array-like,
695            Training vectors, where rows is the number of samples
696            and columns is the number of features.
697
698        X_test: array-like,
699            Testing vectors, where rows is the number of samples
700            and columns is the number of features.
701
702        y_train: array-like,
703            Training vectors, where rows is the number of samples
704            and columns is the number of features.
705
706        y_test: array-like,
707            Testing vectors, where rows is the number of samples
708            and columns is the number of features.
709
710        Returns:
711
712            models: dict-object,
713                Returns a dictionary with each model's pipeline as value
714                and key = name of the model.
715        """
716        if len(self.models_.keys()) == 0:
717            self.fit(X_train, X_test, y_train, y_test)
718
719        return self.models_
720
721    def train_model(
722        self,
723        name,
724        model,
725        X_train,
726        y_train,
727        X_test,
728        y_test,
729        use_preprocessing=False,
730        preprocessor=None,
731        hist=False,
732        **kwargs,
733    ):
734        """
735        Function to train a single model and return its results.
736        """
737        other_args = {}
738
739        # Handle n_jobs parameter
740        try:
741            if (
742                "n_jobs" in model().get_params().keys()
743                and "LogisticRegression" not in name
744            ):
745                other_args["n_jobs"] = self.n_jobs
746        except Exception:
747            pass
748
749        start = time.time()
750
751        try:
752            # Handle random_state parameter
753            if "random_state" in model().get_params().keys():
754                if hist is False:
755                    fitted_clf = GenericBoostingClassifier(
756                        {**other_args, **kwargs},
757                        verbose=self.verbose,
758                        base_model=model(random_state=self.random_state),
759                    )
760                else:
761                    fitted_clf = GenericBoostingClassifier(
762                        {**other_args, **kwargs},
763                        verbose=self.verbose,
764                        base_model=model(random_state=self.random_state),
765                        hist=True,
766                    )
767            else:
768                if hist is False:
769                    fitted_clf = GenericBoostingClassifier(
770                        base_model=model(**kwargs),
771                        verbose=self.verbose,
772                    )
773                else:
774                    fitted_clf = GenericBoostingClassifier(
775                        base_model=model(**kwargs),
776                        verbose=self.verbose,
777                        hist=True,
778                    )
779
780            if self.verbose > 0:
781                print("\n Fitting boosted " + name + " model...")
782
783            fitted_clf.fit(X_train, y_train)
784
785            if use_preprocessing and preprocessor is not None:
786                pipe = Pipeline(
787                    [
788                        ("preprocessor", preprocessor),
789                        ("classifier", fitted_clf),
790                    ]
791                )
792                if self.verbose > 0:
793                    print(
794                        "\n Fitting pipeline with preprocessing for "
795                        + name
796                        + " model..."
797                    )
798                pipe.fit(X_train, y_train)
799                y_pred = pipe.predict(X_test)
800            else:
801                # Case with no preprocessing
802                if self.verbose > 0:
803                    print(
804                        "\n Fitting model without preprocessing for "
805                        + name
806                        + " model..."
807                    )
808                y_pred = fitted_clf.predict(X_test)
809
810            accuracy = accuracy_score(y_test, y_pred, normalize=True)
811            b_accuracy = balanced_accuracy_score(y_test, y_pred)
812            f1 = f1_score(y_test, y_pred, average="weighted")
813            roc_auc = None
814
815            try:
816                roc_auc = roc_auc_score(y_test, y_pred)
817            except Exception as exception:
818                if self.ignore_warnings is False:
819                    print("ROC AUC couldn't be calculated for " + name)
820                    print(exception)
821
822            custom_metric = None
823            if self.custom_metric is not None:
824                custom_metric = self.custom_metric(y_test, y_pred)
825
826            return {
827                "name": name,
828                "model": fitted_clf if not use_preprocessing else pipe,
829                "accuracy": accuracy,
830                "balanced_accuracy": b_accuracy,
831                "roc_auc": roc_auc,
832                "f1": f1,
833                "custom_metric": custom_metric,
834                "time": time.time() - start,
835                "predictions": y_pred,
836            }
837        except Exception as exception:
838            if self.ignore_warnings is False:
839                print(name + " model failed to execute")
840                print(exception)
841            return None

Fitting -- almost -- all the classification algorithms and returning their scores.

Parameters:

    verbose: int, optional (default=0)
        Any positive number for verbosity.

    ignore_warnings: bool, optional (default=True)
        When set to True, the warnings related to algorithms that are not
        able to run are ignored.

    custom_metric: function, optional (default=None)
        When function is provided, models are evaluated based on the custom
          evaluation metric provided.

    predictions: bool, optional (default=False)
        When set to True, the predictions of all the models are
        returned as data frame.

    sort_by: string, optional (default='Accuracy')
        Sort models by a metric. Available options are 'Accuracy',
        'Balanced Accuracy', 'ROC AUC', 'F1 Score' or a custom metric
        identified by its name and provided by custom_metric.

    random_state: int, optional (default=42)
        Reproducibility seed.

    estimators: list, optional (default='all')
        list of Estimators names or just 'all' for > 90 classifiers
        (default='all')

    preprocess: bool, preprocessing is done when set to True

    n_jobs: int, when possible, run in parallel
        For now, only used by individual models that support it.

    n_layers: int, optional (default=3)
        Number of layers of GenericBoostingClassifiers to be used.

    All the other parameters are the same as GenericBoostingClassifier's.

Attributes:

    models_: dict-object
        Returns a dictionary with each model pipeline as value
        with key as name of models.

    best_model_: object
        Returns the best model pipeline.

Examples



    
        import os
        import mlsauce as ms
        from sklearn.datasets import load_breast_cancer, load_iris, load_wine, load_digits
        from sklearn.model_selection import train_test_split
        from time import time

        load_models = [load_breast_cancer, load_iris, load_wine]

        for model in load_models:

            data = model()
            X = data.data
            y= data.target

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 13)

            clf = ms.LazyBoostingClassifier(verbose=1, ignore_warnings=False,
                                            custom_metric=None, preprocess=False)

            start = time()
            models, predictions = clf.fit(X_train, X_test, y_train, y_test)
            print(f"\nElapsed: {time() - start} seconds\n")

            print(models)
def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs):
198    def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs):
199        """Fit classifiers to X_train and y_train, predict and score on X_test,
200        y_test.
201
202        Parameters:
203
204            X_train: array-like,
205                Training vectors, where rows is the number of samples
206                and columns is the number of features.
207
208            X_test: array-like,
209                Testing vectors, where rows is the number of samples
210                and columns is the number of features.
211
212            y_train: array-like,
213                Training vectors, where rows is the number of samples
214                and columns is the number of features.
215
216            y_test: array-like,
217                Testing vectors, where rows is the number of samples
218                and columns is the number of features.
219
220            hist: bool, optional (default=False)
221                When set to True, the model is a GenericBoostingClassifier.
222
223            **kwargs: dict,
224                Additional arguments to be passed to the fit GenericBoostingClassifier.
225
226        Returns:
227
228            scores: Pandas DataFrame
229                Returns metrics of all the models in a Pandas DataFrame.
230
231            predictions: Pandas DataFrame
232                Returns predictions of all the models in a Pandas DataFrame.
233        """
234        Accuracy = []
235        B_Accuracy = []
236        ROC_AUC = []
237        F1 = []
238        names = []
239        TIME = []
240        predictions = {}
241
242        if self.custom_metric is not None:
243            CUSTOM_METRIC = []
244
245        if isinstance(X_train, np.ndarray):
246            X_train = pd.DataFrame(X_train)
247            X_test = pd.DataFrame(X_test)
248
249        numeric_features = X_train.select_dtypes(include=[np.number]).columns
250        categorical_features = X_train.select_dtypes(include=["object"]).columns
251
252        categorical_low, categorical_high = get_card_split(
253            X_train, categorical_features
254        )
255
256        if self.preprocess is True:
257            preprocessor = ColumnTransformer(
258                transformers=[
259                    ("numeric", numeric_transformer, numeric_features),
260                    (
261                        "categorical_low",
262                        categorical_transformer_low,
263                        categorical_low,
264                    ),
265                    (
266                        "categorical_high",
267                        categorical_transformer_high,
268                        categorical_high,
269                    ),
270                ]
271            )
272
273        # baseline models
274        try:
275            baseline_names = ["RandomForestClassifier", "XGBClassifier"]
276            baseline_models = [RandomForestClassifier(), xgb.XGBClassifier()]
277        except Exception as exception:
278            baseline_names = ["RandomForestClassifier"]
279            baseline_models = [RandomForestClassifier()]
280
281        if self.verbose > 0:
282            print("\n Fitting baseline models...")
283        for name, model in tqdm(zip(baseline_names, baseline_models)):
284            start = time.time()
285            try:
286                model.fit(X_train, y_train)
287                self.models_[name] = model
288                y_pred = model.predict(X_test)
289                accuracy = accuracy_score(y_test, y_pred, normalize=True)
290                b_accuracy = balanced_accuracy_score(y_test, y_pred)
291                f1 = f1_score(y_test, y_pred, average="weighted")
292                try:
293                    roc_auc = roc_auc_score(y_test, y_pred)
294                except Exception as exception:
295                    roc_auc = None
296                    if self.ignore_warnings is False:
297                        print("ROC AUC couldn't be calculated for " + name)
298                        print(exception)
299                names.append(name)
300                Accuracy.append(accuracy)
301                B_Accuracy.append(b_accuracy)
302                ROC_AUC.append(roc_auc)
303                F1.append(f1)
304                TIME.append(time.time() - start)
305                if self.custom_metric is not None:
306                    custom_metric = self.custom_metric(y_test, y_pred)
307                    CUSTOM_METRIC.append(custom_metric)
308                if self.verbose > 0:
309                    if self.custom_metric is not None:
310                        print(
311                            {
312                                "Model": name,
313                                "Accuracy": accuracy,
314                                "Balanced Accuracy": b_accuracy,
315                                "ROC AUC": roc_auc,
316                                "F1 Score": f1,
317                                self.custom_metric.__name__: custom_metric,
318                                "Time taken": time.time() - start,
319                            }
320                        )
321                    else:
322                        print(
323                            {
324                                "Model": name,
325                                "Accuracy": accuracy,
326                                "Balanced Accuracy": b_accuracy,
327                                "ROC AUC": roc_auc,
328                                "F1 Score": f1,
329                                "Time taken": time.time() - start,
330                            }
331                        )
332                if self.predictions:
333                    predictions[name] = y_pred
334            except Exception as exception:
335                if self.ignore_warnings is False:
336                    print(name + " model failed to execute")
337                    print(exception)
338
339        if self.estimators == "all":
340            self.classifiers = REGRESSORS + MTASKREGRESSORS
341        else:
342            self.classifiers = [
343                ("GBoostClassifier(" + est[0] + ")", est[1]())
344                for est in all_estimators()
345                if (
346                    issubclass(est[1], RegressorMixin)
347                    and (est[0] in self.estimators)
348                )
349            ] + [
350                (
351                    "GBoostClassifier(MultiTask(" + est[0] + "))",
352                    partial(MultiTaskRegressor, regr=est[1]()),
353                )
354                for est in all_estimators()
355                if (
356                    issubclass(est[1], RegressorMixin)
357                    and (est[0] in self.estimators)
358                )
359            ]
360
361        if self.preprocess is True:
362
363            if self.n_jobs is None:
364
365                for name, model in tqdm(self.classifiers):  # do parallel exec
366
367                    other_args = (
368                        {}
369                    )  # use this trick for `random_state` too --> refactor
370                    try:
371                        if (
372                            "n_jobs" in model().get_params().keys()
373                            and name.find("LogisticRegression") == -1
374                        ):
375                            other_args["n_jobs"] = self.n_jobs
376                    except Exception:
377                        pass
378
379                    start = time.time()
380
381                    try:
382                        if "random_state" in model().get_params().keys():
383                            if hist is False:
384                                fitted_clf = GenericBoostingClassifier(
385                                    {**other_args, **kwargs},
386                                    verbose=self.verbose,
387                                    base_model=model(
388                                        random_state=self.random_state
389                                    ),
390                                )
391                            else:
392                                fitted_clf = GenericBoostingClassifier(
393                                    {**other_args, **kwargs},
394                                    verbose=self.verbose,
395                                    base_model=model(
396                                        random_state=self.random_state
397                                    ),
398                                    hist=True,
399                                )
400
401                        else:
402                            if hist is False:
403                                fitted_clf = GenericBoostingClassifier(
404                                    base_model=model(**kwargs),
405                                    verbose=self.verbose,
406                                )
407                            else:
408                                fitted_clf = GenericBoostingClassifier(
409                                    base_model=model(**kwargs),
410                                    verbose=self.verbose,
411                                    hist=True,
412                                )
413
414                        if self.verbose > 0:
415                            print("\n Fitting boosted " + name + " model...")
416                        fitted_clf.fit(X_train, y_train)
417
418                        pipe = Pipeline(
419                            [
420                                ("preprocessor", preprocessor),
421                                ("classifier", fitted_clf),
422                            ]
423                        )
424
425                        if self.verbose > 0:
426                            print("\n Fitting boosted " + name + " model...")
427                        pipe.fit(X_train, y_train)
428                        self.models_[name] = pipe
429                        y_pred = pipe.predict(X_test)
430                        accuracy = accuracy_score(
431                            y_test, y_pred, normalize=True
432                        )
433                        b_accuracy = balanced_accuracy_score(y_test, y_pred)
434                        f1 = f1_score(y_test, y_pred, average="weighted")
435                        try:
436                            roc_auc = roc_auc_score(y_test, y_pred)
437                        except Exception as exception:
438                            roc_auc = None
439                            if self.ignore_warnings is False:
440                                print(
441                                    "ROC AUC couldn't be calculated for " + name
442                                )
443                                print(exception)
444                        names.append(name)
445                        Accuracy.append(accuracy)
446                        B_Accuracy.append(b_accuracy)
447                        ROC_AUC.append(roc_auc)
448                        F1.append(f1)
449                        TIME.append(time.time() - start)
450                        if self.custom_metric is not None:
451                            custom_metric = self.custom_metric(y_test, y_pred)
452                            CUSTOM_METRIC.append(custom_metric)
453                        if self.verbose > 0:
454                            if self.custom_metric is not None:
455                                print(
456                                    {
457                                        "Model": name,
458                                        "Accuracy": accuracy,
459                                        "Balanced Accuracy": b_accuracy,
460                                        "ROC AUC": roc_auc,
461                                        "F1 Score": f1,
462                                        self.custom_metric.__name__: custom_metric,
463                                        "Time taken": time.time() - start,
464                                    }
465                                )
466                            else:
467                                print(
468                                    {
469                                        "Model": name,
470                                        "Accuracy": accuracy,
471                                        "Balanced Accuracy": b_accuracy,
472                                        "ROC AUC": roc_auc,
473                                        "F1 Score": f1,
474                                        "Time taken": time.time() - start,
475                                    }
476                                )
477                        if self.predictions:
478                            predictions[name] = y_pred
479                    except Exception as exception:
480                        if self.ignore_warnings is False:
481                            print(name + " model failed to execute")
482                            print(exception)
483
484            else:
485
486                # train_model(self, name, model, X_train, y_train, X_test, y_test,
487                # use_preprocessing=False, preprocessor=None,
488                #    **kwargs):
489                results = Parallel(n_jobs=self.n_jobs)(
490                    delayed(self.train_model)(
491                        name,
492                        model,
493                        X_train,
494                        y_train,
495                        X_test,
496                        y_test,
497                        use_preprocessing=True,
498                        preprocessor=preprocessor,
499                        **kwargs,
500                    )
501                    for name, model in tqdm(self.classifiers)
502                )
503                Accuracy = [res["accuracy"] for res in results]
504                B_Accuracy = [res["balanced_accuracy"] for res in results]
505                ROC_AUC = [res["roc_auc"] for res in results]
506                F1 = [res["f1"] for res in results]
507                names = [res["name"] for res in results]
508                TIME = [res["time"] for res in results]
509                if self.custom_metric is not None:
510                    CUSTOM_METRIC = [res["custom_metric"] for res in results]
511                if self.predictions:
512                    predictions = {
513                        res["name"]: res["predictions"] for res in results
514                    }
515
516        else:  # no preprocessing
517
518            if self.n_jobs is None:
519
520                for name, model in tqdm(self.classifiers):  # do parallel exec
521                    start = time.time()
522                    try:
523                        if "random_state" in model().get_params().keys():
524                            if hist is False:
525                                fitted_clf = GenericBoostingClassifier(
526                                    base_model=model(
527                                        random_state=self.random_state
528                                    ),
529                                    verbose=self.verbose,
530                                    **kwargs,
531                                )
532                            else:
533                                fitted_clf = GenericBoostingClassifier(
534                                    base_model=model(
535                                        random_state=self.random_state
536                                    ),
537                                    verbose=self.verbose,
538                                    hist=True,
539                                    **kwargs,
540                                )
541
542                        else:
543                            if hist is False:
544                                fitted_clf = GenericBoostingClassifier(
545                                    base_model=model(),
546                                    verbose=self.verbose,
547                                    **kwargs,
548                                )
549                            else:
550                                fitted_clf = GenericBoostingClassifier(
551                                    base_model=model(),
552                                    verbose=self.verbose,
553                                    hist=True,
554                                    **kwargs,
555                                )
556
557                        fitted_clf.fit(X_train, y_train)
558
559                        self.models_[name] = fitted_clf
560                        y_pred = fitted_clf.predict(X_test)
561                        accuracy = accuracy_score(
562                            y_test, y_pred, normalize=True
563                        )
564                        b_accuracy = balanced_accuracy_score(y_test, y_pred)
565                        f1 = f1_score(y_test, y_pred, average="weighted")
566                        try:
567                            roc_auc = roc_auc_score(y_test, y_pred)
568                        except Exception as exception:
569                            roc_auc = None
570                            if self.ignore_warnings is False:
571                                print(
572                                    "ROC AUC couldn't be calculated for " + name
573                                )
574                                print(exception)
575                        names.append(name)
576                        Accuracy.append(accuracy)
577                        B_Accuracy.append(b_accuracy)
578                        ROC_AUC.append(roc_auc)
579                        F1.append(f1)
580                        TIME.append(time.time() - start)
581                        if self.custom_metric is not None:
582                            custom_metric = self.custom_metric(y_test, y_pred)
583                            CUSTOM_METRIC.append(custom_metric)
584                        if self.verbose > 0:
585                            if self.custom_metric is not None:
586                                print(
587                                    {
588                                        "Model": name,
589                                        "Accuracy": accuracy,
590                                        "Balanced Accuracy": b_accuracy,
591                                        "ROC AUC": roc_auc,
592                                        "F1 Score": f1,
593                                        self.custom_metric.__name__: custom_metric,
594                                        "Time taken": time.time() - start,
595                                    }
596                                )
597                            else:
598                                print(
599                                    {
600                                        "Model": name,
601                                        "Accuracy": accuracy,
602                                        "Balanced Accuracy": b_accuracy,
603                                        "ROC AUC": roc_auc,
604                                        "F1 Score": f1,
605                                        "Time taken": time.time() - start,
606                                    }
607                                )
608                        if self.predictions:
609                            predictions[name] = y_pred
610                    except Exception as exception:
611                        if self.ignore_warnings is False:
612                            print(name + " model failed to execute")
613                            print(exception)
614
615            else:
616
617                results = Parallel(n_jobs=self.n_jobs)(
618                    delayed(self.train_model)(
619                        name,
620                        model,
621                        X_train,
622                        y_train,
623                        X_test,
624                        y_test,
625                        use_preprocessing=False,
626                        **kwargs,
627                    )
628                    for name, model in tqdm(self.classifiers)
629                )
630                Accuracy = [res["accuracy"] for res in results]
631                B_Accuracy = [res["balanced_accuracy"] for res in results]
632                ROC_AUC = [res["roc_auc"] for res in results]
633                F1 = [res["f1"] for res in results]
634                names = [res["name"] for res in results]
635                TIME = [res["time"] for res in results]
636                if self.custom_metric is not None:
637                    CUSTOM_METRIC = [res["custom_metric"] for res in results]
638                if self.predictions:
639                    predictions = {
640                        res["name"]: res["predictions"] for res in results
641                    }
642
643        if self.custom_metric is None:
644            scores = pd.DataFrame(
645                {
646                    "Model": names,
647                    "Accuracy": Accuracy,
648                    "Balanced Accuracy": B_Accuracy,
649                    "ROC AUC": ROC_AUC,
650                    "F1 Score": F1,
651                    "Time Taken": TIME,
652                }
653            )
654        else:
655            scores = pd.DataFrame(
656                {
657                    "Model": names,
658                    "Accuracy": Accuracy,
659                    "Balanced Accuracy": B_Accuracy,
660                    "ROC AUC": ROC_AUC,
661                    "F1 Score": F1,
662                    "Custom metric": CUSTOM_METRIC,
663                    "Time Taken": TIME,
664                }
665            )
666        scores = scores.sort_values(by=self.sort_by, ascending=False).set_index(
667            "Model"
668        )
669
670        self.best_model_ = self.models_[scores.index[0]]
671
672        if self.predictions:
673            predictions_df = pd.DataFrame.from_dict(predictions)
674        return scores, predictions_df if self.predictions is True else scores

Fit classifiers to X_train and y_train, predict and score on X_test, y_test.

Parameters:

X_train: array-like,
    Training vectors, where rows is the number of samples
    and columns is the number of features.

X_test: array-like,
    Testing vectors, where rows is the number of samples
    and columns is the number of features.

y_train: array-like,
    Training target values (class labels), one per row of X_train.

y_test: array-like,
    Testing target values (class labels), one per row of X_test.

hist: bool, optional (default=False)
    When set to True, `hist=True` is passed to each GenericBoostingClassifier.

**kwargs: dict,
    Additional arguments to be passed to the fit GenericBoostingClassifier.

Returns:

scores: Pandas DataFrame
    Returns metrics of all the models in a Pandas DataFrame.

predictions: Pandas DataFrame
    Returns predictions of all the models in a Pandas DataFrame.
def provide_models(self, X_train, X_test, y_train, y_test):
688    def provide_models(self, X_train, X_test, y_train, y_test):
689        """Returns all the model objects trained. If fit hasn't been called yet,
690        then it's called to return the models.
691
692        Parameters:
693
694        X_train: array-like,
695            Training vectors, where rows is the number of samples
696            and columns is the number of features.
697
698        X_test: array-like,
699            Testing vectors, where rows is the number of samples
700            and columns is the number of features.
701
702        y_train: array-like,
703            Training vectors, where rows is the number of samples
704            and columns is the number of features.
705
706        y_test: array-like,
707            Testing vectors, where rows is the number of samples
708            and columns is the number of features.
709
710        Returns:
711
712            models: dict-object,
713                Returns a dictionary with each model's pipeline as value
714                and key = name of the model.
715        """
716        if len(self.models_.keys()) == 0:
717            self.fit(X_train, X_test, y_train, y_test)
718
719        return self.models_

Returns all the model objects trained. If fit hasn't been called yet, then it's called to return the models.

Parameters:

X_train: array-like, Training vectors, where rows is the number of samples and columns is the number of features.

X_test: array-like, Testing vectors, where rows is the number of samples and columns is the number of features.

y_train: array-like, Training target values, one per row of X_train.

y_test: array-like, Testing target values, one per row of X_test.

Returns:

models: dict-object,
    Returns a dictionary with each model's pipeline as value
    and key = name of the model.
class LazyBoostingMTS(nnetsauce.mts.mts.MTS):
 106class LazyBoostingMTS(ns.MTS):
 107    """
 108
 109    Fitting -- almost -- all the regression algorithms with layers of
 110    nnetsauce's CustomRegressor to multivariate time series
 111    and returning their scores.
 112
 113    Parameters:
 114
 115        verbose: int, optional (default=0)
 116            Any positive number for verbosity.
 117
 118        ignore_warnings: bool, optional (default=True)
 119            When set to True, warnings related to algorithms that are not
 120            able to run are ignored.
 121
 122        custom_metric: function, optional (default=None)
 123            When function is provided, models are evaluated based on the custom
 124              evaluation metric provided.
 125
 126        predictions: bool, optional (default=False)
 127            When set to True, the predictions of all the models are returned as a dataframe.
 128
 129        sort_by: string, optional (default=None)
 130            Sort models by a metric. Available options are 'RMSE', 'MAE', 'MPL', 'MPE', 'MAPE',
 131            'R-Squared', 'Adjusted R-Squared' or a custom metric identified by its name and
 132            provided by custom_metric. If None: 'WINKLERSCORE' when replications is set or type_pi is 'gaussian', else 'RMSE'.
 133
 134        random_state: int, optional (default=42)
 135            Reproducibility seed.
 136
 137        estimators: list, optional (default='all')
 138            list of Estimators (regression algorithms) names or just 'all' (default='all')
 139
 140        preprocess: bool, preprocessing is done when set to True
 141
 142        h: int, optional (default=None)
 143            Number of steps ahead to predict (when used, must be > 0 and <= X_test.shape[0]).
 144
 145        All the other parameters are the same as MTS's.
 146
 147    Attributes:
 148
 149        models_: dict-object
 150            Returns a dictionary with each model pipeline as value
 151            with key as name of models.
 152
 153        best_model_: object
 154            Returns the best model pipeline based on the sort_by metric.
 155
 156    Examples:
 157
 158        See https://thierrymoudiki.github.io/blog/2023/10/29/python/quasirandomizednn/MTS-LazyPredict
 159
 160    """
 161
 162    def __init__(
 163        self,
 164        verbose=0,
 165        ignore_warnings=True,
 166        custom_metric=None,
 167        predictions=False,
 168        sort_by=None,  # leave it as is
 169        random_state=42,
 170        estimators="all",
 171        preprocess=False,
 172        h=None,
 173        # MTS attributes
 174        obj=None,
 175        n_hidden_features=5,
 176        activation_name="relu",
 177        a=0.01,
 178        nodes_sim="sobol",
 179        bias=True,
 180        dropout=0,
 181        direct_link=True,
 182        n_clusters=2,
 183        cluster_encode=True,
 184        type_clust="kmeans",
 185        type_scaling=("std", "std", "std"),
 186        lags=15,
 187        type_pi="scp2-kde",
 188        block_size=None,
 189        replications=None,
 190        kernel=None,
 191        agg="mean",
 192        seed=123,
 193        backend="cpu",
 194        show_progress=False,
 195    ):
 196        self.verbose = verbose
 197        self.ignore_warnings = ignore_warnings
 198        self.custom_metric = custom_metric
 199        self.predictions = predictions
 200        self.sort_by = sort_by
 201        self.models_ = {}
 202        self.best_model_ = None
 203        self.random_state = random_state
 204        self.estimators = estimators
 205        self.preprocess = preprocess
 206        self.h = h
 207        super().__init__(
 208            obj=obj,
 209            n_hidden_features=n_hidden_features,
 210            activation_name=activation_name,
 211            a=a,
 212            nodes_sim=nodes_sim,
 213            bias=bias,
 214            dropout=dropout,
 215            direct_link=direct_link,
 216            n_clusters=n_clusters,
 217            cluster_encode=cluster_encode,
 218            type_clust=type_clust,
 219            type_scaling=type_scaling,
 220            seed=seed,
 221            backend=backend,
 222            lags=lags,
 223            type_pi=type_pi,
 224            block_size=block_size,
 225            replications=replications,
 226            kernel=kernel,
 227            agg=agg,
 228            verbose=verbose,
 229            show_progress=show_progress,
 230        )
 231        if self.replications is not None or self.type_pi == "gaussian":
 232            if self.sort_by is None:
 233                self.sort_by = "WINKLERSCORE"
 234        else:
 235            if self.sort_by is None:
 236                self.sort_by = "RMSE"
 237
 238    def fit(self, X_train, X_test, xreg=None, per_series=False, **kwargs):
 239        """Fit Regression algorithms to X_train, predict and score on X_test.
 240
 241        Parameters:
 242
 243            X_train: array-like or data frame,
 244                Training vectors, where rows is the number of samples
 245                and columns is the number of features.
 246
 247            X_test: array-like or data frame,
 248                Testing vectors, where rows is the number of samples
 249                and columns is the number of features.
 250
 251            xreg: array-like, optional (default=None)
 252                Additional (external) regressors to be passed to self.obj
 253                xreg must be in 'increasing' order (most recent observations last)
 254
 255            per_series: bool, optional (default=False)
 256                When set to True, the metrics are computed series by series.
 257
 258            **kwargs: dict, optional (default=None)
 259                Additional parameters to be passed to `fit` method of `obj`.
 260
 261        Returns:
 262
 263            scores: Pandas DataFrame
 264                Returns metrics of all the models in a Pandas DataFrame.
 265
 266            predictions: Pandas DataFrame
 267                Returns predictions of all the models in a Pandas DataFrame.
 268
 269        """
 270        R2 = []
 271        ADJR2 = []
 272        ME = []
 273        MPL = []
 274        RMSE = []
 275        MAE = []
 276        MPE = []
 277        MAPE = []
 278        WINKLERSCORE = []
 279        COVERAGE = []
 280
 281        # WIN = []
 282        names = []
 283        TIME = []
 284        predictions = {}
 285
 286        if self.custom_metric is not None:
 287            CUSTOM_METRIC = []
 288
 289        if self.h is None:
 290            assert X_test is not None, "If h is None, X_test must be provided."
 291
 292        if isinstance(X_train, np.ndarray):
 293            X_train = pd.DataFrame(X_train)
 294            X_test = pd.DataFrame(X_test)
 295
 296        self.series_names = X_train.columns.tolist()
 297
 298        X_train = convert_df_to_numeric(X_train)
 299        X_test = convert_df_to_numeric(X_test)
 300
 301        numeric_features = X_train.select_dtypes(include=[np.number]).columns
 302        categorical_features = X_train.select_dtypes(include=["object"]).columns
 303
 304        categorical_low, categorical_high = get_card_split(
 305            X_train, categorical_features
 306        )
 307
 308        if self.preprocess:
 309            preprocessor = ColumnTransformer(
 310                transformers=[
 311                    ("numeric", numeric_transformer, numeric_features),
 312                    (
 313                        "categorical_low",
 314                        categorical_transformer_low,
 315                        categorical_low,
 316                    ),
 317                    (
 318                        "categorical_high",
 319                        categorical_transformer_high,
 320                        categorical_high,
 321                    ),
 322                ]
 323            )
 324
 325        # baselines (Classical MTS) ----
 326        for i, name in enumerate(["ARIMA", "ETS", "Theta", "VAR", "VECM"]):
 327            try:
 328                start = time.time()
 329                regr = ns.ClassicalMTS(model=name)
 330                regr.fit(X_train, **kwargs)
 331                self.models_[name] = regr
 332                if self.h is None:
 333                    X_pred = regr.predict(h=X_test.shape[0], **kwargs)
 334                else:
 335                    assert self.h > 0, "h must be > 0"
 336                    X_pred = regr.predict(h=self.h, **kwargs)
 337                    try:
 338                        X_test = X_test[0 : self.h, :]
 339                    except Exception as e:
 340                        X_test = X_test.iloc[0 : self.h, :]
 341
 342                if per_series == False:
 343                    rmse = np.sqrt(np.mean((X_test - X_pred.mean) ** 2))
 344                    mae = mean_absolute_error(X_test, X_pred.mean)
 345                    mpl = mean_pinball_loss(X_test, X_pred.mean)
 346                else:
 347                    rmse = mean_errors(
 348                        actual=X_test,
 349                        pred=X_pred,
 350                        scoring="root_mean_squared_error",
 351                        per_series=True,
 352                    )
 353                    mae = mean_errors(
 354                        actual=X_test,
 355                        pred=X_pred,
 356                        scoring="mean_absolute_error",
 357                        per_series=True,
 358                    )
 359                    mpl = mean_errors(
 360                        actual=X_test,
 361                        pred=X_pred,
 362                        scoring="mean_pinball_loss",
 363                        per_series=True,
 364                    )
 365            except Exception as exception:
 366                continue
 367
 368            names.append(name)
 369            RMSE.append(rmse)
 370            MAE.append(mae)
 371            MPL.append(mpl)
 372
 373            if self.custom_metric is not None:
 374                try:
 375                    if self.h is None:
 376                        custom_metric = self.custom_metric(X_test, X_pred)
 377                    else:
 378                        custom_metric = self.custom_metric(X_test_h, X_pred)
 379                    CUSTOM_METRIC.append(custom_metric)
 380                except Exception as e:
 381                    custom_metric = np.iinfo(np.float32).max
 382                    CUSTOM_METRIC.append(np.iinfo(np.float32).max)
 383
 384            if (self.replications is not None) or (self.type_pi == "gaussian"):
 385                if per_series == False:
 386                    winklerscore = winkler_score(
 387                        obj=X_pred, actual=X_test, level=95
 388                    )
 389                    coveragecalc = coverage(X_pred, X_test, level=95)
 390                else:
 391                    winklerscore = winkler_score(
 392                        obj=X_pred, actual=X_test, level=95, per_series=True
 393                    )
 394                    coveragecalc = coverage(
 395                        X_pred, X_test, level=95, per_series=True
 396                    )
 397                WINKLERSCORE.append(winklerscore)
 398                COVERAGE.append(coveragecalc)
 399            TIME.append(time.time() - start)
 400
 401        if self.estimators == "all":
 402            self.regressors = MTSREGRESSORS
 403        else:
 404            self.regressors = [
 405                ("MTS(GenericBooster(" + est[0] + "))", est[1])
 406                for est in all_estimators()
 407                if (
 408                    issubclass(est[1], RegressorMixin)
 409                    and (est[0] in self.estimators)
 410                )
 411            ]
 412
 413        if self.preprocess is True:
 414            for name, model in tqdm(self.regressors):  # do parallel exec
 415                start = time.time()
 416                try:
 417                    if "random_state" in model().get_params().keys():
 418                        pipe = Pipeline(
 419                            steps=[
 420                                ("preprocessor", preprocessor),
 421                                (
 422                                    "regressor",
 423                                    ns.MTS(
 424                                        obj=GenericBoostingRegressor(
 425                                            model(
 426                                                random_state=self.random_state,
 427                                                **kwargs,
 428                                            )
 429                                        ),
 430                                        n_hidden_features=self.n_hidden_features,
 431                                        activation_name=self.activation_name,
 432                                        a=self.a,
 433                                        nodes_sim=self.nodes_sim,
 434                                        bias=self.bias,
 435                                        dropout=self.dropout,
 436                                        direct_link=self.direct_link,
 437                                        n_clusters=self.n_clusters,
 438                                        cluster_encode=self.cluster_encode,
 439                                        type_clust=self.type_clust,
 440                                        type_scaling=self.type_scaling,
 441                                        lags=self.lags,
 442                                        type_pi=self.type_pi,
 443                                        block_size=self.block_size,
 444                                        replications=self.replications,
 445                                        kernel=self.kernel,
 446                                        agg=self.agg,
 447                                        seed=self.seed,
 448                                        backend=self.backend,
 449                                        show_progress=self.show_progress,
 450                                    ),
 451                                ),
 452                            ]
 453                        )
 454                    else:  # "random_state" in model().get_params().keys()
 455                        pipe = Pipeline(
 456                            steps=[
 457                                ("preprocessor", preprocessor),
 458                                (
 459                                    "regressor",
 460                                    ns.MTS(
 461                                        obj=GenericBoostingRegressor(
 462                                            model(**kwargs)
 463                                        ),
 464                                        n_hidden_features=self.n_hidden_features,
 465                                        activation_name=self.activation_name,
 466                                        a=self.a,
 467                                        nodes_sim=self.nodes_sim,
 468                                        bias=self.bias,
 469                                        dropout=self.dropout,
 470                                        direct_link=self.direct_link,
 471                                        n_clusters=self.n_clusters,
 472                                        cluster_encode=self.cluster_encode,
 473                                        type_clust=self.type_clust,
 474                                        type_scaling=self.type_scaling,
 475                                        lags=self.lags,
 476                                        type_pi=self.type_pi,
 477                                        block_size=self.block_size,
 478                                        replications=self.replications,
 479                                        kernel=self.kernel,
 480                                        agg=self.agg,
 481                                        seed=self.seed,
 482                                        backend=self.backend,
 483                                        show_progress=self.show_progress,
 484                                    ),
 485                                ),
 486                            ]
 487                        )
 488
 489                    pipe.fit(X_train, **kwargs)
 490                    # pipe.fit(X_train, xreg=xreg)
 491
 492                    self.models_[name] = pipe
 493
 494                    if self.h is None:
 495                        X_pred = pipe["regressor"].predict(h=self.h, **kwargs)
 496                    else:
 497                        assert self.h > 0, "h must be > 0"
 498                        X_pred = pipe["regressor"].predict(h=self.h, **kwargs)
 499
 500                    if (self.replications is not None) or (
 501                        self.type_pi == "gaussian"
 502                    ):
 503                        if per_series == False:
 504                            rmse = np.sqrt(np.mean((X_test - X_pred.mean) ** 2))
 505                            mae = mean_absolute_error(X_test, X_pred.mean)
 506                            mpl = mean_pinball_loss(X_test, X_pred.mean)
 507                            winklerscore = winkler_score(
 508                                obj=X_pred, actual=X_test, level=95
 509                            )
 510                            coveragecalc = coverage(X_pred, X_test, level=95)
 511                        else:
 512                            rmse = mean_errors(
 513                                actual=X_test,
 514                                pred=X_pred,
 515                                scoring="root_mean_squared_error",
 516                                per_series=True,
 517                            )
 518                            mae = mean_errors(
 519                                actual=X_test,
 520                                pred=X_pred,
 521                                scoring="mean_absolute_error",
 522                                per_series=True,
 523                            )
 524                            mpl = mean_errors(
 525                                actual=X_test,
 526                                pred=X_pred,
 527                                scoring="mean_pinball_loss",
 528                                per_series=True,
 529                            )
 530                            winklerscore = winkler_score(
 531                                obj=X_pred,
 532                                actual=X_test,
 533                                level=95,
 534                                per_series=True,
 535                            )
 536                            coveragecalc = coverage(
 537                                X_pred, X_test, level=95, per_series=True
 538                            )
 539                    else:
 540                        if per_series == False:
 541                            rmse = np.sqrt(np.mean((X_test - X_pred) ** 2))
 542                            mae = mean_absolute_error(X_test, X_pred)
 543                            mpl = mean_pinball_loss(X_test, X_pred)
 544                        else:
 545                            rmse = mean_errors(
 546                                actual=X_test,
 547                                pred=X_pred,
 548                                scoring="root_mean_squared_error",
 549                                per_series=True,
 550                            )
 551                            mae = mean_errors(
 552                                actual=X_test,
 553                                pred=X_pred,
 554                                scoring="mean_absolute_error",
 555                                per_series=True,
 556                            )
 557                            mpl = mean_errors(
 558                                actual=X_test,
 559                                pred=X_pred,
 560                                scoring="mean_pinball_loss",
 561                                per_series=True,
 562                            )
 563
 564                    names.append(name)
 565                    RMSE.append(rmse)
 566                    MAE.append(mae)
 567                    MPL.append(mpl)
 568
 569                    if (self.replications is not None) or (
 570                        self.type_pi == "gaussian"
 571                    ):
 572                        WINKLERSCORE.append(winklerscore)
 573                        COVERAGE.append(coveragecalc)
 574                    TIME.append(time.time() - start)
 575
 576                    if self.custom_metric is not None:
 577                        try:
 578                            custom_metric = self.custom_metric(X_test, X_pred)
 579                            CUSTOM_METRIC.append(custom_metric)
 580                        except Exception as e:
 581                            custom_metric = np.iinfo(np.float32).max
 582                            CUSTOM_METRIC.append(custom_metric)
 583
 584                    if self.verbose > 0:
 585                        if (self.replications is not None) or (
 586                            self.type_pi == "gaussian"
 587                        ):
 588                            scores_verbose = {
 589                                "Model": name,
 590                                "RMSE": rmse,
 591                                "MAE": mae,
 592                                "MPL": mpl,
 593                                "WINKLERSCORE": winklerscore,
 594                                "COVERAGE": coveragecalc,
 595                                "Time taken": time.time() - start,
 596                            }
 597                        else:
 598                            scores_verbose = {
 599                                "Model": name,
 600                                "RMSE": rmse,
 601                                "MAE": mae,
 602                                "MPL": mpl,
 603                                "Time taken": time.time() - start,
 604                            }
 605
 606                        if self.custom_metric is not None:
 607                            scores_verbose["Custom metric"] = custom_metric
 608
 609                    if self.predictions:
 610                        predictions[name] = X_pred
 611                except Exception as exception:
 612                    if self.ignore_warnings is False:
 613                        print(name + " model failed to execute")
 614                        print(exception)
 615
 616        else:  # no preprocessing
 617
 618            for name, model in tqdm(self.regressors):  # do parallel exec
 619                start = time.time()
 620                try:
 621                    if "random_state" in model().get_params().keys():
 622                        pipe = ns.MTS(
 623                            obj=model(random_state=self.random_state, **kwargs),
 624                            n_hidden_features=self.n_hidden_features,
 625                            activation_name=self.activation_name,
 626                            a=self.a,
 627                            nodes_sim=self.nodes_sim,
 628                            bias=self.bias,
 629                            dropout=self.dropout,
 630                            direct_link=self.direct_link,
 631                            n_clusters=self.n_clusters,
 632                            cluster_encode=self.cluster_encode,
 633                            type_clust=self.type_clust,
 634                            type_scaling=self.type_scaling,
 635                            lags=self.lags,
 636                            type_pi=self.type_pi,
 637                            block_size=self.block_size,
 638                            replications=self.replications,
 639                            kernel=self.kernel,
 640                            agg=self.agg,
 641                            seed=self.seed,
 642                            backend=self.backend,
 643                            show_progress=self.show_progress,
 644                        )
 645                    else:
 646                        pipe = ns.MTS(
 647                            obj=model(**kwargs),
 648                            n_hidden_features=self.n_hidden_features,
 649                            activation_name=self.activation_name,
 650                            a=self.a,
 651                            nodes_sim=self.nodes_sim,
 652                            bias=self.bias,
 653                            dropout=self.dropout,
 654                            direct_link=self.direct_link,
 655                            n_clusters=self.n_clusters,
 656                            cluster_encode=self.cluster_encode,
 657                            type_clust=self.type_clust,
 658                            type_scaling=self.type_scaling,
 659                            lags=self.lags,
 660                            type_pi=self.type_pi,
 661                            block_size=self.block_size,
 662                            replications=self.replications,
 663                            kernel=self.kernel,
 664                            agg=self.agg,
 665                            seed=self.seed,
 666                            backend=self.backend,
 667                            show_progress=self.show_progress,
 668                        )
 669
 670                    pipe.fit(X_train, xreg, **kwargs)
 671                    # pipe.fit(X_train, xreg=xreg) # DO xreg like in `ahead`
 672
 673                    self.models_[name] = pipe
 674
 675                    if self.preprocess is True:
 676                        if self.h is None:
 677                            X_pred = pipe["regressor"].predict(
 678                                h=X_test.shape[0], **kwargs
 679                            )
 680                        else:
 681                            assert (
 682                                self.h > 0 and self.h <= X_test.shape[0]
 683                            ), "h must be > 0 and < X_test.shape[0]"
 684                            X_pred = pipe["regressor"].predict(
 685                                h=self.h, **kwargs
 686                            )
 687
 688                    else:
 689
 690                        if self.h is None:
 691                            X_pred = pipe.predict(
 692                                h=X_test.shape[0], **kwargs
 693                            )  # X_pred = pipe.predict(h=X_test.shape[0], new_xreg=new_xreg) ## DO xreg like in `ahead`
 694                        else:
 695                            assert (
 696                                self.h > 0 and self.h <= X_test.shape[0]
 697                            ), "h must be > 0 and < X_test.shape[0]"
 698                            X_pred = pipe.predict(h=self.h, **kwargs)
 699
 700                    if self.h is None:
 701                        if (self.replications is not None) or (
 702                            self.type_pi == "gaussian"
 703                        ):
 704
 705                            if per_series == True:
 706                                rmse = mean_errors(
 707                                    actual=X_test,
 708                                    pred=X_pred.mean,
 709                                    scoring="root_mean_squared_error",
 710                                    per_series=True,
 711                                )
 712                                mae = mean_errors(
 713                                    actual=X_test,
 714                                    pred=X_pred.mean,
 715                                    scoring="mean_absolute_error",
 716                                    per_series=True,
 717                                )
 718                                mpl = mean_errors(
 719                                    actual=X_test,
 720                                    pred=X_pred.mean,
 721                                    scoring="mean_pinball_loss",
 722                                    per_series=True,
 723                                )
 724                                winklerscore = winkler_score(
 725                                    obj=X_pred,
 726                                    actual=X_test,
 727                                    level=95,
 728                                    per_series=True,
 729                                )
 730                                coveragecalc = coverage(
 731                                    X_pred, X_test, level=95, per_series=True
 732                                )
 733                            else:
 734                                rmse = np.sqrt(
 735                                    np.mean((X_test - X_pred.mean) ** 2)
 736                                )
 737                                mae = mean_absolute_error(X_test, X_pred.mean)
 738                                mpl = mean_pinball_loss(X_test, X_pred.mean)
 739                                winklerscore = winkler_score(
 740                                    obj=X_pred, actual=X_test, level=95
 741                                )
 742                                coveragecalc = coverage(
 743                                    X_pred, X_test, level=95
 744                                )
 745                        else:  # no prediction interval
 746                            if per_series == True:
 747                                rmse = mean_errors(
 748                                    actual=X_test,
 749                                    pred=X_pred,
 750                                    scoring="root_mean_squared_error",
 751                                    per_series=True,
 752                                )
 753                                mae = mean_errors(
 754                                    actual=X_test,
 755                                    pred=X_pred,
 756                                    scoring="mean_absolute_error",
 757                                    per_series=True,
 758                                )
 759                                mpl = mean_errors(
 760                                    actual=X_test,
 761                                    pred=X_pred,
 762                                    scoring="mean_pinball_loss",
 763                                    per_series=True,
 764                                )
 765                            else:
 766                                rmse = np.sqrt(np.mean((X_test - X_pred) ** 2))
 767                                mae = mean_absolute_error(X_test, X_pred)
 768                                mpl = mean_pinball_loss(X_test, X_pred)
 769                    else:  # self.h is not None
 770                        if (self.replications is not None) or (
 771                            self.type_pi == "gaussian"
 772                        ):
 773
 774                            if per_series == False:
 775                                if isinstance(X_test, pd.DataFrame) == False:
 776                                    X_test_h = X_test[0 : self.h, :]
 777                                    rmse = np.sqrt(
 778                                        np.mean((X_test_h - X_pred.mean) ** 2)
 779                                    )
 780                                    mae = mean_absolute_error(
 781                                        X_test_h, X_pred.mean
 782                                    )
 783                                    mpl = mean_pinball_loss(
 784                                        X_test_h, X_pred.mean
 785                                    )
 786                                    winklerscore = winkler_score(
 787                                        obj=X_pred, actual=X_test_h, level=95
 788                                    )
 789                                    coveragecalc = coverage(
 790                                        X_pred, X_test_h, level=95
 791                                    )
 792                                else:
 793                                    X_test_h = X_test.iloc[0 : self.h, :]
 794                                    rmse = np.sqrt(
 795                                        np.mean((X_test_h - X_pred.mean) ** 2)
 796                                    )
 797                                    mae = mean_absolute_error(
 798                                        X_test_h, X_pred.mean
 799                                    )
 800                                    mpl = mean_pinball_loss(
 801                                        X_test_h, X_pred.mean
 802                                    )
 803                                    winklerscore = winkler_score(
 804                                        obj=X_pred, actual=X_test_h, level=95
 805                                    )
 806                                    coveragecalc = coverage(
 807                                        X_pred, X_test_h, level=95
 808                                    )
 809                            else:
 810                                if isinstance(X_test, pd.DataFrame):
 811                                    X_test_h = X_test.iloc[0 : self.h, :]
 812                                    rmse = mean_errors(
 813                                        actual=X_test_h,
 814                                        pred=X_pred,
 815                                        scoring="root_mean_squared_error",
 816                                        per_series=True,
 817                                    )
 818                                    mae = mean_errors(
 819                                        actual=X_test_h,
 820                                        pred=X_pred,
 821                                        scoring="mean_absolute_error",
 822                                        per_series=True,
 823                                    )
 824                                    mpl = mean_errors(
 825                                        actual=X_test_h,
 826                                        pred=X_pred,
 827                                        scoring="mean_pinball_loss",
 828                                        per_series=True,
 829                                    )
 830                                    winklerscore = winkler_score(
 831                                        obj=X_pred,
 832                                        actual=X_test_h,
 833                                        level=95,
 834                                        per_series=True,
 835                                    )
 836                                    coveragecalc = coverage(
 837                                        X_pred,
 838                                        X_test_h,
 839                                        level=95,
 840                                        per_series=True,
 841                                    )
 842                                else:
 843                                    X_test_h = X_test[0 : self.h, :]
 844                                    rmse = mean_errors(
 845                                        actual=X_test_h,
 846                                        pred=X_pred,
 847                                        scoring="root_mean_squared_error",
 848                                        per_series=True,
 849                                    )
 850                                    mae = mean_errors(
 851                                        actual=X_test_h,
 852                                        pred=X_pred,
 853                                        scoring="mean_absolute_error",
 854                                        per_series=True,
 855                                    )
 856                                    mpl = mean_errors(
 857                                        actual=X_test_h,
 858                                        pred=X_pred,
 859                                        scoring="mean_pinball_loss",
 860                                        per_series=True,
 861                                    )
 862                                    winklerscore = winkler_score(
 863                                        obj=X_pred,
 864                                        actual=X_test_h,
 865                                        level=95,
 866                                        per_series=True,
 867                                    )
 868                                    coveragecalc = coverage(
 869                                        X_pred,
 870                                        X_test_h,
 871                                        level=95,
 872                                        per_series=True,
 873                                    )
 874                        else:  # no prediction interval
 875
 876                            if per_series == False:
 877                                if isinstance(X_test, pd.DataFrame):
 878                                    X_test_h = X_test.iloc[0 : self.h, :]
 879                                    rmse = np.sqrt(
 880                                        np.mean((X_test_h - X_pred) ** 2)
 881                                    )
 882                                    mae = mean_absolute_error(X_test_h, X_pred)
 883                                    mpl = mean_pinball_loss(X_test_h, X_pred)
 884                                else:
 885                                    X_test_h = X_test[0 : self.h, :]
 886                                    rmse = np.sqrt(
 887                                        np.mean((X_test_h - X_pred) ** 2)
 888                                    )
 889                                    mae = mean_absolute_error(X_test_h, X_pred)
 890                                    mpl = mean_pinball_loss(X_test_h, X_pred)
 891                            else:
 892                                if isinstance(X_test, pd.DataFrame):
 893                                    X_test_h = X_test.iloc[0 : self.h, :]
 894                                    rmse = mean_errors(
 895                                        actual=X_test_h,
 896                                        pred=X_pred,
 897                                        scoring="root_mean_squared_error",
 898                                        per_series=True,
 899                                    )
 900                                    mae = mean_errors(
 901                                        actual=X_test_h,
 902                                        pred=X_pred,
 903                                        scoring="mean_absolute_error",
 904                                        per_series=True,
 905                                    )
 906                                    mpl = mean_errors(
 907                                        actual=X_test_h,
 908                                        pred=X_pred,
 909                                        scoring="mean_pinball_loss",
 910                                        per_series=True,
 911                                    )
 912                                else:
 913                                    X_test_h = X_test[0 : self.h, :]
 914                                    rmse = mean_errors(
 915                                        actual=X_test_h,
 916                                        pred=X_pred,
 917                                        scoring="root_mean_squared_error",
 918                                        per_series=True,
 919                                    )
 920                                    mae = mean_errors(
 921                                        actual=X_test_h,
 922                                        pred=X_pred,
 923                                        scoring="mean_absolute_error",
 924                                        per_series=True,
 925                                    )
 926
 927                    names.append(name)
 928                    RMSE.append(rmse)
 929                    MAE.append(mae)
 930                    MPL.append(mpl)
 931                    if (self.replications is not None) or (
 932                        self.type_pi == "gaussian"
 933                    ):
 934                        WINKLERSCORE.append(winklerscore)
 935                        COVERAGE.append(coveragecalc)
 936                    TIME.append(time.time() - start)
 937
 938                    if self.custom_metric is not None:
 939                        try:
 940                            if self.h is None:
 941                                custom_metric = self.custom_metric(
 942                                    X_test, X_pred
 943                                )
 944                            else:
 945                                custom_metric = self.custom_metric(
 946                                    X_test_h, X_pred
 947                                )
 948                            CUSTOM_METRIC.append(custom_metric)
 949                        except Exception as e:
 950                            custom_metric = np.iinfo(np.float32).max
 951                            CUSTOM_METRIC.append(np.iinfo(np.float32).max)
 952
 953                    if self.verbose > 0:
 954                        if (self.replications is not None) or (
 955                            self.type_pi == "gaussian"
 956                        ):
 957                            scores_verbose = {
 958                                "Model": name,
 959                                "RMSE": rmse,
 960                                "MAE": mae,
 961                                "MPL": mpl,
 962                                "WINKLERSCORE": winklerscore,
 963                                "COVERAGE": coveragecalc,
 964                                "Time taken": time.time() - start,
 965                            }
 966                        else:
 967                            scores_verbose = {
 968                                "Model": name,
 969                                "RMSE": rmse,
 970                                "MAE": mae,
 971                                "MPL": mpl,
 972                                "Time taken": time.time() - start,
 973                            }
 974
 975                        if self.custom_metric is not None:
 976                            scores_verbose["Custom metric"] = custom_metric
 977
 978                    if self.predictions:
 979                        predictions[name] = X_pred
 980
 981                except Exception as exception:
 982                    if self.ignore_warnings is False:
 983                        print(name + " model failed to execute")
 984                        print(exception)
 985
 986        if (self.replications is not None) or (self.type_pi == "gaussian"):
 987            scores = {
 988                "Model": names,
 989                "RMSE": RMSE,
 990                "MAE": MAE,
 991                "MPL": MPL,
 992                "WINKLERSCORE": WINKLERSCORE,
 993                "COVERAGE": COVERAGE,
 994                "Time Taken": TIME,
 995            }
 996        else:
 997            scores = {
 998                "Model": names,
 999                "RMSE": RMSE,
1000                "MAE": MAE,
1001                "MPL": MPL,
1002                "Time Taken": TIME,
1003            }
1004
1005        if self.custom_metric is not None:
1006            scores["Custom metric"] = CUSTOM_METRIC
1007
1008        if per_series:
1009            scores = dict_to_dataframe_series(scores, self.series_names)
1010        else:
1011            scores = pd.DataFrame(scores)
1012
1013        try:  # case per_series, can't be sorted
1014            scores = scores.sort_values(
1015                by=self.sort_by, ascending=True
1016            ).set_index("Model")
1017
1018            self.best_model_ = self.models_[scores.index[0]]
1019        except Exception as e:
1020            pass
1021
1022        if self.predictions is True:
1023
1024            return scores, predictions
1025
1026        return scores
1027
1028    def get_best_model(self):
1029        """
1030        This function returns the best model pipeline based on the sort_by metric.
1031
1032        Returns:
1033
1034            best_model: object,
1035                Returns the best model pipeline based on the sort_by metric.
1036
1037        """
1038        return self.best_model_
1039
1040    def provide_models(self, X_train, X_test):
1041        """
1042        This function returns all the model objects trained in fit function.
1043        If fit is not called already, then we call fit and then return the models.
1044
1045        Parameters:
1046
1047            X_train : array-like,
1048                Training vectors, where rows is the number of samples
1049                and columns is the number of features.
1050
1051            X_test : array-like,
1052                Testing vectors, where rows is the number of samples
1053                and columns is the number of features.
1054
1055        Returns:
1056
1057            models: dict-object,
1058                Returns a dictionary with each model pipeline as value
1059                with key as name of models.
1060
1061        """
1062        if self.h is None:
1063            if len(self.models_.keys()) == 0:
1064                self.fit(X_train, X_test)
1065        else:
1066            if len(self.models_.keys()) == 0:
1067                if isinstance(X_test, pd.DataFrame):
1068                    self.fit(X_train, X_test.iloc[0 : self.h, :])
1069                else:
1070                    self.fit(X_train, X_test[0 : self.h, :])
1071
1072        return self.models_

Fitting -- almost -- all the regression algorithms with layers of nnetsauce's CustomRegressor to multivariate time series and returning their scores.

Parameters:

verbose: int, optional (default=0)
    Any positive number for verbosity.

ignore_warnings: bool, optional (default=True)
    When set to True, the warnings related to algorithms that are not
    able to run are ignored.

custom_metric: function, optional (default=None)
    When function is provided, models are evaluated based on the custom
      evaluation metric provided.

predictions: bool, optional (default=False)
    When set to True, the predictions of all the models are returned (as a dict keyed by model name) along with the scores.

sort_by: string, optional (default='RMSE')
    Sort models by a metric. Available options are 'RMSE', 'MAE', 'MPL', 'MPE', 'MAPE',
    'R-Squared', 'Adjusted R-Squared' or a custom metric identified by its name and
    provided by custom_metric.

random_state: int, optional (default=42)
    Reproducibility seed.

estimators: list, optional (default='all')
    list of Estimators (regression algorithms) names or just 'all' (default='all')

preprocess: bool, preprocessing is done when set to True

h: int, optional (default=None)
    Number of steps ahead to predict (when used, must be > 0 and < X_test.shape[0]).

All the other parameters are the same as MTS's.

Attributes:

models_: dict-object
    Returns a dictionary with each model pipeline as value
    with key as name of models.

best_model_: object
    Returns the best model pipeline based on the sort_by metric.

Examples:

See https://thierrymoudiki.github.io/blog/2023/10/29/python/quasirandomizednn/MTS-LazyPredict
def fit(self, X_train, X_test, xreg=None, per_series=False, **kwargs):
 238    def fit(self, X_train, X_test, xreg=None, per_series=False, **kwargs):
 239        """Fit Regression algorithms to X_train, predict and score on X_test.
 240
 241        Parameters:
 242
 243            X_train: array-like or data frame,
 244                Training vectors, where rows is the number of samples
 245                and columns is the number of features.
 246
 247            X_test: array-like or data frame,
 248                Testing vectors, where rows is the number of samples
 249                and columns is the number of features.
 250
 251            xreg: array-like, optional (default=None)
 252                Additional (external) regressors to be passed to self.obj
 253                xreg must be in 'increasing' order (most recent observations last)
 254
 255            per_series: bool, optional (default=False)
 256                When set to True, the metrics are computed series by series.
 257
 258            **kwargs: dict, optional (default=None)
 259                Additional parameters to be passed to `fit` method of `obj`.
 260
 261        Returns:
 262
 263            scores: Pandas DataFrame
 264                Returns metrics of all the models in a Pandas DataFrame.
 265
 266            predictions: Pandas DataFrame
 267                Returns predictions of all the models in a Pandas DataFrame.
 268
 269        """
 270        R2 = []
 271        ADJR2 = []
 272        ME = []
 273        MPL = []
 274        RMSE = []
 275        MAE = []
 276        MPE = []
 277        MAPE = []
 278        WINKLERSCORE = []
 279        COVERAGE = []
 280
 281        # WIN = []
 282        names = []
 283        TIME = []
 284        predictions = {}
 285
 286        if self.custom_metric is not None:
 287            CUSTOM_METRIC = []
 288
 289        if self.h is None:
 290            assert X_test is not None, "If h is None, X_test must be provided."
 291
 292        if isinstance(X_train, np.ndarray):
 293            X_train = pd.DataFrame(X_train)
 294            X_test = pd.DataFrame(X_test)
 295
 296        self.series_names = X_train.columns.tolist()
 297
 298        X_train = convert_df_to_numeric(X_train)
 299        X_test = convert_df_to_numeric(X_test)
 300
 301        numeric_features = X_train.select_dtypes(include=[np.number]).columns
 302        categorical_features = X_train.select_dtypes(include=["object"]).columns
 303
 304        categorical_low, categorical_high = get_card_split(
 305            X_train, categorical_features
 306        )
 307
 308        if self.preprocess:
 309            preprocessor = ColumnTransformer(
 310                transformers=[
 311                    ("numeric", numeric_transformer, numeric_features),
 312                    (
 313                        "categorical_low",
 314                        categorical_transformer_low,
 315                        categorical_low,
 316                    ),
 317                    (
 318                        "categorical_high",
 319                        categorical_transformer_high,
 320                        categorical_high,
 321                    ),
 322                ]
 323            )
 324
 325        # baselines (Classical MTS) ----
 326        for i, name in enumerate(["ARIMA", "ETS", "Theta", "VAR", "VECM"]):
 327            try:
 328                start = time.time()
 329                regr = ns.ClassicalMTS(model=name)
 330                regr.fit(X_train, **kwargs)
 331                self.models_[name] = regr
 332                if self.h is None:
 333                    X_pred = regr.predict(h=X_test.shape[0], **kwargs)
 334                else:
 335                    assert self.h > 0, "h must be > 0"
 336                    X_pred = regr.predict(h=self.h, **kwargs)
 337                    try:
 338                        X_test = X_test[0 : self.h, :]
 339                    except Exception as e:
 340                        X_test = X_test.iloc[0 : self.h, :]
 341
 342                if per_series == False:
 343                    rmse = np.sqrt(np.mean((X_test - X_pred.mean) ** 2))
 344                    mae = mean_absolute_error(X_test, X_pred.mean)
 345                    mpl = mean_pinball_loss(X_test, X_pred.mean)
 346                else:
 347                    rmse = mean_errors(
 348                        actual=X_test,
 349                        pred=X_pred,
 350                        scoring="root_mean_squared_error",
 351                        per_series=True,
 352                    )
 353                    mae = mean_errors(
 354                        actual=X_test,
 355                        pred=X_pred,
 356                        scoring="mean_absolute_error",
 357                        per_series=True,
 358                    )
 359                    mpl = mean_errors(
 360                        actual=X_test,
 361                        pred=X_pred,
 362                        scoring="mean_pinball_loss",
 363                        per_series=True,
 364                    )
 365            except Exception as exception:
 366                continue
 367
 368            names.append(name)
 369            RMSE.append(rmse)
 370            MAE.append(mae)
 371            MPL.append(mpl)
 372
 373            if self.custom_metric is not None:
 374                try:
 375                    if self.h is None:
 376                        custom_metric = self.custom_metric(X_test, X_pred)
 377                    else:
 378                        custom_metric = self.custom_metric(X_test_h, X_pred)
 379                    CUSTOM_METRIC.append(custom_metric)
 380                except Exception as e:
 381                    custom_metric = np.iinfo(np.float32).max
 382                    CUSTOM_METRIC.append(np.iinfo(np.float32).max)
 383
 384            if (self.replications is not None) or (self.type_pi == "gaussian"):
 385                if per_series == False:
 386                    winklerscore = winkler_score(
 387                        obj=X_pred, actual=X_test, level=95
 388                    )
 389                    coveragecalc = coverage(X_pred, X_test, level=95)
 390                else:
 391                    winklerscore = winkler_score(
 392                        obj=X_pred, actual=X_test, level=95, per_series=True
 393                    )
 394                    coveragecalc = coverage(
 395                        X_pred, X_test, level=95, per_series=True
 396                    )
 397                WINKLERSCORE.append(winklerscore)
 398                COVERAGE.append(coveragecalc)
 399            TIME.append(time.time() - start)
 400
 401        if self.estimators == "all":
 402            self.regressors = MTSREGRESSORS
 403        else:
 404            self.regressors = [
 405                ("MTS(GenericBooster(" + est[0] + "))", est[1])
 406                for est in all_estimators()
 407                if (
 408                    issubclass(est[1], RegressorMixin)
 409                    and (est[0] in self.estimators)
 410                )
 411            ]
 412
 413        if self.preprocess is True:
 414            for name, model in tqdm(self.regressors):  # do parallel exec
 415                start = time.time()
 416                try:
 417                    if "random_state" in model().get_params().keys():
 418                        pipe = Pipeline(
 419                            steps=[
 420                                ("preprocessor", preprocessor),
 421                                (
 422                                    "regressor",
 423                                    ns.MTS(
 424                                        obj=GenericBoostingRegressor(
 425                                            model(
 426                                                random_state=self.random_state,
 427                                                **kwargs,
 428                                            )
 429                                        ),
 430                                        n_hidden_features=self.n_hidden_features,
 431                                        activation_name=self.activation_name,
 432                                        a=self.a,
 433                                        nodes_sim=self.nodes_sim,
 434                                        bias=self.bias,
 435                                        dropout=self.dropout,
 436                                        direct_link=self.direct_link,
 437                                        n_clusters=self.n_clusters,
 438                                        cluster_encode=self.cluster_encode,
 439                                        type_clust=self.type_clust,
 440                                        type_scaling=self.type_scaling,
 441                                        lags=self.lags,
 442                                        type_pi=self.type_pi,
 443                                        block_size=self.block_size,
 444                                        replications=self.replications,
 445                                        kernel=self.kernel,
 446                                        agg=self.agg,
 447                                        seed=self.seed,
 448                                        backend=self.backend,
 449                                        show_progress=self.show_progress,
 450                                    ),
 451                                ),
 452                            ]
 453                        )
 454                    else:  # "random_state" in model().get_params().keys()
 455                        pipe = Pipeline(
 456                            steps=[
 457                                ("preprocessor", preprocessor),
 458                                (
 459                                    "regressor",
 460                                    ns.MTS(
 461                                        obj=GenericBoostingRegressor(
 462                                            model(**kwargs)
 463                                        ),
 464                                        n_hidden_features=self.n_hidden_features,
 465                                        activation_name=self.activation_name,
 466                                        a=self.a,
 467                                        nodes_sim=self.nodes_sim,
 468                                        bias=self.bias,
 469                                        dropout=self.dropout,
 470                                        direct_link=self.direct_link,
 471                                        n_clusters=self.n_clusters,
 472                                        cluster_encode=self.cluster_encode,
 473                                        type_clust=self.type_clust,
 474                                        type_scaling=self.type_scaling,
 475                                        lags=self.lags,
 476                                        type_pi=self.type_pi,
 477                                        block_size=self.block_size,
 478                                        replications=self.replications,
 479                                        kernel=self.kernel,
 480                                        agg=self.agg,
 481                                        seed=self.seed,
 482                                        backend=self.backend,
 483                                        show_progress=self.show_progress,
 484                                    ),
 485                                ),
 486                            ]
 487                        )
 488
 489                    pipe.fit(X_train, **kwargs)
 490                    # pipe.fit(X_train, xreg=xreg)
 491
 492                    self.models_[name] = pipe
 493
 494                    if self.h is None:
 495                        X_pred = pipe["regressor"].predict(h=self.h, **kwargs)
 496                    else:
 497                        assert self.h > 0, "h must be > 0"
 498                        X_pred = pipe["regressor"].predict(h=self.h, **kwargs)
 499
 500                    if (self.replications is not None) or (
 501                        self.type_pi == "gaussian"
 502                    ):
 503                        if per_series == False:
 504                            rmse = np.sqrt(np.mean((X_test - X_pred.mean) ** 2))
 505                            mae = mean_absolute_error(X_test, X_pred.mean)
 506                            mpl = mean_pinball_loss(X_test, X_pred.mean)
 507                            winklerscore = winkler_score(
 508                                obj=X_pred, actual=X_test, level=95
 509                            )
 510                            coveragecalc = coverage(X_pred, X_test, level=95)
 511                        else:
 512                            rmse = mean_errors(
 513                                actual=X_test,
 514                                pred=X_pred,
 515                                scoring="root_mean_squared_error",
 516                                per_series=True,
 517                            )
 518                            mae = mean_errors(
 519                                actual=X_test,
 520                                pred=X_pred,
 521                                scoring="mean_absolute_error",
 522                                per_series=True,
 523                            )
 524                            mpl = mean_errors(
 525                                actual=X_test,
 526                                pred=X_pred,
 527                                scoring="mean_pinball_loss",
 528                                per_series=True,
 529                            )
 530                            winklerscore = winkler_score(
 531                                obj=X_pred,
 532                                actual=X_test,
 533                                level=95,
 534                                per_series=True,
 535                            )
 536                            coveragecalc = coverage(
 537                                X_pred, X_test, level=95, per_series=True
 538                            )
 539                    else:
 540                        if per_series == False:
 541                            rmse = np.sqrt(np.mean((X_test - X_pred) ** 2))
 542                            mae = mean_absolute_error(X_test, X_pred)
 543                            mpl = mean_pinball_loss(X_test, X_pred)
 544                        else:
 545                            rmse = mean_errors(
 546                                actual=X_test,
 547                                pred=X_pred,
 548                                scoring="root_mean_squared_error",
 549                                per_series=True,
 550                            )
 551                            mae = mean_errors(
 552                                actual=X_test,
 553                                pred=X_pred,
 554                                scoring="mean_absolute_error",
 555                                per_series=True,
 556                            )
 557                            mpl = mean_errors(
 558                                actual=X_test,
 559                                pred=X_pred,
 560                                scoring="mean_pinball_loss",
 561                                per_series=True,
 562                            )
 563
 564                    names.append(name)
 565                    RMSE.append(rmse)
 566                    MAE.append(mae)
 567                    MPL.append(mpl)
 568
 569                    if (self.replications is not None) or (
 570                        self.type_pi == "gaussian"
 571                    ):
 572                        WINKLERSCORE.append(winklerscore)
 573                        COVERAGE.append(coveragecalc)
 574                    TIME.append(time.time() - start)
 575
 576                    if self.custom_metric is not None:
 577                        try:
 578                            custom_metric = self.custom_metric(X_test, X_pred)
 579                            CUSTOM_METRIC.append(custom_metric)
 580                        except Exception as e:
 581                            custom_metric = np.iinfo(np.float32).max
 582                            CUSTOM_METRIC.append(custom_metric)
 583
 584                    if self.verbose > 0:
 585                        if (self.replications is not None) or (
 586                            self.type_pi == "gaussian"
 587                        ):
 588                            scores_verbose = {
 589                                "Model": name,
 590                                "RMSE": rmse,
 591                                "MAE": mae,
 592                                "MPL": mpl,
 593                                "WINKLERSCORE": winklerscore,
 594                                "COVERAGE": coveragecalc,
 595                                "Time taken": time.time() - start,
 596                            }
 597                        else:
 598                            scores_verbose = {
 599                                "Model": name,
 600                                "RMSE": rmse,
 601                                "MAE": mae,
 602                                "MPL": mpl,
 603                                "Time taken": time.time() - start,
 604                            }
 605
 606                        if self.custom_metric is not None:
 607                            scores_verbose["Custom metric"] = custom_metric
 608
 609                    if self.predictions:
 610                        predictions[name] = X_pred
 611                except Exception as exception:
 612                    if self.ignore_warnings is False:
 613                        print(name + " model failed to execute")
 614                        print(exception)
 615
 616        else:  # no preprocessing
 617
 618            for name, model in tqdm(self.regressors):  # do parallel exec
 619                start = time.time()
 620                try:
 621                    if "random_state" in model().get_params().keys():
 622                        pipe = ns.MTS(
 623                            obj=model(random_state=self.random_state, **kwargs),
 624                            n_hidden_features=self.n_hidden_features,
 625                            activation_name=self.activation_name,
 626                            a=self.a,
 627                            nodes_sim=self.nodes_sim,
 628                            bias=self.bias,
 629                            dropout=self.dropout,
 630                            direct_link=self.direct_link,
 631                            n_clusters=self.n_clusters,
 632                            cluster_encode=self.cluster_encode,
 633                            type_clust=self.type_clust,
 634                            type_scaling=self.type_scaling,
 635                            lags=self.lags,
 636                            type_pi=self.type_pi,
 637                            block_size=self.block_size,
 638                            replications=self.replications,
 639                            kernel=self.kernel,
 640                            agg=self.agg,
 641                            seed=self.seed,
 642                            backend=self.backend,
 643                            show_progress=self.show_progress,
 644                        )
 645                    else:
 646                        pipe = ns.MTS(
 647                            obj=model(**kwargs),
 648                            n_hidden_features=self.n_hidden_features,
 649                            activation_name=self.activation_name,
 650                            a=self.a,
 651                            nodes_sim=self.nodes_sim,
 652                            bias=self.bias,
 653                            dropout=self.dropout,
 654                            direct_link=self.direct_link,
 655                            n_clusters=self.n_clusters,
 656                            cluster_encode=self.cluster_encode,
 657                            type_clust=self.type_clust,
 658                            type_scaling=self.type_scaling,
 659                            lags=self.lags,
 660                            type_pi=self.type_pi,
 661                            block_size=self.block_size,
 662                            replications=self.replications,
 663                            kernel=self.kernel,
 664                            agg=self.agg,
 665                            seed=self.seed,
 666                            backend=self.backend,
 667                            show_progress=self.show_progress,
 668                        )
 669
 670                    pipe.fit(X_train, xreg, **kwargs)
 671                    # pipe.fit(X_train, xreg=xreg) # DO xreg like in `ahead`
 672
 673                    self.models_[name] = pipe
 674
 675                    if self.preprocess is True:
 676                        if self.h is None:
 677                            X_pred = pipe["regressor"].predict(
 678                                h=X_test.shape[0], **kwargs
 679                            )
 680                        else:
 681                            assert (
 682                                self.h > 0 and self.h <= X_test.shape[0]
 683                            ), "h must be > 0 and < X_test.shape[0]"
 684                            X_pred = pipe["regressor"].predict(
 685                                h=self.h, **kwargs
 686                            )
 687
 688                    else:
 689
 690                        if self.h is None:
 691                            X_pred = pipe.predict(
 692                                h=X_test.shape[0], **kwargs
 693                            )  # X_pred = pipe.predict(h=X_test.shape[0], new_xreg=new_xreg) ## DO xreg like in `ahead`
 694                        else:
 695                            assert (
 696                                self.h > 0 and self.h <= X_test.shape[0]
 697                            ), "h must be > 0 and < X_test.shape[0]"
 698                            X_pred = pipe.predict(h=self.h, **kwargs)
 699
 700                    if self.h is None:
 701                        if (self.replications is not None) or (
 702                            self.type_pi == "gaussian"
 703                        ):
 704
 705                            if per_series == True:
 706                                rmse = mean_errors(
 707                                    actual=X_test,
 708                                    pred=X_pred.mean,
 709                                    scoring="root_mean_squared_error",
 710                                    per_series=True,
 711                                )
 712                                mae = mean_errors(
 713                                    actual=X_test,
 714                                    pred=X_pred.mean,
 715                                    scoring="mean_absolute_error",
 716                                    per_series=True,
 717                                )
 718                                mpl = mean_errors(
 719                                    actual=X_test,
 720                                    pred=X_pred.mean,
 721                                    scoring="mean_pinball_loss",
 722                                    per_series=True,
 723                                )
 724                                winklerscore = winkler_score(
 725                                    obj=X_pred,
 726                                    actual=X_test,
 727                                    level=95,
 728                                    per_series=True,
 729                                )
 730                                coveragecalc = coverage(
 731                                    X_pred, X_test, level=95, per_series=True
 732                                )
 733                            else:
 734                                rmse = np.sqrt(
 735                                    np.mean((X_test - X_pred.mean) ** 2)
 736                                )
 737                                mae = mean_absolute_error(X_test, X_pred.mean)
 738                                mpl = mean_pinball_loss(X_test, X_pred.mean)
 739                                winklerscore = winkler_score(
 740                                    obj=X_pred, actual=X_test, level=95
 741                                )
 742                                coveragecalc = coverage(
 743                                    X_pred, X_test, level=95
 744                                )
 745                        else:  # no prediction interval
 746                            if per_series == True:
 747                                rmse = mean_errors(
 748                                    actual=X_test,
 749                                    pred=X_pred,
 750                                    scoring="root_mean_squared_error",
 751                                    per_series=True,
 752                                )
 753                                mae = mean_errors(
 754                                    actual=X_test,
 755                                    pred=X_pred,
 756                                    scoring="mean_absolute_error",
 757                                    per_series=True,
 758                                )
 759                                mpl = mean_errors(
 760                                    actual=X_test,
 761                                    pred=X_pred,
 762                                    scoring="mean_pinball_loss",
 763                                    per_series=True,
 764                                )
 765                            else:
 766                                rmse = np.sqrt(np.mean((X_test - X_pred) ** 2))
 767                                mae = mean_absolute_error(X_test, X_pred)
 768                                mpl = mean_pinball_loss(X_test, X_pred)
 769                    else:  # self.h is not None
 770                        if (self.replications is not None) or (
 771                            self.type_pi == "gaussian"
 772                        ):
 773
 774                            if per_series == False:
 775                                if isinstance(X_test, pd.DataFrame) == False:
 776                                    X_test_h = X_test[0 : self.h, :]
 777                                    rmse = np.sqrt(
 778                                        np.mean((X_test_h - X_pred.mean) ** 2)
 779                                    )
 780                                    mae = mean_absolute_error(
 781                                        X_test_h, X_pred.mean
 782                                    )
 783                                    mpl = mean_pinball_loss(
 784                                        X_test_h, X_pred.mean
 785                                    )
 786                                    winklerscore = winkler_score(
 787                                        obj=X_pred, actual=X_test_h, level=95
 788                                    )
 789                                    coveragecalc = coverage(
 790                                        X_pred, X_test_h, level=95
 791                                    )
 792                                else:
 793                                    X_test_h = X_test.iloc[0 : self.h, :]
 794                                    rmse = np.sqrt(
 795                                        np.mean((X_test_h - X_pred.mean) ** 2)
 796                                    )
 797                                    mae = mean_absolute_error(
 798                                        X_test_h, X_pred.mean
 799                                    )
 800                                    mpl = mean_pinball_loss(
 801                                        X_test_h, X_pred.mean
 802                                    )
 803                                    winklerscore = winkler_score(
 804                                        obj=X_pred, actual=X_test_h, level=95
 805                                    )
 806                                    coveragecalc = coverage(
 807                                        X_pred, X_test_h, level=95
 808                                    )
 809                            else:
 810                                if isinstance(X_test, pd.DataFrame):
 811                                    X_test_h = X_test.iloc[0 : self.h, :]
 812                                    rmse = mean_errors(
 813                                        actual=X_test_h,
 814                                        pred=X_pred,
 815                                        scoring="root_mean_squared_error",
 816                                        per_series=True,
 817                                    )
 818                                    mae = mean_errors(
 819                                        actual=X_test_h,
 820                                        pred=X_pred,
 821                                        scoring="mean_absolute_error",
 822                                        per_series=True,
 823                                    )
 824                                    mpl = mean_errors(
 825                                        actual=X_test_h,
 826                                        pred=X_pred,
 827                                        scoring="mean_pinball_loss",
 828                                        per_series=True,
 829                                    )
 830                                    winklerscore = winkler_score(
 831                                        obj=X_pred,
 832                                        actual=X_test_h,
 833                                        level=95,
 834                                        per_series=True,
 835                                    )
 836                                    coveragecalc = coverage(
 837                                        X_pred,
 838                                        X_test_h,
 839                                        level=95,
 840                                        per_series=True,
 841                                    )
 842                                else:
 843                                    X_test_h = X_test[0 : self.h, :]
 844                                    rmse = mean_errors(
 845                                        actual=X_test_h,
 846                                        pred=X_pred,
 847                                        scoring="root_mean_squared_error",
 848                                        per_series=True,
 849                                    )
 850                                    mae = mean_errors(
 851                                        actual=X_test_h,
 852                                        pred=X_pred,
 853                                        scoring="mean_absolute_error",
 854                                        per_series=True,
 855                                    )
 856                                    mpl = mean_errors(
 857                                        actual=X_test_h,
 858                                        pred=X_pred,
 859                                        scoring="mean_pinball_loss",
 860                                        per_series=True,
 861                                    )
 862                                    winklerscore = winkler_score(
 863                                        obj=X_pred,
 864                                        actual=X_test_h,
 865                                        level=95,
 866                                        per_series=True,
 867                                    )
 868                                    coveragecalc = coverage(
 869                                        X_pred,
 870                                        X_test_h,
 871                                        level=95,
 872                                        per_series=True,
 873                                    )
 874                        else:  # no prediction interval
 875
 876                            if per_series == False:
 877                                if isinstance(X_test, pd.DataFrame):
 878                                    X_test_h = X_test.iloc[0 : self.h, :]
 879                                    rmse = np.sqrt(
 880                                        np.mean((X_test_h - X_pred) ** 2)
 881                                    )
 882                                    mae = mean_absolute_error(X_test_h, X_pred)
 883                                    mpl = mean_pinball_loss(X_test_h, X_pred)
 884                                else:
 885                                    X_test_h = X_test[0 : self.h, :]
 886                                    rmse = np.sqrt(
 887                                        np.mean((X_test_h - X_pred) ** 2)
 888                                    )
 889                                    mae = mean_absolute_error(X_test_h, X_pred)
 890                                    mpl = mean_pinball_loss(X_test_h, X_pred)
 891                            else:
 892                                if isinstance(X_test, pd.DataFrame):
 893                                    X_test_h = X_test.iloc[0 : self.h, :]
 894                                    rmse = mean_errors(
 895                                        actual=X_test_h,
 896                                        pred=X_pred,
 897                                        scoring="root_mean_squared_error",
 898                                        per_series=True,
 899                                    )
 900                                    mae = mean_errors(
 901                                        actual=X_test_h,
 902                                        pred=X_pred,
 903                                        scoring="mean_absolute_error",
 904                                        per_series=True,
 905                                    )
 906                                    mpl = mean_errors(
 907                                        actual=X_test_h,
 908                                        pred=X_pred,
 909                                        scoring="mean_pinball_loss",
 910                                        per_series=True,
 911                                    )
 912                                else:
 913                                    X_test_h = X_test[0 : self.h, :]
 914                                    rmse = mean_errors(
 915                                        actual=X_test_h,
 916                                        pred=X_pred,
 917                                        scoring="root_mean_squared_error",
 918                                        per_series=True,
 919                                    )
 920                                    mae = mean_errors(
 921                                        actual=X_test_h,
 922                                        pred=X_pred,
 923                                        scoring="mean_absolute_error",
 924                                        per_series=True,
 925                                    )
 926
 927                    names.append(name)
 928                    RMSE.append(rmse)
 929                    MAE.append(mae)
 930                    MPL.append(mpl)
 931                    if (self.replications is not None) or (
 932                        self.type_pi == "gaussian"
 933                    ):
 934                        WINKLERSCORE.append(winklerscore)
 935                        COVERAGE.append(coveragecalc)
 936                    TIME.append(time.time() - start)
 937
 938                    if self.custom_metric is not None:
 939                        try:
 940                            if self.h is None:
 941                                custom_metric = self.custom_metric(
 942                                    X_test, X_pred
 943                                )
 944                            else:
 945                                custom_metric = self.custom_metric(
 946                                    X_test_h, X_pred
 947                                )
 948                            CUSTOM_METRIC.append(custom_metric)
 949                        except Exception as e:
 950                            custom_metric = np.iinfo(np.float32).max
 951                            CUSTOM_METRIC.append(np.iinfo(np.float32).max)
 952
 953                    if self.verbose > 0:
 954                        if (self.replications is not None) or (
 955                            self.type_pi == "gaussian"
 956                        ):
 957                            scores_verbose = {
 958                                "Model": name,
 959                                "RMSE": rmse,
 960                                "MAE": mae,
 961                                "MPL": mpl,
 962                                "WINKLERSCORE": winklerscore,
 963                                "COVERAGE": coveragecalc,
 964                                "Time taken": time.time() - start,
 965                            }
 966                        else:
 967                            scores_verbose = {
 968                                "Model": name,
 969                                "RMSE": rmse,
 970                                "MAE": mae,
 971                                "MPL": mpl,
 972                                "Time taken": time.time() - start,
 973                            }
 974
 975                        if self.custom_metric is not None:
 976                            scores_verbose["Custom metric"] = custom_metric
 977
 978                    if self.predictions:
 979                        predictions[name] = X_pred
 980
 981                except Exception as exception:
 982                    if self.ignore_warnings is False:
 983                        print(name + " model failed to execute")
 984                        print(exception)
 985
 986        if (self.replications is not None) or (self.type_pi == "gaussian"):
 987            scores = {
 988                "Model": names,
 989                "RMSE": RMSE,
 990                "MAE": MAE,
 991                "MPL": MPL,
 992                "WINKLERSCORE": WINKLERSCORE,
 993                "COVERAGE": COVERAGE,
 994                "Time Taken": TIME,
 995            }
 996        else:
 997            scores = {
 998                "Model": names,
 999                "RMSE": RMSE,
1000                "MAE": MAE,
1001                "MPL": MPL,
1002                "Time Taken": TIME,
1003            }
1004
1005        if self.custom_metric is not None:
1006            scores["Custom metric"] = CUSTOM_METRIC
1007
1008        if per_series:
1009            scores = dict_to_dataframe_series(scores, self.series_names)
1010        else:
1011            scores = pd.DataFrame(scores)
1012
1013        try:  # case per_series, can't be sorted
1014            scores = scores.sort_values(
1015                by=self.sort_by, ascending=True
1016            ).set_index("Model")
1017
1018            self.best_model_ = self.models_[scores.index[0]]
1019        except Exception as e:
1020            pass
1021
1022        if self.predictions is True:
1023
1024            return scores, predictions
1025
1026        return scores

Fit Regression algorithms to X_train, predict and score on X_test.

Parameters:

X_train: array-like or data frame,
    Training vectors, where rows is the number of samples
    and columns is the number of features.

X_test: array-like or data frame,
    Testing vectors, where rows is the number of samples
    and columns is the number of features.

xreg: array-like, optional (default=None)
    Additional (external) regressors to be passed to self.obj
    xreg must be in 'increasing' order (most recent observations last)

per_series: bool, optional (default=False)
    When set to True, the metrics are computed series by series.

**kwargs: dict, optional (default=None)
    Additional parameters forwarded to each model's constructor and to the
    `fit` and `predict` calls of the resulting object.

Returns:

scores: Pandas DataFrame
    Returns metrics of all the models in a Pandas DataFrame.

predictions: predictions of each model, keyed by model name
    Returned alongside `scores` only when `predictions` is set to True.
def provide_models(self, X_train, X_test):
1040    def provide_models(self, X_train, X_test):
1041        """
1042        This function returns all the model objects trained in fit function.
1043        If fit is not called already, then we call fit and then return the models.
1044
1045        Parameters:
1046
1047            X_train : array-like,
1048                Training vectors, where rows is the number of samples
1049                and columns is the number of features.
1050
1051            X_test : array-like,
1052                Testing vectors, where rows is the number of samples
1053                and columns is the number of features.
1054
1055        Returns:
1056
1057            models: dict-object,
1058                Returns a dictionary with each model pipeline as value
1059                with key as name of models.
1060
1061        """
1062        if self.h is None:
1063            if len(self.models_.keys()) == 0:
1064                self.fit(X_train, X_test)
1065        else:
1066            if len(self.models_.keys()) == 0:
1067                if isinstance(X_test, pd.DataFrame):
1068                    self.fit(X_train, X_test.iloc[0 : self.h, :])
1069                else:
1070                    self.fit(X_train, X_test[0 : self.h, :])
1071
1072        return self.models_

This function returns all the model objects trained by the `fit` function. If `fit` has not been called yet, it is called first and the resulting models are returned.

Parameters:

X_train : array-like,
    Training vectors, where rows is the number of samples
    and columns is the number of features.

X_test : array-like,
    Testing vectors, where rows is the number of samples
    and columns is the number of features.

Returns:

models: dict-object,
    Returns a dictionary with each model pipeline as value
    with key as name of models.
class LazyBoostingRegressor(sklearn.base.RegressorMixin):
 93class LazyBoostingRegressor(RegressorMixin):
 94    """
 95        Fitting -- almost -- all the regression algorithms
 96        and returning their scores.
 97
 98    Parameters:
 99
100        verbose: int, optional (default=0)
101            Any positive number for verbosity.
102
103        ignore_warnings: bool, optional (default=True)
104            When set to True, the warning related to algorigms that are not able to run are ignored.
105
106        custom_metric: function, optional (default=None)
107            When function is provided, models are evaluated based on the custom evaluation metric provided.
108
109        predictions: bool, optional (default=False)
110            When set to True, the predictions of all the models models are returned as dataframe.
111
112        sort_by: string, optional (default='RMSE')
113            Sort models by a metric. Available options are 'R-Squared', 'Adjusted R-Squared', 'RMSE', 'Time Taken' and 'Custom Metric'.
114            or a custom metric identified by its name and provided by custom_metric.
115
116        random_state: int, optional (default=42)
117            Reproducibiility seed.
118
119        estimators: list, optional (default='all')
120            list of Estimators names or just 'all' (default='all')
121
122        preprocess: bool
123            preprocessing is done when set to True
124
125        n_jobs : int, when possible, run in parallel
126            For now, only used by individual models that support it.
127
128        n_layers: int, optional (default=3)
129            Number of layers of CustomRegressors to be used.
130
131        All the other parameters are the same as CustomRegressor's.
132
133    Attributes:
134
135        models_: dict-object
136            Returns a dictionary with each model pipeline as value
137            with key as name of models.
138
139        best_model_: object
140            Returns the best model pipeline based on the sort_by metric.
141
142    Examples:
143
144        ```python
145        import os
146        import mlsauce as ms
147        from sklearn.datasets import load_diabetes
148        from sklearn.model_selection import train_test_split
149
150        data = load_diabetes()
151        X = data.data
152        y= data.target
153        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)
154
155        regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True,
156                                        custom_metric=None, preprocess=True)
157        models, predictioms = regr.fit(X_train, X_test, y_train, y_test)
158        model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
159        print(models)
160        ```
161
162    """
163
164    def __init__(
165        self,
166        verbose=0,
167        ignore_warnings=True,
168        custom_metric=None,
169        predictions=False,
170        sort_by="RMSE",
171        random_state=42,
172        estimators="all",
173        preprocess=False,
174        n_jobs=None,
175    ):
176        self.verbose = verbose
177        self.ignore_warnings = ignore_warnings
178        self.custom_metric = custom_metric
179        self.predictions = predictions
180        self.sort_by = sort_by
181        self.models_ = {}
182        self.best_model_ = None
183        self.random_state = random_state
184        self.estimators = estimators
185        self.preprocess = preprocess
186        self.n_jobs = n_jobs
187
188    def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs):
189        """Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test.
190
191        Parameters:
192
193            X_train : array-like,
194                Training vectors, where rows is the number of samples
195                and columns is the number of features.
196
197            X_test : array-like,
198                Testing vectors, where rows is the number of samples
199                and columns is the number of features.
200
201            y_train : array-like,
202                Training vectors, where rows is the number of samples
203                and columns is the number of features.
204
205            y_test : array-like,
206                Testing vectors, where rows is the number of samples
207                and columns is the number of features.
208
209            hist: bool, optional (default=False)
210                When set to True, the model is a HistGenericBoostingRegressor.
211
212            **kwargs: dict,
213                Additional parameters to be passed to the GenericBoostingRegressor.
214
215        Returns:
216        -------
217        scores:  Pandas DataFrame
218            Returns metrics of all the models in a Pandas DataFrame.
219
220        predictions : Pandas DataFrame
221            Returns predictions of all the models in a Pandas DataFrame.
222
223        """
224        R2 = []
225        ADJR2 = []
226        RMSE = []
227        # WIN = []
228        names = []
229        TIME = []
230        predictions = {}
231
232        if self.custom_metric:
233            CUSTOM_METRIC = []
234
235        if isinstance(X_train, np.ndarray):
236            X_train = pd.DataFrame(X_train)
237            X_test = pd.DataFrame(X_test)
238
239        numeric_features = X_train.select_dtypes(include=[np.number]).columns
240        categorical_features = X_train.select_dtypes(include=["object"]).columns
241
242        categorical_low, categorical_high = get_card_split(
243            X_train, categorical_features
244        )
245
246        if self.preprocess is True:
247            preprocessor = ColumnTransformer(
248                transformers=[
249                    ("numeric", numeric_transformer, numeric_features),
250                    (
251                        "categorical_low",
252                        categorical_transformer_low,
253                        categorical_low,
254                    ),
255                    (
256                        "categorical_high",
257                        categorical_transformer_high,
258                        categorical_high,
259                    ),
260                ]
261            )
262
263        # base models
264        try:
265            baseline_names = [
266                "RandomForestRegressor",
267                "XGBRegressor",
268                "GradientBoostingRegressor",
269            ]
270            baseline_models = [
271                RandomForestRegressor(),
272                xgb.XGBRegressor(),
273                GradientBoostingRegressor(),
274            ]
275        except Exception as exception:
276            baseline_names = [
277                "RandomForestRegressor",
278                "GradientBoostingRegressor",
279            ]
280            baseline_models = [
281                RandomForestRegressor(),
282                GradientBoostingRegressor(),
283            ]
284
285        if self.verbose > 0:
286            print("\n Fitting baseline models...")
287        for name, model in tqdm(zip(baseline_names, baseline_models)):
288            start = time.time()
289            try:
290                model.fit(X_train, y_train.ravel())
291                self.models_[name] = model
292                y_pred = model.predict(X_test)
293                r_squared = r2_score(y_test, y_pred)
294                adj_rsquared = adjusted_rsquared(
295                    r_squared, X_test.shape[0], X_test.shape[1]
296                )
297                rmse = root_mean_squared_error(y_test, y_pred)
298
299                names.append(name)
300                R2.append(r_squared)
301                ADJR2.append(adj_rsquared)
302                RMSE.append(rmse)
303                TIME.append(time.time() - start)
304
305                if self.custom_metric:
306                    custom_metric = self.custom_metric(y_test, y_pred)
307                    CUSTOM_METRIC.append(custom_metric)
308
309                if self.verbose > 0:
310                    scores_verbose = {
311                        "Model": name,
312                        "R-Squared": r_squared,
313                        "Adjusted R-Squared": adj_rsquared,
314                        "RMSE": rmse,
315                        "Time taken": time.time() - start,
316                    }
317
318                    if self.custom_metric:
319                        scores_verbose["Custom metric"] = custom_metric
320
321                    print(scores_verbose)
322                if self.predictions:
323                    predictions[name] = y_pred
324            except Exception as exception:
325                if self.ignore_warnings is False:
326                    print(name + " model failed to execute")
327                    print(exception)
328
329        if self.estimators == "all":
330            self.regressors = REGRESSORS
331        else:
332            self.regressors = [
333                ("GenericBooster(" + est[0] + ")", est[1](**kwargs))
334                for est in all_estimators()
335                if (
336                    issubclass(est[1], RegressorMixin)
337                    and (est[0] in self.estimators)
338                )
339            ]
340
341        if self.preprocess is True:
342
343            if self.n_jobs is None:
344
345                for name, regr in tqdm(self.regressors):  # do parallel exec
346
347                    start = time.time()
348
349                    try:
350
351                        if hist is False:
352
353                            model = GenericBoostingRegressor(
354                                base_model=regr(),
355                                verbose=self.verbose,
356                                **kwargs,
357                            )
358
359                        else:
360
361                            model = HistGenericBoostingRegressor(
362                                base_model=regr(),
363                                verbose=self.verbose,
364                                **kwargs,
365                            )
366
367                        model.fit(X_train, y_train.ravel())
368
369                        pipe = Pipeline(
370                            steps=[
371                                ("preprocessor", preprocessor),
372                                ("regressor", model),
373                            ]
374                        )
375                        if self.verbose > 0:
376                            print("\n Fitting boosted " + name + " model...")
377                        pipe.fit(X_train, y_train.ravel())
378
379                        self.models_[name] = pipe
380                        y_pred = pipe.predict(X_test)
381                        r_squared = r2_score(y_test, y_pred)
382                        adj_rsquared = adjusted_rsquared(
383                            r_squared, X_test.shape[0], X_test.shape[1]
384                        )
385                        rmse = root_mean_squared_error(y_test, y_pred)
386
387                        names.append(name)
388                        R2.append(r_squared)
389                        ADJR2.append(adj_rsquared)
390                        RMSE.append(rmse)
391                        TIME.append(time.time() - start)
392
393                        if self.custom_metric:
394                            custom_metric = self.custom_metric(y_test, y_pred)
395                            CUSTOM_METRIC.append(custom_metric)
396
397                        if self.verbose > 0:
398                            scores_verbose = {
399                                "Model": name,
400                                "R-Squared": r_squared,
401                                "Adjusted R-Squared": adj_rsquared,
402                                "RMSE": rmse,
403                                "Time taken": time.time() - start,
404                            }
405
406                            if self.custom_metric:
407                                scores_verbose["Custom metric"] = custom_metric
408
409                            print(scores_verbose)
410                        if self.predictions:
411                            predictions[name] = y_pred
412
413                    except Exception as exception:
414
415                        if self.ignore_warnings is False:
416                            print(name + " model failed to execute")
417                            print(exception)
418
419            else:
420
421                results = Parallel(n_jobs=self.n_jobs)(
422                    delayed(self.train_model)(
423                        name,
424                        model,
425                        X_train,
426                        y_train,
427                        X_test,
428                        y_test,
429                        use_preprocessing=True,
430                        preprocessor=preprocessor,
431                        **kwargs,
432                    )
433                    for name, model in tqdm(self.regressors)
434                )
435                R2 = [
436                    result["r_squared"]
437                    for result in results
438                    if result is not None
439                ]
440                ADJR2 = [
441                    result["adj_rsquared"]
442                    for result in results
443                    if result is not None
444                ]
445                RMSE = [
446                    result["rmse"] for result in results if result is not None
447                ]
448                TIME = [
449                    result["time"] for result in results if result is not None
450                ]
451                names = [
452                    result["name"] for result in results if result is not None
453                ]
454                if self.custom_metric:
455                    CUSTOM_METRIC = [
456                        result["custom_metric"]
457                        for result in results
458                        if result is not None
459                    ]
460                if self.predictions:
461                    predictions = {
462                        result["name"]: result["predictions"]
463                        for result in results
464                        if result is not None
465                    }
466
467        else:  # self.preprocess is False; no preprocessing
468
469            if self.n_jobs is None:
470
471                for name, regr in tqdm(self.regressors):  # do parallel exec
472                    start = time.time()
473                    try:
474
475                        if hist is False:
476                            model = GenericBoostingRegressor(
477                                base_model=regr(),
478                                verbose=self.verbose,
479                                **kwargs,
480                            )
481                        else:
482                            model = HistGenericBoostingRegressor(
483                                base_model=regr(),
484                                verbose=self.verbose,
485                                **kwargs,
486                            )
487
488                        if self.verbose > 0:
489                            print("\n Fitting boosted " + name + " model...")
490                        model.fit(X_train, y_train.ravel())
491
492                        self.models_[name] = model
493                        y_pred = model.predict(X_test)
494
495                        r_squared = r2_score(y_test, y_pred)
496                        adj_rsquared = adjusted_rsquared(
497                            r_squared, X_test.shape[0], X_test.shape[1]
498                        )
499                        rmse = root_mean_squared_error(y_test, y_pred)
500
501                        names.append(name)
502                        R2.append(r_squared)
503                        ADJR2.append(adj_rsquared)
504                        RMSE.append(rmse)
505                        TIME.append(time.time() - start)
506
507                        if self.custom_metric:
508                            custom_metric = self.custom_metric(y_test, y_pred)
509                            CUSTOM_METRIC.append(custom_metric)
510
511                        if self.verbose > 0:
512                            scores_verbose = {
513                                "Model": name,
514                                "R-Squared": r_squared,
515                                "Adjusted R-Squared": adj_rsquared,
516                                "RMSE": rmse,
517                                "Time taken": time.time() - start,
518                            }
519
520                            if self.custom_metric:
521                                scores_verbose["Custom metric"] = custom_metric
522
523                            print(scores_verbose)
524                        if self.predictions:
525                            predictions[name] = y_pred
526                    except Exception as exception:
527                        if self.ignore_warnings is False:
528                            print(name + " model failed to execute")
529                            print(exception)
530
531            else:
532
533                results = Parallel(n_jobs=self.n_jobs)(
534                    delayed(self.train_model)(
535                        name,
536                        model,
537                        X_train,
538                        y_train,
539                        X_test,
540                        y_test,
541                        use_preprocessing=False,
542                        **kwargs,
543                    )
544                    for name, model in tqdm(self.regressors)
545                )
546                R2 = [
547                    result["r_squared"]
548                    for result in results
549                    if result is not None
550                ]
551                ADJR2 = [
552                    result["adj_rsquared"]
553                    for result in results
554                    if result is not None
555                ]
556                RMSE = [
557                    result["rmse"] for result in results if result is not None
558                ]
559                TIME = [
560                    result["time"] for result in results if result is not None
561                ]
562                names = [
563                    result["name"] for result in results if result is not None
564                ]
565                if self.custom_metric:
566                    CUSTOM_METRIC = [
567                        result["custom_metric"]
568                        for result in results
569                        if result is not None
570                    ]
571                if self.predictions:
572                    predictions = {
573                        result["name"]: result["predictions"]
574                        for result in results
575                        if result is not None
576                    }
577
578        scores = {
579            "Model": names,
580            "Adjusted R-Squared": ADJR2,
581            "R-Squared": R2,
582            "RMSE": RMSE,
583            "Time Taken": TIME,
584        }
585
586        if self.custom_metric:
587            scores["Custom metric"] = CUSTOM_METRIC
588
589        scores = pd.DataFrame(scores)
590        scores = scores.sort_values(by=self.sort_by, ascending=True).set_index(
591            "Model"
592        )
593
594        self.best_model_ = self.models_[scores.index[0]]
595
596        if self.predictions:
597            predictions_df = pd.DataFrame.from_dict(predictions)
598        return scores, predictions_df if self.predictions is True else scores
599
600    def get_best_model(self):
601        """
602        This function returns the best model pipeline based on the sort_by metric.
603
604        Returns:
605
606            best_model: object,
607                Returns the best model pipeline based on the sort_by metric.
608
609        """
610        return self.best_model_
611
612    def provide_models(self, X_train, X_test, y_train, y_test):
613        """
614        This function returns all the model objects trained in fit function.
615        If fit is not called already, then we call fit and then return the models.
616
617        Parameters:
618
619            X_train : array-like,
620                Training vectors, where rows is the number of samples
621                and columns is the number of features.
622
623            X_test : array-like,
624                Testing vectors, where rows is the number of samples
625                and columns is the number of features.
626
627            y_train : array-like,
628                Training vectors, where rows is the number of samples
629                and columns is the number of features.
630
631            y_test : array-like,
632                Testing vectors, where rows is the number of samples
633                and columns is the number of features.
634
635        Returns:
636
637            models: dict-object,
638                Returns a dictionary with each model pipeline as value
639                with key as name of models.
640
641        """
642        if len(self.models_.keys()) == 0:
643            self.fit(X_train, X_test, y_train.ravel(), y_test.values)
644
645        return self.models_
646
647    def train_model(
648        self,
649        name,
650        regr,
651        X_train,
652        y_train,
653        X_test,
654        y_test,
655        use_preprocessing=False,
656        preprocessor=None,
657        hist=False,
658        **kwargs,
659    ):
660        """
661        Function to train a single regression model and return its results.
662        """
663        start = time.time()
664
665        try:
666            if hist is False:
667                model = GenericBoostingRegressor(
668                    base_model=regr(), verbose=self.verbose, **kwargs
669                )
670            else:
671                model = HistGenericBoostingRegressor(
672                    base_model=regr(), verbose=self.verbose, **kwargs
673                )
674
675            if use_preprocessing and preprocessor is not None:
676                pipe = Pipeline(
677                    steps=[
678                        ("preprocessor", preprocessor),
679                        ("regressor", model),
680                    ]
681                )
682                if self.verbose > 0:
683                    print(
684                        "\n Fitting boosted "
685                        + name
686                        + " model with preprocessing..."
687                    )
688                pipe.fit(X_train, y_train.ravel())
689                y_pred = pipe.predict(X_test)
690                fitted_model = pipe
691            else:
692                # Case with no preprocessing
693                if self.verbose > 0:
694                    print(
695                        "\n Fitting boosted "
696                        + name
697                        + " model without preprocessing..."
698                    )
699                model.fit(X_train, y_train.ravel())
700                y_pred = model.predict(X_test)
701                fitted_model = model
702
703            r_squared = r2_score(y_test, y_pred)
704            adj_rsquared = adjusted_rsquared(
705                r_squared, X_test.shape[0], X_test.shape[1]
706            )
707            rmse = root_mean_squared_error(y_test, y_pred)
708
709            custom_metric = None
710            if self.custom_metric:
711                custom_metric = self.custom_metric(y_test, y_pred)
712
713            return {
714                "name": name,
715                "model": fitted_model,
716                "r_squared": r_squared,
717                "adj_rsquared": adj_rsquared,
718                "rmse": rmse,
719                "custom_metric": custom_metric,
720                "time": time.time() - start,
721                "predictions": y_pred,
722            }
723
724        except Exception as exception:
725            if self.ignore_warnings is False:
726                print(name + " model failed to execute")
727                print(exception)
728            return None

Fitting -- almost -- all the regression algorithms and returning their scores.

Parameters:

verbose: int, optional (default=0)
    Any positive number for verbosity.

ignore_warnings: bool, optional (default=True)
    When set to True, the warnings related to algorithms that are not able to run are ignored.

custom_metric: function, optional (default=None)
    When function is provided, models are evaluated based on the custom evaluation metric provided.

predictions: bool, optional (default=False)
    When set to True, the predictions of all the models are returned as a dataframe.

sort_by: string, optional (default='RMSE')
    Sort models by a metric. Available options are 'R-Squared', 'Adjusted R-Squared', 'RMSE', 'Time Taken' and 'Custom Metric',
    or a custom metric identified by its name and provided by custom_metric.

random_state: int, optional (default=42)
    Reproducibility seed.

estimators: list, optional (default='all')
    list of Estimators names or just 'all' (default='all')

preprocess: bool
    preprocessing is done when set to True

n_jobs : int, when possible, run in parallel
    For now, only used by individual models that support it.

n_layers: int, optional (default=3)
    Number of layers of CustomRegressors to be used.

All the other parameters are the same as CustomRegressor's.

Attributes:

models_: dict-object
    Returns a dictionary with each model pipeline as value
    with key as name of models.

best_model_: object
    Returns the best model pipeline based on the sort_by metric.

Examples:

import os
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

data = load_diabetes()
X = data.data
y= data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)

regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True,
                                custom_metric=None, preprocess=True)
models, predictions = regr.fit(X_train, X_test, y_train, y_test)
model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
print(models)
def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs):
188    def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs):
189        """Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test.
190
191        Parameters:
192
193            X_train : array-like,
194                Training vectors, where rows is the number of samples
195                and columns is the number of features.
196
197            X_test : array-like,
198                Testing vectors, where rows is the number of samples
199                and columns is the number of features.
200
201            y_train : array-like,
202                Training vectors, where rows is the number of samples
203                and columns is the number of features.
204
205            y_test : array-like,
206                Testing vectors, where rows is the number of samples
207                and columns is the number of features.
208
209            hist: bool, optional (default=False)
210                When set to True, the model is a HistGenericBoostingRegressor.
211
212            **kwargs: dict,
213                Additional parameters to be passed to the GenericBoostingRegressor.
214
215        Returns:
216        -------
217        scores:  Pandas DataFrame
218            Returns metrics of all the models in a Pandas DataFrame.
219
220        predictions : Pandas DataFrame
221            Returns predictions of all the models in a Pandas DataFrame.
222
223        """
224        R2 = []
225        ADJR2 = []
226        RMSE = []
227        # WIN = []
228        names = []
229        TIME = []
230        predictions = {}
231
232        if self.custom_metric:
233            CUSTOM_METRIC = []
234
235        if isinstance(X_train, np.ndarray):
236            X_train = pd.DataFrame(X_train)
237            X_test = pd.DataFrame(X_test)
238
239        numeric_features = X_train.select_dtypes(include=[np.number]).columns
240        categorical_features = X_train.select_dtypes(include=["object"]).columns
241
242        categorical_low, categorical_high = get_card_split(
243            X_train, categorical_features
244        )
245
246        if self.preprocess is True:
247            preprocessor = ColumnTransformer(
248                transformers=[
249                    ("numeric", numeric_transformer, numeric_features),
250                    (
251                        "categorical_low",
252                        categorical_transformer_low,
253                        categorical_low,
254                    ),
255                    (
256                        "categorical_high",
257                        categorical_transformer_high,
258                        categorical_high,
259                    ),
260                ]
261            )
262
263        # base models
264        try:
265            baseline_names = [
266                "RandomForestRegressor",
267                "XGBRegressor",
268                "GradientBoostingRegressor",
269            ]
270            baseline_models = [
271                RandomForestRegressor(),
272                xgb.XGBRegressor(),
273                GradientBoostingRegressor(),
274            ]
275        except Exception as exception:
276            baseline_names = [
277                "RandomForestRegressor",
278                "GradientBoostingRegressor",
279            ]
280            baseline_models = [
281                RandomForestRegressor(),
282                GradientBoostingRegressor(),
283            ]
284
285        if self.verbose > 0:
286            print("\n Fitting baseline models...")
287        for name, model in tqdm(zip(baseline_names, baseline_models)):
288            start = time.time()
289            try:
290                model.fit(X_train, y_train.ravel())
291                self.models_[name] = model
292                y_pred = model.predict(X_test)
293                r_squared = r2_score(y_test, y_pred)
294                adj_rsquared = adjusted_rsquared(
295                    r_squared, X_test.shape[0], X_test.shape[1]
296                )
297                rmse = root_mean_squared_error(y_test, y_pred)
298
299                names.append(name)
300                R2.append(r_squared)
301                ADJR2.append(adj_rsquared)
302                RMSE.append(rmse)
303                TIME.append(time.time() - start)
304
305                if self.custom_metric:
306                    custom_metric = self.custom_metric(y_test, y_pred)
307                    CUSTOM_METRIC.append(custom_metric)
308
309                if self.verbose > 0:
310                    scores_verbose = {
311                        "Model": name,
312                        "R-Squared": r_squared,
313                        "Adjusted R-Squared": adj_rsquared,
314                        "RMSE": rmse,
315                        "Time taken": time.time() - start,
316                    }
317
318                    if self.custom_metric:
319                        scores_verbose["Custom metric"] = custom_metric
320
321                    print(scores_verbose)
322                if self.predictions:
323                    predictions[name] = y_pred
324            except Exception as exception:
325                if self.ignore_warnings is False:
326                    print(name + " model failed to execute")
327                    print(exception)
328
329        if self.estimators == "all":
330            self.regressors = REGRESSORS
331        else:
332            self.regressors = [
333                ("GenericBooster(" + est[0] + ")", est[1](**kwargs))
334                for est in all_estimators()
335                if (
336                    issubclass(est[1], RegressorMixin)
337                    and (est[0] in self.estimators)
338                )
339            ]
340
341        if self.preprocess is True:
342
343            if self.n_jobs is None:
344
345                for name, regr in tqdm(self.regressors):  # do parallel exec
346
347                    start = time.time()
348
349                    try:
350
351                        if hist is False:
352
353                            model = GenericBoostingRegressor(
354                                base_model=regr(),
355                                verbose=self.verbose,
356                                **kwargs,
357                            )
358
359                        else:
360
361                            model = HistGenericBoostingRegressor(
362                                base_model=regr(),
363                                verbose=self.verbose,
364                                **kwargs,
365                            )
366
367                        model.fit(X_train, y_train.ravel())
368
369                        pipe = Pipeline(
370                            steps=[
371                                ("preprocessor", preprocessor),
372                                ("regressor", model),
373                            ]
374                        )
375                        if self.verbose > 0:
376                            print("\n Fitting boosted " + name + " model...")
377                        pipe.fit(X_train, y_train.ravel())
378
379                        self.models_[name] = pipe
380                        y_pred = pipe.predict(X_test)
381                        r_squared = r2_score(y_test, y_pred)
382                        adj_rsquared = adjusted_rsquared(
383                            r_squared, X_test.shape[0], X_test.shape[1]
384                        )
385                        rmse = root_mean_squared_error(y_test, y_pred)
386
387                        names.append(name)
388                        R2.append(r_squared)
389                        ADJR2.append(adj_rsquared)
390                        RMSE.append(rmse)
391                        TIME.append(time.time() - start)
392
393                        if self.custom_metric:
394                            custom_metric = self.custom_metric(y_test, y_pred)
395                            CUSTOM_METRIC.append(custom_metric)
396
397                        if self.verbose > 0:
398                            scores_verbose = {
399                                "Model": name,
400                                "R-Squared": r_squared,
401                                "Adjusted R-Squared": adj_rsquared,
402                                "RMSE": rmse,
403                                "Time taken": time.time() - start,
404                            }
405
406                            if self.custom_metric:
407                                scores_verbose["Custom metric"] = custom_metric
408
409                            print(scores_verbose)
410                        if self.predictions:
411                            predictions[name] = y_pred
412
413                    except Exception as exception:
414
415                        if self.ignore_warnings is False:
416                            print(name + " model failed to execute")
417                            print(exception)
418
419            else:
420
421                results = Parallel(n_jobs=self.n_jobs)(
422                    delayed(self.train_model)(
423                        name,
424                        model,
425                        X_train,
426                        y_train,
427                        X_test,
428                        y_test,
429                        use_preprocessing=True,
430                        preprocessor=preprocessor,
431                        **kwargs,
432                    )
433                    for name, model in tqdm(self.regressors)
434                )
435                R2 = [
436                    result["r_squared"]
437                    for result in results
438                    if result is not None
439                ]
440                ADJR2 = [
441                    result["adj_rsquared"]
442                    for result in results
443                    if result is not None
444                ]
445                RMSE = [
446                    result["rmse"] for result in results if result is not None
447                ]
448                TIME = [
449                    result["time"] for result in results if result is not None
450                ]
451                names = [
452                    result["name"] for result in results if result is not None
453                ]
454                if self.custom_metric:
455                    CUSTOM_METRIC = [
456                        result["custom_metric"]
457                        for result in results
458                        if result is not None
459                    ]
460                if self.predictions:
461                    predictions = {
462                        result["name"]: result["predictions"]
463                        for result in results
464                        if result is not None
465                    }
466
467        else:  # self.preprocess is False; no preprocessing
468
469            if self.n_jobs is None:
470
471                for name, regr in tqdm(self.regressors):  # do parallel exec
472                    start = time.time()
473                    try:
474
475                        if hist is False:
476                            model = GenericBoostingRegressor(
477                                base_model=regr(),
478                                verbose=self.verbose,
479                                **kwargs,
480                            )
481                        else:
482                            model = HistGenericBoostingRegressor(
483                                base_model=regr(),
484                                verbose=self.verbose,
485                                **kwargs,
486                            )
487
488                        if self.verbose > 0:
489                            print("\n Fitting boosted " + name + " model...")
490                        model.fit(X_train, y_train.ravel())
491
492                        self.models_[name] = model
493                        y_pred = model.predict(X_test)
494
495                        r_squared = r2_score(y_test, y_pred)
496                        adj_rsquared = adjusted_rsquared(
497                            r_squared, X_test.shape[0], X_test.shape[1]
498                        )
499                        rmse = root_mean_squared_error(y_test, y_pred)
500
501                        names.append(name)
502                        R2.append(r_squared)
503                        ADJR2.append(adj_rsquared)
504                        RMSE.append(rmse)
505                        TIME.append(time.time() - start)
506
507                        if self.custom_metric:
508                            custom_metric = self.custom_metric(y_test, y_pred)
509                            CUSTOM_METRIC.append(custom_metric)
510
511                        if self.verbose > 0:
512                            scores_verbose = {
513                                "Model": name,
514                                "R-Squared": r_squared,
515                                "Adjusted R-Squared": adj_rsquared,
516                                "RMSE": rmse,
517                                "Time taken": time.time() - start,
518                            }
519
520                            if self.custom_metric:
521                                scores_verbose["Custom metric"] = custom_metric
522
523                            print(scores_verbose)
524                        if self.predictions:
525                            predictions[name] = y_pred
526                    except Exception as exception:
527                        if self.ignore_warnings is False:
528                            print(name + " model failed to execute")
529                            print(exception)
530
531            else:
532
533                results = Parallel(n_jobs=self.n_jobs)(
534                    delayed(self.train_model)(
535                        name,
536                        model,
537                        X_train,
538                        y_train,
539                        X_test,
540                        y_test,
541                        use_preprocessing=False,
542                        **kwargs,
543                    )
544                    for name, model in tqdm(self.regressors)
545                )
546                R2 = [
547                    result["r_squared"]
548                    for result in results
549                    if result is not None
550                ]
551                ADJR2 = [
552                    result["adj_rsquared"]
553                    for result in results
554                    if result is not None
555                ]
556                RMSE = [
557                    result["rmse"] for result in results if result is not None
558                ]
559                TIME = [
560                    result["time"] for result in results if result is not None
561                ]
562                names = [
563                    result["name"] for result in results if result is not None
564                ]
565                if self.custom_metric:
566                    CUSTOM_METRIC = [
567                        result["custom_metric"]
568                        for result in results
569                        if result is not None
570                    ]
571                if self.predictions:
572                    predictions = {
573                        result["name"]: result["predictions"]
574                        for result in results
575                        if result is not None
576                    }
577
578        scores = {
579            "Model": names,
580            "Adjusted R-Squared": ADJR2,
581            "R-Squared": R2,
582            "RMSE": RMSE,
583            "Time Taken": TIME,
584        }
585
586        if self.custom_metric:
587            scores["Custom metric"] = CUSTOM_METRIC
588
589        scores = pd.DataFrame(scores)
590        scores = scores.sort_values(by=self.sort_by, ascending=True).set_index(
591            "Model"
592        )
593
594        self.best_model_ = self.models_[scores.index[0]]
595
596        if self.predictions:
597            predictions_df = pd.DataFrame.from_dict(predictions)
598        return scores, predictions_df if self.predictions is True else scores

Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test.

Parameters:

X_train : array-like,
    Training vectors, where rows is the number of samples
    and columns is the number of features.

X_test : array-like,
    Testing vectors, where rows is the number of samples
    and columns is the number of features.

y_train : array-like,
    Training target values, one per row of X_train.

y_test : array-like,
    Testing target values, one per row of X_test.

hist: bool, optional (default=False)
    When set to True, the model is a HistGenericBoostingRegressor.

**kwargs: dict,
    Additional parameters to be passed to the GenericBoostingRegressor.

Returns:

scores: Pandas DataFrame Returns metrics of all the models in a Pandas DataFrame.

predictions : Pandas DataFrame Returns predictions of all the models in a Pandas DataFrame.

def provide_models(self, X_train, X_test, y_train, y_test):
612    def provide_models(self, X_train, X_test, y_train, y_test):
613        """
614        This function returns all the model objects trained in fit function.
615        If fit is not called already, then we call fit and then return the models.
616
617        Parameters:
618
619            X_train : array-like,
620                Training vectors, where rows is the number of samples
621                and columns is the number of features.
622
623            X_test : array-like,
624                Testing vectors, where rows is the number of samples
625                and columns is the number of features.
626
627            y_train : array-like,
628                Training vectors, where rows is the number of samples
629                and columns is the number of features.
630
631            y_test : array-like,
632                Testing vectors, where rows is the number of samples
633                and columns is the number of features.
634
635        Returns:
636
637            models: dict-object,
638                Returns a dictionary with each model pipeline as value
639                with key as name of models.
640
641        """
642        if len(self.models_.keys()) == 0:
643            self.fit(X_train, X_test, y_train.ravel(), y_test.values)
644
645        return self.models_

This function returns all the model objects trained in fit function. If fit is not called already, then we call fit and then return the models.

Parameters:

X_train : array-like,
    Training vectors, where rows is the number of samples
    and columns is the number of features.

X_test : array-like,
    Testing vectors, where rows is the number of samples
    and columns is the number of features.

y_train : array-like,
    Training target values, one per row of X_train.

y_test : array-like,
    Testing target values, one per row of X_test.

Returns:

models: dict-object,
    Returns a dictionary with each model pipeline as value
    with key as name of models.
class MultiTaskRegressor(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
 9class MultiTaskRegressor(BaseEstimator, RegressorMixin):
10    """
11    A class for multi-task regression
12
13    Parameters
14    ----------
15    regr: object
16        A regressor object
17
18    Attributes
19    ----------
20    objs: list
21        A list containing the fitted regressor objects
22
23    """
24
25    def __init__(self, regr):
26        assert (
27            is_multitask_estimator(regr) == False
28        ), "The regressor is already a multi-task regressor"
29        self.regr = regr
30        self.objs = []
31
32    def fit(self, X, y):
33        """
34        Fit the regressor
35
36        Parameters
37        ----------
38        X: array-like
39            The input data
40        y: array-like
41            The target values
42
43        """
44        n_tasks = y.shape[1]
45        assert n_tasks > 1, "The number of columns in y must be greater than 1"
46        self.n_outputs_ = n_tasks
47        try:
48            for i in range(n_tasks):
49                self.regr.fit(X, y.iloc[:, i].values)
50                self.objs.append(deepcopy(self.regr))
51        except Exception:
52            for i in range(n_tasks):
53                self.regr.fit(X, y[:, i])
54                self.objs.append(deepcopy(self.regr))
55        return self
56
57    def predict(self, X):
58        """
59        Predict the target values
60
61        Parameters
62        ----------
63        X: array-like
64            The input data
65
66        Returns
67        -------
68        y_pred: array-like
69            The predicted target values
70
71        """
72        assert len(self.objs) > 0, "The regressor has not been fitted yet"
73        y_pred = np.zeros((X.shape[0], self.n_outputs_))
74        for i in range(self.n_outputs_):
75            y_pred[:, i] = self.objs[i].predict(X)
76        return y_pred

A class for multi-task regression

Parameters

regr: object A regressor object

Attributes

objs: list A list containing the fitted regressor objects

def fit(self, X, y):
32    def fit(self, X, y):
33        """
34        Fit the regressor
35
36        Parameters
37        ----------
38        X: array-like
39            The input data
40        y: array-like
41            The target values
42
43        """
44        n_tasks = y.shape[1]
45        assert n_tasks > 1, "The number of columns in y must be greater than 1"
46        self.n_outputs_ = n_tasks
47        try:
48            for i in range(n_tasks):
49                self.regr.fit(X, y.iloc[:, i].values)
50                self.objs.append(deepcopy(self.regr))
51        except Exception:
52            for i in range(n_tasks):
53                self.regr.fit(X, y[:, i])
54                self.objs.append(deepcopy(self.regr))
55        return self

Fit the regressor

Parameters

X: array-like The input data y: array-like The target values

def predict(self, X):
57    def predict(self, X):
58        """
59        Predict the target values
60
61        Parameters
62        ----------
63        X: array-like
64            The input data
65
66        Returns
67        -------
68        y_pred: array-like
69            The predicted target values
70
71        """
72        assert len(self.objs) > 0, "The regressor has not been fitted yet"
73        y_pred = np.zeros((X.shape[0], self.n_outputs_))
74        for i in range(self.n_outputs_):
75            y_pred[:, i] = self.objs[i].predict(X)
76        return y_pred

Predict the target values

Parameters

X: array-like The input data

Returns

y_pred: array-like The predicted target values

class IsotonicRegressor(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
 8class IsotonicRegressor(BaseEstimator, RegressorMixin):
 9    """Isotonic Regressor with postprocessing.
10
11    This class takes a base regressor and applies isotonic regression as
12    postprocessing in the predict method. The isotonic regression ensures
13    that the predictions are monotonically increasing or decreasing.
14
15    Attributes:
16        regr: estimator
17            Base regressor to use for initial predictions.
18
19        increasing: bool, default=True
20            If True, the isotonic regression will be monotonically increasing.
21            If False, it will be monotonically decreasing.
22
23        out_of_bounds: str, default='nan'
24            The out_of_bounds parameter for IsotonicRegression.
25            Can be 'nan', 'clip', or 'raise'.
26    """
27
28    def __init__(self, regr, increasing=True, out_of_bounds="nan"):
29        """Initialize the IsotonicRegressor.
30
31        Args:
32            regr: estimator
33                Base regressor to use for initial predictions.
34
35            increasing: bool, default=True
36                If True, the isotonic regression will be monotonically increasing.
37                If False, it will be monotonically decreasing.
38
39            out_of_bounds: str, default='nan'
40                The out_of_bounds parameter for IsotonicRegression.
41                Can be 'nan', 'clip', or 'raise'.
42        """
43        self.regr = regr
44        self.increasing = increasing
45        self.out_of_bounds = out_of_bounds
46
47    def fit(self, X, y, **kwargs):
48        """Fit the model.
49
50        Args:
51            X: {array-like}, shape = [n_samples, n_features]
52                Training vectors, where n_samples is the number
53                of samples and n_features is the number of features.
54
55            y: array-like, shape = [n_samples]
56                Target values.
57
58            **kwargs: additional parameters to be passed to the base regressor.
59
60        Returns:
61            self: object.
62        """
63        # Validate input
64        X, y = check_X_y(X, y)
65        # Fit the base regressor
66        self.regr.fit(X, y, **kwargs)
67        # Get predictions from base regressor for training data
68        y_pred_base = self.regr.predict(X)
69        # Fit isotonic regression on the base predictions vs actual targets
70        self.isotonic_regressor_ = IsotonicRegression(
71            increasing=self.increasing, out_of_bounds=self.out_of_bounds
72        )
73        self.isotonic_regressor_.fit(y_pred_base, y)
74        return self
75
76    def predict(self, X, **kwargs):
77        """Predict using the model.
78
79        Args:
80            X: {array-like}, shape = [n_samples, n_features]
81                Samples.
82
83            **kwargs: additional parameters to be passed to the base regressor.
84
85        Returns:
86            y_pred: array-like, shape = [n_samples]
87                Predicted values.
88        """
89        # Check if fitted
90        check_is_fitted(self, ["regr", "isotonic_regressor_"])
91        # Validate input
92        X = check_array(X)
93        # Get predictions from base regressor
94        y_pred_base = self.regr.predict(X, **kwargs)
95        # Apply isotonic regression postprocessing
96        return self.isotonic_regressor_.predict(y_pred_base)

Isotonic Regressor with postprocessing.

This class takes a base regressor and applies isotonic regression as postprocessing in the predict method. The isotonic regression ensures that the predictions are monotonically increasing or decreasing.

Attributes: regr: estimator Base regressor to use for initial predictions.

increasing: bool, default=True
    If True, the isotonic regression will be monotonically increasing.
    If False, it will be monotonically decreasing.

out_of_bounds: str, default='nan'
    The out_of_bounds parameter for IsotonicRegression.
    Can be 'nan', 'clip', or 'raise'.
def fit(self, X, y, **kwargs):
47    def fit(self, X, y, **kwargs):
48        """Fit the model.
49
50        Args:
51            X: {array-like}, shape = [n_samples, n_features]
52                Training vectors, where n_samples is the number
53                of samples and n_features is the number of features.
54
55            y: array-like, shape = [n_samples]
56                Target values.
57
58            **kwargs: additional parameters to be passed to the base regressor.
59
60        Returns:
61            self: object.
62        """
63        # Validate input
64        X, y = check_X_y(X, y)
65        # Fit the base regressor
66        self.regr.fit(X, y, **kwargs)
67        # Get predictions from base regressor for training data
68        y_pred_base = self.regr.predict(X)
69        # Fit isotonic regression on the base predictions vs actual targets
70        self.isotonic_regressor_ = IsotonicRegression(
71            increasing=self.increasing, out_of_bounds=self.out_of_bounds
72        )
73        self.isotonic_regressor_.fit(y_pred_base, y)
74        return self

Fit the model.

Args: X: {array-like}, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features.

y: array-like, shape = [n_samples]
    Target values.

**kwargs: additional parameters to be passed to the base regressor.

Returns: self: object.

def predict(self, X, **kwargs):
76    def predict(self, X, **kwargs):
77        """Predict using the model.
78
79        Args:
80            X: {array-like}, shape = [n_samples, n_features]
81                Samples.
82
83            **kwargs: additional parameters to be passed to the base regressor.
84
85        Returns:
86            y_pred: array-like, shape = [n_samples]
87                Predicted values.
88        """
89        # Check if fitted
90        check_is_fitted(self, ["regr", "isotonic_regressor_"])
91        # Validate input
92        X = check_array(X)
93        # Get predictions from base regressor
94        y_pred_base = self.regr.predict(X, **kwargs)
95        # Apply isotonic regression postprocessing
96        return self.isotonic_regressor_.predict(y_pred_base)

Predict using the model.

Args: X: {array-like}, shape = [n_samples, n_features] Samples.

**kwargs: additional parameters to be passed to the base regressor.

Returns: y_pred: array-like, shape = [n_samples] Predicted values.

class GenericFunctionalForecaster(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
 25class GenericFunctionalForecaster(BaseEstimator, RegressorMixin):
 26    """
 27    Functional time series forecaster using dimensionality reduction and regression.
 28
 29    Following Hyndman-Ullah methodology:
 30    1. Extract functional components using dimensionality reduction
 31    2. Model relationships between components and functional data using regression
 32    3. Forecast future functional curves
 33
 34    Parameters
 35    ----------
 36    n_components : int, default=8
 37        Number of components to extract.
 38    reduction_method : str, default='pca'
 39        Dimensionality reduction method.
 40    reduction_params : dict, optional
 41        Additional parameters for the reduction method.
 42    rolling_window : int, optional
 43        Window size for rolling regression. If None, uses full training set.
 44    forecast_method : {'ar', 'last_value'}, default='ar'
 45        Method for forecasting coefficients.
 46    regressor : sklearn regressor, optional
 47        Any sklearn regressor. If None, uses LinearRegression.
 48    regressor_params : dict, optional
 49        Additional parameters for the regressor.
 50    """
 51
 52    def __init__(
 53        self,
 54        n_components: int = 8,
 55        reduction_method: str = "pca",
 56        reduction_params: Optional[dict] = None,
 57        rolling_window: Optional[int] = None,
 58        forecast_method: Literal["ar", "last_value"] = "ar",
 59        regressor: Optional[BaseEstimator] = None,
 60        regressor_params: Optional[dict] = None,
 61    ):
 62        self.n_components = n_components
 63        self.reduction_method = reduction_method
 64        self.reduction_params = reduction_params or {}
 65        self.rolling_window = rolling_window
 66        self.forecast_method = forecast_method
 67        self.regressor = (
 68            regressor if regressor is not None else LinearRegression()
 69        )
 70        self.regressor_params = regressor_params or {}
 71
 72        # Available reduction methods
 73        self._reduction_methods = {
 74            "pca": PCA,
 75            "kernel_pca": KernelPCA,
 76            "truncated_svd": TruncatedSVD,
 77            "factor_analysis": FactorAnalysis,
 78            "fast_ica": FastICA,
 79            "nmf": NMF,
 80            "minibatch_sparse_pca": MiniBatchSparsePCA,
 81            "mds": MDS,
 82            "isomap": Isomap,
 83            "lle": LocallyLinearEmbedding,
 84        }
 85
 86        if reduction_method not in self._reduction_methods:
 87            raise ValueError(
 88                f"reduction_method must be one of {list(self._reduction_methods.keys())}"
 89            )
 90
 91    def _create_regressor(self):
 92        """Create a fresh regressor instance with parameters."""
 93        if hasattr(self.regressor, "__class__"):
 94            # Create new instance from class
 95            regressor = self.regressor.__class__(**self.regressor_params)
 96        else:
 97            # Clone existing instance
 98            from sklearn.base import clone
 99
100            regressor = clone(self.regressor)
101            # Apply additional parameters
102            for param, value in self.regressor_params.items():
103                setattr(regressor, param, value)
104
105        return regressor
106
107    def fit(
108        self, X: Union[np.ndarray, pd.DataFrame]
109    ) -> "GenericFunctionalForecaster":
110        """
111        Fit the functional forecaster.
112
113        Parameters
114        ----------
115        X : np.ndarray or pd.DataFrame, shape (n_samples, n_points)
116            Functional time series data.
117
118        Returns
119        -------
120        self : object
121            Fitted forecaster.
122        """
123        # Input validation and conversion
124        if isinstance(X, pd.DataFrame):
125            X = X.values
126        X = check_array(X)
127
128        self.X_ = X.copy()
129        self.n_samples_, self.n_points_ = X.shape
130
131        # 1. Standardize the functional data
132        self.scaler_ = StandardScaler()
133        X_scaled = self.scaler_.fit_transform(X)
134
135        # 2. Fit dimensionality reduction
136        self._fit_reduction_method(X_scaled)
137
138        # 3. Extract components (reduced features)
139        self.reduced_features_ = self.reducer_.transform(X_scaled)
140
141        # 4. Fit regression models
142        if self.rolling_window is not None:
143            self._fit_rolling_regression(X_scaled)
144        else:
145            self._fit_full_regression(X_scaled)
146
147        self.is_fitted_ = True
148        return self
149
150    def _fit_reduction_method(self, X_scaled):
151        """Fit the dimensionality reduction method."""
152        reduction_class = self._reduction_methods[self.reduction_method]
153
154        # Handle method-specific parameters
155        if self.reduction_method == "kernel_pca":
156            if "kernel" not in self.reduction_params:
157                self.reduction_params["kernel"] = "rbf"
158            if "fit_inverse_transform" not in self.reduction_params:
159                self.reduction_params["fit_inverse_transform"] = True
160        elif self.reduction_method == "minibatch_sparse_pca":
161            if "alpha" not in self.reduction_params:
162                self.reduction_params["alpha"] = 1.0
163            if "batch_size" not in self.reduction_params:
164                self.reduction_params["batch_size"] = min(3, self.n_samples_)
165
166        # Initialize and fit the reducer
167        self.reducer_ = reduction_class(
168            n_components=self.n_components, **self.reduction_params
169        )
170        self.reducer_.fit(X_scaled)
171
172        # Store components/basis functions for reconstruction
173        if hasattr(self.reducer_, "components_"):
174            self.components_ = (
175                self.reducer_.components_
176            )  # Shape: (n_components, n_points)
177        elif hasattr(self.reducer_, "inverse_transform"):
178            # For methods like KernelPCA, create identity mapping to get components
179            try:
180                identity_matrix = np.eye(self.n_components)
181                reconstructed = self.reducer_.inverse_transform(identity_matrix)
182                if reconstructed.shape == (self.n_components, self.n_points_):
183                    self.components_ = reconstructed
184                else:
185                    self.components_ = reconstructed.T
186            except Exception as e:
187                warnings.warn(
188                    f"Could not extract components for {self.reduction_method}: {e}"
189                )
190                self.components_ = None
191        else:
192            warnings.warn(
193                f"No reconstruction available for {self.reduction_method}"
194            )
195            self.components_ = None
196
197    def _fit_rolling_regression(self, X_scaled):
198        """
199        Fit rolling regression models.
200
201        For each window, fit: reduced_features[window] -> next_scaled_curve
202        This maintains scale consistency throughout.
203        """
204        if self.n_samples_ <= self.rolling_window:
205            raise ValueError(
206                f"Need more than {self.rolling_window} samples for rolling window, "
207                f"got {self.n_samples_}"
208            )
209
210        self.rolling_models_ = []
211        self.rolling_coefs_ = []
212
213        n_windows = self.n_samples_ - self.rolling_window
214
215        for i in range(n_windows):
216            # Input: window of reduced features
217            X_window = self.reduced_features_[
218                i : i + self.rolling_window
219            ]  # (window, n_components)
220
221            # Target: next scaled functional curve
222            y_next_scaled = X_scaled[i : i + self.rolling_window]  # (n_points,)
223
224            # Create and fit regressor
225            regressor = self._create_regressor()
226
227            try:
228                # Fit regression: reduced_features_window -> scaled_functional_curve
229                regressor.fit(X_window, y_next_scaled)
230
231                # Store model and coefficients
232                self.rolling_models_.append(regressor)
233
234                # Extract coefficients - shape depends on regressor type
235                if hasattr(regressor, "coef_"):
236                    coef = regressor.coef_
237                    # For multioutput: coef shape is (n_outputs, n_features) = (n_points, n_components)
238                    # For single output with multiple features: (n_features,) = (n_components,)
239                    # We expect multioutput here since y_next_scaled is (n_points,)
240                    if coef.ndim == 1:
241                        # This shouldn't happen with multioutput, but handle gracefully
242                        warnings.warn(
243                            f"Unexpected single output coefficients at window {i}"
244                        )
245                        coef = coef.reshape(1, -1)  # (1, n_components)
246                    self.rolling_coefs_.append(coef)  # (n_points, n_components)
247                else:
248                    # Fallback: use least squares
249                    warnings.warn(
250                        f"Regressor has no coef_ attribute, using least squares at window {i}"
251                    )
252                    coef = np.linalg.lstsq(X_window, y_next_scaled, rcond=None)[
253                        0
254                    ].T
255                    if coef.ndim == 1:
256                        coef = coef.reshape(1, -1)
257                    self.rolling_coefs_.append(coef)
258
259            except Exception as e:
260                warnings.warn(
261                    f"Regression failed at window {i}: {e}. Using least squares fallback."
262                )
263                # Least squares fallback
264                coef = np.linalg.lstsq(X_window, y_next_scaled, rcond=None)[
265                    0
266                ].T  # (n_points, n_components)
267                if coef.ndim == 1:
268                    coef = coef.reshape(1, -1)
269                self.rolling_coefs_.append(coef)
270                self.rolling_models_.append(None)
271
272        # Convert to array for easier manipulation
273        # Shape: (n_windows, n_points, n_components)
274        self.rolling_coefs_ = np.array(self.rolling_coefs_)
275
276    def _fit_full_regression(self, X_scaled):
277        """
278        Fit regression using full training set.
279
280        Fit: reduced_features -> scaled_functional_data
281        """
282        # Create regressor
283        regressor = self._create_regressor()
284
285        try:
286            # Fit: all reduced features -> all scaled functional curves
287            regressor.fit(self.reduced_features_, X_scaled)
288            self.full_model_ = regressor
289
290            # Store coefficients
291            if hasattr(regressor, "coef_"):
292                self.coefs_ = (
293                    regressor.coef_
294                )  # (n_points, n_components) for multioutput
295            else:
296                # Fallback to least squares
297                warnings.warn(
298                    "Regressor has no coef_ attribute, using least squares"
299                )
300                self.coefs_ = np.linalg.lstsq(
301                    self.reduced_features_, X_scaled, rcond=None
302                )[0].T
303
304        except Exception as e:
305            warnings.warn(
306                f"Full regression failed: {e}. Using least squares fallback."
307            )
308            # Least squares fallback
309            self.coefs_ = np.linalg.lstsq(
310                self.reduced_features_, X_scaled, rcond=None
311            )[0].T
312            self.full_model_ = None
313
314    def forecast(self, steps: int = 5) -> np.ndarray:
315        """
316        Forecast functional time series.
317
318        Parameters
319        ----------
320        steps : int
321            Number of steps to forecast.
322
323        Returns
324        -------
325        np.ndarray, shape (steps, n_points)
326            Forecasted functional curves.
327        """
328        check_is_fitted(self, "is_fitted_")
329
330        if self.rolling_window is not None:
331            return self._forecast_rolling(steps)
332        else:
333            return self._forecast_full(steps)
334
335    def _forecast_rolling(self, steps: int) -> np.ndarray:
336        """Forecast using rolling regression approach."""
337        # rolling_coefs_ shape: (n_windows, n_points, n_components)
338        n_windows, n_points, n_components = self.rolling_coefs_.shape
339        # Forecast coefficients for each point and component
340        forecasted_coefs = np.zeros((steps, n_points, n_components))
341
342        for point_idx in range(n_points):
343            for comp_idx in range(n_components):
344                # Get time series of coefficients for this (point, component)
345                coef_series = self.rolling_coefs_[:, point_idx, comp_idx]
346                # Forecast this coefficient series
347                if self.forecast_method == "ar" and len(coef_series) > 1:
348                    try:
349                        # Fit AR model to coefficient series
350                        ar_model = AutoReg(
351                            coef_series, lags=min(2, len(coef_series) - 1)
352                        ).fit()
353                        forecasted_values = ar_model.predict(
354                            start=len(coef_series),
355                            end=len(coef_series) + steps - 1,
356                        )
357                        forecasted_coefs[:, point_idx, comp_idx] = (
358                            forecasted_values
359                        )
360                    except Exception as e:
361                        warnings.warn(
362                            f"AR forecasting failed for point {point_idx}, component {comp_idx}: {e}"
363                        )
364                        # Use last value
365                        forecasted_coefs[:, point_idx, comp_idx] = coef_series[
366                            -1
367                        ]
368                else:
369                    # Use last value
370                    forecasted_coefs[:, point_idx, comp_idx] = coef_series[-1]
371
372        # Reconstruct functional forecasts from predicted coefficients
373        forecasts_scaled = np.zeros((steps, n_points))
374
375        if self.components_ is not None:
376            # Use learned components for reconstruction
377            # For each forecast step and each point, sum over components
378            for step in range(steps):
379                for point_idx in range(n_points):
380                    # forecasted_coefs[step, point_idx, :] has shape (n_components,)
381                    # self.components_[:, point_idx] has shape (n_components,)
382                    forecasts_scaled[step, point_idx] = np.dot(
383                        forecasted_coefs[step, point_idx, :],
384                        self.components_[:, point_idx],
385                    )
386        else:
387            # No reconstruction available - use direct prediction
388            warnings.warn(
389                f"No reconstruction available for {self.reduction_method}. Using last known values."
390            )
391            last_scaled = self.scaler_.transform(self.X_[-1:])
392            forecasts_scaled = np.tile(last_scaled, (steps, 1))
393        # Transform back to original scale
394        return self.scaler_.inverse_transform(forecasts_scaled)
395
396    def _forecast_full(self, steps: int) -> np.ndarray:
397        """Forecast using full training set approach."""
398        # First, forecast the reduced features themselves
399        forecasted_features = np.zeros((steps, self.n_components))
400
401        for comp in range(self.n_components):
402            # Get time series of this component
403            feature_series = self.reduced_features_[:, comp]
404
405            if self.forecast_method == "ar" and len(feature_series) > 1:
406                try:
407                    # Fit AR model to feature series
408                    ar_model = AutoReg(
409                        feature_series, lags=min(2, len(feature_series) - 1)
410                    ).fit()
411                    forecasted_values = ar_model.predict(
412                        start=len(feature_series),
413                        end=len(feature_series) + steps - 1,
414                    )
415                    forecasted_features[:, comp] = forecasted_values
416                except Exception as e:
417                    warnings.warn(
418                        f"AR forecasting failed for component {comp}: {e}"
419                    )
420                    # Use last value
421                    forecasted_features[:, comp] = feature_series[-1]
422            else:
423                # Use last value
424                forecasted_features[:, comp] = feature_series[-1]
425
426        # Reconstruct functional data from forecasted features
427        if hasattr(self, "full_model_") and self.full_model_ is not None:
428            # Use the fitted model to predict
429            try:
430                forecasts_scaled = self.full_model_.predict(forecasted_features)
431            except:
432                # Fallback to coefficient multiplication
433                forecasts_scaled = forecasted_features @ self.coefs_.T
434        else:
435            # Use stored coefficients
436            forecasts_scaled = forecasted_features @ self.coefs_.T
437
438        # Transform back to original scale
439        forecasts = self.scaler_.inverse_transform(forecasts_scaled)
440        return forecasts
441
442    def plot_components(self, n_plot: int = 3) -> None:
443        """Plot functional components."""
444        check_is_fitted(self, "is_fitted_")
445
446        if self.components_ is None:
447            print(f"Components not available for {self.reduction_method}")
448            return
449
450        plt.figure(figsize=(12, 6))
451        for i in range(min(n_plot, self.n_components)):
452            plt.plot(self.components_[i], label=f"Component {i+1}", linewidth=2)
453
454        plt.title(f"{self.reduction_method.upper()} Components")
455        plt.xlabel("Domain Point")
456        plt.ylabel("Component Value")
457        plt.legend()
458        plt.grid(True, alpha=0.3)
459        plt.show()
460
461    def plot_reduced_features(self, n_plot: int = 4) -> None:
462        """Plot reduced features over time."""
463        check_is_fitted(self, "is_fitted_")
464
465        plt.figure(figsize=(12, 8))
466        n_subplot_cols = 2
467        n_subplot_rows = (min(n_plot, self.n_components) + 1) // 2
468
469        for i in range(min(n_plot, self.n_components)):
470            plt.subplot(n_subplot_rows, n_subplot_cols, i + 1)
471            plt.plot(
472                self.reduced_features_[:, i], "o-", linewidth=2, markersize=4
473            )
474            plt.title(f"Reduced Feature {i+1}")
475            plt.xlabel("Time")
476            plt.ylabel("Value")
477            plt.grid(True, alpha=0.3)
478
479        plt.tight_layout()
480        plt.show()
481
482    def plot_forecast(
483        self, actual: Optional[np.ndarray] = None, steps: int = 5
484    ) -> None:
485        """Plot forecasted curves."""
486        forecasts = self.forecast(steps=steps)
487
488        plt.figure(figsize=(12, 6))
489
490        # Plot some historical curves
491        n_history = min(5, len(self.X_))
492        for i in range(n_history):
493            idx = -(n_history - i)
494            plt.plot(
495                self.X_[idx],
496                "b-",
497                alpha=0.3,
498                linewidth=1,
499                label="Historical" if i == 0 else "",
500            )
501
502        # Plot actual test data if provided
503        if actual is not None:
504            for i in range(min(3, len(actual))):
505                plt.plot(
506                    actual[i],
507                    "k-",
508                    alpha=0.7,
509                    linewidth=2,
510                    label="Actual" if i == 0 else "",
511                )
512
513        # Plot forecasts
514        for i in range(steps):
515            plt.plot(
516                forecasts[i],
517                "r--",
518                linewidth=2,
519                alpha=0.7,
520                label="Forecast" if i == 0 else "",
521            )
522
523        plt.title("Functional Time Series Forecast")
524        plt.xlabel("Domain Point")
525        plt.ylabel("Value")
526        plt.legend()
527        plt.grid(True, alpha=0.3)
528        plt.show()
529
530    def get_model_info(self) -> dict:
531        """Get information about the fitted model."""
532        info = {
533            "n_components": self.n_components,
534            "reduction_method": self.reduction_method,
535            "rolling_window": self.rolling_window,
536            "forecast_method": self.forecast_method,
537            "regressor": self.regressor.__class__.__name__,
538            "regressor_params": self.regressor_params,
539            "is_fitted": getattr(self, "is_fitted_", False),
540        }
541
542        if hasattr(self, "reduced_features_"):
543            info.update(
544                {
545                    "n_samples": self.n_samples_,
546                    "n_points": self.n_points_,
547                    "explained_variance_ratio": getattr(
548                        self.reducer_, "explained_variance_ratio_", None
549                    ),
550                    "has_components": self.components_ is not None,
551                    "coefficient_shape": (
552                        getattr(self, "rolling_coefs_", np.array([])).shape
553                        if hasattr(self, "rolling_coefs_")
554                        else getattr(self, "coefs_", np.array([])).shape
555                    ),
556                }
557            )
558
559        return info

Functional time series forecaster using dimensionality reduction and regression.

Following Hyndman-Ullah methodology:

  1. Extract functional components using dimensionality reduction
  2. Model relationships between components and functional data using regression
  3. Forecast future functional curves

Parameters

n_components : int, default=8
    Number of components to extract.

reduction_method : str, default='pca'
    Dimensionality reduction method.

reduction_params : dict, optional
    Additional parameters for the reduction method.

rolling_window : int, optional
    Window size for rolling regression. If None, uses full training set.

forecast_method : {'ar', 'last_value'}, default='ar'
    Method for forecasting coefficients.

regressor : sklearn regressor, optional
    Any sklearn regressor. If None, uses LinearRegression.

regressor_params : dict, optional
    Additional parameters for the regressor.

def fit( self, X: Union[numpy.ndarray, pandas.DataFrame]) -> GenericFunctionalForecaster:
107    def fit(
108        self, X: Union[np.ndarray, pd.DataFrame]
109    ) -> "GenericFunctionalForecaster":
110        """
111        Fit the functional forecaster.
112
113        Parameters
114        ----------
115        X : np.ndarray or pd.DataFrame, shape (n_samples, n_points)
116            Functional time series data.
117
118        Returns
119        -------
120        self : object
121            Fitted forecaster.
122        """
123        # Input validation and conversion
124        if isinstance(X, pd.DataFrame):
125            X = X.values
126        X = check_array(X)
127
128        self.X_ = X.copy()
129        self.n_samples_, self.n_points_ = X.shape
130
131        # 1. Standardize the functional data
132        self.scaler_ = StandardScaler()
133        X_scaled = self.scaler_.fit_transform(X)
134
135        # 2. Fit dimensionality reduction
136        self._fit_reduction_method(X_scaled)
137
138        # 3. Extract components (reduced features)
139        self.reduced_features_ = self.reducer_.transform(X_scaled)
140
141        # 4. Fit regression models
142        if self.rolling_window is not None:
143            self._fit_rolling_regression(X_scaled)
144        else:
145            self._fit_full_regression(X_scaled)
146
147        self.is_fitted_ = True
148        return self

Fit the functional forecaster.

Parameters

X : np.ndarray or pd.DataFrame, shape (n_samples, n_points)
    Functional time series data.

Returns

self : object Fitted forecaster.

class RankTargetEncoder(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
 18class RankTargetEncoder(BaseEstimator, TransformerMixin):
 19    """
 20    Rank-based target encoder using Spearman rho or Kendall tau via
 21    Gaussian copula with proper cross-validation.
 22
 23    This encoder uses cross-validation and pseudo-targets generated via
 24    Gaussian copula with specified rank correlation to create robust,
 25    regularized encodings that prevent overfitting.
 26
 27    Parameters:
 28    -----------
 29    correlation_type : str, default='spearman'
 30        Type of rank correlation ('spearman' or 'kendall').
 31    correlation_strength : float, default=0.5
 32        Desired strength of rank correlation (between 0 and 1).
 33    shrinkage : float, default=10
 34        Shrinkage parameter for regularization (Bayesian average).
 35    n_folds : int, default=3
 36        Number of CV folds for leakage-free encoding.
 37    ensemble_size : int, default=5
 38        Number of pseudo-targets to average over (reduces variance).
 39    aggregate : str, default='mean'
 40        Aggregation method for combining values within categories ('mean' or 'median').
 41    random_state : int, default=42
 42        Random seed for reproducibility.
 43    """
 44
 45    def __init__(
 46        self,
 47        correlation_type="spearman",
 48        correlation_strength=0.5,
 49        shrinkage=10,
 50        n_folds=3,
 51        ensemble_size=5,
 52        aggregate="mean",
 53        random_state=42,
 54    ):
 55        self.correlation_type = correlation_type
 56        self.correlation_strength = correlation_strength
 57        self.shrinkage = shrinkage
 58        self.n_folds = n_folds
 59        self.ensemble_size = ensemble_size
 60        self.aggregate = aggregate
 61        self.random_state = random_state
 62        self.cat_columns_ = []
 63
 64        # Validate inputs
 65        if correlation_type not in ["spearman", "kendall"]:
 66            raise ValueError("correlation_type must be 'spearman' or 'kendall'")
 67        if not (0 <= correlation_strength <= 1):
 68            raise ValueError("correlation_strength must be in [0, 1]")
 69        if shrinkage < 0:
 70            raise ValueError("shrinkage must be non-negative")
 71        if n_folds < 2:
 72            raise ValueError("n_folds must be at least 2")
 73        if ensemble_size < 1:
 74            raise ValueError("ensemble_size must be at least 1")
 75        if aggregate not in ["mean", "median"]:
 76            raise ValueError("aggregate must be 'mean' or 'median'")
 77
 78    def _generate_pseudo_target(self, y, random_state):
 79        """Generate pseudo-target with specified rank correlation to y."""
 80        y = np.asarray(y)
 81        n = len(y)
 82        if n <= 1:
 83            return y.copy()
 84
 85        # Convert to uniform margins via ranks
 86        ranks = rankdata(y, method="average")
 87        u_y = ranks / (n + 1)
 88
 89        # Transform to Gaussian
 90        g_y = norm.ppf(u_y)
 91
 92        # Convert rank correlation to Gaussian correlation
 93        if self.correlation_type == "spearman":
 94            rho_g = 2 * np.sin(np.pi * self.correlation_strength / 6)
 95        else:  # kendall
 96            rho_g = np.sin(np.pi * self.correlation_strength / 2)
 97        rho_g = np.clip(rho_g, -1.0, 1.0)
 98
 99        # Generate correlated Gaussian variable
100        rng = np.random.RandomState(random_state)
101        eta = rng.normal(size=n)
102        g_z = rho_g * g_y + np.sqrt(1 - rho_g**2) * eta
103
104        # Transform back to original scale via quantiles
105        u_z = norm.cdf(g_z)
106        y_sorted = np.sort(y)
107        z = np.quantile(y_sorted, u_z, method="linear")
108
109        return z
110
111    def _compute_category_statistics(self, categories, values):
112        """Compute category-wise statistics with proper handling."""
113        if len(categories) == 0:
114            return {}
115
116        df = pd.DataFrame({"cat": categories, "val": values})
117
118        if self.aggregate == "mean":
119            cat_stats = df.groupby("cat")["val"].agg(["mean", "count"])
120            return dict(
121                zip(cat_stats.index, zip(cat_stats["mean"], cat_stats["count"]))
122            )
123        else:  # median
124            cat_stats = df.groupby("cat")["val"].agg(["median", "count"])
125            return dict(
126                zip(
127                    cat_stats.index,
128                    zip(cat_stats["median"], cat_stats["count"]),
129                )
130            )
131
132    def _apply_shrinkage(self, category_stats, global_stat):
133        """Apply shrinkage regularization to category statistics."""
134        regularized = {}
135        for cat, (stat, count) in category_stats.items():
136            regularized[cat] = (count * stat + self.shrinkage * global_stat) / (
137                count + self.shrinkage
138            )
139        return regularized
140
141    def _identify_categorical_columns(self, X):
142        """Identify categorical columns in the DataFrame."""
143        cat_cols = []
144        for col in X.columns:
145            # Check if column is object type or has low cardinality
146            if (
147                X[col].dtype == "object"
148                or X[col].dtype.name == "category"
149                or X[col].nunique() / len(X) < 0.05
150            ):  # heuristic for categorical
151                cat_cols.append(col)
152        return cat_cols
153
154    def fit(self, X, y):
155        """Fit the encoder using cross-validation to prevent leakage."""
156        if not isinstance(X, pd.DataFrame):
157            raise ValueError("X must be a pandas DataFrame")
158
159        X = X.reset_index(drop=True)  # Ensure clean integer indices
160        y = np.asarray(y)
161
162        if len(X) != len(y):
163            raise ValueError("X and y must have the same number of samples")
164
165        if len(X) == 0:
166            raise ValueError("X cannot be empty")
167
168        self.feature_names_in_ = list(X.columns)
169        self.y_mean_ = np.mean(y) if len(y) > 0 else 0.0
170        self.category_mappings_ = {}
171
172        # Identify categorical columns
173        self.cat_columns_ = self._identify_categorical_columns(X)
174        self.non_cat_columns_ = [
175            col for col in X.columns if col not in self.cat_columns_
176        ]
177
178        # Set up cross-validation
179        kf = KFold(
180            n_splits=self.n_folds, shuffle=True, random_state=self.random_state
181        )
182
183        for col in self.cat_columns_:
184            if X[col].nunique() <= 1:
185                # Handle constant columns
186                self.category_mappings_[col] = {X[col].iloc[0]: self.y_mean_}
187                continue
188
189            # Collect encodings for each category across all CV folds and ensemble members
190            category_encodings = defaultdict(list)
191
192            for ensemble_idx in range(self.ensemble_size):
193                ensemble_seed = self.random_state + ensemble_idx
194                fold_encodings = np.full(len(y), np.nan)
195
196                for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X)):
197                    # Training data for this fold
198                    X_train_fold = X.iloc[train_idx]
199                    y_train_fold = y[train_idx]
200                    X_val_fold = X.iloc[val_idx]
201
202                    if len(y_train_fold) == 0:
203                        continue
204
205                    # Generate pseudo-target for this fold and ensemble member
206                    # Use deterministic seed based on ensemble_idx, fold_idx, and column
207                    fold_seed = (
208                        ensemble_seed + fold_idx * 1000 + hash(col) % 10000
209                    )
210                    z_train = self._generate_pseudo_target(
211                        y_train_fold, fold_seed
212                    )
213
214                    # Compute category statistics
215                    cat_stats = self._compute_category_statistics(
216                        X_train_fold[col].values, z_train
217                    )
218
219                    if not cat_stats:
220                        continue
221
222                    # Apply shrinkage regularization
223                    if self.aggregate == "mean":
224                        global_stat = np.mean(z_train)
225                    else:  # median
226                        global_stat = np.median(z_train)
227
228                    regularized_stats = self._apply_shrinkage(
229                        cat_stats, global_stat
230                    )
231
232                    # Encode validation fold
233                    for idx in val_idx:
234                        category = X_val_fold.loc[idx, col]
235                        if category in regularized_stats:
236                            fold_encodings[idx] = regularized_stats[category]
237                        else:
238                            fold_encodings[idx] = global_stat
239
240                # Collect encodings by category for this ensemble member
241                for idx, encoding in enumerate(fold_encodings):
242                    if not np.isnan(encoding):
243                        category = X.iloc[idx][col]
244                        category_encodings[category].append(encoding)
245
246            # Average encodings for each category across all ensemble members and folds
247            final_mappings = {}
248            for category, encodings in category_encodings.items():
249                if encodings:
250                    final_mappings[category] = np.mean(encodings)
251                else:
252                    final_mappings[category] = self.y_mean_
253
254            self.category_mappings_[col] = final_mappings
255
256        return self
257
258    def transform(self, X):
259        """Transform categorical columns using learned encodings."""
260        if not hasattr(self, "category_mappings_"):
261            raise NotFittedError(
262                "This %s instance is not fitted yet." % self.__class__.__name__
263            )
264
265        if not isinstance(X, pd.DataFrame):
266            raise ValueError("X must be a pandas DataFrame")
267
268        # Check for missing columns
269        missing_cols = set(self.feature_names_in_) - set(X.columns)
270        if missing_cols:
271            raise ValueError(f"Missing columns from training: {missing_cols}")
272
273        X_encoded = X.copy()
274
275        for col in self.feature_names_in_:
276            if col not in X.columns:
277                # This shouldn't happen due to check above, but be safe
278                X_encoded[col] = self.y_mean_
279                continue
280
281            # Only encode categorical columns, leave others unchanged
282            if col in self.cat_columns_:
283                mappings = self.category_mappings_[col]
284                X_encoded[col] = X[col].map(mappings).fillna(self.y_mean_)
285            # Non-categorical columns are left as-is
286
287        return X_encoded
288
289    def fit_transform(self, X, y, **fit_params):
290        """Fit encoder and return encoded version of X."""
291        return self.fit(X, y).transform(X)
292
293    def get_feature_names_out(self, input_features=None):
294        """Get output feature names for transformation."""
295        if not hasattr(self, "category_mappings_"):
296            raise NotFittedError(
297                "This %s instance is not fitted yet." % self.__class__.__name__
298            )
299
300        if input_features is None:
301            return np.array(self.feature_names_in_)
302        else:
303            return np.array(input_features)
304
305    def get_category_mappings(self):
306        """Get the learned category mappings for inspection."""
307        if not hasattr(self, "category_mappings_"):
308            raise NotFittedError(
309                "This %s instance is not fitted yet." % self.__class__.__name__
310            )
311
312        return self.category_mappings_.copy()
313
314    def validate_encoding(self, X, y, plot=True):
315        """
316        Comprehensive validation of the encoding process, including correlation
317        preservation, distribution analysis, and category-level statistics.
318
319        Parameters:
320        -----------
321        X : pandas DataFrame
322            Input features (must be the same as used in fitting)
323        y : array-like
324            True target values
325        plot : bool, default=True
326            Whether to generate diagnostic plots
327
328        Returns:
329        --------
330        dict
331            Dictionary containing validation metrics and statistics
332        """
333        if not hasattr(self, "category_mappings_"):
334            raise NotFittedError(
335                "This %s instance is not fitted yet." % self.__class__.__name__
336            )
337
338        if not isinstance(X, pd.DataFrame):
339            raise ValueError("X must be a pandas DataFrame")
340
341        X = X.reset_index(drop=True)
342        y = np.asarray(y)
343
344        # Generate multiple pseudo-targets for robust statistics
345        pseudo_targets = []
346        correlations_achieved = []
347
348        for i in range(self.ensemble_size):
349            seed = self.random_state + i
350            z = self._generate_pseudo_target(y, seed)
351            pseudo_targets.append(z)
352
353            # Compute achieved correlation
354            if self.correlation_type == "spearman":
355                from scipy.stats import spearmanr
356
357                corr, _ = spearmanr(y, z)
358            else:  # kendall
359                from scipy.stats import kendalltau
360
361                corr, _ = kendalltau(y, z)
362            correlations_achieved.append(corr)
363
364        pseudo_targets = np.array(pseudo_targets)
365        mean_pseudo_target = np.mean(pseudo_targets, axis=0)
366
367        # Transform the data
368        X_encoded = self.transform(X)
369
370        # Compute overall validation metrics
371        validation_results = {
372            "target_correlation": self.correlation_strength,
373            "achieved_correlations": correlations_achieved,
374            "mean_achieved_correlation": np.mean(correlations_achieved),
375            "std_achieved_correlation": np.std(correlations_achieved),
376            "correlation_bias": np.mean(correlations_achieved)
377            - self.correlation_strength,
378            "original_target_stats": {
379                "mean": np.mean(y),
380                "std": np.std(y),
381                "min": np.min(y),
382                "max": np.max(y),
383                "median": np.median(y),
384            },
385            "pseudo_target_stats": {
386                "mean": np.mean(mean_pseudo_target),
387                "std": np.std(mean_pseudo_target),
388                "min": np.min(mean_pseudo_target),
389                "max": np.max(mean_pseudo_target),
390                "median": np.median(mean_pseudo_target),
391            },
392        }
393
394        # Category-level analysis
395        category_correlations = {}
396        category_stats = {}
397
398        for col in self.cat_columns_:
399            if col not in X.columns:
400                continue
401
402            unique_categories = X[col].unique()
403            cat_corrs = []
404            cat_means_original = []
405            cat_means_pseudo = []
406            cat_counts = []
407
408            for category in unique_categories:
409                mask = X[col] == category
410                if (
411                    np.sum(mask) > 5
412                ):  # Only analyze categories with sufficient samples
413                    if self.correlation_type == "spearman":
414                        corr, _ = spearmanr(y[mask], mean_pseudo_target[mask])
415                    else:
416                        corr, _ = kendalltau(y[mask], mean_pseudo_target[mask])
417
418                    cat_corrs.append(corr)
419                    cat_means_original.append(np.mean(y[mask]))
420                    cat_means_pseudo.append(np.mean(mean_pseudo_target[mask]))
421                    cat_counts.append(np.sum(mask))
422
423            category_correlations[col] = {
424                "mean_correlation": np.mean(cat_corrs) if cat_corrs else np.nan,
425                "std_correlation": np.std(cat_corrs) if cat_corrs else np.nan,
426                "min_correlation": np.min(cat_corrs) if cat_corrs else np.nan,
427                "max_correlation": np.max(cat_corrs) if cat_corrs else np.nan,
428            }
429
430            category_stats[col] = {
431                "n_categories": len(unique_categories),
432                "n_analyzed_categories": len(cat_corrs),
433                "category_means_original": cat_means_original,
434                "category_means_pseudo": cat_means_pseudo,
435                "category_counts": cat_counts,
436            }
437
438        validation_results["category_correlations"] = category_correlations
439        validation_results["category_stats"] = category_stats
440
441        # Generate plots if requested
442        if plot:
443            try:
444                fig, axes = plt.subplots(2, 2, figsize=(15, 12))
445                axes = axes.flatten()
446
447                # Scatter plot: Original vs Pseudo-targets
448                axes[0].scatter(y, mean_pseudo_target, alpha=0.6, s=20)
449                axes[0].set_xlabel("Original Target")
450                axes[0].set_ylabel("Pseudo Target")
451                axes[0].set_title(
452                    f'Original vs Pseudo-targets\n{self.correlation_type.capitalize()} correlation: {validation_results["mean_achieved_correlation"]:.3f}'
453                )
454
455                # Add correlation line
456                z = np.polyfit(y, mean_pseudo_target, 1)
457                p = np.poly1d(z)
458                axes[0].plot(y, p(y), "r--", alpha=0.8)
459
460                # Distribution comparison
461                axes[1].hist(
462                    y, alpha=0.7, bins=30, label="Original", density=True
463                )
464                axes[1].hist(
465                    mean_pseudo_target,
466                    alpha=0.7,
467                    bins=30,
468                    label="Pseudo",
469                    density=True,
470                )
471                axes[1].set_xlabel("Value")
472                axes[1].set_ylabel("Density")
473                axes[1].set_title("Distribution Comparison")
474                axes[1].legend()
475
476                # Rank comparison
477                original_ranks = rankdata(y, method="average")
478                pseudo_ranks = rankdata(mean_pseudo_target, method="average")
479                axes[2].scatter(original_ranks, pseudo_ranks, alpha=0.6, s=20)
480                axes[2].set_xlabel("Original Ranks")
481                axes[2].set_ylabel("Pseudo Ranks")
482                axes[2].set_title("Rank Preservation")
483
484                # Category analysis - residual plot
485                residuals = y - mean_pseudo_target
486                # Use first categorical column for coloring if available
487                if self.cat_columns_:
488                    cat_col = self.cat_columns_[0]
489                    unique_cats = X[cat_col].unique()[
490                        :10
491                    ]  # Limit to top 10 categories
492                    colors = plt.cm.tab10(np.linspace(0, 1, len(unique_cats)))
493
494                    for i, category in enumerate(unique_cats):
495                        mask = X[cat_col] == category
496                        if np.sum(mask) > 0:
497                            axes[3].scatter(
498                                mean_pseudo_target[mask],
499                                residuals[mask],
500                                alpha=0.6,
501                                s=20,
502                                color=colors[i],
503                                label=str(category),
504                            )
505
506                    axes[3].axhline(y=0, color="r", linestyle="--", alpha=0.8)
507                    axes[3].set_xlabel("Pseudo Target")
508                    axes[3].set_ylabel("Residuals (Original - Pseudo)")
509                    axes[3].set_title("Residuals by Category")
510                    axes[3].legend(bbox_to_anchor=(1.05, 1), loc="upper left")
511                else:
512                    axes[3].scatter(
513                        mean_pseudo_target, residuals, alpha=0.6, s=20
514                    )
515                    axes[3].axhline(y=0, color="r", linestyle="--", alpha=0.8)
516                    axes[3].set_xlabel("Pseudo Target")
517                    axes[3].set_ylabel("Residuals (Original - Pseudo)")
518                    axes[3].set_title("Residual Plot")
519
520                plt.tight_layout()
521                plt.show()
522
523            except ImportError:
524                print("Matplotlib/seaborn not available for plotting")
525
526        return validation_results
527
528    def get_validation_report(self, validation_results):
529        """
530        Generate a human-readable validation report from validation results.
531
532        Parameters:
533        -----------
534        validation_results : dict
535            Results from validate_encoding method
536
537        Returns:
538        --------
539        str
540            Formatted validation report
541        """
542        report = []
543        report.append("=" * 60)
544        report.append("RANK TARGET ENCODER VALIDATION REPORT")
545        report.append("=" * 60)
546
547        report.append(f"\nCORRELATION VALIDATION:")
548        report.append(
549            f"Target {self.correlation_type} correlation: {validation_results['target_correlation']:.3f}"
550        )
551        report.append(
552            f"Achieved mean correlation: {validation_results['mean_achieved_correlation']:.3f}"
553        )
554        report.append(
555            f"Correlation bias: {validation_results['correlation_bias']:.3f}"
556        )
557        report.append(
558            f"Correlation std across ensemble: {validation_results['std_achieved_correlation']:.3f}"
559        )
560
561        report.append(f"\nDISTRIBUTION COMPARISON:")
562        orig = validation_results["original_target_stats"]
563        pseudo = validation_results["pseudo_target_stats"]
564        report.append(
565            f"Original target - Mean: {orig['mean']:.3f}, Std: {orig['std']:.3f}"
566        )
567        report.append(
568            f"Pseudo target  - Mean: {pseudo['mean']:.3f}, Std: {pseudo['std']:.3f}"
569        )
570
571        report.append(f"\nCATEGORY-LEVEL ANALYSIS:")
572        for col, stats in validation_results["category_correlations"].items():
573            if not np.isnan(stats["mean_correlation"]):
574                report.append(
575                    f"  {col}: {stats['mean_correlation']:.3f} ± {stats['std_correlation']:.3f} "
576                    f"(min: {stats['min_correlation']:.3f}, max: {stats['max_correlation']:.3f})"
577                )
578
579        report.append(f"\nROBUST STATISTICS:")
580        report.append(f"Ensemble size: {self.ensemble_size}")
581        report.append(
582            f"Individual correlations: {[f'{c:.3f}' for c in validation_results['achieved_correlations']]}"
583        )
584
585        report.append("=" * 60)
586        return "\n".join(report)

Rank-based target encoder using Spearman rho or Kendall tau via Gaussian copula with proper cross-validation.

This encoder uses cross-validation and pseudo-targets generated via Gaussian copula with specified rank correlation to create robust, regularized encodings that prevent overfitting.

Parameters:

correlation_type : str, default='spearman'
    Type of rank correlation ('spearman' or 'kendall').
correlation_strength : float, default=0.5
    Desired strength of rank correlation (between 0 and 1).
shrinkage : float, default=10
    Shrinkage parameter for regularization (Bayesian average).
n_folds : int, default=3
    Number of CV folds for leakage-free encoding.
ensemble_size : int, default=5
    Number of pseudo-targets to average over (reduces variance).
aggregate : str, default='mean'
    Aggregation method for combining values within categories ('mean' or 'median').
random_state : int, default=42
    Random seed for reproducibility.

def fit(self, X, y):
154    def fit(self, X, y):
155        """Fit the encoder using cross-validation to prevent leakage."""
156        if not isinstance(X, pd.DataFrame):
157            raise ValueError("X must be a pandas DataFrame")
158
159        X = X.reset_index(drop=True)  # Ensure clean integer indices
160        y = np.asarray(y)
161
162        if len(X) != len(y):
163            raise ValueError("X and y must have the same number of samples")
164
165        if len(X) == 0:
166            raise ValueError("X cannot be empty")
167
168        self.feature_names_in_ = list(X.columns)
169        self.y_mean_ = np.mean(y) if len(y) > 0 else 0.0
170        self.category_mappings_ = {}
171
172        # Identify categorical columns
173        self.cat_columns_ = self._identify_categorical_columns(X)
174        self.non_cat_columns_ = [
175            col for col in X.columns if col not in self.cat_columns_
176        ]
177
178        # Set up cross-validation
179        kf = KFold(
180            n_splits=self.n_folds, shuffle=True, random_state=self.random_state
181        )
182
183        for col in self.cat_columns_:
184            if X[col].nunique() <= 1:
185                # Handle constant columns
186                self.category_mappings_[col] = {X[col].iloc[0]: self.y_mean_}
187                continue
188
189            # Collect encodings for each category across all CV folds and ensemble members
190            category_encodings = defaultdict(list)
191
192            for ensemble_idx in range(self.ensemble_size):
193                ensemble_seed = self.random_state + ensemble_idx
194                fold_encodings = np.full(len(y), np.nan)
195
196                for fold_idx, (train_idx, val_idx) in enumerate(kf.split(X)):
197                    # Training data for this fold
198                    X_train_fold = X.iloc[train_idx]
199                    y_train_fold = y[train_idx]
200                    X_val_fold = X.iloc[val_idx]
201
202                    if len(y_train_fold) == 0:
203                        continue
204
205                    # Generate pseudo-target for this fold and ensemble member
206                    # Use deterministic seed based on ensemble_idx, fold_idx, and column
207                    fold_seed = (
208                        ensemble_seed + fold_idx * 1000 + hash(col) % 10000
209                    )
210                    z_train = self._generate_pseudo_target(
211                        y_train_fold, fold_seed
212                    )
213
214                    # Compute category statistics
215                    cat_stats = self._compute_category_statistics(
216                        X_train_fold[col].values, z_train
217                    )
218
219                    if not cat_stats:
220                        continue
221
222                    # Apply shrinkage regularization
223                    if self.aggregate == "mean":
224                        global_stat = np.mean(z_train)
225                    else:  # median
226                        global_stat = np.median(z_train)
227
228                    regularized_stats = self._apply_shrinkage(
229                        cat_stats, global_stat
230                    )
231
232                    # Encode validation fold
233                    for idx in val_idx:
234                        category = X_val_fold.loc[idx, col]
235                        if category in regularized_stats:
236                            fold_encodings[idx] = regularized_stats[category]
237                        else:
238                            fold_encodings[idx] = global_stat
239
240                # Collect encodings by category for this ensemble member
241                for idx, encoding in enumerate(fold_encodings):
242                    if not np.isnan(encoding):
243                        category = X.iloc[idx][col]
244                        category_encodings[category].append(encoding)
245
246            # Average encodings for each category across all ensemble members and folds
247            final_mappings = {}
248            for category, encodings in category_encodings.items():
249                if encodings:
250                    final_mappings[category] = np.mean(encodings)
251                else:
252                    final_mappings[category] = self.y_mean_
253
254            self.category_mappings_[col] = final_mappings
255
256        return self

Fit the encoder using cross-validation to prevent leakage.

class RollingOriginForecaster(sklearn.base.BaseEstimator, sklearn.base.RegressorMixin):
  7class RollingOriginForecaster(BaseEstimator, RegressorMixin):
  8    """
  9    A flexible rolling origin forecaster that supports both autoregressive and
 10    exogenous features modes, with multiple prediction strategies.
 11    """
 12
 13    def __init__(
 14        self,
 15        estimator,
 16        max_horizon=1,
 17        n_lags=1,
 18        mode="auto",
 19        multi_output="auto",
 20        recursive=True,
 21    ):
 22        self.estimator = estimator
 23        self.max_horizon = max_horizon
 24        self.n_lags = n_lags
 25        self.mode = mode
 26        self.multi_output = multi_output
 27        self.recursive = recursive
 28
 29    def fit(self, X=None, y=None):
 30        if y is None:
 31            raise ValueError("y cannot be None")
 32
 33        # Determine mode
 34        if self.mode == "auto":
 35            self.mode_ = "ar" if X is None else "exog"
 36        else:
 37            self.mode_ = self.mode
 38
 39        # Fit in appropriate mode
 40        if self.mode_ == "ar":
 41            return self._fit_ar(y)
 42        else:
 43            return self._fit_exog(X, y)
 44
 45    def predict(self, X=None, h=None):
 46        check_is_fitted(self)
 47
 48        # Validate horizon
 49        if h is None:
 50            h = self.max_horizon
 51        elif h > self.max_horizon:
 52            raise ValueError(
 53                f"Requested horizon {h} exceeds max_horizon {self.max_horizon}"
 54            )
 55
 56        if self.mode_ == "ar":
 57            return self._predict_ar(h)
 58        else:
 59            if X is None:
 60                raise ValueError("X cannot be None in exog mode")
 61            X = check_array(X)
 62            return self._predict_exog(X, h)
 63
 64    def _fit_ar(self, y):
 65        """Fit autoregressive model."""
 66        y = check_array(y, ensure_2d=False)
 67
 68        # Validate series length
 69        if len(y) < self.n_lags + 1:
 70            raise ValueError(
 71                f"y must have at least n_lags+1 ({self.n_lags+1}) samples"
 72            )
 73
 74        # Create lagged features matrix
 75        X, y = self._create_lagged_features(y)
 76
 77        # Fit model
 78        self.estimator_ = self._fit_model(X, y)
 79
 80        # Store last window for predictions
 81        self.last_window_ = y[-self.n_lags :].reshape(1, -1)
 82
 83        return self
 84
 85    def _fit_exog(self, X, y):
 86        """Fit model with exogenous features."""
 87        X, y = check_X_y(X, y, multi_output=True)
 88        self.n_features_in_ = X.shape[1]
 89
 90        # Validate series length
 91        if len(y) < self.max_horizon:
 92            raise ValueError(
 93                f"Need at least max_horizon ({self.max_horizon}) samples"
 94            )
 95
 96        # Fit model
 97        self.estimator_ = self._fit_model(X, y)
 98
 99        return self
100
101    def _fit_model(self, X, y):
102        """Internal method to fit model with selected strategy."""
103        # Determine multi-output strategy
104        if self.multi_output == "auto":
105            try:
106                # Test if estimator supports multi-output
107                dummy = clone(self.estimator)
108                test_shape = (min(10, len(X)), self.max_horizon)
109                dummy.fit(X[: min(10, len(X))], np.zeros(test_shape))
110                self.multi_output_ = True
111            except Exception:
112                self.multi_output_ = False
113        else:
114            self.multi_output_ = self.multi_output
115
116        # Prepare targets for multi-step forecasting
117        if y.ndim == 1:
118            y = y.reshape(-1, 1)
119
120        if self.multi_output_:
121            # Create shifted targets matrix
122            T = len(y)
123            if T < self.max_horizon:
124                raise ValueError("Time series too short for max_horizon")
125            y_shifted = np.zeros((T - self.max_horizon + 1, self.max_horizon))
126            for i in range(self.max_horizon):
127                y_shifted[:, i] = y[i : T - self.max_horizon + 1 + i].ravel()
128
129            # Trim X to match
130            X = X[: len(y_shifted)]
131            y = y_shifted
132
133            return clone(self.estimator).fit(X, y)
134
135        else:
136            if self.recursive:
137                # Single model for recursive predictions
138                return clone(self.estimator).fit(X, y.ravel())
139            else:
140                # Separate models for each horizon
141                self.estimators_ = []
142                for i in range(self.max_horizon):
143                    X_i = X[: len(y) - i]
144                    y_i = y[i:].ravel()
145                    est = clone(self.estimator)
146                    est.fit(X_i, y_i)
147                    self.estimators_.append(est)
148                return self
149
150    def _predict_ar(self, h):
151        """Make autoregressive predictions."""
152        current_window = self.last_window_.copy()
153        predictions = np.zeros((1, h))
154
155        for i in range(h):
156            pred = self.estimator_.predict(current_window)[0]
157            predictions[0, i] = pred
158            # Update window
159            current_window = np.roll(current_window, -1)
160            current_window[0, -1] = pred
161
162        return predictions
163
164    def _predict_exog(self, X, h):
165        """Make predictions with exogenous features."""
166        if hasattr(self, "estimators_"):  # Direct strategy
167            preds = np.zeros((X.shape[0], h))
168            for i in range(h):
169                preds[:, i] = self.estimators_[i].predict(X).ravel()
170            return preds
171        else:
172            if self.multi_output_:
173                pred_out = self.estimator_.predict(X)
174                if pred_out.shape[1] < h:
175                    # Pad if model outputs fewer horizons than requested
176                    pad_width = ((0, 0), (0, h - pred_out.shape[1]))
177                    pred_out = np.pad(pred_out, pad_width, mode="constant")
178                return pred_out[:, :h]
179            else:  # Recursive
180                preds = np.zeros((X.shape[0], h))
181                current_pred = self.estimator_.predict(X)
182                if current_pred.ndim == 1:
183                    current_pred = current_pred.reshape(-1, 1)
184                preds[:, 0] = current_pred.ravel()
185
186                for i in range(1, h):
187                    if X.shape[1] > 1:
188                        X_new = np.hstack([X[:, 1:], current_pred])
189                    else:
190                        X_new = current_pred
191                    current_pred = self.estimator_.predict(X_new)
192                    if current_pred.ndim == 1:
193                        current_pred = current_pred.reshape(-1, 1)
194                    preds[:, i] = current_pred.ravel()
195
196                return preds
197
198    def _create_lagged_features(self, y):
199        """Create lagged features matrix for AR mode."""
200        if len(y) <= self.n_lags:
201            raise ValueError("Not enough samples to create lagged features")
202        n_samples = len(y) - self.n_lags
203        X = np.zeros((n_samples, self.n_lags))
204        for i in range(self.n_lags):
205            X[:, i] = y[i : i + n_samples]
206        y = y[self.n_lags :]
207        return X, y
208
209    def get_params(self, deep=True):
210        """Get parameters for this estimator."""
211        params = {
212            "estimator": self.estimator,
213            "max_horizon": self.max_horizon,
214            "n_lags": self.n_lags,
215            "mode": self.mode,
216            "multi_output": self.multi_output,
217            "recursive": self.recursive,
218        }
219        if deep:
220            for key, value in params.items():
221                if hasattr(value, "get_params"):
222                    params[key] = value.get_params(deep)
223        return params
224
225    def set_params(self, **params):
226        """Set the parameters of this estimator."""
227        for parameter, value in params.items():
228            setattr(self, parameter, value)
229        return self

A flexible rolling origin forecaster that supports both autoregressive and exogenous features modes, with multiple prediction strategies.

def fit(self, X=None, y=None):
29    def fit(self, X=None, y=None):
30        if y is None:
31            raise ValueError("y cannot be None")
32
33        # Determine mode
34        if self.mode == "auto":
35            self.mode_ = "ar" if X is None else "exog"
36        else:
37            self.mode_ = self.mode
38
39        # Fit in appropriate mode
40        if self.mode_ == "ar":
41            return self._fit_ar(y)
42        else:
43            return self._fit_exog(X, y)
def predict(self, X=None, h=None):
45    def predict(self, X=None, h=None):
46        check_is_fitted(self)
47
48        # Validate horizon
49        if h is None:
50            h = self.max_horizon
51        elif h > self.max_horizon:
52            raise ValueError(
53                f"Requested horizon {h} exceeds max_horizon {self.max_horizon}"
54            )
55
56        if self.mode_ == "ar":
57            return self._predict_ar(h)
58        else:
59            if X is None:
60                raise ValueError("X cannot be None in exog mode")
61            X = check_array(X)
62            return self._predict_exog(X, h)
def download( pkgname='MASS', dataset='Boston', source='https://cran.r-universe.dev/', **kwargs):
 6def download(
 7    pkgname="MASS",
 8    dataset="Boston",
 9    source="https://cran.r-universe.dev/",
10    **kwargs,
11):
12    URL = source + pkgname + "/data/" + dataset + "/json"
13    res = requests.get(URL)
14    return pd.DataFrame(res.json(), **kwargs)
get_config
set_config
config_context
def penalized_cross_val_score( estimator, X, y, param_dict, cv=5, scorer=None, penalty_strength=0.1, penalty_type='ci', greater_is_better=False):
  7def penalized_cross_val_score(
  8    estimator,
  9    X,
 10    y,
 11    param_dict,
 12    cv=5,
 13    scorer=None,
 14    penalty_strength=0.1,
 15    penalty_type="ci",
 16    greater_is_better=False,
 17):
 18    """
 19    Calculates a penalized cross-validation score that balances mean performance
 20    and result stability (low variability across folds).
 21
 22    Parameters:
 23    -----------
 24    estimator : sklearn estimator
 25        Model to evaluate.
 26    X, y : array-like
 27        Training data.
 28    param_dict : dict
 29        Hyperparameters to set on the estimator.
 30    cv : int, default=5
 31        Number of cross-validation folds (must be >= 2).
 32    scorer : callable or str, optional
 33        Scikit-learn scorer (e.g., from sklearn.metrics.make_scorer).
 34    penalty_strength : float, default=0.1
 35        Multiplicative factor for the variability penalty.
 36        penalty_strength=0.1 = penalize by up to 10% of mean score
 37    penalty_type : {'std', 'max', 'range', 'ci'}
 38        Type of variability to penalize:
 39        - 'std': standard deviation of fold scores
 40        - 'max': maximum deviation from mean across folds
 41        - 'range': difference between best and worst fold
 42        - 'ci': approximate 95% confidence interval width (2 * SEM)
 43    greater_is_better : bool
 44        Whether higher raw scores are better (e.g., accuracy=True, RMSE=False).
 45
 46    Returns:
 47    --------
 48    penalized_score : float
 49        Mean CV score adjusted by penalty. Always "lower is better" in effect,
 50        so unstable models are penalized.
 51    """
 52
 53    if penalty_strength < 0:
 54        raise ValueError("penalty_strength must be non-negative.")
 55
 56    if cv < 2:
 57        raise ValueError("cv must be at least 2.")
 58
 59    # Validate parameters
 60    estimator_params = estimator.get_params()
 61    missing_params = [key for key in param_dict if key not in estimator_params]
 62    if missing_params:
 63        raise ValueError(
 64            f"Estimator does not have parameters: {', '.join(missing_params)}"
 65        )
 66
 67    # Clone and configure estimator
 68    current_estimator = clone(estimator)
 69    current_estimator.set_params(**param_dict)
 70
 71    # Perform cross-validation
 72    cv_scores = cross_val_score(current_estimator, X, y, cv=cv, scoring=scorer)
 73
 74    if len(cv_scores) == 0:
 75        raise ValueError("Cross-validation scores are empty.")
 76
 77    mean_score = np.mean(cv_scores)
 78
 79    # Compute variability measure
 80    if penalty_type == "std":
 81        variability_measure = np.std(cv_scores)
 82    elif penalty_type == "max":
 83        variability_measure = np.max(np.abs(cv_scores - mean_score))
 84    elif penalty_type == "range":
 85        variability_measure = np.ptp(cv_scores)  # max - min
 86    elif penalty_type == "ci":
 87        # Approximate 95% CI width: 2 * standard error of the mean
 88        variability_measure = 2 * (np.std(cv_scores) / np.sqrt(len(cv_scores)))
 89    else:
 90        raise ValueError("penalty_type must be 'std', 'max', 'range', or 'ci'.")
 91
 92    # Scale penalty relative to mean score magnitude
 93    if abs(mean_score) > 1e-10:  # avoid division by zero
 94        normalized_penalty = penalty_strength * (
 95            variability_measure / abs(mean_score)
 96        )
 97    else:
 98        normalized_penalty = penalty_strength * variability_measure
 99
100    # Apply penalty: make worse for instability
101    if greater_is_better:
102        return mean_score - normalized_penalty  # lower score = penalized
103    else:
104        return mean_score + normalized_penalty  # higher score = penalized

Calculates a penalized cross-validation score that balances mean performance and result stability (low variability across folds).

Parameters:

estimator : sklearn estimator Model to evaluate. X, y : array-like Training data. param_dict : dict Hyperparameters to set on the estimator. cv : int, default=5 Number of cross-validation folds (must be >= 2). scorer : callable or str, optional Scikit-learn scorer (e.g., from sklearn.metrics.make_scorer). penalty_strength : float, default=0.1 Multiplicative factor for the variability penalty. penalty_strength=0.1 = penalize by up to 10% of mean score penalty_type : {'std', 'max', 'range', 'ci'} Type of variability to penalize: - 'std': standard deviation of fold scores - 'max': maximum deviation from mean across folds - 'range': difference between best and worst fold - 'ci': approximate 95% confidence interval width (2 * SEM) greater_is_better : bool Whether higher raw scores are better (e.g., accuracy=True, RMSE=False).

Returns:

penalized_score : float Mean CV score adjusted by the variability penalty. The penalty is subtracted when greater_is_better is True and added otherwise, so an unstable model always ends up with a worse score.

def make_diverse_classification(n_datasets=100, random_state=None):
def make_diverse_classification(n_datasets=100, random_state=None):
    """Yield randomly parameterised synthetic classification datasets.

    Each iteration samples a fresh set of arguments for
    ``make_classification`` and yields the resulting data together with
    the sampled settings.

    Parameters
    ----------
    n_datasets : int, default=100
        Number of generation attempts. Attempts that fail a consistency
        check, or for which ``make_classification`` raises, are skipped,
        so fewer than ``n_datasets`` items may be yielded.
    random_state : optional
        Passed to ``np.random.default_rng``; every other random draw in
        this function is derived from that generator.

    Yields
    ------
    X, y, metadata
        ``X`` and ``y`` as returned by ``make_classification`` and a dict
        of the sampled settings. ``shift`` is passed to
        ``make_classification`` but is not recorded in ``metadata``.
    """
    rng = np.random.default_rng(random_state)

    for _ in range(n_datasets):
        # Sample parameters
        # n_samples is log-uniform over [100, 10000]; n_features is
        # uniform over [10, 50), both truncated to int.
        n_samples = int(loguniform(100, 10000).rvs(random_state=rng))
        n_features = int(rng.uniform(10, 50))

        # --- Step 1: Choose n_classes safely ---
        # Upper limit is min(100, n_samples // 10); "+ 1" because the high
        # end of rng.integers is exclusive.
        max_classes_by_sample = max(2, n_samples // 10)
        n_classes = rng.integers(2, min(100, max_classes_by_sample) + 1)
        n_classes = max(2, min(n_classes, n_samples // 2))

        # --- Step 2: Class weights with minimum samples ---
        # Dirichlet(0.5, ..., 0.5) draw, seeded with an integer taken from
        # rng so the whole sequence stays tied to random_state.
        alpha = [0.5] * n_classes
        weights = dirichlet.rvs(alpha, random_state=rng.integers(0, 2**32))[0]
        weights /= weights.sum()

        min_per_class = 2
        total_min = n_classes * min_per_class
        if total_min > n_samples:
            # Not enough samples for the minimum: use uniform weights
            # (left as an ndarray in this branch, not a list).
            weights = np.ones(n_classes) / n_classes
        else:
            # Distribute at least min_per_class, then scale
            y_counts = np.maximum(
                np.round(weights * n_samples), min_per_class
            ).astype(int)
            y_counts = (y_counts / y_counts.sum() * n_samples).astype(int)
            y_counts[-1] += n_samples - y_counts.sum()  # fix rounding
            # Rescaling can push a class below the minimum again, so the
            # floor and the remainder fix are applied a second time. The
            # remainder always goes to the last class.
            y_counts = np.maximum(y_counts, min_per_class)
            y_counts[-1] += n_samples - y_counts.sum()
            weights = (y_counts / n_samples).tolist()

        # --- Step 3: Informative features ---
        # Must support n_classes * n_clusters_per_class <= 2 ** n_informative
        # So let's first pick n_informative large enough or cap n_classes
        n_informative = max(
            4, int(rng.uniform(4, min(10, n_features)))
        )  # start higher
        # Keep at least 6 features that are not informative.
        n_informative = min(n_informative, n_features - 6, n_samples - 1)

        # Cap n_classes based on n_informative
        max_possible_classes = 2**n_informative
        if n_classes > max_possible_classes:
            n_classes = max_possible_classes
            # Recompute weights
            # (plain normalised Dirichlet draw; the per-class minimum from
            # Step 2 is not re-applied here)
            alpha = [0.5] * n_classes
            weights = dirichlet.rvs(
                alpha, random_state=rng.integers(0, 2**32)
            )[0]
            weights = (weights / weights.sum()).tolist()

        # --- Step 4: Redundant, repeated, noise ---
        # Redundant: up to half of the non-informative features, never
        # more than n_informative. Repeated: up to 20% of what is left.
        n_redundant = min(
            n_informative,
            int(rng.uniform(0, 0.5) * (n_features - n_informative)),
        )
        available = n_features - n_informative - n_redundant
        n_repeated = (
            int(rng.uniform(0, 0.2) * available) if available > 0 else 0
        )
        # n_noise is only reported in metadata; it is not an argument of
        # the make_classification call below.
        n_noise = n_features - n_informative - n_redundant - n_repeated

        if n_noise < 0:
            continue  # should not happen

        # --- Step 5: Clusters per class ---
        # 1 to 3 clusters, reduced so that
        # n_classes * n_clusters_per_class <= 2 ** n_informative.
        max_clusters_total = 2**n_informative
        n_clusters_per_class = rng.integers(1, 4)
        n_clusters_per_class = min(
            n_clusters_per_class, max_clusters_total // n_classes
        )
        n_clusters_per_class = max(1, n_clusters_per_class)

        # --- Step 6: Other parameters ---
        class_sep = loguniform(0.1, 10).rvs(random_state=rng)
        flip_y = rng.uniform(0.0, 0.5)
        hypercube = rng.choice([True, False])
        # Half of the time a per-feature shift vector, otherwise no shift.
        shift = rng.uniform(-1, 1, n_features) if rng.random() < 0.5 else 0.0
        scale = rng.uniform(0.5, 5.0)

        # --- Final safety ---
        if n_informative + n_redundant + n_repeated > n_features:
            continue

        try:
            X, y = make_classification(
                n_samples=n_samples,
                n_features=n_features,
                n_informative=n_informative,
                n_redundant=n_redundant,
                n_repeated=n_repeated,
                n_classes=n_classes,
                n_clusters_per_class=n_clusters_per_class,
                weights=weights,
                flip_y=flip_y,
                class_sep=class_sep,
                hypercube=hypercube,
                shift=shift,
                scale=scale,
                shuffle=True,
                random_state=rng.integers(0, 2**32),
            )
        except Exception as e:
            # Best effort: report the failure and move on to the next
            # attempt instead of aborting the generator.
            print(f"Skipped due to error: {e}")
            continue

        # Settings used for this dataset (``shift`` is not included).
        metadata = {
            "n_samples": n_samples,
            "n_features": n_features,
            "n_classes": n_classes,
            "n_informative": n_informative,
            "n_redundant": n_redundant,
            "n_repeated": n_repeated,
            "n_noise": n_noise,
            "weights": weights,
            "flip_y": flip_y,
            "class_sep": class_sep,
            "n_clusters_per_class": n_clusters_per_class,
            "hypercube": hypercube,
            "scale": scale,
        }

        yield X, y, metadata
class HealthcareTimeSeriesGenerator:
 10class HealthcareTimeSeriesGenerator:
 11    def __init__(self, seed=42):
 12        np.random.seed(seed)
 13        random.seed(seed)
 14
 15        # Define realistic ranges for vital signs and lab values
 16        self.vital_ranges = {
 17            "heart_rate": (60, 100, 10),  # (min, max, std)
 18            "systolic_bp": (90, 140, 15),
 19            "diastolic_bp": (60, 90, 10),
 20            "temperature": (36.1, 37.2, 0.3),  # Celsius
 21            "respiratory_rate": (12, 20, 3),
 22            "oxygen_saturation": (95, 100, 2),
 23        }
 24
 25        self.lab_ranges = {
 26            "glucose": (70, 110, 20),  # mg/dL
 27            "creatinine": (0.6, 1.2, 0.2),  # mg/dL
 28            "hemoglobin": (12, 16, 1.5),  # g/dL
 29            "white_blood_cells": (4000, 11000, 1500),  # cells/μL
 30            "sodium": (136, 145, 3),  # mEq/L
 31            "potassium": (3.5, 5.0, 0.4),  # mEq/L
 32        }
 33
 34        # Medical conditions that affect vital signs
 35        self.conditions = [
 36            "hypertension",
 37            "diabetes",
 38            "copd",
 39            "heart_failure",
 40            "kidney_disease",
 41            "anemia",
 42            "infection",
 43            "healthy",
 44        ]
 45
 46    def generate_patient_demographics(self, n_patients=100):
 47        """Generate realistic patient demographics"""
 48        patients = []
 49
 50        for i in range(n_patients):
 51            age = np.random.normal(65, 15)  # Average hospital patient age
 52            age = max(18, min(95, int(age)))  # Clamp between 18-95
 53
 54            gender = random.choice(["M", "F"])
 55
 56            # Assign conditions based on age and gender
 57            conditions = self._assign_conditions(age, gender)
 58
 59            patient = {
 60                "patient_id": f"P{i+1:04d}",
 61                "age": age,
 62                "gender": gender,
 63                "conditions": conditions,
 64                "admission_date": self._random_date(),
 65                "length_of_stay": random.randint(1, 30),
 66            }
 67            patients.append(patient)
 68
 69        return pd.DataFrame(patients)
 70
 71    def _assign_conditions(self, age, gender):
 72        """Assign medical conditions based on demographics"""
 73        conditions = []
 74
 75        # Age-related condition probabilities
 76        if age > 50:
 77            if random.random() < 0.3:
 78                conditions.append("hypertension")
 79            if random.random() < 0.15:
 80                conditions.append("diabetes")
 81            if random.random() < 0.1:
 82                conditions.append("heart_failure")
 83
 84        if age > 60:
 85            if random.random() < 0.08:
 86                conditions.append("copd")
 87            if random.random() < 0.12:
 88                conditions.append("kidney_disease")
 89
 90        if gender == "F" and random.random() < 0.1:
 91            conditions.append("anemia")
 92
 93        if random.random() < 0.05:
 94            conditions.append("infection")
 95
 96        if not conditions:
 97            conditions.append("healthy")
 98
 99        return conditions
100
101    def _random_date(self):
102        """Generate random date within last 2 years"""
103        start_date = datetime.now() - timedelta(days=730)
104        random_days = random.randint(0, 730)
105        return start_date + timedelta(days=random_days)
106
107    def generate_time_series(
108        self, patients_df, measurements_per_day=4, include_missing=True
109    ):
110        """Generate time series data for all patients"""
111        all_measurements = []
112
113        for _, patient in patients_df.iterrows():
114            patient_measurements = self._generate_patient_timeseries(
115                patient, measurements_per_day, include_missing
116            )
117            all_measurements.extend(patient_measurements)
118
119        return pd.DataFrame(all_measurements)
120
121    def _generate_patient_timeseries(
122        self, patient, measurements_per_day, include_missing
123    ):
124        """Generate time series for a single patient"""
125        measurements = []
126
127        start_date = patient["admission_date"]
128        length_of_stay = patient["length_of_stay"]
129        conditions = patient["conditions"]
130
131        # Generate measurements for each day
132        for day in range(length_of_stay):
133            current_date = start_date + timedelta(days=day)
134
135            # Generate multiple measurements per day
136            for measurement_num in range(measurements_per_day):
137                timestamp = current_date + timedelta(
138                    hours=measurement_num * (24 / measurements_per_day)
139                )
140
141                measurement = {
142                    "patient_id": patient["patient_id"],
143                    "timestamp": timestamp,
144                    "day_of_stay": day + 1,
145                }
146
147                # Generate vital signs
148                vitals = self._generate_vitals(conditions, day, patient["age"])
149                measurement.update(vitals)
150
151                # Generate lab values (less frequent)
152                if (
153                    measurement_num == 0 or random.random() < 0.1
154                ):  # Morning labs or random
155                    labs = self._generate_labs(conditions, day)
156                    measurement.update(labs)
157                else:
158                    # Add NaN for missing lab values
159                    for lab in self.lab_ranges.keys():
160                        measurement[lab] = np.nan
161
162                # Add some random missing values if requested
163                if include_missing:
164                    measurement = self._add_missing_values(measurement)
165
166                measurements.append(measurement)
167
168        return measurements
169
170    def _generate_vitals(self, conditions, day, age):
171        """Generate vital signs based on patient conditions and progression"""
172        vitals = {}
173
174        for vital, (base_min, base_max, base_std) in self.vital_ranges.items():
175            base_mean = (base_min + base_max) / 2
176
177            # Adjust based on conditions
178            mean_adjustment = 0
179            std_adjustment = 1
180
181            if "hypertension" in conditions:
182                if "systolic" in vital:
183                    mean_adjustment += 20
184                elif "diastolic" in vital:
185                    mean_adjustment += 10
186
187            if "heart_failure" in conditions:
188                if vital == "heart_rate":
189                    mean_adjustment += 15
190                elif vital == "respiratory_rate":
191                    mean_adjustment += 5
192                elif vital == "oxygen_saturation":
193                    mean_adjustment -= 3
194
195            if "copd" in conditions:
196                if vital == "respiratory_rate":
197                    mean_adjustment += 8
198                elif vital == "oxygen_saturation":
199                    mean_adjustment -= 5
200
201            if "infection" in conditions:
202                if vital == "temperature":
203                    mean_adjustment += np.random.normal(1.5, 0.5)
204                elif vital == "heart_rate":
205                    mean_adjustment += 20
206
207            # Age adjustments
208            if age > 70:
209                if vital == "systolic_bp":
210                    mean_adjustment += 10
211                elif vital == "heart_rate":
212                    mean_adjustment -= 5
213
214            # Day progression (recovery/deterioration)
215            day_effect = np.sin(day * 0.2) * 2  # Subtle oscillation
216
217            # Generate value
218            adjusted_mean = base_mean + mean_adjustment + day_effect
219            adjusted_std = base_std * std_adjustment
220
221            value = np.random.normal(adjusted_mean, adjusted_std)
222
223            # Apply realistic bounds
224            if vital == "temperature":
225                value = max(35.0, min(42.0, value))
226            elif vital == "oxygen_saturation":
227                value = max(70, min(100, value))
228            elif vital == "heart_rate":
229                value = max(40, min(180, value))
230            elif "bp" in vital:
231                value = max(40, min(200, value))
232            elif vital == "respiratory_rate":
233                value = max(8, min(40, value))
234
235            vitals[vital] = round(value, 1)
236
237        return vitals
238
239    def _generate_labs(self, conditions, day):
240        """Generate lab values based on conditions"""
241        labs = {}
242
243        for lab, (base_min, base_max, base_std) in self.lab_ranges.items():
244            base_mean = (base_min + base_max) / 2
245
246            # Condition-based adjustments
247            mean_adjustment = 0
248
249            if "diabetes" in conditions and lab == "glucose":
250                mean_adjustment += np.random.normal(50, 20)
251
252            if "kidney_disease" in conditions:
253                if lab == "creatinine":
254                    mean_adjustment += np.random.normal(1.0, 0.5)
255                elif lab == "potassium":
256                    mean_adjustment += np.random.normal(0.5, 0.2)
257
258            if "anemia" in conditions and lab == "hemoglobin":
259                mean_adjustment -= np.random.normal(3, 1)
260
261            if "infection" in conditions and lab == "white_blood_cells":
262                mean_adjustment += np.random.normal(5000, 2000)
263
264            # Generate value
265            adjusted_mean = base_mean + mean_adjustment
266            value = np.random.normal(adjusted_mean, base_std)
267
268            # Apply bounds
269            if lab == "glucose":
270                value = max(30, min(500, value))
271            elif lab == "creatinine":
272                value = max(0.3, min(10.0, value))
273            elif lab == "hemoglobin":
274                value = max(5.0, min(20.0, value))
275            elif lab == "white_blood_cells":
276                value = max(1000, min(50000, value))
277            elif lab == "sodium":
278                value = max(120, min(160, value))
279            elif lab == "potassium":
280                value = max(2.0, min(7.0, value))
281
282            labs[lab] = round(value, 2)
283
284        return labs
285
286    def _add_missing_values(self, measurement, missing_prob=0.05):
287        """Randomly add missing values to simulate real-world data"""
288        for key, value in measurement.items():
289            if key not in [
290                "patient_id",
291                "timestamp",
292                "day_of_stay",
293            ] and not pd.isna(value):
294                if random.random() < missing_prob:
295                    measurement[key] = np.nan
296        return measurement
297
298    def generate_outcomes(self, patients_df, timeseries_df):
299        """Generate patient outcomes based on their data"""
300        outcomes = []
301
302        for _, patient in patients_df.iterrows():
303            patient_data = timeseries_df[
304                timeseries_df["patient_id"] == patient["patient_id"]
305            ]
306
307            # Calculate outcome probability based on conditions and vital trends
308            readmission_prob = self._calculate_readmission_risk(
309                patient, patient_data
310            )
311            mortality_risk = self._calculate_mortality_risk(
312                patient, patient_data
313            )
314
315            outcome = {
316                "patient_id": patient["patient_id"],
317                "readmitted_30_days": random.random() < readmission_prob,
318                "mortality_risk_score": round(mortality_risk, 3),
319                "length_of_stay_actual": patient["length_of_stay"],
320                "discharge_disposition": self._assign_discharge_disposition(
321                    patient, mortality_risk
322                ),
323            }
324            outcomes.append(outcome)
325
326        return pd.DataFrame(outcomes)
327
328    def _calculate_readmission_risk(self, patient, patient_data):
329        """Calculate 30-day readmission risk"""
330        base_risk = 0.1  # 10% base readmission rate
331
332        # Condition-based risk
333        if "heart_failure" in patient["conditions"]:
334            base_risk += 0.15
335        if "diabetes" in patient["conditions"]:
336            base_risk += 0.08
337        if "kidney_disease" in patient["conditions"]:
338            base_risk += 0.12
339
340        # Age-based risk
341        if patient["age"] > 75:
342            base_risk += 0.1
343
344        # Vital signs instability
345        if len(patient_data) > 0:
346            hr_std = patient_data["heart_rate"].std()
347            if hr_std > 15:
348                base_risk += 0.05
349
350        return min(0.8, base_risk)
351
352    def _calculate_mortality_risk(self, patient, patient_data):
353        """Calculate mortality risk score"""
354        risk_score = 0
355
356        # Age component
357        risk_score += patient["age"] * 0.02
358
359        # Condition components
360        condition_weights = {
361            "heart_failure": 0.3,
362            "kidney_disease": 0.25,
363            "copd": 0.2,
364            "infection": 0.15,
365            "diabetes": 0.1,
366            "hypertension": 0.05,
367        }
368
369        for condition in patient["conditions"]:
370            if condition in condition_weights:
371                risk_score += condition_weights[condition]
372
373        # Vital signs component
374        if len(patient_data) > 0:
375            # Abnormal vital signs increase risk
376            avg_o2_sat = patient_data["oxygen_saturation"].mean()
377            if avg_o2_sat < 92:
378                risk_score += 0.2
379
380            avg_temp = patient_data["temperature"].mean()
381            if avg_temp > 38.5:
382                risk_score += 0.15
383
384        return min(1.0, risk_score)
385
386    def _assign_discharge_disposition(self, patient, mortality_risk):
387        """Assign discharge disposition"""
388        if mortality_risk > 0.7:
389            return random.choice(["ICU Transfer", "Deceased"])
390        elif mortality_risk > 0.4:
391            return random.choice(
392                ["Skilled Nursing Facility", "Home with Services"]
393            )
394        else:
395            return random.choice(
396                ["Home", "Home with Services", "Rehabilitation"]
397            )
398
399    def create_visualizations(self, patients, timeseries, outcomes):
400        """Create comprehensive visualizations of the healthcare data"""
401
402        # Set up the plotting style
403        plt.style.use("default")
404        sns.set_palette("husl")
405
406        # Create figure with subplots
407        fig = plt.figure(figsize=(20, 16))
408
409        # 1. Patient Demographics
410        plt.subplot(3, 4, 1)
411        patients["age"].hist(
412            bins=15, alpha=0.7, color="skyblue", edgecolor="black"
413        )
414        plt.title("Age Distribution", fontsize=12, fontweight="bold")
415        plt.xlabel("Age")
416        plt.ylabel("Frequency")
417
418        # 2. Gender Distribution
419        plt.subplot(3, 4, 2)
420        gender_counts = patients["gender"].value_counts()
421        plt.pie(
422            gender_counts.values,
423            labels=gender_counts.index,
424            autopct="%1.1f%%",
425            colors=["lightcoral", "lightblue"],
426        )
427        plt.title("Gender Distribution", fontsize=12, fontweight="bold")
428
429        # 3. Medical Conditions Frequency
430        plt.subplot(3, 4, 3)
431        all_conditions = [
432            cond for conditions in patients["conditions"] for cond in conditions
433        ]
434        condition_counts = pd.Series(all_conditions).value_counts()
435        condition_counts.plot(kind="bar", color="lightgreen", alpha=0.8)
436        plt.title(
437            "Medical Conditions Frequency", fontsize=12, fontweight="bold"
438        )
439        plt.xticks(rotation=45)
440        plt.ylabel("Count")
441
442        # 4. Length of Stay Distribution
443        plt.subplot(3, 4, 4)
444        patients["length_of_stay"].hist(
445            bins=15, alpha=0.7, color="orange", edgecolor="black"
446        )
447        plt.title("Length of Stay Distribution", fontsize=12, fontweight="bold")
448        plt.xlabel("Days")
449        plt.ylabel("Frequency")
450
451        # 5. Heart Rate Time Series for Sample Patients
452        plt.subplot(3, 4, 5)
453        sample_patients = patients["patient_id"].head(5)
454        for pid in sample_patients:
455            patient_data = timeseries[timeseries["patient_id"] == pid].copy()
456            if len(patient_data) > 0:
457                patient_data = patient_data.sort_values("timestamp")
458                plt.plot(
459                    patient_data["day_of_stay"],
460                    patient_data["heart_rate"],
461                    marker="o",
462                    markersize=3,
463                    alpha=0.7,
464                    label=pid,
465                )
466        plt.title(
467            "Heart Rate Over Time (Sample Patients)",
468            fontsize=12,
469            fontweight="bold",
470        )
471        plt.xlabel("Day of Stay")
472        plt.ylabel("Heart Rate (bpm)")
473        plt.legend(fontsize=8)
474
475        # 6. Blood Pressure Correlation
476        plt.subplot(3, 4, 6)
477        clean_bp = timeseries.dropna(subset=["systolic_bp", "diastolic_bp"])
478        plt.scatter(
479            clean_bp["systolic_bp"],
480            clean_bp["diastolic_bp"],
481            alpha=0.5,
482            s=10,
483            color="red",
484        )
485        plt.title("Blood Pressure Correlation", fontsize=12, fontweight="bold")
486        plt.xlabel("Systolic BP")
487        plt.ylabel("Diastolic BP")
488
489        # 7. Temperature vs Heart Rate
490        plt.subplot(3, 4, 7)
491        clean_temp_hr = timeseries.dropna(subset=["temperature", "heart_rate"])
492        plt.scatter(
493            clean_temp_hr["temperature"],
494            clean_temp_hr["heart_rate"],
495            alpha=0.5,
496            s=10,
497            color="purple",
498        )
499        plt.title("Temperature vs Heart Rate", fontsize=12, fontweight="bold")
500        plt.xlabel("Temperature (°C)")
501        plt.ylabel("Heart Rate (bpm)")
502
503        # 8. Vital Signs Distribution
504        plt.subplot(3, 4, 8)
505        vital_cols = ["heart_rate", "respiratory_rate", "oxygen_saturation"]
506        timeseries[vital_cols].boxplot()
507        plt.title("Vital Signs Distribution", fontsize=12, fontweight="bold")
508        plt.xticks(rotation=45)
509
510        # 9. Lab Values Over Time
511        plt.subplot(3, 4, 9)
512        sample_patient = timeseries[
513            timeseries["patient_id"] == sample_patients.iloc[0]
514        ].copy()
515        sample_patient = sample_patient.sort_values("timestamp")
516
517        # Plot glucose if available
518        glucose_data = sample_patient.dropna(subset=["glucose"])
519        if len(glucose_data) > 0:
520            plt.plot(
521                glucose_data["day_of_stay"],
522                glucose_data["glucose"],
523                "o-",
524                color="green",
525                label="Glucose",
526            )
527
528        # Plot creatinine if available
529        creat_data = sample_patient.dropna(subset=["creatinine"])
530        if len(creat_data) > 0:
531            plt.twinx()
532            plt.plot(
533                creat_data["day_of_stay"],
534                creat_data["creatinine"],
535                "o-",
536                color="blue",
537                label="Creatinine",
538            )
539            plt.ylabel("Creatinine (mg/dL)", color="blue")
540
541        plt.title(
542            f"Lab Values - {sample_patients.iloc[0]}",
543            fontsize=12,
544            fontweight="bold",
545        )
546        plt.xlabel("Day of Stay")
547        plt.ylabel("Glucose (mg/dL)", color="green")
548
549        # 10. Readmission Risk by Age Group
550        plt.subplot(3, 4, 10)
551        merged_data = patients.merge(outcomes, on="patient_id")
552        merged_data["age_group"] = pd.cut(
553            merged_data["age"],
554            bins=[0, 40, 60, 80, 100],
555            labels=["<40", "40-60", "60-80", "80+"],
556        )
557        readmission_by_age = merged_data.groupby("age_group")[
558            "readmitted_30_days"
559        ].mean()
560        readmission_by_age.plot(kind="bar", color="salmon", alpha=0.8)
561        plt.title(
562            "30-Day Readmission Rate by Age", fontsize=12, fontweight="bold"
563        )
564        plt.ylabel("Readmission Rate")
565        plt.xticks(rotation=0)
566
567        # 11. Mortality Risk Distribution
568        plt.subplot(3, 4, 11)
569        outcomes["mortality_risk_score"].hist(
570            bins=20, alpha=0.7, color="darkred", edgecolor="black"
571        )
572        plt.title(
573            "Mortality Risk Score Distribution", fontsize=12, fontweight="bold"
574        )
575        plt.xlabel("Risk Score")
576        plt.ylabel("Frequency")
577
578        # 12. Missing Data Heatmap
579        plt.subplot(3, 4, 12)
580        # Calculate missing data percentage for each column
581        missing_data = timeseries.isnull().sum() / len(timeseries) * 100
582        missing_data = missing_data[missing_data > 0].sort_values(
583            ascending=False
584        )
585
586        if len(missing_data) > 0:
587            missing_data.plot(kind="bar", color="gray", alpha=0.8)
588            plt.title("Missing Data Percentage", fontsize=12, fontweight="bold")
589            plt.ylabel("Missing %")
590            plt.xticks(rotation=45)
591        else:
592            plt.text(
593                0.5,
594                0.5,
595                "No Missing Data",
596                ha="center",
597                va="center",
598                transform=plt.gca().transAxes,
599                fontsize=14,
600            )
601            plt.title("Missing Data Percentage", fontsize=12, fontweight="bold")
602
603        plt.tight_layout()
604        plt.savefig(
605            "healthcare_data_visualization.png", dpi=300, bbox_inches="tight"
606        )
607        print("Visualization saved as: healthcare_data_visualization.png")
608        plt.show()
609        # Create additional detailed plots
610        self.create_detailed_plots(patients, timeseries)
611
612    def create_detailed_plots(self, patients, timeseries):
613        """Create additional detailed visualizations"""
614
615        # Time Series Plot for Multiple Vital Signs
616        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
617
618        # Select a patient with longer stay for better visualization
619        long_stay_patients = patients[patients["length_of_stay"] >= 7][
620            "patient_id"
621        ].head(3)
622
623        vital_signs = [
624            "heart_rate",
625            "systolic_bp",
626            "temperature",
627            "oxygen_saturation",
628        ]
629        colors = ["red", "blue", "orange", "green"]
630
631        for i, vital in enumerate(vital_signs):
632            ax = axes[i // 2, i % 2]
633
634            for j, pid in enumerate(long_stay_patients):
635                patient_data = timeseries[
636                    timeseries["patient_id"] == pid
637                ].copy()
638                patient_data = patient_data.sort_values("timestamp")
639                clean_data = patient_data.dropna(subset=[vital])
640
641                if len(clean_data) > 0:
642                    ax.plot(
643                        clean_data["day_of_stay"],
644                        clean_data[vital],
645                        marker="o",
646                        label=pid,
647                        alpha=0.7,
648                        linewidth=2,
649                    )
650
651            ax.set_title(
652                f'{vital.replace("_", " ").title()} Over Time',
653                fontweight="bold",
654            )
655            ax.set_xlabel("Day of Stay")
656            ax.set_ylabel(vital.replace("_", " ").title())
657            ax.legend()
658            ax.grid(True, alpha=0.3)
659
660        plt.tight_layout()
661        plt.savefig(
662            "detailed_vitals_timeseries.png", dpi=300, bbox_inches="tight"
663        )
664        print(
665            "Detailed vital signs plot saved as: detailed_vitals_timeseries.png"
666        )
667        plt.show()
668
669        # Correlation Heatmap
670        plt.figure(figsize=(12, 10))
671        numeric_cols = [
672            "heart_rate",
673            "systolic_bp",
674            "diastolic_bp",
675            "temperature",
676            "respiratory_rate",
677            "oxygen_saturation",
678            "glucose",
679            "creatinine",
680            "hemoglobin",
681            "white_blood_cells",
682            "sodium",
683            "potassium",
684        ]
685
686        correlation_matrix = timeseries[numeric_cols].corr()
687
688        sns.heatmap(
689            correlation_matrix,
690            annot=True,
691            cmap="coolwarm",
692            center=0,
693            square=True,
694            fmt=".2f",
695            cbar_kws={"shrink": 0.8},
696        )
697        plt.title(
698            "Healthcare Parameters Correlation Matrix",
699            fontsize=16,
700            fontweight="bold",
701        )
702        plt.tight_layout()
703        plt.savefig("correlation_heatmap.png", dpi=300, bbox_inches="tight")
704        print("Correlation heatmap saved as: correlation_heatmap.png")
705        plt.show()
def generate_synthetic_returns( n_days=2520, mu=0.0002, kappa=0.05, theta=0.0001, sigma_v=0.01, rho=-0.7, lambda_jump=0.05, jump_size_dist='normal', sigma_jump=0.02, noise_dist='normal', noise_scale=0.0005, noise_df=3.0, regime_params=None, random_seed=None):
  8def generate_synthetic_returns(
  9    n_days=252 * 10,  # ~10 years of daily data
 10    mu=0.0002,  # Daily drift
 11    kappa=0.05,  # Vol mean reversion
 12    theta=0.0001,  # Long-run variance
 13    sigma_v=0.01,  # Vol of vol
 14    rho=-0.7,  # Leverage effect
 15    lambda_jump=0.05,  # Jump intensity (per day)
 16    jump_size_dist="normal",  # "normal", "log_normal", or "exponential"
 17    sigma_jump=0.02,  # Jump size (scale parameter)
 18    noise_dist="normal",  # "normal" or "student_t"
 19    noise_scale=0.0005,  # Microstructure noise scale
 20    noise_df=3.0,  # Degrees of freedom for Student’s t
 21    regime_params=None,  # Regime switching params
 22    random_seed=None,  # Reproducibility
 23):
 24    """
 25    Generates synthetic stock returns with:
 26    - Stochastic volatility (Heston-like)
 27    - Jumps (Poisson-driven, with configurable distribution)
 28    - Regime switching (Markov)
 29    - Leverage effect
 30    - Fat tails (via jumps and noise)
 31    - Microstructure noise (Gaussian or Student’s t)
 32
 33    Args:
 34        jump_size_dist: Jump size distribution ("normal", "log_normal", "exponential").
 35        noise_dist: Microstructure noise distribution ("normal", "student_t").
 36        noise_df: Degrees of freedom for Student’s t noise (if used).
 37    """
 38    if random_seed is not None:
 39        np.random.seed(random_seed)
 40
 41    # Default regime switching (2 regimes: calm and turbulent)
 42    if regime_params is None:
 43        regime_params = {
 44            "transition_matrix": np.array([[0.99, 0.01], [0.03, 0.97]]),
 45            "theta_high_multiplier": 3.0,
 46            "kappa_high_multiplier": 2.0,
 47        }
 48
 49    # Initialize
 50    v = np.zeros(n_days)  # Variance
 51    r = np.zeros(n_days)  # Returns
 52    regime = np.zeros(n_days, dtype=int)
 53    v[0] = theta
 54
 55    # Simulate regime switching (Markov chain)
 56    for t in range(1, n_days):
 57        regime[t] = np.random.choice(
 58            [0, 1], p=regime_params["transition_matrix"][regime[t - 1]]
 59        )
 60
 61    # Simulate returns and volatility
 62    for t in range(1, n_days):
 63        # Regime-dependent params
 64        if regime[t] == 1:  # High-vol regime
 65            theta_t = theta * regime_params["theta_high_multiplier"]
 66            kappa_t = kappa * regime_params["kappa_high_multiplier"]
 67        else:
 68            theta_t = theta
 69            kappa_t = kappa
 70
 71        # Volatility process (Euler discretization)
 72        eta = np.random.normal()
 73        epsilon = rho * eta + np.sqrt(1 - rho**2) * np.random.normal()
 74        dv = kappa_t * (theta_t - v[t - 1]) + sigma_v * np.sqrt(v[t - 1]) * eta
 75        v[t] = max(v[t - 1] + dv, 1e-6)  # Ensure positivity
 76
 77        # Jumps (with configurable distribution)
 78        if np.random.poisson(lambda_jump) > 0:
 79            if jump_size_dist == "normal":
 80                J = np.random.normal(0, sigma_jump)
 81            elif jump_size_dist == "log_normal":
 82                J = (
 83                    np.exp(np.random.normal(0, sigma_jump)) - 1
 84                )  # Log-normal (positive skew)
 85            elif jump_size_dist == "exponential":
 86                J = np.random.exponential(sigma_jump) * np.sign(
 87                    np.random.uniform(-1, 1)
 88                )  # Double-sided
 89            else:
 90                raise ValueError(
 91                    "Invalid jump_size_dist. Use 'normal', 'log_normal', or 'exponential'."
 92                )
 93        else:
 94            J = 0
 95
 96        # Returns
 97        r[t] = mu + np.sqrt(v[t - 1]) * epsilon + J
 98
 99    # Microstructure noise (Gaussian or Student’s t)
100    if noise_dist == "normal":
101        r += np.random.normal(0, noise_scale, n_days)
102    elif noise_dist == "student_t":
103        r += (
104            np.random.standard_t(noise_df, n_days)
105            * noise_scale
106            / np.sqrt(noise_df / (noise_df - 2))
107        )
108    else:
109        raise ValueError("Invalid noise_dist. Use 'normal' or 'student_t'.")
110
111    # Create DataFrame
112    df = pd.DataFrame(
113        {"returns": r, "volatility": np.sqrt(v), "regime": regime},
114        index=pd.date_range(start="1970-01-01", periods=n_days),
115    )
116
117    return df

Generates synthetic stock returns with:

  • Stochastic volatility (Heston-like)
  • Jumps (Poisson-driven, with configurable distribution)
  • Regime switching (Markov)
  • Leverage effect
  • Fat tails (via jumps and noise)
  • Microstructure noise (Gaussian or Student’s t)

Args:

  • jump_size_dist: Jump size distribution ("normal", "log_normal", "exponential").
  • noise_dist: Microstructure noise distribution ("normal", "student_t").
  • noise_df: Degrees of freedom for Student’s t noise (if used).

def plot_synthetic_returns(df, title='Synthetic Stock Returns Analysis', figsize=(14, 10)):
def plot_synthetic_returns(
    df, title="Synthetic Stock Returns Analysis", figsize=(14, 10)
):
    """Plot a six-panel diagnostic figure for simulated returns.

    Panels (3 rows x 2 columns):

    1. Daily returns, with moves larger than three standard deviations
       highlighted in red.
    2. Volatility, shaded where ``regime == 1`` when a ``regime`` column is
       present.
    3. Regime indicator (calm vs. turbulent), drawn when a ``regime`` column
       is present.
    4. Normal QQ plot of the returns.
    5. Histogram of the returns with a fitted normal density and an
       annotation showing kurtosis and skewness.
    6. Autocorrelation function of the squared returns.

    The figure is displayed with ``plt.show()``; nothing is returned or
    saved. Note that this function changes global plotting state: it calls
    ``sns.set_style("darkgrid")`` and sets ``plt.rcParams["figure.dpi"]``.

    Args:
        df (pd.DataFrame): Output from generate_synthetic_returns.
            Must have: 'returns', 'volatility'. 'regime' is used when
            present.
        title (str): Title for the plot
        figsize (tuple): Figure size
    """
    # Optional heavy dependencies are imported up front so that a missing
    # package is reported before any figure is created.
    from scipy import stats
    from statsmodels.graphics.tsaplots import plot_acf

    # Set style
    sns.set_style("darkgrid")
    plt.rcParams["figure.dpi"] = 100

    returns = df["returns"]
    volatility = df["volatility"]
    has_regime = "regime" in df.columns

    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(3, 2, height_ratios=[1, 1, 1], hspace=0.4, wspace=0.3)

    # -------------------------
    # 1. Returns Over Time
    # -------------------------
    ax1 = fig.add_subplot(gs[0, 0])
    ax1.plot(df.index, returns, lw=0.8, color="tab:blue", alpha=0.9)
    ax1.set_title("Daily Returns")
    ax1.set_ylabel("Return")
    ax1.axhline(0, color="gray", linestyle="--", lw=0.8)

    # Highlight large moves: absolute return above three standard deviations.
    threshold = returns.std() * 3
    jumps = df[np.abs(returns) > threshold]
    if not jumps.empty:
        ax1.scatter(
            jumps.index,
            jumps["returns"],
            color="red",
            s=10,
            zorder=5,
            label="Large Moves",
        )
        ax1.legend()

    # -------------------------
    # 2. Volatility
    # -------------------------
    ax2 = fig.add_subplot(gs[0, 1])
    ax2.plot(df.index, volatility, lw=1.2, color="tab:orange")
    ax2.set_title("Volatility (Latent)")
    ax2.set_ylabel("Volatility")

    # Shade turbulent regimes. The full index is passed together with a
    # ``where`` mask so that shading stops at the end of each turbulent
    # episode; passing only the turbulent rows would join separate episodes
    # and shade the calm periods between them.
    if has_regime:
        turbulent_mask = (df["regime"] == 1).to_numpy()
        if turbulent_mask.any():
            ax2.fill_between(
                df.index,
                volatility[turbulent_mask].min(),
                volatility,
                where=turbulent_mask,
                color="red",
                alpha=0.2,
                label="Turbulent Regime",
            )
            ax2.legend()

    # -------------------------
    # 3. Regime Plot
    # -------------------------
    ax3 = fig.add_subplot(gs[1, 0])
    if has_regime:
        regime_styles = (
            (0, "green", "Calm Regime"),
            (1, "red", "Turbulent Regime"),
        )
        for state, shade, label in regime_styles:
            ax3.fill_between(
                df.index,
                0,
                1,
                where=(df["regime"] == state),
                interpolate=True,
                color=shade,
                alpha=0.3,
                label=label,
            )
        ax3.legend(loc="upper right")
    ax3.set_ylim(0, 1)
    ax3.set_yticks([])
    ax3.set_title("Regime Switching (Hidden State)")

    # -------------------------
    # 4. Return Distribution (QQ plot)
    # -------------------------
    ax4 = fig.add_subplot(gs[1, 1])
    stats.probplot(returns, dist="norm", plot=ax4)
    ax4.set_title("QQ Plot (Fat Tails Detection)")
    # probplot draws two lines: the sample points, then the fitted line.
    sample_points, fitted_line = ax4.get_lines()[:2]
    sample_points.set_marker(".")
    sample_points.set_markersize(4)
    fitted_line.set_color("red")
    fitted_line.set_linewidth(1.5)

    # -------------------------
    # 5. Histogram with Normal Fit
    # -------------------------
    ax5 = fig.add_subplot(gs[2, 0])
    mu_norm, std_norm = stats.norm.fit(returns)
    sns.histplot(
        returns,
        bins=50,
        kde=False,
        stat="density",
        ax=ax5,
        alpha=0.7,
        color="skyblue",
    )
    xmin, xmax = ax5.get_xlim()
    x = np.linspace(xmin, xmax, 100)
    ax5.plot(
        x,
        stats.norm.pdf(x, mu_norm, std_norm),
        "k--",
        linewidth=1.5,
        label="Normal Fit",
    )
    ax5.set_title("Return Distribution")
    ax5.set_xlabel("Return")
    ax5.legend()

    # Annotate kurtosis and skew
    kurt = returns.kurtosis()
    skew = returns.skew()
    ax5.text(
        0.02,
        0.9,
        f"Kurtosis: {kurt:.2f}\nSkewness: {skew:.2f}",
        transform=ax5.transAxes,
        fontsize=10,
        bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.8),
    )

    # -------------------------
    # 6. Autocorrelation of squared returns
    # -------------------------
    ax6 = fig.add_subplot(gs[2, 1])
    # Never request more lags than the series can provide.
    n_lags = min(40, max(len(df) - 1, 1))
    plot_acf(
        returns**2,
        ax=ax6,
        lags=n_lags,
        title="ACF of Squared Returns",
        alpha=0.05,
    )
    ax6.set_xlabel("Lag (Days)")
    ax6.set_ylabel("Autocorrelation")

    # Add title at top
    fig.suptitle(title, fontsize=14, fontweight="bold", y=0.98)

    # Adjust layout
    plt.tight_layout()
    plt.show()

Plot synthetic stock returns with multiple panels:

  • Returns over time
  • Volatility (sqrt variance)
  • Regime indicators
  • Distribution vs. normal (QQ plot and histogram)
  • Autocorrelation of squared returns

Args:

  • df (pd.DataFrame): Output from generate_synthetic_returns. Must have: 'returns', 'volatility', 'regime'.
  • title (str): Title for the plot.
  • figsize (tuple): Figure size.