mlsauce
# Public API of the package.
#
# Each submodule is imported inside its own try/except so that one failing
# import only prints a message instead of breaking the import of the whole
# package; the names provided by that submodule are then simply left undefined.
try:
    from .adaopt import AdaOpt
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .booster import (
        LSBoostClassifier,
        LSBoostRegressor,
        GenericBoostingClassifier,
        GenericBoostingRegressor,
    )
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .lazybooster import (
        LazyBoostingClassifier,
        LazyBoostingRegressor,
        LazyBoostingMTS,
    )
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .multitaskregressor import MultiTaskRegressor
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .datasets import download
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .elasticnet import ElasticNetRegressor
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .kernelridge import KRLSRegressor
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .lasso import LassoRegressor
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .ridge import RidgeRegressor
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .stump import StumpClassifier
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .isotonicregression import IsotonicRegressor
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .fpca import GenericFunctionalForecaster
except ImportError as e:
    print(f"Could not import some modules: {e}")

try:
    from .generators import (
        make_diverse_classification,
        HealthcareTimeSeriesGenerator,
        generate_synthetic_returns,
        plot_synthetic_returns,
    )
except ImportError as e:
    print(f"Could not import generators: {e}")

try:
    from .catencoder import RankTargetEncoder
except ImportError as e:
    print(f"Could not import RankTargetEncoder: {e}")

try:
    from .rollingoriginregression import RollingOriginForecaster
except ImportError as e:
    print(f"Could not import RollingOriginForecaster: {e}")
# from .encoders import corrtarget_encoder

try:
    from .penalizedcv import penalized_cross_val_score
except ImportError as e:
    print(f"Could not import penalized_cross_val_score: {e}")

try:
    from .conformalbayesian import ConformalBayesianRegressor
except ImportError as e:
    print(f"Could not import ConformalBayesianRegressor: {e}")

try:
    from .conformalbayesian import ConformalBayesianClassifier
except ImportError as e:
    print(f"Could not import ConformalBayesianClassifier: {e}")

try:
    from .contextawaretheta import ContextAwareThetaForecaster
except ImportError as e:
    print(f"Could not import ContextAwareThetaForecaster: {e}")


# Names the package would like to export.  Some of them are not bound by any
# import above (e.g. "LSTMRegressor", "get_config"), and any guarded import may
# have failed, so this is only a list of *candidates*.
_PUBLIC_NAMES = [
    "AdaOpt",
    "ConformalBayesianRegressor",
    "ConformalBayesianClassifier",
    "ContextAwareThetaForecaster",
    "LSBoostClassifier",
    "GenericBoostingClassifier",
    "GenericBoostingRegressor",
    "StumpClassifier",
    "ElasticNetRegressor",
    "KRLSRegressor",
    "LassoRegressor",
    "LSBoostRegressor",
    "LSTMRegressor",
    "RidgeRegressor",
    "LazyBoostingClassifier",
    "LazyBoostingMTS",
    "LazyBoostingRegressor",
    "MultiTaskRegressor",
    "IsotonicRegressor",
    "GenericFunctionalForecaster",
    "RankTargetEncoder",
    "RollingOriginForecaster",
    # Other imports
    # "corrtarget_encoder",
    "download",
    # Non-modules:
    "get_config",
    "set_config",
    "config_context",
    "penalized_cross_val_score",
    "make_diverse_classification",
    "HealthcareTimeSeriesGenerator",
    "generate_synthetic_returns",
    "plot_synthetic_returns",
]

# `from <package> import *` raises AttributeError for every entry of `__all__`
# that is not an attribute of the module, so only advertise the names that
# were actually bound above.
__all__ = [name for name in _PUBLIC_NAMES if name in globals()]
class AdaOpt(BaseEstimator, ClassifierMixin):
    """AdaOpt classifier.

    Attributes:

        n_iterations: int
            number of iterations of the optimizer at training time.

        learning_rate: float
            controls the speed of the optimizer at training time.

        reg_lambda: float
            L2 regularization parameter for successive errors in the optimizer
            (at training time).

        reg_alpha: float
            L1 regularization parameter for successive errors in the optimizer
            (at training time).

        eta: float
            controls the slope in gradient descent (at training time).

        gamma: float
            controls the step size in gradient descent (at training time).

        k: int
            number of nearest neighbors selected at test time for classification.

        tolerance: float
            controls early stopping in gradient descent (at training time).

        n_clusters: int
            number of clusters, if MiniBatch k-means is used at test time
            (for faster prediction).

        batch_size: int
            size of the batch, if MiniBatch k-means is used at test time
            (for faster prediction).

        row_sample: float
            percentage of rows chosen from training set (by stratified subsampling,
            for faster prediction).

        type_dist: str
            distance used for finding the nearest neighbors; one of `euclidean-f`
            (euclidean distances calculated as whole), `euclidean` (euclidean
            distances calculated row by row), `manhattan`, `cosine`.
            `euclidean-f` is only available when `n_jobs` is None.

        n_jobs: int
            number of cpus for parallel processing (default: None)

        verbose: int
            progress bar for parallel processing (yes = 1) or not (no = 0)

        cache: boolean
            if the nearest neighbors are cached or not, for faster retrieval in
            subsequent calls.

        n_clusters_input: int
            number of clusters (a priori) for clustering the features

        clustering_method: str
            clustering method: currently 'kmeans', 'gmm'

        cluster_scaling: str
            scaling method for clustering: currently 'standard', 'robust', 'minmax'

        backend: str
            backend for parallel processing: "cpu" or "gpu" or "tpu"

        seed: int
            reproducibility seed, used for row subsampling and clustering and
            forwarded to the prediction routine.

    """

    def __init__(
        self,
        n_iterations=50,
        learning_rate=0.3,
        reg_lambda=0.1,
        reg_alpha=0.5,
        eta=0.01,
        gamma=0.01,
        k=3,
        tolerance=0,
        n_clusters=0,
        batch_size=100,
        row_sample=0.8,
        type_dist="euclidean-f",
        n_jobs=None,
        verbose=0,
        cache=True,
        n_clusters_input=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        backend="cpu",
        seed=123,
    ):
        # The clustering options only matter when feature clustering is on.
        if n_clusters_input > 0:
            assert clustering_method in (
                "kmeans",
                "gmm",
            ), "`clustering_method` must be in ('kmeans', 'gmm')"
            assert cluster_scaling in (
                "standard",
                "robust",
                "minmax",
            ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')"

        assert type_dist in (
            "euclidean",
            "manhattan",
            "euclidean-f",
            "cosine",
        ), "must have: `type_dist` in ('euclidean', 'manhattan', 'euclidean-f', 'cosine') "

        self.n_iterations = n_iterations
        self.learning_rate = learning_rate
        self.reg_lambda = reg_lambda
        self.reg_alpha = reg_alpha
        self.eta = eta
        self.gamma = gamma
        self.k = k
        self.tolerance = tolerance
        self.n_clusters = n_clusters
        self.batch_size = batch_size
        self.row_sample = row_sample
        self.type_dist = type_dist
        self.n_jobs = n_jobs
        self.cache = cache
        self.verbose = verbose
        self.n_clusters_input = n_clusters_input
        self.clustering_method = clustering_method
        self.cluster_scaling = cluster_scaling
        # Filled in by `fit` when `n_clusters_input > 0`.
        self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None
        self.backend = backend
        self.seed = seed

    def fit(self, X, y, **kwargs):
        """Fit AdaOpt to training data (X, y)

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: array-like, shape = [n_samples]
                Target values (cast to int64 before training).

            **kwargs: accepted for API compatibility; not used here.

        Returns:

            self: object.

        """
        # Coerce once so that lists / DataFrames / Series behave like arrays
        # in `.copy()`, `X[idx, :]` and positional `y[idx]` below.
        X = np.asarray(X)
        y = np.asarray(y)

        if self.n_clusters_input > 0:
            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
                cluster(
                    X,
                    n_clusters=self.n_clusters_input,
                    method=self.clustering_method,
                    type_scaling=self.cluster_scaling,
                    training=True,
                    seed=self.seed,
                )
            )
            # Append the cluster encoding to the original features.
            X = np.column_stack((X.copy(), clustered_X))

        if self.row_sample < 1:
            index_subsample = subsample(
                y, row_sample=self.row_sample, seed=self.seed
            )
            y_ = y[index_subsample]
            X_ = X[index_subsample, :]
        else:
            # No defensive copy needed: `.astype` below always returns a
            # fresh array, so the caller's data is never handed to the fitter.
            y_ = y
            X_ = X

        n, p = X_.shape

        n_classes = len(np.unique(y_))

        assert n == len(y_), "must have X.shape[0] == len(y)"

        res = adaoptc.fit_adaopt(
            X=np.asarray(X_).astype(np.float64),
            y=np.asarray(y_).astype(np.int64),
            n_iterations=self.n_iterations,
            n_X=n,
            p_X=p,
            n_classes=n_classes,
            learning_rate=self.learning_rate,
            reg_lambda=self.reg_lambda,
            reg_alpha=self.reg_alpha,
            eta=self.eta,
            gamma=self.gamma,
            tolerance=self.tolerance,
            backend=self.backend,
        )

        self.probs_training = res["probs"]
        self.training_accuracy = res["training_accuracy"]
        self.alphas = res["alphas"]
        # NOTE: overwrites the constructor argument with the value reported
        # by the fitter (kept as-is for backward compatibility).
        self.n_iterations = res["n_iterations"]
        self.scaled_X_train = np.array(res["scaled_X_train"], dtype=np.float64)
        self.n_classes_ = len(np.unique(y))  # for compatibility with sklearn
        return self

    def predict(self, X, **kwargs):
        """Predict test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors, where n_samples is the number
                of samples and n_features is the number of features.

            **kwargs: additional parameters to be passed to `predict_proba`

        Returns:

            model predictions: {array-like}; for each row, the column index
            of the largest predicted probability.

        """

        return np.argmax(self.predict_proba(X, **kwargs), axis=1)

    def predict_proba(self, X, **kwargs):
        """Predict probabilities for test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors, where n_samples is the number
                of samples and n_features is the number of features.

            **kwargs: accepted for API compatibility; not used here.

        Returns:

            probability estimates for test data: {array-like}

        """
        X = np.asarray(X)

        n_train, p_train = self.scaled_X_train.shape

        if self.n_clusters_input > 0:
            # Same feature augmentation as in `fit`, with the fitted objects.
            X = np.column_stack(
                (
                    X.copy(),
                    cluster(
                        X,
                        training=False,
                        scaler=self.scaler_,
                        label_encoder=self.label_encoder_,
                        clusterer=self.clusterer_,
                        seed=self.seed,
                    ),
                )
            )

        n_test = X.shape[0]

        if self.n_jobs is None:
            return adaoptc.predict_proba_adaopt(
                X_test=np.asarray(X, order="C").astype(np.float64),
                scaled_X_train=np.asarray(
                    self.scaled_X_train, order="C"
                ).astype(np.float64),
                n_test=n_test,
                n_train=n_train,
                probs_train=self.probs_training,
                k=self.k,
                n_clusters=self.n_clusters,
                batch_size=self.batch_size,
                type_dist=self.type_dist,
                cache=self.cache,
                seed=self.seed,
                backend=self.backend,
            )

        # parallel: self.n_jobs is not None
        assert self.type_dist in (
            "euclidean",
            "manhattan",
            "cosine",
        ), "must have: `self.type_dist` in ('euclidean', 'manhattan', 'cosine') "

        distance_funcs = {
            "euclidean": adaoptc.distance_to_mat_euclidean2,
            "manhattan": adaoptc.distance_to_mat_manhattan2,
            "cosine": adaoptc.distance_to_mat_cosine2,
        }
        distance_func = distance_funcs[self.type_dist]

        # Convert both matrices ONCE.  Doing the float64/C-order conversion
        # inside the per-row task copies the full test and training matrices
        # for every single test row.
        # NOTE(review): sharing the arrays across tasks assumes the distance
        # kernels only read their first two arguments -- confirm in adaoptc.
        scaled_X_test = np.ascontiguousarray(
            X / norm(X, ord=2, axis=1)[:, None], dtype=np.float64
        )
        scaled_X_train = np.ascontiguousarray(
            self.scaled_X_train, dtype=np.float64
        )

        def predict_row(i):
            # Distances from test row i to every training row.
            dists_test_i = distance_func(
                scaled_X_test[i, :],
                scaled_X_train,
                np.zeros(n_train),
                n_train,
                p_train,
            )

            kmin_test_i = adaoptc.find_kmin_x(
                dists_test_i, n_x=n_train, k=self.k, cache=self.cache
            )

            weights_test_i = adaoptc.calculate_weights(kmin_test_i[0])

            probs_test_i = adaoptc.calculate_probs(
                kmin_test_i[1], self.probs_training
            )

            return adaoptc.average_probs(
                probs=probs_test_i, weights=weights_test_i
            )

        task = delayed(wrap_non_picklable_objects(predict_row))

        rows = range(n_test)
        if self.verbose == 1:
            rows = tqdm(rows)

        res = Parallel(n_jobs=self.n_jobs, prefer="threads")(
            task(m) for m in rows
        )

        return np.asarray(res)
AdaOpt classifier.
Attributes:
n_iterations: int
number of iterations of the optimizer at training time.
learning_rate: float
controls the speed of the optimizer at training time.
reg_lambda: float
L2 regularization parameter for successive errors in the optimizer
(at training time).
reg_alpha: float
L1 regularization parameter for successive errors in the optimizer
(at training time).
eta: float
controls the slope in gradient descent (at training time).
gamma: float
controls the step size in gradient descent (at training time).
k: int
number of nearest neighbors selected at test time for classification.
tolerance: float
controls early stopping in gradient descent (at training time).
n_clusters: int
number of clusters, if MiniBatch k-means is used at test time
(for faster prediction).
batch_size: int
size of the batch, if MiniBatch k-means is used at test time
(for faster prediction).
row_sample: float
percentage of rows chosen from training set (by stratified subsampling,
for faster prediction).
type_dist: str
distance used for finding the nearest neighbors; currently `euclidean-f`
(euclidean distances calculated as whole), `euclidean` (euclidean distances
calculated row by row), `manhattan` (manhattan distance), `cosine` (cosine distance).
n_jobs: int
number of cpus for parallel processing (default: None)
verbose: int
progress bar for parallel processing (yes = 1) or not (no = 0)
cache: boolean
if the nearest neighbors are cached or not, for faster retrieval in
subsequent calls.
n_clusters_input: int
number of clusters (a priori) for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
backend: str
backend for parallel processing: "cpu" or "gpu" or "tpu"
seed: int
reproducibility seed for row subsampling and clustering (also passed to the prediction routine).
def fit(self, X, y, **kwargs):
    """Fit AdaOpt to training data (X, y)

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number
            of samples and n_features is the number of features.

        y: array-like, shape = [n_samples]
            Target values (cast to int64 before training).

        **kwargs: accepted for API compatibility; not used here.

    Returns:

        self: object.

    """
    # Coerce once so that lists / DataFrames / Series behave like arrays
    # in `.copy()`, `X[idx, :]` and positional `y[idx]` below.
    X = np.asarray(X)
    y = np.asarray(y)

    if self.n_clusters_input > 0:
        clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
            cluster(
                X,
                n_clusters=self.n_clusters_input,
                method=self.clustering_method,
                type_scaling=self.cluster_scaling,
                training=True,
                seed=self.seed,
            )
        )
        # Append the cluster encoding to the original features.
        X = np.column_stack((X.copy(), clustered_X))

    if self.row_sample < 1:
        index_subsample = subsample(
            y, row_sample=self.row_sample, seed=self.seed
        )
        y_ = y[index_subsample]
        X_ = X[index_subsample, :]
    else:
        # No defensive copy needed: `.astype` below always returns a fresh
        # array, so the caller's data is never handed to the fitter.
        y_ = y
        X_ = X

    n, p = X_.shape

    n_classes = len(np.unique(y_))

    assert n == len(y_), "must have X.shape[0] == len(y)"

    res = adaoptc.fit_adaopt(
        X=np.asarray(X_).astype(np.float64),
        y=np.asarray(y_).astype(np.int64),
        n_iterations=self.n_iterations,
        n_X=n,
        p_X=p,
        n_classes=n_classes,
        learning_rate=self.learning_rate,
        reg_lambda=self.reg_lambda,
        reg_alpha=self.reg_alpha,
        eta=self.eta,
        gamma=self.gamma,
        tolerance=self.tolerance,
        backend=self.backend,
    )

    self.probs_training = res["probs"]
    self.training_accuracy = res["training_accuracy"]
    self.alphas = res["alphas"]
    # NOTE: overwrites the constructor argument with the value reported by
    # the fitter (kept as-is for backward compatibility).
    self.n_iterations = res["n_iterations"]
    self.scaled_X_train = np.array(res["scaled_X_train"], dtype=np.float64)
    self.n_classes_ = len(np.unique(y))  # for compatibility with sklearn
    return self
Fit AdaOpt to training data (X, y)
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
**kwargs: additional keyword arguments (accepted, but not used by the code shown here).
Returns:
self: object.
def predict(self, X, **kwargs):
    """Return the predicted class index for each row of X.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Vectors to classify, where n_samples is the number
            of samples and n_features is the number of features.

        **kwargs: additional parameters to be passed to `predict_proba`

    Returns:

        model predictions: {array-like}; for each row, the column index of
        the largest probability returned by `predict_proba`.

    """
    probabilities = self.predict_proba(X, **kwargs)
    # Row-wise arg-max over the class axis.
    return np.asarray(probabilities).argmax(axis=1)
Predict test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test vectors, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to `predict_proba`
Returns:
model predictions: {array-like}
def predict_proba(self, X, **kwargs):
    """Predict probabilities for test data X.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Test vectors, where n_samples is the number
            of samples and n_features is the number of features.

        **kwargs: accepted for API compatibility; not used here.

    Returns:

        probability estimates for test data: {array-like}

    """
    # Coerce once so lists / DataFrames support `.copy()` and `.shape`.
    X = np.asarray(X)

    n_train, p_train = self.scaled_X_train.shape

    if self.n_clusters_input > 0:
        # Same feature augmentation as at training time, using the objects
        # stored on the instance.
        X = np.column_stack(
            (
                X.copy(),
                cluster(
                    X,
                    training=False,
                    scaler=self.scaler_,
                    label_encoder=self.label_encoder_,
                    clusterer=self.clusterer_,
                    seed=self.seed,
                ),
            )
        )

    n_test = X.shape[0]

    if self.n_jobs is None:
        return adaoptc.predict_proba_adaopt(
            X_test=np.asarray(X, order="C").astype(np.float64),
            scaled_X_train=np.asarray(
                self.scaled_X_train, order="C"
            ).astype(np.float64),
            n_test=n_test,
            n_train=n_train,
            probs_train=self.probs_training,
            k=self.k,
            n_clusters=self.n_clusters,
            batch_size=self.batch_size,
            type_dist=self.type_dist,
            cache=self.cache,
            seed=self.seed,
            backend=self.backend,
        )

    # parallel: self.n_jobs is not None
    assert self.type_dist in (
        "euclidean",
        "manhattan",
        "cosine",
    ), "must have: `self.type_dist` in ('euclidean', 'manhattan', 'cosine') "

    distance_funcs = {
        "euclidean": adaoptc.distance_to_mat_euclidean2,
        "manhattan": adaoptc.distance_to_mat_manhattan2,
        "cosine": adaoptc.distance_to_mat_cosine2,
    }
    distance_func = distance_funcs[self.type_dist]

    # Convert both matrices ONCE.  Doing the float64/C-order conversion
    # inside the per-row task copies the full test and training matrices for
    # every single test row.
    # NOTE(review): sharing the arrays across tasks assumes the distance
    # kernels only read their first two arguments -- confirm in adaoptc.
    scaled_X_test = np.ascontiguousarray(
        X / norm(X, ord=2, axis=1)[:, None], dtype=np.float64
    )
    scaled_X_train = np.ascontiguousarray(
        self.scaled_X_train, dtype=np.float64
    )

    def predict_row(i):
        # Distances from test row i to every training row.
        dists_test_i = distance_func(
            scaled_X_test[i, :],
            scaled_X_train,
            np.zeros(n_train),
            n_train,
            p_train,
        )

        kmin_test_i = adaoptc.find_kmin_x(
            dists_test_i, n_x=n_train, k=self.k, cache=self.cache
        )

        weights_test_i = adaoptc.calculate_weights(kmin_test_i[0])

        probs_test_i = adaoptc.calculate_probs(
            kmin_test_i[1], self.probs_training
        )

        return adaoptc.average_probs(
            probs=probs_test_i, weights=weights_test_i
        )

    task = delayed(wrap_non_picklable_objects(predict_row))

    rows = range(n_test)
    if self.verbose == 1:
        rows = tqdm(rows)

    res = Parallel(n_jobs=self.n_jobs, prefer="threads")(
        task(m) for m in rows
    )

    return np.asarray(res)
Predict probabilities for test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test vectors, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional keyword arguments (accepted, but not used by the code shown here).
Returns:
probability estimates for test data: {array-like}
class ConformalBayesianRegressor(BaseEstimator, RegressorMixin):
    """Ensemble regressor with split-conformal prediction intervals.

    `fit` holds out a calibration subset, trains `n_samples` copies of the
    base estimator (one per sampled hyperparameter configuration) on the
    rest, and stores the calibration residuals of the ensemble median.
    `predict` returns the ensemble median and, on request, an interval built
    from those residuals.

    Parameters:

        obj: estimator instance or estimator class
            Base regressor; a class is instantiated with its defaults.

        level: int or float
            Target coverage of the prediction interval, in percent.

        hyperparameter_bounds: dict or None
            Maps a parameter name to either a two-element list
            `[low, high]` (sampled uniformly; as an integer in
            `[low, high]` when both bounds are integers) or a fixed value.

        n_samples: int
            Number of ensemble members.

        calibration_fraction: float
            Fraction of the data held out for calibration.

        scaling_method: str
            Feature scaling; only "standard" is supported.

        random_state: int, RandomState or None
            Seed for shuffling, clustering, splitting and hyperparameter
            sampling.

        show_progress, verbose: bool
            A progress bar is displayed only when both are true.

        n_jobs: int
            Number of joblib workers used to train the ensemble.
    """

    def __init__(
        self,
        obj=Ridge,
        level=95,
        hyperparameter_bounds=None,
        n_samples=20,
        calibration_fraction=0.2,
        scaling_method="standard",
        random_state=None,
        show_progress=False,
        verbose=True,
        n_jobs=-1,
    ):
        self.obj = obj
        self.level = level
        # Kept for backward compatibility; `predict` recomputes it from
        # `level` so that `set_params(level=...)` is honoured.
        self.alpha_ = 1 - self.level / 100
        self.hyperparameter_bounds = hyperparameter_bounds
        self.n_samples = n_samples
        self.calibration_fraction = calibration_fraction
        self.scaling_method = scaling_method
        self.random_state = random_state
        self.verbose = verbose
        self.show_progress = show_progress
        self.n_jobs = n_jobs

        self.is_fitted_ = False

    def _sample_hyperparameters(self):
        """Return `n_samples` hyperparameter dicts drawn from the bounds."""
        # Seeded generator, so a fixed `random_state` gives a reproducible
        # ensemble (with `random_state=None` this is numpy's global RNG).
        rng = check_random_state(self.random_state)
        bounds = self.hyperparameter_bounds or {}
        configs = []
        for _ in range(self.n_samples):
            cfg = {}
            for name, spec in bounds.items():
                if isinstance(spec, list) and len(spec) == 2:
                    low, high = spec
                    if isinstance(low, (int, np.integer)) and isinstance(
                        high, (int, np.integer)
                    ):
                        # Inclusive integer range; truncating a uniform
                        # float could never produce the upper bound.
                        cfg[name] = int(rng.randint(low, high + 1))
                    else:
                        cfg[name] = rng.uniform(low, high)
                else:
                    # Anything else is a fixed value.
                    cfg[name] = spec
            configs.append(cfg)
        return configs

    def _train_models_parallel(self, X, y, configs):
        """Fit one independent copy of the base estimator per config."""
        # Local import so this method does not depend on the module header.
        from sklearn.base import clone

        # `obj` may be a class (the default) or an instance.
        template = self.obj() if isinstance(self.obj, type) else self.obj

        def train_model(cfg):
            # A fresh clone per config: re-fitting one shared estimator
            # would make every ensemble member the same object.
            model = clone(template)
            model.set_params(**cfg)
            model.fit(X, y)
            return model

        show_bar = bool(self.show_progress) and bool(self.verbose)
        return joblib.Parallel(n_jobs=self.n_jobs)(
            joblib.delayed(train_model)(cfg)
            for cfg in tqdm(configs, disable=not show_bar)
        )

    def _predict_models(self, models, X):
        """Return an (n_rows, n_models) matrix of member predictions."""
        show_bar = bool(self.show_progress) and bool(self.verbose)
        preds = [m.predict(X) for m in tqdm(models, disable=not show_bar)]
        return np.column_stack(preds)

    def fit(self, X, y):
        """Train the ensemble and store the calibration residuals.

        Args:

            X: array-like, shape = [n_samples, n_features]
                Training vectors.

            y: array-like, shape = [n_samples]
                Target values.

        Returns:

            self: object.

        Raises:

            ValueError: if `scaling_method` is not "standard".
        """
        X = np.asarray(X)
        y = np.asarray(y)
        rng = check_random_state(self.random_state)
        X, y = shuffle(X, y, random_state=rng)
        # 1. Cluster with GMM (full covariance); at least one component so
        #    that data sets with fewer than 30 rows do not ask for zero.
        n_clusters = max(1, min(10, X.shape[0] // 30))
        gmm = GaussianMixture(
            n_components=n_clusters,
            covariance_type="full",
            random_state=self.random_state,
        )
        clusters = gmm.fit_predict(X)
        # 2. Train/calibration split, stratified by cluster when possible:
        #    a stratified split needs at least two rows per stratum.
        counts = np.bincount(clusters)
        counts = counts[counts > 0]
        stratify = clusters if counts.min() >= 2 else None
        X_train, X_calib, y_train, y_calib = train_test_split(
            X,
            y,
            test_size=self.calibration_fraction,
            random_state=self.random_state,
            stratify=stratify,
        )
        # 3. Scale features
        if self.scaling_method == "standard":
            scaler = StandardScaler().fit(X_train)
        else:
            raise ValueError("Scaling method must be 'standard'")
        self.scaler_ = scaler
        X_train_s = scaler.transform(X_train)
        X_calib_s = scaler.transform(X_calib)
        # 4. Train ensemble
        configs = self._sample_hyperparameters()
        self.models_ = self._train_models_parallel(X_train_s, y_train, configs)
        # 5. Calibration residuals of the ensemble median
        preds_calib = self._predict_models(self.models_, X_calib_s)
        self.calibration_residuals_ = y_calib - np.median(preds_calib, axis=1)
        self.is_fitted_ = True
        return self

    def predict(self, X, return_pi=False):
        """Obtain predictions and prediction intervals

        Args:

            X: array-like, shape = [n_samples, n_features];
                Testing set vectors, where n_samples is the number
                of samples and n_features is the number of features.

            return_pi: boolean
                Whether the prediction interval is returned or not.
                Default is False, for compatibility with other _estimators_.
                If True, a named tuple `(mean, lower, upper)` is returned.

        Returns:

            The ensemble median, or the named tuple described above.

        Raises:

            RuntimeError: if the model has not been fitted.
        """
        if not self.is_fitted_:
            raise RuntimeError("Fit the model first")
        X_s = self.scaler_.transform(X)
        preds = self._predict_models(self.models_, X_s)
        self.mean_ = np.median(preds, axis=1)
        if not return_pi:
            return self.mean_
        DescribeResult = namedtuple(
            "DescribeResult", ("mean", "lower", "upper")
        )
        # Split-conformal half-width: the k-th smallest absolute calibration
        # residual with k = ceil((n + 1) * coverage), capped at n (i.e. the
        # largest residual when the calibration set is too small).
        coverage = self.level / 100
        self.alpha_ = 1 - coverage
        scores = np.sort(np.abs(np.asarray(self.calibration_residuals_)))
        n_calib = scores.shape[0]
        # The small epsilon guards against float noise in the product.
        k = int(np.ceil((n_calib + 1) * coverage - 1e-9))
        k = min(max(k, 1), n_calib)
        q = scores[k - 1]
        return DescribeResult(self.mean_, self.mean_ - q, self.mean_ + q)

    def get_coverage(self, y_true, lower, upper):
        """Return the fraction of `y_true` lying inside `[lower, upper]`."""
        y_true = np.asarray(y_true)
        inside = (y_true >= np.asarray(lower)) & (y_true <= np.asarray(upper))
        return np.mean(inside)
Base class for all estimators in scikit-learn.
Inheriting from this class provides default implementations of:
- setting and getting parameters used by
`GridSearchCV` and friends;
- textual and HTML representation displayed in terminals and IDEs;
- estimator serialization;
- parameters validation;
- data validation;
- feature names validation.
Read more in the :ref:`User Guide <rolling_your_own_estimator>`.
Notes
All estimators should specify all the parameters that can be set
at the class level in their __init__ as explicit keyword
arguments (no *args or **kwargs).
Examples
>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
... def __init__(self, *, param=1):
... self.param = param
... def fit(self, X, y=None):
... self.is_fitted_ = True
... return self
... def predict(self, X):
... return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])
def fit(self, X, y):
    """Train the ensemble and store the calibration residuals.

    Args:

        X: array-like, shape = [n_samples, n_features]
            Training vectors.

        y: array-like, shape = [n_samples]
            Target values.

    Returns:

        self: object.

    Raises:

        ValueError: if `scaling_method` is not "standard".
    """
    X = np.asarray(X)
    y = np.asarray(y)
    rng = check_random_state(self.random_state)
    X, y = shuffle(X, y, random_state=rng)
    # 1. Cluster with GMM (full covariance); at least one component so that
    #    data sets with fewer than 30 rows do not ask for zero components.
    n_clusters = max(1, min(10, X.shape[0] // 30))
    gmm = GaussianMixture(
        n_components=n_clusters,
        covariance_type="full",
        random_state=self.random_state,
    )
    clusters = gmm.fit_predict(X)
    # 2. Train/calibration split, stratified by cluster when possible: a
    #    stratified split needs at least two rows per stratum.
    counts = np.bincount(clusters)
    counts = counts[counts > 0]
    stratify = clusters if counts.min() >= 2 else None
    X_train, X_calib, y_train, y_calib = train_test_split(
        X,
        y,
        test_size=self.calibration_fraction,
        random_state=self.random_state,
        stratify=stratify,
    )
    # 3. Scale features
    if self.scaling_method == "standard":
        scaler = StandardScaler().fit(X_train)
    else:
        raise ValueError("Scaling method must be 'standard'")
    self.scaler_ = scaler
    X_train_s = scaler.transform(X_train)
    X_calib_s = scaler.transform(X_calib)
    # 4. Train ensemble
    configs = self._sample_hyperparameters()
    self.models_ = self._train_models_parallel(X_train_s, y_train, configs)
    # 5. Calibration residuals of the ensemble median
    preds_calib = self._predict_models(self.models_, X_calib_s)
    self.calibration_residuals_ = y_calib - np.median(preds_calib, axis=1)
    self.is_fitted_ = True
    return self
def predict(self, X, return_pi=False):
    """Obtain predictions and prediction intervals

    Args:

        X: array-like, shape = [n_samples, n_features];
            Testing set vectors, where n_samples is the number
            of samples and n_features is the number of features.

        return_pi: boolean
            Whether the prediction interval is returned or not.
            Default is False, for compatibility with other _estimators_.
            If True, a named tuple `(mean, lower, upper)` is returned.

    Returns:

        The ensemble median, or the named tuple described above.

    Raises:

        RuntimeError: if the model has not been fitted.
    """
    if not self.is_fitted_:
        raise RuntimeError("Fit the model first")
    X_s = self.scaler_.transform(X)
    preds = self._predict_models(self.models_, X_s)
    self.mean_ = np.median(preds, axis=1)
    if not return_pi:
        return self.mean_
    DescribeResult = namedtuple(
        "DescribeResult", ("mean", "lower", "upper")
    )
    # Split-conformal half-width: the k-th smallest absolute calibration
    # residual with k = ceil((n + 1) * coverage), capped at n (i.e. the
    # largest residual when the calibration set is too small).
    # Recomputed from `level` so that `set_params(level=...)` is honoured.
    coverage = self.level / 100
    self.alpha_ = 1 - coverage
    scores = np.sort(np.abs(np.asarray(self.calibration_residuals_)))
    n_calib = scores.shape[0]
    # The small epsilon guards against float noise in the product.
    k = int(np.ceil((n_calib + 1) * coverage - 1e-9))
    k = min(max(k, 1), n_calib)
    q = scores[k - 1]
    return DescribeResult(self.mean_, self.mean_ - q, self.mean_ + q)
Obtain predictions and prediction intervals
Args:
X: array-like, shape = [n_samples, n_features];
Testing set vectors, where n_samples is the number
of samples and n_features is the number of features.
return_pi: boolean
Whether the prediction interval is returned or not.
Default is False, for compatibility with other _estimators_.
If True, a tuple containing the predictions + lower and upper
bounds is returned.
class ConformalBayesianClassifier(BaseEstimator, ClassifierMixin):

    # construct the object -----
    _estimator_type = "classifier"

    def __init__(
        self,
        obj=Ridge(),
        hyperparameter_bounds=None,
        n_samples=20,
        calibration_fraction=0.2,
        scaling_method="standard",
        random_state=None,
        verbose=True,
        n_jobs=-1,
        calibrated=False,
        calibration_method="sigmoid",
        calibration_cv=3,
        use_classifier=True,
    ):
        """
        Conformal Bayesian Classifier with optional probability calibration.

        Parameters
        ----------
        obj : estimator object
            Base estimator. Must be a classifier when use_classifier=True
            (``fit`` raises ValueError otherwise, which includes the default
            ``Ridge()``); a regressor when use_classifier=False.
        hyperparameter_bounds : dict, optional
            Bounds for hyperparameter sampling. A two-element list is read
            as [low, high] (integer-valued when both ends are integers);
            any other value is passed through unchanged.
        n_samples : int
            Number of ensemble models to train
        calibration_fraction : float
            Fraction of data to use for conformal calibration
        scaling_method : str
            Feature scaling method (default: "standard")
        random_state : int, optional
            Random seed for reproducibility (data split and hyperparameter
            sampling)
        verbose : bool
            Whether to show progress bars
        n_jobs : int
            Number of parallel jobs (-1 for all cores)
        calibrated : bool
            Whether to apply probability calibration (default: False)
        calibration_method : str
            Method for calibration: "sigmoid" or "isotonic" (default: "sigmoid")
        calibration_cv : int
            Number of CV folds for calibration (default: 3). Stored but not
            used by ``fit``, which always calibrates with cv="prefit".
        use_classifier : bool
            If True, use sklearn-like classifier with probability averaging.
            If False, use regression-based approach with
            SimpleMultitaskClassifier (default: True)
        """
        # NOTE(review): scikit-learn's get_params()/clone() look up every
        # __init__ argument as an attribute of the same name; `obj` is kept
        # as `obj_base`, so those helpers presumably fail on this class -
        # confirm before relying on clone() or grid search.
        self.obj_base = obj
        self.hyperparameter_bounds = hyperparameter_bounds
        self.n_samples = n_samples
        self.calibration_fraction = calibration_fraction
        self.scaling_method = scaling_method
        self.random_state = random_state
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.calibrated = calibrated
        self.calibration_method = calibration_method
        self.calibration_cv = calibration_cv
        self.use_classifier = use_classifier

        self.is_fitted_ = False

    def _sample_hyperparameters(self):
        """Sample one hyperparameter configuration per ensemble member.

        Draws come from a generator derived from ``random_state`` so that a
        seeded estimator produces the same configurations on every call.

        Returns
        -------
        configs : list of dict
            ``n_samples`` dictionaries suitable for ``set_params``.
        """
        rng = check_random_state(self.random_state)
        configs = []

        for _ in range(self.n_samples):
            cfg = {}
            if self.hyperparameter_bounds:
                for k, v in self.hyperparameter_bounds.items():
                    if isinstance(v, list) and len(v) == 2:
                        # If both bounds are integers, assume integer parameter
                        if isinstance(v[0], (int, np.integer)) and isinstance(
                            v[1], (int, np.integer)
                        ):
                            # randint excludes the upper end, hence the +1
                            cfg[k] = rng.randint(v[0], v[1] + 1)
                        else:
                            cfg[k] = rng.uniform(v[0], v[1])
                    else:
                        # Assume fixed value
                        cfg[k] = v
            configs.append(cfg)
        return configs

    def _train_classifiers_parallel(self, X, y, configs):
        """Train one clone of the base classifier per configuration, in parallel."""

        def train_classifier(cfg):
            clf = clone(self.obj_base)
            clf.set_params(**cfg)
            clf.fit(X, y)
            return clf

        # The progress bar only wraps the iterable; the work is the same.
        if self.verbose:
            iterator = tqdm(configs, desc="Training classifiers")
        else:
            iterator = configs
        models = joblib.Parallel(n_jobs=self.n_jobs)(
            joblib.delayed(train_classifier)(cfg) for cfg in iterator
        )
        return models

    def fit(self, X, y):
        """
        Fit the conformal Bayesian classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data
        y : array-like of shape (n_samples,)
            Target values

        Returns
        -------
        self : object
            Fitted classifier

        Raises
        ------
        ValueError
            If use_classifier=True and ``obj`` is not a classifier, or if
            ``scaling_method`` is not "standard" (use_classifier=True path).
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Store classes
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)

        if self.use_classifier:
            # Simpler approach: use sklearn-like classifier with probability averaging
            if not is_classifier(self.obj_base):
                raise ValueError(
                    "use_classifier=True requires obj to be a classifier "
                    "(e.g., LogisticRegression, RandomForestClassifier)"
                )

            rng = check_random_state(self.random_state)
            X, y = shuffle(X, y, random_state=rng)

            # Stratified train/calibration split
            X_train, X_calib, y_train, y_calib = train_test_split(
                X,
                y,
                test_size=self.calibration_fraction,
                random_state=self.random_state,
                stratify=y,
            )

            # Scale features (statistics come from the training part only)
            if self.scaling_method == "standard":
                self.scaler_ = StandardScaler().fit(X_train)
            else:
                raise ValueError("Scaling method must be 'standard'")

            X_train_s = self.scaler_.transform(X_train)
            X_calib_s = self.scaler_.transform(X_calib)

            # Train ensemble of classifiers
            configs = self._sample_hyperparameters()
            self.classifiers_ = self._train_classifiers_parallel(
                X_train_s, y_train, configs
            )

            # Store calibration data for potential conformal prediction
            self.X_calib_ = X_calib_s
            self.y_calib_ = y_calib

            if self.calibrated:
                # Calibrate each already-fitted classifier on the held-out part
                if self.verbose:
                    print("Calibrating classifiers...")

                self.calibrated_classifiers_ = []
                for clf in tqdm(
                    self.classifiers_,
                    disable=not self.verbose,
                    desc="Calibrating",
                ):
                    cal_clf = CalibratedClassifierCV(
                        clf,
                        method=self.calibration_method,
                        cv="prefit",
                        ensemble=False,
                    )
                    cal_clf.fit(X_calib_s, y_calib)
                    self.calibrated_classifiers_.append(cal_clf)

        else:
            # Original approach: use regression-based with SimpleMultitaskClassifier
            base_regressor = ConformalBayesianRegressor(
                self.obj_base,
                hyperparameter_bounds=self.hyperparameter_bounds,
                n_samples=self.n_samples,
                calibration_fraction=self.calibration_fraction,
                scaling_method=self.scaling_method,
                random_state=self.random_state,
                verbose=self.verbose,
                n_jobs=self.n_jobs,
            )

            # Wrap in multitask classifier
            self.obj = ns.SimpleMultitaskClassifier(base_regressor)

            if self.calibrated:
                # Fit the base classifier first
                self.obj.fit(X, y)

                # Then wrap with calibration
                self.calibrated_obj_ = CalibratedClassifierCV(
                    self.obj,
                    method=self.calibration_method,
                    cv="prefit",
                    ensemble=False,
                )

                # Fit calibration on the same data
                self.calibrated_obj_.fit(X, y)
            else:
                self.obj.fit(X, y)

        self.is_fitted_ = True
        return self

    def predict_proba(self, X):
        """
        Predict class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data

        Returns
        -------
        proba : array of shape (n_samples, n_classes)
            Predicted probabilities for each class

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        if not self.is_fitted_:
            raise RuntimeError("Fit the model first")

        X = np.asarray(X)

        if self.use_classifier:
            # Average probabilities from ensemble
            X_s = self.scaler_.transform(X)

            if self.calibrated:
                # Use calibrated classifiers
                probas = np.array(
                    [
                        clf.predict_proba(X_s)
                        for clf in self.calibrated_classifiers_
                    ]
                )
            else:
                # Use uncalibrated classifiers
                probas = np.array(
                    [clf.predict_proba(X_s) for clf in self.classifiers_]
                )

            # Average probabilities across ensemble
            mean_proba = np.mean(probas, axis=0)

            return mean_proba
        else:
            # Use regression-based approach
            if self.calibrated:
                return self.calibrated_obj_.predict_proba(X)
            else:
                return self.obj.predict_proba(X)

    def predict(self, X):
        """
        Predict class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data

        Returns
        -------
        y_pred : array of shape (n_samples,)
            Predicted class labels
        """
        proba = self.predict_proba(X)
        return self.classes_[np.argmax(proba, axis=1)]
Base class for all estimators in scikit-learn.
Inheriting from this class provides default implementations of:
- setting and getting parameters used by `GridSearchCV` and friends;
- textual and HTML representation displayed in terminals and IDEs;
- estimator serialization;
- parameters validation;
- data validation;
- feature names validation.
Read more in the :ref:`User Guide <rolling_your_own_estimator>`.
Notes
All estimators should specify all the parameters that can be set
at the class level in their __init__ as explicit keyword
arguments (no *args or **kwargs).
Examples
>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
... def __init__(self, *, param=1):
... self.param = param
... def fit(self, X, y=None):
... self.is_fitted_ = True
... return self
... def predict(self, X):
... return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])
def fit(self, X, y):
    """
    Fit the conformal Bayesian classifier.

    With ``use_classifier=True`` an ensemble of classifier clones is trained
    on a scaled, stratified training split, and optionally calibrated on the
    held-out split. Otherwise a regression-based multitask classifier is
    fitted, and optionally calibrated on the same data.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data
    y : array-like of shape (n_samples,)
        Target values

    Returns
    -------
    self : object
        Fitted classifier
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # Remember the label set seen during training
    self.classes_ = np.unique(y)
    self.n_classes_ = len(self.classes_)

    if not self.use_classifier:
        # Regression-based route through SimpleMultitaskClassifier
        regressor = ConformalBayesianRegressor(
            self.obj_base,
            hyperparameter_bounds=self.hyperparameter_bounds,
            n_samples=self.n_samples,
            calibration_fraction=self.calibration_fraction,
            scaling_method=self.scaling_method,
            random_state=self.random_state,
            verbose=self.verbose,
            n_jobs=self.n_jobs,
        )
        self.obj = ns.SimpleMultitaskClassifier(regressor)
        self.obj.fit(X, y)

        if self.calibrated:
            # Calibration wraps the already-fitted model and reuses X, y
            self.calibrated_obj_ = CalibratedClassifierCV(
                self.obj,
                method=self.calibration_method,
                cv="prefit",
                ensemble=False,
            )
            self.calibrated_obj_.fit(X, y)

        self.is_fitted_ = True
        return self

    # Classifier route: probability averaging over an ensemble
    if not is_classifier(self.obj_base):
        raise ValueError(
            "use_classifier=True requires obj to be a classifier "
            "(e.g., LogisticRegression, RandomForestClassifier)"
        )

    generator = check_random_state(self.random_state)
    X, y = shuffle(X, y, random_state=generator)

    # Stratified split into a training part and a calibration part
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X,
        y,
        test_size=self.calibration_fraction,
        random_state=self.random_state,
        stratify=y,
    )

    # Only standard scaling is supported; it is learned on the training part
    if self.scaling_method != "standard":
        raise ValueError("Scaling method must be 'standard'")
    self.scaler_ = StandardScaler().fit(X_fit)

    X_fit_scaled = self.scaler_.transform(X_fit)
    X_hold_scaled = self.scaler_.transform(X_hold)

    # One classifier per sampled hyperparameter configuration
    sampled = self._sample_hyperparameters()
    self.classifiers_ = self._train_classifiers_parallel(
        X_fit_scaled, y_fit, sampled
    )

    # Keep the calibration split for potential conformal prediction
    self.X_calib_ = X_hold_scaled
    self.y_calib_ = y_hold

    if self.calibrated:
        if self.verbose:
            print("Calibrating classifiers...")

        wrapped_members = self.calibrated_classifiers_ = []
        progress = tqdm(
            self.classifiers_,
            disable=not self.verbose,
            desc="Calibrating",
        )
        for member in progress:
            wrapped = CalibratedClassifierCV(
                member,
                method=self.calibration_method,
                cv="prefit",
                ensemble=False,
            )
            wrapped.fit(X_hold_scaled, y_hold)
            wrapped_members.append(wrapped)

    self.is_fitted_ = True
    return self
Fit the conformal Bayesian classifier.
Parameters
X : array-like of shape (n_samples, n_features): Training data.
y : array-like of shape (n_samples,): Target values.
Returns
self : object Fitted classifier
def predict_proba(self, X):
    """
    Predict class probabilities.

    In classifier mode the probabilities of all ensemble members (the
    calibrated ones when ``calibrated`` is set) are averaged; otherwise the
    call is delegated to the regression-based model.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Test data

    Returns
    -------
    proba : array of shape (n_samples, n_classes)
        Predicted probabilities for each class
    """
    if not self.is_fitted_:
        raise RuntimeError("Fit the model first")

    features = np.asarray(X)

    if not self.use_classifier:
        # Regression-based approach: a single fitted object answers
        if self.calibrated:
            return self.calibrated_obj_.predict_proba(features)
        return self.obj.predict_proba(features)

    scaled = self.scaler_.transform(features)
    if self.calibrated:
        members = self.calibrated_classifiers_
    else:
        members = self.classifiers_

    # Stack to (n_members, n_samples, n_classes), then average over members
    stacked = np.array([member.predict_proba(scaled) for member in members])
    return np.mean(stacked, axis=0)
Predict class probabilities.
Parameters
X : array-like of shape (n_samples, n_features) Test data
Returns
proba : array of shape (n_samples, n_classes) Predicted probabilities for each class
def predict(self, X):
    """
    Predict class labels.

    Each sample receives the entry of ``classes_`` whose predicted
    probability is largest.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Test data

    Returns
    -------
    y_pred : array of shape (n_samples,)
        Predicted class labels
    """
    class_scores = self.predict_proba(X)
    winners = np.argmax(class_scores, axis=1)
    return self.classes_[winners]
Predict class labels.
Parameters
X : array-like of shape (n_samples, n_features) Test data
Returns
y_pred : array of shape (n_samples,) Predicted class labels
class ContextAwareThetaForecaster:
    """
    Unified Theta Method with multiple estimation modes.

    Variants:
    - 'classic': Standard Theta (SES + drift)
    - 'cox': Context-aware with Cox partial likelihood
    - 'ridge': Context-aware with Ridge regression
    - 'ml': ML-enhanced with sklearn estimator
    - 'rslope': R-style slopes via numerical differentiation (context-free)

    Parameters
    ----------
    mode : {'classic', 'cox', 'ridge', 'ml', 'rslope'}
        Estimation mode
    theta : float, default=0.5
        Drift intensity (0=no drift, 0.5=classical, 1=full)
    estimator : sklearn estimator, optional
        For 'ml' and 'rslope' modes
    tau : float, default=12
        Temporal attention decay
    sigma_val : float, optional
        Value-based kernel bandwidth
    kernel : {'temporal', 'value', 'hybrid'}
        Attention kernel type
    seasonal_period : int, optional
        Seasonal period (auto-detected if None)
    risk_set_size : int, default=15
        Risk set size for Cox PL
    stability_factor : float, default=0.8
        Gamma clipping safety factor (0,1]
    random_state : int, optional
        Random seed for ML mode
    """

    def __init__(
        self,
        mode: Literal["classic", "cox", "ridge", "ml", "rslope"] = "cox",
        theta: float = 0.5,
        estimator: Optional[Any] = None,
        tau: float = 12.0,
        sigma_val: Optional[float] = None,
        kernel: Literal["temporal", "value", "hybrid"] = "temporal",
        seasonal_period: Optional[int] = None,
        risk_set_size: int = 15,
        stability_factor: float = 0.8,
        random_state: Optional[int] = None,
    ):
        self.mode = mode
        self.theta = theta
        self.estimator = estimator
        self.tau = tau
        self.sigma_val = sigma_val
        self.kernel = kernel
        self.seasonal_period = seasonal_period
        self.risk_set_size = risk_set_size
        self.stability_factor = stability_factor
        self.random_state = random_state

        # Fitted params
        self.alpha_ = None
        self.l_n_ = None
        self.b0_ = None
        self.gamma_ = None
        self.mu_z_ = None
        self.sigma_z_ = None
        self.sigma2_ = None
        self.seasonal_indices_ = None
        self._y_train = None
        self._fitted = False

    # ============ SEASONAL DECOMPOSITION ============
    def _decompose(self, y: np.ndarray, period: int):
        """Multiplicative seasonal decomposition.

        Returns ``(adjusted, seasonal_full, trend)``; when the series is
        shorter than two periods it is returned unchanged with unit seasonal
        factors.
        """
        n = len(y)
        if n < 2 * period:
            return y, np.ones(n), y

        # Centered MA trend
        trend = np.full(n, np.nan)
        half = period // 2
        for i in range(half, n - half):
            window = y[i - half : i + half + (period % 2)]
            trend[i] = np.mean(window)

        # Fill edges with the nearest computed trend value
        valid = np.where(~np.isnan(trend))[0]
        if len(valid) > 0:
            trend[: valid[0]] = trend[valid[0]]
            trend[valid[-1] + 1 :] = trend[valid[-1]]

        # Seasonal indices: mean detrended value per phase, normalised
        detrended = y / (trend + 1e-10)
        seasonal = np.zeros(period)
        for i in range(period):
            seasonal[i] = np.nanmean(detrended[i::period])
        seasonal /= seasonal.mean() + 1e-10

        seasonal_full = np.tile(seasonal, n // period + 1)[:n]
        adjusted = y / (seasonal_full + 1e-10)

        return adjusted, seasonal_full, trend

    def _deseasonalized_train(self) -> np.ndarray:
        """Return the training series with the fitted seasonal factors removed.

        ``seasonal_indices_`` holds the factors of the *last* period of the
        training sample, so its element k belongs to time ``n - p + k``.
        Time t therefore maps to index ``(t - n) mod p``; indexing from 0
        would apply the wrong phase whenever n is not a multiple of p.
        """
        y = self._y_train
        if self.seasonal_indices_ is None:
            return y
        n = len(y)
        p = len(self.seasonal_indices_)
        phase = (np.arange(n) - n) % p
        # Same epsilon as _decompose so the result matches what fit() used
        return y / (self.seasonal_indices_[phase] + 1e-10)

    # ============ SES ============
    def _ses_level(self, y: np.ndarray, alpha: float) -> np.ndarray:
        """Compute SES level array"""
        level = np.zeros(len(y))
        level[0] = y[0]
        for t in range(1, len(y)):
            level[t] = alpha * y[t] + (1 - alpha) * level[t - 1]
        return level

    def _ses_nll(self, alpha: float, y: np.ndarray) -> float:
        """SES negative log-likelihood"""
        if alpha <= 0 or alpha >= 1:
            return 1e10
        level = self._ses_level(y, alpha)
        resid = y[1:] - level[:-1]
        sigma2 = np.var(resid) + 1e-10
        return 0.5 * len(resid) * (np.log(2 * np.pi * sigma2) + 1)

    def _fit_ses(self, y: np.ndarray):
        """Estimate alpha via MLE; returns (alpha, last level, level array)"""
        res = minimize(
            lambda a: self._ses_nll(a[0], y),
            [0.3],
            bounds=[(0.01, 0.99)],
            method="L-BFGS-B",
        )
        alpha = res.x[0]
        level_array = self._ses_level(y, alpha)
        return alpha, level_array[-1], level_array

    # ============ DRIFT ============
    def _estimate_drift(self, y: np.ndarray) -> float:
        """Baseline drift: b0 = beta_OLS / 2"""
        t = np.arange(len(y))
        beta = np.sum((t - t.mean()) * (y - y.mean())) / (
            np.sum((t - t.mean()) ** 2) + 1e-10
        )
        return beta / 2.0

    # ============ ATTENTION CONTEXT ============
    def _attention_kernel(self, Xj: float, Xt: float, j: int, t: int) -> float:
        """Compute kernel weight"""
        if self.kernel == "temporal":
            return np.exp(-(t - j) / (self.tau + 1e-12))
        elif self.kernel == "value":
            sigma = self.sigma_val if self.sigma_val else 1.0
            return np.exp(-((Xj - Xt) ** 2) / (2 * sigma**2 + 1e-12))
        else:  # hybrid
            sigma = self.sigma_val if self.sigma_val else 1.0
            return np.exp(
                -(t - j) / (self.tau + 1e-12)
                - ((Xj - Xt) ** 2) / (2 * sigma**2 + 1e-12)
            )

    def _compute_context(self, y: np.ndarray) -> np.ndarray:
        """Compute attention-weighted context z_t"""
        n = len(y)
        z = np.zeros(n)
        for t in range(n):
            weights = np.array(
                [self._attention_kernel(y[j], y[t], j, t) for j in range(t + 1)]
            )
            weights /= weights.sum() + 1e-12
            z[t] = np.dot(weights, y[: t + 1])
        return z

    # ============ GAMMA ESTIMATION ============
    def _partial_nll(self, gamma: float, z_star: np.ndarray) -> float:
        """Cox partial negative log-likelihood (stable)"""
        n = len(z_star)
        k = min(self.risk_set_size, n // 2)
        nll = 0.0
        for t in range(k, n):
            z_risk = z_star[max(0, t - k) : t + 1]
            nll -= gamma * z_star[t] - logsumexp(gamma * z_risk)
        return nll

    def _estimate_gamma_cox(self, z_star: np.ndarray) -> float:
        """Estimate gamma via Cox PL"""
        res = minimize(
            lambda g: self._partial_nll(g[0], z_star), [0.0], method="BFGS"
        )
        return res.x[0]

    def _estimate_gamma_ridge(
        self,
        y: np.ndarray,
        z_star: np.ndarray,
        level_array: np.ndarray,
        alpha: float,
        b0: float,
    ) -> float:
        """Estimate gamma via Ridge regression (falls back to Cox without sklearn)"""
        try:
            from sklearn.linear_model import Ridge
        except ImportError:
            warnings.warn("sklearn unavailable, using Cox method")
            return self._estimate_gamma_cox(z_star)

        n = len(y)
        residuals = y - level_array

        # Build design matrix (the first 20 observations are skipped)
        X_design = []
        y_resid = []
        for t in range(20, n):
            D_h = self._D_n(1, alpha, t)
            x_t = 0.5 * b0 * z_star[t - 1] * D_h
            X_design.append(x_t)
            y_resid.append(residuals[t])

        X_design = np.array(X_design).reshape(-1, 1)
        y_resid = np.array(y_resid)

        ridge = Ridge(alpha=10.0, fit_intercept=False)
        ridge.fit(X_design, y_resid)
        return ridge.coef_[0]

    def _numerical_slopes(self, y: np.ndarray, h: int) -> np.ndarray:
        """Fit the estimator on time + noise features and differentiate in time.

        Returns the central-difference slope of the fitted model with
        respect to time at each of the ``len(y) + h`` points (training
        sample followed by forecast horizon), expressed per time step.
        """
        if self.estimator is None:
            from sklearn.linear_model import LinearRegression

            self.estimator = LinearRegression()

        from sklearn.preprocessing import StandardScaler

        n = len(y)
        # A private generator yields the same draws as seeding the global
        # one, without resetting NumPy's global state for the caller.
        if self.random_state is not None:
            rng = np.random.RandomState(self.random_state)
        else:
            rng = np.random

        # Features: normalised time + random noise covariates
        time_idx = np.arange(n + h)
        time_norm = time_idx / n
        n_random = 3
        random_cov = rng.randn(n + h, n_random)
        X_all = np.column_stack([time_norm, random_cov])

        # Scale using training rows only
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_all[:n])

        # Fit
        self.estimator.fit(X_train, y)

        # Numerical differentiation for all points (historical + forecast)
        eps = 1e-4 ** (1 / 3)
        h_eps = np.maximum(eps * np.abs(time_norm), eps / n)

        t_plus = np.clip(time_norm + h_eps, 0, 2.0)
        t_minus = np.clip(time_norm - h_eps, 0, 2.0)

        X_plus = scaler.transform(np.column_stack([t_plus, random_cov]))
        X_minus = scaler.transform(np.column_stack([t_minus, random_cov]))

        fx_plus = self.estimator.predict(X_plus)
        fx_minus = self.estimator.predict(X_minus)

        # Dividing by n converts d/d(time_norm) into a per-step slope
        return (fx_plus - fx_minus) / (2 * h_eps) / n

    def _estimate_gamma_ml(self, y: np.ndarray, h: int) -> float:
        """Estimate gamma via ML numerical differentiation"""
        slopes = self._numerical_slopes(y, h)

        # Approximate gamma from slopes
        return self.theta * slopes.mean() / (0.5 * self.b0_ + 1e-12)

    def _estimate_slopes_rslope(self, y: np.ndarray, h: int) -> np.ndarray:
        """
        R-style slope estimation via numerical differentiation.
        Returns slopes for forecast horizon (not gamma).
        Similar to estimate_theta_slope() in R code.
        """
        slopes = self._numerical_slopes(y, h)

        # Return ONLY the forecast horizon slopes (last h values)
        return slopes[-h:] * self.theta

    # ============ DRIFT MULTIPLIER ============
    def _D_n(self, h: int, alpha: float, n: int) -> float:
        """Drift multiplier D_n(h)"""
        return (h - 1) + (1 - (1 - alpha) ** n) / (alpha + 1e-12)

    # ============ FIT ============
    def fit(self, y: np.ndarray):
        """Fit the model to a univariate series and return ``self``."""
        y = np.asarray(y, dtype=float).ravel()
        self._y_train = y.copy()
        n = len(y)

        # Detect seasonality
        period = self.seasonal_period
        if period is None:
            period = 12 if n >= 24 else 1

        # Decompose
        if period > 1 and n >= 2 * period:
            y_adj, seasonal_full, _ = self._decompose(y, period)
            # Factors of the last period: element 0 applies to time n - period
            self.seasonal_indices_ = seasonal_full[-period:]
        else:
            y_adj = y.copy()
            self.seasonal_indices_ = None

        # SES
        self.alpha_, self.l_n_, level_array = self._fit_ses(y_adj)

        # Drift
        self.b0_ = self._estimate_drift(y_adj)

        # Context & Gamma
        if self.mode == "classic":
            self.gamma_ = 0.0
            self.mu_z_ = 0.0
            self.sigma_z_ = 1.0
        elif self.mode == "rslope":
            # R-style: no gamma, slopes computed during prediction
            self.gamma_ = 0.0
            self.mu_z_ = 0.0
            self.sigma_z_ = 1.0
        else:
            # Compute context
            z_raw = self._compute_context(y_adj)
            self.mu_z_ = z_raw.mean()
            self.sigma_z_ = z_raw.std() + 1e-12
            z_star = (z_raw - self.mu_z_) / self.sigma_z_

            # Estimate gamma
            if self.mode == "cox":
                gamma_raw = self._estimate_gamma_cox(z_star)
            elif self.mode == "ridge":
                gamma_raw = self._estimate_gamma_ridge(
                    y_adj, z_star, level_array, self.alpha_, self.b0_
                )
            else:  # ml
                gamma_raw = self._estimate_gamma_ml(y_adj, 12)

            # Stability constraint
            D_max = self._D_n(36, self.alpha_, n)
            stability_bound = 2.0 / (abs(self.b0_) * D_max + 1e-12)
            self.gamma_ = np.clip(
                gamma_raw,
                -self.stability_factor * stability_bound,
                self.stability_factor * stability_bound,
            )

        # Innovation variance
        residuals = y_adj[1:] - level_array[:-1]
        self.sigma2_ = np.var(residuals, ddof=1)

        self._fitted = True
        return self

    # ============ PREDICT ============
    def predict(
        self, h: int, return_pi: bool = True, alpha_ci: float = 0.05
    ) -> Dict[str, np.ndarray]:
        """Generate forecasts.

        Parameters
        ----------
        h : int
            Forecast horizon (number of steps).
        return_pi : bool, default=True
            Whether to add "lower" and "upper" interval bounds.
        alpha_ci : float, default=0.05
            Two-sided miscoverage level of the interval.

        Returns
        -------
        dict
            Always contains "mean"; also "lower" and "upper" when
            ``return_pi`` is true.

        Raises
        ------
        RuntimeError
            If ``fit`` has not been called.
        """
        if not self._fitted:
            raise RuntimeError("Call fit() first")

        n = len(self._y_train)
        # Phase-aligned deseasonalised history, consistent with fit()
        y_adj = self._deseasonalized_train()

        # Deseasonalized forecast (recursive for non-rslope, direct for rslope)
        if self.mode == "rslope":
            # R-style: slopes are computed once and applied directly
            rslope_slopes = self._estimate_slopes_rslope(y_adj, h)
            fc = np.zeros(h)
            for step in range(h):
                D_h = self._D_n(step + 1, self.alpha_, n)
                fc[step] = self.l_n_ + rslope_slopes[step] * D_h
        else:
            # Original recursive logic: each forecast joins the history
            fc = []
            history = list(y_adj)

            for step in range(1, h + 1):
                # Recompute context
                if self.mode != "classic":
                    t_now = len(history) - 1
                    weights = np.array(
                        [
                            self._attention_kernel(
                                history[j], history[t_now], j, t_now
                            )
                            for j in range(t_now + 1)
                        ]
                    )
                    weights /= weights.sum() + 1e-12
                    z_h = np.dot(weights, history)
                    z_h_star = (z_h - self.mu_z_) / self.sigma_z_
                else:
                    z_h_star = 0.0

                # Forecast
                D_h = self._D_n(step, self.alpha_, n)
                context_factor = 1.0 + self.gamma_ * z_h_star
                fc_val = (
                    self.l_n_
                    + 0.5 * self.b0_ * self.theta * context_factor * D_h
                )

                fc.append(fc_val)
                history.append(fc_val)

            fc = np.array(fc)

        # Reseasonalize: seasonal_indices_[0] is the factor for time n
        if self.seasonal_indices_ is not None:
            seasonal_fc = np.tile(
                self.seasonal_indices_, (h // len(self.seasonal_indices_)) + 1
            )[:h]
            fc *= seasonal_fc

        result = {"mean": fc}

        # Prediction intervals
        if return_pi:
            z_score = norm.ppf(1 - alpha_ci / 2)
            lower = []
            upper = []

            for step in range(1, h + 1):
                D_h = self._D_n(step, self.alpha_, n)
                var_ses = self.sigma2_ * ((step - 1) * self.alpha_**2 + 1)
                var_ctx = (
                    (0.5 * self.gamma_ * self.b0_ * self.sigma_z_ * D_h) ** 2
                ) / n
                se = np.sqrt(var_ses + var_ctx)

                lower.append(fc[step - 1] - z_score * se)
                upper.append(fc[step - 1] + z_score * se)

            result["lower"] = np.array(lower)
            result["upper"] = np.array(upper)

        return result

    # ============ UTILITIES ============
    def get_params(self) -> Dict[str, Any]:
        """Get fitted parameters"""
        return {
            "mode": self.mode,
            "alpha": self.alpha_,
            "b0": self.b0_,
            "gamma": self.gamma_,
            "l_n": self.l_n_,
            "theta": self.theta,
            "sigma2": self.sigma2_,
            "seasonal": self.seasonal_indices_ is not None,
        }

    def plot(self, forecast: Dict[str, np.ndarray], title: Optional[str] = None):
        """Rich visualization of forecasts; returns the matplotlib figure.

        ``forecast`` is a dictionary as returned by ``predict``.
        """
        import matplotlib.pyplot as plt

        n = len(self._y_train)
        h = len(forecast["mean"])
        # Phase-aligned deseasonalised history, shared by panels 2 and 3
        y_adj = self._deseasonalized_train()

        fig, axes = plt.subplots(2, 2, figsize=(14, 10))
        fig.suptitle(
            title or f"Unified Theta: {self.mode.upper()} Mode",
            fontsize=14,
            fontweight="bold",
        )

        # Plot 1: Forecasts with PI
        ax1 = axes[0, 0]
        train_idx = np.arange(n)
        fc_idx = np.arange(n, n + h)

        ax1.plot(
            train_idx,
            self._y_train,
            "o-",
            color="black",
            label="Train",
            linewidth=1.5,
            markersize=3,
            alpha=0.7,
        )
        ax1.plot(
            fc_idx,
            forecast["mean"],
            "s-",
            color="steelblue",
            label="Forecast",
            linewidth=2.5,
            markersize=5,
        )
        if "lower" in forecast:
            ax1.fill_between(
                fc_idx,
                forecast["lower"],
                forecast["upper"],
                color="lightblue",
                alpha=0.3,
                label="95% PI",
            )
        ax1.axvline(
            n - 0.5, color="red", linestyle="--", alpha=0.5, linewidth=2
        )
        ax1.set_title("Forecasts with Prediction Intervals", fontweight="bold")
        ax1.set_xlabel("Time")
        ax1.set_ylabel("Value")
        ax1.legend(loc="upper left")
        ax1.grid(alpha=0.3)

        # Plot 2: Context variable (if not classic/rslope)
        ax2 = axes[0, 1]
        if self.mode not in ["classic", "rslope"]:
            z = self._compute_context(y_adj)
            z_star = (z - self.mu_z_) / self.sigma_z_

            ax2.plot(
                train_idx,
                z_star,
                color="purple",
                linewidth=2,
                label="z* (standardized)",
            )
            ax2.axhline(0, color="black", linestyle="--", alpha=0.5)
            ax2.fill_between(
                train_idx,
                0,
                z_star,
                where=(z_star > 0),
                color="green",
                alpha=0.2,
                label="Above trend",
            )
            ax2.fill_between(
                train_idx,
                0,
                z_star,
                where=(z_star < 0),
                color="red",
                alpha=0.2,
                label="Below trend",
            )
            ax2.set_title(
                f"Context Signal (γ={self.gamma_:.4f})", fontweight="bold"
            )
            ax2.set_xlabel("Time")
            ax2.set_ylabel("z* (std dev)")
            ax2.legend()
        elif self.mode == "rslope":
            ax2.text(
                0.5,
                0.5,
                "R-Slope Mode\n(Direct ML Slopes)",
                ha="center",
                va="center",
                transform=ax2.transAxes,
                fontsize=14,
                bbox=dict(boxstyle="round", facecolor="lightblue"),
            )
            ax2.axis("off")
        else:
            ax2.text(
                0.5,
                0.5,
                "Classic Mode\n(No Context)",
                ha="center",
                va="center",
                transform=ax2.transAxes,
                fontsize=14,
                bbox=dict(boxstyle="round", facecolor="wheat"),
            )
            ax2.axis("off")
        ax2.grid(alpha=0.3)

        # Plot 3: Residuals
        ax3 = axes[1, 0]
        level_array = self._ses_level(y_adj, self.alpha_)
        residuals = y_adj[1:] - level_array[:-1]

        ax3.scatter(train_idx[1:], residuals, alpha=0.6, s=20, color="coral")
        ax3.axhline(0, color="black", linestyle="--", linewidth=1.5)
        ax3.set_title(f"Residuals (σ²={self.sigma2_:.3f})", fontweight="bold")
        ax3.set_xlabel("Time")
        ax3.set_ylabel("Residual")
        ax3.grid(alpha=0.3)

        # Histogram
        ax3_inset = ax3.inset_axes([0.65, 0.65, 0.3, 0.3])
        ax3_inset.hist(
            residuals, bins=15, color="coral", alpha=0.7, edgecolor="black"
        )
        ax3_inset.axvline(0, color="black", linestyle="--", linewidth=1)
        ax3_inset.set_title("Distribution", fontsize=8)
        ax3_inset.tick_params(labelsize=7)

        # Plot 4: Summary table
        ax4 = axes[1, 1]
        ax4.axis("off")

        params = self.get_params()
        summary = f"""
╔═══════════════════════════════════════╗
║ MODEL PARAMETERS ║
╠═══════════════════════════════════════╣
║ ║
║ Mode: {self.mode.upper():<20} ║
║ Theta: {self.theta:<20.4f} ║
║ Alpha (α): {params['alpha']:<20.4f} ║
║ Drift (b₀): {params['b0']:<20.4f} ║
║ Gamma (γ): {params['gamma']:<20.6f} ║
║ Level (ℓₙ): {params['l_n']:<20.2f} ║
║ σ²: {params['sigma2']:<20.4f} ║
║ Seasonal: {str(params['seasonal']):<20} ║
║ ║
╠═══════════════════════════════════════╣
║ FORECAST SUMMARY ║
╠═══════════════════════════════════════╣
║ ║
║ Horizon: {h:<20} steps ║
║ Final FC: {forecast['mean'][-1]:<20.2f} ║
"""
        if "lower" in forecast:
            summary += f"║ 95% PI: [{forecast['lower'][-1]:>6.2f}, {forecast['upper'][-1]:>6.2f}] ║\n"

        summary += "║ ║\n"
        summary += "╚═══════════════════════════════════════╝"

        ax4.text(
            0.05,
            0.95,
            summary,
            fontsize=9,
            family="monospace",
            verticalalignment="top",
            transform=ax4.transAxes,
            bbox=dict(
                boxstyle="round",
                facecolor="lightyellow",
                alpha=0.8,
                edgecolor="black",
                linewidth=1.5,
            ),
        )

        plt.tight_layout()
        return fig
Unified Theta Method with multiple estimation modes.
Variants:
- 'classic': Standard Theta (SES + drift)
- 'cox': Context-aware with Cox partial likelihood
- 'ridge': Context-aware with Ridge regression
- 'ml': ML-enhanced with sklearn estimator
- 'rslope': R-style slopes via numerical differentiation (context-free)
Parameters
mode : {'classic', 'cox', 'ridge', 'ml', 'rslope'}
    Estimation mode
theta : float, default=0.5
    Drift intensity (0=no drift, 0.5=classical, 1=full)
estimator : sklearn estimator, optional
    For 'ml' and 'rslope' modes
tau : float, default=12
    Temporal attention decay
sigma_val : float, optional
    Value-based kernel bandwidth
kernel : {'temporal', 'value', 'hybrid'}
    Attention kernel type
seasonal_period : int, optional
    Seasonal period (auto-detected if None)
risk_set_size : int, default=15
    Risk set size for Cox PL
stability_factor : float, default=0.8
    Gamma clipping safety factor, in (0, 1]
random_state : int, optional
    Random seed for ML mode
def fit(self, y: np.ndarray):
    """Estimate all model components from the series ``y``.

    Steps, in order: optional seasonal decomposition, simple exponential
    smoothing, drift estimation, context/gamma estimation (mode dependent)
    and innovation variance.

    Parameters
    ----------
    y : array-like
        Observed series; it is converted to a flat float array.

    Returns
    -------
    self
        The fitted instance.
    """
    series = np.asarray(y, dtype=float).ravel()
    self._y_train = series.copy()
    n = len(series)

    # Seasonal period: explicit setting wins, otherwise 12 when at least
    # 24 observations are available and 1 (no seasonality) otherwise.
    period = self.seasonal_period
    if period is None:
        period = 12 if n >= 24 else 1

    # Only decompose when two full seasonal cycles are available.
    use_seasonality = period > 1 and n >= 2 * period
    if use_seasonality:
        y_adj, seasonal_full, _ = self._decompose(series, period)
        self.seasonal_indices_ = seasonal_full[-period:]
    else:
        y_adj = series.copy()
        self.seasonal_indices_ = None

    # Smoothing parameter, final level and the in-sample level path.
    self.alpha_, self.l_n_, level_array = self._fit_ses(y_adj)

    # Drift term.
    self.b0_ = self._estimate_drift(y_adj)

    if self.mode == "classic" or self.mode == "rslope":
        # Neither mode uses a context signal at fit time; for 'rslope'
        # the slopes are computed during prediction.
        self.gamma_ = 0.0
        self.mu_z_ = 0.0
        self.sigma_z_ = 1.0
    else:
        # Standardise the context variable.
        z_raw = self._compute_context(y_adj)
        self.mu_z_ = z_raw.mean()
        self.sigma_z_ = z_raw.std() + 1e-12
        z_star = (z_raw - self.mu_z_) / self.sigma_z_

        # Raw gamma estimate; any mode other than 'cox'/'ridge' is
        # handled as 'ml'.
        if self.mode == "cox":
            gamma_raw = self._estimate_gamma_cox(z_star)
        elif self.mode == "ridge":
            gamma_raw = self._estimate_gamma_ridge(
                y_adj, z_star, level_array, self.alpha_, self.b0_
            )
        else:
            gamma_raw = self._estimate_gamma_ml(y_adj, 12)

        # Clip gamma into a symmetric band scaled by `stability_factor`.
        D_max = self._D_n(36, self.alpha_, n)
        stability_bound = 2.0 / (abs(self.b0_) * D_max + 1e-12)
        gamma_limit = self.stability_factor * stability_bound
        self.gamma_ = np.clip(gamma_raw, -gamma_limit, gamma_limit)

    # Innovation variance from one-step-ahead level errors.
    one_step_errors = y_adj[1:] - level_array[:-1]
    self.sigma2_ = np.var(one_step_errors, ddof=1)

    self._fitted = True
    return self
Fit the model
def predict(
    self, h: int, return_pi: bool = True, alpha_ci: float = 0.05
) -> Dict[str, np.ndarray]:
    """Forecast ``h`` steps ahead.

    Parameters
    ----------
    h : int
        Forecast horizon.
    return_pi : bool, default=True
        When true, also return lower/upper prediction bounds.
    alpha_ci : float, default=0.05
        Two-sided miscoverage level used for the normal quantile.

    Returns
    -------
    dict
        ``"mean"`` always; ``"lower"`` and ``"upper"`` when
        ``return_pi`` is true.

    Raises
    ------
    RuntimeError
        If the model has not been fitted.
    """
    if not self._fitted:
        raise RuntimeError("Call fit() first")

    n = len(self._y_train)

    # Training series with the stored seasonal pattern divided out
    # (left untouched when no seasonal indices were fitted).
    if self.seasonal_indices_ is None:
        y_adj = self._y_train
    else:
        tiled = np.tile(
            self.seasonal_indices_, n // len(self.seasonal_indices_) + 1
        )
        y_adj = self._y_train / tiled[:n]

    if self.mode == "rslope":
        # Direct forecast: one slope per horizon, no recursion.
        rslope_slopes = self._estimate_slopes_rslope(y_adj, h)
        fc = np.zeros(h)
        for k in range(h):
            D_h = self._D_n(k + 1, self.alpha_, n)
            fc[k] = self.l_n_ + rslope_slopes[k] * D_h
    else:
        # Recursive forecast: each prediction is appended to the history
        # so that the context signal can be recomputed at the next step.
        history = list(y_adj)
        path = []
        for step in range(1, h + 1):
            if self.mode == "classic":
                z_h_star = 0.0
            else:
                t_now = len(history) - 1
                current = history[t_now]
                weights = np.array(
                    [
                        self._attention_kernel(past, current, j, t_now)
                        for j, past in enumerate(history)
                    ]
                )
                weights /= weights.sum() + 1e-12
                z_h = np.dot(weights, history)
                z_h_star = (z_h - self.mu_z_) / self.sigma_z_

            D_h = self._D_n(step, self.alpha_, n)
            context_factor = 1.0 + self.gamma_ * z_h_star
            fc_val = (
                self.l_n_
                + 0.5 * self.b0_ * self.theta * context_factor * D_h
            )
            path.append(fc_val)
            history.append(fc_val)

        fc = np.array(path)

    # Put the seasonal pattern back on the forecast path.
    if self.seasonal_indices_ is not None:
        seasonal_fc = np.tile(
            self.seasonal_indices_, (h // len(self.seasonal_indices_)) + 1
        )[:h]
        fc *= seasonal_fc

    result = {"mean": fc}

    if return_pi:
        z_score = norm.ppf(1 - alpha_ci / 2)
        margins = []
        for step in range(1, h + 1):
            D_h = self._D_n(step, self.alpha_, n)
            var_ses = self.sigma2_ * ((step - 1) * self.alpha_**2 + 1)
            var_ctx = (
                (0.5 * self.gamma_ * self.b0_ * self.sigma_z_ * D_h) ** 2
            ) / n
            margins.append(z_score * np.sqrt(var_ses + var_ctx))

        margins = np.array(margins)
        result["lower"] = fc - margins
        result["upper"] = fc + margins

    return result
Generate forecasts
class LSBoostClassifier(BaseEstimator, ClassifierMixin):
    r"""LSBoost classifier.

    Attributes:

        n_estimators: int
            number of boosting iterations.

        learning_rate: float
            controls the learning speed at training time.

        n_hidden_features: int
            number of nodes in successive hidden layers.

        reg_lambda: float
            L2 regularization parameter for successive errors in the optimizer
            (at training time).

        alpha: float
            compromise between L1 and L2 regularization (must be in [0, 1]),
            for `solver` == 'enet'.

        row_sample: float
            percentage of rows chosen from the training set.

        col_sample: float
            percentage of columns chosen from the training set.

        dropout: float
            percentage of nodes dropped from the training set.

        tolerance: float
            controls early stopping in gradient descent (at training time).

        direct_link: bool
            indicates whether the original features are included (True) in model's
            fitting or not (False).

        verbose: int
            progress bar (yes = 1) or not (no = 0) (currently).

        seed: int
            reproducibility seed for nodes_sim=='uniform', clustering and dropout.

        backend: str
            type of backend; must be in ('cpu', 'gpu', 'tpu')

        solver: str
            type of 'weak' learner; currently in ('ridge', 'lasso', 'enet').
            'enet' is a combination of 'ridge' and 'lasso' called Elastic Net.

        activation: str
            activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'

        n_clusters: int
            number of clusters for clustering the features

        clustering_method: str
            clustering method: currently 'kmeans', 'gmm'

        cluster_scaling: str
            scaling method for clustering: currently 'standard', 'robust', 'minmax'

        degree: int
            degree of features interactions to include in the model

        weights_distr: str
            distribution of weights for constructing the model's hidden layer;
            currently 'uniform', 'gaussian'

        base_model: object
            passed to the booster as `obj` when fitting (default is None).

        hist: bool
            indicates whether histogram features are used or not (default is False)

        bins: int or str
            number of bins for histogram features (same as numpy.histogram, default is 'auto')

    Examples:

        ```python
        import numpy as np
        from sklearn.datasets import load_digits, load_breast_cancer, load_wine, load_iris
        from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.kernel_ridge import KernelRidge
        from time import time
        from os import chdir
        from sklearn import metrics
        import os

        import mlsauce as ms

        print("\n")
        print("GenericBoosting Decision tree -----")
        print("\n")

        print("\n")
        print("breast_cancer data -----")

        # data 1
        breast_cancer = load_breast_cancer()
        X = breast_cancer.data
        y = breast_cancer.target
        # split data into training set and test set
        np.random.seed(15029)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.2)

        clf = DecisionTreeRegressor()
        clf2 = KernelRidge()

        obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2)
        print(obj.get_params())
        start = time()
        obj.fit(X_train, y_train)
        print(time()-start)
        start = time()
        print(obj.score(X_test, y_test))
        print(time()-start)

        print(obj.obj['loss'])

        obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2, n_clusters=2)
        print(obj.get_params())
        start = time()
        obj.fit(X_train, y_train)
        print(time()-start)
        start = time()
        print(obj.score(X_test, y_test))
        print(time()-start)

        print(obj.obj['loss'])


        # data 2
        print("\n")
        print("wine data -----")

        wine = load_wine()
        Z = wine.data
        t = wine.target
        np.random.seed(879423)
        X_train, X_test, y_train, y_test = train_test_split(Z, t,
                                                            test_size=0.2)

        obj = ms.GenericBoostingClassifier(clf)
        print(obj.get_params())
        start = time()
        obj.fit(X_train, y_train)
        print(time()-start)
        start = time()
        print(obj.score(X_test, y_test))
        print(time()-start)

        print(obj.obj['loss'])

        obj = ms.GenericBoostingClassifier(clf, n_clusters=3)
        print(obj.get_params())
        start = time()
        obj.fit(X_train, y_train)
        print(time()-start)
        start = time()
        print(obj.score(X_test, y_test))
        print(time()-start)

        print(obj.obj['loss'])

        # data 3
        print("\n")
        print("iris data -----")

        iris = load_iris()
        Z = iris.data
        t = iris.target
        np.random.seed(734563)
        X_train, X_test, y_train, y_test = train_test_split(Z, t,
                                                            test_size=0.2)


        obj = ms.GenericBoostingClassifier(clf)
        print(obj.get_params())
        start = time()
        obj.fit(X_train, y_train)
        print(time()-start)
        start = time()
        print(obj.score(X_test, y_test))
        print(time()-start)

        print(obj.obj['loss'])


        print("\n")
        print("GenericBoosting KRR -----")
        print("\n")

        obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2)
        print(obj.get_params())
        start = time()
        obj.fit(X_train, y_train)
        print(time()-start)
        start = time()
        print(obj.score(X_test, y_test))
        print(time()-start)

        print(obj.obj['loss'])

        obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2, n_clusters=2)
        print(obj.get_params())
        start = time()
        obj.fit(X_train, y_train)
        print(time()-start)
        start = time()
        print(obj.score(X_test, y_test))
        print(time()-start)

        print(obj.obj['loss'])


        # data 2
        print("\n")
        print("wine data -----")

        wine = load_wine()
        Z = wine.data
        t = wine.target
        np.random.seed(879423)
        X_train, X_test, y_train, y_test = train_test_split(Z, t,
                                                            test_size=0.2)

        obj = ms.GenericBoostingClassifier(clf2)
        print(obj.get_params())
        start = time()
        obj.fit(X_train, y_train)
        print(time()-start)
        start = time()
        print(obj.score(X_test, y_test))
        print(time()-start)

        print(obj.obj['loss'])

        obj = ms.GenericBoostingClassifier(clf2, n_clusters=3)
        print(obj.get_params())
        start = time()
        obj.fit(X_train, y_train)
        print(time()-start)
        start = time()
        print(obj.score(X_test, y_test))
        print(time()-start)

        print(obj.obj['loss'])

        # data 3
        print("\n")
        print("iris data -----")

        iris = load_iris()
        Z = iris.data
        t = iris.target
        np.random.seed(734563)
        X_train, X_test, y_train, y_test = train_test_split(Z, t,
                                                            test_size=0.2)


        obj = ms.GenericBoostingClassifier(clf2)
        print(obj.get_params())
        start = time()
        obj.fit(X_train, y_train)
        print(time()-start)
        start = time()
        print(obj.score(X_test, y_test))
        print(time()-start)

        print(obj.obj['loss'])
        ```

    """

    def __init__(
        self,
        n_estimators=100,
        learning_rate=0.1,
        n_hidden_features=5,
        reg_lambda=0.1,
        alpha=0.5,
        row_sample=1,
        col_sample=1,
        dropout=0,
        tolerance=1e-4,
        direct_link=1,
        verbose=1,
        seed=123,
        backend="cpu",
        solver="ridge",
        activation="relu",
        n_clusters=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        degree=None,
        weights_distr="uniform",
        base_model=None,
        hist=False,
        bins="auto",
    ):

        self.base_model = base_model
        self.hist = hist
        self.bins = bins
        self.hist_bins_ = None  # set by `fit` when `hist` is enabled

        # Clustering options are only validated when clustering is used.
        if n_clusters > 0:
            assert clustering_method in (
                "kmeans",
                "gmm",
            ), "`clustering_method` must be in ('kmeans', 'gmm')"
            assert cluster_scaling in (
                "standard",
                "robust",
                "minmax",
            ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')"

        assert backend in (
            "cpu",
            "gpu",
            "tpu",
        ), "`backend` must be in ('cpu', 'gpu', 'tpu')"

        assert solver in (
            "ridge",
            "lasso",
            "enet",
        ), "`solver` must be in ('ridge', 'lasso', 'enet')"

        sys_platform = platform.system()

        # Accelerator backends fall back to the CPU on Windows.
        if (sys_platform == "Windows") and (backend in ("gpu", "tpu")):
            warnings.warn(
                "No GPU/TPU computing on Windows yet, backend set to 'cpu'"
            )
            backend = "cpu"

        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.n_hidden_features = n_hidden_features
        self.reg_lambda = reg_lambda
        assert alpha >= 0 and alpha <= 1, "`alpha` must be in [0, 1]"
        self.alpha = alpha
        self.row_sample = row_sample
        self.col_sample = col_sample
        self.dropout = dropout
        self.tolerance = tolerance
        self.direct_link = direct_link
        self.verbose = verbose
        self.seed = seed
        self.backend = backend
        self.obj = None  # fitted booster, populated by `fit`
        self.solver = solver
        self.activation = activation
        self.n_clusters = n_clusters
        self.clustering_method = clustering_method
        self.cluster_scaling = cluster_scaling
        self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None
        self.degree = degree
        self.poly_ = None
        self.weights_distr = weights_distr
        if self.backend in ("gpu", "tpu"):
            check_and_install("jax")
            check_and_install("jaxlib")

    def fit(self, X, y, **kwargs):
        """Fit Booster (classifier) to training data (X, y)

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: array-like, shape = [n_samples]
                Target values.

            **kwargs: accepted for API compatibility; not used by this method.

        Returns:

            self: object.
        """

        if isinstance(X, pd.DataFrame):
            X = X.values

        if self.hist == True:  # noqa: E712 - kept as an equality test
            # NOTE(review): `self.bins` is stored but not forwarded here;
            # confirm whether `get_histo_features` should receive it.
            X, self.hist_bins_ = get_histo_features(X)

        if isinstance(y, pd.Series):
            y = y.values.ravel()
        else:
            y = np.asarray(y).ravel()

        if self.degree is not None:
            assert isinstance(self.degree, int), "`degree` must be an integer"
            self.poly_ = PolynomialFeatures(
                degree=self.degree, interaction_only=True, include_bias=False
            )
            X = self.poly_.fit_transform(X)

        if self.n_clusters > 0:
            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
                cluster(
                    X,
                    n_clusters=self.n_clusters,
                    method=self.clustering_method,
                    type_scaling=self.cluster_scaling,
                    training=True,
                    seed=self.seed,
                )
            )
            # Cluster columns are appended to the features.
            X = np.column_stack((X, clustered_X))

        self.obj = boosterc.fit_booster_classifier(
            np.asarray(X, order="C", dtype=np.float64),
            np.asarray(y, order="C", dtype=np.int64),
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            n_hidden_features=self.n_hidden_features,
            reg_lambda=self.reg_lambda,
            alpha=self.alpha,
            row_sample=self.row_sample,
            col_sample=self.col_sample,
            dropout=self.dropout,
            tolerance=self.tolerance,
            direct_link=self.direct_link,
            verbose=self.verbose,
            seed=self.seed,
            backend=self.backend,
            solver=self.solver,
            activation=self.activation,
            obj=self.base_model,
        )

        self.classes_ = np.unique(y)  # for compatibility with sklearn
        self.n_classes_ = len(self.classes_)  # for compatibility with sklearn
        # The booster reports the number of estimators it ended up with.
        self.n_estimators = self.obj["n_estimators"]
        return self

    def predict(self, X, **kwargs):
        """Predict test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors, where n_samples is the number
                of samples and n_features is the number of features.

            **kwargs: additional parameters to be passed to `predict_proba`


        Returns:

            model predictions: {array-like}
                column index of the highest probability for each sample.
        """

        return np.argmax(self.predict_proba(X, **kwargs), axis=1)

    def predict_proba(self, X, **kwargs):
        """Predict probabilities for test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors, where n_samples is the number
                of samples and n_features is the number of features.

            **kwargs: accepted for API compatibility; not used by this method.

        Returns:

            probability estimates for test data: {array-like}, or None when
            the booster raises a ValueError (a RuntimeWarning is emitted).
        """

        if isinstance(X, pd.DataFrame):
            X = X.values

        if self.hist == True:  # noqa: E712 - kept as an equality test
            X = get_histo_features(X, bins=self.hist_bins_)

        if self.degree is not None:
            X = self.poly_.transform(X)

        if self.n_clusters > 0:
            X = np.column_stack(
                (
                    X,
                    cluster(
                        X,
                        training=False,
                        scaler=self.scaler_,
                        label_encoder=self.label_encoder_,
                        clusterer=self.clusterer_,
                        seed=self.seed,
                    ),
                )
            )
        try:
            return boosterc.predict_proba_booster_classifier(
                self.obj, np.asarray(X, order="C")
            )
        except ValueError as err:
            # The return contract (None on failure) is unchanged, but the
            # failure is no longer silent.
            warnings.warn(
                f"predict_proba failed and returned None: {err}",
                RuntimeWarning,
            )
            return None

    def update(self, X, y, eta=0.9):
        """Update model with new data.

        Args:

            X: {array-like}, shape = [n_samples=1, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: float = [n_samples=1]
                Target value.

            eta: float
                Inverse power applied to number of observations
                (defines a learning rate).

        Returns:

            self: object.
        """

        if isinstance(X, pd.DataFrame):
            X = X.values

        # Apply the same histogram transform as `fit` and `predict_proba`,
        # so the booster is updated in the feature space it was fitted on.
        if self.hist == True:  # noqa: E712 - kept as an equality test
            X = get_histo_features(X, bins=self.hist_bins_)

        if self.degree is not None:
            X = self.poly_.transform(X)

        if self.n_clusters > 0:
            X = np.column_stack(
                (
                    X,
                    cluster(
                        X,
                        training=False,
                        scaler=self.scaler_,
                        label_encoder=self.label_encoder_,
                        clusterer=self.clusterer_,
                        seed=self.seed,
                    ),
                )
            )

        self.obj = boosterc.update_booster(
            self.obj,
            np.asarray(X, order="C"),
            np.asarray(y, order="C").ravel(),
            eta,
        )

        return self
LSBoost classifier.
Attributes:
n_estimators: int
number of boosting iterations.
learning_rate: float
controls the learning speed at training time.
n_hidden_features: int
number of nodes in successive hidden layers.
reg_lambda: float
L2 regularization parameter for successive errors in the optimizer
(at training time).
alpha: float
compromise between L1 and L2 regularization (must be in [0, 1]),
for `solver` == 'enet'.
row_sample: float
percentage of rows chosen from the training set.
col_sample: float
percentage of columns chosen from the training set.
dropout: float
percentage of nodes dropped from the training set.
tolerance: float
controls early stopping in gradient descent (at training time).
direct_link: bool
indicates whether the original features are included (True) in model's
fitting or not (False).
verbose: int
progress bar (yes = 1) or not (no = 0) (currently).
seed: int
reproducibility seed for nodes_sim=='uniform', clustering and dropout.
backend: str
type of backend; must be in ('cpu', 'gpu', 'tpu')
solver: str
type of 'weak' learner; currently in ('ridge', 'lasso', 'enet').
'enet' is a combination of 'ridge' and 'lasso' called Elastic Net.
activation: str
activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
n_clusters: int
number of clusters for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
degree: int
degree of features interactions to include in the model
weights_distr: str
distribution of weights for constructing the model's hidden layer;
currently 'uniform', 'gaussian'
hist: bool
indicates whether histogram features are used or not (default is False)
bins: int or str
number of bins for histogram features (same as numpy.histogram, default is 'auto')
Examples:
```python
import numpy as np
from sklearn.datasets import load_digits, load_breast_cancer, load_wine, load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from time import time
from os import chdir
from sklearn import metrics
import os
import mlsauce as ms
print("\n")
print("GenericBoosting Decision tree -----")
print("\n")
print("\n")
print("breast_cancer data -----")
# data 1
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target
# split data into training set and test set
np.random.seed(15029)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2)
clf = DecisionTreeRegressor()
clf2 = KernelRidge()
obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2, n_clusters=2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 2
print("\n")
print("wine data -----")
wine = load_wine()
Z = wine.data
t = wine.target
np.random.seed(879423)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf, n_clusters=3)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 3
print("\n")
print("iris data -----")
iris = load_iris()
Z = iris.data
t = iris.target
np.random.seed(734563)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
print("\n")
print("GenericBoosting KRR -----")
print("\n")
obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2, n_clusters=2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 2
print("\n")
print("wine data -----")
wine = load_wine()
Z = wine.data
t = wine.target
np.random.seed(879423)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf2, n_clusters=3)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 3
print("\n")
print("iris data -----")
iris = load_iris()
Z = iris.data
t = iris.target
np.random.seed(734563)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
```
def fit(self, X, y, **kwargs):
    """Train the boosted classifier on (X, y).

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number
            of samples and n_features is the number of features.

        y: array-like, shape = [n_samples]
            Target values.

        **kwargs: accepted for API compatibility; not used by this method.

    Returns:

        self: object.
    """
    if isinstance(X, pd.DataFrame):
        X = X.values

    # Optional histogram features; the bins are kept for prediction time.
    if self.hist == True:  # noqa: E712 - kept as an equality test
        X, self.hist_bins_ = get_histo_features(X)

    # Flatten the target whatever container it arrived in.
    y = y.values.ravel() if isinstance(y, pd.Series) else np.asarray(y).ravel()

    # Optional interaction-only polynomial expansion.
    if self.degree is not None:
        assert isinstance(self.degree, int), "`degree` must be an integer"
        self.poly_ = PolynomialFeatures(
            degree=self.degree, interaction_only=True, include_bias=False
        )
        X = self.poly_.fit_transform(X)

    # Optional cluster columns appended to the features.
    if self.n_clusters > 0:
        clustering_output = cluster(
            X,
            n_clusters=self.n_clusters,
            method=self.clustering_method,
            type_scaling=self.cluster_scaling,
            training=True,
            seed=self.seed,
        )
        (
            clustered_X,
            self.scaler_,
            self.label_encoder_,
            self.clusterer_,
        ) = clustering_output
        X = np.column_stack((X, clustered_X))

    booster_options = dict(
        n_estimators=self.n_estimators,
        learning_rate=self.learning_rate,
        n_hidden_features=self.n_hidden_features,
        reg_lambda=self.reg_lambda,
        alpha=self.alpha,
        row_sample=self.row_sample,
        col_sample=self.col_sample,
        dropout=self.dropout,
        tolerance=self.tolerance,
        direct_link=self.direct_link,
        verbose=self.verbose,
        seed=self.seed,
        backend=self.backend,
        solver=self.solver,
        activation=self.activation,
        obj=self.base_model,
    )
    self.obj = boosterc.fit_booster_classifier(
        np.asarray(X, order="C", dtype=np.float64),
        np.asarray(y, order="C", dtype=np.int64),
        **booster_options,
    )

    # Attributes kept for compatibility with sklearn.
    self.classes_ = np.unique(y)
    self.n_classes_ = len(self.classes_)
    # The booster reports the number of estimators it ended up with.
    self.n_estimators = self.obj["n_estimators"]
    return self
Fit Booster (classifier) to training data (X, y)
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
**kwargs: additional parameters to be passed to self.cook_training_set.
Returns:
self: object.
def predict(self, X, **kwargs):
    """Return, for each row of X, the index of the most probable class.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Test vectors, where n_samples is the number
            of samples and n_features is the number of features.

        **kwargs: additional parameters to be passed to `predict_proba`

    Returns:

        model predictions: {array-like}
    """
    probabilities = self.predict_proba(X, **kwargs)
    # Highest-probability column per sample.
    return np.argmax(probabilities, axis=1)
Predict test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to `predict_proba`
Returns:
model predictions: {array-like}
def predict_proba(self, X, **kwargs):
    """Predict probabilities for test data X.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Test vectors, where n_samples is the number
            of samples and n_features is the number of features.

        **kwargs: accepted for API compatibility; not used by this method.

    Returns:

        probability estimates for test data: {array-like}, or None when
        the booster raises a ValueError (a RuntimeWarning is emitted).
    """
    # Function-scope import so this block does not depend on the
    # file-level import list.
    import warnings

    if isinstance(X, pd.DataFrame):
        X = X.values

    # Replay the feature transforms that were applied at fit time.
    if self.hist == True:  # noqa: E712 - kept as an equality test
        X = get_histo_features(X, bins=self.hist_bins_)

    if self.degree is not None:
        X = self.poly_.transform(X)

    if self.n_clusters > 0:
        X = np.column_stack(
            (
                X,
                cluster(
                    X,
                    training=False,
                    scaler=self.scaler_,
                    label_encoder=self.label_encoder_,
                    clusterer=self.clusterer_,
                    seed=self.seed,
                ),
            )
        )

    try:
        return boosterc.predict_proba_booster_classifier(
            self.obj, np.asarray(X, order="C")
        )
    except ValueError as err:
        # The return contract (None on failure) is unchanged, but the
        # failure is no longer silent.
        warnings.warn(
            f"predict_proba failed and returned None: {err}",
            RuntimeWarning,
        )
        return None
Predict probabilities for test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to
self.cook_test_set
Returns:
probability estimates for test data: {array-like}
class GenericBoostingClassifier(LSBoostClassifier):
    """Generic Boosting classifier (boosts a user-supplied base learner).

    Attributes:

        base_model: object
            base learner (default is ExtraTreeRegressor) to be boosted.

        n_estimators: int
            number of boosting iterations.

        learning_rate: float
            controls the learning speed at training time.

        n_hidden_features: int
            number of nodes in successive hidden layers.

        row_sample: float
            percentage of rows chosen from the training set.

        col_sample: float
            percentage of columns chosen from the training set.

        dropout: float
            percentage of nodes dropped from the training set.

        tolerance: float
            controls early stopping in gradient descent (at training time).

        direct_link: bool
            indicates whether the original features are included (True) in model's
            fitting or not (False).

        verbose: int
            progress bar (yes = 1) or not (no = 0) (currently).

        seed: int
            reproducibility seed for nodes_sim=='uniform', clustering and dropout.

        activation: str
            activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'

        n_clusters: int
            number of clusters for clustering the features

        clustering_method: str
            clustering method: currently 'kmeans', 'gmm'

        cluster_scaling: str
            scaling method for clustering: currently 'standard', 'robust', 'minmax'

        degree: int
            degree of features interactions to include in the model

        weights_distr: str
            distribution of weights for constructing the model's hidden layer;
            currently 'uniform', 'gaussian'

        hist: bool
            indicates whether histogram features are used or not (default is False)

        bins: int or str
            number of bins for histogram features (same as numpy.histogram, default is 'auto')

    """

    def __init__(
        self,
        base_model=ExtraTreeRegressor(),
        n_estimators=100,
        learning_rate=0.1,
        n_hidden_features=5,
        row_sample=1,
        col_sample=1,
        dropout=0,
        tolerance=1e-4,
        direct_link=1,
        verbose=1,
        backend="cpu",
        seed=123,
        activation="relu",
        n_clusters=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        degree=None,
        weights_distr="uniform",
        hist=False,
        bins="auto",
    ):
        # The parent constructor assigns `base_model`, `hist`, `bins` and
        # `hist_bins_` itself. `hist` and `bins` must be forwarded, or the
        # parent defaults (False / "auto") overwrite the caller's values.
        super().__init__(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            n_hidden_features=n_hidden_features,
            row_sample=row_sample,
            col_sample=col_sample,
            dropout=dropout,
            tolerance=tolerance,
            direct_link=direct_link,
            verbose=verbose,
            backend=backend,
            seed=seed,
            activation=activation,
            n_clusters=n_clusters,
            clustering_method=clustering_method,
            cluster_scaling=cluster_scaling,
            degree=degree,
            weights_distr=weights_distr,
            base_model=base_model,
            hist=hist,
            bins=bins,
        )
Generic Boosting classifier (using any classifier as base learner).
Attributes:
base_model: object
base learner (default is ExtraTreeRegressor) to be boosted.
n_estimators: int
number of boosting iterations.
learning_rate: float
controls the learning speed at training time.
n_hidden_features: int
number of nodes in successive hidden layers.
row_sample: float
percentage of rows chosen from the training set.
col_sample: float
percentage of columns chosen from the training set.
dropout: float
percentage of nodes dropped from the training set.
tolerance: float
controls early stopping in gradient descent (at training time).
direct_link: bool
indicates whether the original features are included (True) in model's
fitting or not (False).
verbose: int
progress bar (yes = 1) or not (no = 0) (currently).
seed: int
reproducibility seed for nodes_sim=='uniform', clustering and dropout.
activation: str
activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
n_clusters: int
number of clusters for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
degree: int
degree of features interactions to include in the model
weights_distr: str
distribution of weights for constructing the model's hidden layer;
currently 'uniform', 'gaussian'
hist: bool
indicates whether histogram features are used or not (default is False)
bins: int or str
number of bins for histogram features (same as numpy.histogram, default is 'auto')
class GenericBoostingRegressor(LSBoostRegressor):
    """Generic Boosting regressor built on top of `LSBoostRegressor`.

    Attributes:

        base_model: object
            base learner (default is ExtraTreeRegressor) to be boosted.

        n_estimators: int
            number of boosting iterations.

        learning_rate: float
            controls the learning speed at training time.

        n_hidden_features: int
            number of nodes in successive hidden layers.

        row_sample: float
            percentage of rows chosen from the training set.

        col_sample: float
            percentage of columns chosen from the training set.

        dropout: float
            percentage of nodes dropped from the training set.

        tolerance: float
            controls early stopping in gradient descent (at training time).

        direct_link: bool
            indicates whether the original features are included (True) in model's
            fitting or not (False).

        verbose: int
            progress bar (yes = 1) or not (no = 0) (currently).

        backend: str
            type of backend; must be in ('cpu', 'gpu', 'tpu')

        seed: int
            reproducibility seed for nodes_sim=='uniform', clustering and dropout.

        activation: str
            activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'

        type_pi: str.
            type of prediction interval; currently "kde" or "bootstrap".
            Used only in `self.predict`, for `self.replications` > 0 and `self.kernel`
            in ('gaussian', 'tophat'). Default is `None`.

        replications: int.
            number of replications (if needed) for predictive simulation.
            Used only in `self.predict`, for `self.kernel` in ('gaussian',
            'tophat') and `self.type_pi = 'kde'`. Default is `None`.

        kernel: str
            forwarded unchanged to `LSBoostRegressor`; the notes on `type_pi`
            and `replications` refer to the values 'gaussian' and 'tophat'.
            Default is `None`.

        n_clusters: int
            number of clusters for clustering the features

        clustering_method: str
            clustering method: currently 'kmeans', 'gmm'

        cluster_scaling: str
            scaling method for clustering: currently 'standard', 'robust', 'minmax'

        degree: int
            degree of features interactions to include in the model

        weights_distr: str
            distribution of weights for constructing the model's hidden layer;
            either 'uniform' or 'gaussian'

        hist: bool
            whether to use histogram features or not

        bins: int or str
            number of bins for histogram features (same as numpy.histogram, default is 'auto')

    """

    # NOTE(review): the default `ExtraTreeRegressor()` is evaluated once, when
    # the class body is executed, so every instance built without an explicit
    # `base_model` receives the same default object. Kept as-is because it is
    # part of the public signature.
    def __init__(
        self,
        base_model=ExtraTreeRegressor(),
        n_estimators=100,
        learning_rate=0.1,
        n_hidden_features=5,
        row_sample=1,
        col_sample=1,
        dropout=0,
        tolerance=1e-4,
        direct_link=1,
        verbose=1,
        backend="cpu",
        seed=123,
        activation="relu",
        type_pi=None,
        replications=None,
        kernel=None,
        n_clusters=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        degree=None,
        weights_distr="uniform",
        hist=False,
        bins="auto",
    ):
        # `LSBoostRegressor.__init__` assigns `base_model`, `hist`, `bins` and
        # `hist_bins_` itself. `hist` and `bins` must therefore be forwarded:
        # setting them here and omitting them from the call lets the parent
        # overwrite them with its own defaults (False / "auto").
        super().__init__(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            n_hidden_features=n_hidden_features,
            row_sample=row_sample,
            col_sample=col_sample,
            dropout=dropout,
            tolerance=tolerance,
            direct_link=direct_link,
            verbose=verbose,
            backend=backend,
            seed=seed,
            activation=activation,
            type_pi=type_pi,
            replications=replications,
            kernel=kernel,
            n_clusters=n_clusters,
            clustering_method=clustering_method,
            cluster_scaling=cluster_scaling,
            degree=degree,
            weights_distr=weights_distr,
            base_model=base_model,
            hist=hist,
            bins=bins,
        )
Generic Boosting regressor.
Attributes:
base_model: object
base learner (default is ExtraTreeRegressor) to be boosted.
n_estimators: int
number of boosting iterations.
learning_rate: float
controls the learning speed at training time.
n_hidden_features: int
number of nodes in successive hidden layers.
row_sample: float
percentage of rows chosen from the training set.
col_sample: float
percentage of columns chosen from the training set.
dropout: float
percentage of nodes dropped from the training set.
tolerance: float
controls early stopping in gradient descent (at training time).
direct_link: bool
indicates whether the original features are included (True) in model's
fitting or not (False).
verbose: int
progress bar (yes = 1) or not (no = 0) (currently).
seed: int
reproducibility seed for nodes_sim=='uniform', clustering and dropout.
activation: str
activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
type_pi: str.
type of prediction interval; currently "kde" or "bootstrap".
Used only in `self.predict`, for `self.replications` > 0 and `self.kernel`
in ('gaussian', 'tophat'). Default is `None`.
replications: int.
number of replications (if needed) for predictive simulation.
Used only in `self.predict`, for `self.kernel` in ('gaussian',
'tophat') and `self.type_pi = 'kde'`. Default is `None`.
n_clusters: int
number of clusters for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
degree: int
degree of features interactions to include in the model
weights_distr: str
distribution of weights for constructing the model's hidden layer;
either 'uniform' or 'gaussian'
hist: bool
whether to use histogram features or not
bins: int or str
number of bins for histogram features (same as numpy.histogram, default is 'auto')
class StumpClassifier(BaseEstimator, ClassifierMixin):
    """Stump classifier.

    Attributes:

        bins: int or str
            Number of histogram bins; as in numpy.histogram.
    """

    def __init__(self, bins="auto"):
        self.bins = bins
        self.obj = None  # fitted model returned by `stumpc.fit_stump_classifier`

    def fit(self, X, y, sample_weight=None, **kwargs):
        """Fit Stump to training data (X, y)

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: array-like, shape = [n_samples]
                Target values.

            sample_weight: array_like, shape = [n_samples]
                Observations weights.

            **kwargs: accepted but not used.

        Returns:

            self: object.
        """

        if sample_weight is None:
            # `sample_weight` is omitted (not passed as None) so the backend
            # applies whatever default it defines.
            self.obj = stumpc.fit_stump_classifier(
                X=np.asarray(X, order="C"),
                y=np.asarray(y, order="C"),
                bins=self.bins,
            )
        else:
            self.obj = stumpc.fit_stump_classifier(
                X=np.asarray(X, order="C"),
                y=np.asarray(y, order="C"),
                sample_weight=np.ravel(sample_weight, order="C"),
                bins=self.bins,
            )

        # Set on both paths: the unweighted path previously returned before
        # reaching this line, leaving `n_classes_` undefined.
        self.n_classes_ = len(np.unique(y))  # for compatibility with sklearn
        return self

    def predict(self, X, **kwargs):
        """Predict test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors, where n_samples is the number
                of samples and n_features is the number of features.

            **kwargs: additional parameters to be passed to `predict_proba`

        Returns:

            model predictions: {array-like}; for each row, the column index
            of the largest value returned by `predict_proba`.
        """

        return np.argmax(self.predict_proba(X, **kwargs), axis=1)

    def predict_proba(self, X, **kwargs):
        """Predict probabilities for test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors, where n_samples is the number
                of samples and n_features is the number of features.

            **kwargs: accepted but not used.

        Returns:

            probability estimates for test data: {array-like}
        """

        return stumpc.predict_proba_stump_classifier(
            self.obj, np.asarray(X, order="C")
        )
Stump classifier.
Attributes:
bins: int or str
Number of histogram bins; as in numpy.histogram.
def fit(self, X, y, sample_weight=None, **kwargs):
    """Fit Stump to training data (X, y)

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number
            of samples and n_features is the number of features.

        y: array-like, shape = [n_samples]
            Target values.

        sample_weight: array_like, shape = [n_samples]
            Observations weights.

        **kwargs: accepted but not used.

    Returns:

        self: object.
    """

    if sample_weight is None:
        # `sample_weight` is omitted (not passed as None) so the backend
        # applies whatever default it defines.
        self.obj = stumpc.fit_stump_classifier(
            X=np.asarray(X, order="C"),
            y=np.asarray(y, order="C"),
            bins=self.bins,
        )
    else:
        self.obj = stumpc.fit_stump_classifier(
            X=np.asarray(X, order="C"),
            y=np.asarray(y, order="C"),
            sample_weight=np.ravel(sample_weight, order="C"),
            bins=self.bins,
        )

    # Set on both paths: the unweighted path previously returned before
    # reaching this line, leaving `n_classes_` undefined.
    self.n_classes_ = len(np.unique(y))  # for compatibility with sklearn
    return self
Fit Stump to training data (X, y)
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
sample_weight: array_like, shape = [n_samples]
Observations weights.
Returns:
self: object.
def predict(self, X, **kwargs):
    """Predict test data X.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Test vectors, where n_samples is the number
            of samples and n_features is the number of features.

        **kwargs: additional parameters to be passed to `predict_proba`

    Returns:

        model predictions: {array-like}; for each row, the column index
        of the largest value returned by `predict_proba`.
    """

    class_scores = self.predict_proba(X, **kwargs)
    return np.argmax(class_scores, axis=1)
Predict test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to `predict_proba`
Returns:
model predictions: {array-like}
def predict_proba(self, X, **kwargs):
    """Predict probabilities for test data X.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Test vectors, where n_samples is the number
            of samples and n_features is the number of features.

        **kwargs: accepted but not used.

    Returns:

        probability estimates for test data: {array-like}
    """

    features = np.asarray(X, order="C")
    return stumpc.predict_proba_stump_classifier(self.obj, features)
Predict probabilities for test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional keyword arguments; accepted but not used
by this method.
Returns:
probability estimates for test data: {array-like}
class ElasticNetRegressor(BaseEstimator, RegressorMixin):
    """Elasticnet.

    Attributes:

        reg_lambda: float
            regularization parameter.

        alpha: float
            compromise between L1 and L2 regularization (must be in [0, 1]),
            for `solver` == 'enet'.

        backend: str
            type of backend; must be in ('cpu', 'gpu', 'tpu')

    """

    def __init__(self, reg_lambda=0.1, alpha=0.5, backend="cpu"):
        accelerated = ("gpu", "tpu")
        assert (
            backend in ("cpu",) + accelerated
        ), "`backend` must be in ('cpu', 'gpu', 'tpu')"

        on_windows = platform.system() == "Windows"

        # Accelerated backends are downgraded to 'cpu' on Windows.
        if on_windows and backend in accelerated:
            warnings.warn(
                "No GPU/TPU computing on Windows yet, backend set to 'cpu'"
            )
            backend = "cpu"

        self.reg_lambda = reg_lambda
        self.alpha = alpha
        self.backend = backend

        if self.backend in accelerated:
            for package in ("jax", "jaxlib"):
                check_and_install(package)

    def fit(self, X, y, **kwargs):
        """Fit the elastic net to training data (X, y)

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: array-like, shape = [n_samples]
                Target values.

            **kwargs: accepted but not used.

        Returns:

            self: object.

        """
        fitted = fit_elasticnet(
            X, y, lam=self.reg_lambda, alpha=self.alpha, backend=self.backend
        )
        # Copy the fitted quantities onto the estimator under the same names.
        for name in ("coef_", "y_train_mean", "scaler", "converged"):
            setattr(self, name, getattr(fitted, name))
        return self

    def predict(self, X, **kwargs):
        """Predict test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors, where n_samples is the number
                of samples and n_features is the number of features.

            **kwargs: accepted but not used.

        Returns:

            model predictions: {array-like}

        """
        predictions = predict_elasticnet(X, self, backend=self.backend)
        return predictions
Elasticnet.
Attributes:
reg_lambda: float
regularization parameter.
alpha: float
compromise between L1 and L2 regularization (must be in [0, 1]),
for `solver` == 'enet'.
backend: str
type of backend; must be in ('cpu', 'gpu', 'tpu')
def fit(self, X, y, **kwargs):
    """Fit the elastic net to training data (X, y)

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number
            of samples and n_features is the number of features.

        y: array-like, shape = [n_samples]
            Target values.

        **kwargs: accepted but not used.

    Returns:

        self: object.

    """
    fitted = fit_elasticnet(
        X, y, lam=self.reg_lambda, alpha=self.alpha, backend=self.backend
    )
    # Copy the fitted quantities onto the estimator under the same names.
    for name in ("coef_", "y_train_mean", "scaler", "converged"):
        setattr(self, name, getattr(fitted, name))
    return self
Fit the elastic net regressor to training data (X, y)
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
**kwargs: additional keyword arguments; accepted but not used by this method.
Returns:
self: object.
def predict(self, X, **kwargs):
    """Predict test data X.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Test vectors, where n_samples is the number
            of samples and n_features is the number of features.

        **kwargs: accepted but not used.

    Returns:

        model predictions: {array-like}

    """
    predictions = predict_elasticnet(X, self, backend=self.backend)
    return predictions
Predict test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional keyword arguments; accepted but not used by this method.
Returns:
model predictions: {array-like}
class KRLSRegressor(BaseEstimator, RegressorMixin):
    """Kernel-based regularized least squares regressor.

    `fit` standardizes X, builds the pairwise matrix K of `kernel` values on
    the training rows, and solves `(K + regularization * I) coef = y - mean(y)`.

    Attributes:

        regularization: float
            multiple of the identity added to the training kernel matrix.

        kernel: callable
            function of two sample vectors returning a scalar. When `None`,
            the Euclidean distance between the two vectors is used.

        backend: str
            'cpu' uses numpy; any other value uses jax.numpy.
    """

    def __init__(self, regularization=0.1, kernel=None, backend="cpu"):

        if kernel is None:
            if backend == "cpu":

                def kernel(x, y):
                    return np.sqrt(np.sum(np.square(x - y)))

            else:

                def kernel(x, y):
                    # `device_put` returns the transferred array; it does not
                    # move its argument in place, so the result must be bound.
                    x = device_put(x)
                    y = device_put(y)
                    return jnp.sqrt(jnp.sum(jnp.square(x - y)))

        self.backend = backend
        self.kernel = kernel
        self.regularization = regularization
        self.ym_ = None  # mean of the training response
        self.scaler_ = StandardScaler()
        self.X_ = None  # standardized training features
        self.coef_ = None  # solution of the regularized linear system

    def fit(self, X, y):
        """Fit the model to training data (X, y).

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Training vectors.

            y: array-like, shape = [n_samples]
                Target values.

        Returns:

            self: object.
        """
        self.ym_ = np.mean(y)
        centered_y = y - self.ym_
        X_ = self.scaler_.fit_transform(X)
        n_obs = X_.shape[0]
        if self.backend == "cpu":
            K = compute_kernel_matrix(
                X_, self.kernel
            ) + self.regularization * np.eye(n_obs)
            self.coef_ = np.linalg.solve(K, centered_y)
        else:
            # Only `centered_y` is moved explicitly: it is consumed solely by
            # `jnp.linalg.solve`. `X_` stays a host array because it is handed
            # to `compute_kernel_matrix` and stored on the estimator; the
            # previous `device_put(X_)` discarded its result and had no effect.
            centered_y = device_put(centered_y)
            K = compute_kernel_matrix(
                X_, self.kernel
            ) + self.regularization * jnp.eye(n_obs)
            self.coef_ = jnp.linalg.solve(K, centered_y)
        self.X_ = X_
        return self

    def predict(self, X):
        """Predict test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors.

        Returns:

            model predictions: {array-like}
        """
        X_ = self.scaler_.transform(X)
        # The earlier non-CPU branch called `device_put` on four values and
        # discarded every result, so it was a no-op and has been removed.
        return (
            compute_kernel_matrix(self.X_, self.kernel, X_) @ self.coef_
            + self.ym_
        )
Base class for all estimators in scikit-learn.
Inheriting from this class provides default implementations of:
- setting and getting parameters used by `GridSearchCV` and friends;
- textual and HTML representation displayed in terminals and IDEs;
- estimator serialization;
- parameters validation;
- data validation;
- feature names validation.
Read more in the :ref:`User Guide <rolling_your_own_estimator>`.
Notes
All estimators should specify all the parameters that can be set
at the class level in their __init__ as explicit keyword
arguments (no *args or **kwargs).
Examples
>>> import numpy as np
>>> from sklearn.base import BaseEstimator
>>> class MyEstimator(BaseEstimator):
... def __init__(self, *, param=1):
... self.param = param
... def fit(self, X, y=None):
... self.is_fitted_ = True
... return self
... def predict(self, X):
... return np.full(shape=X.shape[0], fill_value=self.param)
>>> estimator = MyEstimator(param=2)
>>> estimator.get_params()
{'param': 2}
>>> X = np.array([[1, 2], [2, 3], [3, 4]])
>>> y = np.array([1, 0, 1])
>>> estimator.fit(X, y).predict(X)
array([2, 2, 2])
>>> estimator.set_params(param=3).fit(X, y).predict(X)
array([3, 3, 3])
def fit(self, X, y):
    """Fit the model to training data (X, y).

    Standardizes X, builds the pairwise matrix K of `self.kernel` values on
    the training rows, and solves
    `(K + regularization * I) coef = y - mean(y)`.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Training vectors.

        y: array-like, shape = [n_samples]
            Target values.

    Returns:

        self: object.
    """
    self.ym_ = np.mean(y)
    centered_y = y - self.ym_
    X_ = self.scaler_.fit_transform(X)
    n_obs = X_.shape[0]
    if self.backend == "cpu":
        K = compute_kernel_matrix(
            X_, self.kernel
        ) + self.regularization * np.eye(n_obs)
        self.coef_ = np.linalg.solve(K, centered_y)
    else:
        # `device_put` returns the transferred array, so its result must be
        # bound. Only `centered_y` is moved: it is consumed solely by
        # `jnp.linalg.solve`. `X_` stays a host array because it is handed to
        # `compute_kernel_matrix` and stored on the estimator; the previous
        # `device_put(X_)` discarded its result and had no effect.
        centered_y = device_put(centered_y)
        K = compute_kernel_matrix(
            X_, self.kernel
        ) + self.regularization * jnp.eye(n_obs)
        self.coef_ = jnp.linalg.solve(K, centered_y)
    self.X_ = X_
    return self
class LassoRegressor(BaseEstimator, RegressorMixin):
    """Lasso.

    Attributes:

        reg_lambda: float
            L1 regularization parameter.

        max_iter: int
            number of iterations of lasso shooting algorithm.

        tol: float
            tolerance for convergence of lasso shooting algorithm.

        backend: str
            type of backend; must be in ('cpu', 'gpu', 'tpu').

    """

    def __init__(self, reg_lambda=0.1, max_iter=10, tol=1e-3, backend="cpu"):
        assert backend in (
            "cpu",
            "gpu",
            "tpu",
        ), "`backend` must be in ('cpu', 'gpu', 'tpu')"

        sys_platform = platform.system()

        # Accelerated backends are downgraded to 'cpu' on Windows.
        if (sys_platform == "Windows") and (backend in ("gpu", "tpu")):
            warnings.warn(
                "No GPU/TPU computing on Windows yet, backend set to 'cpu'"
            )
            backend = "cpu"

        self.reg_lambda = reg_lambda
        self.max_iter = max_iter
        self.tol = tol
        self.backend = backend
        if self.backend in ("gpu", "tpu"):
            check_and_install("jax")
            check_and_install("jaxlib")

    def fit(self, X, y, **kwargs):
        """Fit the lasso to training data (X, y)

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: array-like, shape = [n_samples]
                Target values.

            **kwargs: accepted but not used.

        Returns:

            self: object.

        """

        # Coerce once so lists and DataFrames work: the code below relies on
        # ndarray methods and `[None, :]` indexing of the column statistics.
        X = np.asarray(X)

        self.ym, centered_y = mo.center_response(y)
        self.xm = X.mean(axis=0)
        self.xsd = X.std(axis=0)
        self.xsd[self.xsd == 0] = 1  # constant columns: avoid division by zero
        X_ = (X - self.xm[None, :]) / self.xsd[None, :]
        XX = mo.crossprod(X_, backend=self.backend)
        Xy = mo.crossprod(X_, centered_y, backend=self.backend)
        XX2 = 2 * XX
        Xy2 = 2 * Xy

        # Starting point handed to the solver, computed per backend.
        if self.backend == "cpu":
            beta0 = get_beta(X_, centered_y)
        else:
            invXX = jinv(XX + self.reg_lambda * jnp.eye(X_.shape[1]))
            beta0 = mo.safe_sparse_dot(invXX, Xy, backend=self.backend)

        # One-dimensional responses use the 1D solver, everything else the 2D one.
        if len(np.asarray(y).shape) == 1:
            solver = mo.get_beta_1D
        else:
            solver = mo.get_beta_2D

        res = solver(
            beta0=np.asarray(beta0),
            XX2=np.asarray(XX2),
            Xy2=np.asarray(Xy2),
            reg_lambda=self.reg_lambda,
            max_iter=self.max_iter,
            tol=self.tol,
        )
        self.beta = res[0]
        return self

    def predict(self, X, **kwargs):
        """Predict test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors, where n_samples is the number
                of samples and n_features is the number of features.

            **kwargs: accepted but not used.

        Returns:

            model predictions: {array-like}

        """
        X_ = (X - self.xm[None, :]) / self.xsd[None, :]

        if self.backend == "cpu":
            if isinstance(self.ym, float):
                return self.ym + mo.safe_sparse_dot(X_, self.beta)
            return self.ym[None, :] + mo.safe_sparse_dot(X_, self.beta)

        if isinstance(self.ym, float):
            return self.ym + mo.safe_sparse_dot(
                X_, self.beta, backend=self.backend
            )
        return self.ym[None, :] + mo.safe_sparse_dot(
            X_, self.beta, backend=self.backend
        )
Lasso.
Attributes:
reg_lambda: float
L1 regularization parameter.
max_iter: int
number of iterations of lasso shooting algorithm.
tol: float
tolerance for convergence of lasso shooting algorithm.
backend: str
type of backend; must be in ('cpu', 'gpu', 'tpu').
def fit(self, X, y, **kwargs):
    """Fit the lasso to training data (X, y)

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number
            of samples and n_features is the number of features.

        y: array-like, shape = [n_samples]
            Target values.

        **kwargs: accepted but not used.

    Returns:

        self: object.

    """

    # Coerce once so lists and DataFrames work: the code below relies on
    # ndarray methods and `[None, :]` indexing of the column statistics.
    X = np.asarray(X)

    self.ym, centered_y = mo.center_response(y)
    self.xm = X.mean(axis=0)
    self.xsd = X.std(axis=0)
    self.xsd[self.xsd == 0] = 1  # constant columns: avoid division by zero
    X_ = (X - self.xm[None, :]) / self.xsd[None, :]
    XX = mo.crossprod(X_, backend=self.backend)
    Xy = mo.crossprod(X_, centered_y, backend=self.backend)
    XX2 = 2 * XX
    Xy2 = 2 * Xy

    # Starting point handed to the solver, computed per backend.
    if self.backend == "cpu":
        beta0 = get_beta(X_, centered_y)
    else:
        invXX = jinv(XX + self.reg_lambda * jnp.eye(X_.shape[1]))
        beta0 = mo.safe_sparse_dot(invXX, Xy, backend=self.backend)

    # One-dimensional responses use the 1D solver, everything else the 2D one.
    if len(np.asarray(y).shape) == 1:
        solver = mo.get_beta_1D
    else:
        solver = mo.get_beta_2D

    res = solver(
        beta0=np.asarray(beta0),
        XX2=np.asarray(XX2),
        Xy2=np.asarray(Xy2),
        reg_lambda=self.reg_lambda,
        max_iter=self.max_iter,
        tol=self.tol,
    )
    self.beta = res[0]
    return self
Fit the lasso regressor to training data (X, y)
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
**kwargs: additional keyword arguments; accepted but not used by this method.
Returns:
self: object.
def predict(self, X, **kwargs):
    """Predict test data X.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Test vectors, where n_samples is the number
            of samples and n_features is the number of features.

        **kwargs: accepted but not used.

    Returns:

        model predictions: {array-like}

    """
    # Standardize with the column statistics stored by `fit`.
    X_ = (X - self.xm[None, :]) / self.xsd[None, :]

    # A float response mean is added as-is; otherwise it is given a leading
    # axis so it broadcasts across rows.
    if isinstance(self.ym, float):
        intercept = self.ym
    else:
        intercept = self.ym[None, :]

    if self.backend == "cpu":
        return intercept + mo.safe_sparse_dot(X_, self.beta)

    return intercept + mo.safe_sparse_dot(
        X_, self.beta, backend=self.backend
    )
Predict test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional keyword arguments; accepted but not used by this method.
Returns:
model predictions: {array-like}
19class LSBoostRegressor(BaseEstimator, RegressorMixin): 20 """LSBoost regressor. 21 22 Attributes: 23 24 n_estimators: int 25 number of boosting iterations. 26 27 learning_rate: float 28 controls the learning speed at training time. 29 30 n_hidden_features: int 31 number of nodes in successive hidden layers. 32 33 reg_lambda: float 34 L2 regularization parameter for successive errors in the optimizer 35 (at training time). 36 37 alpha: float 38 compromise between L1 and L2 regularization (must be in [0, 1]), 39 for `solver` == 'enet' 40 41 row_sample: float 42 percentage of rows chosen from the training set. 43 44 col_sample: float 45 percentage of columns chosen from the training set. 46 47 dropout: float 48 percentage of nodes dropped from the training set. 49 50 tolerance: float 51 controls early stopping in gradient descent (at training time). 52 53 direct_link: bool 54 indicates whether the original features are included (True) in model's 55 fitting or not (False). 56 57 verbose: int 58 progress bar (yes = 1) or not (no = 0) (currently). 59 60 seed: int 61 reproducibility seed for nodes_sim=='uniform', clustering and dropout. 62 63 backend: str 64 type of backend; must be in ('cpu', 'gpu', 'tpu') 65 66 solver: str 67 type of 'weak' learner; currently in ('ridge', 'lasso') 68 69 activation: str 70 activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh' 71 72 type_pi: str. 73 type of prediction interval; currently "kde" (default) or "bootstrap". 74 Used only in `self.predict`, for `self.replications` > 0 and `self.kernel` 75 in ('gaussian', 'tophat'). Default is `None`. 76 77 replications: int. 78 number of replications (if needed) for predictive simulation. 79 Used only in `self.predict`, for `self.kernel` in ('gaussian', 80 'tophat') and `self.type_pi = 'kde'`. Default is `None`. 
81 82 n_clusters: int 83 number of clusters for clustering the features 84 85 clustering_method: str 86 clustering method: currently 'kmeans', 'gmm' 87 88 cluster_scaling: str 89 scaling method for clustering: currently 'standard', 'robust', 'minmax' 90 91 degree: int 92 degree of features interactions to include in the model 93 94 weights_distr: str 95 distribution of weights for constructing the model's hidden layer; 96 either 'uniform' or 'gaussian' 97 98 hist: bool 99 whether to use histogram features or not 100 101 bins: int or str 102 number of bins for histogram features (same as numpy.histogram, default is 'auto') 103 104 Examples: 105 106 ```python 107 import subprocess 108 import sys 109 import os 110 111 import mlsauce as ms 112 import numpy as np 113 import matplotlib.pyplot as plt 114 from sklearn.datasets import load_diabetes 115 from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score 116 from sklearn.tree import DecisionTreeRegressor 117 from time import time 118 from os import chdir 119 from sklearn import metrics 120 121 regr = DecisionTreeRegressor() 122 123 diabetes = load_diabetes() 124 X = diabetes.data 125 y = diabetes.target 126 # split data into training test and test set 127 np.random.seed(15029) 128 X_train, X_test, y_train, y_test = train_test_split(X, y, 129 test_size=0.2) 130 131 obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9) 132 print(obj.get_params()) 133 start = time() 134 obj.fit(X_train, y_train) 135 print(time()-start) 136 start = time() 137 print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test)))) 138 print(time()-start) 139 140 print(obj.obj['loss']) 141 142 obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9, n_clusters=2) 143 print(obj.get_params()) 144 start = time() 145 obj.fit(X_train, y_train) 146 print(time()-start) 147 start = time() 148 print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test)))) 149 print(time()-start) 150 151 
print(obj.obj['loss']) 152 ``` 153 154 """ 155 156 def __init__( 157 self, 158 n_estimators=100, 159 learning_rate=0.1, 160 n_hidden_features=5, 161 reg_lambda=0.1, 162 alpha=0.5, 163 row_sample=1, 164 col_sample=1, 165 dropout=0, 166 tolerance=1e-4, 167 direct_link=1, 168 verbose=1, 169 seed=123, 170 backend="cpu", 171 solver="ridge", 172 activation="relu", 173 type_pi=None, 174 replications=None, 175 kernel=None, 176 n_clusters=0, 177 clustering_method="kmeans", 178 cluster_scaling="standard", 179 degree=None, 180 weights_distr="uniform", 181 base_model=None, 182 hist=False, 183 bins="auto", 184 ): 185 186 self.base_model = base_model 187 self.hist = hist 188 self.bins = bins 189 self.hist_bins_ = None 190 191 if n_clusters > 0: 192 assert clustering_method in ( 193 "kmeans", 194 "gmm", 195 ), "`clustering_method` must be in ('kmeans', 'gmm')" 196 assert cluster_scaling in ( 197 "standard", 198 "robust", 199 "minmax", 200 ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')" 201 202 assert backend in ( 203 "cpu", 204 "gpu", 205 "tpu", 206 ), "`backend` must be in ('cpu', 'gpu', 'tpu')" 207 208 assert solver in ( 209 "ridge", 210 "lasso", 211 "enet", 212 ), "`solver` must be in ('ridge', 'lasso', 'enet')" 213 214 sys_platform = platform.system() 215 216 if (sys_platform == "Windows") and (backend in ("gpu", "tpu")): 217 warnings.warn( 218 "No GPU/TPU computing on Windows yet, backend set to 'cpu'" 219 ) 220 backend = "cpu" 221 222 self.n_estimators = n_estimators 223 self.learning_rate = learning_rate 224 self.n_hidden_features = n_hidden_features 225 self.reg_lambda = reg_lambda 226 assert alpha >= 0 and alpha <= 1, "`alpha` must be in [0, 1]" 227 self.alpha = alpha 228 self.row_sample = row_sample 229 self.col_sample = col_sample 230 self.dropout = dropout 231 self.tolerance = tolerance 232 self.direct_link = direct_link 233 self.verbose = verbose 234 self.seed = seed 235 self.backend = backend 236 self.obj = None 237 self.solver = solver 238 
self.activation = activation 239 self.type_pi = type_pi 240 self.replications = replications 241 self.kernel = kernel 242 self.n_clusters = n_clusters 243 self.clustering_method = clustering_method 244 self.cluster_scaling = cluster_scaling 245 self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None 246 self.degree = degree 247 self.poly_ = None 248 self.weights_distr = weights_distr 249 if self.backend in ("gpu", "tpu"): 250 check_and_install("jax") 251 check_and_install("jaxlib") 252 253 def fit(self, X, y, **kwargs): 254 """Fit Booster (regressor) to training data (X, y) 255 256 Args: 257 258 X: {array-like}, shape = [n_samples, n_features] 259 Training vectors, where n_samples is the number 260 of samples and n_features is the number of features. 261 262 y: array-like, shape = [n_samples] 263 Target values. 264 265 **kwargs: additional parameters to be passed to self.cook_training_set. 266 267 Returns: 268 269 self: object. 270 """ 271 272 if isinstance(X, pd.DataFrame): 273 X = X.values 274 275 if self.hist == True: 276 X, self.hist_bins_ = get_histo_features(X) 277 278 if isinstance(y, pd.Series): 279 y = y.values.ravel() 280 else: 281 y = np.asarray(y).ravel() 282 283 if self.degree is not None: 284 assert isinstance(self.degree, int), "`degree` must be an integer" 285 self.poly_ = PolynomialFeatures( 286 degree=self.degree, interaction_only=True, include_bias=False 287 ) 288 X = self.poly_.fit_transform(X) 289 290 if self.n_clusters > 0: 291 clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = ( 292 cluster( 293 X, 294 n_clusters=self.n_clusters, 295 method=self.clustering_method, 296 type_scaling=self.cluster_scaling, 297 training=True, 298 seed=self.seed, 299 ) 300 ) 301 X = np.column_stack((X, clustered_X)) 302 303 self.obj = boosterc.fit_booster_regressor( 304 X=np.asarray(X, order="C", dtype=np.float64), 305 y=np.asarray(y, order="C", dtype=np.float64), 306 n_estimators=self.n_estimators, 307 learning_rate=self.learning_rate, 
308 n_hidden_features=self.n_hidden_features, 309 reg_lambda=self.reg_lambda, 310 alpha=self.alpha, 311 row_sample=self.row_sample, 312 col_sample=self.col_sample, 313 dropout=self.dropout, 314 tolerance=self.tolerance, 315 direct_link=self.direct_link, 316 verbose=self.verbose, 317 seed=self.seed, 318 backend=self.backend, 319 solver=self.solver, 320 activation=self.activation, 321 obj=self.base_model, 322 ) 323 324 self.n_estimators = self.obj["n_estimators"] 325 326 self.X_ = X 327 328 self.y_ = y 329 330 return self 331 332 def predict(self, X, level=95, method=None, histo=False, **kwargs): 333 """Predict values for test data X. 334 335 Args: 336 337 X: {array-like}, shape = [n_samples, n_features] 338 Training vectors, where n_samples is the number 339 of samples and n_features is the number of features. 340 341 level: int 342 Level of confidence (default = 95) 343 344 method: str 345 `None`, or 'splitconformal', 'localconformal' 346 prediction (if you specify `return_pi = True`) 347 348 histo: bool 349 whether to use histogram features or not 350 351 **kwargs: additional parameters to be passed to 352 self.cook_test_set 353 354 Returns: 355 356 predicted values estimates for test data: {array-like} 357 """ 358 359 if isinstance(X, pd.DataFrame): 360 X = X.values 361 362 if self.hist == True: 363 X = get_histo_features(X, bins=self.hist_bins_) 364 365 if self.degree is not None: 366 X = self.poly_.transform(X) 367 368 if self.n_clusters > 0: 369 X = np.column_stack( 370 ( 371 X, 372 cluster( 373 X, 374 training=False, 375 scaler=self.scaler_, 376 label_encoder=self.label_encoder_, 377 clusterer=self.clusterer_, 378 seed=self.seed, 379 ), 380 ) 381 ) 382 if "return_pi" in kwargs: 383 assert method in ( 384 "splitconformal", 385 "localconformal", 386 ), "method must be in ('splitconformal', 'localconformal')" 387 self.pi = PredictionInterval( 388 obj=self, 389 method=method, 390 level=level, 391 type_pi=self.type_pi, 392 replications=self.replications, 393 
kernel=self.kernel, 394 ) 395 self.pi.fit(self.X_, self.y_) 396 self.X_ = None 397 self.y_ = None 398 preds = self.pi.predict(X, return_pi=True) 399 return preds 400 # print(f"\n in predict self: {self} \n") 401 # print(f"\n in predict self.obj: {self.obj} \n") 402 # try: 403 return boosterc.predict_booster_regressor( 404 self.obj, 405 np.asarray(X, order="C"), 406 backend=self.backend, 407 ) 408 # except ValueError: 409 # pass 410 411 def update(self, X, y, eta=0.9): 412 """Update model with new data. 413 414 Args: 415 416 X: {array-like}, shape = [n_samples=1, n_features] 417 Training vectors, where n_samples is the number 418 of samples and n_features is the number of features. 419 420 y: float = [n_samples=1] 421 Target value. 422 423 eta: float 424 Inverse power applied to number of observations 425 (defines a learning rate). 426 427 Returns: 428 429 self: object. 430 """ 431 432 if isinstance(X, pd.DataFrame): 433 X = X.values 434 435 if self.degree is not None: 436 X = self.poly_.transform(X) 437 438 if self.n_clusters > 0: 439 X = np.column_stack( 440 ( 441 X, 442 cluster( 443 X, 444 training=False, 445 scaler=self.scaler_, 446 label_encoder=self.label_encoder_, 447 clusterer=self.clusterer_, 448 seed=self.seed, 449 ), 450 ) 451 ) 452 453 self.obj = boosterc.update_booster( 454 self.obj, np.asarray(X, order="C"), np.asarray(y, order="C"), eta 455 ) 456 457 return self
LSBoost regressor.
Attributes:
n_estimators: int
number of boosting iterations.
learning_rate: float
controls the learning speed at training time.
n_hidden_features: int
number of nodes in successive hidden layers.
reg_lambda: float
L2 regularization parameter for successive errors in the optimizer
(at training time).
alpha: float
compromise between L1 and L2 regularization (must be in [0, 1]),
for `solver` == 'enet'
row_sample: float
percentage of rows chosen from the training set.
col_sample: float
percentage of columns chosen from the training set.
dropout: float
percentage of nodes dropped from the training set.
tolerance: float
controls early stopping in gradient descent (at training time).
direct_link: bool
indicates whether the original features are included (True) in model's
fitting or not (False).
verbose: int
progress bar (yes = 1) or not (no = 0) (currently).
seed: int
reproducibility seed for nodes_sim=='uniform', clustering and dropout.
backend: str
type of backend; must be in ('cpu', 'gpu', 'tpu')
solver: str
type of 'weak' learner; currently in ('ridge', 'lasso', 'enet')
activation: str
activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
type_pi: str.
type of prediction interval; currently "kde" (default) or "bootstrap".
Used only in `self.predict`, for `self.replications` > 0 and `self.kernel`
in ('gaussian', 'tophat'). Default is `None`.
replications: int.
number of replications (if needed) for predictive simulation.
Used only in `self.predict`, for `self.kernel` in ('gaussian',
'tophat') and `self.type_pi = 'kde'`. Default is `None`.
n_clusters: int
number of clusters for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
degree: int
degree of features interactions to include in the model
weights_distr: str
distribution of weights for constructing the model's hidden layer;
either 'uniform' or 'gaussian'
hist: bool
whether to use histogram features or not
bins: int or str
number of bins for histogram features (same as numpy.histogram, default is 'auto')
Examples:
import subprocess
import sys
import os
import mlsauce as ms
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from time import time
from os import chdir
from sklearn import metrics
regr = DecisionTreeRegressor()
diabetes = load_diabetes()
X = diabetes.data
y = diabetes.target
# split data into training set and test set
np.random.seed(15029)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2)
obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test))))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9, n_clusters=2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test))))
print(time()-start)
print(obj.obj['loss'])
def fit(self, X, y, **kwargs):
    """Train the boosted regressor on the training data (X, y).

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number
            of samples and n_features is the number of features.
            A pandas DataFrame is reduced to its underlying values.

        y: array-like, shape = [n_samples]
            Target values; flattened to one dimension before fitting.

        **kwargs: accepted for API compatibility; not read by this method.

    Returns:

        self: object.
    """
    features = X.values if isinstance(X, pd.DataFrame) else X

    # Comparison kept as an explicit equality test so that only values equal
    # to True switch the histogram features on.
    # NOTE(review): `self.bins` is not forwarded to get_histo_features here
    # -- confirm whether that is intended.
    if self.hist == True:  # noqa: E712
        features, self.hist_bins_ = get_histo_features(features)

    target = (y.values if isinstance(y, pd.Series) else np.asarray(y)).ravel()

    if self.degree is not None:
        assert isinstance(self.degree, int), "`degree` must be an integer"
        # Interaction-only expansion without a bias column; the fitted
        # transformer is kept on the instance for later reuse.
        self.poly_ = PolynomialFeatures(
            degree=self.degree, interaction_only=True, include_bias=False
        )
        features = self.poly_.fit_transform(features)

    if self.n_clusters > 0:
        cluster_out = cluster(
            features,
            n_clusters=self.n_clusters,
            method=self.clustering_method,
            type_scaling=self.cluster_scaling,
            training=True,
            seed=self.seed,
        )
        (
            cluster_columns,
            self.scaler_,
            self.label_encoder_,
            self.clusterer_,
        ) = cluster_out
        # Cluster-derived columns are appended to the right of the features.
        features = np.column_stack((features, cluster_columns))

    booster_args = dict(
        X=np.asarray(features, order="C", dtype=np.float64),
        y=np.asarray(target, order="C", dtype=np.float64),
        n_estimators=self.n_estimators,
        learning_rate=self.learning_rate,
        n_hidden_features=self.n_hidden_features,
        reg_lambda=self.reg_lambda,
        alpha=self.alpha,
        row_sample=self.row_sample,
        col_sample=self.col_sample,
        dropout=self.dropout,
        tolerance=self.tolerance,
        direct_link=self.direct_link,
        verbose=self.verbose,
        seed=self.seed,
        backend=self.backend,
        solver=self.solver,
        activation=self.activation,
        obj=self.base_model,
    )
    self.obj = boosterc.fit_booster_regressor(**booster_args)

    # The estimator count is overwritten with the value stored in the fitted
    # object (presumably lower when training stops early -- confirm).
    self.n_estimators = self.obj["n_estimators"]

    # Transformed training data is kept for prediction-interval fitting.
    self.X_, self.y_ = features, target

    return self
Fit Booster (regressor) to training data (X, y)
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
**kwargs: additional keyword arguments (accepted, but currently not used by this method).
Returns:
self: object.
def predict(self, X, level=95, method=None, histo=False, **kwargs):
    """Predict values for test data X.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Test vectors, where n_samples is the number
            of samples and n_features is the number of features.
            A pandas DataFrame is reduced to its underlying values.

        level: int
            Level of confidence (default = 95); only used when
            prediction intervals are requested.

        method: str
            'splitconformal' or 'localconformal'; required when
            `return_pi=True` is passed, ignored otherwise.

        histo: bool
            Not read by this method; histogram features are driven by
            the `hist` attribute of the estimator.

        **kwargs: optional keyword arguments. Pass `return_pi=True` to
            obtain prediction intervals instead of point predictions.

    Returns:

        predicted values estimates for test data: {array-like}, or the
        output of `PredictionInterval.predict(X, return_pi=True)` when
        prediction intervals are requested.

    Raises:

        AssertionError: if `return_pi=True` and `method` is not one of
            'splitconformal', 'localconformal'.
    """

    if isinstance(X, pd.DataFrame):
        X = X.values

    # Same explicit equality test as used at training time, so that both
    # code paths agree on when histogram features are active.
    if self.hist == True:  # noqa: E712
        X = get_histo_features(X, bins=self.hist_bins_)

    if self.degree is not None:
        X = self.poly_.transform(X)

    if self.n_clusters > 0:
        X = np.column_stack(
            (
                X,
                cluster(
                    X,
                    training=False,
                    scaler=self.scaler_,
                    label_encoder=self.label_encoder_,
                    clusterer=self.clusterer_,
                    seed=self.seed,
                ),
            )
        )

    # Use the VALUE of `return_pi`, not its mere presence: an explicit
    # `return_pi=False` must yield point predictions.
    if kwargs.get("return_pi", False):
        assert method in (
            "splitconformal",
            "localconformal",
        ), "method must be in ('splitconformal', 'localconformal')"
        self.pi = PredictionInterval(
            obj=self,
            method=method,
            level=level,
            type_pi=self.type_pi,
            replications=self.replications,
            kernel=self.kernel,
        )
        self.pi.fit(self.X_, self.y_)
        # The stored training data is released once the interval model is
        # fitted.
        # NOTE(review): a later interval request then passes None to
        # `self.pi.fit` unless the model was refitted -- confirm intended.
        self.X_ = None
        self.y_ = None
        return self.pi.predict(X, return_pi=True)

    return boosterc.predict_booster_regressor(
        self.obj,
        np.asarray(X, order="C"),
        backend=self.backend,
    )
Predict values for test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test vectors, where n_samples is the number
of samples and n_features is the number of features.
level: int
Level of confidence (default = 95)
method: str
type of prediction interval, 'splitconformal' or 'localconformal'
(required when `return_pi = True` is passed); `None` otherwise
histo: bool
currently not used by this method; histogram features are controlled by the `hist` attribute
**kwargs: optional keyword arguments; pass `return_pi = True` to
obtain prediction intervals
Returns:
predicted values estimates for test data: {array-like}
class RidgeRegressor(BaseEstimator, RegressorMixin):
    """Ridge regression on standardized features.

    Attributes:

        reg_lambda: float
            regularization parameter.

        backend: str
            type of backend; must be in ('cpu', 'gpu', 'tpu')

    """

    def __init__(self, reg_lambda=0.1, backend="cpu"):
        assert backend in (
            "cpu",
            "gpu",
            "tpu",
        ), "`backend` must be in ('cpu', 'gpu', 'tpu')"

        sys_platform = platform.system()

        # Accelerator backends are downgraded to 'cpu' on Windows.
        if (sys_platform == "Windows") and (backend in ("gpu", "tpu")):
            warnings.warn(
                "No GPU/TPU computing on Windows yet, backend set to 'cpu'"
            )
            backend = "cpu"

        self.reg_lambda = reg_lambda
        self.backend = backend
        if self.backend in ("gpu", "tpu"):
            check_and_install("jax")
            check_and_install("jaxlib")

    def fit(self, X, y, **kwargs):
        """Fit the ridge regressor to training data (X, y)

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: array-like, shape = [n_samples]
                Target values.

            **kwargs: accepted for API compatibility; not read by this
                method.

        Returns:

            self: object.

        """
        self.ym, centered_y = mo.center_response(y)
        self.xm = X.mean(axis=0)
        self.xsd = X.std(axis=0)
        self.xsd[self.xsd == 0] = 1  # avoid division by zero
        X_ = (X - self.xm[None, :]) / self.xsd[None, :]

        if self.backend == "cpu":
            # The penalty is expressed as extra rows sqrt(lambda) * I stacked
            # under the standardized features (with zero targets). The
            # stacked matrices get their own names so that `X_` stays the
            # plain standardized design matrix.
            if len(centered_y.shape) <= 1:
                eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1])
                X_aug = np.vstack((X_, eye_term))
                y_aug = np.concatenate((centered_y, np.zeros(X.shape[1])))
                self.beta = get_beta(X_aug, y_aug)
            else:
                try:
                    eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1])
                    X_aug = np.vstack((X_, eye_term))
                    y_aug = np.vstack(
                        (
                            centered_y,
                            np.zeros((eye_term.shape[0], centered_y.shape[1])),
                        )
                    )
                    self.beta = get_beta(X_aug, y_aug)
                except Exception:
                    # Best-effort fallback through the explicit inverse. It
                    # must use the un-augmented `X_`: the stacked matrix has
                    # extra rows that do not line up with `centered_y` and
                    # already carries the penalty.
                    x = inv(
                        mo.crossprod(X_) + self.reg_lambda * np.eye(X_.shape[1])
                    )
                    hat_matrix = mo.tcrossprod(x, X_)
                    self.beta = mo.safe_sparse_dot(hat_matrix, centered_y)
            return self

        x = jinv(
            mo.crossprod(X_, backend=self.backend)
            + self.reg_lambda * jnp.eye(X_.shape[1])
        )
        hat_matrix = mo.tcrossprod(x, X_, backend=self.backend)
        self.beta = mo.safe_sparse_dot(
            hat_matrix, centered_y, backend=self.backend
        )
        return self

    def predict(self, X, **kwargs):
        """Predict test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors, where n_samples is the number
                of samples and n_features is the number of features.

            **kwargs: accepted for API compatibility; not read by this
                method.

        Returns:

            model predictions: {array-like}

        """
        # Standardize with the statistics stored by `fit`.
        X_ = (X - self.xm[None, :]) / self.xsd[None, :]

        if self.backend == "cpu":
            fitted = mo.safe_sparse_dot(X_, self.beta)
        else:
            fitted = mo.safe_sparse_dot(X_, self.beta, backend=self.backend)

        # Zero-dimensional response mean (Python float, NumPy scalar or 0-d
        # array) is added directly; a per-output mean is broadcast over rows.
        if np.ndim(self.ym) == 0:
            return self.ym + fitted
        return self.ym[None, :] + fitted
Ridge.
Attributes:
reg_lambda: float
regularization parameter.
backend: str
type of backend; must be in ('cpu', 'gpu', 'tpu')
def fit(self, X, y, **kwargs):
    """Fit the ridge regressor to training data (X, y)

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number
            of samples and n_features is the number of features.

        y: array-like, shape = [n_samples]
            Target values.

        **kwargs: accepted for API compatibility; not read by this method.

    Returns:

        self: object.

    """
    self.ym, centered_y = mo.center_response(y)
    self.xm = X.mean(axis=0)
    self.xsd = X.std(axis=0)
    self.xsd[self.xsd == 0] = 1  # avoid division by zero
    X_ = (X - self.xm[None, :]) / self.xsd[None, :]

    if self.backend == "cpu":
        # The penalty is expressed as extra rows sqrt(lambda) * I stacked
        # under the standardized features (with zero targets). The stacked
        # matrices get their own names so that `X_` stays the plain
        # standardized design matrix.
        if len(centered_y.shape) <= 1:
            eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1])
            X_aug = np.vstack((X_, eye_term))
            y_aug = np.concatenate((centered_y, np.zeros(X.shape[1])))
            self.beta = get_beta(X_aug, y_aug)
        else:
            try:
                eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1])
                X_aug = np.vstack((X_, eye_term))
                y_aug = np.vstack(
                    (
                        centered_y,
                        np.zeros((eye_term.shape[0], centered_y.shape[1])),
                    )
                )
                self.beta = get_beta(X_aug, y_aug)
            except Exception:
                # Best-effort fallback through the explicit inverse. It must
                # use the un-augmented `X_`: the stacked matrix has extra
                # rows that do not line up with `centered_y` and already
                # carries the penalty.
                x = inv(
                    mo.crossprod(X_) + self.reg_lambda * np.eye(X_.shape[1])
                )
                hat_matrix = mo.tcrossprod(x, X_)
                self.beta = mo.safe_sparse_dot(hat_matrix, centered_y)
        return self

    x = jinv(
        mo.crossprod(X_, backend=self.backend)
        + self.reg_lambda * jnp.eye(X_.shape[1])
    )
    hat_matrix = mo.tcrossprod(x, X_, backend=self.backend)
    self.beta = mo.safe_sparse_dot(
        hat_matrix, centered_y, backend=self.backend
    )
    return self
Fit the ridge regressor to training data (X, y)
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
**kwargs: additional keyword arguments (accepted, but currently not used by this method).
Returns:
self: object.
def predict(self, X, **kwargs):
    """Predict test data X.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Test vectors, where n_samples is the number
            of samples and n_features is the number of features.

        **kwargs: accepted for API compatibility; not read by this method.

    Returns:

        model predictions: {array-like}

    """
    # Standardize with the statistics stored at training time.
    X_ = (X - self.xm[None, :]) / self.xsd[None, :]

    if self.backend == "cpu":
        fitted = mo.safe_sparse_dot(X_, self.beta)
    else:
        fitted = mo.safe_sparse_dot(X_, self.beta, backend=self.backend)

    # Zero-dimensional response mean (Python float, NumPy scalar or 0-d
    # array) is added directly; a per-output mean is broadcast over rows.
    # An `isinstance(..., float)` test would send 0-d arrays to the indexed
    # branch, where `[None, :]` fails.
    if np.ndim(self.ym) == 0:
        return self.ym + fitted
    return self.ym[None, :] + fitted
Predict test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test vectors, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional keyword arguments (accepted, but currently not used by this method)
Returns:
model predictions: {array-like}
89class LazyBoostingClassifier(ClassifierMixin): 90 """ 91 92 Fitting -- almost -- all the classification algorithms 93 and returning their scores. 94 95 Parameters: 96 97 verbose: int, optional (default=0) 98 Any positive number for verbosity. 99 100 ignore_warnings: bool, optional (default=True) 101 When set to True, the warning related to algorigms that are not 102 able to run are ignored. 103 104 custom_metric: function, optional (default=None) 105 When function is provided, models are evaluated based on the custom 106 evaluation metric provided. 107 108 predictions: bool, optional (default=False) 109 When set to True, the predictions of all the models models are 110 returned as data frame. 111 112 sort_by: string, optional (default='Accuracy') 113 Sort models by a metric. Available options are 'Accuracy', 114 'Balanced Accuracy', 'ROC AUC', 'F1 Score' or a custom metric 115 identified by its name and provided by custom_metric. 116 117 random_state: int, optional (default=42) 118 Reproducibiility seed. 119 120 estimators: list, optional (default='all') 121 list of Estimators names or just 'all' for > 90 classifiers 122 (default='all') 123 124 preprocess: bool, preprocessing is done when set to True 125 126 n_jobs: int, when possible, run in parallel 127 For now, only used by individual models that support it. 128 129 n_layers: int, optional (default=3) 130 Number of layers of GenericBoostingClassifiers to be used. 131 132 All the other parameters are the same as GenericBoostingClassifier's. 133 134 Attributes: 135 136 models_: dict-object 137 Returns a dictionary with each model pipeline as value 138 with key as name of models. 139 140 best_model_: object 141 Returns the best model pipeline. 
142 143 Examples 144 145 ```python 146 import os 147 import mlsauce as ms 148 from sklearn.datasets import load_breast_cancer, load_iris, load_wine, load_digits 149 from sklearn.model_selection import train_test_split 150 from time import time 151 152 load_models = [load_breast_cancer, load_iris, load_wine] 153 154 for model in load_models: 155 156 data = model() 157 X = data.data 158 y= data.target 159 160 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 13) 161 162 clf = ms.LazyBoostingClassifier(verbose=1, ignore_warnings=False, 163 custom_metric=None, preprocess=False) 164 165 start = time() 166 models, predictioms = clf.fit(X_train, X_test, y_train, y_test) 167 print(f"\nElapsed: {time() - start} seconds\n") 168 169 print(models) 170 ``` 171 172 """ 173 174 def __init__( 175 self, 176 verbose=0, 177 ignore_warnings=True, 178 custom_metric=None, 179 predictions=False, 180 sort_by="Accuracy", 181 random_state=42, 182 estimators="all", 183 preprocess=False, 184 n_jobs=None, 185 ): 186 self.verbose = verbose 187 self.ignore_warnings = ignore_warnings 188 self.custom_metric = custom_metric 189 self.predictions = predictions 190 self.sort_by = sort_by 191 self.models_ = {} 192 self.best_model_ = None 193 self.random_state = random_state 194 self.estimators = estimators 195 self.preprocess = preprocess 196 self.n_jobs = n_jobs 197 198 def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): 199 """Fit classifiers to X_train and y_train, predict and score on X_test, 200 y_test. 201 202 Parameters: 203 204 X_train: array-like, 205 Training vectors, where rows is the number of samples 206 and columns is the number of features. 207 208 X_test: array-like, 209 Testing vectors, where rows is the number of samples 210 and columns is the number of features. 211 212 y_train: array-like, 213 Training vectors, where rows is the number of samples 214 and columns is the number of features. 
215 216 y_test: array-like, 217 Testing vectors, where rows is the number of samples 218 and columns is the number of features. 219 220 hist: bool, optional (default=False) 221 When set to True, the model is a GenericBoostingClassifier. 222 223 **kwargs: dict, 224 Additional arguments to be passed to the fit GenericBoostingClassifier. 225 226 Returns: 227 228 scores: Pandas DataFrame 229 Returns metrics of all the models in a Pandas DataFrame. 230 231 predictions: Pandas DataFrame 232 Returns predictions of all the models in a Pandas DataFrame. 233 """ 234 Accuracy = [] 235 B_Accuracy = [] 236 ROC_AUC = [] 237 F1 = [] 238 names = [] 239 TIME = [] 240 predictions = {} 241 242 if self.custom_metric is not None: 243 CUSTOM_METRIC = [] 244 245 if isinstance(X_train, np.ndarray): 246 X_train = pd.DataFrame(X_train) 247 X_test = pd.DataFrame(X_test) 248 249 numeric_features = X_train.select_dtypes(include=[np.number]).columns 250 categorical_features = X_train.select_dtypes(include=["object"]).columns 251 252 categorical_low, categorical_high = get_card_split( 253 X_train, categorical_features 254 ) 255 256 if self.preprocess is True: 257 preprocessor = ColumnTransformer( 258 transformers=[ 259 ("numeric", numeric_transformer, numeric_features), 260 ( 261 "categorical_low", 262 categorical_transformer_low, 263 categorical_low, 264 ), 265 ( 266 "categorical_high", 267 categorical_transformer_high, 268 categorical_high, 269 ), 270 ] 271 ) 272 273 # baseline models 274 try: 275 baseline_names = ["RandomForestClassifier", "XGBClassifier"] 276 baseline_models = [RandomForestClassifier(), xgb.XGBClassifier()] 277 except Exception as exception: 278 baseline_names = ["RandomForestClassifier"] 279 baseline_models = [RandomForestClassifier()] 280 281 if self.verbose > 0: 282 print("\n Fitting baseline models...") 283 for name, model in tqdm(zip(baseline_names, baseline_models)): 284 start = time.time() 285 try: 286 model.fit(X_train, y_train) 287 self.models_[name] = model 288 
y_pred = model.predict(X_test) 289 accuracy = accuracy_score(y_test, y_pred, normalize=True) 290 b_accuracy = balanced_accuracy_score(y_test, y_pred) 291 f1 = f1_score(y_test, y_pred, average="weighted") 292 try: 293 roc_auc = roc_auc_score(y_test, y_pred) 294 except Exception as exception: 295 roc_auc = None 296 if self.ignore_warnings is False: 297 print("ROC AUC couldn't be calculated for " + name) 298 print(exception) 299 names.append(name) 300 Accuracy.append(accuracy) 301 B_Accuracy.append(b_accuracy) 302 ROC_AUC.append(roc_auc) 303 F1.append(f1) 304 TIME.append(time.time() - start) 305 if self.custom_metric is not None: 306 custom_metric = self.custom_metric(y_test, y_pred) 307 CUSTOM_METRIC.append(custom_metric) 308 if self.verbose > 0: 309 if self.custom_metric is not None: 310 print( 311 { 312 "Model": name, 313 "Accuracy": accuracy, 314 "Balanced Accuracy": b_accuracy, 315 "ROC AUC": roc_auc, 316 "F1 Score": f1, 317 self.custom_metric.__name__: custom_metric, 318 "Time taken": time.time() - start, 319 } 320 ) 321 else: 322 print( 323 { 324 "Model": name, 325 "Accuracy": accuracy, 326 "Balanced Accuracy": b_accuracy, 327 "ROC AUC": roc_auc, 328 "F1 Score": f1, 329 "Time taken": time.time() - start, 330 } 331 ) 332 if self.predictions: 333 predictions[name] = y_pred 334 except Exception as exception: 335 if self.ignore_warnings is False: 336 print(name + " model failed to execute") 337 print(exception) 338 339 if self.estimators == "all": 340 self.classifiers = REGRESSORS + MTASKREGRESSORS 341 else: 342 self.classifiers = [ 343 ("GBoostClassifier(" + est[0] + ")", est[1]()) 344 for est in all_estimators() 345 if ( 346 issubclass(est[1], RegressorMixin) 347 and (est[0] in self.estimators) 348 ) 349 ] + [ 350 ( 351 "GBoostClassifier(MultiTask(" + est[0] + "))", 352 partial(MultiTaskRegressor, regr=est[1]()), 353 ) 354 for est in all_estimators() 355 if ( 356 issubclass(est[1], RegressorMixin) 357 and (est[0] in self.estimators) 358 ) 359 ] 360 361 if 
self.preprocess is True: 362 363 if self.n_jobs is None: 364 365 for name, model in tqdm(self.classifiers): # do parallel exec 366 367 other_args = ( 368 {} 369 ) # use this trick for `random_state` too --> refactor 370 try: 371 if ( 372 "n_jobs" in model().get_params().keys() 373 and name.find("LogisticRegression") == -1 374 ): 375 other_args["n_jobs"] = self.n_jobs 376 except Exception: 377 pass 378 379 start = time.time() 380 381 try: 382 if "random_state" in model().get_params().keys(): 383 if hist is False: 384 fitted_clf = GenericBoostingClassifier( 385 {**other_args, **kwargs}, 386 verbose=self.verbose, 387 base_model=model( 388 random_state=self.random_state 389 ), 390 ) 391 else: 392 fitted_clf = GenericBoostingClassifier( 393 {**other_args, **kwargs}, 394 verbose=self.verbose, 395 base_model=model( 396 random_state=self.random_state 397 ), 398 hist=True, 399 ) 400 401 else: 402 if hist is False: 403 fitted_clf = GenericBoostingClassifier( 404 base_model=model(**kwargs), 405 verbose=self.verbose, 406 ) 407 else: 408 fitted_clf = GenericBoostingClassifier( 409 base_model=model(**kwargs), 410 verbose=self.verbose, 411 hist=True, 412 ) 413 414 if self.verbose > 0: 415 print("\n Fitting boosted " + name + " model...") 416 fitted_clf.fit(X_train, y_train) 417 418 pipe = Pipeline( 419 [ 420 ("preprocessor", preprocessor), 421 ("classifier", fitted_clf), 422 ] 423 ) 424 425 if self.verbose > 0: 426 print("\n Fitting boosted " + name + " model...") 427 pipe.fit(X_train, y_train) 428 self.models_[name] = pipe 429 y_pred = pipe.predict(X_test) 430 accuracy = accuracy_score( 431 y_test, y_pred, normalize=True 432 ) 433 b_accuracy = balanced_accuracy_score(y_test, y_pred) 434 f1 = f1_score(y_test, y_pred, average="weighted") 435 try: 436 roc_auc = roc_auc_score(y_test, y_pred) 437 except Exception as exception: 438 roc_auc = None 439 if self.ignore_warnings is False: 440 print( 441 "ROC AUC couldn't be calculated for " + name 442 ) 443 print(exception) 444 
names.append(name) 445 Accuracy.append(accuracy) 446 B_Accuracy.append(b_accuracy) 447 ROC_AUC.append(roc_auc) 448 F1.append(f1) 449 TIME.append(time.time() - start) 450 if self.custom_metric is not None: 451 custom_metric = self.custom_metric(y_test, y_pred) 452 CUSTOM_METRIC.append(custom_metric) 453 if self.verbose > 0: 454 if self.custom_metric is not None: 455 print( 456 { 457 "Model": name, 458 "Accuracy": accuracy, 459 "Balanced Accuracy": b_accuracy, 460 "ROC AUC": roc_auc, 461 "F1 Score": f1, 462 self.custom_metric.__name__: custom_metric, 463 "Time taken": time.time() - start, 464 } 465 ) 466 else: 467 print( 468 { 469 "Model": name, 470 "Accuracy": accuracy, 471 "Balanced Accuracy": b_accuracy, 472 "ROC AUC": roc_auc, 473 "F1 Score": f1, 474 "Time taken": time.time() - start, 475 } 476 ) 477 if self.predictions: 478 predictions[name] = y_pred 479 except Exception as exception: 480 if self.ignore_warnings is False: 481 print(name + " model failed to execute") 482 print(exception) 483 484 else: 485 486 # train_model(self, name, model, X_train, y_train, X_test, y_test, 487 # use_preprocessing=False, preprocessor=None, 488 # **kwargs): 489 results = Parallel(n_jobs=self.n_jobs)( 490 delayed(self.train_model)( 491 name, 492 model, 493 X_train, 494 y_train, 495 X_test, 496 y_test, 497 use_preprocessing=True, 498 preprocessor=preprocessor, 499 **kwargs, 500 ) 501 for name, model in tqdm(self.classifiers) 502 ) 503 Accuracy = [res["accuracy"] for res in results] 504 B_Accuracy = [res["balanced_accuracy"] for res in results] 505 ROC_AUC = [res["roc_auc"] for res in results] 506 F1 = [res["f1"] for res in results] 507 names = [res["name"] for res in results] 508 TIME = [res["time"] for res in results] 509 if self.custom_metric is not None: 510 CUSTOM_METRIC = [res["custom_metric"] for res in results] 511 if self.predictions: 512 predictions = { 513 res["name"]: res["predictions"] for res in results 514 } 515 516 else: # no preprocessing 517 518 if self.n_jobs is 
None: 519 520 for name, model in tqdm(self.classifiers): # do parallel exec 521 start = time.time() 522 try: 523 if "random_state" in model().get_params().keys(): 524 if hist is False: 525 fitted_clf = GenericBoostingClassifier( 526 base_model=model( 527 random_state=self.random_state 528 ), 529 verbose=self.verbose, 530 **kwargs, 531 ) 532 else: 533 fitted_clf = GenericBoostingClassifier( 534 base_model=model( 535 random_state=self.random_state 536 ), 537 verbose=self.verbose, 538 hist=True, 539 **kwargs, 540 ) 541 542 else: 543 if hist is False: 544 fitted_clf = GenericBoostingClassifier( 545 base_model=model(), 546 verbose=self.verbose, 547 **kwargs, 548 ) 549 else: 550 fitted_clf = GenericBoostingClassifier( 551 base_model=model(), 552 verbose=self.verbose, 553 hist=True, 554 **kwargs, 555 ) 556 557 fitted_clf.fit(X_train, y_train) 558 559 self.models_[name] = fitted_clf 560 y_pred = fitted_clf.predict(X_test) 561 accuracy = accuracy_score( 562 y_test, y_pred, normalize=True 563 ) 564 b_accuracy = balanced_accuracy_score(y_test, y_pred) 565 f1 = f1_score(y_test, y_pred, average="weighted") 566 try: 567 roc_auc = roc_auc_score(y_test, y_pred) 568 except Exception as exception: 569 roc_auc = None 570 if self.ignore_warnings is False: 571 print( 572 "ROC AUC couldn't be calculated for " + name 573 ) 574 print(exception) 575 names.append(name) 576 Accuracy.append(accuracy) 577 B_Accuracy.append(b_accuracy) 578 ROC_AUC.append(roc_auc) 579 F1.append(f1) 580 TIME.append(time.time() - start) 581 if self.custom_metric is not None: 582 custom_metric = self.custom_metric(y_test, y_pred) 583 CUSTOM_METRIC.append(custom_metric) 584 if self.verbose > 0: 585 if self.custom_metric is not None: 586 print( 587 { 588 "Model": name, 589 "Accuracy": accuracy, 590 "Balanced Accuracy": b_accuracy, 591 "ROC AUC": roc_auc, 592 "F1 Score": f1, 593 self.custom_metric.__name__: custom_metric, 594 "Time taken": time.time() - start, 595 } 596 ) 597 else: 598 print( 599 { 600 "Model": name, 
601 "Accuracy": accuracy, 602 "Balanced Accuracy": b_accuracy, 603 "ROC AUC": roc_auc, 604 "F1 Score": f1, 605 "Time taken": time.time() - start, 606 } 607 ) 608 if self.predictions: 609 predictions[name] = y_pred 610 except Exception as exception: 611 if self.ignore_warnings is False: 612 print(name + " model failed to execute") 613 print(exception) 614 615 else: 616 617 results = Parallel(n_jobs=self.n_jobs)( 618 delayed(self.train_model)( 619 name, 620 model, 621 X_train, 622 y_train, 623 X_test, 624 y_test, 625 use_preprocessing=False, 626 **kwargs, 627 ) 628 for name, model in tqdm(self.classifiers) 629 ) 630 Accuracy = [res["accuracy"] for res in results] 631 B_Accuracy = [res["balanced_accuracy"] for res in results] 632 ROC_AUC = [res["roc_auc"] for res in results] 633 F1 = [res["f1"] for res in results] 634 names = [res["name"] for res in results] 635 TIME = [res["time"] for res in results] 636 if self.custom_metric is not None: 637 CUSTOM_METRIC = [res["custom_metric"] for res in results] 638 if self.predictions: 639 predictions = { 640 res["name"]: res["predictions"] for res in results 641 } 642 643 if self.custom_metric is None: 644 scores = pd.DataFrame( 645 { 646 "Model": names, 647 "Accuracy": Accuracy, 648 "Balanced Accuracy": B_Accuracy, 649 "ROC AUC": ROC_AUC, 650 "F1 Score": F1, 651 "Time Taken": TIME, 652 } 653 ) 654 else: 655 scores = pd.DataFrame( 656 { 657 "Model": names, 658 "Accuracy": Accuracy, 659 "Balanced Accuracy": B_Accuracy, 660 "ROC AUC": ROC_AUC, 661 "F1 Score": F1, 662 "Custom metric": CUSTOM_METRIC, 663 "Time Taken": TIME, 664 } 665 ) 666 scores = scores.sort_values(by=self.sort_by, ascending=False).set_index( 667 "Model" 668 ) 669 670 self.best_model_ = self.models_[scores.index[0]] 671 672 if self.predictions: 673 predictions_df = pd.DataFrame.from_dict(predictions) 674 return scores, predictions_df if self.predictions is True else scores 675 676 def get_best_model(self): 677 """ 678 This function returns the best model pipeline 
based on the sort_by metric. 679 680 Returns: 681 682 best_model: object, 683 Returns the best model pipeline based on the sort_by metric. 684 685 """ 686 return self.best_model_ 687 688 def provide_models(self, X_train, X_test, y_train, y_test): 689 """Returns all the model objects trained. If fit hasn't been called yet, 690 then it's called to return the models. 691 692 Parameters: 693 694 X_train: array-like, 695 Training vectors, where rows is the number of samples 696 and columns is the number of features. 697 698 X_test: array-like, 699 Testing vectors, where rows is the number of samples 700 and columns is the number of features. 701 702 y_train: array-like, 703 Training vectors, where rows is the number of samples 704 and columns is the number of features. 705 706 y_test: array-like, 707 Testing vectors, where rows is the number of samples 708 and columns is the number of features. 709 710 Returns: 711 712 models: dict-object, 713 Returns a dictionary with each model's pipeline as value 714 and key = name of the model. 715 """ 716 if len(self.models_.keys()) == 0: 717 self.fit(X_train, X_test, y_train, y_test) 718 719 return self.models_ 720 721 def train_model( 722 self, 723 name, 724 model, 725 X_train, 726 y_train, 727 X_test, 728 y_test, 729 use_preprocessing=False, 730 preprocessor=None, 731 hist=False, 732 **kwargs, 733 ): 734 """ 735 Function to train a single model and return its results. 
736 """ 737 other_args = {} 738 739 # Handle n_jobs parameter 740 try: 741 if ( 742 "n_jobs" in model().get_params().keys() 743 and "LogisticRegression" not in name 744 ): 745 other_args["n_jobs"] = self.n_jobs 746 except Exception: 747 pass 748 749 start = time.time() 750 751 try: 752 # Handle random_state parameter 753 if "random_state" in model().get_params().keys(): 754 if hist is False: 755 fitted_clf = GenericBoostingClassifier( 756 {**other_args, **kwargs}, 757 verbose=self.verbose, 758 base_model=model(random_state=self.random_state), 759 ) 760 else: 761 fitted_clf = GenericBoostingClassifier( 762 {**other_args, **kwargs}, 763 verbose=self.verbose, 764 base_model=model(random_state=self.random_state), 765 hist=True, 766 ) 767 else: 768 if hist is False: 769 fitted_clf = GenericBoostingClassifier( 770 base_model=model(**kwargs), 771 verbose=self.verbose, 772 ) 773 else: 774 fitted_clf = GenericBoostingClassifier( 775 base_model=model(**kwargs), 776 verbose=self.verbose, 777 hist=True, 778 ) 779 780 if self.verbose > 0: 781 print("\n Fitting boosted " + name + " model...") 782 783 fitted_clf.fit(X_train, y_train) 784 785 if use_preprocessing and preprocessor is not None: 786 pipe = Pipeline( 787 [ 788 ("preprocessor", preprocessor), 789 ("classifier", fitted_clf), 790 ] 791 ) 792 if self.verbose > 0: 793 print( 794 "\n Fitting pipeline with preprocessing for " 795 + name 796 + " model..." 797 ) 798 pipe.fit(X_train, y_train) 799 y_pred = pipe.predict(X_test) 800 else: 801 # Case with no preprocessing 802 if self.verbose > 0: 803 print( 804 "\n Fitting model without preprocessing for " 805 + name 806 + " model..." 
807 ) 808 y_pred = fitted_clf.predict(X_test) 809 810 accuracy = accuracy_score(y_test, y_pred, normalize=True) 811 b_accuracy = balanced_accuracy_score(y_test, y_pred) 812 f1 = f1_score(y_test, y_pred, average="weighted") 813 roc_auc = None 814 815 try: 816 roc_auc = roc_auc_score(y_test, y_pred) 817 except Exception as exception: 818 if self.ignore_warnings is False: 819 print("ROC AUC couldn't be calculated for " + name) 820 print(exception) 821 822 custom_metric = None 823 if self.custom_metric is not None: 824 custom_metric = self.custom_metric(y_test, y_pred) 825 826 return { 827 "name": name, 828 "model": fitted_clf if not use_preprocessing else pipe, 829 "accuracy": accuracy, 830 "balanced_accuracy": b_accuracy, 831 "roc_auc": roc_auc, 832 "f1": f1, 833 "custom_metric": custom_metric, 834 "time": time.time() - start, 835 "predictions": y_pred, 836 } 837 except Exception as exception: 838 if self.ignore_warnings is False: 839 print(name + " model failed to execute") 840 print(exception) 841 return None
Fitting -- almost -- all the classification algorithms and returning their scores.
Parameters:
verbose: int, optional (default=0)
Any positive number for verbosity.
ignore_warnings: bool, optional (default=True)
When set to True, the warnings related to algorithms that are not
able to run are ignored.
custom_metric: function, optional (default=None)
When function is provided, models are evaluated based on the custom
evaluation metric provided.
predictions: bool, optional (default=False)
When set to True, the predictions of all the models are
returned as a data frame.
sort_by: string, optional (default='Accuracy')
Sort models by a metric. Available options are 'Accuracy',
'Balanced Accuracy', 'ROC AUC', 'F1 Score' or a custom metric
identified by its name and provided by custom_metric.
random_state: int, optional (default=42)
Reproducibility seed.
estimators: list, optional (default='all')
list of Estimators names or just 'all' for > 90 classifiers
(default='all')
preprocess: bool, preprocessing is done when set to True
n_jobs: int, when possible, run in parallel
For now, only used by individual models that support it.
n_layers: int, optional (default=3)
Number of layers of GenericBoostingClassifiers to be used.
All the other parameters are the same as GenericBoostingClassifier's.
Attributes:
models_: dict-object
Returns a dictionary with each model pipeline as value
with key as name of models.
best_model_: object
Returns the best model pipeline.
Examples
import os
import mlsauce as ms
from sklearn.datasets import load_breast_cancer, load_iris, load_wine, load_digits
from sklearn.model_selection import train_test_split
from time import time
load_models = [load_breast_cancer, load_iris, load_wine]
for model in load_models:
data = model()
X = data.data
y= data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 13)
clf = ms.LazyBoostingClassifier(verbose=1, ignore_warnings=False,
custom_metric=None, preprocess=False)
start = time()
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(f"\nElapsed: {time() - start} seconds\n")
print(models)
def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs):
    """Fit classifiers to X_train and y_train, predict and score on X_test,
    y_test.

    Baseline models (RandomForestClassifier and, when it can be
    instantiated, XGBClassifier) are fitted first. Then every entry of
    `self.classifiers` is wrapped in a GenericBoostingClassifier, fitted
    and scored, either sequentially (`self.n_jobs is None`) or through
    `self.train_model` in parallel. Fitted models are stored in
    `self.models_` and the top-ranked one in `self.best_model_`.

    Parameters:

        X_train: array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.

        X_test: array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        y_train: array-like,
            Training targets, passed as `y` to each model's `fit`.

        y_test: array-like,
            Testing targets, compared with the predictions on X_test.

        hist: bool, optional (default=False)
            Unless it is exactly False, `hist=True` is passed to every
            GenericBoostingClassifier.

        **kwargs: dict,
            Additional arguments used when building each
            GenericBoostingClassifier (or, with preprocessing and a base
            model that has no `random_state`, the base model itself).

    Returns:

        scores: Pandas DataFrame
            Metrics of all the models, indexed by model name and sorted
            by `self.sort_by` in descending order.

        predictions: Pandas DataFrame
            Predictions of all the models when `self.predictions is True`;
            otherwise the `scores` data frame is returned a second time,
            so the result is always a 2-tuple.
    """
    Accuracy = []
    B_Accuracy = []
    ROC_AUC = []
    F1 = []
    names = []
    TIME = []
    CUSTOM_METRIC = []
    predictions = {}

    if isinstance(X_train, np.ndarray):
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)

    numeric_features = X_train.select_dtypes(include=[np.number]).columns
    categorical_features = X_train.select_dtypes(include=["object"]).columns

    categorical_low, categorical_high = get_card_split(
        X_train, categorical_features
    )

    use_preprocessing = self.preprocess is True
    preprocessor = None
    if use_preprocessing:
        preprocessor = ColumnTransformer(
            transformers=[
                ("numeric", numeric_transformer, numeric_features),
                (
                    "categorical_low",
                    categorical_transformer_low,
                    categorical_low,
                ),
                (
                    "categorical_high",
                    categorical_transformer_high,
                    categorical_high,
                ),
            ]
        )

    # `hist=True` is only forwarded when requested, so the booster's own
    # default applies otherwise.
    hist_args = {} if hist is False else {"hist": True}

    def _record(name, y_pred, start):
        """Score `y_pred` against `y_test` and append one row of results."""
        accuracy = accuracy_score(y_test, y_pred, normalize=True)
        b_accuracy = balanced_accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")
        try:
            roc_auc = roc_auc_score(y_test, y_pred)
        except Exception as exception:
            roc_auc = None
            if self.ignore_warnings is False:
                print("ROC AUC couldn't be calculated for " + name)
                print(exception)
        elapsed = time.time() - start

        # Evaluate the custom metric before touching any result list: if
        # it raises, no list is left one element longer than the others.
        custom_metric = None
        if self.custom_metric is not None:
            custom_metric = self.custom_metric(y_test, y_pred)

        names.append(name)
        Accuracy.append(accuracy)
        B_Accuracy.append(b_accuracy)
        ROC_AUC.append(roc_auc)
        F1.append(f1)
        TIME.append(elapsed)
        if self.custom_metric is not None:
            CUSTOM_METRIC.append(custom_metric)

        if self.verbose > 0:
            summary = {
                "Model": name,
                "Accuracy": accuracy,
                "Balanced Accuracy": b_accuracy,
                "ROC AUC": roc_auc,
                "F1 Score": f1,
            }
            if self.custom_metric is not None:
                summary[self.custom_metric.__name__] = custom_metric
            summary["Time taken"] = time.time() - start
            print(summary)

        if self.predictions:
            predictions[name] = y_pred

    def _report_failure(name, exception):
        """Print a failure notice unless warnings are ignored."""
        if self.ignore_warnings is False:
            print(name + " model failed to execute")
            print(exception)

    # baseline models
    try:
        baseline_names = ["RandomForestClassifier", "XGBClassifier"]
        baseline_models = [RandomForestClassifier(), xgb.XGBClassifier()]
    except Exception:
        # xgboost could not be used: fall back to the random forest only
        baseline_names = ["RandomForestClassifier"]
        baseline_models = [RandomForestClassifier()]

    if self.verbose > 0:
        print("\n Fitting baseline models...")
    for name, model in tqdm(zip(baseline_names, baseline_models)):
        start = time.time()
        try:
            model.fit(X_train, y_train)
            self.models_[name] = model
            _record(name, model.predict(X_test), start)
        except Exception as exception:
            _report_failure(name, exception)

    if self.estimators == "all":
        self.classifiers = REGRESSORS + MTASKREGRESSORS
    else:
        selected = [
            est
            for est in all_estimators()
            if (
                issubclass(est[1], RegressorMixin)
                and (est[0] in self.estimators)
            )
        ]
        # Each entry must be a callable returning a fresh estimator: the
        # code below (and `train_model`) calls `model()` and
        # `model(random_state=...)`, so the class itself is stored, not an
        # instance.
        self.classifiers = [
            ("GBoostClassifier(" + est[0] + ")", est[1]) for est in selected
        ] + [
            (
                "GBoostClassifier(MultiTask(" + est[0] + "))",
                partial(MultiTaskRegressor, regr=est[1]()),
            )
            for est in selected
        ]

    if self.n_jobs is None:

        for name, model in tqdm(self.classifiers):
            other_args = {}
            if use_preprocessing:
                # Best-effort probe; a model that cannot be instantiated
                # is reported by the main try block below.
                try:
                    if (
                        "n_jobs" in model().get_params().keys()
                        and name.find("LogisticRegression") == -1
                    ):
                        other_args["n_jobs"] = self.n_jobs
                except Exception:
                    pass

            start = time.time()
            try:
                has_random_state = (
                    "random_state" in model().get_params().keys()
                )
                if use_preprocessing:
                    if has_random_state:
                        # NOTE(review): the merged options are passed as
                        # one positional dict, unchanged from the previous
                        # implementation -- confirm against
                        # GenericBoostingClassifier's signature whether
                        # they should be unpacked with ** instead.
                        fitted_clf = GenericBoostingClassifier(
                            {**other_args, **kwargs},
                            verbose=self.verbose,
                            base_model=model(
                                random_state=self.random_state
                            ),
                            **hist_args,
                        )
                    else:
                        fitted_clf = GenericBoostingClassifier(
                            base_model=model(**kwargs),
                            verbose=self.verbose,
                            **hist_args,
                        )

                    if self.verbose > 0:
                        print("\n Fitting boosted " + name + " model...")
                    # NOTE(review): the classifier is fitted on its own
                    # and then again inside the pipeline, as before --
                    # confirm whether the first fit is needed.
                    fitted_clf.fit(X_train, y_train)

                    fitted = Pipeline(
                        [
                            ("preprocessor", preprocessor),
                            ("classifier", fitted_clf),
                        ]
                    )
                    if self.verbose > 0:
                        print("\n Fitting boosted " + name + " model...")
                    fitted.fit(X_train, y_train)
                else:
                    if has_random_state:
                        base_model = model(random_state=self.random_state)
                    else:
                        base_model = model()
                    fitted = GenericBoostingClassifier(
                        base_model=base_model,
                        verbose=self.verbose,
                        **hist_args,
                        **kwargs,
                    )
                    fitted.fit(X_train, y_train)

                self.models_[name] = fitted
                _record(name, fitted.predict(X_test), start)
            except Exception as exception:
                _report_failure(name, exception)

    else:

        results = Parallel(n_jobs=self.n_jobs)(
            delayed(self.train_model)(
                name,
                model,
                X_train,
                y_train,
                X_test,
                y_test,
                use_preprocessing=use_preprocessing,
                preprocessor=preprocessor,
                hist=hist,
                **kwargs,
            )
            for name, model in tqdm(self.classifiers)
        )
        # Merge into the lists already holding the baseline rows, and
        # register each returned model so `best_model_` can be resolved.
        for res in results:
            if res is None:  # train_model returns None for a failed model
                continue
            self.models_[res["name"]] = res["model"]
            names.append(res["name"])
            Accuracy.append(res["accuracy"])
            B_Accuracy.append(res["balanced_accuracy"])
            ROC_AUC.append(res["roc_auc"])
            F1.append(res["f1"])
            TIME.append(res["time"])
            if self.custom_metric is not None:
                CUSTOM_METRIC.append(res["custom_metric"])
            if self.predictions:
                predictions[res["name"]] = res["predictions"]

    columns = {
        "Model": names,
        "Accuracy": Accuracy,
        "Balanced Accuracy": B_Accuracy,
        "ROC AUC": ROC_AUC,
        "F1 Score": F1,
    }
    if self.custom_metric is not None:
        columns["Custom metric"] = CUSTOM_METRIC
    columns["Time Taken"] = TIME

    scores = pd.DataFrame(columns)
    scores = scores.sort_values(by=self.sort_by, ascending=False).set_index(
        "Model"
    )

    # Nothing to select when every model failed.
    if len(scores.index) > 0:
        self.best_model_ = self.models_[scores.index[0]]

    if self.predictions:
        predictions_df = pd.DataFrame.from_dict(predictions)
    # Always a 2-tuple: callers unpack `scores, predictions = clf.fit(...)`
    # even when predictions were not requested.
    return scores, (predictions_df if self.predictions is True else scores)
Fit classifiers to X_train and y_train, predict and score on X_test, y_test.
Parameters:
X_train: array-like,
Training vectors, where rows is the number of samples
and columns is the number of features.
X_test: array-like,
Testing vectors, where rows is the number of samples
and columns is the number of features.
y_train: array-like,
Training target values (class labels), one per row of X_train.
y_test: array-like,
Testing target values (class labels), one per row of X_test.
hist: bool, optional (default=False)
When set to True, hist=True is passed to each GenericBoostingClassifier.
**kwargs: dict,
Additional arguments to be passed to the fit GenericBoostingClassifier.
Returns:
scores: Pandas DataFrame
Returns metrics of all the models in a Pandas DataFrame.
predictions: Pandas DataFrame
Returns predictions of all the models in a Pandas DataFrame.
def provide_models(self, X_train, X_test, y_train, y_test):
    """Return every trained model, fitting first when none exist yet.

    When `self.models_` is empty, `fit` is called with the four arguments
    in the order given; otherwise they are not used.

    Parameters:

        X_train: array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.

        X_test: array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        y_train: array-like,
            Training targets, forwarded to `fit`.

        y_test: array-like,
            Testing targets, forwarded to `fit`.

    Returns:

        models: dict-object,
            Returns a dictionary with each model's pipeline as value
            and key = name of the model.
    """
    if not self.models_:
        self.fit(X_train, X_test, y_train, y_test)
    return self.models_
Returns all the model objects trained. If fit hasn't been called yet, then it's called to return the models.
Parameters:
X_train: array-like, Training vectors, where rows is the number of samples and columns is the number of features.
X_test: array-like, Testing vectors, where rows is the number of samples and columns is the number of features.
y_train: array-like, Training target values (class labels), one per row of X_train.
y_test: array-like, Testing target values (class labels), one per row of X_test.
Returns:
models: dict-object,
Returns a dictionary with each model's pipeline as value
and key = name of the model.
106class LazyBoostingMTS(ns.MTS): 107 """ 108 109 Fitting -- almost -- all the regression algorithms with layers of 110 nnetsauce's CustomRegressor to multivariate time series 111 and returning their scores. 112 113 Parameters: 114 115 verbose: int, optional (default=0) 116 Any positive number for verbosity. 117 118 ignore_warnings: bool, optional (default=True) 119 When set to True, the warning related to algorigms that are not 120 able to run are ignored. 121 122 custom_metric: function, optional (default=None) 123 When function is provided, models are evaluated based on the custom 124 evaluation metric provided. 125 126 predictions: bool, optional (default=False) 127 When set to True, the predictions of all the models models are returned as dataframe. 128 129 sort_by: string, optional (default='RMSE') 130 Sort models by a metric. Available options are 'RMSE', 'MAE', 'MPL', 'MPE', 'MAPE', 131 'R-Squared', 'Adjusted R-Squared' or a custom metric identified by its name and 132 provided by custom_metric. 133 134 random_state: int, optional (default=42) 135 Reproducibiility seed. 136 137 estimators: list, optional (default='all') 138 list of Estimators (regression algorithms) names or just 'all' (default='all') 139 140 preprocess: bool, preprocessing is done when set to True 141 142 h: int, optional (default=None) 143 Number of steps ahead to predict (when used, must be > 0 and < X_test.shape[0]). 144 145 All the other parameters are the same as MTS's. 146 147 Attributes: 148 149 models_: dict-object 150 Returns a dictionary with each model pipeline as value 151 with key as name of models. 152 153 best_model_: object 154 Returns the best model pipeline based on the sort_by metric. 
155 156 Examples: 157 158 See https://thierrymoudiki.github.io/blog/2023/10/29/python/quasirandomizednn/MTS-LazyPredict 159 160 """ 161 162 def __init__( 163 self, 164 verbose=0, 165 ignore_warnings=True, 166 custom_metric=None, 167 predictions=False, 168 sort_by=None, # leave it as is 169 random_state=42, 170 estimators="all", 171 preprocess=False, 172 h=None, 173 # MTS attributes 174 obj=None, 175 n_hidden_features=5, 176 activation_name="relu", 177 a=0.01, 178 nodes_sim="sobol", 179 bias=True, 180 dropout=0, 181 direct_link=True, 182 n_clusters=2, 183 cluster_encode=True, 184 type_clust="kmeans", 185 type_scaling=("std", "std", "std"), 186 lags=15, 187 type_pi="scp2-kde", 188 block_size=None, 189 replications=None, 190 kernel=None, 191 agg="mean", 192 seed=123, 193 backend="cpu", 194 show_progress=False, 195 ): 196 self.verbose = verbose 197 self.ignore_warnings = ignore_warnings 198 self.custom_metric = custom_metric 199 self.predictions = predictions 200 self.sort_by = sort_by 201 self.models_ = {} 202 self.best_model_ = None 203 self.random_state = random_state 204 self.estimators = estimators 205 self.preprocess = preprocess 206 self.h = h 207 super().__init__( 208 obj=obj, 209 n_hidden_features=n_hidden_features, 210 activation_name=activation_name, 211 a=a, 212 nodes_sim=nodes_sim, 213 bias=bias, 214 dropout=dropout, 215 direct_link=direct_link, 216 n_clusters=n_clusters, 217 cluster_encode=cluster_encode, 218 type_clust=type_clust, 219 type_scaling=type_scaling, 220 seed=seed, 221 backend=backend, 222 lags=lags, 223 type_pi=type_pi, 224 block_size=block_size, 225 replications=replications, 226 kernel=kernel, 227 agg=agg, 228 verbose=verbose, 229 show_progress=show_progress, 230 ) 231 if self.replications is not None or self.type_pi == "gaussian": 232 if self.sort_by is None: 233 self.sort_by = "WINKLERSCORE" 234 else: 235 if self.sort_by is None: 236 self.sort_by = "RMSE" 237 238 def fit(self, X_train, X_test, xreg=None, per_series=False, **kwargs): 239 
"""Fit Regression algorithms to X_train, predict and score on X_test. 240 241 Parameters: 242 243 X_train: array-like or data frame, 244 Training vectors, where rows is the number of samples 245 and columns is the number of features. 246 247 X_test: array-like or data frame, 248 Testing vectors, where rows is the number of samples 249 and columns is the number of features. 250 251 xreg: array-like, optional (default=None) 252 Additional (external) regressors to be passed to self.obj 253 xreg must be in 'increasing' order (most recent observations last) 254 255 per_series: bool, optional (default=False) 256 When set to True, the metrics are computed series by series. 257 258 **kwargs: dict, optional (default=None) 259 Additional parameters to be passed to `fit` method of `obj`. 260 261 Returns: 262 263 scores: Pandas DataFrame 264 Returns metrics of all the models in a Pandas DataFrame. 265 266 predictions: Pandas DataFrame 267 Returns predictions of all the models in a Pandas DataFrame. 268 269 """ 270 R2 = [] 271 ADJR2 = [] 272 ME = [] 273 MPL = [] 274 RMSE = [] 275 MAE = [] 276 MPE = [] 277 MAPE = [] 278 WINKLERSCORE = [] 279 COVERAGE = [] 280 281 # WIN = [] 282 names = [] 283 TIME = [] 284 predictions = {} 285 286 if self.custom_metric is not None: 287 CUSTOM_METRIC = [] 288 289 if self.h is None: 290 assert X_test is not None, "If h is None, X_test must be provided." 
291 292 if isinstance(X_train, np.ndarray): 293 X_train = pd.DataFrame(X_train) 294 X_test = pd.DataFrame(X_test) 295 296 self.series_names = X_train.columns.tolist() 297 298 X_train = convert_df_to_numeric(X_train) 299 X_test = convert_df_to_numeric(X_test) 300 301 numeric_features = X_train.select_dtypes(include=[np.number]).columns 302 categorical_features = X_train.select_dtypes(include=["object"]).columns 303 304 categorical_low, categorical_high = get_card_split( 305 X_train, categorical_features 306 ) 307 308 if self.preprocess: 309 preprocessor = ColumnTransformer( 310 transformers=[ 311 ("numeric", numeric_transformer, numeric_features), 312 ( 313 "categorical_low", 314 categorical_transformer_low, 315 categorical_low, 316 ), 317 ( 318 "categorical_high", 319 categorical_transformer_high, 320 categorical_high, 321 ), 322 ] 323 ) 324 325 # baselines (Classical MTS) ---- 326 for i, name in enumerate(["ARIMA", "ETS", "Theta", "VAR", "VECM"]): 327 try: 328 start = time.time() 329 regr = ns.ClassicalMTS(model=name) 330 regr.fit(X_train, **kwargs) 331 self.models_[name] = regr 332 if self.h is None: 333 X_pred = regr.predict(h=X_test.shape[0], **kwargs) 334 else: 335 assert self.h > 0, "h must be > 0" 336 X_pred = regr.predict(h=self.h, **kwargs) 337 try: 338 X_test = X_test[0 : self.h, :] 339 except Exception as e: 340 X_test = X_test.iloc[0 : self.h, :] 341 342 if per_series == False: 343 rmse = np.sqrt(np.mean((X_test - X_pred.mean) ** 2)) 344 mae = mean_absolute_error(X_test, X_pred.mean) 345 mpl = mean_pinball_loss(X_test, X_pred.mean) 346 else: 347 rmse = mean_errors( 348 actual=X_test, 349 pred=X_pred, 350 scoring="root_mean_squared_error", 351 per_series=True, 352 ) 353 mae = mean_errors( 354 actual=X_test, 355 pred=X_pred, 356 scoring="mean_absolute_error", 357 per_series=True, 358 ) 359 mpl = mean_errors( 360 actual=X_test, 361 pred=X_pred, 362 scoring="mean_pinball_loss", 363 per_series=True, 364 ) 365 except Exception as exception: 366 continue 367 
368 names.append(name) 369 RMSE.append(rmse) 370 MAE.append(mae) 371 MPL.append(mpl) 372 373 if self.custom_metric is not None: 374 try: 375 if self.h is None: 376 custom_metric = self.custom_metric(X_test, X_pred) 377 else: 378 custom_metric = self.custom_metric(X_test_h, X_pred) 379 CUSTOM_METRIC.append(custom_metric) 380 except Exception as e: 381 custom_metric = np.iinfo(np.float32).max 382 CUSTOM_METRIC.append(np.iinfo(np.float32).max) 383 384 if (self.replications is not None) or (self.type_pi == "gaussian"): 385 if per_series == False: 386 winklerscore = winkler_score( 387 obj=X_pred, actual=X_test, level=95 388 ) 389 coveragecalc = coverage(X_pred, X_test, level=95) 390 else: 391 winklerscore = winkler_score( 392 obj=X_pred, actual=X_test, level=95, per_series=True 393 ) 394 coveragecalc = coverage( 395 X_pred, X_test, level=95, per_series=True 396 ) 397 WINKLERSCORE.append(winklerscore) 398 COVERAGE.append(coveragecalc) 399 TIME.append(time.time() - start) 400 401 if self.estimators == "all": 402 self.regressors = MTSREGRESSORS 403 else: 404 self.regressors = [ 405 ("MTS(GenericBooster(" + est[0] + "))", est[1]) 406 for est in all_estimators() 407 if ( 408 issubclass(est[1], RegressorMixin) 409 and (est[0] in self.estimators) 410 ) 411 ] 412 413 if self.preprocess is True: 414 for name, model in tqdm(self.regressors): # do parallel exec 415 start = time.time() 416 try: 417 if "random_state" in model().get_params().keys(): 418 pipe = Pipeline( 419 steps=[ 420 ("preprocessor", preprocessor), 421 ( 422 "regressor", 423 ns.MTS( 424 obj=GenericBoostingRegressor( 425 model( 426 random_state=self.random_state, 427 **kwargs, 428 ) 429 ), 430 n_hidden_features=self.n_hidden_features, 431 activation_name=self.activation_name, 432 a=self.a, 433 nodes_sim=self.nodes_sim, 434 bias=self.bias, 435 dropout=self.dropout, 436 direct_link=self.direct_link, 437 n_clusters=self.n_clusters, 438 cluster_encode=self.cluster_encode, 439 type_clust=self.type_clust, 440 
type_scaling=self.type_scaling, 441 lags=self.lags, 442 type_pi=self.type_pi, 443 block_size=self.block_size, 444 replications=self.replications, 445 kernel=self.kernel, 446 agg=self.agg, 447 seed=self.seed, 448 backend=self.backend, 449 show_progress=self.show_progress, 450 ), 451 ), 452 ] 453 ) 454 else: # "random_state" in model().get_params().keys() 455 pipe = Pipeline( 456 steps=[ 457 ("preprocessor", preprocessor), 458 ( 459 "regressor", 460 ns.MTS( 461 obj=GenericBoostingRegressor( 462 model(**kwargs) 463 ), 464 n_hidden_features=self.n_hidden_features, 465 activation_name=self.activation_name, 466 a=self.a, 467 nodes_sim=self.nodes_sim, 468 bias=self.bias, 469 dropout=self.dropout, 470 direct_link=self.direct_link, 471 n_clusters=self.n_clusters, 472 cluster_encode=self.cluster_encode, 473 type_clust=self.type_clust, 474 type_scaling=self.type_scaling, 475 lags=self.lags, 476 type_pi=self.type_pi, 477 block_size=self.block_size, 478 replications=self.replications, 479 kernel=self.kernel, 480 agg=self.agg, 481 seed=self.seed, 482 backend=self.backend, 483 show_progress=self.show_progress, 484 ), 485 ), 486 ] 487 ) 488 489 pipe.fit(X_train, **kwargs) 490 # pipe.fit(X_train, xreg=xreg) 491 492 self.models_[name] = pipe 493 494 if self.h is None: 495 X_pred = pipe["regressor"].predict(h=self.h, **kwargs) 496 else: 497 assert self.h > 0, "h must be > 0" 498 X_pred = pipe["regressor"].predict(h=self.h, **kwargs) 499 500 if (self.replications is not None) or ( 501 self.type_pi == "gaussian" 502 ): 503 if per_series == False: 504 rmse = np.sqrt(np.mean((X_test - X_pred.mean) ** 2)) 505 mae = mean_absolute_error(X_test, X_pred.mean) 506 mpl = mean_pinball_loss(X_test, X_pred.mean) 507 winklerscore = winkler_score( 508 obj=X_pred, actual=X_test, level=95 509 ) 510 coveragecalc = coverage(X_pred, X_test, level=95) 511 else: 512 rmse = mean_errors( 513 actual=X_test, 514 pred=X_pred, 515 scoring="root_mean_squared_error", 516 per_series=True, 517 ) 518 mae = 
mean_errors( 519 actual=X_test, 520 pred=X_pred, 521 scoring="mean_absolute_error", 522 per_series=True, 523 ) 524 mpl = mean_errors( 525 actual=X_test, 526 pred=X_pred, 527 scoring="mean_pinball_loss", 528 per_series=True, 529 ) 530 winklerscore = winkler_score( 531 obj=X_pred, 532 actual=X_test, 533 level=95, 534 per_series=True, 535 ) 536 coveragecalc = coverage( 537 X_pred, X_test, level=95, per_series=True 538 ) 539 else: 540 if per_series == False: 541 rmse = np.sqrt(np.mean((X_test - X_pred) ** 2)) 542 mae = mean_absolute_error(X_test, X_pred) 543 mpl = mean_pinball_loss(X_test, X_pred) 544 else: 545 rmse = mean_errors( 546 actual=X_test, 547 pred=X_pred, 548 scoring="root_mean_squared_error", 549 per_series=True, 550 ) 551 mae = mean_errors( 552 actual=X_test, 553 pred=X_pred, 554 scoring="mean_absolute_error", 555 per_series=True, 556 ) 557 mpl = mean_errors( 558 actual=X_test, 559 pred=X_pred, 560 scoring="mean_pinball_loss", 561 per_series=True, 562 ) 563 564 names.append(name) 565 RMSE.append(rmse) 566 MAE.append(mae) 567 MPL.append(mpl) 568 569 if (self.replications is not None) or ( 570 self.type_pi == "gaussian" 571 ): 572 WINKLERSCORE.append(winklerscore) 573 COVERAGE.append(coveragecalc) 574 TIME.append(time.time() - start) 575 576 if self.custom_metric is not None: 577 try: 578 custom_metric = self.custom_metric(X_test, X_pred) 579 CUSTOM_METRIC.append(custom_metric) 580 except Exception as e: 581 custom_metric = np.iinfo(np.float32).max 582 CUSTOM_METRIC.append(custom_metric) 583 584 if self.verbose > 0: 585 if (self.replications is not None) or ( 586 self.type_pi == "gaussian" 587 ): 588 scores_verbose = { 589 "Model": name, 590 "RMSE": rmse, 591 "MAE": mae, 592 "MPL": mpl, 593 "WINKLERSCORE": winklerscore, 594 "COVERAGE": coveragecalc, 595 "Time taken": time.time() - start, 596 } 597 else: 598 scores_verbose = { 599 "Model": name, 600 "RMSE": rmse, 601 "MAE": mae, 602 "MPL": mpl, 603 "Time taken": time.time() - start, 604 } 605 606 if 
self.custom_metric is not None: 607 scores_verbose["Custom metric"] = custom_metric 608 609 if self.predictions: 610 predictions[name] = X_pred 611 except Exception as exception: 612 if self.ignore_warnings is False: 613 print(name + " model failed to execute") 614 print(exception) 615 616 else: # no preprocessing 617 618 for name, model in tqdm(self.regressors): # do parallel exec 619 start = time.time() 620 try: 621 if "random_state" in model().get_params().keys(): 622 pipe = ns.MTS( 623 obj=model(random_state=self.random_state, **kwargs), 624 n_hidden_features=self.n_hidden_features, 625 activation_name=self.activation_name, 626 a=self.a, 627 nodes_sim=self.nodes_sim, 628 bias=self.bias, 629 dropout=self.dropout, 630 direct_link=self.direct_link, 631 n_clusters=self.n_clusters, 632 cluster_encode=self.cluster_encode, 633 type_clust=self.type_clust, 634 type_scaling=self.type_scaling, 635 lags=self.lags, 636 type_pi=self.type_pi, 637 block_size=self.block_size, 638 replications=self.replications, 639 kernel=self.kernel, 640 agg=self.agg, 641 seed=self.seed, 642 backend=self.backend, 643 show_progress=self.show_progress, 644 ) 645 else: 646 pipe = ns.MTS( 647 obj=model(**kwargs), 648 n_hidden_features=self.n_hidden_features, 649 activation_name=self.activation_name, 650 a=self.a, 651 nodes_sim=self.nodes_sim, 652 bias=self.bias, 653 dropout=self.dropout, 654 direct_link=self.direct_link, 655 n_clusters=self.n_clusters, 656 cluster_encode=self.cluster_encode, 657 type_clust=self.type_clust, 658 type_scaling=self.type_scaling, 659 lags=self.lags, 660 type_pi=self.type_pi, 661 block_size=self.block_size, 662 replications=self.replications, 663 kernel=self.kernel, 664 agg=self.agg, 665 seed=self.seed, 666 backend=self.backend, 667 show_progress=self.show_progress, 668 ) 669 670 pipe.fit(X_train, xreg, **kwargs) 671 # pipe.fit(X_train, xreg=xreg) # DO xreg like in `ahead` 672 673 self.models_[name] = pipe 674 675 if self.preprocess is True: 676 if self.h is None: 677 
X_pred = pipe["regressor"].predict( 678 h=X_test.shape[0], **kwargs 679 ) 680 else: 681 assert ( 682 self.h > 0 and self.h <= X_test.shape[0] 683 ), "h must be > 0 and < X_test.shape[0]" 684 X_pred = pipe["regressor"].predict( 685 h=self.h, **kwargs 686 ) 687 688 else: 689 690 if self.h is None: 691 X_pred = pipe.predict( 692 h=X_test.shape[0], **kwargs 693 ) # X_pred = pipe.predict(h=X_test.shape[0], new_xreg=new_xreg) ## DO xreg like in `ahead` 694 else: 695 assert ( 696 self.h > 0 and self.h <= X_test.shape[0] 697 ), "h must be > 0 and < X_test.shape[0]" 698 X_pred = pipe.predict(h=self.h, **kwargs) 699 700 if self.h is None: 701 if (self.replications is not None) or ( 702 self.type_pi == "gaussian" 703 ): 704 705 if per_series == True: 706 rmse = mean_errors( 707 actual=X_test, 708 pred=X_pred.mean, 709 scoring="root_mean_squared_error", 710 per_series=True, 711 ) 712 mae = mean_errors( 713 actual=X_test, 714 pred=X_pred.mean, 715 scoring="mean_absolute_error", 716 per_series=True, 717 ) 718 mpl = mean_errors( 719 actual=X_test, 720 pred=X_pred.mean, 721 scoring="mean_pinball_loss", 722 per_series=True, 723 ) 724 winklerscore = winkler_score( 725 obj=X_pred, 726 actual=X_test, 727 level=95, 728 per_series=True, 729 ) 730 coveragecalc = coverage( 731 X_pred, X_test, level=95, per_series=True 732 ) 733 else: 734 rmse = np.sqrt( 735 np.mean((X_test - X_pred.mean) ** 2) 736 ) 737 mae = mean_absolute_error(X_test, X_pred.mean) 738 mpl = mean_pinball_loss(X_test, X_pred.mean) 739 winklerscore = winkler_score( 740 obj=X_pred, actual=X_test, level=95 741 ) 742 coveragecalc = coverage( 743 X_pred, X_test, level=95 744 ) 745 else: # no prediction interval 746 if per_series == True: 747 rmse = mean_errors( 748 actual=X_test, 749 pred=X_pred, 750 scoring="root_mean_squared_error", 751 per_series=True, 752 ) 753 mae = mean_errors( 754 actual=X_test, 755 pred=X_pred, 756 scoring="mean_absolute_error", 757 per_series=True, 758 ) 759 mpl = mean_errors( 760 actual=X_test, 761 
pred=X_pred, 762 scoring="mean_pinball_loss", 763 per_series=True, 764 ) 765 else: 766 rmse = np.sqrt(np.mean((X_test - X_pred) ** 2)) 767 mae = mean_absolute_error(X_test, X_pred) 768 mpl = mean_pinball_loss(X_test, X_pred) 769 else: # self.h is not None 770 if (self.replications is not None) or ( 771 self.type_pi == "gaussian" 772 ): 773 774 if per_series == False: 775 if isinstance(X_test, pd.DataFrame) == False: 776 X_test_h = X_test[0 : self.h, :] 777 rmse = np.sqrt( 778 np.mean((X_test_h - X_pred.mean) ** 2) 779 ) 780 mae = mean_absolute_error( 781 X_test_h, X_pred.mean 782 ) 783 mpl = mean_pinball_loss( 784 X_test_h, X_pred.mean 785 ) 786 winklerscore = winkler_score( 787 obj=X_pred, actual=X_test_h, level=95 788 ) 789 coveragecalc = coverage( 790 X_pred, X_test_h, level=95 791 ) 792 else: 793 X_test_h = X_test.iloc[0 : self.h, :] 794 rmse = np.sqrt( 795 np.mean((X_test_h - X_pred.mean) ** 2) 796 ) 797 mae = mean_absolute_error( 798 X_test_h, X_pred.mean 799 ) 800 mpl = mean_pinball_loss( 801 X_test_h, X_pred.mean 802 ) 803 winklerscore = winkler_score( 804 obj=X_pred, actual=X_test_h, level=95 805 ) 806 coveragecalc = coverage( 807 X_pred, X_test_h, level=95 808 ) 809 else: 810 if isinstance(X_test, pd.DataFrame): 811 X_test_h = X_test.iloc[0 : self.h, :] 812 rmse = mean_errors( 813 actual=X_test_h, 814 pred=X_pred, 815 scoring="root_mean_squared_error", 816 per_series=True, 817 ) 818 mae = mean_errors( 819 actual=X_test_h, 820 pred=X_pred, 821 scoring="mean_absolute_error", 822 per_series=True, 823 ) 824 mpl = mean_errors( 825 actual=X_test_h, 826 pred=X_pred, 827 scoring="mean_pinball_loss", 828 per_series=True, 829 ) 830 winklerscore = winkler_score( 831 obj=X_pred, 832 actual=X_test_h, 833 level=95, 834 per_series=True, 835 ) 836 coveragecalc = coverage( 837 X_pred, 838 X_test_h, 839 level=95, 840 per_series=True, 841 ) 842 else: 843 X_test_h = X_test[0 : self.h, :] 844 rmse = mean_errors( 845 actual=X_test_h, 846 pred=X_pred, 847 
scoring="root_mean_squared_error", 848 per_series=True, 849 ) 850 mae = mean_errors( 851 actual=X_test_h, 852 pred=X_pred, 853 scoring="mean_absolute_error", 854 per_series=True, 855 ) 856 mpl = mean_errors( 857 actual=X_test_h, 858 pred=X_pred, 859 scoring="mean_pinball_loss", 860 per_series=True, 861 ) 862 winklerscore = winkler_score( 863 obj=X_pred, 864 actual=X_test_h, 865 level=95, 866 per_series=True, 867 ) 868 coveragecalc = coverage( 869 X_pred, 870 X_test_h, 871 level=95, 872 per_series=True, 873 ) 874 else: # no prediction interval 875 876 if per_series == False: 877 if isinstance(X_test, pd.DataFrame): 878 X_test_h = X_test.iloc[0 : self.h, :] 879 rmse = np.sqrt( 880 np.mean((X_test_h - X_pred) ** 2) 881 ) 882 mae = mean_absolute_error(X_test_h, X_pred) 883 mpl = mean_pinball_loss(X_test_h, X_pred) 884 else: 885 X_test_h = X_test[0 : self.h, :] 886 rmse = np.sqrt( 887 np.mean((X_test_h - X_pred) ** 2) 888 ) 889 mae = mean_absolute_error(X_test_h, X_pred) 890 mpl = mean_pinball_loss(X_test_h, X_pred) 891 else: 892 if isinstance(X_test, pd.DataFrame): 893 X_test_h = X_test.iloc[0 : self.h, :] 894 rmse = mean_errors( 895 actual=X_test_h, 896 pred=X_pred, 897 scoring="root_mean_squared_error", 898 per_series=True, 899 ) 900 mae = mean_errors( 901 actual=X_test_h, 902 pred=X_pred, 903 scoring="mean_absolute_error", 904 per_series=True, 905 ) 906 mpl = mean_errors( 907 actual=X_test_h, 908 pred=X_pred, 909 scoring="mean_pinball_loss", 910 per_series=True, 911 ) 912 else: 913 X_test_h = X_test[0 : self.h, :] 914 rmse = mean_errors( 915 actual=X_test_h, 916 pred=X_pred, 917 scoring="root_mean_squared_error", 918 per_series=True, 919 ) 920 mae = mean_errors( 921 actual=X_test_h, 922 pred=X_pred, 923 scoring="mean_absolute_error", 924 per_series=True, 925 ) 926 927 names.append(name) 928 RMSE.append(rmse) 929 MAE.append(mae) 930 MPL.append(mpl) 931 if (self.replications is not None) or ( 932 self.type_pi == "gaussian" 933 ): 934 WINKLERSCORE.append(winklerscore) 
935 COVERAGE.append(coveragecalc) 936 TIME.append(time.time() - start) 937 938 if self.custom_metric is not None: 939 try: 940 if self.h is None: 941 custom_metric = self.custom_metric( 942 X_test, X_pred 943 ) 944 else: 945 custom_metric = self.custom_metric( 946 X_test_h, X_pred 947 ) 948 CUSTOM_METRIC.append(custom_metric) 949 except Exception as e: 950 custom_metric = np.iinfo(np.float32).max 951 CUSTOM_METRIC.append(np.iinfo(np.float32).max) 952 953 if self.verbose > 0: 954 if (self.replications is not None) or ( 955 self.type_pi == "gaussian" 956 ): 957 scores_verbose = { 958 "Model": name, 959 "RMSE": rmse, 960 "MAE": mae, 961 "MPL": mpl, 962 "WINKLERSCORE": winklerscore, 963 "COVERAGE": coveragecalc, 964 "Time taken": time.time() - start, 965 } 966 else: 967 scores_verbose = { 968 "Model": name, 969 "RMSE": rmse, 970 "MAE": mae, 971 "MPL": mpl, 972 "Time taken": time.time() - start, 973 } 974 975 if self.custom_metric is not None: 976 scores_verbose["Custom metric"] = custom_metric 977 978 if self.predictions: 979 predictions[name] = X_pred 980 981 except Exception as exception: 982 if self.ignore_warnings is False: 983 print(name + " model failed to execute") 984 print(exception) 985 986 if (self.replications is not None) or (self.type_pi == "gaussian"): 987 scores = { 988 "Model": names, 989 "RMSE": RMSE, 990 "MAE": MAE, 991 "MPL": MPL, 992 "WINKLERSCORE": WINKLERSCORE, 993 "COVERAGE": COVERAGE, 994 "Time Taken": TIME, 995 } 996 else: 997 scores = { 998 "Model": names, 999 "RMSE": RMSE, 1000 "MAE": MAE, 1001 "MPL": MPL, 1002 "Time Taken": TIME, 1003 } 1004 1005 if self.custom_metric is not None: 1006 scores["Custom metric"] = CUSTOM_METRIC 1007 1008 if per_series: 1009 scores = dict_to_dataframe_series(scores, self.series_names) 1010 else: 1011 scores = pd.DataFrame(scores) 1012 1013 try: # case per_series, can't be sorted 1014 scores = scores.sort_values( 1015 by=self.sort_by, ascending=True 1016 ).set_index("Model") 1017 1018 self.best_model_ = 
self.models_[scores.index[0]] 1019 except Exception as e: 1020 pass 1021 1022 if self.predictions is True: 1023 1024 return scores, predictions 1025 1026 return scores 1027 1028 def get_best_model(self): 1029 """ 1030 This function returns the best model pipeline based on the sort_by metric. 1031 1032 Returns: 1033 1034 best_model: object, 1035 Returns the best model pipeline based on the sort_by metric. 1036 1037 """ 1038 return self.best_model_ 1039 1040 def provide_models(self, X_train, X_test): 1041 """ 1042 This function returns all the model objects trained in fit function. 1043 If fit is not called already, then we call fit and then return the models. 1044 1045 Parameters: 1046 1047 X_train : array-like, 1048 Training vectors, where rows is the number of samples 1049 and columns is the number of features. 1050 1051 X_test : array-like, 1052 Testing vectors, where rows is the number of samples 1053 and columns is the number of features. 1054 1055 Returns: 1056 1057 models: dict-object, 1058 Returns a dictionary with each model pipeline as value 1059 with key as name of models. 1060 1061 """ 1062 if self.h is None: 1063 if len(self.models_.keys()) == 0: 1064 self.fit(X_train, X_test) 1065 else: 1066 if len(self.models_.keys()) == 0: 1067 if isinstance(X_test, pd.DataFrame): 1068 self.fit(X_train, X_test.iloc[0 : self.h, :]) 1069 else: 1070 self.fit(X_train, X_test[0 : self.h, :]) 1071 1072 return self.models_
Fitting -- almost -- all the regression algorithms with layers of nnetsauce's CustomRegressor to multivariate time series and returning their scores.
Parameters:
verbose: int, optional (default=0)
Any positive number for verbosity.
ignore_warnings: bool, optional (default=True)
When set to True, warnings related to algorithms that are not
able to run are ignored.
custom_metric: function, optional (default=None)
When function is provided, models are evaluated based on the custom
evaluation metric provided.
predictions: bool, optional (default=False)
When set to True, the predictions of all the models are returned (alongside the scores) as a dictionary keyed by model name.
sort_by: string, optional (default='RMSE')
Sort models by a metric. Available options are 'RMSE', 'MAE', 'MPL', 'MPE', 'MAPE',
'R-Squared', 'Adjusted R-Squared' or a custom metric identified by its name and
provided by custom_metric.
random_state: int, optional (default=42)
Reproducibility seed.
estimators: list, optional (default='all')
list of Estimators (regression algorithms) names or just 'all' (default='all')
preprocess: bool, preprocessing is done when set to True
h: int, optional (default=None)
Number of steps ahead to predict (when used, must be > 0 and <= X_test.shape[0]).
All the other parameters are the same as MTS's.
Attributes:
models_: dict-object
Returns a dictionary with each model pipeline as value
with key as name of models.
best_model_: object
Returns the best model pipeline based on the sort_by metric.
Examples:
See https://thierrymoudiki.github.io/blog/2023/10/29/python/quasirandomizednn/MTS-LazyPredict
238 def fit(self, X_train, X_test, xreg=None, per_series=False, **kwargs): 239 """Fit Regression algorithms to X_train, predict and score on X_test. 240 241 Parameters: 242 243 X_train: array-like or data frame, 244 Training vectors, where rows is the number of samples 245 and columns is the number of features. 246 247 X_test: array-like or data frame, 248 Testing vectors, where rows is the number of samples 249 and columns is the number of features. 250 251 xreg: array-like, optional (default=None) 252 Additional (external) regressors to be passed to self.obj 253 xreg must be in 'increasing' order (most recent observations last) 254 255 per_series: bool, optional (default=False) 256 When set to True, the metrics are computed series by series. 257 258 **kwargs: dict, optional (default=None) 259 Additional parameters to be passed to `fit` method of `obj`. 260 261 Returns: 262 263 scores: Pandas DataFrame 264 Returns metrics of all the models in a Pandas DataFrame. 265 266 predictions: Pandas DataFrame 267 Returns predictions of all the models in a Pandas DataFrame. 268 269 """ 270 R2 = [] 271 ADJR2 = [] 272 ME = [] 273 MPL = [] 274 RMSE = [] 275 MAE = [] 276 MPE = [] 277 MAPE = [] 278 WINKLERSCORE = [] 279 COVERAGE = [] 280 281 # WIN = [] 282 names = [] 283 TIME = [] 284 predictions = {} 285 286 if self.custom_metric is not None: 287 CUSTOM_METRIC = [] 288 289 if self.h is None: 290 assert X_test is not None, "If h is None, X_test must be provided." 
291 292 if isinstance(X_train, np.ndarray): 293 X_train = pd.DataFrame(X_train) 294 X_test = pd.DataFrame(X_test) 295 296 self.series_names = X_train.columns.tolist() 297 298 X_train = convert_df_to_numeric(X_train) 299 X_test = convert_df_to_numeric(X_test) 300 301 numeric_features = X_train.select_dtypes(include=[np.number]).columns 302 categorical_features = X_train.select_dtypes(include=["object"]).columns 303 304 categorical_low, categorical_high = get_card_split( 305 X_train, categorical_features 306 ) 307 308 if self.preprocess: 309 preprocessor = ColumnTransformer( 310 transformers=[ 311 ("numeric", numeric_transformer, numeric_features), 312 ( 313 "categorical_low", 314 categorical_transformer_low, 315 categorical_low, 316 ), 317 ( 318 "categorical_high", 319 categorical_transformer_high, 320 categorical_high, 321 ), 322 ] 323 ) 324 325 # baselines (Classical MTS) ---- 326 for i, name in enumerate(["ARIMA", "ETS", "Theta", "VAR", "VECM"]): 327 try: 328 start = time.time() 329 regr = ns.ClassicalMTS(model=name) 330 regr.fit(X_train, **kwargs) 331 self.models_[name] = regr 332 if self.h is None: 333 X_pred = regr.predict(h=X_test.shape[0], **kwargs) 334 else: 335 assert self.h > 0, "h must be > 0" 336 X_pred = regr.predict(h=self.h, **kwargs) 337 try: 338 X_test = X_test[0 : self.h, :] 339 except Exception as e: 340 X_test = X_test.iloc[0 : self.h, :] 341 342 if per_series == False: 343 rmse = np.sqrt(np.mean((X_test - X_pred.mean) ** 2)) 344 mae = mean_absolute_error(X_test, X_pred.mean) 345 mpl = mean_pinball_loss(X_test, X_pred.mean) 346 else: 347 rmse = mean_errors( 348 actual=X_test, 349 pred=X_pred, 350 scoring="root_mean_squared_error", 351 per_series=True, 352 ) 353 mae = mean_errors( 354 actual=X_test, 355 pred=X_pred, 356 scoring="mean_absolute_error", 357 per_series=True, 358 ) 359 mpl = mean_errors( 360 actual=X_test, 361 pred=X_pred, 362 scoring="mean_pinball_loss", 363 per_series=True, 364 ) 365 except Exception as exception: 366 continue 367 
368 names.append(name) 369 RMSE.append(rmse) 370 MAE.append(mae) 371 MPL.append(mpl) 372 373 if self.custom_metric is not None: 374 try: 375 if self.h is None: 376 custom_metric = self.custom_metric(X_test, X_pred) 377 else: 378 custom_metric = self.custom_metric(X_test_h, X_pred) 379 CUSTOM_METRIC.append(custom_metric) 380 except Exception as e: 381 custom_metric = np.iinfo(np.float32).max 382 CUSTOM_METRIC.append(np.iinfo(np.float32).max) 383 384 if (self.replications is not None) or (self.type_pi == "gaussian"): 385 if per_series == False: 386 winklerscore = winkler_score( 387 obj=X_pred, actual=X_test, level=95 388 ) 389 coveragecalc = coverage(X_pred, X_test, level=95) 390 else: 391 winklerscore = winkler_score( 392 obj=X_pred, actual=X_test, level=95, per_series=True 393 ) 394 coveragecalc = coverage( 395 X_pred, X_test, level=95, per_series=True 396 ) 397 WINKLERSCORE.append(winklerscore) 398 COVERAGE.append(coveragecalc) 399 TIME.append(time.time() - start) 400 401 if self.estimators == "all": 402 self.regressors = MTSREGRESSORS 403 else: 404 self.regressors = [ 405 ("MTS(GenericBooster(" + est[0] + "))", est[1]) 406 for est in all_estimators() 407 if ( 408 issubclass(est[1], RegressorMixin) 409 and (est[0] in self.estimators) 410 ) 411 ] 412 413 if self.preprocess is True: 414 for name, model in tqdm(self.regressors): # do parallel exec 415 start = time.time() 416 try: 417 if "random_state" in model().get_params().keys(): 418 pipe = Pipeline( 419 steps=[ 420 ("preprocessor", preprocessor), 421 ( 422 "regressor", 423 ns.MTS( 424 obj=GenericBoostingRegressor( 425 model( 426 random_state=self.random_state, 427 **kwargs, 428 ) 429 ), 430 n_hidden_features=self.n_hidden_features, 431 activation_name=self.activation_name, 432 a=self.a, 433 nodes_sim=self.nodes_sim, 434 bias=self.bias, 435 dropout=self.dropout, 436 direct_link=self.direct_link, 437 n_clusters=self.n_clusters, 438 cluster_encode=self.cluster_encode, 439 type_clust=self.type_clust, 440 
type_scaling=self.type_scaling, 441 lags=self.lags, 442 type_pi=self.type_pi, 443 block_size=self.block_size, 444 replications=self.replications, 445 kernel=self.kernel, 446 agg=self.agg, 447 seed=self.seed, 448 backend=self.backend, 449 show_progress=self.show_progress, 450 ), 451 ), 452 ] 453 ) 454 else: # "random_state" in model().get_params().keys() 455 pipe = Pipeline( 456 steps=[ 457 ("preprocessor", preprocessor), 458 ( 459 "regressor", 460 ns.MTS( 461 obj=GenericBoostingRegressor( 462 model(**kwargs) 463 ), 464 n_hidden_features=self.n_hidden_features, 465 activation_name=self.activation_name, 466 a=self.a, 467 nodes_sim=self.nodes_sim, 468 bias=self.bias, 469 dropout=self.dropout, 470 direct_link=self.direct_link, 471 n_clusters=self.n_clusters, 472 cluster_encode=self.cluster_encode, 473 type_clust=self.type_clust, 474 type_scaling=self.type_scaling, 475 lags=self.lags, 476 type_pi=self.type_pi, 477 block_size=self.block_size, 478 replications=self.replications, 479 kernel=self.kernel, 480 agg=self.agg, 481 seed=self.seed, 482 backend=self.backend, 483 show_progress=self.show_progress, 484 ), 485 ), 486 ] 487 ) 488 489 pipe.fit(X_train, **kwargs) 490 # pipe.fit(X_train, xreg=xreg) 491 492 self.models_[name] = pipe 493 494 if self.h is None: 495 X_pred = pipe["regressor"].predict(h=self.h, **kwargs) 496 else: 497 assert self.h > 0, "h must be > 0" 498 X_pred = pipe["regressor"].predict(h=self.h, **kwargs) 499 500 if (self.replications is not None) or ( 501 self.type_pi == "gaussian" 502 ): 503 if per_series == False: 504 rmse = np.sqrt(np.mean((X_test - X_pred.mean) ** 2)) 505 mae = mean_absolute_error(X_test, X_pred.mean) 506 mpl = mean_pinball_loss(X_test, X_pred.mean) 507 winklerscore = winkler_score( 508 obj=X_pred, actual=X_test, level=95 509 ) 510 coveragecalc = coverage(X_pred, X_test, level=95) 511 else: 512 rmse = mean_errors( 513 actual=X_test, 514 pred=X_pred, 515 scoring="root_mean_squared_error", 516 per_series=True, 517 ) 518 mae = 
mean_errors( 519 actual=X_test, 520 pred=X_pred, 521 scoring="mean_absolute_error", 522 per_series=True, 523 ) 524 mpl = mean_errors( 525 actual=X_test, 526 pred=X_pred, 527 scoring="mean_pinball_loss", 528 per_series=True, 529 ) 530 winklerscore = winkler_score( 531 obj=X_pred, 532 actual=X_test, 533 level=95, 534 per_series=True, 535 ) 536 coveragecalc = coverage( 537 X_pred, X_test, level=95, per_series=True 538 ) 539 else: 540 if per_series == False: 541 rmse = np.sqrt(np.mean((X_test - X_pred) ** 2)) 542 mae = mean_absolute_error(X_test, X_pred) 543 mpl = mean_pinball_loss(X_test, X_pred) 544 else: 545 rmse = mean_errors( 546 actual=X_test, 547 pred=X_pred, 548 scoring="root_mean_squared_error", 549 per_series=True, 550 ) 551 mae = mean_errors( 552 actual=X_test, 553 pred=X_pred, 554 scoring="mean_absolute_error", 555 per_series=True, 556 ) 557 mpl = mean_errors( 558 actual=X_test, 559 pred=X_pred, 560 scoring="mean_pinball_loss", 561 per_series=True, 562 ) 563 564 names.append(name) 565 RMSE.append(rmse) 566 MAE.append(mae) 567 MPL.append(mpl) 568 569 if (self.replications is not None) or ( 570 self.type_pi == "gaussian" 571 ): 572 WINKLERSCORE.append(winklerscore) 573 COVERAGE.append(coveragecalc) 574 TIME.append(time.time() - start) 575 576 if self.custom_metric is not None: 577 try: 578 custom_metric = self.custom_metric(X_test, X_pred) 579 CUSTOM_METRIC.append(custom_metric) 580 except Exception as e: 581 custom_metric = np.iinfo(np.float32).max 582 CUSTOM_METRIC.append(custom_metric) 583 584 if self.verbose > 0: 585 if (self.replications is not None) or ( 586 self.type_pi == "gaussian" 587 ): 588 scores_verbose = { 589 "Model": name, 590 "RMSE": rmse, 591 "MAE": mae, 592 "MPL": mpl, 593 "WINKLERSCORE": winklerscore, 594 "COVERAGE": coveragecalc, 595 "Time taken": time.time() - start, 596 } 597 else: 598 scores_verbose = { 599 "Model": name, 600 "RMSE": rmse, 601 "MAE": mae, 602 "MPL": mpl, 603 "Time taken": time.time() - start, 604 } 605 606 if 
self.custom_metric is not None: 607 scores_verbose["Custom metric"] = custom_metric 608 609 if self.predictions: 610 predictions[name] = X_pred 611 except Exception as exception: 612 if self.ignore_warnings is False: 613 print(name + " model failed to execute") 614 print(exception) 615 616 else: # no preprocessing 617 618 for name, model in tqdm(self.regressors): # do parallel exec 619 start = time.time() 620 try: 621 if "random_state" in model().get_params().keys(): 622 pipe = ns.MTS( 623 obj=model(random_state=self.random_state, **kwargs), 624 n_hidden_features=self.n_hidden_features, 625 activation_name=self.activation_name, 626 a=self.a, 627 nodes_sim=self.nodes_sim, 628 bias=self.bias, 629 dropout=self.dropout, 630 direct_link=self.direct_link, 631 n_clusters=self.n_clusters, 632 cluster_encode=self.cluster_encode, 633 type_clust=self.type_clust, 634 type_scaling=self.type_scaling, 635 lags=self.lags, 636 type_pi=self.type_pi, 637 block_size=self.block_size, 638 replications=self.replications, 639 kernel=self.kernel, 640 agg=self.agg, 641 seed=self.seed, 642 backend=self.backend, 643 show_progress=self.show_progress, 644 ) 645 else: 646 pipe = ns.MTS( 647 obj=model(**kwargs), 648 n_hidden_features=self.n_hidden_features, 649 activation_name=self.activation_name, 650 a=self.a, 651 nodes_sim=self.nodes_sim, 652 bias=self.bias, 653 dropout=self.dropout, 654 direct_link=self.direct_link, 655 n_clusters=self.n_clusters, 656 cluster_encode=self.cluster_encode, 657 type_clust=self.type_clust, 658 type_scaling=self.type_scaling, 659 lags=self.lags, 660 type_pi=self.type_pi, 661 block_size=self.block_size, 662 replications=self.replications, 663 kernel=self.kernel, 664 agg=self.agg, 665 seed=self.seed, 666 backend=self.backend, 667 show_progress=self.show_progress, 668 ) 669 670 pipe.fit(X_train, xreg, **kwargs) 671 # pipe.fit(X_train, xreg=xreg) # DO xreg like in `ahead` 672 673 self.models_[name] = pipe 674 675 if self.preprocess is True: 676 if self.h is None: 677 
X_pred = pipe["regressor"].predict( 678 h=X_test.shape[0], **kwargs 679 ) 680 else: 681 assert ( 682 self.h > 0 and self.h <= X_test.shape[0] 683 ), "h must be > 0 and < X_test.shape[0]" 684 X_pred = pipe["regressor"].predict( 685 h=self.h, **kwargs 686 ) 687 688 else: 689 690 if self.h is None: 691 X_pred = pipe.predict( 692 h=X_test.shape[0], **kwargs 693 ) # X_pred = pipe.predict(h=X_test.shape[0], new_xreg=new_xreg) ## DO xreg like in `ahead` 694 else: 695 assert ( 696 self.h > 0 and self.h <= X_test.shape[0] 697 ), "h must be > 0 and < X_test.shape[0]" 698 X_pred = pipe.predict(h=self.h, **kwargs) 699 700 if self.h is None: 701 if (self.replications is not None) or ( 702 self.type_pi == "gaussian" 703 ): 704 705 if per_series == True: 706 rmse = mean_errors( 707 actual=X_test, 708 pred=X_pred.mean, 709 scoring="root_mean_squared_error", 710 per_series=True, 711 ) 712 mae = mean_errors( 713 actual=X_test, 714 pred=X_pred.mean, 715 scoring="mean_absolute_error", 716 per_series=True, 717 ) 718 mpl = mean_errors( 719 actual=X_test, 720 pred=X_pred.mean, 721 scoring="mean_pinball_loss", 722 per_series=True, 723 ) 724 winklerscore = winkler_score( 725 obj=X_pred, 726 actual=X_test, 727 level=95, 728 per_series=True, 729 ) 730 coveragecalc = coverage( 731 X_pred, X_test, level=95, per_series=True 732 ) 733 else: 734 rmse = np.sqrt( 735 np.mean((X_test - X_pred.mean) ** 2) 736 ) 737 mae = mean_absolute_error(X_test, X_pred.mean) 738 mpl = mean_pinball_loss(X_test, X_pred.mean) 739 winklerscore = winkler_score( 740 obj=X_pred, actual=X_test, level=95 741 ) 742 coveragecalc = coverage( 743 X_pred, X_test, level=95 744 ) 745 else: # no prediction interval 746 if per_series == True: 747 rmse = mean_errors( 748 actual=X_test, 749 pred=X_pred, 750 scoring="root_mean_squared_error", 751 per_series=True, 752 ) 753 mae = mean_errors( 754 actual=X_test, 755 pred=X_pred, 756 scoring="mean_absolute_error", 757 per_series=True, 758 ) 759 mpl = mean_errors( 760 actual=X_test, 761 
pred=X_pred, 762 scoring="mean_pinball_loss", 763 per_series=True, 764 ) 765 else: 766 rmse = np.sqrt(np.mean((X_test - X_pred) ** 2)) 767 mae = mean_absolute_error(X_test, X_pred) 768 mpl = mean_pinball_loss(X_test, X_pred) 769 else: # self.h is not None 770 if (self.replications is not None) or ( 771 self.type_pi == "gaussian" 772 ): 773 774 if per_series == False: 775 if isinstance(X_test, pd.DataFrame) == False: 776 X_test_h = X_test[0 : self.h, :] 777 rmse = np.sqrt( 778 np.mean((X_test_h - X_pred.mean) ** 2) 779 ) 780 mae = mean_absolute_error( 781 X_test_h, X_pred.mean 782 ) 783 mpl = mean_pinball_loss( 784 X_test_h, X_pred.mean 785 ) 786 winklerscore = winkler_score( 787 obj=X_pred, actual=X_test_h, level=95 788 ) 789 coveragecalc = coverage( 790 X_pred, X_test_h, level=95 791 ) 792 else: 793 X_test_h = X_test.iloc[0 : self.h, :] 794 rmse = np.sqrt( 795 np.mean((X_test_h - X_pred.mean) ** 2) 796 ) 797 mae = mean_absolute_error( 798 X_test_h, X_pred.mean 799 ) 800 mpl = mean_pinball_loss( 801 X_test_h, X_pred.mean 802 ) 803 winklerscore = winkler_score( 804 obj=X_pred, actual=X_test_h, level=95 805 ) 806 coveragecalc = coverage( 807 X_pred, X_test_h, level=95 808 ) 809 else: 810 if isinstance(X_test, pd.DataFrame): 811 X_test_h = X_test.iloc[0 : self.h, :] 812 rmse = mean_errors( 813 actual=X_test_h, 814 pred=X_pred, 815 scoring="root_mean_squared_error", 816 per_series=True, 817 ) 818 mae = mean_errors( 819 actual=X_test_h, 820 pred=X_pred, 821 scoring="mean_absolute_error", 822 per_series=True, 823 ) 824 mpl = mean_errors( 825 actual=X_test_h, 826 pred=X_pred, 827 scoring="mean_pinball_loss", 828 per_series=True, 829 ) 830 winklerscore = winkler_score( 831 obj=X_pred, 832 actual=X_test_h, 833 level=95, 834 per_series=True, 835 ) 836 coveragecalc = coverage( 837 X_pred, 838 X_test_h, 839 level=95, 840 per_series=True, 841 ) 842 else: 843 X_test_h = X_test[0 : self.h, :] 844 rmse = mean_errors( 845 actual=X_test_h, 846 pred=X_pred, 847 
scoring="root_mean_squared_error", 848 per_series=True, 849 ) 850 mae = mean_errors( 851 actual=X_test_h, 852 pred=X_pred, 853 scoring="mean_absolute_error", 854 per_series=True, 855 ) 856 mpl = mean_errors( 857 actual=X_test_h, 858 pred=X_pred, 859 scoring="mean_pinball_loss", 860 per_series=True, 861 ) 862 winklerscore = winkler_score( 863 obj=X_pred, 864 actual=X_test_h, 865 level=95, 866 per_series=True, 867 ) 868 coveragecalc = coverage( 869 X_pred, 870 X_test_h, 871 level=95, 872 per_series=True, 873 ) 874 else: # no prediction interval 875 876 if per_series == False: 877 if isinstance(X_test, pd.DataFrame): 878 X_test_h = X_test.iloc[0 : self.h, :] 879 rmse = np.sqrt( 880 np.mean((X_test_h - X_pred) ** 2) 881 ) 882 mae = mean_absolute_error(X_test_h, X_pred) 883 mpl = mean_pinball_loss(X_test_h, X_pred) 884 else: 885 X_test_h = X_test[0 : self.h, :] 886 rmse = np.sqrt( 887 np.mean((X_test_h - X_pred) ** 2) 888 ) 889 mae = mean_absolute_error(X_test_h, X_pred) 890 mpl = mean_pinball_loss(X_test_h, X_pred) 891 else: 892 if isinstance(X_test, pd.DataFrame): 893 X_test_h = X_test.iloc[0 : self.h, :] 894 rmse = mean_errors( 895 actual=X_test_h, 896 pred=X_pred, 897 scoring="root_mean_squared_error", 898 per_series=True, 899 ) 900 mae = mean_errors( 901 actual=X_test_h, 902 pred=X_pred, 903 scoring="mean_absolute_error", 904 per_series=True, 905 ) 906 mpl = mean_errors( 907 actual=X_test_h, 908 pred=X_pred, 909 scoring="mean_pinball_loss", 910 per_series=True, 911 ) 912 else: 913 X_test_h = X_test[0 : self.h, :] 914 rmse = mean_errors( 915 actual=X_test_h, 916 pred=X_pred, 917 scoring="root_mean_squared_error", 918 per_series=True, 919 ) 920 mae = mean_errors( 921 actual=X_test_h, 922 pred=X_pred, 923 scoring="mean_absolute_error", 924 per_series=True, 925 ) 926 927 names.append(name) 928 RMSE.append(rmse) 929 MAE.append(mae) 930 MPL.append(mpl) 931 if (self.replications is not None) or ( 932 self.type_pi == "gaussian" 933 ): 934 WINKLERSCORE.append(winklerscore) 
935 COVERAGE.append(coveragecalc) 936 TIME.append(time.time() - start) 937 938 if self.custom_metric is not None: 939 try: 940 if self.h is None: 941 custom_metric = self.custom_metric( 942 X_test, X_pred 943 ) 944 else: 945 custom_metric = self.custom_metric( 946 X_test_h, X_pred 947 ) 948 CUSTOM_METRIC.append(custom_metric) 949 except Exception as e: 950 custom_metric = np.iinfo(np.float32).max 951 CUSTOM_METRIC.append(np.iinfo(np.float32).max) 952 953 if self.verbose > 0: 954 if (self.replications is not None) or ( 955 self.type_pi == "gaussian" 956 ): 957 scores_verbose = { 958 "Model": name, 959 "RMSE": rmse, 960 "MAE": mae, 961 "MPL": mpl, 962 "WINKLERSCORE": winklerscore, 963 "COVERAGE": coveragecalc, 964 "Time taken": time.time() - start, 965 } 966 else: 967 scores_verbose = { 968 "Model": name, 969 "RMSE": rmse, 970 "MAE": mae, 971 "MPL": mpl, 972 "Time taken": time.time() - start, 973 } 974 975 if self.custom_metric is not None: 976 scores_verbose["Custom metric"] = custom_metric 977 978 if self.predictions: 979 predictions[name] = X_pred 980 981 except Exception as exception: 982 if self.ignore_warnings is False: 983 print(name + " model failed to execute") 984 print(exception) 985 986 if (self.replications is not None) or (self.type_pi == "gaussian"): 987 scores = { 988 "Model": names, 989 "RMSE": RMSE, 990 "MAE": MAE, 991 "MPL": MPL, 992 "WINKLERSCORE": WINKLERSCORE, 993 "COVERAGE": COVERAGE, 994 "Time Taken": TIME, 995 } 996 else: 997 scores = { 998 "Model": names, 999 "RMSE": RMSE, 1000 "MAE": MAE, 1001 "MPL": MPL, 1002 "Time Taken": TIME, 1003 } 1004 1005 if self.custom_metric is not None: 1006 scores["Custom metric"] = CUSTOM_METRIC 1007 1008 if per_series: 1009 scores = dict_to_dataframe_series(scores, self.series_names) 1010 else: 1011 scores = pd.DataFrame(scores) 1012 1013 try: # case per_series, can't be sorted 1014 scores = scores.sort_values( 1015 by=self.sort_by, ascending=True 1016 ).set_index("Model") 1017 1018 self.best_model_ = 
self.models_[scores.index[0]] 1019 except Exception as e: 1020 pass 1021 1022 if self.predictions is True: 1023 1024 return scores, predictions 1025 1026 return scores
Fit Regression algorithms to X_train, predict and score on X_test.
Parameters:
X_train: array-like or data frame,
Training vectors, where rows is the number of samples
and columns is the number of features.
X_test: array-like or data frame,
Testing vectors, where rows is the number of samples
and columns is the number of features.
xreg: array-like, optional (default=None)
Additional (external) regressors to be passed to self.obj
xreg must be in 'increasing' order (most recent observations last)
per_series: bool, optional (default=False)
When set to True, the metrics are computed series by series.
**kwargs: dict, optional (default=None)
Additional parameters to be passed to `fit` method of `obj`.
Returns:
scores: Pandas DataFrame
Returns metrics of all the models in a Pandas DataFrame.
predictions: Pandas DataFrame
Returns predictions of all the models in a Pandas DataFrame.
def provide_models(self, X_train, X_test):
    """
    Return the dictionary of trained models, training them first if needed.

    `fit` is only invoked when `self.models_` is still empty; otherwise the
    stored models are returned untouched.

    Parameters:

        X_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.

        X_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features. When `self.h` is set,
            only its first `self.h` rows are handed over to `fit`.

    Returns:

        models: dict-object,
            Returns a dictionary with each model pipeline as value
            with key as name of models.

    """
    horizon = self.h
    nothing_fitted = len(self.models_) == 0

    if nothing_fitted:
        if horizon is None:
            # No horizon configured: score on the whole test set.
            test_window = X_test
        elif isinstance(X_test, pd.DataFrame):
            # Data frames need positional slicing through .iloc.
            test_window = X_test.iloc[0:horizon, :]
        else:
            test_window = X_test[0:horizon, :]
        self.fit(X_train, test_window)

    return self.models_
This function returns all the model objects trained in fit function. If fit is not called already, then we call fit and then return the models.
Parameters:
X_train : array-like,
Training vectors, where rows is the number of samples
and columns is the number of features.
X_test : array-like,
Testing vectors, where rows is the number of samples
and columns is the number of features.
Returns:
models: dict-object,
Returns a dictionary with each model pipeline as value
with key as name of models.
class LazyBoostingRegressor(RegressorMixin):
    """
    Fitting -- almost -- all the regression algorithms
    and returning their scores.

    Parameters:

        verbose: int, optional (default=0)
            Any positive number for verbosity.

        ignore_warnings: bool, optional (default=True)
            When set to True, failures of individual models are silenced.
            When set to False, the model name and the exception are printed.

        custom_metric: function, optional (default=None)
            When provided, it is called as `custom_metric(y_test, y_pred)`
            for every model and reported in a 'Custom metric' column.

        predictions: bool, optional (default=False)
            When set to True, the predictions of all the models are returned as a data frame.

        sort_by: string, optional (default='RMSE')
            Column of the scores data frame used for the (ascending) sort:
            'R-Squared', 'Adjusted R-Squared', 'RMSE', 'Time Taken', or
            'Custom metric' when `custom_metric` is provided.

        random_state: int, optional (default=42)
            Reproducibility seed. Stored on the instance; the methods of
            this class do not read it.

        estimators: list, optional (default='all')
            list of Estimators names or just 'all' (default='all')

        preprocess: bool
            preprocessing is done when set to True

        n_jobs : int, optional (default=None)
            When None, the boosted models are fitted one after the other.
            Otherwise the value is passed as `n_jobs` to `Parallel` and the
            boosted models are fitted through `train_model`.

    Attributes:

        models_: dict-object
            Returns a dictionary with each model pipeline as value
            with key as name of models.

        best_model_: object
            Returns the best model pipeline based on the sort_by metric.

    Examples:

        ```python
        import os
        import mlsauce as ms
        from sklearn.datasets import load_diabetes
        from sklearn.model_selection import train_test_split

        data = load_diabetes()
        X = data.data
        y= data.target
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)

        regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True,
                                        custom_metric=None, preprocess=True)
        models, predictions = regr.fit(X_train, X_test, y_train, y_test)
        model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
        print(models)
        ```

    """

    def __init__(
        self,
        verbose=0,
        ignore_warnings=True,
        custom_metric=None,
        predictions=False,
        sort_by="RMSE",
        random_state=42,
        estimators="all",
        preprocess=False,
        n_jobs=None,
    ):
        self.verbose = verbose
        self.ignore_warnings = ignore_warnings
        self.custom_metric = custom_metric
        self.predictions = predictions
        self.sort_by = sort_by
        self.models_ = {}
        self.best_model_ = None
        self.random_state = random_state
        self.estimators = estimators
        self.preprocess = preprocess
        self.n_jobs = n_jobs

    def _new_booster(self, regr, hist, **kwargs):
        """Build the (unfitted) boosting wrapper around a fresh `regr()` base model."""
        # `hist is False` (identity, not truthiness) is the historical test.
        if hist is False:
            booster_class = GenericBoostingRegressor
        else:
            booster_class = HistGenericBoostingRegressor
        return booster_class(base_model=regr(), verbose=self.verbose, **kwargs)

    def _evaluate(self, name, fitted_model, X_test, y_test, start):
        """Predict on X_test with a fitted model and return its result dictionary."""
        y_pred = fitted_model.predict(X_test)
        r_squared = r2_score(y_test, y_pred)
        adj_rsquared = adjusted_rsquared(
            r_squared, X_test.shape[0], X_test.shape[1]
        )
        rmse = root_mean_squared_error(y_test, y_pred)
        elapsed = time.time() - start

        # Computed before anything is recorded, so a failing custom metric
        # discards the whole model instead of leaving ragged score columns.
        custom_metric = None
        if self.custom_metric:
            custom_metric = self.custom_metric(y_test, y_pred)

        return {
            "name": name,
            "model": fitted_model,
            "r_squared": r_squared,
            "adj_rsquared": adj_rsquared,
            "rmse": rmse,
            "custom_metric": custom_metric,
            "time": elapsed,
            "predictions": y_pred,
        }

    def _report(self, result, start):
        """Print the per-model score dictionary (verbose mode)."""
        scores_verbose = {
            "Model": result["name"],
            "R-Squared": result["r_squared"],
            "Adjusted R-Squared": result["adj_rsquared"],
            "RMSE": result["rmse"],
            "Time taken": time.time() - start,
        }
        if self.custom_metric:
            scores_verbose["Custom metric"] = result["custom_metric"]
        print(scores_verbose)

    def _log_failure(self, name, exception):
        """Print a failure notice unless `ignore_warnings` asks for silence."""
        if self.ignore_warnings is False:
            print(name + " model failed to execute")
            print(exception)

    def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs):
        """Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test.

        Baseline models (random forest, gradient boosting and, when it can
        be instantiated, XGBoost) are fitted first, then one boosted model
        per entry of `self.regressors`. A model that raises while being
        fitted or scored is skipped.

        Parameters:

            X_train : array-like,
                Training vectors, where rows is the number of samples
                and columns is the number of features.

            X_test : array-like,
                Testing vectors, where rows is the number of samples
                and columns is the number of features.

            y_train : array-like,
                Training target values; flattened to one dimension.

            y_test : array-like,
                Testing target values.

            hist: bool, optional (default=False)
                When set to True, the model is a HistGenericBoostingRegressor.

            **kwargs: dict,
                Additional parameters to be passed to the GenericBoostingRegressor.

        Returns:
        -------
        scores: Pandas DataFrame
            Returns metrics of all the models in a Pandas DataFrame.

        predictions : Pandas DataFrame
            Returns predictions of all the models in a Pandas DataFrame.
            A 2-tuple is always returned: when `self.predictions` is not
            True, the second element is `scores` again (kept for callers
            that unpack two values).

        """
        results = []  # one result dictionary per successfully scored model

        if isinstance(X_train, np.ndarray):
            X_train = pd.DataFrame(X_train)
            X_test = pd.DataFrame(X_test)

        # Works for lists, NumPy arrays and pandas objects alike.
        y_train = np.ravel(y_train)

        numeric_features = X_train.select_dtypes(include=[np.number]).columns
        categorical_features = X_train.select_dtypes(include=["object"]).columns

        categorical_low, categorical_high = get_card_split(
            X_train, categorical_features
        )

        use_preprocessing = self.preprocess is True
        preprocessor = None
        if use_preprocessing:
            preprocessor = ColumnTransformer(
                transformers=[
                    ("numeric", numeric_transformer, numeric_features),
                    (
                        "categorical_low",
                        categorical_transformer_low,
                        categorical_low,
                    ),
                    (
                        "categorical_high",
                        categorical_transformer_high,
                        categorical_high,
                    ),
                ]
            )

        # base models
        try:
            baseline_names = [
                "RandomForestRegressor",
                "XGBRegressor",
                "GradientBoostingRegressor",
            ]
            baseline_models = [
                RandomForestRegressor(),
                xgb.XGBRegressor(),
                GradientBoostingRegressor(),
            ]
        except Exception:
            # XGBoost could not be instantiated: go on without it.
            baseline_names = [
                "RandomForestRegressor",
                "GradientBoostingRegressor",
            ]
            baseline_models = [
                RandomForestRegressor(),
                GradientBoostingRegressor(),
            ]

        if self.verbose > 0:
            print("\n Fitting baseline models...")
        for name, model in tqdm(zip(baseline_names, baseline_models)):
            start = time.time()
            try:
                model.fit(X_train, y_train)
                self.models_[name] = model
                result = self._evaluate(name, model, X_test, y_test, start)
            except Exception as exception:
                self._log_failure(name, exception)
                continue
            results.append(result)
            if self.verbose > 0:
                self._report(result, start)

        if self.estimators == "all":
            self.regressors = REGRESSORS
        else:
            # Keep the estimator *classes*: every call site below does
            # `regr()`, which cannot work on an already-built instance.
            self.regressors = [
                ("GenericBooster(" + est[0] + ")", est[1])
                for est in all_estimators()
                if (
                    issubclass(est[1], RegressorMixin)
                    and (est[0] in self.estimators)
                )
            ]

        if self.n_jobs is None:

            for name, regr in tqdm(self.regressors):
                start = time.time()
                try:
                    model = self._new_booster(regr, hist, **kwargs)
                    if use_preprocessing:
                        # Only the pipeline is fitted: fitting the bare
                        # booster on raw data first is redundant and fails
                        # on columns that need encoding.
                        estimator = Pipeline(
                            steps=[
                                ("preprocessor", preprocessor),
                                ("regressor", model),
                            ]
                        )
                    else:
                        estimator = model

                    if self.verbose > 0:
                        print("\n Fitting boosted " + name + " model...")
                    estimator.fit(X_train, y_train)

                    self.models_[name] = estimator
                    result = self._evaluate(
                        name, estimator, X_test, y_test, start
                    )
                except Exception as exception:
                    self._log_failure(name, exception)
                    continue
                results.append(result)
                if self.verbose > 0:
                    self._report(result, start)

        else:

            parallel_results = Parallel(n_jobs=self.n_jobs)(
                delayed(self.train_model)(
                    name,
                    regr,
                    X_train,
                    y_train,
                    X_test,
                    y_test,
                    use_preprocessing=use_preprocessing,
                    preprocessor=preprocessor,
                    hist=hist,
                    **kwargs,
                )
                for name, regr in tqdm(self.regressors)
            )
            for result in parallel_results:
                if result is None:  # train_model signals a failure with None
                    continue
                # Register the fitted model so that best_model_ and
                # provide_models() can find it, and append to (rather than
                # replace) the baseline results.
                self.models_[result["name"]] = result["model"]
                results.append(result)

        scores = {
            "Model": [result["name"] for result in results],
            "Adjusted R-Squared": [
                result["adj_rsquared"] for result in results
            ],
            "R-Squared": [result["r_squared"] for result in results],
            "RMSE": [result["rmse"] for result in results],
            "Time Taken": [result["time"] for result in results],
        }

        if self.custom_metric:
            scores["Custom metric"] = [
                result["custom_metric"] for result in results
            ]

        scores = pd.DataFrame(scores)
        scores = scores.sort_values(by=self.sort_by, ascending=True).set_index(
            "Model"
        )

        if len(scores.index) > 0:
            self.best_model_ = self.models_[scores.index[0]]
        else:
            # Every model failed: there is nothing to rank.
            self.best_model_ = None

        if self.predictions is True:
            predictions_df = pd.DataFrame.from_dict(
                {result["name"]: result["predictions"] for result in results}
            )
            return scores, predictions_df
        # Historical contract: a 2-tuple is returned in every case.
        return scores, scores

    def get_best_model(self):
        """
        This function returns the best model pipeline based on the sort_by metric.

        Returns:

            best_model: object,
                Returns the best model pipeline based on the sort_by metric
                (None until `fit` has ranked at least one model).

        """
        return self.best_model_

    def provide_models(self, X_train, X_test, y_train, y_test):
        """
        This function returns all the model objects trained in fit function.
        If fit is not called already, then we call fit and then return the models.

        Parameters:

            X_train : array-like,
                Training vectors, where rows is the number of samples
                and columns is the number of features.

            X_test : array-like,
                Testing vectors, where rows is the number of samples
                and columns is the number of features.

            y_train : array-like,
                Training target values.

            y_test : array-like,
                Testing target values.

        Returns:

            models: dict-object,
                Returns a dictionary with each model pipeline as value
                with key as name of models.

        """
        if not self.models_:
            # np.ravel / np.asarray accept NumPy arrays as well as pandas
            # objects; `.values` only exists on the latter.
            self.fit(X_train, X_test, np.ravel(y_train), np.asarray(y_test))

        return self.models_

    def train_model(
        self,
        name,
        regr,
        X_train,
        y_train,
        X_test,
        y_test,
        use_preprocessing=False,
        preprocessor=None,
        hist=False,
        **kwargs,
    ):
        """
        Function to train a single regression model and return its results.

        Parameters:

            name: str, label used in messages and in the result.

            regr: callable returning a fresh base estimator (called as `regr()`).

            X_train, y_train, X_test, y_test: training and testing data.

            use_preprocessing: bool, wrap the booster in a Pipeline when True
                and `preprocessor` is not None.

            preprocessor: first step of that Pipeline.

            hist: bool, use HistGenericBoostingRegressor unless it is False.

            **kwargs: forwarded to the boosting regressor.

        Returns:

            dict with keys 'name', 'model', 'r_squared', 'adj_rsquared',
            'rmse', 'custom_metric', 'time' and 'predictions', or None when
            fitting or scoring raised.
        """
        start = time.time()

        try:
            model = self._new_booster(regr, hist, **kwargs)

            if use_preprocessing and preprocessor is not None:
                fitted_model = Pipeline(
                    steps=[
                        ("preprocessor", preprocessor),
                        ("regressor", model),
                    ]
                )
                if self.verbose > 0:
                    print(
                        "\n Fitting boosted "
                        + name
                        + " model with preprocessing..."
                    )
            else:
                # Case with no preprocessing
                fitted_model = model
                if self.verbose > 0:
                    print(
                        "\n Fitting boosted "
                        + name
                        + " model without preprocessing..."
                    )

            fitted_model.fit(X_train, np.ravel(y_train))
            return self._evaluate(name, fitted_model, X_test, y_test, start)

        except Exception as exception:
            self._log_failure(name, exception)
            return None
Fitting -- almost -- all the regression algorithms and returning their scores.
Parameters:
verbose: int, optional (default=0)
Any positive number for verbosity.
ignore_warnings: bool, optional (default=True)
When set to True, warnings related to algorithms that are not able to run are ignored.
custom_metric: function, optional (default=None)
When function is provided, models are evaluated based on the custom evaluation metric provided.
predictions: bool, optional (default=False)
When set to True, the predictions of all the models are returned as a dataframe.
sort_by: string, optional (default='RMSE')
Sort models by a metric. Available options are 'R-Squared', 'Adjusted R-Squared', 'RMSE', 'Time Taken' and,
when a custom_metric function is provided, 'Custom metric' (the column holding its values).
random_state: int, optional (default=42)
Reproducibility seed.
estimators: list, optional (default='all')
list of Estimators names or just 'all' (default='all')
preprocess: bool
preprocessing is done when set to True
n_jobs : int, when possible, run in parallel
For now, only used by individual models that support it.
n_layers: int, optional (default=3)
Number of layers of CustomRegressors to be used.
All the other parameters are the same as CustomRegressor's.
Attributes:
models_: dict-object
Returns a dictionary with each model pipeline as value
with key as name of models.
best_model_: object
Returns the best model pipeline based on the sort_by metric.
Examples:
import os
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
data = load_diabetes()
X = data.data
y= data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)
regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True,
custom_metric=None, preprocess=True)
models, predictions = regr.fit(X_train, X_test, y_train, y_test)
model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
print(models)
def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs):
    """Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test.

    A few baseline models are fitted first, then every regressor in
    ``self.regressors`` is wrapped in a generic booster, fitted and scored.
    Every successfully fitted model is stored in ``self.models_`` and the
    first model of the sorted score table becomes ``self.best_model_``.

    Parameters:

        X_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.

        X_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        y_train : array-like,
            Training target values (must expose ``.ravel()``).

        y_test : array-like,
            Testing target values.

        hist: bool, optional (default=False)
            When set to True, the model is a HistGenericBoostingRegressor.
            Only used on the sequential path (``n_jobs`` is None); the
            parallel path delegates to ``self.train_model``.

        **kwargs: dict,
            Additional parameters to be passed to the GenericBoostingRegressor.

    Returns:
    -------
    scores: Pandas DataFrame
        Returns metrics of all the models in a Pandas DataFrame, sorted in
        ascending order of ``self.sort_by`` and indexed by model name.

    predictions : Pandas DataFrame
        Returns predictions of all the models in a Pandas DataFrame when
        ``self.predictions`` is True. Otherwise the second element is the
        scores DataFrame again (historical behaviour, kept so that
        ``scores, preds = regr.fit(...)`` keeps working).

    """
    names = []
    R2 = []
    ADJR2 = []
    RMSE = []
    TIME = []
    CUSTOM_METRIC = []
    predictions = {}

    if isinstance(X_train, np.ndarray):
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)

    numeric_features = X_train.select_dtypes(include=[np.number]).columns
    categorical_features = X_train.select_dtypes(include=["object"]).columns

    categorical_low, categorical_high = get_card_split(
        X_train, categorical_features
    )

    use_preprocessing = self.preprocess is True
    preprocessor = None
    if use_preprocessing:
        preprocessor = ColumnTransformer(
            transformers=[
                ("numeric", numeric_transformer, numeric_features),
                (
                    "categorical_low",
                    categorical_transformer_low,
                    categorical_low,
                ),
                (
                    "categorical_high",
                    categorical_transformer_high,
                    categorical_high,
                ),
            ]
        )

    def _record(name, y_pred, start):
        """Score one fitted model on the test set and append its result row."""
        r_squared = r2_score(y_test, y_pred)
        adj_rsquared = adjusted_rsquared(
            r_squared, X_test.shape[0], X_test.shape[1]
        )
        rmse = root_mean_squared_error(y_test, y_pred)
        elapsed = time.time() - start
        custom_metric = None
        if self.custom_metric:
            custom_metric = self.custom_metric(y_test, y_pred)

        # Append only once every metric has been computed, so a failure
        # above cannot leave the result lists with different lengths.
        names.append(name)
        R2.append(r_squared)
        ADJR2.append(adj_rsquared)
        RMSE.append(rmse)
        TIME.append(elapsed)
        if self.custom_metric:
            CUSTOM_METRIC.append(custom_metric)

        if self.verbose > 0:
            scores_verbose = {
                "Model": name,
                "R-Squared": r_squared,
                "Adjusted R-Squared": adj_rsquared,
                "RMSE": rmse,
                "Time taken": elapsed,
            }
            if self.custom_metric:
                scores_verbose["Custom metric"] = custom_metric
            print(scores_verbose)

        if self.predictions:
            predictions[name] = y_pred

    def _report_failure(name, exception):
        """Print a failure notice unless warnings are being ignored."""
        if self.ignore_warnings is False:
            print(name + " model failed to execute")
            print(exception)

    def _collect(results):
        """Merge the dicts returned by ``self.train_model`` into the results."""
        for result in results:
            if result is None:  # train_model returns None on failure
                continue
            # Register the fitted model, otherwise best_model_ lookup
            # below fails for any model trained in parallel.
            self.models_[result["name"]] = result["model"]
            names.append(result["name"])
            R2.append(result["r_squared"])
            ADJR2.append(result["adj_rsquared"])
            RMSE.append(result["rmse"])
            TIME.append(result["time"])
            if self.custom_metric:
                CUSTOM_METRIC.append(result["custom_metric"])
            if self.predictions:
                predictions[result["name"]] = result["predictions"]

    # base models; the fallback covers the case where building the
    # XGBoost baseline raises (e.g. xgboost is unavailable)
    try:
        baseline_names = [
            "RandomForestRegressor",
            "XGBRegressor",
            "GradientBoostingRegressor",
        ]
        baseline_models = [
            RandomForestRegressor(),
            xgb.XGBRegressor(),
            GradientBoostingRegressor(),
        ]
    except Exception:
        baseline_names = [
            "RandomForestRegressor",
            "GradientBoostingRegressor",
        ]
        baseline_models = [
            RandomForestRegressor(),
            GradientBoostingRegressor(),
        ]

    if self.verbose > 0:
        print("\n Fitting baseline models...")
    for name, model in tqdm(zip(baseline_names, baseline_models)):
        start = time.time()
        try:
            model.fit(X_train, y_train.ravel())
            self.models_[name] = model
            _record(name, model.predict(X_test), start)
        except Exception as exception:
            _report_failure(name, exception)

    if self.estimators == "all":
        self.regressors = REGRESSORS
    else:
        # Keep the estimator *classes*: the loops below instantiate them
        # with ``regr()``, and **kwargs belongs to the booster, not to
        # the base estimator.
        self.regressors = [
            ("GenericBooster(" + est[0] + ")", est[1])
            for est in all_estimators()
            if (
                issubclass(est[1], RegressorMixin)
                and (est[0] in self.estimators)
            )
        ]

    if self.n_jobs is None:

        for name, regr in tqdm(self.regressors):
            start = time.time()
            try:
                if hist is False:
                    booster = GenericBoostingRegressor(
                        base_model=regr(),
                        verbose=self.verbose,
                        **kwargs,
                    )
                else:
                    booster = HistGenericBoostingRegressor(
                        base_model=regr(),
                        verbose=self.verbose,
                        **kwargs,
                    )

                if use_preprocessing:
                    # The booster is fitted once, inside the pipeline,
                    # on the preprocessed features.
                    model = Pipeline(
                        steps=[
                            ("preprocessor", preprocessor),
                            ("regressor", booster),
                        ]
                    )
                else:
                    model = booster

                if self.verbose > 0:
                    print("\n Fitting boosted " + name + " model...")
                model.fit(X_train, y_train.ravel())

                self.models_[name] = model
                _record(name, model.predict(X_test), start)
            except Exception as exception:
                _report_failure(name, exception)

    else:

        extra = {"preprocessor": preprocessor} if use_preprocessing else {}
        results = Parallel(n_jobs=self.n_jobs)(
            delayed(self.train_model)(
                name,
                model,
                X_train,
                y_train,
                X_test,
                y_test,
                use_preprocessing=use_preprocessing,
                **extra,
                **kwargs,
            )
            for name, model in tqdm(self.regressors)
        )
        # Extend (do not replace) the baseline results gathered above.
        _collect(results)

    scores = {
        "Model": names,
        "Adjusted R-Squared": ADJR2,
        "R-Squared": R2,
        "RMSE": RMSE,
        "Time Taken": TIME,
    }

    if self.custom_metric:
        scores["Custom metric"] = CUSTOM_METRIC

    scores = pd.DataFrame(scores)
    # NOTE(review): sorting is always ascending, which ranks the worst model
    # first for 'R-Squared'-like metrics; confirm whether that is intended.
    scores = scores.sort_values(by=self.sort_by, ascending=True).set_index(
        "Model"
    )

    self.best_model_ = self.models_[scores.index[0]]

    if self.predictions is True:
        return scores, pd.DataFrame.from_dict(predictions)
    # Historical return shape: a 2-tuple even without predictions.
    return scores, scores
Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test.
Parameters:
X_train : array-like,
Training vectors, where rows is the number of samples
and columns is the number of features.
X_test : array-like,
Testing vectors, where rows is the number of samples
and columns is the number of features.
y_train : array-like,
Training target values, one entry per row of X_train.
y_test : array-like,
Testing target values, one entry per row of X_test.
hist: bool, optional (default=False)
When set to True, the model is a HistGenericBoostingRegressor.
**kwargs: dict,
Additional parameters to be passed to the GenericBoostingRegressor.
Returns:
scores: Pandas DataFrame Returns metrics of all the models in a Pandas DataFrame.
predictions : Pandas DataFrame Returns predictions of all the models in a Pandas DataFrame.
def provide_models(self, X_train, X_test, y_train, y_test):
    """
    This function returns all the model objects trained in fit function.
    If fit is not called already, then we call fit and then return the models.

    Parameters:

        X_train : array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.

        X_test : array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        y_train : array-like,
            Training target values (NumPy array or pandas object).

        y_test : array-like,
            Testing target values (NumPy array or pandas object).

    Returns:

        models: dict-object,
            Returns a dictionary with each model pipeline as value
            with key as name of models.

    """
    if not self.models_:
        # Normalise the targets here: a bare ``y_test.values`` fails for
        # NumPy input and ``y_train.ravel()`` fails for a DataFrame.
        y_train_flat = np.asarray(y_train).ravel()
        if isinstance(y_test, (pd.Series, pd.DataFrame)):
            y_test = y_test.values
        self.fit(X_train, X_test, y_train_flat, y_test)

    return self.models_
This function returns all the model objects trained in fit function. If fit is not called already, then we call fit and then return the models.
Parameters:
X_train : array-like,
Training vectors, where rows is the number of samples
and columns is the number of features.
X_test : array-like,
Testing vectors, where rows is the number of samples
and columns is the number of features.
y_train : array-like,
Training target values, one entry per row of X_train.
y_test : array-like,
Testing target values, one entry per row of X_test.
Returns:
models: dict-object,
Returns a dictionary with each model pipeline as value
with key as name of models.
class MultiTaskRegressor(BaseEstimator, RegressorMixin):
    """
    A class for multi-task regression: one copy of a single-output
    regressor is fitted per column of the target matrix.

    Parameters
    ----------
    regr: object
        A regressor object (must not already be a multi-task estimator)

    Attributes
    ----------
    objs: list
        A list containing the fitted regressor objects, one per task
    n_outputs_: int
        Number of tasks (columns of y) seen by the last call to ``fit``

    """

    def __init__(self, regr):
        # AssertionError is kept so existing callers see the same exception.
        assert (
            is_multitask_estimator(regr) == False
        ), "The regressor is already a multi-task regressor"
        self.regr = regr
        self.objs = []

    def fit(self, X, y):
        """
        Fit the regressor, once per column of y

        Parameters
        ----------
        X: array-like
            The input data
        y: array-like
            The target values, with more than one column
            (pandas DataFrame or anything convertible to a 2-D array)

        Returns
        -------
        self: object

        """
        if not hasattr(y, "iloc"):
            y = np.asarray(y)
        n_tasks = y.shape[1]
        assert n_tasks > 1, "The number of columns in y must be greater than 1"
        self.n_outputs_ = n_tasks
        if hasattr(y, "iloc"):
            targets = [y.iloc[:, i].values for i in range(n_tasks)]
        else:
            targets = [y[:, i] for i in range(n_tasks)]
        # Build the list locally and assign at the end: a refit must not
        # keep models from a previous fit, and a failure midway must not
        # leave a partially filled list behind.
        fitted = []
        for target in targets:
            self.regr.fit(X, target)
            fitted.append(deepcopy(self.regr))
        self.objs = fitted
        return self

    def predict(self, X):
        """
        Predict the target values

        Parameters
        ----------
        X: array-like
            The input data

        Returns
        -------
        y_pred: array-like
            The predicted target values, one column per task

        """
        assert len(self.objs) > 0, "The regressor has not been fitted yet"
        y_pred = np.zeros((X.shape[0], self.n_outputs_))
        for i in range(self.n_outputs_):
            y_pred[:, i] = self.objs[i].predict(X)
        return y_pred
A class for multi-task regression
Parameters
regr: object A regressor object
Attributes
objs: list A list containing the fitted regressor objects
def fit(self, X, y):
    """
    Fit the regressor, once per column of y

    Parameters
    ----------
    X: array-like
        The input data
    y: array-like
        The target values, with more than one column
        (pandas DataFrame or anything convertible to a 2-D array)

    Returns
    -------
    self: object

    """
    if not hasattr(y, "iloc"):
        y = np.asarray(y)
    n_tasks = y.shape[1]
    assert n_tasks > 1, "The number of columns in y must be greater than 1"
    self.n_outputs_ = n_tasks
    if hasattr(y, "iloc"):
        targets = [y.iloc[:, i].values for i in range(n_tasks)]
    else:
        targets = [y[:, i] for i in range(n_tasks)]
    # Build the list locally and assign at the end: a refit must not keep
    # models from a previous fit, and a failure midway must not leave a
    # partially filled list behind.
    fitted = []
    for target in targets:
        self.regr.fit(X, target)
        fitted.append(deepcopy(self.regr))
    self.objs = fitted
    return self
Fit the regressor
Parameters
X: array-like The input data y: array-like The target values
def predict(self, X):
    """
    Predict the target values, one column per fitted task

    Parameters
    ----------
    X: array-like
        The input data (must expose ``.shape``)

    Returns
    -------
    y_pred: array-like
        The predicted target values, of shape (X.shape[0], n_outputs_)

    """
    assert len(self.objs) > 0, "The regressor has not been fitted yet"
    n_rows = X.shape[0]
    n_tasks = self.n_outputs_
    y_pred = np.zeros((n_rows, n_tasks))
    task = 0
    while task < n_tasks:
        # each stored regressor fills the column of its own task
        task_model = self.objs[task]
        y_pred[:, task] = task_model.predict(X)
        task += 1
    return y_pred
Predict the target values
Parameters
X: array-like The input data
Returns
y_pred: array-like The predicted target values
class IsotonicRegressor(BaseEstimator, RegressorMixin):
    """Base regressor followed by an isotonic regression postprocessing step.

    The base regressor is fitted first; an isotonic regression is then
    fitted on (base predictions on the training set, training targets) and
    applied to the base predictions in ``predict``.

    Attributes:
        regr: estimator
            Base regressor to use for initial predictions.

        increasing: bool, default=True
            If True, the isotonic regression will be monotonically increasing.
            If False, it will be monotonically decreasing.

        out_of_bounds: str, default='nan'
            The out_of_bounds parameter for IsotonicRegression.
            Can be 'nan', 'clip', or 'raise'.
    """

    def __init__(self, regr, increasing=True, out_of_bounds="nan"):
        """Store the configuration of the IsotonicRegressor.

        Args:
            regr: estimator
                Base regressor to use for initial predictions.

            increasing: bool, default=True
                If True, the isotonic regression will be monotonically increasing.
                If False, it will be monotonically decreasing.

            out_of_bounds: str, default='nan'
                The out_of_bounds parameter for IsotonicRegression.
                Can be 'nan', 'clip', or 'raise'.
        """
        self.regr = regr
        self.increasing = increasing
        self.out_of_bounds = out_of_bounds

    def fit(self, X, y, **kwargs):
        """Fit the base regressor, then the isotonic postprocessor.

        Args:
            X: {array-like}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: array-like, shape = [n_samples]
                Target values.

            **kwargs: additional parameters to be passed to the base regressor.

        Returns:
            self: object.
        """
        X, y = check_X_y(X, y)
        self.regr.fit(X, y, **kwargs)
        # In-sample predictions of the base model are the isotonic inputs.
        train_pred = self.regr.predict(X)
        iso_params = {
            "increasing": self.increasing,
            "out_of_bounds": self.out_of_bounds,
        }
        self.isotonic_regressor_ = IsotonicRegression(**iso_params)
        self.isotonic_regressor_.fit(train_pred, y)
        return self

    def predict(self, X, **kwargs):
        """Predict with the base regressor and postprocess the result.

        Args:
            X: {array-like}, shape = [n_samples, n_features]
                Samples.

            **kwargs: additional parameters to be passed to the base regressor.

        Returns:
            y_pred: array-like, shape = [n_samples]
                Predicted values.
        """
        check_is_fitted(self, ["regr", "isotonic_regressor_"])
        X = check_array(X)
        raw_pred = self.regr.predict(X, **kwargs)
        return self.isotonic_regressor_.predict(raw_pred)
Isotonic Regressor with postprocessing.
This class takes a base regressor and applies isotonic regression as postprocessing in the predict method. The isotonic regression ensures that the predictions are monotonically increasing or decreasing.
Attributes: regr: estimator Base regressor to use for initial predictions.
increasing: bool, default=True
If True, the isotonic regression will be monotonically increasing.
If False, it will be monotonically decreasing.
out_of_bounds: str, default='nan'
The out_of_bounds parameter for IsotonicRegression.
Can be 'nan', 'clip', or 'raise'.
def fit(self, X, y, **kwargs):
    """Fit the base regressor, then the isotonic postprocessor.

    Args:
        X: {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number
            of samples and n_features is the number of features.

        y: array-like, shape = [n_samples]
            Target values.

        **kwargs: additional parameters to be passed to the base regressor.

    Returns:
        self: object.
    """
    X, y = check_X_y(X, y)
    self.regr.fit(X, y, **kwargs)
    # In-sample predictions of the base model are the isotonic inputs.
    train_pred = self.regr.predict(X)
    iso_params = {
        "increasing": self.increasing,
        "out_of_bounds": self.out_of_bounds,
    }
    self.isotonic_regressor_ = IsotonicRegression(**iso_params)
    self.isotonic_regressor_.fit(train_pred, y)
    return self
Fit the model.
Args: X: {array-like}, shape = [n_samples, n_features] Training vectors, where n_samples is the number of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
**kwargs: additional parameters to be passed to the base regressor.
Returns: self: object.
def predict(self, X, **kwargs):
    """Predict with the base regressor and postprocess the result.

    Args:
        X: {array-like}, shape = [n_samples, n_features]
            Samples.

        **kwargs: additional parameters to be passed to the base regressor.

    Returns:
        y_pred: array-like, shape = [n_samples]
            Predicted values.
    """
    check_is_fitted(self, ["regr", "isotonic_regressor_"])
    X = check_array(X)
    raw_pred = self.regr.predict(X, **kwargs)
    return self.isotonic_regressor_.predict(raw_pred)
Predict using the model.
Args: X: {array-like}, shape = [n_samples, n_features] Samples.
**kwargs: additional parameters to be passed to the base regressor.
Returns: y_pred: array-like, shape = [n_samples] Predicted values.
25class GenericFunctionalForecaster(BaseEstimator, RegressorMixin): 26 """ 27 Functional time series forecaster using dimensionality reduction and regression. 28 29 Following Hyndman-Ullah methodology: 30 1. Extract functional components using dimensionality reduction 31 2. Model relationships between components and functional data using regression 32 3. Forecast future functional curves 33 34 Parameters 35 ---------- 36 n_components : int, default=8 37 Number of components to extract. 38 reduction_method : str, default='pca' 39 Dimensionality reduction method. 40 reduction_params : dict, optional 41 Additional parameters for the reduction method. 42 rolling_window : int, optional 43 Window size for rolling regression. If None, uses full training set. 44 forecast_method : {'ar', 'last_value'}, default='ar' 45 Method for forecasting coefficients. 46 regressor : sklearn regressor, optional 47 Any sklearn regressor. If None, uses LinearRegression. 48 regressor_params : dict, optional 49 Additional parameters for the regressor. 
50 """ 51 52 def __init__( 53 self, 54 n_components: int = 8, 55 reduction_method: str = "pca", 56 reduction_params: Optional[dict] = None, 57 rolling_window: Optional[int] = None, 58 forecast_method: Literal["ar", "last_value"] = "ar", 59 regressor: Optional[BaseEstimator] = None, 60 regressor_params: Optional[dict] = None, 61 ): 62 self.n_components = n_components 63 self.reduction_method = reduction_method 64 self.reduction_params = reduction_params or {} 65 self.rolling_window = rolling_window 66 self.forecast_method = forecast_method 67 self.regressor = ( 68 regressor if regressor is not None else LinearRegression() 69 ) 70 self.regressor_params = regressor_params or {} 71 72 # Available reduction methods 73 self._reduction_methods = { 74 "pca": PCA, 75 "kernel_pca": KernelPCA, 76 "truncated_svd": TruncatedSVD, 77 "factor_analysis": FactorAnalysis, 78 "fast_ica": FastICA, 79 "nmf": NMF, 80 "minibatch_sparse_pca": MiniBatchSparsePCA, 81 "mds": MDS, 82 "isomap": Isomap, 83 "lle": LocallyLinearEmbedding, 84 } 85 86 if reduction_method not in self._reduction_methods: 87 raise ValueError( 88 f"reduction_method must be one of {list(self._reduction_methods.keys())}" 89 ) 90 91 def _create_regressor(self): 92 """Create a fresh regressor instance with parameters.""" 93 if hasattr(self.regressor, "__class__"): 94 # Create new instance from class 95 regressor = self.regressor.__class__(**self.regressor_params) 96 else: 97 # Clone existing instance 98 from sklearn.base import clone 99 100 regressor = clone(self.regressor) 101 # Apply additional parameters 102 for param, value in self.regressor_params.items(): 103 setattr(regressor, param, value) 104 105 return regressor 106 107 def fit( 108 self, X: Union[np.ndarray, pd.DataFrame] 109 ) -> "GenericFunctionalForecaster": 110 """ 111 Fit the functional forecaster. 112 113 Parameters 114 ---------- 115 X : np.ndarray or pd.DataFrame, shape (n_samples, n_points) 116 Functional time series data. 
117 118 Returns 119 ------- 120 self : object 121 Fitted forecaster. 122 """ 123 # Input validation and conversion 124 if isinstance(X, pd.DataFrame): 125 X = X.values 126 X = check_array(X) 127 128 self.X_ = X.copy() 129 self.n_samples_, self.n_points_ = X.shape 130 131 # 1. Standardize the functional data 132 self.scaler_ = StandardScaler() 133 X_scaled = self.scaler_.fit_transform(X) 134 135 # 2. Fit dimensionality reduction 136 self._fit_reduction_method(X_scaled) 137 138 # 3. Extract components (reduced features) 139 self.reduced_features_ = self.reducer_.transform(X_scaled) 140 141 # 4. Fit regression models 142 if self.rolling_window is not None: 143 self._fit_rolling_regression(X_scaled) 144 else: 145 self._fit_full_regression(X_scaled) 146 147 self.is_fitted_ = True 148 return self 149 150 def _fit_reduction_method(self, X_scaled): 151 """Fit the dimensionality reduction method.""" 152 reduction_class = self._reduction_methods[self.reduction_method] 153 154 # Handle method-specific parameters 155 if self.reduction_method == "kernel_pca": 156 if "kernel" not in self.reduction_params: 157 self.reduction_params["kernel"] = "rbf" 158 if "fit_inverse_transform" not in self.reduction_params: 159 self.reduction_params["fit_inverse_transform"] = True 160 elif self.reduction_method == "minibatch_sparse_pca": 161 if "alpha" not in self.reduction_params: 162 self.reduction_params["alpha"] = 1.0 163 if "batch_size" not in self.reduction_params: 164 self.reduction_params["batch_size"] = min(3, self.n_samples_) 165 166 # Initialize and fit the reducer 167 self.reducer_ = reduction_class( 168 n_components=self.n_components, **self.reduction_params 169 ) 170 self.reducer_.fit(X_scaled) 171 172 # Store components/basis functions for reconstruction 173 if hasattr(self.reducer_, "components_"): 174 self.components_ = ( 175 self.reducer_.components_ 176 ) # Shape: (n_components, n_points) 177 elif hasattr(self.reducer_, "inverse_transform"): 178 # For methods like KernelPCA, 
create identity mapping to get components 179 try: 180 identity_matrix = np.eye(self.n_components) 181 reconstructed = self.reducer_.inverse_transform(identity_matrix) 182 if reconstructed.shape == (self.n_components, self.n_points_): 183 self.components_ = reconstructed 184 else: 185 self.components_ = reconstructed.T 186 except Exception as e: 187 warnings.warn( 188 f"Could not extract components for {self.reduction_method}: {e}" 189 ) 190 self.components_ = None 191 else: 192 warnings.warn( 193 f"No reconstruction available for {self.reduction_method}" 194 ) 195 self.components_ = None 196 197 def _fit_rolling_regression(self, X_scaled): 198 """ 199 Fit rolling regression models. 200 201 For each window, fit: reduced_features[window] -> next_scaled_curve 202 This maintains scale consistency throughout. 203 """ 204 if self.n_samples_ <= self.rolling_window: 205 raise ValueError( 206 f"Need more than {self.rolling_window} samples for rolling window, " 207 f"got {self.n_samples_}" 208 ) 209 210 self.rolling_models_ = [] 211 self.rolling_coefs_ = [] 212 213 n_windows = self.n_samples_ - self.rolling_window 214 215 for i in range(n_windows): 216 # Input: window of reduced features 217 X_window = self.reduced_features_[ 218 i : i + self.rolling_window 219 ] # (window, n_components) 220 221 # Target: next scaled functional curve 222 y_next_scaled = X_scaled[i : i + self.rolling_window] # (n_points,) 223 224 # Create and fit regressor 225 regressor = self._create_regressor() 226 227 try: 228 # Fit regression: reduced_features_window -> scaled_functional_curve 229 regressor.fit(X_window, y_next_scaled) 230 231 # Store model and coefficients 232 self.rolling_models_.append(regressor) 233 234 # Extract coefficients - shape depends on regressor type 235 if hasattr(regressor, "coef_"): 236 coef = regressor.coef_ 237 # For multioutput: coef shape is (n_outputs, n_features) = (n_points, n_components) 238 # For single output with multiple features: (n_features,) = 
(n_components,) 239 # We expect multioutput here since y_next_scaled is (n_points,) 240 if coef.ndim == 1: 241 # This shouldn't happen with multioutput, but handle gracefully 242 warnings.warn( 243 f"Unexpected single output coefficients at window {i}" 244 ) 245 coef = coef.reshape(1, -1) # (1, n_components) 246 self.rolling_coefs_.append(coef) # (n_points, n_components) 247 else: 248 # Fallback: use least squares 249 warnings.warn( 250 f"Regressor has no coef_ attribute, using least squares at window {i}" 251 ) 252 coef = np.linalg.lstsq(X_window, y_next_scaled, rcond=None)[ 253 0 254 ].T 255 if coef.ndim == 1: 256 coef = coef.reshape(1, -1) 257 self.rolling_coefs_.append(coef) 258 259 except Exception as e: 260 warnings.warn( 261 f"Regression failed at window {i}: {e}. Using least squares fallback." 262 ) 263 # Least squares fallback 264 coef = np.linalg.lstsq(X_window, y_next_scaled, rcond=None)[ 265 0 266 ].T # (n_points, n_components) 267 if coef.ndim == 1: 268 coef = coef.reshape(1, -1) 269 self.rolling_coefs_.append(coef) 270 self.rolling_models_.append(None) 271 272 # Convert to array for easier manipulation 273 # Shape: (n_windows, n_points, n_components) 274 self.rolling_coefs_ = np.array(self.rolling_coefs_) 275 276 def _fit_full_regression(self, X_scaled): 277 """ 278 Fit regression using full training set. 
279 280 Fit: reduced_features -> scaled_functional_data 281 """ 282 # Create regressor 283 regressor = self._create_regressor() 284 285 try: 286 # Fit: all reduced features -> all scaled functional curves 287 regressor.fit(self.reduced_features_, X_scaled) 288 self.full_model_ = regressor 289 290 # Store coefficients 291 if hasattr(regressor, "coef_"): 292 self.coefs_ = ( 293 regressor.coef_ 294 ) # (n_points, n_components) for multioutput 295 else: 296 # Fallback to least squares 297 warnings.warn( 298 "Regressor has no coef_ attribute, using least squares" 299 ) 300 self.coefs_ = np.linalg.lstsq( 301 self.reduced_features_, X_scaled, rcond=None 302 )[0].T 303 304 except Exception as e: 305 warnings.warn( 306 f"Full regression failed: {e}. Using least squares fallback." 307 ) 308 # Least squares fallback 309 self.coefs_ = np.linalg.lstsq( 310 self.reduced_features_, X_scaled, rcond=None 311 )[0].T 312 self.full_model_ = None 313 314 def forecast(self, steps: int = 5) -> np.ndarray: 315 """ 316 Forecast functional time series. 317 318 Parameters 319 ---------- 320 steps : int 321 Number of steps to forecast. 322 323 Returns 324 ------- 325 np.ndarray, shape (steps, n_points) 326 Forecasted functional curves. 
327 """ 328 check_is_fitted(self, "is_fitted_") 329 330 if self.rolling_window is not None: 331 return self._forecast_rolling(steps) 332 else: 333 return self._forecast_full(steps) 334 335 def _forecast_rolling(self, steps: int) -> np.ndarray: 336 """Forecast using rolling regression approach.""" 337 # rolling_coefs_ shape: (n_windows, n_points, n_components) 338 n_windows, n_points, n_components = self.rolling_coefs_.shape 339 # Forecast coefficients for each point and component 340 forecasted_coefs = np.zeros((steps, n_points, n_components)) 341 342 for point_idx in range(n_points): 343 for comp_idx in range(n_components): 344 # Get time series of coefficients for this (point, component) 345 coef_series = self.rolling_coefs_[:, point_idx, comp_idx] 346 # Forecast this coefficient series 347 if self.forecast_method == "ar" and len(coef_series) > 1: 348 try: 349 # Fit AR model to coefficient series 350 ar_model = AutoReg( 351 coef_series, lags=min(2, len(coef_series) - 1) 352 ).fit() 353 forecasted_values = ar_model.predict( 354 start=len(coef_series), 355 end=len(coef_series) + steps - 1, 356 ) 357 forecasted_coefs[:, point_idx, comp_idx] = ( 358 forecasted_values 359 ) 360 except Exception as e: 361 warnings.warn( 362 f"AR forecasting failed for point {point_idx}, component {comp_idx}: {e}" 363 ) 364 # Use last value 365 forecasted_coefs[:, point_idx, comp_idx] = coef_series[ 366 -1 367 ] 368 else: 369 # Use last value 370 forecasted_coefs[:, point_idx, comp_idx] = coef_series[-1] 371 372 # Reconstruct functional forecasts from predicted coefficients 373 forecasts_scaled = np.zeros((steps, n_points)) 374 375 if self.components_ is not None: 376 # Use learned components for reconstruction 377 # For each forecast step and each point, sum over components 378 for step in range(steps): 379 for point_idx in range(n_points): 380 # forecasted_coefs[step, point_idx, :] has shape (n_components,) 381 # self.components_[:, point_idx] has shape (n_components,) 382 
forecasts_scaled[step, point_idx] = np.dot( 383 forecasted_coefs[step, point_idx, :], 384 self.components_[:, point_idx], 385 ) 386 else: 387 # No reconstruction available - use direct prediction 388 warnings.warn( 389 f"No reconstruction available for {self.reduction_method}. Using last known values." 390 ) 391 last_scaled = self.scaler_.transform(self.X_[-1:]) 392 forecasts_scaled = np.tile(last_scaled, (steps, 1)) 393 # Transform back to original scale 394 return self.scaler_.inverse_transform(forecasts_scaled) 395 396 def _forecast_full(self, steps: int) -> np.ndarray: 397 """Forecast using full training set approach.""" 398 # First, forecast the reduced features themselves 399 forecasted_features = np.zeros((steps, self.n_components)) 400 401 for comp in range(self.n_components): 402 # Get time series of this component 403 feature_series = self.reduced_features_[:, comp] 404 405 if self.forecast_method == "ar" and len(feature_series) > 1: 406 try: 407 # Fit AR model to feature series 408 ar_model = AutoReg( 409 feature_series, lags=min(2, len(feature_series) - 1) 410 ).fit() 411 forecasted_values = ar_model.predict( 412 start=len(feature_series), 413 end=len(feature_series) + steps - 1, 414 ) 415 forecasted_features[:, comp] = forecasted_values 416 except Exception as e: 417 warnings.warn( 418 f"AR forecasting failed for component {comp}: {e}" 419 ) 420 # Use last value 421 forecasted_features[:, comp] = feature_series[-1] 422 else: 423 # Use last value 424 forecasted_features[:, comp] = feature_series[-1] 425 426 # Reconstruct functional data from forecasted features 427 if hasattr(self, "full_model_") and self.full_model_ is not None: 428 # Use the fitted model to predict 429 try: 430 forecasts_scaled = self.full_model_.predict(forecasted_features) 431 except: 432 # Fallback to coefficient multiplication 433 forecasts_scaled = forecasted_features @ self.coefs_.T 434 else: 435 # Use stored coefficients 436 forecasts_scaled = forecasted_features @ 
self.coefs_.T 437 438 # Transform back to original scale 439 forecasts = self.scaler_.inverse_transform(forecasts_scaled) 440 return forecasts 441 442 def plot_components(self, n_plot: int = 3) -> None: 443 """Plot functional components.""" 444 check_is_fitted(self, "is_fitted_") 445 446 if self.components_ is None: 447 print(f"Components not available for {self.reduction_method}") 448 return 449 450 plt.figure(figsize=(12, 6)) 451 for i in range(min(n_plot, self.n_components)): 452 plt.plot(self.components_[i], label=f"Component {i+1}", linewidth=2) 453 454 plt.title(f"{self.reduction_method.upper()} Components") 455 plt.xlabel("Domain Point") 456 plt.ylabel("Component Value") 457 plt.legend() 458 plt.grid(True, alpha=0.3) 459 plt.show() 460 461 def plot_reduced_features(self, n_plot: int = 4) -> None: 462 """Plot reduced features over time.""" 463 check_is_fitted(self, "is_fitted_") 464 465 plt.figure(figsize=(12, 8)) 466 n_subplot_cols = 2 467 n_subplot_rows = (min(n_plot, self.n_components) + 1) // 2 468 469 for i in range(min(n_plot, self.n_components)): 470 plt.subplot(n_subplot_rows, n_subplot_cols, i + 1) 471 plt.plot( 472 self.reduced_features_[:, i], "o-", linewidth=2, markersize=4 473 ) 474 plt.title(f"Reduced Feature {i+1}") 475 plt.xlabel("Time") 476 plt.ylabel("Value") 477 plt.grid(True, alpha=0.3) 478 479 plt.tight_layout() 480 plt.show() 481 482 def plot_forecast( 483 self, actual: Optional[np.ndarray] = None, steps: int = 5 484 ) -> None: 485 """Plot forecasted curves.""" 486 forecasts = self.forecast(steps=steps) 487 488 plt.figure(figsize=(12, 6)) 489 490 # Plot some historical curves 491 n_history = min(5, len(self.X_)) 492 for i in range(n_history): 493 idx = -(n_history - i) 494 plt.plot( 495 self.X_[idx], 496 "b-", 497 alpha=0.3, 498 linewidth=1, 499 label="Historical" if i == 0 else "", 500 ) 501 502 # Plot actual test data if provided 503 if actual is not None: 504 for i in range(min(3, len(actual))): 505 plt.plot( 506 actual[i], 507 "k-", 
508 alpha=0.7, 509 linewidth=2, 510 label="Actual" if i == 0 else "", 511 ) 512 513 # Plot forecasts 514 for i in range(steps): 515 plt.plot( 516 forecasts[i], 517 "r--", 518 linewidth=2, 519 alpha=0.7, 520 label="Forecast" if i == 0 else "", 521 ) 522 523 plt.title("Functional Time Series Forecast") 524 plt.xlabel("Domain Point") 525 plt.ylabel("Value") 526 plt.legend() 527 plt.grid(True, alpha=0.3) 528 plt.show() 529 530 def get_model_info(self) -> dict: 531 """Get information about the fitted model.""" 532 info = { 533 "n_components": self.n_components, 534 "reduction_method": self.reduction_method, 535 "rolling_window": self.rolling_window, 536 "forecast_method": self.forecast_method, 537 "regressor": self.regressor.__class__.__name__, 538 "regressor_params": self.regressor_params, 539 "is_fitted": getattr(self, "is_fitted_", False), 540 } 541 542 if hasattr(self, "reduced_features_"): 543 info.update( 544 { 545 "n_samples": self.n_samples_, 546 "n_points": self.n_points_, 547 "explained_variance_ratio": getattr( 548 self.reducer_, "explained_variance_ratio_", None 549 ), 550 "has_components": self.components_ is not None, 551 "coefficient_shape": ( 552 getattr(self, "rolling_coefs_", np.array([])).shape 553 if hasattr(self, "rolling_coefs_") 554 else getattr(self, "coefs_", np.array([])).shape 555 ), 556 } 557 ) 558 559 return info
Functional time series forecaster using dimensionality reduction and regression.
Following Hyndman-Ullah methodology:
- Extract functional components using dimensionality reduction
- Model relationships between components and functional data using regression
- Forecast future functional curves
Parameters
n_components : int, default=8 Number of components to extract. reduction_method : str, default='pca' Dimensionality reduction method. reduction_params : dict, optional Additional parameters for the reduction method. rolling_window : int, optional Window size for rolling regression. If None, uses full training set. forecast_method : {'ar', 'last_value'}, default='ar' Method for forecasting coefficients. regressor : sklearn regressor, optional Any sklearn regressor. If None, uses LinearRegression. regressor_params : dict, optional Additional parameters for the regressor.
def fit(
    self, X: Union[np.ndarray, pd.DataFrame]
) -> "GenericFunctionalForecaster":
    """
    Fit the functional forecaster.

    The data are standardized, passed through the dimensionality
    reduction step, and the resulting reduced features are regressed
    against the standardized curves (rolling or full-sample, depending
    on ``rolling_window``).

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame, shape (n_samples, n_points)
        Functional time series data.

    Returns
    -------
    self : object
        Fitted forecaster.
    """
    # Accept DataFrames by unwrapping them before validation
    raw = X.values if isinstance(X, pd.DataFrame) else X
    X = check_array(raw)

    n_samples, n_points = X.shape
    self.X_ = X.copy()
    self.n_samples_ = n_samples
    self.n_points_ = n_points

    # Step 1: put every domain point on a common scale
    scaler = StandardScaler()
    self.scaler_ = scaler
    X_scaled = scaler.fit_transform(X)

    # Step 2: learn the dimensionality reduction
    self._fit_reduction_method(X_scaled)

    # Step 3: project the curves onto the reduced feature space
    self.reduced_features_ = self.reducer_.transform(X_scaled)

    # Step 4: regress curves on reduced features (rolling or full-sample)
    fit_regression = (
        self._fit_rolling_regression
        if self.rolling_window is not None
        else self._fit_full_regression
    )
    fit_regression(X_scaled)

    self.is_fitted_ = True
    return self
Fit the functional forecaster.
Parameters
X : np.ndarray or pd.DataFrame, shape (n_samples, n_points) Functional time series data.
Returns
self : object Fitted forecaster.
class RankTargetEncoder(BaseEstimator, TransformerMixin):
    """
    Rank-based target encoder using Spearman rho or Kendall tau via
    Gaussian copula with proper cross-validation.

    This encoder uses cross-validation and pseudo-targets generated via
    Gaussian copula with specified rank correlation to create robust,
    regularized encodings that prevent overfitting.

    Parameters:
    -----------
    correlation_type : str, default='spearman'
        Type of rank correlation ('spearman' or 'kendall').
    correlation_strength : float, default=0.5
        Desired strength of rank correlation (between 0 and 1).
    shrinkage : float, default=10
        Shrinkage parameter for regularization (Bayesian average).
    n_folds : int, default=3
        Number of CV folds for leakage-free encoding.
    ensemble_size : int, default=5
        Number of pseudo-targets to average over (reduces variance).
    aggregate : str, default='mean'
        Aggregation method for combining values within categories ('mean' or 'median').
    random_state : int, default=42
        Random seed for reproducibility.
    """

    def __init__(
        self,
        correlation_type="spearman",
        correlation_strength=0.5,
        shrinkage=10,
        n_folds=3,
        ensemble_size=5,
        aggregate="mean",
        random_state=42,
    ):
        self.correlation_type = correlation_type
        self.correlation_strength = correlation_strength
        self.shrinkage = shrinkage
        self.n_folds = n_folds
        self.ensemble_size = ensemble_size
        self.aggregate = aggregate
        self.random_state = random_state
        self.cat_columns_ = []

        # Validate inputs
        if correlation_type not in ["spearman", "kendall"]:
            raise ValueError("correlation_type must be 'spearman' or 'kendall'")
        if not (0 <= correlation_strength <= 1):
            raise ValueError("correlation_strength must be in [0, 1]")
        if shrinkage < 0:
            raise ValueError("shrinkage must be non-negative")
        if n_folds < 2:
            raise ValueError("n_folds must be at least 2")
        if ensemble_size < 1:
            raise ValueError("ensemble_size must be at least 1")
        if aggregate not in ["mean", "median"]:
            raise ValueError("aggregate must be 'mean' or 'median'")

    def _generate_pseudo_target(self, y, random_state):
        """Generate pseudo-target with specified rank correlation to y."""
        y = np.asarray(y)
        n = len(y)
        if n <= 1:
            return y.copy()

        # Convert to uniform margins via ranks; dividing by n + 1 keeps the
        # values strictly inside (0, 1) so norm.ppf stays finite
        ranks = rankdata(y, method="average")
        u_y = ranks / (n + 1)

        # Transform to Gaussian
        g_y = norm.ppf(u_y)

        # Convert rank correlation to Gaussian correlation
        if self.correlation_type == "spearman":
            rho_g = 2 * np.sin(np.pi * self.correlation_strength / 6)
        else:  # kendall
            rho_g = np.sin(np.pi * self.correlation_strength / 2)
        rho_g = np.clip(rho_g, -1.0, 1.0)

        # Generate correlated Gaussian variable
        rng = np.random.RandomState(random_state)
        eta = rng.normal(size=n)
        g_z = rho_g * g_y + np.sqrt(1 - rho_g**2) * eta

        # Transform back to original scale via quantiles
        u_z = norm.cdf(g_z)
        y_sorted = np.sort(y)
        return np.quantile(y_sorted, u_z, method="linear")

    def _compute_category_statistics(self, categories, values):
        """
        Compute the per-category aggregate (mean or median) and count.

        Returns a dict mapping each category to ``(statistic, count)``.
        """
        if len(categories) == 0:
            return {}

        stat_name = "mean" if self.aggregate == "mean" else "median"
        df = pd.DataFrame({"cat": categories, "val": values})
        # observed=True: with a categorical dtype, categories absent from
        # this sample would otherwise appear with a NaN statistic and a
        # zero count, which then propagates NaN through the shrinkage step.
        cat_stats = df.groupby("cat", observed=True)["val"].agg(
            [stat_name, "count"]
        )
        return dict(
            zip(cat_stats.index, zip(cat_stats[stat_name], cat_stats["count"]))
        )

    def _apply_shrinkage(self, category_stats, global_stat):
        """Shrink each category statistic toward the global statistic."""
        return {
            cat: (count * stat + self.shrinkage * global_stat)
            / (count + self.shrinkage)
            for cat, (stat, count) in category_stats.items()
        }

    def _identify_categorical_columns(self, X):
        """Identify categorical columns in the DataFrame."""
        cat_cols = []
        for col in X.columns:
            # Object/category dtype, or low cardinality relative to the
            # number of rows (heuristic for categorical)
            if (
                X[col].dtype == "object"
                or X[col].dtype.name == "category"
                or X[col].nunique() / len(X) < 0.05
            ):
                cat_cols.append(col)
        return cat_cols

    def fit(self, X, y):
        """
        Fit the encoder using cross-validation to prevent leakage.

        Parameters:
        -----------
        X : pandas DataFrame
            Input features; categorical columns are detected automatically.
        y : array-like
            Target values, one per row of X.

        Returns:
        --------
        self
            Fitted encoder.
        """
        # Local import: only needed for the stable per-column seed offset.
        import zlib

        if not isinstance(X, pd.DataFrame):
            raise ValueError("X must be a pandas DataFrame")

        X = X.reset_index(drop=True)  # Ensure clean integer indices
        y = np.asarray(y)

        if len(X) != len(y):
            raise ValueError("X and y must have the same number of samples")

        if len(X) == 0:
            raise ValueError("X cannot be empty")

        self.feature_names_in_ = list(X.columns)
        self.y_mean_ = np.mean(y)
        self.category_mappings_ = {}

        # Identify categorical columns
        self.cat_columns_ = self._identify_categorical_columns(X)
        self.non_cat_columns_ = [
            col for col in X.columns if col not in self.cat_columns_
        ]

        # Set up cross-validation. With an integer random_state the splits
        # are identical on every call, so compute them once.
        kf = KFold(
            n_splits=self.n_folds, shuffle=True, random_state=self.random_state
        )
        folds = list(kf.split(X))

        for col in self.cat_columns_:
            if X[col].nunique() <= 1:
                # Handle constant columns
                self.category_mappings_[col] = {X[col].iloc[0]: self.y_mean_}
                continue

            # Plain object array of the category values: avoids building a
            # pandas object per row in the loops below.
            categories = X[col].astype(object).to_numpy()

            # Built-in hash() is salted per process for str, which made the
            # seeds (and the fitted encodings) differ between runs even with
            # a fixed random_state. CRC32 of the column name is stable.
            col_offset = zlib.crc32(str(col).encode("utf-8")) % 10000

            # Collect encodings for each category across all CV folds and
            # ensemble members
            category_encodings = defaultdict(list)

            for ensemble_idx in range(self.ensemble_size):
                ensemble_seed = self.random_state + ensemble_idx
                fold_encodings = np.full(len(y), np.nan)

                for fold_idx, (train_idx, val_idx) in enumerate(folds):
                    y_train_fold = y[train_idx]
                    if len(y_train_fold) == 0:
                        continue

                    # Deterministic seed based on ensemble member, fold and
                    # column
                    fold_seed = ensemble_seed + fold_idx * 1000 + col_offset
                    z_train = self._generate_pseudo_target(
                        y_train_fold, fold_seed
                    )

                    cat_stats = self._compute_category_statistics(
                        categories[train_idx], z_train
                    )
                    if not cat_stats:
                        continue

                    # Global statistic is the shrinkage target and the
                    # fallback for categories unseen in the training fold
                    if self.aggregate == "mean":
                        global_stat = np.mean(z_train)
                    else:  # median
                        global_stat = np.median(z_train)

                    regularized_stats = self._apply_shrinkage(
                        cat_stats, global_stat
                    )

                    # Encode validation fold (out-of-fold encodings only)
                    for idx in val_idx:
                        fold_encodings[idx] = regularized_stats.get(
                            categories[idx], global_stat
                        )

                # Collect encodings by category for this ensemble member
                for category, encoding in zip(categories, fold_encodings):
                    if not np.isnan(encoding):
                        category_encodings[category].append(encoding)

            # Average encodings for each category across all ensemble
            # members and folds
            self.category_mappings_[col] = {
                category: np.mean(encodings) if encodings else self.y_mean_
                for category, encodings in category_encodings.items()
            }

        return self

    def transform(self, X):
        """
        Transform categorical columns using learned encodings.

        Categories not seen during fitting are encoded with the training
        target mean. Non-categorical columns are returned unchanged.
        """
        if not hasattr(self, "category_mappings_"):
            raise NotFittedError(
                "This %s instance is not fitted yet." % self.__class__.__name__
            )

        if not isinstance(X, pd.DataFrame):
            raise ValueError("X must be a pandas DataFrame")

        # Check for missing columns
        missing_cols = set(self.feature_names_in_) - set(X.columns)
        if missing_cols:
            raise ValueError(f"Missing columns from training: {missing_cols}")

        X_encoded = X.copy()

        for col in self.cat_columns_:
            mappings = self.category_mappings_[col]
            # Map through object dtype: mapping a categorical-dtype column
            # can return a Categorical, on which fillna with a value that is
            # not an existing category fails.
            encoded = X[col].astype(object).map(mappings)
            X_encoded[col] = encoded.fillna(self.y_mean_).astype(float)

        return X_encoded

    def fit_transform(self, X, y, **fit_params):
        """Fit encoder and return encoded version of X."""
        return self.fit(X, y).transform(X)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation."""
        if not hasattr(self, "category_mappings_"):
            raise NotFittedError(
                "This %s instance is not fitted yet." % self.__class__.__name__
            )

        if input_features is None:
            return np.array(self.feature_names_in_)
        return np.array(input_features)

    def get_category_mappings(self):
        """Get a copy of the learned category mappings for inspection."""
        if not hasattr(self, "category_mappings_"):
            raise NotFittedError(
                "This %s instance is not fitted yet." % self.__class__.__name__
            )

        # Copy the inner dicts too so callers cannot mutate fitted state
        return {
            col: dict(mapping)
            for col, mapping in self.category_mappings_.items()
        }

    def validate_encoding(self, X, y, plot=True):
        """
        Comprehensive validation of the encoding process, including correlation
        preservation, distribution analysis, and category-level statistics.

        Parameters:
        -----------
        X : pandas DataFrame
            Input features (must be the same as used in fitting)
        y : array-like
            True target values
        plot : bool, default=True
            Whether to generate diagnostic plots

        Returns:
        --------
        dict
            Dictionary containing validation metrics and statistics
        """
        from scipy.stats import kendalltau, spearmanr

        if not hasattr(self, "category_mappings_"):
            raise NotFittedError(
                "This %s instance is not fitted yet." % self.__class__.__name__
            )

        if not isinstance(X, pd.DataFrame):
            raise ValueError("X must be a pandas DataFrame")

        X = X.reset_index(drop=True)
        y = np.asarray(y)

        if len(X) != len(y):
            raise ValueError("X and y must have the same number of samples")

        corr_func = (
            spearmanr if self.correlation_type == "spearman" else kendalltau
        )

        # Generate multiple pseudo-targets for robust statistics
        pseudo_targets = []
        correlations_achieved = []
        for i in range(self.ensemble_size):
            z = self._generate_pseudo_target(y, self.random_state + i)
            pseudo_targets.append(z)
            corr, _ = corr_func(y, z)
            correlations_achieved.append(corr)

        pseudo_targets = np.array(pseudo_targets)
        mean_pseudo_target = np.mean(pseudo_targets, axis=0)

        # Run the transformation as part of the validation pass
        self.transform(X)

        # Compute overall validation metrics
        validation_results = {
            "target_correlation": self.correlation_strength,
            "achieved_correlations": correlations_achieved,
            "mean_achieved_correlation": np.mean(correlations_achieved),
            "std_achieved_correlation": np.std(correlations_achieved),
            "correlation_bias": np.mean(correlations_achieved)
            - self.correlation_strength,
            "original_target_stats": {
                "mean": np.mean(y),
                "std": np.std(y),
                "min": np.min(y),
                "max": np.max(y),
                "median": np.median(y),
            },
            "pseudo_target_stats": {
                "mean": np.mean(mean_pseudo_target),
                "std": np.std(mean_pseudo_target),
                "min": np.min(mean_pseudo_target),
                "max": np.max(mean_pseudo_target),
                "median": np.median(mean_pseudo_target),
            },
        }

        # Category-level analysis
        category_correlations = {}
        category_stats = {}

        for col in self.cat_columns_:
            if col not in X.columns:
                continue

            unique_categories = X[col].unique()
            cat_corrs = []
            cat_means_original = []
            cat_means_pseudo = []
            cat_counts = []

            for category in unique_categories:
                mask = (X[col] == category).to_numpy()
                n_in_category = np.sum(mask)
                # Only analyze categories with sufficient samples
                if n_in_category > 5:
                    corr, _ = corr_func(y[mask], mean_pseudo_target[mask])
                    cat_corrs.append(corr)
                    cat_means_original.append(np.mean(y[mask]))
                    cat_means_pseudo.append(np.mean(mean_pseudo_target[mask]))
                    cat_counts.append(n_in_category)

            category_correlations[col] = {
                "mean_correlation": np.mean(cat_corrs) if cat_corrs else np.nan,
                "std_correlation": np.std(cat_corrs) if cat_corrs else np.nan,
                "min_correlation": np.min(cat_corrs) if cat_corrs else np.nan,
                "max_correlation": np.max(cat_corrs) if cat_corrs else np.nan,
            }

            category_stats[col] = {
                "n_categories": len(unique_categories),
                "n_analyzed_categories": len(cat_corrs),
                "category_means_original": cat_means_original,
                "category_means_pseudo": cat_means_pseudo,
                "category_counts": cat_counts,
            }

        validation_results["category_correlations"] = category_correlations
        validation_results["category_stats"] = category_stats

        # Generate plots if requested
        if plot:
            self._plot_validation(X, y, mean_pseudo_target, validation_results)

        return validation_results

    def _plot_validation(self, X, y, mean_pseudo_target, validation_results):
        """Draw the four diagnostic panels used by validate_encoding."""
        # Import here so a missing matplotlib is reported instead of
        # surfacing as an unrelated error.
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            print("Matplotlib/seaborn not available for plotting")
            return

        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        axes = axes.flatten()

        # Scatter plot: Original vs Pseudo-targets
        axes[0].scatter(y, mean_pseudo_target, alpha=0.6, s=20)
        axes[0].set_xlabel("Original Target")
        axes[0].set_ylabel("Pseudo Target")
        axes[0].set_title(
            f'Original vs Pseudo-targets\n{self.correlation_type.capitalize()} correlation: {validation_results["mean_achieved_correlation"]:.3f}'
        )

        # Add correlation line (degree-1 least-squares fit)
        trend = np.poly1d(np.polyfit(y, mean_pseudo_target, 1))
        axes[0].plot(y, trend(y), "r--", alpha=0.8)

        # Distribution comparison
        axes[1].hist(y, alpha=0.7, bins=30, label="Original", density=True)
        axes[1].hist(
            mean_pseudo_target,
            alpha=0.7,
            bins=30,
            label="Pseudo",
            density=True,
        )
        axes[1].set_xlabel("Value")
        axes[1].set_ylabel("Density")
        axes[1].set_title("Distribution Comparison")
        axes[1].legend()

        # Rank comparison
        original_ranks = rankdata(y, method="average")
        pseudo_ranks = rankdata(mean_pseudo_target, method="average")
        axes[2].scatter(original_ranks, pseudo_ranks, alpha=0.6, s=20)
        axes[2].set_xlabel("Original Ranks")
        axes[2].set_ylabel("Pseudo Ranks")
        axes[2].set_title("Rank Preservation")

        # Category analysis - residual plot
        residuals = y - mean_pseudo_target
        # Use first categorical column for coloring if available
        if self.cat_columns_:
            cat_col = self.cat_columns_[0]
            # Limit to the first 10 categories encountered
            unique_cats = X[cat_col].unique()[:10]
            colors = plt.cm.tab10(np.linspace(0, 1, len(unique_cats)))

            for i, category in enumerate(unique_cats):
                mask = (X[cat_col] == category).to_numpy()
                if np.sum(mask) > 0:
                    axes[3].scatter(
                        mean_pseudo_target[mask],
                        residuals[mask],
                        alpha=0.6,
                        s=20,
                        color=colors[i],
                        label=str(category),
                    )

            axes[3].axhline(y=0, color="r", linestyle="--", alpha=0.8)
            axes[3].set_xlabel("Pseudo Target")
            axes[3].set_ylabel("Residuals (Original - Pseudo)")
            axes[3].set_title("Residuals by Category")
            axes[3].legend(bbox_to_anchor=(1.05, 1), loc="upper left")
        else:
            axes[3].scatter(mean_pseudo_target, residuals, alpha=0.6, s=20)
            axes[3].axhline(y=0, color="r", linestyle="--", alpha=0.8)
            axes[3].set_xlabel("Pseudo Target")
            axes[3].set_ylabel("Residuals (Original - Pseudo)")
            axes[3].set_title("Residual Plot")

        plt.tight_layout()
        plt.show()

    def get_validation_report(self, validation_results):
        """
        Generate a human-readable validation report from validation results.

        Parameters:
        -----------
        validation_results : dict
            Results from validate_encoding method

        Returns:
        --------
        str
            Formatted validation report
        """
        rule = "=" * 60
        orig = validation_results["original_target_stats"]
        pseudo = validation_results["pseudo_target_stats"]

        report = [
            rule,
            "RANK TARGET ENCODER VALIDATION REPORT",
            rule,
            "\nCORRELATION VALIDATION:",
            f"Target {self.correlation_type} correlation: {validation_results['target_correlation']:.3f}",
            f"Achieved mean correlation: {validation_results['mean_achieved_correlation']:.3f}",
            f"Correlation bias: {validation_results['correlation_bias']:.3f}",
            f"Correlation std across ensemble: {validation_results['std_achieved_correlation']:.3f}",
            "\nDISTRIBUTION COMPARISON:",
            f"Original target - Mean: {orig['mean']:.3f}, Std: {orig['std']:.3f}",
            f"Pseudo target - Mean: {pseudo['mean']:.3f}, Std: {pseudo['std']:.3f}",
            "\nCATEGORY-LEVEL ANALYSIS:",
        ]

        for col, stats in validation_results["category_correlations"].items():
            # Columns where no category had enough samples carry NaN stats
            if not np.isnan(stats["mean_correlation"]):
                report.append(
                    f" {col}: {stats['mean_correlation']:.3f} ± {stats['std_correlation']:.3f} "
                    f"(min: {stats['min_correlation']:.3f}, max: {stats['max_correlation']:.3f})"
                )

        individual = [
            f"{c:.3f}" for c in validation_results["achieved_correlations"]
        ]
        report.append("\nROBUST STATISTICS:")
        report.append(f"Ensemble size: {self.ensemble_size}")
        report.append(f"Individual correlations: {individual}")

        report.append(rule)
        return "\n".join(report)
Rank-based target encoder using Spearman rho or Kendall tau via Gaussian copula with proper cross-validation.
This encoder uses cross-validation and pseudo-targets generated via Gaussian copula with specified rank correlation to create robust, regularized encodings that prevent overfitting.
Parameters:
correlation_type : str, default='spearman' Type of rank correlation ('spearman' or 'kendall'). correlation_strength : float, default=0.5 Desired strength of rank correlation (between 0 and 1). shrinkage : float, default=10 Shrinkage parameter for regularization (Bayesian average). n_folds : int, default=3 Number of CV folds for leakage-free encoding. ensemble_size : int, default=5 Number of pseudo-targets to average over (reduces variance). aggregate : str, default='mean' Aggregation method for combining values within categories ('mean' or 'median'). random_state : int, default=42 Random seed for reproducibility.
def fit(self, X, y):
    """
    Fit the encoder using cross-validation to prevent leakage.

    Parameters:
    -----------
    X : pandas DataFrame
        Input features; categorical columns are detected automatically.
    y : array-like
        Target values, one per row of X.

    Returns:
    --------
    self
        Fitted encoder.
    """
    # Local import: only needed for the stable per-column seed offset.
    import zlib

    if not isinstance(X, pd.DataFrame):
        raise ValueError("X must be a pandas DataFrame")

    X = X.reset_index(drop=True)  # Ensure clean integer indices
    y = np.asarray(y)

    if len(X) != len(y):
        raise ValueError("X and y must have the same number of samples")

    if len(X) == 0:
        raise ValueError("X cannot be empty")

    self.feature_names_in_ = list(X.columns)
    self.y_mean_ = np.mean(y)
    self.category_mappings_ = {}

    # Identify categorical columns
    self.cat_columns_ = self._identify_categorical_columns(X)
    self.non_cat_columns_ = [
        col for col in X.columns if col not in self.cat_columns_
    ]

    # Set up cross-validation. With an integer random_state the splits are
    # identical on every call, so compute them once.
    kf = KFold(
        n_splits=self.n_folds, shuffle=True, random_state=self.random_state
    )
    folds = list(kf.split(X))

    for col in self.cat_columns_:
        if X[col].nunique() <= 1:
            # Handle constant columns
            self.category_mappings_[col] = {X[col].iloc[0]: self.y_mean_}
            continue

        # Plain object array of the category values: avoids building a
        # pandas object per row in the loops below.
        categories = X[col].astype(object).to_numpy()

        # Built-in hash() is salted per process for str, which made the
        # seeds (and the fitted encodings) differ between runs even with a
        # fixed random_state. CRC32 of the column name is stable.
        col_offset = zlib.crc32(str(col).encode("utf-8")) % 10000

        # Collect encodings for each category across all CV folds and
        # ensemble members
        category_encodings = defaultdict(list)

        for ensemble_idx in range(self.ensemble_size):
            ensemble_seed = self.random_state + ensemble_idx
            fold_encodings = np.full(len(y), np.nan)

            for fold_idx, (train_idx, val_idx) in enumerate(folds):
                y_train_fold = y[train_idx]
                if len(y_train_fold) == 0:
                    continue

                # Deterministic seed based on ensemble member, fold and
                # column
                fold_seed = ensemble_seed + fold_idx * 1000 + col_offset
                z_train = self._generate_pseudo_target(y_train_fold, fold_seed)

                cat_stats = self._compute_category_statistics(
                    categories[train_idx], z_train
                )
                if not cat_stats:
                    continue

                # Global statistic is the shrinkage target and the fallback
                # for categories unseen in the training fold
                if self.aggregate == "mean":
                    global_stat = np.mean(z_train)
                else:  # median
                    global_stat = np.median(z_train)

                regularized_stats = self._apply_shrinkage(
                    cat_stats, global_stat
                )

                # Encode validation fold (out-of-fold encodings only)
                for idx in val_idx:
                    fold_encodings[idx] = regularized_stats.get(
                        categories[idx], global_stat
                    )

            # Collect encodings by category for this ensemble member
            for category, encoding in zip(categories, fold_encodings):
                if not np.isnan(encoding):
                    category_encodings[category].append(encoding)

        # Average encodings for each category across all ensemble members
        # and folds
        self.category_mappings_[col] = {
            category: np.mean(encodings) if encodings else self.y_mean_
            for category, encodings in category_encodings.items()
        }

    return self
Fit the encoder using cross-validation to prevent leakage.
class RollingOriginForecaster(BaseEstimator, RegressorMixin):
    """
    A flexible rolling origin forecaster that supports both autoregressive and
    exogenous features modes, with multiple prediction strategies.

    Three prediction strategies are available, chosen during ``fit``:

    - multi-output: one model predicts all ``max_horizon`` steps at once;
    - recursive: one single-step model is applied repeatedly, feeding each
      prediction back in as the newest feature;
    - direct: one separate model per horizon (``recursive=False``).

    Parameter handling (``get_params`` / ``set_params``) is inherited from
    ``BaseEstimator`` so that nested ``estimator__<param>`` keys and
    ``deep=True`` follow the scikit-learn contract.
    """

    def __init__(
        self,
        estimator,
        max_horizon=1,
        n_lags=1,
        mode="auto",
        multi_output="auto",
        recursive=True,
    ):
        self.estimator = estimator
        self.max_horizon = max_horizon
        self.n_lags = n_lags
        self.mode = mode
        self.multi_output = multi_output
        self.recursive = recursive

    def fit(self, X=None, y=None):
        """
        Fit the forecaster.

        With ``mode="auto"`` the autoregressive mode is used when ``X`` is
        None and the exogenous mode otherwise.

        Raises
        ------
        ValueError
            If ``y`` is None.
        """
        if y is None:
            raise ValueError("y cannot be None")

        # Determine mode
        if self.mode == "auto":
            self.mode_ = "ar" if X is None else "exog"
        else:
            self.mode_ = self.mode

        # Fit in appropriate mode
        if self.mode_ == "ar":
            return self._fit_ar(y)
        return self._fit_exog(X, y)

    def predict(self, X=None, h=None):
        """
        Forecast ``h`` steps ahead (defaults to ``max_horizon``).

        Returns an array of shape (1, h) in autoregressive mode and
        (n_samples, h) in exogenous mode.

        Raises
        ------
        ValueError
            If ``h`` exceeds ``max_horizon`` or ``X`` is None in exog mode.
        """
        check_is_fitted(self)

        # Validate horizon
        if h is None:
            h = self.max_horizon
        elif h > self.max_horizon:
            raise ValueError(
                f"Requested horizon {h} exceeds max_horizon {self.max_horizon}"
            )

        if self.mode_ == "ar":
            return self._predict_ar(h)

        if X is None:
            raise ValueError("X cannot be None in exog mode")
        X = check_array(X)
        return self._predict_exog(X, h)

    def _fit_ar(self, y):
        """Fit autoregressive model."""
        series = check_array(y, ensure_2d=False)

        # Validate series length
        if len(series) < self.n_lags + 1:
            raise ValueError(
                f"y must have at least n_lags+1 ({self.n_lags+1}) samples"
            )

        # Create lagged features matrix
        X_lagged, targets = self._create_lagged_features(series)

        # Fit model
        self.estimator_ = self._fit_model(X_lagged, targets)

        # Store last window for predictions. It must come from the full
        # series: the lag-trimmed targets hold fewer than n_lags values
        # whenever len(series) < 2 * n_lags.
        self.last_window_ = series[-self.n_lags :].reshape(1, -1)

        return self

    def _fit_exog(self, X, y):
        """Fit model with exogenous features."""
        X, y = check_X_y(X, y, multi_output=True)
        self.n_features_in_ = X.shape[1]

        # Validate series length
        if len(y) < self.max_horizon:
            raise ValueError(
                f"Need at least max_horizon ({self.max_horizon}) samples"
            )

        # Fit model
        self.estimator_ = self._fit_model(X, y)

        return self

    def _fit_model(self, X, y):
        """
        Fit with the selected strategy and return the fitted estimator.

        For the direct strategy the per-horizon models are stored in
        ``estimators_`` and the first of them is returned.
        """
        # Prediction dispatches on the presence of ``estimators_``; drop any
        # per-horizon models left over from an earlier fit that used the
        # direct strategy.
        if hasattr(self, "estimators_"):
            del self.estimators_

        # Determine multi-output strategy
        if self.multi_output == "auto":
            try:
                # Probe: can the estimator fit a 2-D target?
                probe = clone(self.estimator)
                n_probe = min(10, len(X))
                probe.fit(X[:n_probe], np.zeros((n_probe, self.max_horizon)))
                self.multi_output_ = True
            except Exception:
                # Deliberately broad: any failure means "not supported"
                self.multi_output_ = False
        else:
            self.multi_output_ = self.multi_output

        # Prepare targets for multi-step forecasting
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        if self.multi_output_:
            # Create shifted targets matrix: column i holds the target
            # i steps after the row's first target
            T = len(y)
            if T < self.max_horizon:
                raise ValueError("Time series too short for max_horizon")
            n_rows = T - self.max_horizon + 1
            y_shifted = np.zeros((n_rows, self.max_horizon))
            for i in range(self.max_horizon):
                y_shifted[:, i] = y[i : n_rows + i].ravel()

            # Trim X to the rows that have a full set of targets
            return clone(self.estimator).fit(X[:n_rows], y_shifted)

        if self.recursive:
            # Single model for recursive predictions
            return clone(self.estimator).fit(X, y.ravel())

        # Separate models for each horizon. The fitted list is assigned only
        # once complete, and a real fitted estimator is returned instead of
        # ``self`` (which made ``estimator_.predict`` call back into this
        # object).
        horizon_models = []
        for i in range(self.max_horizon):
            est = clone(self.estimator)
            est.fit(X[: len(y) - i], y[i:].ravel())
            horizon_models.append(est)
        self.estimators_ = horizon_models
        return horizon_models[0]

    def _predict_ar(self, h):
        """Make autoregressive predictions; returns an array of shape (1, h)."""
        window = self.last_window_.copy()
        predictions = np.zeros((1, h))

        if hasattr(self, "estimators_"):
            # Direct strategy: model i forecasts i steps past the window
            for i in range(h):
                step = self.estimators_[i].predict(window)
                predictions[0, i] = np.ravel(step)[0]
            return predictions

        if self.multi_output_:
            # One call yields every horizon; keep the first h
            all_steps = np.asarray(self.estimator_.predict(window))
            predictions[0, :] = all_steps.reshape(1, -1)[0, :h]
            return predictions

        # Recursive strategy: feed each prediction back as the newest lag
        for i in range(h):
            pred = np.ravel(self.estimator_.predict(window))[0]
            predictions[0, i] = pred
            window = np.roll(window, -1)
            window[0, -1] = pred

        return predictions

    def _predict_exog(self, X, h):
        """Make predictions with exogenous features; shape (n_samples, h)."""
        if hasattr(self, "estimators_"):  # Direct strategy
            preds = np.zeros((X.shape[0], h))
            for i in range(h):
                preds[:, i] = self.estimators_[i].predict(X).ravel()
            return preds

        if self.multi_output_:
            pred_out = np.asarray(self.estimator_.predict(X))
            if pred_out.ndim == 1:
                # Some estimators flatten a single-column target
                pred_out = pred_out.reshape(-1, 1)
            if pred_out.shape[1] < h:
                # Pad if model outputs fewer horizons than requested
                pad_width = ((0, 0), (0, h - pred_out.shape[1]))
                pred_out = np.pad(pred_out, pad_width, mode="constant")
            return pred_out[:, :h]

        # Recursive strategy. The feature window is rolled forward at every
        # step; rebuilding it from the original X each time would drop all
        # but the most recent prediction.
        preds = np.zeros((X.shape[0], h))
        X_current = X
        for i in range(h):
            step_pred = np.asarray(self.estimator_.predict(X_current))
            if step_pred.ndim == 1:
                step_pred = step_pred.reshape(-1, 1)
            preds[:, i] = step_pred.ravel()
            if X.shape[1] > 1:
                X_current = np.hstack([X_current[:, 1:], step_pred])
            else:
                X_current = step_pred

        return preds

    def _create_lagged_features(self, y):
        """
        Create lagged features matrix for AR mode.

        Returns ``(X, y)`` where row ``t`` of ``X`` holds
        ``y[t], ..., y[t + n_lags - 1]`` and the returned ``y`` is the
        input with its first ``n_lags`` values removed.
        """
        if len(y) <= self.n_lags:
            raise ValueError("Not enough samples to create lagged features")
        n_samples = len(y) - self.n_lags
        X = np.zeros((n_samples, self.n_lags))
        for i in range(self.n_lags):
            X[:, i] = y[i : i + n_samples]
        return X, y[self.n_lags :]
A flexible rolling origin forecaster that supports both autoregressive and exogenous features modes, with multiple prediction strategies.
def fit(self, X=None, y=None):
    """Resolve the working mode and delegate to the matching fit routine.

    With ``mode == "auto"`` the mode becomes "ar" when ``X`` is None and
    "exog" otherwise; any other value of ``mode`` is used as given. The
    resolved mode is stored in ``mode_``. Raises ValueError when ``y`` is
    None. Returns whatever the delegated routine returns.
    """
    if y is None:
        raise ValueError("y cannot be None")

    resolved_mode = self.mode
    if resolved_mode == "auto":
        resolved_mode = "exog" if X is not None else "ar"
    self.mode_ = resolved_mode

    # Anything that is not explicitly "ar" goes down the exogenous path.
    if self.mode_ != "ar":
        return self._fit_exog(X, y)
    return self._fit_ar(y)
def predict(self, X=None, h=None):
    """Forecast ``h`` steps ahead, defaulting to ``max_horizon``.

    Raises ValueError when ``h`` is larger than ``max_horizon`` or when
    ``X`` is None outside autoregressive mode.
    """
    check_is_fitted(self)

    # An omitted horizon means "as far as the model was configured for".
    if h is None:
        h = self.max_horizon
    if h > self.max_horizon:
        raise ValueError(
            f"Requested horizon {h} exceeds max_horizon {self.max_horizon}"
        )

    if self.mode_ == "ar":
        return self._predict_ar(h)

    if X is None:
        raise ValueError("X cannot be None in exog mode")
    validated_X = check_array(X)
    return self._predict_exog(validated_X, h)
def penalized_cross_val_score(
    estimator,
    X,
    y,
    param_dict,
    cv=5,
    scorer=None,
    penalty_strength=0.1,
    penalty_type="ci",
    greater_is_better=False,
):
    """
    Calculates a penalized cross-validation score that balances mean performance
    and result stability (low variability across folds).

    Parameters:
    -----------
    estimator : sklearn estimator
        Model to evaluate. It is cloned; the passed object is not modified.
    X, y : array-like
        Training data.
    param_dict : dict
        Hyperparameters to set on the estimator.
    cv : int, default=5
        Number of cross-validation folds (must be >= 2).
    scorer : callable or str, optional
        Scikit-learn scorer (e.g., from sklearn.metrics.make_scorer).
    penalty_strength : float, default=0.1
        Multiplicative factor for the variability penalty. The penalty is
        penalty_strength * variability / |mean score| (or
        penalty_strength * variability when the mean score is ~0).
    penalty_type : {'std', 'max', 'range', 'ci'}
        Type of variability to penalize:
        - 'std': standard deviation of fold scores
        - 'max': maximum deviation from mean across folds
        - 'range': difference between best and worst fold
        - 'ci': approximate 95% confidence interval width (2 * SEM)
    greater_is_better : bool
        Whether higher raw scores are better (e.g., accuracy=True, RMSE=False).

    Returns:
    --------
    penalized_score : float
        Mean CV score with the penalty subtracted when greater_is_better is
        True and added otherwise, so unstable models always look worse in the
        direction the score is read.

    Raises:
    -------
    ValueError
        For a negative penalty_strength, cv < 2, an unknown penalty_type,
        hyperparameters the estimator does not have, or empty CV scores.
    """
    # Cheap argument checks come first so that a typo never costs a full
    # cross-validation run.
    if penalty_strength < 0:
        raise ValueError("penalty_strength must be non-negative.")

    if cv < 2:
        raise ValueError("cv must be at least 2.")

    variability_measures = {
        "std": np.std,
        "max": lambda scores: np.max(np.abs(scores - np.mean(scores))),
        "range": np.ptp,  # max - min
        # Approximate 95% CI width: 2 * standard error of the mean
        "ci": lambda scores: 2 * (np.std(scores) / np.sqrt(len(scores))),
    }
    if penalty_type not in variability_measures:
        raise ValueError("penalty_type must be 'std', 'max', 'range', or 'ci'.")

    # Validate parameters
    estimator_params = estimator.get_params()
    missing_params = [key for key in param_dict if key not in estimator_params]
    if missing_params:
        raise ValueError(
            f"Estimator does not have parameters: {', '.join(missing_params)}"
        )

    # Clone and configure estimator
    current_estimator = clone(estimator)
    current_estimator.set_params(**param_dict)

    # Perform cross-validation
    cv_scores = cross_val_score(current_estimator, X, y, cv=cv, scoring=scorer)

    if len(cv_scores) == 0:
        raise ValueError("Cross-validation scores are empty.")

    mean_score = np.mean(cv_scores)
    variability_measure = variability_measures[penalty_type](cv_scores)

    # Scale penalty relative to mean score magnitude
    if abs(mean_score) > 1e-10:  # avoid division by zero
        normalized_penalty = penalty_strength * (
            variability_measure / abs(mean_score)
        )
    else:
        normalized_penalty = penalty_strength * variability_measure

    # Apply penalty: make worse for instability
    if greater_is_better:
        return mean_score - normalized_penalty  # lower score = penalized
    return mean_score + normalized_penalty  # higher score = penalized
Calculates a penalized cross-validation score that balances mean performance and result stability (low variability across folds).
Parameters:
estimator : sklearn estimator Model to evaluate. X, y : array-like Training data. param_dict : dict Hyperparameters to set on the estimator. cv : int, default=5 Number of cross-validation folds (must be >= 2). scorer : callable or str, optional Scikit-learn scorer (e.g., from sklearn.metrics.make_scorer). penalty_strength : float, default=0.1 Multiplicative factor for the variability penalty. The penalty applied to the mean score is penalty_strength * variability / |mean score| (or penalty_strength * variability when the mean score is approximately zero). penalty_type : {'std', 'max', 'range', 'ci'} Type of variability to penalize: - 'std': standard deviation of fold scores - 'max': maximum deviation from mean across folds - 'range': difference between best and worst fold - 'ci': approximate 95% confidence interval width (2 * SEM) greater_is_better : bool Whether higher raw scores are better (e.g., accuracy=True, RMSE=False).
Returns:
penalized_score : float Mean CV score adjusted by the penalty: the penalty is subtracted when greater_is_better is True and added otherwise, so unstable models always score worse.
def _weights_with_min_count(raw_weights, n_samples, min_per_class=2):
    """Convert raw class proportions into weights whose implied class counts
    sum to n_samples and are all at least min_per_class."""
    raw = np.asarray(raw_weights, dtype=float)
    n_classes = len(raw)
    total_min = n_classes * min_per_class
    if total_min > n_samples:
        # The floor cannot be honoured; fall back to a uniform split.
        return (np.ones(n_classes) / n_classes).tolist()

    raw = raw / raw.sum()
    # Reserve the floor for every class, then share the rest proportionally.
    counts = min_per_class + np.floor(raw * (n_samples - total_min)).astype(
        int
    )
    # Hand the rounding remainder to the largest class so that no small class
    # is ever pushed below the floor.
    counts[np.argmax(counts)] += n_samples - counts.sum()
    return (counts / n_samples).tolist()


def make_diverse_classification(n_datasets=100, random_state=None):
    """Yield randomly parameterized synthetic classification datasets.

    Each iteration samples dataset size, dimensionality, class structure and
    noise settings, then calls ``make_classification`` with them.

    Parameters:
    -----------
    n_datasets : int, default=100
        Number of generation attempts. Attempts whose sampled parameters are
        inconsistent, or for which ``make_classification`` raises, are
        skipped, so fewer than ``n_datasets`` items may be yielded.
    random_state : int, Generator or None
        Seed passed to ``np.random.default_rng``.

    Yields:
    -------
    (X, y, metadata) : tuple
        Features, labels and a dict of the sampled generation parameters.
    """
    rng = np.random.default_rng(random_state)
    min_per_class = 2

    for _ in range(n_datasets):
        # Sample parameters
        n_samples = int(loguniform(100, 10000).rvs(random_state=rng))
        n_features = int(rng.uniform(10, 50))

        # --- Step 1: Choose n_classes safely ---
        max_classes_by_sample = max(2, n_samples // 10)
        n_classes = int(rng.integers(2, min(100, max_classes_by_sample) + 1))
        n_classes = max(2, min(n_classes, n_samples // 2))

        # --- Step 2: Class weights with minimum samples ---
        raw_weights = dirichlet.rvs(
            [0.5] * n_classes, random_state=rng.integers(0, 2**32)
        )[0]
        weights = _weights_with_min_count(raw_weights, n_samples, min_per_class)

        # --- Step 3: Informative features ---
        # Must support n_classes * n_clusters_per_class <= 2 ** n_informative
        n_informative = max(4, int(rng.uniform(4, min(10, n_features))))
        n_informative = min(n_informative, n_features - 6, n_samples - 1)

        # Cap n_classes based on n_informative
        max_possible_classes = 2**n_informative
        if n_classes > max_possible_classes:
            n_classes = max_possible_classes
            # Recompute weights for the reduced number of classes
            raw_weights = dirichlet.rvs(
                [0.5] * n_classes, random_state=rng.integers(0, 2**32)
            )[0]
            weights = _weights_with_min_count(
                raw_weights, n_samples, min_per_class
            )

        # --- Step 4: Redundant, repeated, noise ---
        n_redundant = min(
            n_informative,
            int(rng.uniform(0, 0.5) * (n_features - n_informative)),
        )
        available = n_features - n_informative - n_redundant
        n_repeated = (
            int(rng.uniform(0, 0.2) * available) if available > 0 else 0
        )
        n_noise = n_features - n_informative - n_redundant - n_repeated

        if n_noise < 0:
            continue  # should not happen

        # --- Step 5: Clusters per class ---
        max_clusters_total = 2**n_informative
        n_clusters_per_class = int(rng.integers(1, 4))
        n_clusters_per_class = min(
            n_clusters_per_class, max_clusters_total // n_classes
        )
        n_clusters_per_class = max(1, n_clusters_per_class)

        # --- Step 6: Other parameters ---
        class_sep = loguniform(0.1, 10).rvs(random_state=rng)
        flip_y = rng.uniform(0.0, 0.5)
        hypercube = bool(rng.choice([True, False]))
        shift = rng.uniform(-1, 1, n_features) if rng.random() < 0.5 else 0.0
        scale = rng.uniform(0.5, 5.0)

        # --- Final safety ---
        if n_informative + n_redundant + n_repeated > n_features:
            continue

        try:
            X, y = make_classification(
                n_samples=n_samples,
                n_features=n_features,
                n_informative=n_informative,
                n_redundant=n_redundant,
                n_repeated=n_repeated,
                n_classes=n_classes,
                n_clusters_per_class=n_clusters_per_class,
                weights=weights,
                flip_y=flip_y,
                class_sep=class_sep,
                hypercube=hypercube,
                shift=shift,
                scale=scale,
                shuffle=True,
                random_state=rng.integers(0, 2**32),
            )
        except Exception as e:
            # Best effort: report the rejected configuration and move on.
            print(f"Skipped due to error: {e}")
            continue

        metadata = {
            "n_samples": n_samples,
            "n_features": n_features,
            "n_classes": n_classes,
            "n_informative": n_informative,
            "n_redundant": n_redundant,
            "n_repeated": n_repeated,
            "n_noise": n_noise,
            "weights": weights,
            "flip_y": flip_y,
            "class_sep": class_sep,
            "n_clusters_per_class": n_clusters_per_class,
            "hypercube": hypercube,
            "scale": scale,
        }

        yield X, y, metadata
class HealthcareTimeSeriesGenerator:
    """Generate synthetic hospital data: patients, vitals/labs and outcomes.

    Typical flow: ``generate_patient_demographics`` ->
    ``generate_time_series`` -> ``generate_outcomes`` ->
    ``create_visualizations``.

    NOTE: the constructor seeds the *global* ``numpy.random`` and ``random``
    generators. Admission dates are drawn relative to ``datetime.now()``, so
    timestamps differ between runs even with a fixed seed.
    """

    def __init__(self, seed=42):
        np.random.seed(seed)
        random.seed(seed)

        # Define realistic ranges for vital signs and lab values
        self.vital_ranges = {
            "heart_rate": (60, 100, 10),  # (min, max, std)
            "systolic_bp": (90, 140, 15),
            "diastolic_bp": (60, 90, 10),
            "temperature": (36.1, 37.2, 0.3),  # Celsius
            "respiratory_rate": (12, 20, 3),
            "oxygen_saturation": (95, 100, 2),
        }

        self.lab_ranges = {
            "glucose": (70, 110, 20),  # mg/dL
            "creatinine": (0.6, 1.2, 0.2),  # mg/dL
            "hemoglobin": (12, 16, 1.5),  # g/dL
            "white_blood_cells": (4000, 11000, 1500),  # cells/μL
            "sodium": (136, 145, 3),  # mEq/L
            "potassium": (3.5, 5.0, 0.4),  # mEq/L
        }

        # Medical conditions that affect vital signs
        self.conditions = [
            "hypertension",
            "diabetes",
            "copd",
            "heart_failure",
            "kidney_disease",
            "anemia",
            "infection",
            "healthy",
        ]

    def generate_patient_demographics(self, n_patients=100):
        """Generate realistic patient demographics.

        Returns a DataFrame with one row per patient and the columns
        patient_id, age, gender, conditions (list), admission_date and
        length_of_stay.
        """
        patients = []

        for i in range(n_patients):
            age = np.random.normal(65, 15)  # Average hospital patient age
            age = max(18, min(95, int(age)))  # Clamp between 18-95

            gender = random.choice(["M", "F"])

            # Assign conditions based on age and gender
            conditions = self._assign_conditions(age, gender)

            patients.append(
                {
                    "patient_id": f"P{i+1:04d}",
                    "age": age,
                    "gender": gender,
                    "conditions": conditions,
                    "admission_date": self._random_date(),
                    "length_of_stay": random.randint(1, 30),
                }
            )

        return pd.DataFrame(patients)

    def _assign_conditions(self, age, gender):
        """Assign medical conditions based on demographics.

        Returns a non-empty list; ["healthy"] when nothing else was drawn.
        """
        conditions = []

        # Age-related condition probabilities
        if age > 50:
            if random.random() < 0.3:
                conditions.append("hypertension")
            if random.random() < 0.15:
                conditions.append("diabetes")
            if random.random() < 0.1:
                conditions.append("heart_failure")

        if age > 60:
            if random.random() < 0.08:
                conditions.append("copd")
            if random.random() < 0.12:
                conditions.append("kidney_disease")

        if gender == "F" and random.random() < 0.1:
            conditions.append("anemia")

        if random.random() < 0.05:
            conditions.append("infection")

        if not conditions:
            conditions.append("healthy")

        return conditions

    def _random_date(self):
        """Generate random date within last 2 years (relative to now)."""
        start_date = datetime.now() - timedelta(days=730)
        random_days = random.randint(0, 730)
        return start_date + timedelta(days=random_days)

    def generate_time_series(
        self, patients_df, measurements_per_day=4, include_missing=True
    ):
        """Generate time series data for all patients.

        Returns a DataFrame with one row per measurement: patient_id,
        timestamp, day_of_stay, every vital sign and every lab value (NaN
        where no lab was drawn or a value was blanked out).
        """
        all_measurements = []

        for _, patient in patients_df.iterrows():
            all_measurements.extend(
                self._generate_patient_timeseries(
                    patient, measurements_per_day, include_missing
                )
            )

        return pd.DataFrame(all_measurements)

    def _generate_patient_timeseries(
        self, patient, measurements_per_day, include_missing
    ):
        """Generate time series for a single patient (list of dicts)."""
        measurements = []

        start_date = patient["admission_date"]
        length_of_stay = patient["length_of_stay"]
        conditions = patient["conditions"]

        # Generate measurements for each day
        for day in range(length_of_stay):
            current_date = start_date + timedelta(days=day)

            # Generate multiple measurements per day, evenly spaced
            for measurement_num in range(measurements_per_day):
                timestamp = current_date + timedelta(
                    hours=measurement_num * (24 / measurements_per_day)
                )

                measurement = {
                    "patient_id": patient["patient_id"],
                    "timestamp": timestamp,
                    "day_of_stay": day + 1,
                }

                # Generate vital signs
                vitals = self._generate_vitals(conditions, day, patient["age"])
                measurement.update(vitals)

                # Generate lab values (less frequent)
                if (
                    measurement_num == 0 or random.random() < 0.1
                ):  # Morning labs or random
                    labs = self._generate_labs(conditions, day)
                    measurement.update(labs)
                else:
                    # Add NaN for missing lab values
                    for lab in self.lab_ranges:
                        measurement[lab] = np.nan

                # Add some random missing values if requested
                if include_missing:
                    measurement = self._add_missing_values(measurement)

                measurements.append(measurement)

        return measurements

    def _generate_vitals(self, conditions, day, age):
        """Generate vital signs based on patient conditions and progression.

        Returns a dict mapping each key of ``vital_ranges`` to one value,
        clamped to a plausible range and rounded to one decimal.
        """
        vitals = {}

        for vital, (base_min, base_max, base_std) in self.vital_ranges.items():
            base_mean = (base_min + base_max) / 2

            # Adjust based on conditions
            mean_adjustment = 0

            if "hypertension" in conditions:
                if "systolic" in vital:
                    mean_adjustment += 20
                elif "diastolic" in vital:
                    mean_adjustment += 10

            if "heart_failure" in conditions:
                if vital == "heart_rate":
                    mean_adjustment += 15
                elif vital == "respiratory_rate":
                    mean_adjustment += 5
                elif vital == "oxygen_saturation":
                    mean_adjustment -= 3

            if "copd" in conditions:
                if vital == "respiratory_rate":
                    mean_adjustment += 8
                elif vital == "oxygen_saturation":
                    mean_adjustment -= 5

            if "infection" in conditions:
                if vital == "temperature":
                    mean_adjustment += np.random.normal(1.5, 0.5)
                elif vital == "heart_rate":
                    mean_adjustment += 20

            # Age adjustments
            if age > 70:
                if vital == "systolic_bp":
                    mean_adjustment += 10
                elif vital == "heart_rate":
                    mean_adjustment -= 5

            # Day progression (recovery/deterioration)
            day_effect = np.sin(day * 0.2) * 2  # Subtle oscillation

            # Generate value
            adjusted_mean = base_mean + mean_adjustment + day_effect
            value = np.random.normal(adjusted_mean, base_std)

            # Apply realistic bounds
            if vital == "temperature":
                value = max(35.0, min(42.0, value))
            elif vital == "oxygen_saturation":
                value = max(70, min(100, value))
            elif vital == "heart_rate":
                value = max(40, min(180, value))
            elif "bp" in vital:
                value = max(40, min(200, value))
            elif vital == "respiratory_rate":
                value = max(8, min(40, value))

            vitals[vital] = round(value, 1)

        return vitals

    def _generate_labs(self, conditions, day):
        """Generate lab values based on conditions.

        ``day`` is accepted for signature symmetry with ``_generate_vitals``
        but is not used. Values are clamped and rounded to two decimals.
        """
        labs = {}

        for lab, (base_min, base_max, base_std) in self.lab_ranges.items():
            base_mean = (base_min + base_max) / 2

            # Condition-based adjustments
            mean_adjustment = 0

            if "diabetes" in conditions and lab == "glucose":
                mean_adjustment += np.random.normal(50, 20)

            if "kidney_disease" in conditions:
                if lab == "creatinine":
                    mean_adjustment += np.random.normal(1.0, 0.5)
                elif lab == "potassium":
                    mean_adjustment += np.random.normal(0.5, 0.2)

            if "anemia" in conditions and lab == "hemoglobin":
                mean_adjustment -= np.random.normal(3, 1)

            if "infection" in conditions and lab == "white_blood_cells":
                mean_adjustment += np.random.normal(5000, 2000)

            # Generate value
            adjusted_mean = base_mean + mean_adjustment
            value = np.random.normal(adjusted_mean, base_std)

            # Apply bounds
            if lab == "glucose":
                value = max(30, min(500, value))
            elif lab == "creatinine":
                value = max(0.3, min(10.0, value))
            elif lab == "hemoglobin":
                value = max(5.0, min(20.0, value))
            elif lab == "white_blood_cells":
                value = max(1000, min(50000, value))
            elif lab == "sodium":
                value = max(120, min(160, value))
            elif lab == "potassium":
                value = max(2.0, min(7.0, value))

            labs[lab] = round(value, 2)

        return labs

    def _add_missing_values(self, measurement, missing_prob=0.05):
        """Randomly add missing values to simulate real-world data.

        Mutates and returns ``measurement``; identifier fields are never
        blanked and values that are already missing are left alone.
        """
        protected = ["patient_id", "timestamp", "day_of_stay"]
        # Only existing keys are reassigned, so iterating the dict is safe.
        for key, value in measurement.items():
            if key not in protected and not pd.isna(value):
                if random.random() < missing_prob:
                    measurement[key] = np.nan
        return measurement

    def generate_outcomes(self, patients_df, timeseries_df):
        """Generate patient outcomes based on their data.

        Returns a DataFrame with patient_id, readmitted_30_days,
        mortality_risk_score, length_of_stay_actual and
        discharge_disposition.
        """
        outcomes = []

        for _, patient in patients_df.iterrows():
            patient_data = timeseries_df[
                timeseries_df["patient_id"] == patient["patient_id"]
            ]

            # Calculate outcome probability based on conditions and vital
            # trends
            readmission_prob = self._calculate_readmission_risk(
                patient, patient_data
            )
            mortality_risk = self._calculate_mortality_risk(
                patient, patient_data
            )

            outcomes.append(
                {
                    "patient_id": patient["patient_id"],
                    "readmitted_30_days": random.random() < readmission_prob,
                    "mortality_risk_score": round(mortality_risk, 3),
                    "length_of_stay_actual": patient["length_of_stay"],
                    "discharge_disposition": self._assign_discharge_disposition(
                        patient, mortality_risk
                    ),
                }
            )

        return pd.DataFrame(outcomes)

    def _calculate_readmission_risk(self, patient, patient_data):
        """Calculate 30-day readmission risk (capped at 0.8)."""
        base_risk = 0.1  # 10% base readmission rate

        # Condition-based risk
        if "heart_failure" in patient["conditions"]:
            base_risk += 0.15
        if "diabetes" in patient["conditions"]:
            base_risk += 0.08
        if "kidney_disease" in patient["conditions"]:
            base_risk += 0.12

        # Age-based risk
        if patient["age"] > 75:
            base_risk += 0.1

        # Vital signs instability
        if len(patient_data) > 0:
            hr_std = patient_data["heart_rate"].std()
            if hr_std > 15:
                base_risk += 0.05

        return min(0.8, base_risk)

    def _calculate_mortality_risk(self, patient, patient_data):
        """Calculate mortality risk score (capped at 1.0)."""
        # Age component
        # NOTE(review): with a 0.02 weight any age >= 50 already reaches the
        # 1.0 cap on its own, so the score is saturated for most generated
        # patients - confirm the intended scale before relying on it.
        risk_score = patient["age"] * 0.02

        # Condition components
        condition_weights = {
            "heart_failure": 0.3,
            "kidney_disease": 0.25,
            "copd": 0.2,
            "infection": 0.15,
            "diabetes": 0.1,
            "hypertension": 0.05,
        }

        for condition in patient["conditions"]:
            if condition in condition_weights:
                risk_score += condition_weights[condition]

        # Vital signs component
        if len(patient_data) > 0:
            # Abnormal vital signs increase risk
            avg_o2_sat = patient_data["oxygen_saturation"].mean()
            if avg_o2_sat < 92:
                risk_score += 0.2

            avg_temp = patient_data["temperature"].mean()
            if avg_temp > 38.5:
                risk_score += 0.15

        return min(1.0, risk_score)

    def _assign_discharge_disposition(self, patient, mortality_risk):
        """Assign discharge disposition from the mortality risk band."""
        if mortality_risk > 0.7:
            return random.choice(["ICU Transfer", "Deceased"])
        if mortality_risk > 0.4:
            return random.choice(
                ["Skilled Nursing Facility", "Home with Services"]
            )
        return random.choice(["Home", "Home with Services", "Rehabilitation"])

    def create_visualizations(self, patients, timeseries, outcomes):
        """Create comprehensive visualizations of the healthcare data.

        Draws a 3x4 overview grid, saves it as
        healthcare_data_visualization.png, shows it, then calls
        ``create_detailed_plots``.
        """

        # Set up the plotting style
        plt.style.use("default")
        sns.set_palette("husl")

        # Create figure with subplots
        plt.figure(figsize=(20, 16))

        # 1. Patient Demographics
        plt.subplot(3, 4, 1)
        patients["age"].hist(
            bins=15, alpha=0.7, color="skyblue", edgecolor="black"
        )
        plt.title("Age Distribution", fontsize=12, fontweight="bold")
        plt.xlabel("Age")
        plt.ylabel("Frequency")

        # 2. Gender Distribution
        plt.subplot(3, 4, 2)
        gender_counts = patients["gender"].value_counts()
        plt.pie(
            gender_counts.values,
            labels=gender_counts.index,
            autopct="%1.1f%%",
            colors=["lightcoral", "lightblue"],
        )
        plt.title("Gender Distribution", fontsize=12, fontweight="bold")

        # 3. Medical Conditions Frequency
        plt.subplot(3, 4, 3)
        all_conditions = [
            cond for conditions in patients["conditions"] for cond in conditions
        ]
        condition_counts = pd.Series(all_conditions).value_counts()
        condition_counts.plot(kind="bar", color="lightgreen", alpha=0.8)
        plt.title(
            "Medical Conditions Frequency", fontsize=12, fontweight="bold"
        )
        plt.xticks(rotation=45)
        plt.ylabel("Count")

        # 4. Length of Stay Distribution
        plt.subplot(3, 4, 4)
        patients["length_of_stay"].hist(
            bins=15, alpha=0.7, color="orange", edgecolor="black"
        )
        plt.title("Length of Stay Distribution", fontsize=12, fontweight="bold")
        plt.xlabel("Days")
        plt.ylabel("Frequency")

        # 5. Heart Rate Time Series for Sample Patients
        plt.subplot(3, 4, 5)
        sample_patients = patients["patient_id"].head(5)
        for pid in sample_patients:
            patient_data = timeseries[timeseries["patient_id"] == pid].copy()
            if len(patient_data) > 0:
                patient_data = patient_data.sort_values("timestamp")
                plt.plot(
                    patient_data["day_of_stay"],
                    patient_data["heart_rate"],
                    marker="o",
                    markersize=3,
                    alpha=0.7,
                    label=pid,
                )
        plt.title(
            "Heart Rate Over Time (Sample Patients)",
            fontsize=12,
            fontweight="bold",
        )
        plt.xlabel("Day of Stay")
        plt.ylabel("Heart Rate (bpm)")
        plt.legend(fontsize=8)

        # 6. Blood Pressure Correlation
        plt.subplot(3, 4, 6)
        clean_bp = timeseries.dropna(subset=["systolic_bp", "diastolic_bp"])
        plt.scatter(
            clean_bp["systolic_bp"],
            clean_bp["diastolic_bp"],
            alpha=0.5,
            s=10,
            color="red",
        )
        plt.title("Blood Pressure Correlation", fontsize=12, fontweight="bold")
        plt.xlabel("Systolic BP")
        plt.ylabel("Diastolic BP")

        # 7. Temperature vs Heart Rate
        plt.subplot(3, 4, 7)
        clean_temp_hr = timeseries.dropna(subset=["temperature", "heart_rate"])
        plt.scatter(
            clean_temp_hr["temperature"],
            clean_temp_hr["heart_rate"],
            alpha=0.5,
            s=10,
            color="purple",
        )
        plt.title("Temperature vs Heart Rate", fontsize=12, fontweight="bold")
        plt.xlabel("Temperature (°C)")
        plt.ylabel("Heart Rate (bpm)")

        # 8. Vital Signs Distribution
        plt.subplot(3, 4, 8)
        vital_cols = ["heart_rate", "respiratory_rate", "oxygen_saturation"]
        timeseries[vital_cols].boxplot()
        plt.title("Vital Signs Distribution", fontsize=12, fontweight="bold")
        plt.xticks(rotation=45)

        # 9. Lab Values Over Time
        # Explicit axes are used here: after twinx() the "current" axes is
        # the twin, so pyplot-level title/label calls would land on the
        # creatinine axis and overwrite its label.
        ax_glucose = plt.subplot(3, 4, 9)
        sample_patient = timeseries[
            timeseries["patient_id"] == sample_patients.iloc[0]
        ].copy()
        sample_patient = sample_patient.sort_values("timestamp")

        # Plot glucose if available
        glucose_data = sample_patient.dropna(subset=["glucose"])
        if len(glucose_data) > 0:
            ax_glucose.plot(
                glucose_data["day_of_stay"],
                glucose_data["glucose"],
                "o-",
                color="green",
                label="Glucose",
            )
        ax_glucose.set_title(
            f"Lab Values - {sample_patients.iloc[0]}",
            fontsize=12,
            fontweight="bold",
        )
        ax_glucose.set_xlabel("Day of Stay")
        ax_glucose.set_ylabel("Glucose (mg/dL)", color="green")

        # Plot creatinine on a secondary y-axis if available
        creat_data = sample_patient.dropna(subset=["creatinine"])
        if len(creat_data) > 0:
            ax_creatinine = ax_glucose.twinx()
            ax_creatinine.plot(
                creat_data["day_of_stay"],
                creat_data["creatinine"],
                "o-",
                color="blue",
                label="Creatinine",
            )
            ax_creatinine.set_ylabel("Creatinine (mg/dL)", color="blue")

        # 10. Readmission Risk by Age Group
        plt.subplot(3, 4, 10)
        merged_data = patients.merge(outcomes, on="patient_id")
        merged_data["age_group"] = pd.cut(
            merged_data["age"],
            bins=[0, 40, 60, 80, 100],
            labels=["<40", "40-60", "60-80", "80+"],
        )
        # observed=False keeps empty age groups on the axis and avoids the
        # pandas FutureWarning about the changing default.
        readmission_by_age = merged_data.groupby("age_group", observed=False)[
            "readmitted_30_days"
        ].mean()
        readmission_by_age.plot(kind="bar", color="salmon", alpha=0.8)
        plt.title(
            "30-Day Readmission Rate by Age", fontsize=12, fontweight="bold"
        )
        plt.ylabel("Readmission Rate")
        plt.xticks(rotation=0)

        # 11. Mortality Risk Distribution
        plt.subplot(3, 4, 11)
        outcomes["mortality_risk_score"].hist(
            bins=20, alpha=0.7, color="darkred", edgecolor="black"
        )
        plt.title(
            "Mortality Risk Score Distribution", fontsize=12, fontweight="bold"
        )
        plt.xlabel("Risk Score")
        plt.ylabel("Frequency")

        # 12. Missing Data Percentage
        plt.subplot(3, 4, 12)
        # Calculate missing data percentage for each column
        missing_data = timeseries.isnull().sum() / len(timeseries) * 100
        missing_data = missing_data[missing_data > 0].sort_values(
            ascending=False
        )

        if len(missing_data) > 0:
            missing_data.plot(kind="bar", color="gray", alpha=0.8)
            plt.title("Missing Data Percentage", fontsize=12, fontweight="bold")
            plt.ylabel("Missing %")
            plt.xticks(rotation=45)
        else:
            plt.text(
                0.5,
                0.5,
                "No Missing Data",
                ha="center",
                va="center",
                transform=plt.gca().transAxes,
                fontsize=14,
            )
            plt.title("Missing Data Percentage", fontsize=12, fontweight="bold")

        plt.tight_layout()
        plt.savefig(
            "healthcare_data_visualization.png", dpi=300, bbox_inches="tight"
        )
        print("Visualization saved as: healthcare_data_visualization.png")
        plt.show()
        # Create additional detailed plots
        self.create_detailed_plots(patients, timeseries)

    def create_detailed_plots(self, patients, timeseries):
        """Create additional detailed visualizations.

        Saves detailed_vitals_timeseries.png (four vitals for up to three
        patients with a stay of at least 7 days) and correlation_heatmap.png.
        """

        # Time Series Plot for Multiple Vital Signs
        _, axes = plt.subplots(2, 2, figsize=(16, 12))

        # Select patients with longer stay for better visualization
        long_stay_patients = patients[patients["length_of_stay"] >= 7][
            "patient_id"
        ].head(3)

        vital_signs = [
            "heart_rate",
            "systolic_bp",
            "temperature",
            "oxygen_saturation",
        ]

        for i, vital in enumerate(vital_signs):
            ax = axes[i // 2, i % 2]

            for pid in long_stay_patients:
                patient_data = timeseries[
                    timeseries["patient_id"] == pid
                ].copy()
                patient_data = patient_data.sort_values("timestamp")
                clean_data = patient_data.dropna(subset=[vital])

                if len(clean_data) > 0:
                    ax.plot(
                        clean_data["day_of_stay"],
                        clean_data[vital],
                        marker="o",
                        label=pid,
                        alpha=0.7,
                        linewidth=2,
                    )

            ax.set_title(
                f'{vital.replace("_", " ").title()} Over Time',
                fontweight="bold",
            )
            ax.set_xlabel("Day of Stay")
            ax.set_ylabel(vital.replace("_", " ").title())
            # Skip the legend when nothing was drawn (no long-stay patients).
            if ax.get_legend_handles_labels()[0]:
                ax.legend()
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(
            "detailed_vitals_timeseries.png", dpi=300, bbox_inches="tight"
        )
        print(
            "Detailed vital signs plot saved as: detailed_vitals_timeseries.png"
        )
        plt.show()

        # Correlation Heatmap
        plt.figure(figsize=(12, 10))
        numeric_cols = [
            "heart_rate",
            "systolic_bp",
            "diastolic_bp",
            "temperature",
            "respiratory_rate",
            "oxygen_saturation",
            "glucose",
            "creatinine",
            "hemoglobin",
            "white_blood_cells",
            "sodium",
            "potassium",
        ]

        correlation_matrix = timeseries[numeric_cols].corr()

        sns.heatmap(
            correlation_matrix,
            annot=True,
            cmap="coolwarm",
            center=0,
            square=True,
            fmt=".2f",
            cbar_kws={"shrink": 0.8},
        )
        plt.title(
            "Healthcare Parameters Correlation Matrix",
            fontsize=16,
            fontweight="bold",
        )
        plt.tight_layout()
        plt.savefig("correlation_heatmap.png", dpi=300, bbox_inches="tight")
        print("Correlation heatmap saved as: correlation_heatmap.png")
        plt.show()
def generate_synthetic_returns(
    n_days=252 * 10,  # ~10 years of daily data
    mu=0.0002,  # Daily drift
    kappa=0.05,  # Vol mean reversion
    theta=0.0001,  # Long-run variance
    sigma_v=0.01,  # Vol of vol
    rho=-0.7,  # Leverage effect
    lambda_jump=0.05,  # Jump intensity (per day)
    jump_size_dist="normal",  # "normal", "log_normal", or "exponential"
    sigma_jump=0.02,  # Jump size (scale parameter)
    noise_dist="normal",  # "normal" or "student_t"
    noise_scale=0.0005,  # Microstructure noise scale
    noise_df=3.0,  # Degrees of freedom for Student's t
    regime_params=None,  # Regime switching params
    random_seed=None,  # Reproducibility
):
    """
    Generate synthetic daily stock returns.

    The simulation combines:
    - Stochastic volatility (Heston-like, Euler discretization)
    - Jumps (Poisson-driven, with configurable distribution)
    - Regime switching (two-state Markov chain)
    - Leverage effect (return/variance shock correlation ``rho``)
    - Fat tails (via jumps and noise)
    - Microstructure noise (Gaussian or Student's t)

    Args:
        n_days: Number of simulated days (rows of the result).
        mu: Drift added to every simulated return.
        kappa: Mean-reversion speed of the variance process.
        theta: Long-run variance; also the initial variance ``v[0]``.
        sigma_v: Volatility of the variance process.
        rho: Correlation between the variance shock and the return shock.
        lambda_jump: Poisson intensity; a jump is added on days where the
            Poisson draw is positive.
        jump_size_dist: Jump size distribution ("normal", "log_normal",
            "exponential").
        sigma_jump: Scale parameter of the jump size distribution.
        noise_dist: Microstructure noise distribution ("normal",
            "student_t").
        noise_scale: Scale of the microstructure noise. Student's t draws
            are rescaled so their standard deviation equals this value.
        noise_df: Degrees of freedom for Student's t noise (if used).
            Must be greater than 2.
        regime_params: Dict with keys "transition_matrix" (2x2 row-stochastic
            matrix), "theta_high_multiplier" and "kappa_high_multiplier"
            (applied to ``theta`` / ``kappa`` in regime 1). Defaults to a
            calm/turbulent pair of regimes.
        random_seed: If not None, passed to ``np.random.seed``. Note that
            this reseeds NumPy's global random state.

    Returns:
        pd.DataFrame with columns "returns", "volatility" (square root of
        the simulated variance) and "regime" (0 or 1), indexed by daily
        dates starting at 1970-01-01. The first row always has regime 0,
        variance ``theta`` and a return made of noise only.

    Raises:
        ValueError: If ``jump_size_dist`` or ``noise_dist`` is not one of the
            supported names, or if ``noise_dist == "student_t"`` and
            ``noise_df`` is not greater than 2.
    """
    # Validate configuration before simulating. Otherwise a bad
    # jump_size_dist is only detected if a jump happens to occur, and a bad
    # noise_dist only after the whole simulation has already run.
    if jump_size_dist not in ("normal", "log_normal", "exponential"):
        raise ValueError(
            "Invalid jump_size_dist. Use 'normal', 'log_normal', or 'exponential'."
        )
    if noise_dist not in ("normal", "student_t"):
        raise ValueError("Invalid noise_dist. Use 'normal' or 'student_t'.")
    if noise_dist == "student_t" and not noise_df > 2:
        # The variance normalisation below divides by (noise_df - 2) and
        # takes a square root, so df <= 2 would give NaN or a division error.
        raise ValueError(
            "noise_df must be greater than 2 when noise_dist is 'student_t'."
        )

    if random_seed is not None:
        np.random.seed(random_seed)

    # Default regime switching (2 regimes: calm and turbulent)
    if regime_params is None:
        regime_params = {
            "transition_matrix": np.array([[0.99, 0.01], [0.03, 0.97]]),
            "theta_high_multiplier": 3.0,
            "kappa_high_multiplier": 2.0,
        }

    # Initialize
    v = np.zeros(n_days)  # Variance
    r = np.zeros(n_days)  # Returns
    regime = np.zeros(n_days, dtype=int)
    if n_days > 0:  # n_days == 0 yields an empty frame instead of IndexError
        v[0] = theta

    # Simulate regime switching (Markov chain); the chain starts in regime 0
    for t in range(1, n_days):
        regime[t] = np.random.choice(
            [0, 1], p=regime_params["transition_matrix"][regime[t - 1]]
        )

    # Simulate returns and volatility.
    # NOTE: the order of the random draws below is kept fixed so that a given
    # random_seed keeps producing the same path.
    for t in range(1, n_days):
        # Regime-dependent params
        if regime[t] == 1:  # High-vol regime
            theta_t = theta * regime_params["theta_high_multiplier"]
            kappa_t = kappa * regime_params["kappa_high_multiplier"]
        else:
            theta_t = theta
            kappa_t = kappa

        # Volatility process (Euler discretization); epsilon is the return
        # shock, correlated with the variance shock eta through rho
        eta = np.random.normal()
        epsilon = rho * eta + np.sqrt(1 - rho**2) * np.random.normal()
        dv = kappa_t * (theta_t - v[t - 1]) + sigma_v * np.sqrt(v[t - 1]) * eta
        v[t] = max(v[t - 1] + dv, 1e-6)  # Ensure positivity

        # Jumps (with configurable distribution)
        if np.random.poisson(lambda_jump) > 0:
            if jump_size_dist == "normal":
                J = np.random.normal(0, sigma_jump)
            elif jump_size_dist == "log_normal":
                # Log-normal (positive skew)
                J = np.exp(np.random.normal(0, sigma_jump)) - 1
            else:  # "exponential" (validated above), double-sided
                J = np.random.exponential(sigma_jump) * np.sign(
                    np.random.uniform(-1, 1)
                )
        else:
            J = 0

        # Returns use the previous day's variance
        r[t] = mu + np.sqrt(v[t - 1]) * epsilon + J

    # Microstructure noise (Gaussian or Student's t)
    if noise_dist == "normal":
        r += np.random.normal(0, noise_scale, n_days)
    else:  # "student_t" (validated above)
        # Dividing by the t distribution's standard deviation makes the
        # noise standard deviation equal to noise_scale
        r += (
            np.random.standard_t(noise_df, n_days)
            * noise_scale
            / np.sqrt(noise_df / (noise_df - 2))
        )

    # Create DataFrame
    df = pd.DataFrame(
        {"returns": r, "volatility": np.sqrt(v), "regime": regime},
        index=pd.date_range(start="1970-01-01", periods=n_days),
    )

    return df
Generates synthetic stock returns with:
- Stochastic volatility (Heston-like)
- Jumps (Poisson-driven, with configurable distribution)
- Regime switching (Markov)
- Leverage effect
- Fat tails (via jumps and noise)
- Microstructure noise (Gaussian or Student’s t)
Args: jump_size_dist: Jump size distribution ("normal", "log_normal", "exponential"). noise_dist: Microstructure noise distribution ("normal", "student_t"). noise_df: Degrees of freedom for Student’s t noise (if used).
def plot_synthetic_returns(
    df, title="Synthetic Stock Returns Analysis", figsize=(14, 10)
):
    """
    Plot synthetic stock returns on a 3x2 grid of panels:
    - Returns over time (moves beyond 3 standard deviations marked in red)
    - Volatility (sqrt variance), with turbulent-regime days shaded
    - Regime indicators
    - QQ plot of returns against the normal distribution
    - Histogram of returns with a fitted normal density, annotated with
      kurtosis and skewness
    - Autocorrelation of squared returns (40 lags)

    The figure is displayed with ``plt.show()``; nothing is returned.
    Calling this function also sets the seaborn style to "darkgrid" and
    ``plt.rcParams["figure.dpi"]`` to 100.

    Args:
        df (pd.DataFrame): Output from generate_synthetic_returns
                           Must have: 'returns', 'volatility', 'regime'
        title (str): Title for the plot
        figsize (tuple): Figure size
    """
    # Lazy imports, kept inside the function as in the original code
    from scipy import stats
    from statsmodels.graphics.tsaplots import plot_acf

    # Set style
    sns.set_style("darkgrid")
    plt.rcParams["figure.dpi"] = 100

    fig = plt.figure(figsize=figsize)
    gs = fig.add_gridspec(3, 2, height_ratios=[1, 1, 1], hspace=0.4, wspace=0.3)

    # -------------------------
    # 1. Returns Over Time
    # -------------------------
    ax1 = fig.add_subplot(gs[0, 0])
    ax1.plot(df.index, df["returns"], lw=0.8, color="tab:blue", alpha=0.9)
    ax1.set_title("Daily Returns")
    ax1.set_ylabel("Return")
    ax1.axhline(0, color="gray", linestyle="--", lw=0.8)

    # Highlight large moves: absolute return above 3 standard deviations
    threshold = df["returns"].std() * 3
    jumps = df[np.abs(df["returns"]) > threshold]
    if not jumps.empty:
        ax1.scatter(
            jumps.index,
            jumps["returns"],
            color="red",
            s=10,
            zorder=5,
            label="Large Moves",
        )
        ax1.legend()

    # -------------------------
    # 2. Volatility
    # -------------------------
    ax2 = fig.add_subplot(gs[0, 1])
    ax2.plot(df.index, df["volatility"], lw=1.2, color="tab:orange")
    ax2.set_title("Volatility (Latent)")
    ax2.set_ylabel("Volatility")

    # Shade turbulent regimes
    if "regime" in df.columns:
        turbulent = df["regime"] == 1
        if turbulent.any():
            # Pass the full index together with a `where` mask. Passing only
            # the turbulent rows as x would make the filled polygon bridge
            # across the calm periods between turbulent episodes.
            ax2.fill_between(
                df.index,
                df.loc[turbulent, "volatility"].min(),
                df["volatility"],
                where=turbulent,
                color="red",
                alpha=0.2,
                label="Turbulent Regime",
            )
            ax2.legend()

    # -------------------------
    # 3. Regime Plot
    # -------------------------
    ax3 = fig.add_subplot(gs[1, 0])
    ax3.fill_between(
        df.index,
        0,
        1,
        where=(df["regime"] == 0),
        interpolate=True,
        color="green",
        alpha=0.3,
        label="Calm Regime",
    )
    ax3.fill_between(
        df.index,
        0,
        1,
        where=(df["regime"] == 1),
        interpolate=True,
        color="red",
        alpha=0.3,
        label="Turbulent Regime",
    )
    ax3.set_ylim(0, 1)
    ax3.set_yticks([])
    ax3.set_title("Regime Switching (Hidden State)")
    ax3.legend(loc="upper right")

    # -------------------------
    # 4. Return Distribution (QQ plot)
    # -------------------------
    ax4 = fig.add_subplot(gs[1, 1])
    stats.probplot(df["returns"], dist="norm", plot=ax4)
    ax4.set_title("QQ Plot (Fat Tails Detection)")
    # Restyle the two lines drawn by probplot: line 0 gets smaller dot
    # markers, line 1 is drawn in red
    ax4.get_lines()[0].set_marker(".")
    ax4.get_lines()[0].set_markersize(4)
    ax4.get_lines()[1].set_color("red")
    ax4.get_lines()[1].set_linewidth(1.5)

    # -------------------------
    # 5. Histogram with Normal Fit
    # -------------------------
    ax5 = fig.add_subplot(gs[2, 0])
    mu_norm, std_norm = stats.norm.fit(df["returns"])
    sns.histplot(
        df["returns"],
        bins=50,
        kde=False,
        stat="density",
        ax=ax5,
        alpha=0.7,
        color="skyblue",
    )
    # Evaluate the fitted normal density over the histogram's x-range
    xmin, xmax = ax5.get_xlim()
    x = np.linspace(xmin, xmax, 100)
    p = stats.norm.pdf(x, mu_norm, std_norm)
    ax5.plot(x, p, "k--", linewidth=1.5, label="Normal Fit")
    ax5.set_title("Return Distribution")
    ax5.set_xlabel("Return")
    ax5.legend()

    # Annotate kurtosis and skew
    kurt = df["returns"].kurtosis()
    skew = df["returns"].skew()
    ax5.text(
        0.02,
        0.9,
        f"Kurtosis: {kurt:.2f}\nSkewness: {skew:.2f}",
        transform=ax5.transAxes,
        fontsize=10,
        bbox=dict(boxstyle="round", facecolor="wheat", alpha=0.8),
    )

    # -------------------------
    # 6. Autocorrelation (squared returns)
    # -------------------------
    ax6 = fig.add_subplot(gs[2, 1])
    plot_acf(
        df["returns"] ** 2,
        ax=ax6,
        lags=40,
        title="ACF of Squared Returns",
        alpha=0.05,
    )
    ax6.set_xlabel("Lag (Days)")
    ax6.set_ylabel("Autocorrelation")

    # Add title at top
    fig.suptitle(title, fontsize=14, fontweight="bold", y=0.98)

    # Adjust layout
    plt.tight_layout()
    plt.show()
Plot synthetic stock returns with multiple panels:
- Returns over time
- Volatility (sqrt variance)
- Regime indicators
- Distribution vs. normal (QQ plot and histogram)
- Autocorrelation of squared returns
Args: df (pd.DataFrame): Output from generate_synthetic_returns Must have: 'returns', 'volatility', 'regime' title (str): Title for the plot figsize (tuple): Figure size