mlsauce
```python
import sys
import logging
import os

from ._config import get_config, set_config, config_context

logger = logging.getLogger(__name__)


# PEP0440 compatible formatted version, see:
# https://www.python.org/dev/peps/pep-0440/
#
# Generic release markers:
#   X.Y
#   X.Y.Z   # For bugfix releases
#
# Admissible pre-release markers:
#   X.YaN   # Alpha release
#   X.YbN   # Beta release
#   X.YrcN  # Release Candidate
#   X.Y     # Final release
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
# __version__ = "0.10.0"


# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
# simultaneously. This can happen for instance when calling BLAS inside a
# prange. Setting the following environment variable allows multiple OpenMP
# libraries to be loaded. It should not degrade performance since we manually
# take care of potential over-subscription performance issues, in sections of
# the code where nested OpenMP loops can happen, by dynamically reconfiguring
# the inner OpenMP runtime to temporarily disable it while under the scope of
# the outer OpenMP parallel section.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")

# Workaround issue discovered in intel-openmp 2019.5:
# https://github.com/ContinuumIO/anaconda-issues/issues/11294
os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")

try:
    # This variable is injected in the __builtins__ by the build
    # process. It is used to enable importing subpackages of mlsauce when
    # the binaries are not built
    # mypy error: Cannot determine type of '__MLSAUCE_SETUP__'
    __MLSAUCE_SETUP__  # type: ignore
except NameError:
    __MLSAUCE_SETUP__ = False

if __MLSAUCE_SETUP__:
    sys.stderr.write("Partial import of mlsauce during the build process.\n")
    # We are not importing the rest of mlsauce during the build
    # process, as it may not be compiled yet
else:
    from .adaopt import AdaOpt
    from .booster import (
        LSBoostClassifier,
        LSBoostRegressor,
        GenericBoostingClassifier,
        GenericBoostingRegressor,
    )
    from .lazybooster import LazyBoostingClassifier, LazyBoostingRegressor
    from .multitaskregressor import MultiTaskRegressor
    from .datasets import download
    from .elasticnet import ElasticNetRegressor
    from .lasso import LassoRegressor
    from .ridge import RidgeRegressor
    from .stump import StumpClassifier

    # from .encoders import corrtarget_encoder

    __all__ = [
        "AdaOpt",
        "LSBoostClassifier",
        "GenericBoostingClassifier",
        "GenericBoostingRegressor",
        "StumpClassifier",
        "ElasticNetRegressor",
        "LassoRegressor",
        "LSBoostRegressor",
        "RidgeRegressor",
        "LazyBoostingClassifier",
        "LazyBoostingRegressor",
        "MultiTaskRegressor",
        # Other imports
        # "corrtarget_encoder",
        "download",
        # Non-modules:
        "get_config",
        "set_config",
        "config_context",
    ]


def setup_module(module):
    """Fixture for the tests to assure globally controllable seeding of RNGs"""
    import os
    import numpy as np
    import random

    # Check if a random seed exists in the environment, if not create one.
    _random_seed = os.environ.get("MLSAUCE_SEED", None)
    if _random_seed is None:
        _random_seed = np.random.uniform() * np.iinfo(np.int32).max
    _random_seed = int(_random_seed)
    print("I: Seeding RNGs with %r" % _random_seed)
    np.random.seed(_random_seed)
    random.seed(_random_seed)
```
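The `__init__` above only wires up exports and OpenMP workarounds. A minimal sanity check after installation could look like the sketch below; the names come from `__all__` above, while the exact return value of `get_config()` is an assumption (a dict, scikit-learn style).

```python
# Minimal sketch, not from the original docs: inspect the public API of mlsauce.
import mlsauce as ms

print(ms.get_config())                   # current global configuration (assumed dict)
clf = ms.AdaOpt()                        # estimators are exposed at the top level
booster = ms.GenericBoostingClassifier()
print(type(clf).__name__, type(booster).__name__)
```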
```python
class AdaOpt(BaseEstimator, ClassifierMixin):
    """AdaOpt classifier."""

    def __init__(
        self,
        n_iterations=50,
        learning_rate=0.3,
        reg_lambda=0.1,
        reg_alpha=0.5,
        eta=0.01,
        gamma=0.01,
        k=3,
        tolerance=0,
        n_clusters=0,
        batch_size=100,
        row_sample=0.8,
        type_dist="euclidean-f",
        n_jobs=None,
        verbose=0,
        cache=True,
        n_clusters_input=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        seed=123,
    ):
        if n_clusters_input > 0:
            assert clustering_method in (
                "kmeans",
                "gmm",
            ), "`clustering_method` must be in ('kmeans', 'gmm')"
            assert cluster_scaling in (
                "standard",
                "robust",
                "minmax",
            ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')"

        assert type_dist in (
            "euclidean",
            "manhattan",
            "euclidean-f",
            "cosine",
        ), "must have: `type_dist` in ('euclidean', 'manhattan', 'euclidean-f', 'cosine')"

        self.n_iterations = n_iterations
        self.learning_rate = learning_rate
        self.reg_lambda = reg_lambda
        self.reg_alpha = reg_alpha
        self.eta = eta
        self.gamma = gamma
        self.k = k
        self.tolerance = tolerance
        self.n_clusters = n_clusters
        self.batch_size = batch_size
        self.row_sample = row_sample
        self.type_dist = type_dist
        self.n_jobs = n_jobs
        self.cache = cache
        self.verbose = verbose
        self.n_clusters_input = n_clusters_input
        self.clustering_method = clustering_method
        self.cluster_scaling = cluster_scaling
        self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None
        self.seed = seed
```
AdaOpt classifier.
Attributes:
n_iterations: int
number of iterations of the optimizer at training time.
learning_rate: float
controls the speed of the optimizer at training time.
reg_lambda: float
L2 regularization parameter for successive errors in the optimizer
(at training time).
reg_alpha: float
L1 regularization parameter for successive errors in the optimizer
(at training time).
eta: float
controls the slope in gradient descent (at training time).
gamma: float
controls the step size in gradient descent (at training time).
k: int
number of nearest neighbors selected at test time for classification.
tolerance: float
controls early stopping in gradient descent (at training time).
n_clusters: int
number of clusters, if MiniBatch k-means is used at test time
(for faster prediction).
batch_size: int
size of the batch, if MiniBatch k-means is used at test time
(for faster prediction).
row_sample: float
percentage of rows chosen from training set (by stratified subsampling,
for faster prediction).
type_dist: str
distance used for finding the nearest neighbors; currently `euclidean-f`
(Euclidean distances computed on the whole matrix at once), `euclidean` (Euclidean
distances computed row by row), `cosine` (cosine distance).
n_jobs: int
number of CPUs for parallel processing (default: None)
verbose: int
progress bar for parallel processing (yes = 1) or not (no = 0)
cache: boolean
whether the nearest neighbors are cached, for faster retrieval in
subsequent calls.
n_clusters_input: int
number of clusters (a priori) for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
seed: int
reproducibility seed for subsampling and clustering.
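Since `AdaOpt` follows the scikit-learn estimator API (`fit`/`predict`/`predict_proba`/`score`), a quick start might look like the sketch below; the dataset and hyperparameter values are illustrative, not taken from the original documentation.

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import mlsauce as ms

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

clf = ms.AdaOpt(n_iterations=50, k=3, type_dist="euclidean-f", seed=123)
clf.fit(X_train, y_train)
print(clf.predict(X_test)[:10])      # predicted class labels
print(clf.score(X_test, y_test))     # accuracy, via sklearn's ClassifierMixin
```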
```python
    def fit(self, X, y, **kwargs):
        """Fit AdaOpt to training data (X, y)."""

        if self.n_clusters_input > 0:
            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
                cluster(
                    X,
                    n_clusters=self.n_clusters_input,
                    method=self.clustering_method,
                    type_scaling=self.cluster_scaling,
                    training=True,
                    seed=self.seed,
                )
            )
            X = np.column_stack((X.copy(), clustered_X))

        if self.row_sample < 1:
            index_subsample = subsample(
                y, row_sample=self.row_sample, seed=self.seed
            )
            y_ = y[index_subsample]
            X_ = X[index_subsample, :]
        else:
            y_ = pickle.loads(pickle.dumps(y, -1))
            X_ = pickle.loads(pickle.dumps(X, -1))

        n, p = X_.shape

        n_classes = len(np.unique(y_))

        assert n == len(y_), "must have X.shape[0] == len(y)"

        res = adaoptc.fit_adaopt(
            X=np.asarray(X_).astype(np.float64),
            y=np.asarray(y_).astype(np.int64),
            n_iterations=self.n_iterations,
            n_X=n,
            p_X=p,
            n_classes=n_classes,
            learning_rate=self.learning_rate,
            reg_lambda=self.reg_lambda,
            reg_alpha=self.reg_alpha,
            eta=self.eta,
            gamma=self.gamma,
            tolerance=self.tolerance,
        )

        self.probs_training = res["probs"]
        self.training_accuracy = res["training_accuracy"]
        self.alphas = res["alphas"]
        self.n_iterations = res["n_iterations"]
        self.scaled_X_train = np.array(res["scaled_X_train"], dtype=np.float64)
        self.n_classes_ = len(np.unique(y))  # for compatibility with sklearn
        return self
```
Fit AdaOpt to training data (X, y)
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
**kwargs: additional parameters to be passed to self.cook_training_set.
Returns:
self: object.
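A short sketch of the training-time options described above (stratified row subsampling and input clustering); the values are illustrative assumptions, not documented defaults.

```python
from sklearn.datasets import load_wine
import mlsauce as ms

X, y = load_wine(return_X_y=True)

clf = ms.AdaOpt(
    row_sample=0.8,              # stratified subsampling of the training rows
    n_clusters_input=2,          # cluster the features before fitting
    clustering_method="kmeans",
    cluster_scaling="standard",
    seed=123,
)
clf.fit(X, y)
print(clf.training_accuracy)     # stored by fit(), see the source above
```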
```python
    def predict(self, X, **kwargs):
        """Predict test data X."""
        return np.argmax(self.predict_proba(X, **kwargs), axis=1)
```
Predict test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test samples, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to `predict_proba`
Returns:
model predictions: {array-like}
```python
    def predict_proba(self, X, **kwargs):
        """Predict probabilities for test data X."""

        n_train, p_train = self.scaled_X_train.shape

        if self.n_clusters_input > 0:
            X = np.column_stack(
                (
                    X.copy(),
                    cluster(
                        X,
                        training=False,
                        scaler=self.scaler_,
                        label_encoder=self.label_encoder_,
                        clusterer=self.clusterer_,
                        seed=self.seed,
                    ),
                )
            )

        n_test = X.shape[0]

        if self.n_jobs is None:
            return adaoptc.predict_proba_adaopt(
                X_test=np.asarray(X, order="C").astype(np.float64),
                scaled_X_train=np.asarray(
                    self.scaled_X_train, order="C"
                ).astype(np.float64),
                n_test=n_test,
                n_train=n_train,
                probs_train=self.probs_training,
                k=self.k,
                n_clusters=self.n_clusters,
                batch_size=self.batch_size,
                type_dist=self.type_dist,
                cache=self.cache,
                seed=self.seed,
            )

        # parallel: self.n_jobs is not None
        assert self.type_dist in (
            "euclidean",
            "manhattan",
            "cosine",
        ), "must have: `self.type_dist` in ('euclidean', 'manhattan', 'cosine')"

        scaled_X_test = X / norm(X, ord=2, axis=1)[:, None]

        # the three distance types share the same per-row logic;
        # only the distance kernel differs
        dist_func = {
            "euclidean": adaoptc.distance_to_mat_euclidean2,
            "manhattan": adaoptc.distance_to_mat_manhattan2,
            "cosine": adaoptc.distance_to_mat_cosine2,
        }[self.type_dist]

        @delayed
        @wrap_non_picklable_objects
        def multiproc_func(i):
            dists_test_i = dist_func(
                np.asarray(scaled_X_test.astype(np.float64), order="C")[i, :],
                np.asarray(self.scaled_X_train.astype(np.float64), order="C"),
                np.zeros(n_train),
                n_train,
                p_train,
            )

            kmin_test_i = adaoptc.find_kmin_x(
                dists_test_i, n_x=n_train, k=self.k, cache=self.cache
            )

            weights_test_i = adaoptc.calculate_weights(kmin_test_i[0])

            probs_test_i = adaoptc.calculate_probs(
                kmin_test_i[1], self.probs_training
            )

            return adaoptc.average_probs(
                probs=probs_test_i, weights=weights_test_i
            )

        if self.verbose == 1:
            res = Parallel(n_jobs=self.n_jobs, prefer="threads")(
                multiproc_func(m) for m in tqdm(range(n_test))
            )
        else:
            res = Parallel(n_jobs=self.n_jobs, prefer="threads")(
                multiproc_func(m) for m in range(n_test)
            )

        return np.asarray(res)
```
Predict probabilities for test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test samples, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to
self.cook_test_set
Returns:
probability estimates for test data: {array-like}
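When `n_jobs` is set, `predict_proba` dispatches one job per test row and, as asserted in the source above, `type_dist` must then be 'euclidean', 'manhattan' or 'cosine' (the vectorized 'euclidean-f' shortcut is for the sequential path only). A hedged sketch:

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import mlsauce as ms

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

clf = ms.AdaOpt(k=5, type_dist="euclidean", n_jobs=2, verbose=1)
clf.fit(X_train, y_train)
probs = clf.predict_proba(X_test)   # array of shape (n_test, n_classes)
print(probs[:3])
```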
```python
class LSBoostClassifier(BaseEstimator, ClassifierMixin):
    """LSBoost classifier."""

    def __init__(
        self,
        n_estimators=100,
        learning_rate=0.1,
        n_hidden_features=5,
        reg_lambda=0.1,
        alpha=0.5,
        row_sample=1,
        col_sample=1,
        dropout=0,
        tolerance=1e-4,
        direct_link=1,
        verbose=1,
        seed=123,
        backend="cpu",
        solver="ridge",
        activation="relu",
        n_clusters=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        degree=None,
        weights_distr="uniform",
        base_model=None,
        hist=False,
        bins="auto",
    ):
        self.base_model = base_model
        self.hist = hist
        self.bins = bins
        self.hist_bins_ = None

        if n_clusters > 0:
            assert clustering_method in (
                "kmeans",
                "gmm",
            ), "`clustering_method` must be in ('kmeans', 'gmm')"
            assert cluster_scaling in (
                "standard",
                "robust",
                "minmax",
            ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')"

        assert backend in (
            "cpu",
            "gpu",
            "tpu",
        ), "`backend` must be in ('cpu', 'gpu', 'tpu')"

        assert solver in (
            "ridge",
            "lasso",
            "enet",
        ), "`solver` must be in ('ridge', 'lasso', 'enet')"

        sys_platform = platform.system()

        if (sys_platform == "Windows") and (backend in ("gpu", "tpu")):
            warnings.warn(
                "No GPU/TPU computing on Windows yet, backend set to 'cpu'"
            )
            backend = "cpu"

        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.n_hidden_features = n_hidden_features
        self.reg_lambda = reg_lambda
        assert alpha >= 0 and alpha <= 1, "`alpha` must be in [0, 1]"
        self.alpha = alpha
        self.row_sample = row_sample
        self.col_sample = col_sample
        self.dropout = dropout
        self.tolerance = tolerance
        self.direct_link = direct_link
        self.verbose = verbose
        self.seed = seed
        self.backend = backend
        self.obj = None
        self.solver = solver
        self.activation = activation
        self.n_clusters = n_clusters
        self.clustering_method = clustering_method
        self.cluster_scaling = cluster_scaling
        self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None
        self.degree = degree
        self.poly_ = None
        self.weights_distr = weights_distr
        if self.backend in ("gpu", "tpu"):
            check_and_install("jax")
            check_and_install("jaxlib")

    def update(self, X, y, eta=0.9):
        """Update model with new data.

        Args:

            X: {array-like}, shape = [n_samples=1, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: float = [n_samples=1]
                Target value.

            eta: float
                Inverse power applied to number of observations
                (defines a learning rate).

        Returns:

            self: object.
        """

        if isinstance(X, pd.DataFrame):
            X = X.values

        if self.degree is not None:
            X = self.poly_.transform(X)

        if self.n_clusters > 0:
            X = np.column_stack(
                (
                    X,
                    cluster(
                        X,
                        training=False,
                        scaler=self.scaler_,
                        label_encoder=self.label_encoder_,
                        clusterer=self.clusterer_,
                        seed=self.seed,
                    ),
                )
            )

        self.obj = boosterc.update_booster(
            self.obj,
            np.asarray(X, order="C"),
            np.asarray(y, order="C").ravel(),
            eta,
        )

        return self
```
LSBoost classifier.
Attributes:
n_estimators: int
number of boosting iterations.
learning_rate: float
controls the learning speed at training time.
n_hidden_features: int
number of nodes in successive hidden layers.
reg_lambda: float
L2 regularization parameter for successive errors in the optimizer
(at training time).
alpha: float
compromise between L1 and L2 regularization (must be in [0, 1]),
for `solver` == 'enet'.
row_sample: float
percentage of rows chosen from the training set.
col_sample: float
percentage of columns chosen from the training set.
dropout: float
percentage of hidden-layer nodes dropped out at training time.
tolerance: float
controls early stopping in gradient descent (at training time).
direct_link: bool
indicates whether the original features are included (True) in the model's
fitting or not (False).
verbose: int
whether to show a progress bar (1) or not (0).
seed: int
reproducibility seed for the hidden-layer weights, clustering and dropout.
backend: str
type of backend; must be in ('cpu', 'gpu', 'tpu')
solver: str
type of 'weak' learner; currently in ('ridge', 'lasso', 'enet').
'enet' is a combination of 'ridge' and 'lasso' called Elastic Net.
activation: str
activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
n_clusters: int
number of clusters for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
degree: int
degree of features interactions to include in the model
weights_distr: str
distribution of weights for constructing the model's hidden layer;
currently 'uniform', 'gaussian'
hist: bool
indicates whether histogram features are used or not (default is False)
bins: int or str
number of bins for histogram features (same as numpy.histogram, default is 'auto')
Examples:
```python
import numpy as np
from sklearn.datasets import load_digits, load_breast_cancer, load_wine, load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from time import time
from os import chdir
from sklearn import metrics
import os
import mlsauce as ms
print("\n")
print("GenericBoosting Decision tree -----")
print("\n")
print("\n")
print("breast_cancer data -----")
# data 1
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target
# split data into training set and test set
np.random.seed(15029)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2)
clf = DecisionTreeRegressor()
clf2 = KernelRidge()
obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2, n_clusters=2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 2
print("\n")
print("wine data -----")
wine = load_wine()
Z = wine.data
t = wine.target
np.random.seed(879423)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf, n_clusters=3)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 3
print("\n")
print("iris data -----")
iris = load_iris()
Z = iris.data
t = iris.target
np.random.seed(734563)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
print("\n")
print("GenericBoosting KRR -----")
print("\n")
obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2, n_clusters=2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 2
print("\n")
print("wine data -----")
wine = load_wine()
Z = wine.data
t = wine.target
np.random.seed(879423)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf2, n_clusters=3)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 3
print("\n")
print("iris data -----")
iris = load_iris()
Z = iris.data
t = iris.target
np.random.seed(734563)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
```
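The example above exercises `GenericBoostingClassifier`; a minimal sketch using `LSBoostClassifier` itself follows (illustrative settings, not from the original docs).

```python
import mlsauce as ms
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15029
)

obj = ms.LSBoostClassifier(n_estimators=100, learning_rate=0.1, solver="ridge")
obj.fit(X_train, y_train)
print(obj.score(X_test, y_test))
```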
```python
    def fit(self, X, y, **kwargs):
        """Fit Booster (classifier) to training data (X, y)."""

        if isinstance(X, pd.DataFrame):
            X = X.values

        if self.hist:
            X, self.hist_bins_ = get_histo_features(X)

        if isinstance(y, pd.Series):
            y = y.values.ravel()
        else:
            y = y.ravel()

        if self.degree is not None:
            assert isinstance(self.degree, int), "`degree` must be an integer"
            self.poly_ = PolynomialFeatures(
                degree=self.degree, interaction_only=True, include_bias=False
            )
            X = self.poly_.fit_transform(X)

        if self.n_clusters > 0:
            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
                cluster(
                    X,
                    n_clusters=self.n_clusters,
                    method=self.clustering_method,
                    type_scaling=self.cluster_scaling,
                    training=True,
                    seed=self.seed,
                )
            )
            X = np.column_stack((X, clustered_X))

        self.obj = boosterc.fit_booster_classifier(
            np.asarray(X, order="C"),
            np.asarray(y, order="C"),
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            n_hidden_features=self.n_hidden_features,
            reg_lambda=self.reg_lambda,
            alpha=self.alpha,
            row_sample=self.row_sample,
            col_sample=self.col_sample,
            dropout=self.dropout,
            tolerance=self.tolerance,
            direct_link=self.direct_link,
            verbose=self.verbose,
            seed=self.seed,
            backend=self.backend,
            solver=self.solver,
            activation=self.activation,
            obj=self.base_model,
        )

        self.classes_ = np.unique(y)  # for compatibility with sklearn
        self.n_classes_ = len(self.classes_)  # for compatibility with sklearn
        self.n_estimators = self.obj["n_estimators"]
        return self
```
Fit Booster (classifier) to training data (X, y)
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
**kwargs: additional parameters to be passed to self.cook_training_set.
Returns:
self: object.
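As the source above shows, `fit` also accepts pandas inputs and can derive histogram features (`hist=True`, `bins` as in `numpy.histogram`). A sketch under those assumptions:

```python
import mlsauce as ms
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True, as_frame=True)  # DataFrame / Series

obj = ms.LSBoostClassifier(n_estimators=50, hist=True, bins="auto")
obj.fit(X, y)
print(obj.score(X, y))   # in-sample accuracy, for illustration only
```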
```python
    def predict(self, X, **kwargs):
        """Predict test data X."""
        return np.argmax(self.predict_proba(X, **kwargs), axis=1)
```
Predict test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test samples, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to `predict_proba`
Returns:
model predictions: {array-like}
```python
    def predict_proba(self, X, **kwargs):
        """Predict probabilities for test data X."""

        if isinstance(X, pd.DataFrame):
            X = X.values

        if self.hist:
            X = get_histo_features(X, bins=self.hist_bins_)

        if self.degree is not None:
            X = self.poly_.transform(X)

        if self.n_clusters > 0:
            X = np.column_stack(
                (
                    X,
                    cluster(
                        X,
                        training=False,
                        scaler=self.scaler_,
                        label_encoder=self.label_encoder_,
                        clusterer=self.clusterer_,
                        seed=self.seed,
                    ),
                )
            )
        try:
            return boosterc.predict_proba_booster_classifier(
                self.obj, np.asarray(X, order="C")
            )
        except ValueError:
            pass
```
Predict probabilities for test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test samples, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to
self.cook_test_set
Returns:
probability estimates for test data: {array-like}
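The class also exposes an `update(X, y, eta=0.9)` method (see the class source above) for refitting the booster with one new observation at a time; `eta` is an inverse power applied to the number of observations and acts as a decaying learning rate. The sketch below passes each new row as a 1-D slice, which is an assumption about the expected shape.

```python
import mlsauce as ms
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_new, y_train, y_new = train_test_split(X, y, test_size=0.1, random_state=0)

obj = ms.LSBoostClassifier(n_estimators=50)
obj.fit(X_train, y_train)

# feed held-out observations one at a time
for i in range(X_new.shape[0]):
    obj = obj.update(X_new[i, :], y_new[i], eta=0.9)
```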
```python
class GenericBoostingClassifier(LSBoostClassifier):
    """Generic Boosting classifier (using any regressor as base learner)."""

    def __init__(
        self,
        base_model=ExtraTreeRegressor(),
        n_estimators=100,
        learning_rate=0.1,
        n_hidden_features=5,
        reg_lambda=0.1,
        alpha=0.5,
        row_sample=1,
        col_sample=1,
        dropout=0,
        tolerance=1e-4,
        direct_link=1,
        verbose=1,
        seed=123,
        backend="cpu",
        solver="ridge",
        activation="relu",
        n_clusters=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        degree=None,
        weights_distr="uniform",
        hist=False,
        bins="auto",
    ):
        self.base_model = base_model
        self.hist = hist
        self.bins = bins
        self.hist_bins_ = None

        super().__init__(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            n_hidden_features=n_hidden_features,
            reg_lambda=reg_lambda,
            alpha=alpha,
            row_sample=row_sample,
            col_sample=col_sample,
            dropout=dropout,
            tolerance=tolerance,
            direct_link=direct_link,
            verbose=verbose,
            seed=seed,
            backend=backend,
            solver=solver,
            activation=activation,
            n_clusters=n_clusters,
            clustering_method=clustering_method,
            cluster_scaling=cluster_scaling,
            degree=degree,
            weights_distr=weights_distr,
            base_model=self.base_model,
        )
```
Generic Boosting classifier (using any regressor as base learner).
Attributes:
base_model: object
base learner (default is ExtraTreeRegressor) to be boosted.
n_estimators: int
number of boosting iterations.
learning_rate: float
controls the learning speed at training time.
n_hidden_features: int
number of nodes in successive hidden layers.
reg_lambda: float
L2 regularization parameter for successive errors in the optimizer
(at training time).
alpha: float
compromise between L1 and L2 regularization (must be in [0, 1]),
for `solver` == 'enet'.
row_sample: float
percentage of rows chosen from the training set.
col_sample: float
percentage of columns chosen from the training set.
dropout: float
percentage of hidden-layer nodes dropped out at training time.
tolerance: float
controls early stopping in gradient descent (at training time).
direct_link: bool
indicates whether the original features are included (True) in the model's
fitting or not (False).
verbose: int
whether to show a progress bar (1) or not (0).
seed: int
reproducibility seed for the hidden-layer weights, clustering and dropout.
backend: str
type of backend; must be in ('cpu', 'gpu', 'tpu')
solver: str
type of 'weak' learner; currently in ('ridge', 'lasso', 'enet').
'enet' is a combination of 'ridge' and 'lasso' called Elastic Net.
activation: str
activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
n_clusters: int
number of clusters for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
degree: int
degree of features interactions to include in the model
weights_distr: str
distribution of weights for constructing the model's hidden layer;
currently 'uniform', 'gaussian'
hist: bool
indicates whether histogram features are used or not (default is False)
bins: int or str
number of bins for histogram features (same as numpy.histogram, default is 'auto')
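A sketch with a non-default base learner; any scikit-learn regressor should work in place of the default `ExtraTreeRegressor` (here `KernelRidge`, mirroring the LSBoost example above; the other settings are illustrative).

```python
import mlsauce as ms
from sklearn.kernel_ridge import KernelRidge
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=879423
)

clf = ms.GenericBoostingClassifier(KernelRidge(), n_estimators=50, learning_rate=0.1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print(clf.obj["loss"])   # per-iteration training losses, as in the example above
```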
```python
class GenericBoostingRegressor(LSBoostRegressor):
    """Generic Boosting regressor."""

    def __init__(
        self,
        base_model=ExtraTreeRegressor(),
        n_estimators=100,
        learning_rate=0.1,
        n_hidden_features=5,
        reg_lambda=0.1,
        alpha=0.5,
        row_sample=1,
        col_sample=1,
        dropout=0,
        tolerance=1e-4,
        direct_link=1,
        verbose=1,
        seed=123,
        backend="cpu",
        solver="ridge",
        activation="relu",
        type_pi=None,
        replications=None,
        kernel=None,
        n_clusters=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        degree=None,
        weights_distr="uniform",
        hist=False,
        bins="auto",
    ):
        self.base_model = base_model
        self.hist = hist
        self.bins = bins
        self.hist_bins_ = None

        super().__init__(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            n_hidden_features=n_hidden_features,
            reg_lambda=reg_lambda,
            alpha=alpha,
            row_sample=row_sample,
            col_sample=col_sample,
            dropout=dropout,
            tolerance=tolerance,
            direct_link=direct_link,
            verbose=verbose,
            seed=seed,
            backend=backend,
            solver=solver,
            activation=activation,
            type_pi=type_pi,
            replications=replications,
            kernel=kernel,
            n_clusters=n_clusters,
            clustering_method=clustering_method,
            cluster_scaling=cluster_scaling,
            degree=degree,
            weights_distr=weights_distr,
            base_model=self.base_model,
        )
```
Generic Boosting regressor.
Attributes:
base_model: object
base learner (default is ExtraTreeRegressor) to be boosted.
n_estimators: int
number of boosting iterations.
learning_rate: float
controls the learning speed at training time.
n_hidden_features: int
number of nodes in successive hidden layers.
reg_lambda: float
L2 regularization parameter for successive errors in the optimizer
(at training time).
alpha: float
compromise between L1 and L2 regularization (must be in [0, 1]),
for `solver` == 'enet'
row_sample: float
percentage of rows chosen from the training set.
col_sample: float
percentage of columns chosen from the training set.
dropout: float
percentage of hidden-layer nodes dropped out at training time.
tolerance: float
controls early stopping in gradient descent (at training time).
direct_link: bool
indicates whether the original features are included (True) in the model's
fitting or not (False).
verbose: int
whether to show a progress bar (1) or not (0).
seed: int
reproducibility seed for the hidden-layer weights, clustering and dropout.
backend: str
type of backend; must be in ('cpu', 'gpu', 'tpu')
solver: str
type of 'weak' learner; currently in ('ridge', 'lasso')
activation: str
activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
type_pi: str.
type of prediction interval; currently "kde" (default) or "bootstrap".
Used only in `self.predict`, for `self.replications` > 0 and `self.kernel`
in ('gaussian', 'tophat'). Default is `None`.
replications: int.
number of replications (if needed) for predictive simulation.
Used only in `self.predict`, for `self.kernel` in ('gaussian',
'tophat') and `self.type_pi = 'kde'`. Default is `None`.
n_clusters: int
number of clusters for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
degree: int
degree of features interactions to include in the model
weights_distr: str
distribution of weights for constructing the model's hidden layer;
either 'uniform' or 'gaussian'
hist: bool
whether to use histogram features or not
bins: int or str
number of bins for histogram features (same as numpy.histogram, default is 'auto')
12class StumpClassifier(BaseEstimator, ClassifierMixin): 13 """Stump classifier. 14 15 Attributes: 16 17 bins: int 18 Number of histogram bins; as in numpy.histogram. 19 """ 20 21 def __init__(self, bins="auto"): 22 self.bins = bins 23 self.obj = None 24 25 def fit(self, X, y, sample_weight=None, **kwargs): 26 """Fit Stump to training data (X, y) 27 28 Args: 29 30 X: {array-like}, shape = [n_samples, n_features] 31 Training vectors, where n_samples is the number 32 of samples and n_features is the number of features. 33 34 y: array-like, shape = [n_samples] 35 Target values. 36 37 sample_weight: array_like, shape = [n_samples] 38 Observations weights. 39 40 Returns: 41 42 self: object. 43 """ 44 45 if sample_weight is None: 46 self.obj = stumpc.fit_stump_classifier( 47 X=np.asarray(X, order="C"), 48 y=np.asarray(y, order="C"), 49 bins=self.bins, 50 ) 51 52 return self 53 54 self.obj = stumpc.fit_stump_classifier( 55 X=np.asarray(X, order="C"), 56 y=np.asarray(y, order="C"), 57 sample_weight=np.ravel(sample_weight, order="C"), 58 bins=self.bins, 59 ) 60 self.n_classes_ = len(np.unique(y)) # for compatibility with sklearn 61 return self 62 63 def predict(self, X, **kwargs): 64 """Predict test data X. 65 66 Args: 67 68 X: {array-like}, shape = [n_samples, n_features] 69 Training vectors, where n_samples is the number 70 of samples and n_features is the number of features. 71 72 **kwargs: additional parameters to be passed to `predict_proba` 73 74 75 Returns: 76 77 model predictions: {array-like} 78 """ 79 80 return np.argmax(self.predict_proba(X, **kwargs), axis=1) 81 82 def predict_proba(self, X, **kwargs): 83 """Predict probabilities for test data X. 84 85 Args: 86 87 X: {array-like}, shape = [n_samples, n_features] 88 Training vectors, where n_samples is the number 89 of samples and n_features is the number of features. 90 91 **kwargs: additional parameters to be passed to 92 self.cook_test_set 93 94 Returns: 95 96 probability estimates for test data: {array-like} 97 """ 98 99 return stumpc.predict_proba_stump_classifier( 100 self.obj, np.asarray(X, order="C") 101 )
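As a quick illustration of the `fit`/`predict_proba`/`predict` interface shown above, here is a minimal sketch on a binary classification dataset. It assumes `mlsauce` and `scikit-learn` are installed; the uniform `sample_weight` vector is only there to show the argument being passed through.

```python
# Minimal sketch (assumes mlsauce and scikit-learn are installed).
import numpy as np
import mlsauce as ms
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

clf = ms.StumpClassifier(bins="auto")

# Optional per-observation weights, as accepted by fit's sample_weight argument.
weights = np.ones(len(y_train))
clf.fit(X_train, y_train, sample_weight=weights)

proba = clf.predict_proba(X_test)   # class probability estimates
labels = clf.predict(X_test)        # argmax of the probabilities
print(proba.shape, labels[:5])
```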
19class ElasticNetRegressor(BaseEstimator, RegressorMixin): 20 """Elasticnet. 21 22 Attributes: 23 24 reg_lambda: float 25 regularization parameter. 26 27 alpha: float 28 compromise between L1 and L2 regularization (must be in [0, 1]), 29 for `solver` == 'enet'. 30 31 backend: str 32 type of backend; must be in ('cpu', 'gpu', 'tpu') 33 34 """ 35 36 def __init__(self, reg_lambda=0.1, alpha=0.5, backend="cpu"): 37 assert backend in ( 38 "cpu", 39 "gpu", 40 "tpu", 41 ), "`backend` must be in ('cpu', 'gpu', 'tpu')" 42 43 sys_platform = platform.system() 44 45 if (sys_platform == "Windows") and (backend in ("gpu", "tpu")): 46 warnings.warn( 47 "No GPU/TPU computing on Windows yet, backend set to 'cpu'" 48 ) 49 backend = "cpu" 50 51 self.reg_lambda = reg_lambda 52 self.alpha = alpha 53 self.backend = backend 54 if self.backend in ("gpu", "tpu"): 55 check_and_install("jax") 56 check_and_install("jaxlib") 57 58 def fit(self, X, y, **kwargs): 59 """Fit matrixops (classifier) to training data (X, y) 60 61 Args: 62 63 X: {array-like}, shape = [n_samples, n_features] 64 Training vectors, where n_samples is the number 65 of samples and n_features is the number of features. 66 67 y: array-like, shape = [n_samples] 68 Target values. 69 70 **kwargs: additional parameters to be passed to self.cook_training_set. 71 72 Returns: 73 74 self: object. 75 76 """ 77 fit_result = fit_elasticnet(X, y, lam=self.reg_lambda, alpha=self.alpha) 78 self.coef_ = fit_result.coef_ 79 self.y_train_mean = fit_result.y_train_mean 80 self.scaler = fit_result.scaler 81 self.converged = fit_result.converged 82 return self 83 84 def predict(self, X, **kwargs): 85 """Predict test data X. 86 87 Args: 88 89 X: {array-like}, shape = [n_samples, n_features] 90 Training vectors, where n_samples is the number 91 of samples and n_features is the number of features. 92 93 **kwargs: additional parameters to be passed to `predict_proba` 94 95 Returns: 96 97 model predictions: {array-like} 98 99 """ 100 return predict_elasticnet(X, self)
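A minimal usage sketch of `ElasticNetRegressor`, following the constructor and `fit`/`predict` above; it assumes `mlsauce` and `scikit-learn` are installed, and the dataset and parameter values are illustrative only.

```python
# Minimal sketch (assumes mlsauce and scikit-learn are installed).
import numpy as np
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# alpha blends L1 and L2 penalties; reg_lambda sets the overall strength.
enet = ms.ElasticNetRegressor(reg_lambda=0.1, alpha=0.5, backend="cpu")
enet.fit(X_train, y_train)
print("converged:", enet.converged)
print("test RMSE:", np.sqrt(np.mean((enet.predict(X_test) - y_test) ** 2)))
```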
24class LassoRegressor(BaseEstimator, RegressorMixin): 25 """Lasso. 26 27 Attributes: 28 29 reg_lambda: float 30 L1 regularization parameter. 31 32 max_iter: int 33 number of iterations of lasso shooting algorithm. 34 35 tol: float 36 tolerance for convergence of lasso shooting algorithm. 37 38 backend: str 39 type of backend; must be in ('cpu', 'gpu', 'tpu'). 40 41 """ 42 43 def __init__(self, reg_lambda=0.1, max_iter=10, tol=1e-3, backend="cpu"): 44 assert backend in ( 45 "cpu", 46 "gpu", 47 "tpu", 48 ), "`backend` must be in ('cpu', 'gpu', 'tpu')" 49 50 sys_platform = platform.system() 51 52 if (sys_platform == "Windows") and (backend in ("gpu", "tpu")): 53 warnings.warn( 54 "No GPU/TPU computing on Windows yet, backend set to 'cpu'" 55 ) 56 backend = "cpu" 57 58 self.reg_lambda = reg_lambda 59 self.max_iter = max_iter 60 self.tol = tol 61 self.backend = backend 62 if self.backend in ("gpu", "tpu"): 63 check_and_install("jax") 64 check_and_install("jaxlib") 65 66 def fit(self, X, y, **kwargs): 67 """Fit matrixops (classifier) to training data (X, y) 68 69 Args: 70 71 X: {array-like}, shape = [n_samples, n_features] 72 Training vectors, where n_samples is the number 73 of samples and n_features is the number of features. 74 75 y: array-like, shape = [n_samples] 76 Target values. 77 78 **kwargs: additional parameters to be passed to self.cook_training_set. 79 80 Returns: 81 82 self: object. 83 84 """ 85 86 self.ym, centered_y = mo.center_response(y) 87 self.xm = X.mean(axis=0) 88 self.xsd = X.std(axis=0) 89 self.xsd[self.xsd == 0] = 1 90 X_ = (X - self.xm[None, :]) / self.xsd[None, :] 91 XX = mo.crossprod(X_, backend=self.backend) 92 Xy = mo.crossprod(X_, centered_y, backend=self.backend) 93 XX2 = 2 * XX 94 Xy2 = 2 * Xy 95 96 if self.backend == "cpu": 97 # beta0, _, _, _ = np.linalg.lstsq(X_, centered_y, rcond=None) 98 beta0 = get_beta(X_, centered_y) 99 if len(np.asarray(y).shape) == 1: 100 res = mo.get_beta_1D( 101 beta0=np.asarray(beta0), 102 XX2=np.asarray(XX2), 103 Xy2=np.asarray(Xy2), 104 reg_lambda=self.reg_lambda, 105 max_iter=self.max_iter, 106 tol=self.tol, 107 ) 108 self.beta = res[0] 109 return self 110 111 res = mo.get_beta_2D( 112 beta0=np.asarray(beta0), 113 XX2=np.asarray(XX2), 114 Xy2=np.asarray(Xy2), 115 reg_lambda=self.reg_lambda, 116 max_iter=self.max_iter, 117 tol=self.tol, 118 ) 119 self.beta = res[0] 120 return self 121 122 invXX = jinv(XX + self.reg_lambda * jnp.eye(X_.shape[1])) 123 beta0 = mo.safe_sparse_dot(invXX, Xy, backend=self.backend) 124 if len(np.asarray(y).shape) == 1: 125 res = mo.get_beta_1D( 126 beta0=np.asarray(beta0), 127 XX2=np.asarray(XX2), 128 Xy2=np.asarray(Xy2), 129 reg_lambda=self.reg_lambda, 130 max_iter=self.max_iter, 131 tol=self.tol, 132 ) 133 self.beta = res[0] 134 return self 135 136 res = mo.get_beta_2D( 137 beta0=np.asarray(beta0), 138 XX2=np.asarray(XX2), 139 Xy2=np.asarray(Xy2), 140 reg_lambda=self.reg_lambda, 141 max_iter=self.max_iter, 142 tol=self.tol, 143 ) 144 self.beta = res[0] 145 return self 146 147 def predict(self, X, **kwargs): 148 """Predict test data X. 149 150 Args: 151 152 X: {array-like}, shape = [n_samples, n_features] 153 Training vectors, where n_samples is the number 154 of samples and n_features is the number of features. 
155 156 **kwargs: additional parameters to be passed to `predict_proba` 157 158 159 Returns: 160 161 model predictions: {array-like} 162 163 """ 164 X_ = (X - self.xm[None, :]) / self.xsd[None, :] 165 166 if self.backend == "cpu": 167 if isinstance(self.ym, float): 168 return self.ym + mo.safe_sparse_dot(X_, self.beta) 169 return self.ym[None, :] + mo.safe_sparse_dot(X_, self.beta) 170 171 # if self.backend in ("gpu", "tpu"): 172 if isinstance(self.ym, float): 173 return self.ym + mo.safe_sparse_dot( 174 X_, self.beta, backend=self.backend 175 ) 176 return self.ym[None, :] + mo.safe_sparse_dot( 177 X_, self.beta, backend=self.backend 178 )
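A minimal usage sketch of `LassoRegressor`, based on the constructor and `fit`/`predict` above. It assumes `mlsauce` and `scikit-learn` are installed; the larger `max_iter` and tighter `tol` than the defaults are illustrative choices for the shooting algorithm, not tuned values.

```python
# Minimal sketch (assumes mlsauce and scikit-learn are installed).
import numpy as np
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# max_iter and tol control the lasso "shooting" (coordinate descent) iterations.
lasso = ms.LassoRegressor(reg_lambda=0.1, max_iter=100, tol=1e-4, backend="cpu")
lasso.fit(X_train, y_train)
print("nonzero coefficients:", int(np.sum(lasso.beta != 0)))
print("test RMSE:", np.sqrt(np.mean((lasso.predict(X_test) - y_test) ** 2)))
```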
19class LSBoostRegressor(BaseEstimator, RegressorMixin): 20 """LSBoost regressor. 21 22 Attributes: 23 24 n_estimators: int 25 number of boosting iterations. 26 27 learning_rate: float 28 controls the learning speed at training time. 29 30 n_hidden_features: int 31 number of nodes in successive hidden layers. 32 33 reg_lambda: float 34 L2 regularization parameter for successive errors in the optimizer 35 (at training time). 36 37 alpha: float 38 compromise between L1 and L2 regularization (must be in [0, 1]), 39 for `solver` == 'enet' 40 41 row_sample: float 42 percentage of rows chosen from the training set. 43 44 col_sample: float 45 percentage of columns chosen from the training set. 46 47 dropout: float 48 percentage of nodes dropped from the training set. 49 50 tolerance: float 51 controls early stopping in gradient descent (at training time). 52 53 direct_link: bool 54 indicates whether the original features are included (True) in model's 55 fitting or not (False). 56 57 verbose: int 58 progress bar (yes = 1) or not (no = 0) (currently). 59 60 seed: int 61 reproducibility seed for nodes_sim=='uniform', clustering and dropout. 62 63 backend: str 64 type of backend; must be in ('cpu', 'gpu', 'tpu') 65 66 solver: str 67 type of 'weak' learner; currently in ('ridge', 'lasso') 68 69 activation: str 70 activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh' 71 72 type_pi: str. 73 type of prediction interval; currently "kde" (default) or "bootstrap". 74 Used only in `self.predict`, for `self.replications` > 0 and `self.kernel` 75 in ('gaussian', 'tophat'). Default is `None`. 76 77 replications: int. 78 number of replications (if needed) for predictive simulation. 79 Used only in `self.predict`, for `self.kernel` in ('gaussian', 80 'tophat') and `self.type_pi = 'kde'`. Default is `None`. 
81 82 n_clusters: int 83 number of clusters for clustering the features 84 85 clustering_method: str 86 clustering method: currently 'kmeans', 'gmm' 87 88 cluster_scaling: str 89 scaling method for clustering: currently 'standard', 'robust', 'minmax' 90 91 degree: int 92 degree of features interactions to include in the model 93 94 weights_distr: str 95 distribution of weights for constructing the model's hidden layer; 96 either 'uniform' or 'gaussian' 97 98 hist: bool 99 whether to use histogram features or not 100 101 bins: int or str 102 number of bins for histogram features (same as numpy.histogram, default is 'auto') 103 104 Examples: 105 106 ```python 107 import subprocess 108 import sys 109 import os 110 111 subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib"]) 112 113 import mlsauce as ms 114 import numpy as np 115 import matplotlib.pyplot as plt 116 from sklearn.datasets import load_diabetes 117 from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score 118 from sklearn.tree import DecisionTreeRegressor 119 from time import time 120 from os import chdir 121 from sklearn import metrics 122 123 regr = DecisionTreeRegressor() 124 125 diabetes = load_diabetes() 126 X = diabetes.data 127 y = diabetes.target 128 # split data into training test and test set 129 np.random.seed(15029) 130 X_train, X_test, y_train, y_test = train_test_split(X, y, 131 test_size=0.2) 132 133 obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9) 134 print(obj.get_params()) 135 start = time() 136 obj.fit(X_train, y_train) 137 print(time()-start) 138 start = time() 139 print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test)))) 140 print(time()-start) 141 142 print(obj.obj['loss']) 143 144 obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9, n_clusters=2) 145 print(obj.get_params()) 146 start = time() 147 obj.fit(X_train, y_train) 148 print(time()-start) 149 start = time() 150 print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test)))) 151 print(time()-start) 152 153 print(obj.obj['loss']) 154 ``` 155 156 """ 157 158 def __init__( 159 self, 160 n_estimators=100, 161 learning_rate=0.1, 162 n_hidden_features=5, 163 reg_lambda=0.1, 164 alpha=0.5, 165 row_sample=1, 166 col_sample=1, 167 dropout=0, 168 tolerance=1e-4, 169 direct_link=1, 170 verbose=1, 171 seed=123, 172 backend="cpu", 173 solver="ridge", 174 activation="relu", 175 type_pi=None, 176 replications=None, 177 kernel=None, 178 n_clusters=0, 179 clustering_method="kmeans", 180 cluster_scaling="standard", 181 degree=None, 182 weights_distr="uniform", 183 base_model=None, 184 hist=False, 185 bins="auto", 186 ): 187 188 self.base_model = base_model 189 self.hist = hist 190 self.bins = bins 191 self.hist_bins_ = None 192 193 if n_clusters > 0: 194 assert clustering_method in ( 195 "kmeans", 196 "gmm", 197 ), "`clustering_method` must be in ('kmeans', 'gmm')" 198 assert cluster_scaling in ( 199 "standard", 200 "robust", 201 "minmax", 202 ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')" 203 204 assert backend in ( 205 "cpu", 206 "gpu", 207 "tpu", 208 ), "`backend` must be in ('cpu', 'gpu', 'tpu')" 209 210 assert solver in ( 211 "ridge", 212 "lasso", 213 "enet", 214 ), "`solver` must be in ('ridge', 'lasso', 'enet')" 215 216 sys_platform = platform.system() 217 218 if (sys_platform == "Windows") and (backend in ("gpu", "tpu")): 219 warnings.warn( 220 "No GPU/TPU computing on Windows yet, backend set to 'cpu'" 221 ) 222 backend = "cpu" 223 224 
self.n_estimators = n_estimators 225 self.learning_rate = learning_rate 226 self.n_hidden_features = n_hidden_features 227 self.reg_lambda = reg_lambda 228 assert alpha >= 0 and alpha <= 1, "`alpha` must be in [0, 1]" 229 self.alpha = alpha 230 self.row_sample = row_sample 231 self.col_sample = col_sample 232 self.dropout = dropout 233 self.tolerance = tolerance 234 self.direct_link = direct_link 235 self.verbose = verbose 236 self.seed = seed 237 self.backend = backend 238 self.obj = None 239 self.solver = solver 240 self.activation = activation 241 self.type_pi = type_pi 242 self.replications = replications 243 self.kernel = kernel 244 self.n_clusters = n_clusters 245 self.clustering_method = clustering_method 246 self.cluster_scaling = cluster_scaling 247 self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None 248 self.degree = degree 249 self.poly_ = None 250 self.weights_distr = weights_distr 251 if self.backend in ("gpu", "tpu"): 252 check_and_install("jax") 253 check_and_install("jaxlib") 254 255 def fit(self, X, y, **kwargs): 256 """Fit Booster (regressor) to training data (X, y) 257 258 Args: 259 260 X: {array-like}, shape = [n_samples, n_features] 261 Training vectors, where n_samples is the number 262 of samples and n_features is the number of features. 263 264 y: array-like, shape = [n_samples] 265 Target values. 266 267 **kwargs: additional parameters to be passed to self.cook_training_set. 268 269 Returns: 270 271 self: object. 272 """ 273 274 if isinstance(X, pd.DataFrame): 275 X = X.values 276 277 if self.hist == True: 278 X, self.hist_bins_ = get_histo_features(X) 279 280 if isinstance(y, pd.Series): 281 y = y.values.ravel() 282 else: 283 y = y.ravel() 284 285 if self.degree is not None: 286 assert isinstance(self.degree, int), "`degree` must be an integer" 287 self.poly_ = PolynomialFeatures( 288 degree=self.degree, interaction_only=True, include_bias=False 289 ) 290 X = self.poly_.fit_transform(X) 291 292 if self.n_clusters > 0: 293 clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = ( 294 cluster( 295 X, 296 n_clusters=self.n_clusters, 297 method=self.clustering_method, 298 type_scaling=self.cluster_scaling, 299 training=True, 300 seed=self.seed, 301 ) 302 ) 303 X = np.column_stack((X, clustered_X)) 304 305 self.obj = boosterc.fit_booster_regressor( 306 X=np.asarray(X, order="C"), 307 y=np.asarray(y, order="C"), 308 n_estimators=self.n_estimators, 309 learning_rate=self.learning_rate, 310 n_hidden_features=self.n_hidden_features, 311 reg_lambda=self.reg_lambda, 312 alpha=self.alpha, 313 row_sample=self.row_sample, 314 col_sample=self.col_sample, 315 dropout=self.dropout, 316 tolerance=self.tolerance, 317 direct_link=self.direct_link, 318 verbose=self.verbose, 319 seed=self.seed, 320 backend=self.backend, 321 solver=self.solver, 322 activation=self.activation, 323 obj=self.base_model, 324 ) 325 326 self.n_estimators = self.obj["n_estimators"] 327 328 self.X_ = X 329 330 self.y_ = y 331 332 return self 333 334 def predict(self, X, level=95, method=None, histo=False, **kwargs): 335 """Predict values for test data X. 336 337 Args: 338 339 X: {array-like}, shape = [n_samples, n_features] 340 Training vectors, where n_samples is the number 341 of samples and n_features is the number of features. 
342 343 level: int 344 Level of confidence (default = 95) 345 346 method: str 347 `None`, or 'splitconformal', 'localconformal' 348 prediction (if you specify `return_pi = True`) 349 350 histo: bool 351 whether to use histogram features or not 352 353 **kwargs: additional parameters to be passed to 354 self.cook_test_set 355 356 Returns: 357 358 predicted values estimates for test data: {array-like} 359 """ 360 361 if isinstance(X, pd.DataFrame): 362 X = X.values 363 364 if self.hist == True: 365 X = get_histo_features(X, bins=self.hist_bins_) 366 367 if self.degree is not None: 368 X = self.poly_.transform(X) 369 370 if self.n_clusters > 0: 371 X = np.column_stack( 372 ( 373 X, 374 cluster( 375 X, 376 training=False, 377 scaler=self.scaler_, 378 label_encoder=self.label_encoder_, 379 clusterer=self.clusterer_, 380 seed=self.seed, 381 ), 382 ) 383 ) 384 if "return_pi" in kwargs: 385 assert method in ( 386 "splitconformal", 387 "localconformal", 388 ), "method must be in ('splitconformal', 'localconformal')" 389 self.pi = PredictionInterval( 390 obj=self, 391 method=method, 392 level=level, 393 type_pi=self.type_pi, 394 replications=self.replications, 395 kernel=self.kernel, 396 ) 397 self.pi.fit(self.X_, self.y_) 398 self.X_ = None 399 self.y_ = None 400 preds = self.pi.predict(X, return_pi=True) 401 return preds 402 # print(f"\n in predict self: {self} \n") 403 # print(f"\n in predict self.obj: {self.obj} \n") 404 # try: 405 return boosterc.predict_booster_regressor( 406 self.obj, np.asarray(X, order="C") 407 ) 408 # except ValueError: 409 # pass 410 411 def update(self, X, y, eta=0.9): 412 """Update model with new data. 413 414 Args: 415 416 X: {array-like}, shape = [n_samples=1, n_features] 417 Training vectors, where n_samples is the number 418 of samples and n_features is the number of features. 419 420 y: float = [n_samples=1] 421 Target value. 422 423 eta: float 424 Inverse power applied to number of observations 425 (defines a learning rate). 426 427 Returns: 428 429 self: object. 430 """ 431 432 if isinstance(X, pd.DataFrame): 433 X = X.values 434 435 if self.degree is not None: 436 X = self.poly_.transform(X) 437 438 if self.n_clusters > 0: 439 X = np.column_stack( 440 ( 441 X, 442 cluster( 443 X, 444 training=False, 445 scaler=self.scaler_, 446 label_encoder=self.label_encoder_, 447 clusterer=self.clusterer_, 448 seed=self.seed, 449 ), 450 ) 451 ) 452 453 self.obj = boosterc.update_booster( 454 self.obj, np.asarray(X, order="C"), np.asarray(y, order="C"), eta 455 ) 456 457 return self
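A minimal usage sketch of `LSBoostRegressor`, following the `fit` and `predict` methods shown above: point predictions first, then split-conformal prediction intervals via `return_pi=True` with `method="splitconformal"`, which is the code path visible in `predict`. It assumes `mlsauce` and `scikit-learn` are installed; the dataset and hyperparameter values are illustrative, and the exact fields of the returned prediction-interval object are not detailed in this listing, so it is only printed.

```python
# Minimal sketch (assumes mlsauce and scikit-learn are installed).
import numpy as np
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15029
)

obj = ms.LSBoostRegressor(
    n_estimators=100,
    learning_rate=0.1,
    n_hidden_features=5,
    solver="ridge",
    col_sample=0.9,
    row_sample=0.9,
    verbose=0,
)
obj.fit(X_train, y_train)
print("test RMSE:", np.sqrt(np.mean((obj.predict(X_test) - y_test) ** 2)))

# Split-conformal prediction intervals (level=95 by default). The structure of
# the returned object comes from the internal PredictionInterval helper and is
# not documented in this listing, hence only its type is printed here.
preds_pi = obj.predict(X_test, method="splitconformal", return_pi=True)
print(type(preds_pi))
```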
23class RidgeRegressor(BaseEstimator, RegressorMixin): 24 """Ridge. 25 26 Attributes: 27 28 reg_lambda: float 29 regularization parameter. 30 31 backend: str 32 type of backend; must be in ('cpu', 'gpu', 'tpu') 33 34 """ 35 36 def __init__(self, reg_lambda=0.1, backend="cpu"): 37 assert backend in ( 38 "cpu", 39 "gpu", 40 "tpu", 41 ), "`backend` must be in ('cpu', 'gpu', 'tpu')" 42 43 sys_platform = platform.system() 44 45 if (sys_platform == "Windows") and (backend in ("gpu", "tpu")): 46 warnings.warn( 47 "No GPU/TPU computing on Windows yet, backend set to 'cpu'" 48 ) 49 backend = "cpu" 50 51 self.reg_lambda = reg_lambda 52 self.backend = backend 53 if self.backend in ("gpu", "tpu"): 54 check_and_install("jax") 55 check_and_install("jaxlib") 56 57 def fit(self, X, y, **kwargs): 58 """Fit matrixops (classifier) to training data (X, y) 59 60 Args: 61 62 X: {array-like}, shape = [n_samples, n_features] 63 Training vectors, where n_samples is the number 64 of samples and n_features is the number of features. 65 66 y: array-like, shape = [n_samples] 67 Target values. 68 69 **kwargs: additional parameters to be passed to self.cook_training_set. 70 71 Returns: 72 73 self: object. 74 75 """ 76 self.ym, centered_y = mo.center_response(y) 77 self.xm = X.mean(axis=0) 78 self.xsd = X.std(axis=0) 79 self.xsd[self.xsd == 0] = 1 # avoid division by zero 80 X_ = (X - self.xm[None, :]) / self.xsd[None, :] 81 82 if self.backend == "cpu": 83 if len(centered_y.shape) <= 1: 84 eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1]) 85 X_ = np.row_stack((X_, eye_term)) 86 y_ = np.concatenate((centered_y, np.zeros(X.shape[1]))) 87 # self.beta, _, _, _ = np.linalg.lstsq(X_, y_, rcond=None) 88 self.beta = get_beta(X_, y_) 89 else: 90 try: 91 eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1]) 92 X_ = np.row_stack((X_, eye_term)) 93 y_ = np.row_stack( 94 ( 95 centered_y, 96 np.zeros((eye_term.shape[0], centered_y.shape[1])), 97 ) 98 ) 99 # self.beta, _, _, _ = np.linalg.lstsq(X_, y_, rcond=None) 100 self.beta = get_beta(X_, y_) 101 except Exception: 102 x = inv( 103 mo.crossprod(X_) + self.reg_lambda * np.eye(X_.shape[1]) 104 ) 105 hat_matrix = mo.tcrossprod(x, X_) 106 self.beta = mo.safe_sparse_dot(hat_matrix, centered_y) 107 return self 108 109 x = jinv( 110 mo.crossprod(X_, backend=self.backend) 111 + self.reg_lambda * jnp.eye(X_.shape[1]) 112 ) 113 hat_matrix = mo.tcrossprod(x, X_, backend=self.backend) 114 self.beta = mo.safe_sparse_dot( 115 hat_matrix, centered_y, backend=self.backend 116 ) 117 return self 118 119 def predict(self, X, **kwargs): 120 """Predict test data X. 121 122 Args: 123 124 X: {array-like}, shape = [n_samples, n_features] 125 Training vectors, where n_samples is the number 126 of samples and n_features is the number of features. 127 128 **kwargs: additional parameters to be passed to `predict_proba` 129 130 Returns: 131 132 model predictions: {array-like} 133 134 """ 135 X_ = (X - self.xm[None, :]) / self.xsd[None, :] 136 137 if self.backend == "cpu": 138 if isinstance(self.ym, float): 139 return self.ym + mo.safe_sparse_dot(X_, self.beta) 140 return self.ym[None, :] + mo.safe_sparse_dot(X_, self.beta) 141 142 # if self.backend in ("gpu", "tpu"): 143 if isinstance(self.ym, float): 144 return self.ym + mo.safe_sparse_dot( 145 X_, self.beta, backend=self.backend 146 ) 147 return self.ym[None, :] + mo.safe_sparse_dot( 148 X_, self.beta, backend=self.backend 149 )
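A minimal usage sketch of `RidgeRegressor`, based on the constructor and `fit`/`predict` above; it assumes `mlsauce` and `scikit-learn` are installed, and the dataset and penalty value are illustrative.

```python
# Minimal sketch (assumes mlsauce and scikit-learn are installed).
import numpy as np
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# reg_lambda is the L2 penalty; features are centered and scaled internally.
ridge = ms.RidgeRegressor(reg_lambda=0.1, backend="cpu")
ridge.fit(X_train, y_train)
print("test RMSE:", np.sqrt(np.mean((ridge.predict(X_test) - y_test) ** 2)))
```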
89class LazyBoostingClassifier(ClassifierMixin): 90 """ 91 92 Fitting -- almost -- all the classification algorithms 93 and returning their scores. 94 95 Parameters: 96 97 verbose: int, optional (default=0) 98 Any positive number for verbosity. 99 100 ignore_warnings: bool, optional (default=True) 101 When set to True, the warning related to algorigms that are not 102 able to run are ignored. 103 104 custom_metric: function, optional (default=None) 105 When function is provided, models are evaluated based on the custom 106 evaluation metric provided. 107 108 predictions: bool, optional (default=False) 109 When set to True, the predictions of all the models models are 110 returned as data frame. 111 112 sort_by: string, optional (default='Accuracy') 113 Sort models by a metric. Available options are 'Accuracy', 114 'Balanced Accuracy', 'ROC AUC', 'F1 Score' or a custom metric 115 identified by its name and provided by custom_metric. 116 117 random_state: int, optional (default=42) 118 Reproducibiility seed. 119 120 estimators: list, optional (default='all') 121 list of Estimators names or just 'all' for > 90 classifiers 122 (default='all') 123 124 preprocess: bool, preprocessing is done when set to True 125 126 n_jobs: int, when possible, run in parallel 127 For now, only used by individual models that support it. 128 129 n_layers: int, optional (default=3) 130 Number of layers of GenericBoostingClassifiers to be used. 131 132 All the other parameters are the same as GenericBoostingClassifier's. 133 134 Attributes: 135 136 models_: dict-object 137 Returns a dictionary with each model pipeline as value 138 with key as name of models. 139 140 best_model_: object 141 Returns the best model pipeline. 142 143 Examples 144 145 ```python 146 import os 147 import mlsauce as ms 148 from sklearn.datasets import load_breast_cancer, load_iris, load_wine, load_digits 149 from sklearn.model_selection import train_test_split 150 from time import time 151 152 load_models = [load_breast_cancer, load_iris, load_wine] 153 154 for model in load_models: 155 156 data = model() 157 X = data.data 158 y= data.target 159 160 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 13) 161 162 clf = ms.LazyBoostingClassifier(verbose=1, ignore_warnings=False, 163 custom_metric=None, preprocess=False) 164 165 start = time() 166 models, predictioms = clf.fit(X_train, X_test, y_train, y_test) 167 print(f"\nElapsed: {time() - start} seconds\n") 168 169 print(models) 170 ``` 171 172 """ 173 174 def __init__( 175 self, 176 verbose=0, 177 ignore_warnings=True, 178 custom_metric=None, 179 predictions=False, 180 sort_by="Accuracy", 181 random_state=42, 182 estimators="all", 183 preprocess=False, 184 n_jobs=None, 185 ): 186 self.verbose = verbose 187 self.ignore_warnings = ignore_warnings 188 self.custom_metric = custom_metric 189 self.predictions = predictions 190 self.sort_by = sort_by 191 self.models_ = {} 192 self.best_model_ = None 193 self.random_state = random_state 194 self.estimators = estimators 195 self.preprocess = preprocess 196 self.n_jobs = n_jobs 197 198 def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): 199 """Fit classifiers to X_train and y_train, predict and score on X_test, 200 y_test. 201 202 Parameters: 203 204 X_train: array-like, 205 Training vectors, where rows is the number of samples 206 and columns is the number of features. 
207 208 X_test: array-like, 209 Testing vectors, where rows is the number of samples 210 and columns is the number of features. 211 212 y_train: array-like, 213 Training vectors, where rows is the number of samples 214 and columns is the number of features. 215 216 y_test: array-like, 217 Testing vectors, where rows is the number of samples 218 and columns is the number of features. 219 220 hist: bool, optional (default=False) 221 When set to True, the model is a GenericBoostingClassifier. 222 223 **kwargs: dict, 224 Additional arguments to be passed to the fit GenericBoostingClassifier. 225 226 Returns: 227 228 scores: Pandas DataFrame 229 Returns metrics of all the models in a Pandas DataFrame. 230 231 predictions: Pandas DataFrame 232 Returns predictions of all the models in a Pandas DataFrame. 233 """ 234 Accuracy = [] 235 B_Accuracy = [] 236 ROC_AUC = [] 237 F1 = [] 238 names = [] 239 TIME = [] 240 predictions = {} 241 242 if self.custom_metric is not None: 243 CUSTOM_METRIC = [] 244 245 if isinstance(X_train, np.ndarray): 246 X_train = pd.DataFrame(X_train) 247 X_test = pd.DataFrame(X_test) 248 249 numeric_features = X_train.select_dtypes(include=[np.number]).columns 250 categorical_features = X_train.select_dtypes(include=["object"]).columns 251 252 categorical_low, categorical_high = get_card_split( 253 X_train, categorical_features 254 ) 255 256 if self.preprocess is True: 257 preprocessor = ColumnTransformer( 258 transformers=[ 259 ("numeric", numeric_transformer, numeric_features), 260 ( 261 "categorical_low", 262 categorical_transformer_low, 263 categorical_low, 264 ), 265 ( 266 "categorical_high", 267 categorical_transformer_high, 268 categorical_high, 269 ), 270 ] 271 ) 272 273 # baseline models 274 try: 275 baseline_names = ["RandomForestClassifier", "XGBClassifier"] 276 baseline_models = [RandomForestClassifier(), xgb.XGBClassifier()] 277 except Exception as exception: 278 baseline_names = ["RandomForestClassifier"] 279 baseline_models = [RandomForestClassifier()] 280 281 if self.verbose > 0: 282 print("\n Fitting baseline models...") 283 for name, model in tqdm(zip(baseline_names, baseline_models)): 284 start = time.time() 285 try: 286 model.fit(X_train, y_train) 287 self.models_[name] = model 288 y_pred = model.predict(X_test) 289 accuracy = accuracy_score(y_test, y_pred, normalize=True) 290 b_accuracy = balanced_accuracy_score(y_test, y_pred) 291 f1 = f1_score(y_test, y_pred, average="weighted") 292 try: 293 roc_auc = roc_auc_score(y_test, y_pred) 294 except Exception as exception: 295 roc_auc = None 296 if self.ignore_warnings is False: 297 print("ROC AUC couldn't be calculated for " + name) 298 print(exception) 299 names.append(name) 300 Accuracy.append(accuracy) 301 B_Accuracy.append(b_accuracy) 302 ROC_AUC.append(roc_auc) 303 F1.append(f1) 304 TIME.append(time.time() - start) 305 if self.custom_metric is not None: 306 custom_metric = self.custom_metric(y_test, y_pred) 307 CUSTOM_METRIC.append(custom_metric) 308 if self.verbose > 0: 309 if self.custom_metric is not None: 310 print( 311 { 312 "Model": name, 313 "Accuracy": accuracy, 314 "Balanced Accuracy": b_accuracy, 315 "ROC AUC": roc_auc, 316 "F1 Score": f1, 317 self.custom_metric.__name__: custom_metric, 318 "Time taken": time.time() - start, 319 } 320 ) 321 else: 322 print( 323 { 324 "Model": name, 325 "Accuracy": accuracy, 326 "Balanced Accuracy": b_accuracy, 327 "ROC AUC": roc_auc, 328 "F1 Score": f1, 329 "Time taken": time.time() - start, 330 } 331 ) 332 if self.predictions: 333 predictions[name] = y_pred 334 
except Exception as exception: 335 if self.ignore_warnings is False: 336 print(name + " model failed to execute") 337 print(exception) 338 339 if self.estimators == "all": 340 self.classifiers = REGRESSORS + MTASKREGRESSORS 341 else: 342 self.classifiers = [ 343 ("GBoostClassifier(" + est[0] + ")", est[1]()) 344 for est in all_estimators() 345 if ( 346 issubclass(est[1], RegressorMixin) 347 and (est[0] in self.estimators) 348 ) 349 ] + [ 350 ( 351 "GBoostClassifier(MultiTask(" + est[0] + "))", 352 partial(MultiTaskRegressor, regr=est[1]()), 353 ) 354 for est in all_estimators() 355 if ( 356 issubclass(est[1], RegressorMixin) 357 and (est[0] in self.estimators) 358 ) 359 ] 360 361 if self.preprocess is True: 362 363 if self.n_jobs is None: 364 365 for name, model in tqdm(self.classifiers): # do parallel exec 366 367 other_args = ( 368 {} 369 ) # use this trick for `random_state` too --> refactor 370 try: 371 if ( 372 "n_jobs" in model().get_params().keys() 373 and name.find("LogisticRegression") == -1 374 ): 375 other_args["n_jobs"] = self.n_jobs 376 except Exception: 377 pass 378 379 start = time.time() 380 381 try: 382 if "random_state" in model().get_params().keys(): 383 if hist is False: 384 fitted_clf = GenericBoostingClassifier( 385 {**other_args, **kwargs}, 386 verbose=self.verbose, 387 base_model=model( 388 random_state=self.random_state 389 ), 390 ) 391 else: 392 fitted_clf = GenericBoostingClassifier( 393 {**other_args, **kwargs}, 394 verbose=self.verbose, 395 base_model=model( 396 random_state=self.random_state 397 ), 398 hist=True, 399 ) 400 401 else: 402 if hist is False: 403 fitted_clf = GenericBoostingClassifier( 404 base_model=model(**kwargs), 405 verbose=self.verbose, 406 ) 407 else: 408 fitted_clf = GenericBoostingClassifier( 409 base_model=model(**kwargs), 410 verbose=self.verbose, 411 hist=True, 412 ) 413 414 if self.verbose > 0: 415 print("\n Fitting boosted " + name + " model...") 416 fitted_clf.fit(X_train, y_train) 417 418 pipe = Pipeline( 419 [ 420 ("preprocessor", preprocessor), 421 ("classifier", fitted_clf), 422 ] 423 ) 424 425 if self.verbose > 0: 426 print("\n Fitting boosted " + name + " model...") 427 pipe.fit(X_train, y_train) 428 self.models_[name] = pipe 429 y_pred = pipe.predict(X_test) 430 accuracy = accuracy_score( 431 y_test, y_pred, normalize=True 432 ) 433 b_accuracy = balanced_accuracy_score(y_test, y_pred) 434 f1 = f1_score(y_test, y_pred, average="weighted") 435 try: 436 roc_auc = roc_auc_score(y_test, y_pred) 437 except Exception as exception: 438 roc_auc = None 439 if self.ignore_warnings is False: 440 print( 441 "ROC AUC couldn't be calculated for " + name 442 ) 443 print(exception) 444 names.append(name) 445 Accuracy.append(accuracy) 446 B_Accuracy.append(b_accuracy) 447 ROC_AUC.append(roc_auc) 448 F1.append(f1) 449 TIME.append(time.time() - start) 450 if self.custom_metric is not None: 451 custom_metric = self.custom_metric(y_test, y_pred) 452 CUSTOM_METRIC.append(custom_metric) 453 if self.verbose > 0: 454 if self.custom_metric is not None: 455 print( 456 { 457 "Model": name, 458 "Accuracy": accuracy, 459 "Balanced Accuracy": b_accuracy, 460 "ROC AUC": roc_auc, 461 "F1 Score": f1, 462 self.custom_metric.__name__: custom_metric, 463 "Time taken": time.time() - start, 464 } 465 ) 466 else: 467 print( 468 { 469 "Model": name, 470 "Accuracy": accuracy, 471 "Balanced Accuracy": b_accuracy, 472 "ROC AUC": roc_auc, 473 "F1 Score": f1, 474 "Time taken": time.time() - start, 475 } 476 ) 477 if self.predictions: 478 predictions[name] = y_pred 479 
except Exception as exception: 480 if self.ignore_warnings is False: 481 print(name + " model failed to execute") 482 print(exception) 483 484 else: 485 486 # train_model(self, name, model, X_train, y_train, X_test, y_test, 487 # use_preprocessing=False, preprocessor=None, 488 # **kwargs): 489 results = Parallel(n_jobs=self.n_jobs)( 490 delayed(self.train_model)( 491 name, 492 model, 493 X_train, 494 y_train, 495 X_test, 496 y_test, 497 use_preprocessing=True, 498 preprocessor=preprocessor, 499 **kwargs 500 ) 501 for name, model in tqdm(self.classifiers) 502 ) 503 Accuracy = [res["accuracy"] for res in results] 504 B_Accuracy = [res["balanced_accuracy"] for res in results] 505 ROC_AUC = [res["roc_auc"] for res in results] 506 F1 = [res["f1"] for res in results] 507 names = [res["name"] for res in results] 508 TIME = [res["time"] for res in results] 509 if self.custom_metric is not None: 510 CUSTOM_METRIC = [res["custom_metric"] for res in results] 511 if self.predictions: 512 predictions = { 513 res["name"]: res["predictions"] for res in results 514 } 515 516 else: # no preprocessing 517 518 if self.n_jobs is None: 519 520 for name, model in tqdm(self.classifiers): # do parallel exec 521 start = time.time() 522 try: 523 if "random_state" in model().get_params().keys(): 524 if hist is False: 525 fitted_clf = GenericBoostingClassifier( 526 base_model=model( 527 random_state=self.random_state 528 ), 529 verbose=self.verbose, 530 **kwargs 531 ) 532 else: 533 fitted_clf = GenericBoostingClassifier( 534 base_model=model( 535 random_state=self.random_state 536 ), 537 verbose=self.verbose, 538 hist=True, 539 **kwargs 540 ) 541 542 else: 543 if hist is False: 544 fitted_clf = GenericBoostingClassifier( 545 base_model=model(), 546 verbose=self.verbose, 547 **kwargs 548 ) 549 else: 550 fitted_clf = GenericBoostingClassifier( 551 base_model=model(), 552 verbose=self.verbose, 553 hist=True, 554 **kwargs 555 ) 556 557 fitted_clf.fit(X_train, y_train) 558 559 self.models_[name] = fitted_clf 560 y_pred = fitted_clf.predict(X_test) 561 accuracy = accuracy_score( 562 y_test, y_pred, normalize=True 563 ) 564 b_accuracy = balanced_accuracy_score(y_test, y_pred) 565 f1 = f1_score(y_test, y_pred, average="weighted") 566 try: 567 roc_auc = roc_auc_score(y_test, y_pred) 568 except Exception as exception: 569 roc_auc = None 570 if self.ignore_warnings is False: 571 print( 572 "ROC AUC couldn't be calculated for " + name 573 ) 574 print(exception) 575 names.append(name) 576 Accuracy.append(accuracy) 577 B_Accuracy.append(b_accuracy) 578 ROC_AUC.append(roc_auc) 579 F1.append(f1) 580 TIME.append(time.time() - start) 581 if self.custom_metric is not None: 582 custom_metric = self.custom_metric(y_test, y_pred) 583 CUSTOM_METRIC.append(custom_metric) 584 if self.verbose > 0: 585 if self.custom_metric is not None: 586 print( 587 { 588 "Model": name, 589 "Accuracy": accuracy, 590 "Balanced Accuracy": b_accuracy, 591 "ROC AUC": roc_auc, 592 "F1 Score": f1, 593 self.custom_metric.__name__: custom_metric, 594 "Time taken": time.time() - start, 595 } 596 ) 597 else: 598 print( 599 { 600 "Model": name, 601 "Accuracy": accuracy, 602 "Balanced Accuracy": b_accuracy, 603 "ROC AUC": roc_auc, 604 "F1 Score": f1, 605 "Time taken": time.time() - start, 606 } 607 ) 608 if self.predictions: 609 predictions[name] = y_pred 610 except Exception as exception: 611 if self.ignore_warnings is False: 612 print(name + " model failed to execute") 613 print(exception) 614 615 else: 616 617 results = Parallel(n_jobs=self.n_jobs)( 618 
delayed(self.train_model)( 619 name, 620 model, 621 X_train, 622 y_train, 623 X_test, 624 y_test, 625 use_preprocessing=False, 626 **kwargs 627 ) 628 for name, model in tqdm(self.classifiers) 629 ) 630 Accuracy = [res["accuracy"] for res in results] 631 B_Accuracy = [res["balanced_accuracy"] for res in results] 632 ROC_AUC = [res["roc_auc"] for res in results] 633 F1 = [res["f1"] for res in results] 634 names = [res["name"] for res in results] 635 TIME = [res["time"] for res in results] 636 if self.custom_metric is not None: 637 CUSTOM_METRIC = [res["custom_metric"] for res in results] 638 if self.predictions: 639 predictions = { 640 res["name"]: res["predictions"] for res in results 641 } 642 643 if self.custom_metric is None: 644 scores = pd.DataFrame( 645 { 646 "Model": names, 647 "Accuracy": Accuracy, 648 "Balanced Accuracy": B_Accuracy, 649 "ROC AUC": ROC_AUC, 650 "F1 Score": F1, 651 "Time Taken": TIME, 652 } 653 ) 654 else: 655 scores = pd.DataFrame( 656 { 657 "Model": names, 658 "Accuracy": Accuracy, 659 "Balanced Accuracy": B_Accuracy, 660 "ROC AUC": ROC_AUC, 661 "F1 Score": F1, 662 "Custom metric": CUSTOM_METRIC, 663 "Time Taken": TIME, 664 } 665 ) 666 scores = scores.sort_values(by=self.sort_by, ascending=False).set_index( 667 "Model" 668 ) 669 670 self.best_model_ = self.models_[scores.index[0]] 671 672 if self.predictions: 673 predictions_df = pd.DataFrame.from_dict(predictions) 674 return scores, predictions_df if self.predictions is True else scores 675 676 def get_best_model(self): 677 """ 678 This function returns the best model pipeline based on the sort_by metric. 679 680 Returns: 681 682 best_model: object, 683 Returns the best model pipeline based on the sort_by metric. 684 685 """ 686 return self.best_model_ 687 688 def provide_models(self, X_train, X_test, y_train, y_test): 689 """Returns all the model objects trained. If fit hasn't been called yet, 690 then it's called to return the models. 691 692 Parameters: 693 694 X_train: array-like, 695 Training vectors, where rows is the number of samples 696 and columns is the number of features. 697 698 X_test: array-like, 699 Testing vectors, where rows is the number of samples 700 and columns is the number of features. 701 702 y_train: array-like, 703 Training vectors, where rows is the number of samples 704 and columns is the number of features. 705 706 y_test: array-like, 707 Testing vectors, where rows is the number of samples 708 and columns is the number of features. 709 710 Returns: 711 712 models: dict-object, 713 Returns a dictionary with each model's pipeline as value 714 and key = name of the model. 715 """ 716 if len(self.models_.keys()) == 0: 717 self.fit(X_train, X_test, y_train, y_test) 718 719 return self.models_ 720 721 def train_model( 722 self, 723 name, 724 model, 725 X_train, 726 y_train, 727 X_test, 728 y_test, 729 use_preprocessing=False, 730 preprocessor=None, 731 hist=False, 732 **kwargs 733 ): 734 """ 735 Function to train a single model and return its results. 
736 """ 737 other_args = {} 738 739 # Handle n_jobs parameter 740 try: 741 if ( 742 "n_jobs" in model().get_params().keys() 743 and "LogisticRegression" not in name 744 ): 745 other_args["n_jobs"] = self.n_jobs 746 except Exception: 747 pass 748 749 start = time.time() 750 751 try: 752 # Handle random_state parameter 753 if "random_state" in model().get_params().keys(): 754 if hist is False: 755 fitted_clf = GenericBoostingClassifier( 756 {**other_args, **kwargs}, 757 verbose=self.verbose, 758 base_model=model(random_state=self.random_state), 759 ) 760 else: 761 fitted_clf = GenericBoostingClassifier( 762 {**other_args, **kwargs}, 763 verbose=self.verbose, 764 base_model=model(random_state=self.random_state), 765 hist=True, 766 ) 767 else: 768 if hist is False: 769 fitted_clf = GenericBoostingClassifier( 770 base_model=model(**kwargs), 771 verbose=self.verbose, 772 ) 773 else: 774 fitted_clf = GenericBoostingClassifier( 775 base_model=model(**kwargs), 776 verbose=self.verbose, 777 hist=True, 778 ) 779 780 if self.verbose > 0: 781 print("\n Fitting boosted " + name + " model...") 782 783 fitted_clf.fit(X_train, y_train) 784 785 if use_preprocessing and preprocessor is not None: 786 pipe = Pipeline( 787 [ 788 ("preprocessor", preprocessor), 789 ("classifier", fitted_clf), 790 ] 791 ) 792 if self.verbose > 0: 793 print( 794 "\n Fitting pipeline with preprocessing for " 795 + name 796 + " model..." 797 ) 798 pipe.fit(X_train, y_train) 799 y_pred = pipe.predict(X_test) 800 else: 801 # Case with no preprocessing 802 if self.verbose > 0: 803 print( 804 "\n Fitting model without preprocessing for " 805 + name 806 + " model..." 807 ) 808 y_pred = fitted_clf.predict(X_test) 809 810 accuracy = accuracy_score(y_test, y_pred, normalize=True) 811 b_accuracy = balanced_accuracy_score(y_test, y_pred) 812 f1 = f1_score(y_test, y_pred, average="weighted") 813 roc_auc = None 814 815 try: 816 roc_auc = roc_auc_score(y_test, y_pred) 817 except Exception as exception: 818 if self.ignore_warnings is False: 819 print("ROC AUC couldn't be calculated for " + name) 820 print(exception) 821 822 custom_metric = None 823 if self.custom_metric is not None: 824 custom_metric = self.custom_metric(y_test, y_pred) 825 826 return { 827 "name": name, 828 "model": fitted_clf if not use_preprocessing else pipe, 829 "accuracy": accuracy, 830 "balanced_accuracy": b_accuracy, 831 "roc_auc": roc_auc, 832 "f1": f1, 833 "custom_metric": custom_metric, 834 "time": time.time() - start, 835 "predictions": y_pred, 836 } 837 except Exception as exception: 838 if self.ignore_warnings is False: 839 print(name + " model failed to execute") 840 print(exception) 841 return None
Fitting -- almost -- all the classification algorithms and returning their scores.
Parameters:
verbose: int, optional (default=0)
Any positive number for verbosity.
ignore_warnings: bool, optional (default=True)
    When set to True, warnings related to algorithms that are not
    able to run are ignored.
custom_metric: function, optional (default=None)
    When a function is provided, models are also evaluated with this
    custom metric.
predictions: bool, optional (default=False)
    When set to True, the predictions of all the models are
    returned as a data frame.
sort_by: string, optional (default='Accuracy')
    Sort models by a metric. Available options are 'Accuracy',
    'Balanced Accuracy', 'ROC AUC', 'F1 Score' or a custom metric
    identified by its name and provided by custom_metric.
random_state: int, optional (default=42)
    Reproducibility seed.
estimators: list, optional (default='all')
    List of estimator names, or 'all' for more than 90 classifiers.
preprocess: bool
    When set to True, preprocessing is applied to the features.
n_jobs: int, optional
    When possible, run models in parallel. For now, only used by
    individual models that support it.
n_layers: int, optional (default=3)
Number of layers of GenericBoostingClassifiers to be used.
All the other parameters are the same as GenericBoostingClassifier's.
Attributes:
models_: dict-object
    Dictionary with model names as keys and fitted model pipelines as values.
best_model_: object
    The best model pipeline, according to the sort_by metric.
Examples:

```python
import os
import mlsauce as ms
from sklearn.datasets import load_breast_cancer, load_iris, load_wine, load_digits
from sklearn.model_selection import train_test_split
from time import time

load_models = [load_breast_cancer, load_iris, load_wine]

for model in load_models:

    data = model()
    X = data.data
    y = data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

    clf = ms.LazyBoostingClassifier(verbose=1, ignore_warnings=False,
                                    custom_metric=None, preprocess=False)

    start = time()
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)
    print(f"\nElapsed: {time() - start} seconds\n")

    print(models)
```
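As a complementary sketch (not part of the original example), the snippet below assumes that any scikit-learn scoring function with the signature `metric(y_true, y_pred)` can be passed as `custom_metric`; its values are reported in an extra 'Custom metric' column of the scores DataFrame.

```python
# Hedged sketch: a scikit-learn metric used as custom_metric
# (evaluated internally as custom_metric(y_test, y_pred)).
import mlsauce as ms
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

clf = ms.LazyBoostingClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=cohen_kappa_score,  # reported in the 'Custom metric' column
    preprocess=False,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)  # scores DataFrame, sorted by `sort_by` (default 'Accuracy')
```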
def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs)
Fit classifiers to X_train and y_train, predict and score on X_test, y_test.
Parameters:
X_train: array-like,
    Training vectors, where rows are samples and columns are features.
X_test: array-like,
    Testing vectors, where rows are samples and columns are features.
y_train: array-like,
    Training target values.
y_test: array-like,
    Testing target values.
hist: bool, optional (default=False)
    When set to True, GenericBoostingClassifier is fitted with hist=True.
**kwargs: dict,
    Additional arguments passed to GenericBoostingClassifier.
Returns:
scores: Pandas DataFrame
Returns metrics of all the models in a Pandas DataFrame.
predictions: Pandas DataFrame
Returns predictions of all the models in a Pandas DataFrame.
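A minimal sketch of `fit()` (not from the original documentation): `hist=True` is passed through to the underlying `GenericBoostingClassifier`, and any additional keyword arguments would be forwarded the same way.

```python
# Hedged sketch: hist=True is forwarded to GenericBoostingClassifier.
import mlsauce as ms
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

clf = ms.LazyBoostingClassifier(verbose=0, ignore_warnings=True)
scores, _ = clf.fit(X_train, X_test, y_train, y_test, hist=True)
print(scores.head())
```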
def provide_models(self, X_train, X_test, y_train, y_test)
Returns all the model objects trained. If fit hasn't been called yet, then it's called to return the models.
Parameters:
X_train: array-like,
    Training vectors, where rows are samples and columns are features.
X_test: array-like,
    Testing vectors, where rows are samples and columns are features.
y_train: array-like,
    Training target values.
y_test: array-like,
    Testing target values.
Returns:
models: dict-object,
Returns a dictionary with each model's pipeline as value
and key = name of the model.
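A short usage sketch (not from the original documentation): `provide_models` triggers `fit` itself when nothing has been fitted yet, and `get_best_model` returns the top entry with respect to `sort_by`.

```python
# Hedged sketch of provide_models() and get_best_model().
import mlsauce as ms
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

clf = ms.LazyBoostingClassifier(verbose=0, ignore_warnings=True)
model_dictionary = clf.provide_models(X_train, X_test, y_train, y_test)  # calls fit() internally
print(list(model_dictionary.keys())[:3])  # model names
print(clf.get_best_model())               # best fitted model w.r.t. sort_by
```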
93class LazyBoostingRegressor(RegressorMixin): 94 """ 95 Fitting -- almost -- all the regression algorithms 96 and returning their scores. 97 98 Parameters: 99 100 verbose: int, optional (default=0) 101 Any positive number for verbosity. 102 103 ignore_warnings: bool, optional (default=True) 104 When set to True, the warning related to algorigms that are not able to run are ignored. 105 106 custom_metric: function, optional (default=None) 107 When function is provided, models are evaluated based on the custom evaluation metric provided. 108 109 predictions: bool, optional (default=False) 110 When set to True, the predictions of all the models models are returned as dataframe. 111 112 sort_by: string, optional (default='RMSE') 113 Sort models by a metric. Available options are 'R-Squared', 'Adjusted R-Squared', 'RMSE', 'Time Taken' and 'Custom Metric'. 114 or a custom metric identified by its name and provided by custom_metric. 115 116 random_state: int, optional (default=42) 117 Reproducibiility seed. 118 119 estimators: list, optional (default='all') 120 list of Estimators names or just 'all' (default='all') 121 122 preprocess: bool 123 preprocessing is done when set to True 124 125 n_jobs : int, when possible, run in parallel 126 For now, only used by individual models that support it. 127 128 n_layers: int, optional (default=3) 129 Number of layers of CustomRegressors to be used. 130 131 All the other parameters are the same as CustomRegressor's. 132 133 Attributes: 134 135 models_: dict-object 136 Returns a dictionary with each model pipeline as value 137 with key as name of models. 138 139 best_model_: object 140 Returns the best model pipeline based on the sort_by metric. 141 142 Examples: 143 144 ```python 145 import os 146 import mlsauce as ms 147 from sklearn.datasets import load_diabetes 148 from sklearn.model_selection import train_test_split 149 150 data = load_diabetes() 151 X = data.data 152 y= data.target 153 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123) 154 155 regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True, 156 custom_metric=None, preprocess=True) 157 models, predictioms = regr.fit(X_train, X_test, y_train, y_test) 158 model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test) 159 print(models) 160 ``` 161 162 """ 163 164 def __init__( 165 self, 166 verbose=0, 167 ignore_warnings=True, 168 custom_metric=None, 169 predictions=False, 170 sort_by="RMSE", 171 random_state=42, 172 estimators="all", 173 preprocess=False, 174 n_jobs=None, 175 ): 176 self.verbose = verbose 177 self.ignore_warnings = ignore_warnings 178 self.custom_metric = custom_metric 179 self.predictions = predictions 180 self.sort_by = sort_by 181 self.models_ = {} 182 self.best_model_ = None 183 self.random_state = random_state 184 self.estimators = estimators 185 self.preprocess = preprocess 186 self.n_jobs = n_jobs 187 188 def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): 189 """Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test. 190 191 Parameters: 192 193 X_train : array-like, 194 Training vectors, where rows is the number of samples 195 and columns is the number of features. 196 197 X_test : array-like, 198 Testing vectors, where rows is the number of samples 199 and columns is the number of features. 200 201 y_train : array-like, 202 Training vectors, where rows is the number of samples 203 and columns is the number of features. 
204 205 y_test : array-like, 206 Testing vectors, where rows is the number of samples 207 and columns is the number of features. 208 209 hist: bool, optional (default=False) 210 When set to True, the model is a HistGenericBoostingRegressor. 211 212 **kwargs: dict, 213 Additional parameters to be passed to the GenericBoostingRegressor. 214 215 Returns: 216 ------- 217 scores: Pandas DataFrame 218 Returns metrics of all the models in a Pandas DataFrame. 219 220 predictions : Pandas DataFrame 221 Returns predictions of all the models in a Pandas DataFrame. 222 223 """ 224 R2 = [] 225 ADJR2 = [] 226 RMSE = [] 227 # WIN = [] 228 names = [] 229 TIME = [] 230 predictions = {} 231 232 if self.custom_metric: 233 CUSTOM_METRIC = [] 234 235 if isinstance(X_train, np.ndarray): 236 X_train = pd.DataFrame(X_train) 237 X_test = pd.DataFrame(X_test) 238 239 numeric_features = X_train.select_dtypes(include=[np.number]).columns 240 categorical_features = X_train.select_dtypes(include=["object"]).columns 241 242 categorical_low, categorical_high = get_card_split( 243 X_train, categorical_features 244 ) 245 246 if self.preprocess is True: 247 preprocessor = ColumnTransformer( 248 transformers=[ 249 ("numeric", numeric_transformer, numeric_features), 250 ( 251 "categorical_low", 252 categorical_transformer_low, 253 categorical_low, 254 ), 255 ( 256 "categorical_high", 257 categorical_transformer_high, 258 categorical_high, 259 ), 260 ] 261 ) 262 263 # base models 264 try: 265 baseline_names = [ 266 "RandomForestRegressor", 267 "XGBRegressor", 268 "GradientBoostingRegressor", 269 ] 270 baseline_models = [ 271 RandomForestRegressor(), 272 xgb.XGBRegressor(), 273 GradientBoostingRegressor(), 274 ] 275 except Exception as exception: 276 baseline_names = [ 277 "RandomForestRegressor", 278 "GradientBoostingRegressor", 279 ] 280 baseline_models = [ 281 RandomForestRegressor(), 282 GradientBoostingRegressor(), 283 ] 284 285 if self.verbose > 0: 286 print("\n Fitting baseline models...") 287 for name, model in tqdm(zip(baseline_names, baseline_models)): 288 start = time.time() 289 try: 290 model.fit(X_train, y_train.ravel()) 291 self.models_[name] = model 292 y_pred = model.predict(X_test) 293 r_squared = r2_score(y_test, y_pred) 294 adj_rsquared = adjusted_rsquared( 295 r_squared, X_test.shape[0], X_test.shape[1] 296 ) 297 rmse = root_mean_squared_error(y_test, y_pred) 298 299 names.append(name) 300 R2.append(r_squared) 301 ADJR2.append(adj_rsquared) 302 RMSE.append(rmse) 303 TIME.append(time.time() - start) 304 305 if self.custom_metric: 306 custom_metric = self.custom_metric(y_test, y_pred) 307 CUSTOM_METRIC.append(custom_metric) 308 309 if self.verbose > 0: 310 scores_verbose = { 311 "Model": name, 312 "R-Squared": r_squared, 313 "Adjusted R-Squared": adj_rsquared, 314 "RMSE": rmse, 315 "Time taken": time.time() - start, 316 } 317 318 if self.custom_metric: 319 scores_verbose["Custom metric"] = custom_metric 320 321 print(scores_verbose) 322 if self.predictions: 323 predictions[name] = y_pred 324 except Exception as exception: 325 if self.ignore_warnings is False: 326 print(name + " model failed to execute") 327 print(exception) 328 329 if self.estimators == "all": 330 self.regressors = REGRESSORS 331 else: 332 self.regressors = [ 333 ("GenericBooster(" + est[0] + ")", est[1](**kwargs)) 334 for est in all_estimators() 335 if ( 336 issubclass(est[1], RegressorMixin) 337 and (est[0] in self.estimators) 338 ) 339 ] 340 341 if self.preprocess is True: 342 343 if self.n_jobs is None: 344 345 for name, regr in 
tqdm(self.regressors): # do parallel exec 346 347 start = time.time() 348 349 try: 350 351 if hist is False: 352 353 model = GenericBoostingRegressor( 354 base_model=regr(), 355 verbose=self.verbose, 356 **kwargs 357 ) 358 359 else: 360 361 model = HistGenericBoostingRegressor( 362 base_model=regr(), 363 verbose=self.verbose, 364 **kwargs 365 ) 366 367 model.fit(X_train, y_train.ravel()) 368 369 pipe = Pipeline( 370 steps=[ 371 ("preprocessor", preprocessor), 372 ("regressor", model), 373 ] 374 ) 375 if self.verbose > 0: 376 print("\n Fitting boosted " + name + " model...") 377 pipe.fit(X_train, y_train.ravel()) 378 379 self.models_[name] = pipe 380 y_pred = pipe.predict(X_test) 381 r_squared = r2_score(y_test, y_pred) 382 adj_rsquared = adjusted_rsquared( 383 r_squared, X_test.shape[0], X_test.shape[1] 384 ) 385 rmse = root_mean_squared_error(y_test, y_pred) 386 387 names.append(name) 388 R2.append(r_squared) 389 ADJR2.append(adj_rsquared) 390 RMSE.append(rmse) 391 TIME.append(time.time() - start) 392 393 if self.custom_metric: 394 custom_metric = self.custom_metric(y_test, y_pred) 395 CUSTOM_METRIC.append(custom_metric) 396 397 if self.verbose > 0: 398 scores_verbose = { 399 "Model": name, 400 "R-Squared": r_squared, 401 "Adjusted R-Squared": adj_rsquared, 402 "RMSE": rmse, 403 "Time taken": time.time() - start, 404 } 405 406 if self.custom_metric: 407 scores_verbose["Custom metric"] = custom_metric 408 409 print(scores_verbose) 410 if self.predictions: 411 predictions[name] = y_pred 412 413 except Exception as exception: 414 415 if self.ignore_warnings is False: 416 print(name + " model failed to execute") 417 print(exception) 418 419 else: 420 421 results = Parallel(n_jobs=self.n_jobs)( 422 delayed(self.train_model)( 423 name, 424 model, 425 X_train, 426 y_train, 427 X_test, 428 y_test, 429 use_preprocessing=True, 430 preprocessor=preprocessor, 431 **kwargs 432 ) 433 for name, model in tqdm(self.regressors) 434 ) 435 R2 = [ 436 result["r_squared"] 437 for result in results 438 if result is not None 439 ] 440 ADJR2 = [ 441 result["adj_rsquared"] 442 for result in results 443 if result is not None 444 ] 445 RMSE = [ 446 result["rmse"] for result in results if result is not None 447 ] 448 TIME = [ 449 result["time"] for result in results if result is not None 450 ] 451 names = [ 452 result["name"] for result in results if result is not None 453 ] 454 if self.custom_metric: 455 CUSTOM_METRIC = [ 456 result["custom_metric"] 457 for result in results 458 if result is not None 459 ] 460 if self.predictions: 461 predictions = { 462 result["name"]: result["predictions"] 463 for result in results 464 if result is not None 465 } 466 467 else: # self.preprocess is False; no preprocessing 468 469 if self.n_jobs is None: 470 471 for name, regr in tqdm(self.regressors): # do parallel exec 472 start = time.time() 473 try: 474 475 if hist is False: 476 model = GenericBoostingRegressor( 477 base_model=regr(), 478 verbose=self.verbose, 479 **kwargs 480 ) 481 else: 482 model = HistGenericBoostingRegressor( 483 base_model=regr(), 484 verbose=self.verbose, 485 **kwargs 486 ) 487 488 if self.verbose > 0: 489 print("\n Fitting boosted " + name + " model...") 490 model.fit(X_train, y_train.ravel()) 491 492 self.models_[name] = model 493 y_pred = model.predict(X_test) 494 495 r_squared = r2_score(y_test, y_pred) 496 adj_rsquared = adjusted_rsquared( 497 r_squared, X_test.shape[0], X_test.shape[1] 498 ) 499 rmse = root_mean_squared_error(y_test, y_pred) 500 501 names.append(name) 502 R2.append(r_squared) 503 
ADJR2.append(adj_rsquared) 504 RMSE.append(rmse) 505 TIME.append(time.time() - start) 506 507 if self.custom_metric: 508 custom_metric = self.custom_metric(y_test, y_pred) 509 CUSTOM_METRIC.append(custom_metric) 510 511 if self.verbose > 0: 512 scores_verbose = { 513 "Model": name, 514 "R-Squared": r_squared, 515 "Adjusted R-Squared": adj_rsquared, 516 "RMSE": rmse, 517 "Time taken": time.time() - start, 518 } 519 520 if self.custom_metric: 521 scores_verbose["Custom metric"] = custom_metric 522 523 print(scores_verbose) 524 if self.predictions: 525 predictions[name] = y_pred 526 except Exception as exception: 527 if self.ignore_warnings is False: 528 print(name + " model failed to execute") 529 print(exception) 530 531 else: 532 533 results = Parallel(n_jobs=self.n_jobs)( 534 delayed(self.train_model)( 535 name, 536 model, 537 X_train, 538 y_train, 539 X_test, 540 y_test, 541 use_preprocessing=False, 542 **kwargs 543 ) 544 for name, model in tqdm(self.regressors) 545 ) 546 R2 = [ 547 result["r_squared"] 548 for result in results 549 if result is not None 550 ] 551 ADJR2 = [ 552 result["adj_rsquared"] 553 for result in results 554 if result is not None 555 ] 556 RMSE = [ 557 result["rmse"] for result in results if result is not None 558 ] 559 TIME = [ 560 result["time"] for result in results if result is not None 561 ] 562 names = [ 563 result["name"] for result in results if result is not None 564 ] 565 if self.custom_metric: 566 CUSTOM_METRIC = [ 567 result["custom_metric"] 568 for result in results 569 if result is not None 570 ] 571 if self.predictions: 572 predictions = { 573 result["name"]: result["predictions"] 574 for result in results 575 if result is not None 576 } 577 578 scores = { 579 "Model": names, 580 "Adjusted R-Squared": ADJR2, 581 "R-Squared": R2, 582 "RMSE": RMSE, 583 "Time Taken": TIME, 584 } 585 586 if self.custom_metric: 587 scores["Custom metric"] = CUSTOM_METRIC 588 589 scores = pd.DataFrame(scores) 590 scores = scores.sort_values(by=self.sort_by, ascending=True).set_index( 591 "Model" 592 ) 593 594 self.best_model_ = self.models_[scores.index[0]] 595 596 if self.predictions: 597 predictions_df = pd.DataFrame.from_dict(predictions) 598 return scores, predictions_df if self.predictions is True else scores 599 600 def get_best_model(self): 601 """ 602 This function returns the best model pipeline based on the sort_by metric. 603 604 Returns: 605 606 best_model: object, 607 Returns the best model pipeline based on the sort_by metric. 608 609 """ 610 return self.best_model_ 611 612 def provide_models(self, X_train, X_test, y_train, y_test): 613 """ 614 This function returns all the model objects trained in fit function. 615 If fit is not called already, then we call fit and then return the models. 616 617 Parameters: 618 619 X_train : array-like, 620 Training vectors, where rows is the number of samples 621 and columns is the number of features. 622 623 X_test : array-like, 624 Testing vectors, where rows is the number of samples 625 and columns is the number of features. 626 627 y_train : array-like, 628 Training vectors, where rows is the number of samples 629 and columns is the number of features. 630 631 y_test : array-like, 632 Testing vectors, where rows is the number of samples 633 and columns is the number of features. 634 635 Returns: 636 637 models: dict-object, 638 Returns a dictionary with each model pipeline as value 639 with key as name of models. 
640 641 """ 642 if len(self.models_.keys()) == 0: 643 self.fit(X_train, X_test, y_train.ravel(), y_test.values) 644 645 return self.models_ 646 647 def train_model( 648 self, 649 name, 650 regr, 651 X_train, 652 y_train, 653 X_test, 654 y_test, 655 use_preprocessing=False, 656 preprocessor=None, 657 hist=False, 658 **kwargs 659 ): 660 """ 661 Function to train a single regression model and return its results. 662 """ 663 start = time.time() 664 665 try: 666 if hist is False: 667 model = GenericBoostingRegressor( 668 base_model=regr(), verbose=self.verbose, **kwargs 669 ) 670 else: 671 model = HistGenericBoostingRegressor( 672 base_model=regr(), verbose=self.verbose, **kwargs 673 ) 674 675 if use_preprocessing and preprocessor is not None: 676 pipe = Pipeline( 677 steps=[ 678 ("preprocessor", preprocessor), 679 ("regressor", model), 680 ] 681 ) 682 if self.verbose > 0: 683 print( 684 "\n Fitting boosted " 685 + name 686 + " model with preprocessing..." 687 ) 688 pipe.fit(X_train, y_train.ravel()) 689 y_pred = pipe.predict(X_test) 690 fitted_model = pipe 691 else: 692 # Case with no preprocessing 693 if self.verbose > 0: 694 print( 695 "\n Fitting boosted " 696 + name 697 + " model without preprocessing..." 698 ) 699 model.fit(X_train, y_train.ravel()) 700 y_pred = model.predict(X_test) 701 fitted_model = model 702 703 r_squared = r2_score(y_test, y_pred) 704 adj_rsquared = adjusted_rsquared( 705 r_squared, X_test.shape[0], X_test.shape[1] 706 ) 707 rmse = root_mean_squared_error(y_test, y_pred) 708 709 custom_metric = None 710 if self.custom_metric: 711 custom_metric = self.custom_metric(y_test, y_pred) 712 713 return { 714 "name": name, 715 "model": fitted_model, 716 "r_squared": r_squared, 717 "adj_rsquared": adj_rsquared, 718 "rmse": rmse, 719 "custom_metric": custom_metric, 720 "time": time.time() - start, 721 "predictions": y_pred, 722 } 723 724 except Exception as exception: 725 if self.ignore_warnings is False: 726 print(name + " model failed to execute") 727 print(exception) 728 return None
Fitting -- almost -- all the regression algorithms and returning their scores.
Parameters:
verbose: int, optional (default=0)
Any positive number for verbosity.
ignore_warnings: bool, optional (default=True)
    When set to True, warnings related to algorithms that are not able to run are ignored.
custom_metric: function, optional (default=None)
    When a function is provided, models are also evaluated with this custom metric.
predictions: bool, optional (default=False)
    When set to True, the predictions of all the models are returned as a data frame.
sort_by: string, optional (default='RMSE')
    Sort models by a metric. Available options are 'R-Squared', 'Adjusted R-Squared',
    'RMSE', 'Time Taken', and 'Custom metric' (when a custom_metric is provided).
random_state: int, optional (default=42)
    Reproducibility seed.
estimators: list, optional (default='all')
    List of estimator names, or 'all'.
preprocess: bool
    When set to True, preprocessing is applied to the features.
n_jobs: int, optional
    When possible, run models in parallel. For now, only used by individual
    models that support it.
n_layers: int, optional (default=3)
Number of layers of CustomRegressors to be used.
All the other parameters are the same as CustomRegressor's.
Attributes:
models_: dict-object
    Dictionary with model names as keys and fitted model pipelines as values.
best_model_: object
Returns the best model pipeline based on the sort_by metric.
Examples:
```python
import os
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

data = load_diabetes()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True,
                                custom_metric=None, preprocess=True)
models, predictions = regr.fit(X_train, X_test, y_train, y_test)
model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
print(models)
```
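As a complementary sketch (not part of the original example), the snippet below assumes that a scikit-learn regression metric with signature `metric(y_true, y_pred)` can be passed as `custom_metric`; it is reported in an extra 'Custom metric' column of the scores DataFrame.

```python
# Hedged sketch: mean_absolute_error used as custom_metric.
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

regr = ms.LazyBoostingRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=mean_absolute_error,  # evaluated as custom_metric(y_test, y_pred)
    preprocess=True,
)
models, predictions = regr.fit(X_train, X_test, y_train, y_test)
print(models)
```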
def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs)
Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test.
Parameters:
X_train : array-like,
    Training vectors, where rows are samples and columns are features.
X_test : array-like,
    Testing vectors, where rows are samples and columns are features.
y_train : array-like,
    Training target values.
y_test : array-like,
    Testing target values.
hist: bool, optional (default=False)
When set to True, the model is a HistGenericBoostingRegressor.
**kwargs: dict,
Additional parameters to be passed to the GenericBoostingRegressor.
Returns:
scores: Pandas DataFrame
    Returns metrics of all the models in a Pandas DataFrame.
predictions: Pandas DataFrame
    Returns predictions of all the models in a Pandas DataFrame.
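A minimal sketch (not from the original documentation): with `predictions=True`, `fit()` also returns a DataFrame holding each model's test-set predictions.

```python
# Hedged sketch: requesting per-model predictions from fit().
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True, predictions=True)
scores, predictions_df = regr.fit(X_train, X_test, y_train, y_test)
print(scores.head())          # sorted by 'RMSE' (ascending) by default
print(predictions_df.shape)   # one column of test-set predictions per model
```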
def provide_models(self, X_train, X_test, y_train, y_test)
Returns all the model objects trained in the fit function. If fit has not been called yet, it is called first and the models are then returned.
Parameters:
X_train : array-like,
    Training vectors, where rows are samples and columns are features.
X_test : array-like,
    Testing vectors, where rows are samples and columns are features.
y_train : array-like,
    Training target values.
y_test : array-like,
    Testing target values.
Returns:
models: dict-object,
Returns a dictionary with each model pipeline as value
with key as name of models.
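A short usage sketch (not from the original documentation): after fitting, `provide_models` returns the dictionary of fitted models and pipelines keyed by model name; `get_best_model` returns the top model with respect to `sort_by`.

```python
# Hedged sketch: retrieving all fitted regressors and the best one.
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True)
regr.fit(X_train, X_test, y_train, y_test)
model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
print("RandomForestRegressor" in model_dictionary)  # baseline models are included
print(regr.get_best_model())
```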
```python
class MultiTaskRegressor(BaseEstimator, RegressorMixin):
    """
    A class for multi-task regression

    Parameters
    ----------
    regr: object
        A regressor object

    Attributes
    ----------
    objs: list
        A list containing the fitted regressor objects

    """

    def __init__(self, regr):
        assert (
            is_multitask_estimator(regr) == False
        ), "The regressor is already a multi-task regressor"
        self.regr = regr
        self.objs = []

    def fit(self, X, y):
        """
        Fit the regressor

        Parameters
        ----------
        X: array-like
            The input data
        y: array-like
            The target values

        """
        n_tasks = y.shape[1]
        assert n_tasks > 1, "The number of columns in y must be greater than 1"
        self.n_outputs_ = n_tasks
        try:
            for i in range(n_tasks):
                self.regr.fit(X, y.iloc[:, i].values)
                self.objs.append(deepcopy(self.regr))
        except Exception:
            for i in range(n_tasks):
                self.regr.fit(X, y[:, i])
                self.objs.append(deepcopy(self.regr))
        return self

    def predict(self, X):
        """
        Predict the target values

        Parameters
        ----------
        X: array-like
            The input data

        Returns
        -------
        y_pred: array-like
            The predicted target values

        """
        assert len(self.objs) > 0, "The regressor has not been fitted yet"
        y_pred = np.zeros((X.shape[0], self.n_outputs_))
        for i in range(self.n_outputs_):
            y_pred[:, i] = self.objs[i].predict(X)
        return y_pred
```
A class for multi-task regression
Parameters
regr: object
    A regressor object

Attributes

objs: list
    A list containing the fitted regressor objects
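A minimal sketch (not from the original documentation), assuming that `is_multitask_estimator()` treats SVR as a single-output regressor: one copy of the wrapped regressor is fitted per column of the target.

```python
# Hedged sketch: wrapping a single-output regressor for a 2-column target.
import numpy as np
from sklearn.svm import SVR
from mlsauce import MultiTaskRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
Y = np.column_stack([X @ rng.normal(size=5), X @ rng.normal(size=5)])  # two tasks

mtr = MultiTaskRegressor(regr=SVR())
mtr.fit(X, Y)                  # one copy of SVR is fitted per column of Y
print(mtr.predict(X).shape)    # (100, 2)
```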
def fit(self, X, y)
Fit the regressor
Parameters
X: array-like
    The input data

y: array-like
    The target values
def predict(self, X)
Predict the target values
Parameters
X: array-like
    The input data

Returns

y_pred: array-like
    The predicted target values
```python
def get_config():
    """Retrieve current values for configuration set by :func:`set_config`

    Returns
    -------
    config : dict
        Keys are parameter names that can be passed to :func:`set_config`.

    See Also
    --------
    config_context: Context manager for global mlsauce configuration
    set_config: Set global mlsauce configuration
    """
    return _global_config.copy()
```
Retrieve current values for configuration set by set_config()
Returns
config : dict
    Keys are parameter names that can be passed to set_config().
See Also
config_context: Context manager for global mlsauce configuration set_config: Set global mlsauce configuration
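In practice get_config() simply returns a copy of the configuration dict. A quick sketch; the keys and default values shown in the comment are indicative and may vary with the mlsauce version:

import mlsauce

cfg = mlsauce.get_config()
print(cfg)
# e.g. {'assume_finite': False, 'working_memory': 1024,
#       'print_changed_only': True, 'display': 'text'}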
def set_config(
    assume_finite=None,
    working_memory=None,
    print_changed_only=None,
    display=None,
):
    """Set global mlsauce configuration

    .. versionadded:: 0.3.0

    Parameters
    ----------
    assume_finite : bool, optional
        If True, validation for finiteness will be skipped,
        saving time, but leading to potential crashes. If
        False, validation for finiteness will be performed,
        avoiding error. Global default: False.

        .. versionadded:: 0.3.0

    working_memory : int, optional
        If set, mlsauce will attempt to limit the size of temporary arrays
        to this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. Global default: 1024.

        .. versionadded:: 0.3.0

    print_changed_only : bool, optional
        If True, only the parameters that were set to non-default
        values will be printed when printing an estimator. For example,
        ``print(SVC())`` while True will only print 'SVC()', while the default
        behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with
        all the non-changed parameters.

        .. versionadded:: 0.3.0

    display : {'text', 'diagram'}, optional
        If 'diagram', estimators will be displayed as a diagram in a jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. Default is 'text'.

        .. versionadded:: 0.3.0

    See Also
    --------
    config_context: Context manager for global mlsauce configuration
    get_config: Retrieve current values of the global configuration
    """
    if assume_finite is not None:
        _global_config["assume_finite"] = assume_finite
    if working_memory is not None:
        _global_config["working_memory"] = working_memory
    if print_changed_only is not None:
        _global_config["print_changed_only"] = print_changed_only
    if display is not None:
        _global_config["display"] = display
Set global mlsauce configuration

*New in version 0.3.0.*

Parameters

    assume_finite : bool, optional
        If True, validation for finiteness will be skipped, saving time, but
        leading to potential crashes. If False, validation for finiteness
        will be performed, avoiding error. Global default: False.

        *New in version 0.3.0.*

    working_memory : int, optional
        If set, mlsauce will attempt to limit the size of temporary arrays to
        this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. Global default: 1024.

        *New in version 0.3.0.*

    print_changed_only : bool, optional
        If True, only the parameters that were set to non-default values will
        be printed when printing an estimator. For example, print(SVC()) while
        True will only print 'SVC()', while the default behaviour would be to
        print 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed
        parameters.

        *New in version 0.3.0.*

    display : {'text', 'diagram'}, optional
        If 'diagram', estimators will be displayed as a diagram in a jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. Default is 'text'.

        *New in version 0.3.0.*

See Also

    config_context: Context manager for global mlsauce configuration
    get_config: Retrieve current values of the global configuration
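A short sketch combining set_config with get_config, using only the parameters documented above; the specific values are illustrative:

import mlsauce

mlsauce.set_config(assume_finite=True, working_memory=256)
assert mlsauce.get_config()["assume_finite"] is True
assert mlsauce.get_config()["working_memory"] == 256

# Changes made with set_config persist until overwritten; use
# config_context (below) when a temporary change is all that is needed.
mlsauce.set_config(assume_finite=False, working_memory=1024)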
@contextmanager
def config_context(**new_config):
    """Context manager for global mlsauce configuration

    Parameters
    ----------
    assume_finite : bool, optional
        If True, validation for finiteness will be skipped,
        saving time, but leading to potential crashes. If
        False, validation for finiteness will be performed,
        avoiding error. Global default: False.

    working_memory : int, optional
        If set, mlsauce will attempt to limit the size of temporary arrays
        to this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. Global default: 1024.

    print_changed_only : bool, optional
        If True, only the parameters that were set to non-default
        values will be printed when printing an estimator. For example,
        ``print(SVC())`` while True will only print 'SVC()', but would print
        'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters
        when False. Default is True.

        .. versionadded:: 0.3.0

    display : {'text', 'diagram'}, optional
        If 'diagram', estimators will be displayed as a diagram in a jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. Default is 'text'.

        .. versionadded:: 0.3.0

    Notes
    -----
    All settings, not just those presently modified, will be returned to
    their previous values when the context manager is exited. This is not
    thread-safe.

    Examples
    --------
    >>> import mlsauce
    >>> from mlsauce.utils.validation import assert_all_finite
    >>> with mlsauce.config_context(assume_finite=True):
    ...     assert_all_finite([float('nan')])
    >>> with mlsauce.config_context(assume_finite=True):
    ...     with mlsauce.config_context(assume_finite=False):
    ...         assert_all_finite([float('nan')])
    Traceback (most recent call last):
    ...
    ValueError: Input contains NaN, ...

    See Also
    --------
    set_config: Set global mlsauce configuration
    get_config: Retrieve current values of the global configuration
    """
    old_config = get_config().copy()
    set_config(**new_config)

    try:
        yield
    finally:
        set_config(**old_config)
Context manager for global mlsauce configuration

Parameters

    assume_finite : bool, optional
        If True, validation for finiteness will be skipped, saving time, but
        leading to potential crashes. If False, validation for finiteness
        will be performed, avoiding error. Global default: False.

    working_memory : int, optional
        If set, mlsauce will attempt to limit the size of temporary arrays to
        this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. Global default: 1024.

    print_changed_only : bool, optional
        If True, only the parameters that were set to non-default values will
        be printed when printing an estimator. For example, print(SVC()) while
        True will only print 'SVC()', but would print
        'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters
        when False. Default is True.

        *New in version 0.3.0.*

    display : {'text', 'diagram'}, optional
        If 'diagram', estimators will be displayed as a diagram in a jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. Default is 'text'.

        *New in version 0.3.0.*

Notes

    All settings, not just those presently modified, will be returned to their
    previous values when the context manager is exited. This is not
    thread-safe.

Examples

    >>> import mlsauce
    >>> from mlsauce.utils.validation import assert_all_finite
    >>> with mlsauce.config_context(assume_finite=True):
    ...     assert_all_finite([float('nan')])
    >>> with mlsauce.config_context(assume_finite=True):
    ...     with mlsauce.config_context(assume_finite=False):
    ...         assert_all_finite([float('nan')])
    Traceback (most recent call last):
    ...
    ValueError: Input contains NaN, ...

See Also

    set_config: Set global mlsauce configuration
    get_config: Retrieve current values of the global configuration
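Beyond the doctest above, a small sketch showing that the previous settings are restored when the context exits:

import mlsauce

print(mlsauce.get_config()["assume_finite"])      # False (global default)
with mlsauce.config_context(assume_finite=True):
    print(mlsauce.get_config()["assume_finite"])  # True inside the context
print(mlsauce.get_config()["assume_finite"])      # False again after exit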