mlsauce
```python
import sys
import logging
import os

from ._config import get_config, set_config, config_context

logger = logging.getLogger(__name__)


# PEP0440 compatible formatted version, see:
# https://www.python.org/dev/peps/pep-0440/
#
# Generic release markers:
#   X.Y
#   X.Y.Z   # For bugfix releases
#
# Admissible pre-release markers:
#   X.YaN   # Alpha release
#   X.YbN   # Beta release
#   X.YrcN  # Release Candidate
#   X.Y     # Final release
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
#
# __version__ = "0.10.0"


# On OSX, we can get a runtime error due to multiple OpenMP libraries loaded
# simultaneously. This can happen for instance when calling BLAS inside a
# prange. Setting the following environment variable allows multiple OpenMP
# libraries to be loaded. It should not degrade performance since we manually
# take care of potential over-subscription performance issues, in sections of
# the code where nested OpenMP loops can happen, by dynamically reconfiguring
# the inner OpenMP runtime to temporarily disable it while under the scope of
# the outer OpenMP parallel section.
os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "True")

# Workaround issue discovered in intel-openmp 2019.5:
# https://github.com/ContinuumIO/anaconda-issues/issues/11294
os.environ.setdefault("KMP_INIT_AT_FORK", "FALSE")

try:
    # This variable is injected in the __builtins__ by the build
    # process. It is used to enable importing subpackages of mlsauce when
    # the binaries are not built
    # mypy error: Cannot determine type of '__MLSAUCE_SETUP__'
    __MLSAUCE_SETUP__  # type: ignore
except NameError:
    __MLSAUCE_SETUP__ = False

if __MLSAUCE_SETUP__:
    sys.stderr.write("Partial import of mlsauce during the build process.\n")
    # We are not importing the rest of mlsauce during the build
    # process, as it may not be compiled yet
else:
    from .adaopt import AdaOpt
    from .booster import (
        LSBoostClassifier,
        LSBoostRegressor,
        GenericBoostingClassifier,
        GenericBoostingRegressor,
    )
    from .lazybooster import LazyBoostingClassifier, LazyBoostingRegressor
    from .multitaskregressor import MultiTaskRegressor
    from .datasets import download
    from .elasticnet import ElasticNetRegressor
    from .lasso import LassoRegressor
    from .ridge import RidgeRegressor
    from .stump import StumpClassifier

    # from .encoders import corrtarget_encoder

    __all__ = [
        "AdaOpt",
        "LSBoostClassifier",
        "GenericBoostingClassifier",
        "GenericBoostingRegressor",
        "StumpClassifier",
        "ElasticNetRegressor",
        "LassoRegressor",
        "LSBoostRegressor",
        "RidgeRegressor",
        "LazyBoostingClassifier",
        "LazyBoostingRegressor",
        "MultiTaskRegressor",
        # Other imports
        # "corrtarget_encoder",
        "download",
        # Non-modules:
        "get_config",
        "set_config",
        "config_context",
    ]


def setup_module(module):
    """Fixture for the tests to assure globally controllable seeding of RNGs"""
    import os
    import numpy as np
    import random

    # Check if a random seed exists in the environment, if not create one.
    _random_seed = os.environ.get("MLSAUCE_SEED", None)
    if _random_seed is None:
        _random_seed = np.random.uniform() * np.iinfo(np.int32).max
    _random_seed = int(_random_seed)
    print("I: Seeding RNGs with %r" % _random_seed)
    np.random.seed(_random_seed)
    random.seed(_random_seed)
```
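The `__init__` above only wires up exports and OpenMP workarounds. A minimal sanity check after installation could look like the sketch below; the names come from `__all__` above, while the exact return value of `get_config()` is an assumption (a dict, scikit-learn style).

```python
# Minimal sketch, not from the original docs: inspect the public API of mlsauce.
import mlsauce as ms

print(ms.get_config())                   # current global configuration (assumed dict)
clf = ms.AdaOpt()                        # estimators are exposed at the top level
booster = ms.GenericBoostingClassifier()
print(type(clf).__name__, type(booster).__name__)
```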
```python
class AdaOpt(BaseEstimator, ClassifierMixin):
    """AdaOpt classifier."""

    def __init__(
        self,
        n_iterations=50,
        learning_rate=0.3,
        reg_lambda=0.1,
        reg_alpha=0.5,
        eta=0.01,
        gamma=0.01,
        k=3,
        tolerance=0,
        n_clusters=0,
        batch_size=100,
        row_sample=0.8,
        type_dist="euclidean-f",
        n_jobs=None,
        verbose=0,
        cache=True,
        n_clusters_input=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        seed=123,
    ):
        if n_clusters_input > 0:
            assert clustering_method in (
                "kmeans",
                "gmm",
            ), "`clustering_method` must be in ('kmeans', 'gmm')"
            assert cluster_scaling in (
                "standard",
                "robust",
                "minmax",
            ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')"

        assert type_dist in (
            "euclidean",
            "manhattan",
            "euclidean-f",
            "cosine",
        ), "must have: `type_dist` in ('euclidean', 'manhattan', 'euclidean-f', 'cosine')"

        self.n_iterations = n_iterations
        self.learning_rate = learning_rate
        self.reg_lambda = reg_lambda
        self.reg_alpha = reg_alpha
        self.eta = eta
        self.gamma = gamma
        self.k = k
        self.tolerance = tolerance
        self.n_clusters = n_clusters
        self.batch_size = batch_size
        self.row_sample = row_sample
        self.type_dist = type_dist
        self.n_jobs = n_jobs
        self.cache = cache
        self.verbose = verbose
        self.n_clusters_input = n_clusters_input
        self.clustering_method = clustering_method
        self.cluster_scaling = cluster_scaling
        self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None
        self.seed = seed
```
AdaOpt classifier.
Attributes:
n_iterations: int
number of iterations of the optimizer at training time.
learning_rate: float
controls the speed of the optimizer at training time.
reg_lambda: float
L2 regularization parameter for successive errors in the optimizer
(at training time).
reg_alpha: float
L1 regularization parameter for successive errors in the optimizer
(at training time).
eta: float
controls the slope in gradient descent (at training time).
gamma: float
controls the step size in gradient descent (at training time).
k: int
number of nearest neighbors selected at test time for classification.
tolerance: float
controls early stopping in gradient descent (at training time).
n_clusters: int
number of clusters, if MiniBatch k-means is used at test time
(for faster prediction).
batch_size: int
size of the batch, if MiniBatch k-means is used at test time
(for faster prediction).
row_sample: float
percentage of rows chosen from training set (by stratified subsampling,
for faster prediction).
type_dist: str
distance used for finding the nearest neighbors; currently `euclidean-f`
(Euclidean distances computed on the whole matrix at once), `euclidean` (Euclidean
distances computed row by row), `cosine` (cosine distance).
n_jobs: int
number of CPUs for parallel processing (default: None)
verbose: int
progress bar for parallel processing (yes = 1) or not (no = 0)
cache: boolean
whether the nearest neighbors are cached, for faster retrieval in
subsequent calls.
n_clusters_input: int
number of clusters (a priori) for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
seed: int
reproducibility seed for subsampling and clustering.
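Since `AdaOpt` follows the scikit-learn estimator API (`fit`/`predict`/`predict_proba`/`score`), a quick start might look like the sketch below; the dataset and hyperparameter values are illustrative, not taken from the original documentation.

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import mlsauce as ms

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

clf = ms.AdaOpt(n_iterations=50, k=3, type_dist="euclidean-f", seed=123)
clf.fit(X_train, y_train)
print(clf.predict(X_test)[:10])      # predicted class labels
print(clf.score(X_test, y_test))     # accuracy, via sklearn's ClassifierMixin
```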
```python
    def fit(self, X, y, **kwargs):
        """Fit AdaOpt to training data (X, y)."""

        if self.n_clusters_input > 0:
            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
                cluster(
                    X,
                    n_clusters=self.n_clusters_input,
                    method=self.clustering_method,
                    type_scaling=self.cluster_scaling,
                    training=True,
                    seed=self.seed,
                )
            )
            X = np.column_stack((X.copy(), clustered_X))

        if self.row_sample < 1:
            index_subsample = subsample(
                y, row_sample=self.row_sample, seed=self.seed
            )
            y_ = y[index_subsample]
            X_ = X[index_subsample, :]
        else:
            y_ = pickle.loads(pickle.dumps(y, -1))
            X_ = pickle.loads(pickle.dumps(X, -1))

        n, p = X_.shape

        n_classes = len(np.unique(y_))

        assert n == len(y_), "must have X.shape[0] == len(y)"

        res = adaoptc.fit_adaopt(
            X=np.asarray(X_).astype(np.float64),
            y=np.asarray(y_).astype(np.int64),
            n_iterations=self.n_iterations,
            n_X=n,
            p_X=p,
            n_classes=n_classes,
            learning_rate=self.learning_rate,
            reg_lambda=self.reg_lambda,
            reg_alpha=self.reg_alpha,
            eta=self.eta,
            gamma=self.gamma,
            tolerance=self.tolerance,
        )

        self.probs_training = res["probs"]
        self.training_accuracy = res["training_accuracy"]
        self.alphas = res["alphas"]
        self.n_iterations = res["n_iterations"]
        self.scaled_X_train = np.array(res["scaled_X_train"], dtype=np.float64)
        self.n_classes_ = len(np.unique(y))  # for compatibility with sklearn
        return self
```
Fit AdaOpt to training data (X, y)
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
**kwargs: additional parameters to be passed to self.cook_training_set.
Returns:
self: object.
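A short sketch of the training-time options described above (stratified row subsampling and input clustering); the values are illustrative assumptions, not documented defaults.

```python
from sklearn.datasets import load_wine
import mlsauce as ms

X, y = load_wine(return_X_y=True)

clf = ms.AdaOpt(
    row_sample=0.8,              # stratified subsampling of the training rows
    n_clusters_input=2,          # cluster the features before fitting
    clustering_method="kmeans",
    cluster_scaling="standard",
    seed=123,
)
clf.fit(X, y)
print(clf.training_accuracy)     # stored by fit(), see the source above
```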
```python
    def predict(self, X, **kwargs):
        """Predict test data X."""
        return np.argmax(self.predict_proba(X, **kwargs), axis=1)
```
Predict test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test samples, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to `predict_proba`
Returns:
model predictions: {array-like}
```python
    def predict_proba(self, X, **kwargs):
        """Predict probabilities for test data X."""

        n_train, p_train = self.scaled_X_train.shape

        if self.n_clusters_input > 0:
            X = np.column_stack(
                (
                    X.copy(),
                    cluster(
                        X,
                        training=False,
                        scaler=self.scaler_,
                        label_encoder=self.label_encoder_,
                        clusterer=self.clusterer_,
                        seed=self.seed,
                    ),
                )
            )

        n_test = X.shape[0]

        if self.n_jobs is None:
            return adaoptc.predict_proba_adaopt(
                X_test=np.asarray(X, order="C").astype(np.float64),
                scaled_X_train=np.asarray(
                    self.scaled_X_train, order="C"
                ).astype(np.float64),
                n_test=n_test,
                n_train=n_train,
                probs_train=self.probs_training,
                k=self.k,
                n_clusters=self.n_clusters,
                batch_size=self.batch_size,
                type_dist=self.type_dist,
                cache=self.cache,
                seed=self.seed,
            )

        # parallel: self.n_jobs is not None
        assert self.type_dist in (
            "euclidean",
            "manhattan",
            "cosine",
        ), "must have: `self.type_dist` in ('euclidean', 'manhattan', 'cosine')"

        scaled_X_test = X / norm(X, ord=2, axis=1)[:, None]

        # the three distance types share the same per-row logic;
        # only the distance kernel differs
        dist_func = {
            "euclidean": adaoptc.distance_to_mat_euclidean2,
            "manhattan": adaoptc.distance_to_mat_manhattan2,
            "cosine": adaoptc.distance_to_mat_cosine2,
        }[self.type_dist]

        @delayed
        @wrap_non_picklable_objects
        def multiproc_func(i):
            dists_test_i = dist_func(
                np.asarray(scaled_X_test.astype(np.float64), order="C")[i, :],
                np.asarray(self.scaled_X_train.astype(np.float64), order="C"),
                np.zeros(n_train),
                n_train,
                p_train,
            )

            kmin_test_i = adaoptc.find_kmin_x(
                dists_test_i, n_x=n_train, k=self.k, cache=self.cache
            )

            weights_test_i = adaoptc.calculate_weights(kmin_test_i[0])

            probs_test_i = adaoptc.calculate_probs(
                kmin_test_i[1], self.probs_training
            )

            return adaoptc.average_probs(
                probs=probs_test_i, weights=weights_test_i
            )

        if self.verbose == 1:
            res = Parallel(n_jobs=self.n_jobs, prefer="threads")(
                multiproc_func(m) for m in tqdm(range(n_test))
            )
        else:
            res = Parallel(n_jobs=self.n_jobs, prefer="threads")(
                multiproc_func(m) for m in range(n_test)
            )

        return np.asarray(res)
```
Predict probabilities for test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test samples, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to
self.cook_test_set
Returns:
probability estimates for test data: {array-like}
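When `n_jobs` is set, `predict_proba` dispatches one job per test row and, as asserted in the source above, `type_dist` must then be 'euclidean', 'manhattan' or 'cosine' (the vectorized 'euclidean-f' shortcut is for the sequential path only). A hedged sketch:

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import mlsauce as ms

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

clf = ms.AdaOpt(k=5, type_dist="euclidean", n_jobs=2, verbose=1)
clf.fit(X_train, y_train)
probs = clf.predict_proba(X_test)   # array of shape (n_test, n_classes)
print(probs[:3])
```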
```python
class LSBoostClassifier(BaseEstimator, ClassifierMixin):
    """LSBoost classifier."""

    def __init__(
        self,
        n_estimators=100,
        learning_rate=0.1,
        n_hidden_features=5,
        reg_lambda=0.1,
        alpha=0.5,
        row_sample=1,
        col_sample=1,
        dropout=0,
        tolerance=1e-4,
        direct_link=1,
        verbose=1,
        seed=123,
        backend="cpu",
        solver="ridge",
        activation="relu",
        n_clusters=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        degree=None,
        weights_distr="uniform",
        base_model=None,
        hist=False,
        bins="auto",
    ):
        self.base_model = base_model
        self.hist = hist
        self.bins = bins
        self.hist_bins_ = None

        if n_clusters > 0:
            assert clustering_method in (
                "kmeans",
                "gmm",
            ), "`clustering_method` must be in ('kmeans', 'gmm')"
            assert cluster_scaling in (
                "standard",
                "robust",
                "minmax",
            ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')"

        assert backend in (
            "cpu",
            "gpu",
            "tpu",
        ), "`backend` must be in ('cpu', 'gpu', 'tpu')"

        assert solver in (
            "ridge",
            "lasso",
            "enet",
        ), "`solver` must be in ('ridge', 'lasso', 'enet')"

        sys_platform = platform.system()

        if (sys_platform == "Windows") and (backend in ("gpu", "tpu")):
            warnings.warn(
                "No GPU/TPU computing on Windows yet, backend set to 'cpu'"
            )
            backend = "cpu"

        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.n_hidden_features = n_hidden_features
        self.reg_lambda = reg_lambda
        assert alpha >= 0 and alpha <= 1, "`alpha` must be in [0, 1]"
        self.alpha = alpha
        self.row_sample = row_sample
        self.col_sample = col_sample
        self.dropout = dropout
        self.tolerance = tolerance
        self.direct_link = direct_link
        self.verbose = verbose
        self.seed = seed
        self.backend = backend
        self.obj = None
        self.solver = solver
        self.activation = activation
        self.n_clusters = n_clusters
        self.clustering_method = clustering_method
        self.cluster_scaling = cluster_scaling
        self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None
        self.degree = degree
        self.poly_ = None
        self.weights_distr = weights_distr
        if self.backend in ("gpu", "tpu"):
            check_and_install("jax")
            check_and_install("jaxlib")

    def update(self, X, y, eta=0.9):
        """Update model with new data.

        Args:

            X: {array-like}, shape = [n_samples=1, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: float = [n_samples=1]
                Target value.

            eta: float
                Inverse power applied to number of observations
                (defines a learning rate).

        Returns:

            self: object.
        """

        if isinstance(X, pd.DataFrame):
            X = X.values

        if self.degree is not None:
            X = self.poly_.transform(X)

        if self.n_clusters > 0:
            X = np.column_stack(
                (
                    X,
                    cluster(
                        X,
                        training=False,
                        scaler=self.scaler_,
                        label_encoder=self.label_encoder_,
                        clusterer=self.clusterer_,
                        seed=self.seed,
                    ),
                )
            )

        self.obj = boosterc.update_booster(
            self.obj,
            np.asarray(X, order="C"),
            np.asarray(y, order="C").ravel(),
            eta,
        )

        return self
```
LSBoost classifier.
Attributes:
n_estimators: int
number of boosting iterations.
learning_rate: float
controls the learning speed at training time.
n_hidden_features: int
number of nodes in successive hidden layers.
reg_lambda: float
L2 regularization parameter for successive errors in the optimizer
(at training time).
alpha: float
compromise between L1 and L2 regularization (must be in [0, 1]),
for `solver` == 'enet'.
row_sample: float
percentage of rows chosen from the training set.
col_sample: float
percentage of columns chosen from the training set.
dropout: float
percentage of hidden-layer nodes dropped out at training time.
tolerance: float
controls early stopping in gradient descent (at training time).
direct_link: bool
indicates whether the original features are included (True) in the model's
fitting or not (False).
verbose: int
whether to show a progress bar (1) or not (0).
seed: int
reproducibility seed for the hidden-layer weights, clustering and dropout.
backend: str
type of backend; must be in ('cpu', 'gpu', 'tpu')
solver: str
type of 'weak' learner; currently in ('ridge', 'lasso', 'enet').
'enet' is a combination of 'ridge' and 'lasso' called Elastic Net.
activation: str
activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
n_clusters: int
number of clusters for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
degree: int
degree of features interactions to include in the model
weights_distr: str
distribution of weights for constructing the model's hidden layer;
currently 'uniform', 'gaussian'
hist: bool
indicates whether histogram features are used or not (default is False)
bins: int or str
number of bins for histogram features (same as numpy.histogram, default is 'auto')
Examples:
```python
import numpy as np
from sklearn.datasets import load_digits, load_breast_cancer, load_wine, load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from time import time
from os import chdir
from sklearn import metrics
import os
import mlsauce as ms
print("\n")
print("GenericBoosting Decision tree -----")
print("\n")
print("\n")
print("breast_cancer data -----")
# data 1
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target
# split data into training set and test set
np.random.seed(15029)
X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=0.2)
clf = DecisionTreeRegressor()
clf2 = KernelRidge()
obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf, tolerance=1e-2, n_clusters=2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 2
print("\n")
print("wine data -----")
wine = load_wine()
Z = wine.data
t = wine.target
np.random.seed(879423)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf, n_clusters=3)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 3
print("\n")
print("iris data -----")
iris = load_iris()
Z = iris.data
t = iris.target
np.random.seed(734563)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
print("\n")
print("GenericBoosting KRR -----")
print("\n")
obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf2, tolerance=1e-2, n_clusters=2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 2
print("\n")
print("wine data -----")
wine = load_wine()
Z = wine.data
t = wine.target
np.random.seed(879423)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
obj = ms.GenericBoostingClassifier(clf2, n_clusters=3)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
# data 3
print("\n")
print("iris data -----")
iris = load_iris()
Z = iris.data
t = iris.target
np.random.seed(734563)
X_train, X_test, y_train, y_test = train_test_split(Z, t,
test_size=0.2)
obj = ms.GenericBoostingClassifier(clf2)
print(obj.get_params())
start = time()
obj.fit(X_train, y_train)
print(time()-start)
start = time()
print(obj.score(X_test, y_test))
print(time()-start)
print(obj.obj['loss'])
```
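The example above exercises `GenericBoostingClassifier`; a minimal sketch using `LSBoostClassifier` itself follows (illustrative settings, not from the original docs).

```python
import mlsauce as ms
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15029
)

obj = ms.LSBoostClassifier(n_estimators=100, learning_rate=0.1, solver="ridge")
obj.fit(X_train, y_train)
print(obj.score(X_test, y_test))
```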
```python
    def fit(self, X, y, **kwargs):
        """Fit Booster (classifier) to training data (X, y)."""

        if isinstance(X, pd.DataFrame):
            X = X.values

        if self.hist:
            X, self.hist_bins_ = get_histo_features(X)

        if isinstance(y, pd.Series):
            y = y.values.ravel()
        else:
            y = y.ravel()

        if self.degree is not None:
            assert isinstance(self.degree, int), "`degree` must be an integer"
            self.poly_ = PolynomialFeatures(
                degree=self.degree, interaction_only=True, include_bias=False
            )
            X = self.poly_.fit_transform(X)

        if self.n_clusters > 0:
            clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = (
                cluster(
                    X,
                    n_clusters=self.n_clusters,
                    method=self.clustering_method,
                    type_scaling=self.cluster_scaling,
                    training=True,
                    seed=self.seed,
                )
            )
            X = np.column_stack((X, clustered_X))

        self.obj = boosterc.fit_booster_classifier(
            np.asarray(X, order="C"),
            np.asarray(y, order="C"),
            n_estimators=self.n_estimators,
            learning_rate=self.learning_rate,
            n_hidden_features=self.n_hidden_features,
            reg_lambda=self.reg_lambda,
            alpha=self.alpha,
            row_sample=self.row_sample,
            col_sample=self.col_sample,
            dropout=self.dropout,
            tolerance=self.tolerance,
            direct_link=self.direct_link,
            verbose=self.verbose,
            seed=self.seed,
            backend=self.backend,
            solver=self.solver,
            activation=self.activation,
            obj=self.base_model,
        )

        self.classes_ = np.unique(y)  # for compatibility with sklearn
        self.n_classes_ = len(self.classes_)  # for compatibility with sklearn
        self.n_estimators = self.obj["n_estimators"]
        return self
```
Fit Booster (classifier) to training data (X, y)
Args:
X: {array-like}, shape = [n_samples, n_features]
Training vectors, where n_samples is the number
of samples and n_features is the number of features.
y: array-like, shape = [n_samples]
Target values.
**kwargs: additional parameters to be passed to self.cook_training_set.
Returns:
self: object.
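As the source above shows, `fit` also accepts pandas inputs and can derive histogram features (`hist=True`, `bins` as in `numpy.histogram`). A sketch under those assumptions:

```python
import mlsauce as ms
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True, as_frame=True)  # DataFrame / Series

obj = ms.LSBoostClassifier(n_estimators=50, hist=True, bins="auto")
obj.fit(X, y)
print(obj.score(X, y))   # in-sample accuracy, for illustration only
```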
```python
    def predict(self, X, **kwargs):
        """Predict test data X."""
        return np.argmax(self.predict_proba(X, **kwargs), axis=1)
```
Predict test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test samples, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to `predict_proba`
Returns:
model predictions: {array-like}
```python
    def predict_proba(self, X, **kwargs):
        """Predict probabilities for test data X."""

        if isinstance(X, pd.DataFrame):
            X = X.values

        if self.hist:
            X = get_histo_features(X, bins=self.hist_bins_)

        if self.degree is not None:
            X = self.poly_.transform(X)

        if self.n_clusters > 0:
            X = np.column_stack(
                (
                    X,
                    cluster(
                        X,
                        training=False,
                        scaler=self.scaler_,
                        label_encoder=self.label_encoder_,
                        clusterer=self.clusterer_,
                        seed=self.seed,
                    ),
                )
            )
        try:
            return boosterc.predict_proba_booster_classifier(
                self.obj, np.asarray(X, order="C")
            )
        except ValueError:
            pass
```
Predict probabilities for test data X.
Args:
X: {array-like}, shape = [n_samples, n_features]
Test samples, where n_samples is the number
of samples and n_features is the number of features.
**kwargs: additional parameters to be passed to
self.cook_test_set
Returns:
probability estimates for test data: {array-like}
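The class also exposes an `update(X, y, eta=0.9)` method (see the class source above) for refitting the booster with one new observation at a time; `eta` is an inverse power applied to the number of observations and acts as a decaying learning rate. The sketch below passes each new row as a 1-D slice, which is an assumption about the expected shape.

```python
import mlsauce as ms
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_new, y_train, y_new = train_test_split(X, y, test_size=0.1, random_state=0)

obj = ms.LSBoostClassifier(n_estimators=50)
obj.fit(X_train, y_train)

# feed held-out observations one at a time
for i in range(X_new.shape[0]):
    obj = obj.update(X_new[i, :], y_new[i], eta=0.9)
```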
```python
class GenericBoostingClassifier(LSBoostClassifier):
    """Generic Boosting classifier (using any regressor as base learner)."""

    def __init__(
        self,
        base_model=ExtraTreeRegressor(),
        n_estimators=100,
        learning_rate=0.1,
        n_hidden_features=5,
        reg_lambda=0.1,
        alpha=0.5,
        row_sample=1,
        col_sample=1,
        dropout=0,
        tolerance=1e-4,
        direct_link=1,
        verbose=1,
        seed=123,
        backend="cpu",
        solver="ridge",
        activation="relu",
        n_clusters=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        degree=None,
        weights_distr="uniform",
        hist=False,
        bins="auto",
    ):
        self.base_model = base_model
        self.hist = hist
        self.bins = bins
        self.hist_bins_ = None

        super().__init__(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            n_hidden_features=n_hidden_features,
            reg_lambda=reg_lambda,
            alpha=alpha,
            row_sample=row_sample,
            col_sample=col_sample,
            dropout=dropout,
            tolerance=tolerance,
            direct_link=direct_link,
            verbose=verbose,
            seed=seed,
            backend=backend,
            solver=solver,
            activation=activation,
            n_clusters=n_clusters,
            clustering_method=clustering_method,
            cluster_scaling=cluster_scaling,
            degree=degree,
            weights_distr=weights_distr,
            base_model=self.base_model,
        )
```
Generic Boosting classifier (using any regressor as base learner).
Attributes:
base_model: object
base learner (default is ExtraTreeRegressor) to be boosted.
n_estimators: int
number of boosting iterations.
learning_rate: float
controls the learning speed at training time.
n_hidden_features: int
number of nodes in successive hidden layers.
reg_lambda: float
L2 regularization parameter for successive errors in the optimizer
(at training time).
alpha: float
compromise between L1 and L2 regularization (must be in [0, 1]),
for `solver` == 'enet'.
row_sample: float
percentage of rows chosen from the training set.
col_sample: float
percentage of columns chosen from the training set.
dropout: float
percentage of hidden-layer nodes dropped out at training time.
tolerance: float
controls early stopping in gradient descent (at training time).
direct_link: bool
indicates whether the original features are included (True) in the model's
fitting or not (False).
verbose: int
whether to show a progress bar (1) or not (0).
seed: int
reproducibility seed for the hidden-layer weights, clustering and dropout.
backend: str
type of backend; must be in ('cpu', 'gpu', 'tpu')
solver: str
type of 'weak' learner; currently in ('ridge', 'lasso', 'enet').
'enet' is a combination of 'ridge' and 'lasso' called Elastic Net.
activation: str
activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
n_clusters: int
number of clusters for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
degree: int
degree of features interactions to include in the model
weights_distr: str
distribution of weights for constructing the model's hidden layer;
currently 'uniform', 'gaussian'
hist: bool
indicates whether histogram features are used or not (default is False)
bins: int or str
number of bins for histogram features (same as numpy.histogram, default is 'auto')
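A sketch with a non-default base learner; any scikit-learn regressor should work in place of the default `ExtraTreeRegressor` (here `KernelRidge`, mirroring the LSBoost example above; the other settings are illustrative).

```python
import mlsauce as ms
from sklearn.kernel_ridge import KernelRidge
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=879423
)

clf = ms.GenericBoostingClassifier(KernelRidge(), n_estimators=50, learning_rate=0.1)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
print(clf.obj["loss"])   # per-iteration training losses, as in the example above
```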
```python
class GenericBoostingRegressor(LSBoostRegressor):
    """Generic Boosting regressor."""

    def __init__(
        self,
        base_model=ExtraTreeRegressor(),
        n_estimators=100,
        learning_rate=0.1,
        n_hidden_features=5,
        reg_lambda=0.1,
        alpha=0.5,
        row_sample=1,
        col_sample=1,
        dropout=0,
        tolerance=1e-4,
        direct_link=1,
        verbose=1,
        seed=123,
        backend="cpu",
        solver="ridge",
        activation="relu",
        type_pi=None,
        replications=None,
        kernel=None,
        n_clusters=0,
        clustering_method="kmeans",
        cluster_scaling="standard",
        degree=None,
        weights_distr="uniform",
        hist=False,
        bins="auto",
    ):
        self.base_model = base_model
        self.hist = hist
        self.bins = bins
        self.hist_bins_ = None

        super().__init__(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            n_hidden_features=n_hidden_features,
            reg_lambda=reg_lambda,
            alpha=alpha,
            row_sample=row_sample,
            col_sample=col_sample,
            dropout=dropout,
            tolerance=tolerance,
            direct_link=direct_link,
            verbose=verbose,
            seed=seed,
            backend=backend,
            solver=solver,
            activation=activation,
            type_pi=type_pi,
            replications=replications,
            kernel=kernel,
            n_clusters=n_clusters,
            clustering_method=clustering_method,
            cluster_scaling=cluster_scaling,
            degree=degree,
            weights_distr=weights_distr,
            base_model=self.base_model,
        )
```
Generic Boosting regressor.
Attributes:
base_model: object
base learner (default is ExtraTreeRegressor) to be boosted.
n_estimators: int
number of boosting iterations.
learning_rate: float
controls the learning speed at training time.
n_hidden_features: int
number of nodes in successive hidden layers.
reg_lambda: float
L2 regularization parameter for successive errors in the optimizer
(at training time).
alpha: float
compromise between L1 and L2 regularization (must be in [0, 1]),
for `solver` == 'enet'
row_sample: float
percentage of rows chosen from the training set.
col_sample: float
percentage of columns chosen from the training set.
dropout: float
percentage of hidden-layer nodes dropped out at training time.
tolerance: float
controls early stopping in gradient descent (at training time).
direct_link: bool
indicates whether the original features are included (True) in the model's
fitting or not (False).
verbose: int
whether to show a progress bar (1) or not (0).
seed: int
reproducibility seed for the hidden-layer weights, clustering and dropout.
backend: str
type of backend; must be in ('cpu', 'gpu', 'tpu')
solver: str
type of 'weak' learner; currently in ('ridge', 'lasso')
activation: str
activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
type_pi: str.
type of prediction interval; currently "kde" (default) or "bootstrap".
Used only in `self.predict`, for `self.replications` > 0 and `self.kernel`
in ('gaussian', 'tophat'). Default is `None`.
replications: int.
number of replications (if needed) for predictive simulation.
Used only in `self.predict`, for `self.kernel` in ('gaussian',
'tophat') and `self.type_pi = 'kde'`. Default is `None`.
n_clusters: int
number of clusters for clustering the features
clustering_method: str
clustering method: currently 'kmeans', 'gmm'
cluster_scaling: str
scaling method for clustering: currently 'standard', 'robust', 'minmax'
degree: int
degree of features interactions to include in the model
weights_distr: str
distribution of weights for constructing the model's hidden layer;
either 'uniform' or 'gaussian'
hist: bool
whether to use histogram features or not
bins: int or str
number of bins for histogram features (same as numpy.histogram, default is 'auto')
12class StumpClassifier(BaseEstimator, ClassifierMixin): 13 """Stump classifier. 14 15 Attributes: 16 17 bins: int 18 Number of histogram bins; as in numpy.histogram. 19 """ 20 21 def __init__(self, bins="auto"): 22 self.bins = bins 23 self.obj = None 24 25 def fit(self, X, y, sample_weight=None, **kwargs): 26 """Fit Stump to training data (X, y) 27 28 Args: 29 30 X: {array-like}, shape = [n_samples, n_features] 31 Training vectors, where n_samples is the number 32 of samples and n_features is the number of features. 33 34 y: array-like, shape = [n_samples] 35 Target values. 36 37 sample_weight: array_like, shape = [n_samples] 38 Observations weights. 39 40 Returns: 41 42 self: object. 43 """ 44 45 if sample_weight is None: 46 self.obj = stumpc.fit_stump_classifier( 47 X=np.asarray(X, order="C"), 48 y=np.asarray(y, order="C"), 49 bins=self.bins, 50 ) 51 52 return self 53 54 self.obj = stumpc.fit_stump_classifier( 55 X=np.asarray(X, order="C"), 56 y=np.asarray(y, order="C"), 57 sample_weight=np.ravel(sample_weight, order="C"), 58 bins=self.bins, 59 ) 60 self.n_classes_ = len(np.unique(y)) # for compatibility with sklearn 61 return self 62 63 def predict(self, X, **kwargs): 64 """Predict test data X. 65 66 Args: 67 68 X: {array-like}, shape = [n_samples, n_features] 69 Training vectors, where n_samples is the number 70 of samples and n_features is the number of features. 71 72 **kwargs: additional parameters to be passed to `predict_proba` 73 74 75 Returns: 76 77 model predictions: {array-like} 78 """ 79 80 return np.argmax(self.predict_proba(X, **kwargs), axis=1) 81 82 def predict_proba(self, X, **kwargs): 83 """Predict probabilities for test data X. 84 85 Args: 86 87 X: {array-like}, shape = [n_samples, n_features] 88 Training vectors, where n_samples is the number 89 of samples and n_features is the number of features. 90 91 **kwargs: additional parameters to be passed to 92 self.cook_test_set 93 94 Returns: 95 96 probability estimates for test data: {array-like} 97 """ 98 99 return stumpc.predict_proba_stump_classifier( 100 self.obj, np.asarray(X, order="C") 101 )
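As a quick illustration of the `fit`/`predict_proba`/`predict` interface shown above, here is a minimal sketch on a binary classification dataset. It assumes `mlsauce` and `scikit-learn` are installed; the uniform `sample_weight` vector is only there to show the argument being passed through.

```python
# Minimal sketch (assumes mlsauce and scikit-learn are installed).
import numpy as np
import mlsauce as ms
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

clf = ms.StumpClassifier(bins="auto")

# Optional per-observation weights, as accepted by fit's sample_weight argument.
weights = np.ones(len(y_train))
clf.fit(X_train, y_train, sample_weight=weights)

proba = clf.predict_proba(X_test)   # class probability estimates
labels = clf.predict(X_test)        # argmax of the probabilities
print(proba.shape, labels[:5])
```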
19class ElasticNetRegressor(BaseEstimator, RegressorMixin): 20 """Elasticnet. 21 22 Attributes: 23 24 reg_lambda: float 25 regularization parameter. 26 27 alpha: float 28 compromise between L1 and L2 regularization (must be in [0, 1]), 29 for `solver` == 'enet'. 30 31 backend: str 32 type of backend; must be in ('cpu', 'gpu', 'tpu') 33 34 """ 35 36 def __init__(self, reg_lambda=0.1, alpha=0.5, backend="cpu"): 37 assert backend in ( 38 "cpu", 39 "gpu", 40 "tpu", 41 ), "`backend` must be in ('cpu', 'gpu', 'tpu')" 42 43 sys_platform = platform.system() 44 45 if (sys_platform == "Windows") and (backend in ("gpu", "tpu")): 46 warnings.warn( 47 "No GPU/TPU computing on Windows yet, backend set to 'cpu'" 48 ) 49 backend = "cpu" 50 51 self.reg_lambda = reg_lambda 52 self.alpha = alpha 53 self.backend = backend 54 if self.backend in ("gpu", "tpu"): 55 check_and_install("jax") 56 check_and_install("jaxlib") 57 58 def fit(self, X, y, **kwargs): 59 """Fit matrixops (classifier) to training data (X, y) 60 61 Args: 62 63 X: {array-like}, shape = [n_samples, n_features] 64 Training vectors, where n_samples is the number 65 of samples and n_features is the number of features. 66 67 y: array-like, shape = [n_samples] 68 Target values. 69 70 **kwargs: additional parameters to be passed to self.cook_training_set. 71 72 Returns: 73 74 self: object. 75 76 """ 77 fit_result = fit_elasticnet(X, y, lam=self.reg_lambda, alpha=self.alpha) 78 self.coef_ = fit_result.coef_ 79 self.y_train_mean = fit_result.y_train_mean 80 self.scaler = fit_result.scaler 81 self.converged = fit_result.converged 82 return self 83 84 def predict(self, X, **kwargs): 85 """Predict test data X. 86 87 Args: 88 89 X: {array-like}, shape = [n_samples, n_features] 90 Training vectors, where n_samples is the number 91 of samples and n_features is the number of features. 92 93 **kwargs: additional parameters to be passed to `predict_proba` 94 95 Returns: 96 97 model predictions: {array-like} 98 99 """ 100 return predict_elasticnet(X, self)
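A minimal usage sketch of `ElasticNetRegressor`, following the constructor and `fit`/`predict` above; it assumes `mlsauce` and `scikit-learn` are installed, and the dataset and parameter values are illustrative only.

```python
# Minimal sketch (assumes mlsauce and scikit-learn are installed).
import numpy as np
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# alpha blends L1 and L2 penalties; reg_lambda sets the overall strength.
enet = ms.ElasticNetRegressor(reg_lambda=0.1, alpha=0.5, backend="cpu")
enet.fit(X_train, y_train)
print("converged:", enet.converged)
print("test RMSE:", np.sqrt(np.mean((enet.predict(X_test) - y_test) ** 2)))
```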
24class LassoRegressor(BaseEstimator, RegressorMixin): 25 """Lasso. 26 27 Attributes: 28 29 reg_lambda: float 30 L1 regularization parameter. 31 32 max_iter: int 33 number of iterations of lasso shooting algorithm. 34 35 tol: float 36 tolerance for convergence of lasso shooting algorithm. 37 38 backend: str 39 type of backend; must be in ('cpu', 'gpu', 'tpu'). 40 41 """ 42 43 def __init__(self, reg_lambda=0.1, max_iter=10, tol=1e-3, backend="cpu"): 44 assert backend in ( 45 "cpu", 46 "gpu", 47 "tpu", 48 ), "`backend` must be in ('cpu', 'gpu', 'tpu')" 49 50 sys_platform = platform.system() 51 52 if (sys_platform == "Windows") and (backend in ("gpu", "tpu")): 53 warnings.warn( 54 "No GPU/TPU computing on Windows yet, backend set to 'cpu'" 55 ) 56 backend = "cpu" 57 58 self.reg_lambda = reg_lambda 59 self.max_iter = max_iter 60 self.tol = tol 61 self.backend = backend 62 if self.backend in ("gpu", "tpu"): 63 check_and_install("jax") 64 check_and_install("jaxlib") 65 66 def fit(self, X, y, **kwargs): 67 """Fit matrixops (classifier) to training data (X, y) 68 69 Args: 70 71 X: {array-like}, shape = [n_samples, n_features] 72 Training vectors, where n_samples is the number 73 of samples and n_features is the number of features. 74 75 y: array-like, shape = [n_samples] 76 Target values. 77 78 **kwargs: additional parameters to be passed to self.cook_training_set. 79 80 Returns: 81 82 self: object. 83 84 """ 85 86 self.ym, centered_y = mo.center_response(y) 87 self.xm = X.mean(axis=0) 88 self.xsd = X.std(axis=0) 89 self.xsd[self.xsd == 0] = 1 90 X_ = (X - self.xm[None, :]) / self.xsd[None, :] 91 XX = mo.crossprod(X_, backend=self.backend) 92 Xy = mo.crossprod(X_, centered_y, backend=self.backend) 93 XX2 = 2 * XX 94 Xy2 = 2 * Xy 95 96 if self.backend == "cpu": 97 # beta0, _, _, _ = np.linalg.lstsq(X_, centered_y, rcond=None) 98 beta0 = get_beta(X_, centered_y) 99 if len(np.asarray(y).shape) == 1: 100 res = mo.get_beta_1D( 101 beta0=np.asarray(beta0), 102 XX2=np.asarray(XX2), 103 Xy2=np.asarray(Xy2), 104 reg_lambda=self.reg_lambda, 105 max_iter=self.max_iter, 106 tol=self.tol, 107 ) 108 self.beta = res[0] 109 return self 110 111 res = mo.get_beta_2D( 112 beta0=np.asarray(beta0), 113 XX2=np.asarray(XX2), 114 Xy2=np.asarray(Xy2), 115 reg_lambda=self.reg_lambda, 116 max_iter=self.max_iter, 117 tol=self.tol, 118 ) 119 self.beta = res[0] 120 return self 121 122 invXX = jinv(XX + self.reg_lambda * jnp.eye(X_.shape[1])) 123 beta0 = mo.safe_sparse_dot(invXX, Xy, backend=self.backend) 124 if len(np.asarray(y).shape) == 1: 125 res = mo.get_beta_1D( 126 beta0=np.asarray(beta0), 127 XX2=np.asarray(XX2), 128 Xy2=np.asarray(Xy2), 129 reg_lambda=self.reg_lambda, 130 max_iter=self.max_iter, 131 tol=self.tol, 132 ) 133 self.beta = res[0] 134 return self 135 136 res = mo.get_beta_2D( 137 beta0=np.asarray(beta0), 138 XX2=np.asarray(XX2), 139 Xy2=np.asarray(Xy2), 140 reg_lambda=self.reg_lambda, 141 max_iter=self.max_iter, 142 tol=self.tol, 143 ) 144 self.beta = res[0] 145 return self 146 147 def predict(self, X, **kwargs): 148 """Predict test data X. 149 150 Args: 151 152 X: {array-like}, shape = [n_samples, n_features] 153 Training vectors, where n_samples is the number 154 of samples and n_features is the number of features. 
155 156 **kwargs: additional parameters to be passed to `predict_proba` 157 158 159 Returns: 160 161 model predictions: {array-like} 162 163 """ 164 X_ = (X - self.xm[None, :]) / self.xsd[None, :] 165 166 if self.backend == "cpu": 167 if isinstance(self.ym, float): 168 return self.ym + mo.safe_sparse_dot(X_, self.beta) 169 return self.ym[None, :] + mo.safe_sparse_dot(X_, self.beta) 170 171 # if self.backend in ("gpu", "tpu"): 172 if isinstance(self.ym, float): 173 return self.ym + mo.safe_sparse_dot( 174 X_, self.beta, backend=self.backend 175 ) 176 return self.ym[None, :] + mo.safe_sparse_dot( 177 X_, self.beta, backend=self.backend 178 )
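A minimal usage sketch of `LassoRegressor`, based on the constructor and `fit`/`predict` above. It assumes `mlsauce` and `scikit-learn` are installed; the larger `max_iter` and tighter `tol` than the defaults are illustrative choices for the shooting algorithm, not tuned values.

```python
# Minimal sketch (assumes mlsauce and scikit-learn are installed).
import numpy as np
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# max_iter and tol control the lasso "shooting" (coordinate descent) iterations.
lasso = ms.LassoRegressor(reg_lambda=0.1, max_iter=100, tol=1e-4, backend="cpu")
lasso.fit(X_train, y_train)
print("nonzero coefficients:", int(np.sum(lasso.beta != 0)))
print("test RMSE:", np.sqrt(np.mean((lasso.predict(X_test) - y_test) ** 2)))
```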
19class LSBoostRegressor(BaseEstimator, RegressorMixin): 20 """LSBoost regressor. 21 22 Attributes: 23 24 n_estimators: int 25 number of boosting iterations. 26 27 learning_rate: float 28 controls the learning speed at training time. 29 30 n_hidden_features: int 31 number of nodes in successive hidden layers. 32 33 reg_lambda: float 34 L2 regularization parameter for successive errors in the optimizer 35 (at training time). 36 37 alpha: float 38 compromise between L1 and L2 regularization (must be in [0, 1]), 39 for `solver` == 'enet' 40 41 row_sample: float 42 percentage of rows chosen from the training set. 43 44 col_sample: float 45 percentage of columns chosen from the training set. 46 47 dropout: float 48 percentage of nodes dropped from the training set. 49 50 tolerance: float 51 controls early stopping in gradient descent (at training time). 52 53 direct_link: bool 54 indicates whether the original features are included (True) in model's 55 fitting or not (False). 56 57 verbose: int 58 progress bar (yes = 1) or not (no = 0) (currently). 59 60 seed: int 61 reproducibility seed for nodes_sim=='uniform', clustering and dropout. 62 63 backend: str 64 type of backend; must be in ('cpu', 'gpu', 'tpu') 65 66 solver: str 67 type of 'weak' learner; currently in ('ridge', 'lasso') 68 69 activation: str 70 activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh' 71 72 type_pi: str. 73 type of prediction interval; currently "kde" (default) or "bootstrap". 74 Used only in `self.predict`, for `self.replications` > 0 and `self.kernel` 75 in ('gaussian', 'tophat'). Default is `None`. 76 77 replications: int. 78 number of replications (if needed) for predictive simulation. 79 Used only in `self.predict`, for `self.kernel` in ('gaussian', 80 'tophat') and `self.type_pi = 'kde'`. Default is `None`. 
81 82 n_clusters: int 83 number of clusters for clustering the features 84 85 clustering_method: str 86 clustering method: currently 'kmeans', 'gmm' 87 88 cluster_scaling: str 89 scaling method for clustering: currently 'standard', 'robust', 'minmax' 90 91 degree: int 92 degree of features interactions to include in the model 93 94 weights_distr: str 95 distribution of weights for constructing the model's hidden layer; 96 either 'uniform' or 'gaussian' 97 98 hist: bool 99 whether to use histogram features or not 100 101 bins: int or str 102 number of bins for histogram features (same as numpy.histogram, default is 'auto') 103 104 Examples: 105 106 ```python 107 import subprocess 108 import sys 109 import os 110 111 subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib"]) 112 113 import mlsauce as ms 114 import numpy as np 115 import matplotlib.pyplot as plt 116 from sklearn.datasets import load_diabetes 117 from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score 118 from sklearn.tree import DecisionTreeRegressor 119 from time import time 120 from os import chdir 121 from sklearn import metrics 122 123 regr = DecisionTreeRegressor() 124 125 diabetes = load_diabetes() 126 X = diabetes.data 127 y = diabetes.target 128 # split data into training test and test set 129 np.random.seed(15029) 130 X_train, X_test, y_train, y_test = train_test_split(X, y, 131 test_size=0.2) 132 133 obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9) 134 print(obj.get_params()) 135 start = time() 136 obj.fit(X_train, y_train) 137 print(time()-start) 138 start = time() 139 print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test)))) 140 print(time()-start) 141 142 print(obj.obj['loss']) 143 144 obj = ms.GenericBoostingRegressor(regr, col_sample=0.9, row_sample=0.9, n_clusters=2) 145 print(obj.get_params()) 146 start = time() 147 obj.fit(X_train, y_train) 148 print(time()-start) 149 start = time() 150 print(np.sqrt(np.mean(np.square(obj.predict(X_test) - y_test)))) 151 print(time()-start) 152 153 print(obj.obj['loss']) 154 ``` 155 156 """ 157 158 def __init__( 159 self, 160 n_estimators=100, 161 learning_rate=0.1, 162 n_hidden_features=5, 163 reg_lambda=0.1, 164 alpha=0.5, 165 row_sample=1, 166 col_sample=1, 167 dropout=0, 168 tolerance=1e-4, 169 direct_link=1, 170 verbose=1, 171 seed=123, 172 backend="cpu", 173 solver="ridge", 174 activation="relu", 175 type_pi=None, 176 replications=None, 177 kernel=None, 178 n_clusters=0, 179 clustering_method="kmeans", 180 cluster_scaling="standard", 181 degree=None, 182 weights_distr="uniform", 183 base_model=None, 184 hist=False, 185 bins="auto", 186 ): 187 188 self.base_model = base_model 189 self.hist = hist 190 self.bins = bins 191 self.hist_bins_ = None 192 193 if n_clusters > 0: 194 assert clustering_method in ( 195 "kmeans", 196 "gmm", 197 ), "`clustering_method` must be in ('kmeans', 'gmm')" 198 assert cluster_scaling in ( 199 "standard", 200 "robust", 201 "minmax", 202 ), "`cluster_scaling` must be in ('standard', 'robust', 'minmax')" 203 204 assert backend in ( 205 "cpu", 206 "gpu", 207 "tpu", 208 ), "`backend` must be in ('cpu', 'gpu', 'tpu')" 209 210 assert solver in ( 211 "ridge", 212 "lasso", 213 "enet", 214 ), "`solver` must be in ('ridge', 'lasso', 'enet')" 215 216 sys_platform = platform.system() 217 218 if (sys_platform == "Windows") and (backend in ("gpu", "tpu")): 219 warnings.warn( 220 "No GPU/TPU computing on Windows yet, backend set to 'cpu'" 221 ) 222 backend = "cpu" 223 224 
self.n_estimators = n_estimators 225 self.learning_rate = learning_rate 226 self.n_hidden_features = n_hidden_features 227 self.reg_lambda = reg_lambda 228 assert alpha >= 0 and alpha <= 1, "`alpha` must be in [0, 1]" 229 self.alpha = alpha 230 self.row_sample = row_sample 231 self.col_sample = col_sample 232 self.dropout = dropout 233 self.tolerance = tolerance 234 self.direct_link = direct_link 235 self.verbose = verbose 236 self.seed = seed 237 self.backend = backend 238 self.obj = None 239 self.solver = solver 240 self.activation = activation 241 self.type_pi = type_pi 242 self.replications = replications 243 self.kernel = kernel 244 self.n_clusters = n_clusters 245 self.clustering_method = clustering_method 246 self.cluster_scaling = cluster_scaling 247 self.scaler_, self.label_encoder_, self.clusterer_ = None, None, None 248 self.degree = degree 249 self.poly_ = None 250 self.weights_distr = weights_distr 251 if self.backend in ("gpu", "tpu"): 252 check_and_install("jax") 253 check_and_install("jaxlib") 254 255 def fit(self, X, y, **kwargs): 256 """Fit Booster (regressor) to training data (X, y) 257 258 Args: 259 260 X: {array-like}, shape = [n_samples, n_features] 261 Training vectors, where n_samples is the number 262 of samples and n_features is the number of features. 263 264 y: array-like, shape = [n_samples] 265 Target values. 266 267 **kwargs: additional parameters to be passed to self.cook_training_set. 268 269 Returns: 270 271 self: object. 272 """ 273 274 if isinstance(X, pd.DataFrame): 275 X = X.values 276 277 if self.hist == True: 278 X, self.hist_bins_ = get_histo_features(X) 279 280 if isinstance(y, pd.Series): 281 y = y.values.ravel() 282 else: 283 y = y.ravel() 284 285 if self.degree is not None: 286 assert isinstance(self.degree, int), "`degree` must be an integer" 287 self.poly_ = PolynomialFeatures( 288 degree=self.degree, interaction_only=True, include_bias=False 289 ) 290 X = self.poly_.fit_transform(X) 291 292 if self.n_clusters > 0: 293 clustered_X, self.scaler_, self.label_encoder_, self.clusterer_ = ( 294 cluster( 295 X, 296 n_clusters=self.n_clusters, 297 method=self.clustering_method, 298 type_scaling=self.cluster_scaling, 299 training=True, 300 seed=self.seed, 301 ) 302 ) 303 X = np.column_stack((X, clustered_X)) 304 305 self.obj = boosterc.fit_booster_regressor( 306 X=np.asarray(X, order="C"), 307 y=np.asarray(y, order="C"), 308 n_estimators=self.n_estimators, 309 learning_rate=self.learning_rate, 310 n_hidden_features=self.n_hidden_features, 311 reg_lambda=self.reg_lambda, 312 alpha=self.alpha, 313 row_sample=self.row_sample, 314 col_sample=self.col_sample, 315 dropout=self.dropout, 316 tolerance=self.tolerance, 317 direct_link=self.direct_link, 318 verbose=self.verbose, 319 seed=self.seed, 320 backend=self.backend, 321 solver=self.solver, 322 activation=self.activation, 323 obj=self.base_model, 324 ) 325 326 self.n_estimators = self.obj["n_estimators"] 327 328 self.X_ = X 329 330 self.y_ = y 331 332 return self 333 334 def predict(self, X, level=95, method=None, histo=False, **kwargs): 335 """Predict values for test data X. 336 337 Args: 338 339 X: {array-like}, shape = [n_samples, n_features] 340 Training vectors, where n_samples is the number 341 of samples and n_features is the number of features. 
342 343 level: int 344 Level of confidence (default = 95) 345 346 method: str 347 `None`, or 'splitconformal', 'localconformal' 348 prediction (if you specify `return_pi = True`) 349 350 histo: bool 351 whether to use histogram features or not 352 353 **kwargs: additional parameters to be passed to 354 self.cook_test_set 355 356 Returns: 357 358 predicted values estimates for test data: {array-like} 359 """ 360 361 if isinstance(X, pd.DataFrame): 362 X = X.values 363 364 if self.hist == True: 365 X = get_histo_features(X, bins=self.hist_bins_) 366 367 if self.degree is not None: 368 X = self.poly_.transform(X) 369 370 if self.n_clusters > 0: 371 X = np.column_stack( 372 ( 373 X, 374 cluster( 375 X, 376 training=False, 377 scaler=self.scaler_, 378 label_encoder=self.label_encoder_, 379 clusterer=self.clusterer_, 380 seed=self.seed, 381 ), 382 ) 383 ) 384 if "return_pi" in kwargs: 385 assert method in ( 386 "splitconformal", 387 "localconformal", 388 ), "method must be in ('splitconformal', 'localconformal')" 389 self.pi = PredictionInterval( 390 obj=self, 391 method=method, 392 level=level, 393 type_pi=self.type_pi, 394 replications=self.replications, 395 kernel=self.kernel, 396 ) 397 self.pi.fit(self.X_, self.y_) 398 self.X_ = None 399 self.y_ = None 400 preds = self.pi.predict(X, return_pi=True) 401 return preds 402 # print(f"\n in predict self: {self} \n") 403 # print(f"\n in predict self.obj: {self.obj} \n") 404 # try: 405 return boosterc.predict_booster_regressor( 406 self.obj, np.asarray(X, order="C") 407 ) 408 # except ValueError: 409 # pass 410 411 def update(self, X, y, eta=0.9): 412 """Update model with new data. 413 414 Args: 415 416 X: {array-like}, shape = [n_samples=1, n_features] 417 Training vectors, where n_samples is the number 418 of samples and n_features is the number of features. 419 420 y: float = [n_samples=1] 421 Target value. 422 423 eta: float 424 Inverse power applied to number of observations 425 (defines a learning rate). 426 427 Returns: 428 429 self: object. 430 """ 431 432 if isinstance(X, pd.DataFrame): 433 X = X.values 434 435 if self.degree is not None: 436 X = self.poly_.transform(X) 437 438 if self.n_clusters > 0: 439 X = np.column_stack( 440 ( 441 X, 442 cluster( 443 X, 444 training=False, 445 scaler=self.scaler_, 446 label_encoder=self.label_encoder_, 447 clusterer=self.clusterer_, 448 seed=self.seed, 449 ), 450 ) 451 ) 452 453 self.obj = boosterc.update_booster( 454 self.obj, np.asarray(X, order="C"), np.asarray(y, order="C"), eta 455 ) 456 457 return self
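A minimal usage sketch of `LSBoostRegressor`, following the `fit` and `predict` methods shown above: point predictions first, then split-conformal prediction intervals via `return_pi=True` with `method="splitconformal"`, which is the code path visible in `predict`. It assumes `mlsauce` and `scikit-learn` are installed; the dataset and hyperparameter values are illustrative, and the exact fields of the returned prediction-interval object are not detailed in this listing, so it is only printed.

```python
# Minimal sketch (assumes mlsauce and scikit-learn are installed).
import numpy as np
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=15029
)

obj = ms.LSBoostRegressor(
    n_estimators=100,
    learning_rate=0.1,
    n_hidden_features=5,
    solver="ridge",
    col_sample=0.9,
    row_sample=0.9,
    verbose=0,
)
obj.fit(X_train, y_train)
print("test RMSE:", np.sqrt(np.mean((obj.predict(X_test) - y_test) ** 2)))

# Split-conformal prediction intervals (level=95 by default). The structure of
# the returned object comes from the internal PredictionInterval helper and is
# not documented in this listing, hence only its type is printed here.
preds_pi = obj.predict(X_test, method="splitconformal", return_pi=True)
print(type(preds_pi))
```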
23class RidgeRegressor(BaseEstimator, RegressorMixin): 24 """Ridge. 25 26 Attributes: 27 28 reg_lambda: float 29 regularization parameter. 30 31 backend: str 32 type of backend; must be in ('cpu', 'gpu', 'tpu') 33 34 """ 35 36 def __init__(self, reg_lambda=0.1, backend="cpu"): 37 assert backend in ( 38 "cpu", 39 "gpu", 40 "tpu", 41 ), "`backend` must be in ('cpu', 'gpu', 'tpu')" 42 43 sys_platform = platform.system() 44 45 if (sys_platform == "Windows") and (backend in ("gpu", "tpu")): 46 warnings.warn( 47 "No GPU/TPU computing on Windows yet, backend set to 'cpu'" 48 ) 49 backend = "cpu" 50 51 self.reg_lambda = reg_lambda 52 self.backend = backend 53 if self.backend in ("gpu", "tpu"): 54 check_and_install("jax") 55 check_and_install("jaxlib") 56 57 def fit(self, X, y, **kwargs): 58 """Fit matrixops (classifier) to training data (X, y) 59 60 Args: 61 62 X: {array-like}, shape = [n_samples, n_features] 63 Training vectors, where n_samples is the number 64 of samples and n_features is the number of features. 65 66 y: array-like, shape = [n_samples] 67 Target values. 68 69 **kwargs: additional parameters to be passed to self.cook_training_set. 70 71 Returns: 72 73 self: object. 74 75 """ 76 self.ym, centered_y = mo.center_response(y) 77 self.xm = X.mean(axis=0) 78 self.xsd = X.std(axis=0) 79 self.xsd[self.xsd == 0] = 1 # avoid division by zero 80 X_ = (X - self.xm[None, :]) / self.xsd[None, :] 81 82 if self.backend == "cpu": 83 if len(centered_y.shape) <= 1: 84 eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1]) 85 X_ = np.row_stack((X_, eye_term)) 86 y_ = np.concatenate((centered_y, np.zeros(X.shape[1]))) 87 # self.beta, _, _, _ = np.linalg.lstsq(X_, y_, rcond=None) 88 self.beta = get_beta(X_, y_) 89 else: 90 try: 91 eye_term = np.sqrt(self.reg_lambda) * np.eye(X.shape[1]) 92 X_ = np.row_stack((X_, eye_term)) 93 y_ = np.row_stack( 94 ( 95 centered_y, 96 np.zeros((eye_term.shape[0], centered_y.shape[1])), 97 ) 98 ) 99 # self.beta, _, _, _ = np.linalg.lstsq(X_, y_, rcond=None) 100 self.beta = get_beta(X_, y_) 101 except Exception: 102 x = inv( 103 mo.crossprod(X_) + self.reg_lambda * np.eye(X_.shape[1]) 104 ) 105 hat_matrix = mo.tcrossprod(x, X_) 106 self.beta = mo.safe_sparse_dot(hat_matrix, centered_y) 107 return self 108 109 x = jinv( 110 mo.crossprod(X_, backend=self.backend) 111 + self.reg_lambda * jnp.eye(X_.shape[1]) 112 ) 113 hat_matrix = mo.tcrossprod(x, X_, backend=self.backend) 114 self.beta = mo.safe_sparse_dot( 115 hat_matrix, centered_y, backend=self.backend 116 ) 117 return self 118 119 def predict(self, X, **kwargs): 120 """Predict test data X. 121 122 Args: 123 124 X: {array-like}, shape = [n_samples, n_features] 125 Training vectors, where n_samples is the number 126 of samples and n_features is the number of features. 127 128 **kwargs: additional parameters to be passed to `predict_proba` 129 130 Returns: 131 132 model predictions: {array-like} 133 134 """ 135 X_ = (X - self.xm[None, :]) / self.xsd[None, :] 136 137 if self.backend == "cpu": 138 if isinstance(self.ym, float): 139 return self.ym + mo.safe_sparse_dot(X_, self.beta) 140 return self.ym[None, :] + mo.safe_sparse_dot(X_, self.beta) 141 142 # if self.backend in ("gpu", "tpu"): 143 if isinstance(self.ym, float): 144 return self.ym + mo.safe_sparse_dot( 145 X_, self.beta, backend=self.backend 146 ) 147 return self.ym[None, :] + mo.safe_sparse_dot( 148 X_, self.beta, backend=self.backend 149 )
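A minimal usage sketch of `RidgeRegressor`, based on the constructor and `fit`/`predict` above; it assumes `mlsauce` and `scikit-learn` are installed, and the dataset and penalty value are illustrative.

```python
# Minimal sketch (assumes mlsauce and scikit-learn are installed).
import numpy as np
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# reg_lambda is the L2 penalty; features are centered and scaled internally.
ridge = ms.RidgeRegressor(reg_lambda=0.1, backend="cpu")
ridge.fit(X_train, y_train)
print("test RMSE:", np.sqrt(np.mean((ridge.predict(X_test) - y_test) ** 2)))
```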
89class LazyBoostingClassifier(ClassifierMixin): 90 """ 91 92 Fitting -- almost -- all the classification algorithms 93 and returning their scores. 94 95 Parameters: 96 97 verbose: int, optional (default=0) 98 Any positive number for verbosity. 99 100 ignore_warnings: bool, optional (default=True) 101 When set to True, the warning related to algorigms that are not 102 able to run are ignored. 103 104 custom_metric: function, optional (default=None) 105 When function is provided, models are evaluated based on the custom 106 evaluation metric provided. 107 108 predictions: bool, optional (default=False) 109 When set to True, the predictions of all the models models are 110 returned as data frame. 111 112 sort_by: string, optional (default='Accuracy') 113 Sort models by a metric. Available options are 'Accuracy', 114 'Balanced Accuracy', 'ROC AUC', 'F1 Score' or a custom metric 115 identified by its name and provided by custom_metric. 116 117 random_state: int, optional (default=42) 118 Reproducibiility seed. 119 120 estimators: list, optional (default='all') 121 list of Estimators names or just 'all' for > 90 classifiers 122 (default='all') 123 124 preprocess: bool, preprocessing is done when set to True 125 126 n_jobs: int, when possible, run in parallel 127 For now, only used by individual models that support it. 128 129 n_layers: int, optional (default=3) 130 Number of layers of GenericBoostingClassifiers to be used. 131 132 All the other parameters are the same as GenericBoostingClassifier's. 133 134 Attributes: 135 136 models_: dict-object 137 Returns a dictionary with each model pipeline as value 138 with key as name of models. 139 140 best_model_: object 141 Returns the best model pipeline. 142 143 Examples 144 145 ```python 146 import os 147 import mlsauce as ms 148 from sklearn.datasets import load_breast_cancer, load_iris, load_wine, load_digits 149 from sklearn.model_selection import train_test_split 150 from time import time 151 152 load_models = [load_breast_cancer, load_iris, load_wine] 153 154 for model in load_models: 155 156 data = model() 157 X = data.data 158 y= data.target 159 160 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 13) 161 162 clf = ms.LazyBoostingClassifier(verbose=1, ignore_warnings=False, 163 custom_metric=None, preprocess=False) 164 165 start = time() 166 models, predictioms = clf.fit(X_train, X_test, y_train, y_test) 167 print(f"\nElapsed: {time() - start} seconds\n") 168 169 print(models) 170 ``` 171 172 """ 173 174 def __init__( 175 self, 176 verbose=0, 177 ignore_warnings=True, 178 custom_metric=None, 179 predictions=False, 180 sort_by="Accuracy", 181 random_state=42, 182 estimators="all", 183 preprocess=False, 184 n_jobs=None, 185 ): 186 self.verbose = verbose 187 self.ignore_warnings = ignore_warnings 188 self.custom_metric = custom_metric 189 self.predictions = predictions 190 self.sort_by = sort_by 191 self.models_ = {} 192 self.best_model_ = None 193 self.random_state = random_state 194 self.estimators = estimators 195 self.preprocess = preprocess 196 self.n_jobs = n_jobs 197 198 def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): 199 """Fit classifiers to X_train and y_train, predict and score on X_test, 200 y_test. 201 202 Parameters: 203 204 X_train: array-like, 205 Training vectors, where rows is the number of samples 206 and columns is the number of features. 
207 208 X_test: array-like, 209 Testing vectors, where rows is the number of samples 210 and columns is the number of features. 211 212 y_train: array-like, 213 Training vectors, where rows is the number of samples 214 and columns is the number of features. 215 216 y_test: array-like, 217 Testing vectors, where rows is the number of samples 218 and columns is the number of features. 219 220 hist: bool, optional (default=False) 221 When set to True, the model is a GenericBoostingClassifier. 222 223 **kwargs: dict, 224 Additional arguments to be passed to the fit GenericBoostingClassifier. 225 226 Returns: 227 228 scores: Pandas DataFrame 229 Returns metrics of all the models in a Pandas DataFrame. 230 231 predictions: Pandas DataFrame 232 Returns predictions of all the models in a Pandas DataFrame. 233 """ 234 Accuracy = [] 235 B_Accuracy = [] 236 ROC_AUC = [] 237 F1 = [] 238 names = [] 239 TIME = [] 240 predictions = {} 241 242 if self.custom_metric is not None: 243 CUSTOM_METRIC = [] 244 245 if isinstance(X_train, np.ndarray): 246 X_train = pd.DataFrame(X_train) 247 X_test = pd.DataFrame(X_test) 248 249 numeric_features = X_train.select_dtypes(include=[np.number]).columns 250 categorical_features = X_train.select_dtypes(include=["object"]).columns 251 252 categorical_low, categorical_high = get_card_split( 253 X_train, categorical_features 254 ) 255 256 if self.preprocess is True: 257 preprocessor = ColumnTransformer( 258 transformers=[ 259 ("numeric", numeric_transformer, numeric_features), 260 ( 261 "categorical_low", 262 categorical_transformer_low, 263 categorical_low, 264 ), 265 ( 266 "categorical_high", 267 categorical_transformer_high, 268 categorical_high, 269 ), 270 ] 271 ) 272 273 # baseline models 274 try: 275 baseline_names = ["RandomForestClassifier", "XGBClassifier"] 276 baseline_models = [RandomForestClassifier(), xgb.XGBClassifier()] 277 except Exception as exception: 278 baseline_names = ["RandomForestClassifier"] 279 baseline_models = [RandomForestClassifier()] 280 281 if self.verbose > 0: 282 print("\n Fitting baseline models...") 283 for name, model in tqdm(zip(baseline_names, baseline_models)): 284 start = time.time() 285 try: 286 model.fit(X_train, y_train) 287 self.models_[name] = model 288 y_pred = model.predict(X_test) 289 accuracy = accuracy_score(y_test, y_pred, normalize=True) 290 b_accuracy = balanced_accuracy_score(y_test, y_pred) 291 f1 = f1_score(y_test, y_pred, average="weighted") 292 try: 293 roc_auc = roc_auc_score(y_test, y_pred) 294 except Exception as exception: 295 roc_auc = None 296 if self.ignore_warnings is False: 297 print("ROC AUC couldn't be calculated for " + name) 298 print(exception) 299 names.append(name) 300 Accuracy.append(accuracy) 301 B_Accuracy.append(b_accuracy) 302 ROC_AUC.append(roc_auc) 303 F1.append(f1) 304 TIME.append(time.time() - start) 305 if self.custom_metric is not None: 306 custom_metric = self.custom_metric(y_test, y_pred) 307 CUSTOM_METRIC.append(custom_metric) 308 if self.verbose > 0: 309 if self.custom_metric is not None: 310 print( 311 { 312 "Model": name, 313 "Accuracy": accuracy, 314 "Balanced Accuracy": b_accuracy, 315 "ROC AUC": roc_auc, 316 "F1 Score": f1, 317 self.custom_metric.__name__: custom_metric, 318 "Time taken": time.time() - start, 319 } 320 ) 321 else: 322 print( 323 { 324 "Model": name, 325 "Accuracy": accuracy, 326 "Balanced Accuracy": b_accuracy, 327 "ROC AUC": roc_auc, 328 "F1 Score": f1, 329 "Time taken": time.time() - start, 330 } 331 ) 332 if self.predictions: 333 predictions[name] = y_pred 334 
except Exception as exception: 335 if self.ignore_warnings is False: 336 print(name + " model failed to execute") 337 print(exception) 338 339 if self.estimators == "all": 340 self.classifiers = REGRESSORS + MTASKREGRESSORS 341 else: 342 self.classifiers = [ 343 ("GBoostClassifier(" + est[0] + ")", est[1]()) 344 for est in all_estimators() 345 if ( 346 issubclass(est[1], RegressorMixin) 347 and (est[0] in self.estimators) 348 ) 349 ] + [ 350 ( 351 "GBoostClassifier(MultiTask(" + est[0] + "))", 352 partial(MultiTaskRegressor, regr=est[1]()), 353 ) 354 for est in all_estimators() 355 if ( 356 issubclass(est[1], RegressorMixin) 357 and (est[0] in self.estimators) 358 ) 359 ] 360 361 if self.preprocess is True: 362 363 if self.n_jobs is None: 364 365 for name, model in tqdm(self.classifiers): # do parallel exec 366 367 other_args = ( 368 {} 369 ) # use this trick for `random_state` too --> refactor 370 try: 371 if ( 372 "n_jobs" in model().get_params().keys() 373 and name.find("LogisticRegression") == -1 374 ): 375 other_args["n_jobs"] = self.n_jobs 376 except Exception: 377 pass 378 379 start = time.time() 380 381 try: 382 if "random_state" in model().get_params().keys(): 383 if hist is False: 384 fitted_clf = GenericBoostingClassifier( 385 {**other_args, **kwargs}, 386 verbose=self.verbose, 387 base_model=model( 388 random_state=self.random_state 389 ), 390 ) 391 else: 392 fitted_clf = GenericBoostingClassifier( 393 {**other_args, **kwargs}, 394 verbose=self.verbose, 395 base_model=model( 396 random_state=self.random_state 397 ), 398 hist=True, 399 ) 400 401 else: 402 if hist is False: 403 fitted_clf = GenericBoostingClassifier( 404 base_model=model(**kwargs), 405 verbose=self.verbose, 406 ) 407 else: 408 fitted_clf = GenericBoostingClassifier( 409 base_model=model(**kwargs), 410 verbose=self.verbose, 411 hist=True, 412 ) 413 414 if self.verbose > 0: 415 print("\n Fitting boosted " + name + " model...") 416 fitted_clf.fit(X_train, y_train) 417 418 pipe = Pipeline( 419 [ 420 ("preprocessor", preprocessor), 421 ("classifier", fitted_clf), 422 ] 423 ) 424 425 if self.verbose > 0: 426 print("\n Fitting boosted " + name + " model...") 427 pipe.fit(X_train, y_train) 428 self.models_[name] = pipe 429 y_pred = pipe.predict(X_test) 430 accuracy = accuracy_score( 431 y_test, y_pred, normalize=True 432 ) 433 b_accuracy = balanced_accuracy_score(y_test, y_pred) 434 f1 = f1_score(y_test, y_pred, average="weighted") 435 try: 436 roc_auc = roc_auc_score(y_test, y_pred) 437 except Exception as exception: 438 roc_auc = None 439 if self.ignore_warnings is False: 440 print( 441 "ROC AUC couldn't be calculated for " + name 442 ) 443 print(exception) 444 names.append(name) 445 Accuracy.append(accuracy) 446 B_Accuracy.append(b_accuracy) 447 ROC_AUC.append(roc_auc) 448 F1.append(f1) 449 TIME.append(time.time() - start) 450 if self.custom_metric is not None: 451 custom_metric = self.custom_metric(y_test, y_pred) 452 CUSTOM_METRIC.append(custom_metric) 453 if self.verbose > 0: 454 if self.custom_metric is not None: 455 print( 456 { 457 "Model": name, 458 "Accuracy": accuracy, 459 "Balanced Accuracy": b_accuracy, 460 "ROC AUC": roc_auc, 461 "F1 Score": f1, 462 self.custom_metric.__name__: custom_metric, 463 "Time taken": time.time() - start, 464 } 465 ) 466 else: 467 print( 468 { 469 "Model": name, 470 "Accuracy": accuracy, 471 "Balanced Accuracy": b_accuracy, 472 "ROC AUC": roc_auc, 473 "F1 Score": f1, 474 "Time taken": time.time() - start, 475 } 476 ) 477 if self.predictions: 478 predictions[name] = y_pred 479 
except Exception as exception: 480 if self.ignore_warnings is False: 481 print(name + " model failed to execute") 482 print(exception) 483 484 else: 485 486 # train_model(self, name, model, X_train, y_train, X_test, y_test, 487 # use_preprocessing=False, preprocessor=None, 488 # **kwargs): 489 results = Parallel(n_jobs=self.n_jobs)( 490 delayed(self.train_model)( 491 name, 492 model, 493 X_train, 494 y_train, 495 X_test, 496 y_test, 497 use_preprocessing=True, 498 preprocessor=preprocessor, 499 **kwargs 500 ) 501 for name, model in tqdm(self.classifiers) 502 ) 503 Accuracy = [res["accuracy"] for res in results] 504 B_Accuracy = [res["balanced_accuracy"] for res in results] 505 ROC_AUC = [res["roc_auc"] for res in results] 506 F1 = [res["f1"] for res in results] 507 names = [res["name"] for res in results] 508 TIME = [res["time"] for res in results] 509 if self.custom_metric is not None: 510 CUSTOM_METRIC = [res["custom_metric"] for res in results] 511 if self.predictions: 512 predictions = { 513 res["name"]: res["predictions"] for res in results 514 } 515 516 else: # no preprocessing 517 518 if self.n_jobs is None: 519 520 for name, model in tqdm(self.classifiers): # do parallel exec 521 start = time.time() 522 try: 523 if "random_state" in model().get_params().keys(): 524 if hist is False: 525 fitted_clf = GenericBoostingClassifier( 526 base_model=model( 527 random_state=self.random_state 528 ), 529 verbose=self.verbose, 530 **kwargs 531 ) 532 else: 533 fitted_clf = GenericBoostingClassifier( 534 base_model=model( 535 random_state=self.random_state 536 ), 537 verbose=self.verbose, 538 hist=True, 539 **kwargs 540 ) 541 542 else: 543 if hist is False: 544 fitted_clf = GenericBoostingClassifier( 545 base_model=model(), 546 verbose=self.verbose, 547 **kwargs 548 ) 549 else: 550 fitted_clf = GenericBoostingClassifier( 551 base_model=model(), 552 verbose=self.verbose, 553 hist=True, 554 **kwargs 555 ) 556 557 fitted_clf.fit(X_train, y_train) 558 559 self.models_[name] = fitted_clf 560 y_pred = fitted_clf.predict(X_test) 561 accuracy = accuracy_score( 562 y_test, y_pred, normalize=True 563 ) 564 b_accuracy = balanced_accuracy_score(y_test, y_pred) 565 f1 = f1_score(y_test, y_pred, average="weighted") 566 try: 567 roc_auc = roc_auc_score(y_test, y_pred) 568 except Exception as exception: 569 roc_auc = None 570 if self.ignore_warnings is False: 571 print( 572 "ROC AUC couldn't be calculated for " + name 573 ) 574 print(exception) 575 names.append(name) 576 Accuracy.append(accuracy) 577 B_Accuracy.append(b_accuracy) 578 ROC_AUC.append(roc_auc) 579 F1.append(f1) 580 TIME.append(time.time() - start) 581 if self.custom_metric is not None: 582 custom_metric = self.custom_metric(y_test, y_pred) 583 CUSTOM_METRIC.append(custom_metric) 584 if self.verbose > 0: 585 if self.custom_metric is not None: 586 print( 587 { 588 "Model": name, 589 "Accuracy": accuracy, 590 "Balanced Accuracy": b_accuracy, 591 "ROC AUC": roc_auc, 592 "F1 Score": f1, 593 self.custom_metric.__name__: custom_metric, 594 "Time taken": time.time() - start, 595 } 596 ) 597 else: 598 print( 599 { 600 "Model": name, 601 "Accuracy": accuracy, 602 "Balanced Accuracy": b_accuracy, 603 "ROC AUC": roc_auc, 604 "F1 Score": f1, 605 "Time taken": time.time() - start, 606 } 607 ) 608 if self.predictions: 609 predictions[name] = y_pred 610 except Exception as exception: 611 if self.ignore_warnings is False: 612 print(name + " model failed to execute") 613 print(exception) 614 615 else: 616 617 results = Parallel(n_jobs=self.n_jobs)( 618 
delayed(self.train_model)( 619 name, 620 model, 621 X_train, 622 y_train, 623 X_test, 624 y_test, 625 use_preprocessing=False, 626 **kwargs 627 ) 628 for name, model in tqdm(self.classifiers) 629 ) 630 Accuracy = [res["accuracy"] for res in results] 631 B_Accuracy = [res["balanced_accuracy"] for res in results] 632 ROC_AUC = [res["roc_auc"] for res in results] 633 F1 = [res["f1"] for res in results] 634 names = [res["name"] for res in results] 635 TIME = [res["time"] for res in results] 636 if self.custom_metric is not None: 637 CUSTOM_METRIC = [res["custom_metric"] for res in results] 638 if self.predictions: 639 predictions = { 640 res["name"]: res["predictions"] for res in results 641 } 642 643 if self.custom_metric is None: 644 scores = pd.DataFrame( 645 { 646 "Model": names, 647 "Accuracy": Accuracy, 648 "Balanced Accuracy": B_Accuracy, 649 "ROC AUC": ROC_AUC, 650 "F1 Score": F1, 651 "Time Taken": TIME, 652 } 653 ) 654 else: 655 scores = pd.DataFrame( 656 { 657 "Model": names, 658 "Accuracy": Accuracy, 659 "Balanced Accuracy": B_Accuracy, 660 "ROC AUC": ROC_AUC, 661 "F1 Score": F1, 662 "Custom metric": CUSTOM_METRIC, 663 "Time Taken": TIME, 664 } 665 ) 666 scores = scores.sort_values(by=self.sort_by, ascending=False).set_index( 667 "Model" 668 ) 669 670 self.best_model_ = self.models_[scores.index[0]] 671 672 if self.predictions: 673 predictions_df = pd.DataFrame.from_dict(predictions) 674 return scores, predictions_df if self.predictions is True else scores 675 676 def get_best_model(self): 677 """ 678 This function returns the best model pipeline based on the sort_by metric. 679 680 Returns: 681 682 best_model: object, 683 Returns the best model pipeline based on the sort_by metric. 684 685 """ 686 return self.best_model_ 687 688 def provide_models(self, X_train, X_test, y_train, y_test): 689 """Returns all the model objects trained. If fit hasn't been called yet, 690 then it's called to return the models. 691 692 Parameters: 693 694 X_train: array-like, 695 Training vectors, where rows is the number of samples 696 and columns is the number of features. 697 698 X_test: array-like, 699 Testing vectors, where rows is the number of samples 700 and columns is the number of features. 701 702 y_train: array-like, 703 Training vectors, where rows is the number of samples 704 and columns is the number of features. 705 706 y_test: array-like, 707 Testing vectors, where rows is the number of samples 708 and columns is the number of features. 709 710 Returns: 711 712 models: dict-object, 713 Returns a dictionary with each model's pipeline as value 714 and key = name of the model. 715 """ 716 if len(self.models_.keys()) == 0: 717 self.fit(X_train, X_test, y_train, y_test) 718 719 return self.models_ 720 721 def train_model( 722 self, 723 name, 724 model, 725 X_train, 726 y_train, 727 X_test, 728 y_test, 729 use_preprocessing=False, 730 preprocessor=None, 731 hist=False, 732 **kwargs 733 ): 734 """ 735 Function to train a single model and return its results. 
736 """ 737 other_args = {} 738 739 # Handle n_jobs parameter 740 try: 741 if ( 742 "n_jobs" in model().get_params().keys() 743 and "LogisticRegression" not in name 744 ): 745 other_args["n_jobs"] = self.n_jobs 746 except Exception: 747 pass 748 749 start = time.time() 750 751 try: 752 # Handle random_state parameter 753 if "random_state" in model().get_params().keys(): 754 if hist is False: 755 fitted_clf = GenericBoostingClassifier( 756 {**other_args, **kwargs}, 757 verbose=self.verbose, 758 base_model=model(random_state=self.random_state), 759 ) 760 else: 761 fitted_clf = GenericBoostingClassifier( 762 {**other_args, **kwargs}, 763 verbose=self.verbose, 764 base_model=model(random_state=self.random_state), 765 hist=True, 766 ) 767 else: 768 if hist is False: 769 fitted_clf = GenericBoostingClassifier( 770 base_model=model(**kwargs), 771 verbose=self.verbose, 772 ) 773 else: 774 fitted_clf = GenericBoostingClassifier( 775 base_model=model(**kwargs), 776 verbose=self.verbose, 777 hist=True, 778 ) 779 780 if self.verbose > 0: 781 print("\n Fitting boosted " + name + " model...") 782 783 fitted_clf.fit(X_train, y_train) 784 785 if use_preprocessing and preprocessor is not None: 786 pipe = Pipeline( 787 [ 788 ("preprocessor", preprocessor), 789 ("classifier", fitted_clf), 790 ] 791 ) 792 if self.verbose > 0: 793 print( 794 "\n Fitting pipeline with preprocessing for " 795 + name 796 + " model..." 797 ) 798 pipe.fit(X_train, y_train) 799 y_pred = pipe.predict(X_test) 800 else: 801 # Case with no preprocessing 802 if self.verbose > 0: 803 print( 804 "\n Fitting model without preprocessing for " 805 + name 806 + " model..." 807 ) 808 y_pred = fitted_clf.predict(X_test) 809 810 accuracy = accuracy_score(y_test, y_pred, normalize=True) 811 b_accuracy = balanced_accuracy_score(y_test, y_pred) 812 f1 = f1_score(y_test, y_pred, average="weighted") 813 roc_auc = None 814 815 try: 816 roc_auc = roc_auc_score(y_test, y_pred) 817 except Exception as exception: 818 if self.ignore_warnings is False: 819 print("ROC AUC couldn't be calculated for " + name) 820 print(exception) 821 822 custom_metric = None 823 if self.custom_metric is not None: 824 custom_metric = self.custom_metric(y_test, y_pred) 825 826 return { 827 "name": name, 828 "model": fitted_clf if not use_preprocessing else pipe, 829 "accuracy": accuracy, 830 "balanced_accuracy": b_accuracy, 831 "roc_auc": roc_auc, 832 "f1": f1, 833 "custom_metric": custom_metric, 834 "time": time.time() - start, 835 "predictions": y_pred, 836 } 837 except Exception as exception: 838 if self.ignore_warnings is False: 839 print(name + " model failed to execute") 840 print(exception) 841 return None
Fitting -- almost -- all the classification algorithms and returning their scores.
Parameters:
verbose: int, optional (default=0)
Any positive number for verbosity.
ignore_warnings: bool, optional (default=True)
    When set to True, warnings related to algorithms that are not
    able to run are ignored.
custom_metric: function, optional (default=None)
    When a function is provided, models are also evaluated with this
    custom metric.
predictions: bool, optional (default=False)
    When set to True, the predictions of all the models are
    returned as a data frame.
sort_by: string, optional (default='Accuracy')
    Sort models by a metric. Available options are 'Accuracy',
    'Balanced Accuracy', 'ROC AUC', 'F1 Score' or a custom metric
    identified by its name and provided by custom_metric.
random_state: int, optional (default=42)
    Reproducibility seed.
estimators: list, optional (default='all')
    List of estimator names, or 'all' for more than 90 classifiers.
preprocess: bool
    When set to True, preprocessing is applied to the features.
n_jobs: int, optional
    When possible, run models in parallel. For now, only used by
    individual models that support it.
n_layers: int, optional (default=3)
Number of layers of GenericBoostingClassifiers to be used.
All the other parameters are the same as GenericBoostingClassifier's.
Attributes:
models_: dict-object
    Dictionary with model names as keys and fitted model pipelines as values.
best_model_: object
    The best model pipeline, according to the sort_by metric.
Examples:

```python
import os
import mlsauce as ms
from sklearn.datasets import load_breast_cancer, load_iris, load_wine, load_digits
from sklearn.model_selection import train_test_split
from time import time

load_models = [load_breast_cancer, load_iris, load_wine]

for model in load_models:

    data = model()
    X = data.data
    y = data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

    clf = ms.LazyBoostingClassifier(verbose=1, ignore_warnings=False,
                                    custom_metric=None, preprocess=False)

    start = time()
    models, predictions = clf.fit(X_train, X_test, y_train, y_test)
    print(f"\nElapsed: {time() - start} seconds\n")

    print(models)
```
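As a complementary sketch (not part of the original example), the snippet below assumes that any scikit-learn scoring function with the signature `metric(y_true, y_pred)` can be passed as `custom_metric`; its values are reported in an extra 'Custom metric' column of the scores DataFrame.

```python
# Hedged sketch: a scikit-learn metric used as custom_metric
# (evaluated internally as custom_metric(y_test, y_pred)).
import mlsauce as ms
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

clf = ms.LazyBoostingClassifier(
    verbose=0,
    ignore_warnings=True,
    custom_metric=cohen_kappa_score,  # reported in the 'Custom metric' column
    preprocess=False,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)  # scores DataFrame, sorted by `sort_by` (default 'Accuracy')
```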
def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs)
Fit classifiers to X_train and y_train, predict and score on X_test, y_test.
Parameters:
X_train: array-like,
    Training vectors, where rows are samples and columns are features.
X_test: array-like,
    Testing vectors, where rows are samples and columns are features.
y_train: array-like,
    Training target values.
y_test: array-like,
    Testing target values.
hist: bool, optional (default=False)
    When set to True, GenericBoostingClassifier is fitted with hist=True.
**kwargs: dict,
    Additional arguments passed to GenericBoostingClassifier.
Returns:
scores: Pandas DataFrame
Returns metrics of all the models in a Pandas DataFrame.
predictions: Pandas DataFrame
Returns predictions of all the models in a Pandas DataFrame.
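A minimal sketch of `fit()` (not from the original documentation): `hist=True` is passed through to the underlying `GenericBoostingClassifier`, and any additional keyword arguments would be forwarded the same way.

```python
# Hedged sketch: hist=True is forwarded to GenericBoostingClassifier.
import mlsauce as ms
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

clf = ms.LazyBoostingClassifier(verbose=0, ignore_warnings=True)
scores, _ = clf.fit(X_train, X_test, y_train, y_test, hist=True)
print(scores.head())
```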
def provide_models(self, X_train, X_test, y_train, y_test)
Returns all the model objects trained. If fit hasn't been called yet, then it's called to return the models.
Parameters:
X_train: array-like,
    Training vectors, where rows are samples and columns are features.
X_test: array-like,
    Testing vectors, where rows are samples and columns are features.
y_train: array-like,
    Training target values.
y_test: array-like,
    Testing target values.
Returns:
models: dict-object,
Returns a dictionary with each model's pipeline as value
and key = name of the model.
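A short usage sketch (not from the original documentation): `provide_models` triggers `fit` itself when nothing has been fitted yet, and `get_best_model` returns the top entry with respect to `sort_by`.

```python
# Hedged sketch of provide_models() and get_best_model().
import mlsauce as ms
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

X, y = load_wine(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

clf = ms.LazyBoostingClassifier(verbose=0, ignore_warnings=True)
model_dictionary = clf.provide_models(X_train, X_test, y_train, y_test)  # calls fit() internally
print(list(model_dictionary.keys())[:3])  # model names
print(clf.get_best_model())               # best fitted model w.r.t. sort_by
```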
93class LazyBoostingRegressor(RegressorMixin): 94 """ 95 Fitting -- almost -- all the regression algorithms 96 and returning their scores. 97 98 Parameters: 99 100 verbose: int, optional (default=0) 101 Any positive number for verbosity. 102 103 ignore_warnings: bool, optional (default=True) 104 When set to True, the warning related to algorigms that are not able to run are ignored. 105 106 custom_metric: function, optional (default=None) 107 When function is provided, models are evaluated based on the custom evaluation metric provided. 108 109 predictions: bool, optional (default=False) 110 When set to True, the predictions of all the models models are returned as dataframe. 111 112 sort_by: string, optional (default='RMSE') 113 Sort models by a metric. Available options are 'R-Squared', 'Adjusted R-Squared', 'RMSE', 'Time Taken' and 'Custom Metric'. 114 or a custom metric identified by its name and provided by custom_metric. 115 116 random_state: int, optional (default=42) 117 Reproducibiility seed. 118 119 estimators: list, optional (default='all') 120 list of Estimators names or just 'all' (default='all') 121 122 preprocess: bool 123 preprocessing is done when set to True 124 125 n_jobs : int, when possible, run in parallel 126 For now, only used by individual models that support it. 127 128 n_layers: int, optional (default=3) 129 Number of layers of CustomRegressors to be used. 130 131 All the other parameters are the same as CustomRegressor's. 132 133 Attributes: 134 135 models_: dict-object 136 Returns a dictionary with each model pipeline as value 137 with key as name of models. 138 139 best_model_: object 140 Returns the best model pipeline based on the sort_by metric. 141 142 Examples: 143 144 ```python 145 import os 146 import mlsauce as ms 147 from sklearn.datasets import load_diabetes 148 from sklearn.model_selection import train_test_split 149 150 data = load_diabetes() 151 X = data.data 152 y= data.target 153 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123) 154 155 regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True, 156 custom_metric=None, preprocess=True) 157 models, predictioms = regr.fit(X_train, X_test, y_train, y_test) 158 model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test) 159 print(models) 160 ``` 161 162 """ 163 164 def __init__( 165 self, 166 verbose=0, 167 ignore_warnings=True, 168 custom_metric=None, 169 predictions=False, 170 sort_by="RMSE", 171 random_state=42, 172 estimators="all", 173 preprocess=False, 174 n_jobs=None, 175 ): 176 self.verbose = verbose 177 self.ignore_warnings = ignore_warnings 178 self.custom_metric = custom_metric 179 self.predictions = predictions 180 self.sort_by = sort_by 181 self.models_ = {} 182 self.best_model_ = None 183 self.random_state = random_state 184 self.estimators = estimators 185 self.preprocess = preprocess 186 self.n_jobs = n_jobs 187 188 def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs): 189 """Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test. 190 191 Parameters: 192 193 X_train : array-like, 194 Training vectors, where rows is the number of samples 195 and columns is the number of features. 196 197 X_test : array-like, 198 Testing vectors, where rows is the number of samples 199 and columns is the number of features. 200 201 y_train : array-like, 202 Training vectors, where rows is the number of samples 203 and columns is the number of features. 
204 205 y_test : array-like, 206 Testing vectors, where rows is the number of samples 207 and columns is the number of features. 208 209 hist: bool, optional (default=False) 210 When set to True, the model is a HistGenericBoostingRegressor. 211 212 **kwargs: dict, 213 Additional parameters to be passed to the GenericBoostingRegressor. 214 215 Returns: 216 ------- 217 scores: Pandas DataFrame 218 Returns metrics of all the models in a Pandas DataFrame. 219 220 predictions : Pandas DataFrame 221 Returns predictions of all the models in a Pandas DataFrame. 222 223 """ 224 R2 = [] 225 ADJR2 = [] 226 RMSE = [] 227 # WIN = [] 228 names = [] 229 TIME = [] 230 predictions = {} 231 232 if self.custom_metric: 233 CUSTOM_METRIC = [] 234 235 if isinstance(X_train, np.ndarray): 236 X_train = pd.DataFrame(X_train) 237 X_test = pd.DataFrame(X_test) 238 239 numeric_features = X_train.select_dtypes(include=[np.number]).columns 240 categorical_features = X_train.select_dtypes(include=["object"]).columns 241 242 categorical_low, categorical_high = get_card_split( 243 X_train, categorical_features 244 ) 245 246 if self.preprocess is True: 247 preprocessor = ColumnTransformer( 248 transformers=[ 249 ("numeric", numeric_transformer, numeric_features), 250 ( 251 "categorical_low", 252 categorical_transformer_low, 253 categorical_low, 254 ), 255 ( 256 "categorical_high", 257 categorical_transformer_high, 258 categorical_high, 259 ), 260 ] 261 ) 262 263 # base models 264 try: 265 baseline_names = [ 266 "RandomForestRegressor", 267 "XGBRegressor", 268 "GradientBoostingRegressor", 269 ] 270 baseline_models = [ 271 RandomForestRegressor(), 272 xgb.XGBRegressor(), 273 GradientBoostingRegressor(), 274 ] 275 except Exception as exception: 276 baseline_names = [ 277 "RandomForestRegressor", 278 "GradientBoostingRegressor", 279 ] 280 baseline_models = [ 281 RandomForestRegressor(), 282 GradientBoostingRegressor(), 283 ] 284 285 if self.verbose > 0: 286 print("\n Fitting baseline models...") 287 for name, model in tqdm(zip(baseline_names, baseline_models)): 288 start = time.time() 289 try: 290 model.fit(X_train, y_train.ravel()) 291 self.models_[name] = model 292 y_pred = model.predict(X_test) 293 r_squared = r2_score(y_test, y_pred) 294 adj_rsquared = adjusted_rsquared( 295 r_squared, X_test.shape[0], X_test.shape[1] 296 ) 297 rmse = root_mean_squared_error(y_test, y_pred) 298 299 names.append(name) 300 R2.append(r_squared) 301 ADJR2.append(adj_rsquared) 302 RMSE.append(rmse) 303 TIME.append(time.time() - start) 304 305 if self.custom_metric: 306 custom_metric = self.custom_metric(y_test, y_pred) 307 CUSTOM_METRIC.append(custom_metric) 308 309 if self.verbose > 0: 310 scores_verbose = { 311 "Model": name, 312 "R-Squared": r_squared, 313 "Adjusted R-Squared": adj_rsquared, 314 "RMSE": rmse, 315 "Time taken": time.time() - start, 316 } 317 318 if self.custom_metric: 319 scores_verbose["Custom metric"] = custom_metric 320 321 print(scores_verbose) 322 if self.predictions: 323 predictions[name] = y_pred 324 except Exception as exception: 325 if self.ignore_warnings is False: 326 print(name + " model failed to execute") 327 print(exception) 328 329 if self.estimators == "all": 330 self.regressors = REGRESSORS 331 else: 332 self.regressors = [ 333 ("GenericBooster(" + est[0] + ")", est[1](**kwargs)) 334 for est in all_estimators() 335 if ( 336 issubclass(est[1], RegressorMixin) 337 and (est[0] in self.estimators) 338 ) 339 ] 340 341 if self.preprocess is True: 342 343 if self.n_jobs is None: 344 345 for name, regr in 
tqdm(self.regressors): # do parallel exec 346 347 start = time.time() 348 349 try: 350 351 if hist is False: 352 353 model = GenericBoostingRegressor( 354 base_model=regr(), 355 verbose=self.verbose, 356 **kwargs 357 ) 358 359 else: 360 361 model = HistGenericBoostingRegressor( 362 base_model=regr(), 363 verbose=self.verbose, 364 **kwargs 365 ) 366 367 model.fit(X_train, y_train.ravel()) 368 369 pipe = Pipeline( 370 steps=[ 371 ("preprocessor", preprocessor), 372 ("regressor", model), 373 ] 374 ) 375 if self.verbose > 0: 376 print("\n Fitting boosted " + name + " model...") 377 pipe.fit(X_train, y_train.ravel()) 378 379 self.models_[name] = pipe 380 y_pred = pipe.predict(X_test) 381 r_squared = r2_score(y_test, y_pred) 382 adj_rsquared = adjusted_rsquared( 383 r_squared, X_test.shape[0], X_test.shape[1] 384 ) 385 rmse = root_mean_squared_error(y_test, y_pred) 386 387 names.append(name) 388 R2.append(r_squared) 389 ADJR2.append(adj_rsquared) 390 RMSE.append(rmse) 391 TIME.append(time.time() - start) 392 393 if self.custom_metric: 394 custom_metric = self.custom_metric(y_test, y_pred) 395 CUSTOM_METRIC.append(custom_metric) 396 397 if self.verbose > 0: 398 scores_verbose = { 399 "Model": name, 400 "R-Squared": r_squared, 401 "Adjusted R-Squared": adj_rsquared, 402 "RMSE": rmse, 403 "Time taken": time.time() - start, 404 } 405 406 if self.custom_metric: 407 scores_verbose["Custom metric"] = custom_metric 408 409 print(scores_verbose) 410 if self.predictions: 411 predictions[name] = y_pred 412 413 except Exception as exception: 414 415 if self.ignore_warnings is False: 416 print(name + " model failed to execute") 417 print(exception) 418 419 else: 420 421 results = Parallel(n_jobs=self.n_jobs)( 422 delayed(self.train_model)( 423 name, 424 model, 425 X_train, 426 y_train, 427 X_test, 428 y_test, 429 use_preprocessing=True, 430 preprocessor=preprocessor, 431 **kwargs 432 ) 433 for name, model in tqdm(self.regressors) 434 ) 435 R2 = [ 436 result["r_squared"] 437 for result in results 438 if result is not None 439 ] 440 ADJR2 = [ 441 result["adj_rsquared"] 442 for result in results 443 if result is not None 444 ] 445 RMSE = [ 446 result["rmse"] for result in results if result is not None 447 ] 448 TIME = [ 449 result["time"] for result in results if result is not None 450 ] 451 names = [ 452 result["name"] for result in results if result is not None 453 ] 454 if self.custom_metric: 455 CUSTOM_METRIC = [ 456 result["custom_metric"] 457 for result in results 458 if result is not None 459 ] 460 if self.predictions: 461 predictions = { 462 result["name"]: result["predictions"] 463 for result in results 464 if result is not None 465 } 466 467 else: # self.preprocess is False; no preprocessing 468 469 if self.n_jobs is None: 470 471 for name, regr in tqdm(self.regressors): # do parallel exec 472 start = time.time() 473 try: 474 475 if hist is False: 476 model = GenericBoostingRegressor( 477 base_model=regr(), 478 verbose=self.verbose, 479 **kwargs 480 ) 481 else: 482 model = HistGenericBoostingRegressor( 483 base_model=regr(), 484 verbose=self.verbose, 485 **kwargs 486 ) 487 488 if self.verbose > 0: 489 print("\n Fitting boosted " + name + " model...") 490 model.fit(X_train, y_train.ravel()) 491 492 self.models_[name] = model 493 y_pred = model.predict(X_test) 494 495 r_squared = r2_score(y_test, y_pred) 496 adj_rsquared = adjusted_rsquared( 497 r_squared, X_test.shape[0], X_test.shape[1] 498 ) 499 rmse = root_mean_squared_error(y_test, y_pred) 500 501 names.append(name) 502 R2.append(r_squared) 503 
ADJR2.append(adj_rsquared) 504 RMSE.append(rmse) 505 TIME.append(time.time() - start) 506 507 if self.custom_metric: 508 custom_metric = self.custom_metric(y_test, y_pred) 509 CUSTOM_METRIC.append(custom_metric) 510 511 if self.verbose > 0: 512 scores_verbose = { 513 "Model": name, 514 "R-Squared": r_squared, 515 "Adjusted R-Squared": adj_rsquared, 516 "RMSE": rmse, 517 "Time taken": time.time() - start, 518 } 519 520 if self.custom_metric: 521 scores_verbose["Custom metric"] = custom_metric 522 523 print(scores_verbose) 524 if self.predictions: 525 predictions[name] = y_pred 526 except Exception as exception: 527 if self.ignore_warnings is False: 528 print(name + " model failed to execute") 529 print(exception) 530 531 else: 532 533 results = Parallel(n_jobs=self.n_jobs)( 534 delayed(self.train_model)( 535 name, 536 model, 537 X_train, 538 y_train, 539 X_test, 540 y_test, 541 use_preprocessing=False, 542 **kwargs 543 ) 544 for name, model in tqdm(self.regressors) 545 ) 546 R2 = [ 547 result["r_squared"] 548 for result in results 549 if result is not None 550 ] 551 ADJR2 = [ 552 result["adj_rsquared"] 553 for result in results 554 if result is not None 555 ] 556 RMSE = [ 557 result["rmse"] for result in results if result is not None 558 ] 559 TIME = [ 560 result["time"] for result in results if result is not None 561 ] 562 names = [ 563 result["name"] for result in results if result is not None 564 ] 565 if self.custom_metric: 566 CUSTOM_METRIC = [ 567 result["custom_metric"] 568 for result in results 569 if result is not None 570 ] 571 if self.predictions: 572 predictions = { 573 result["name"]: result["predictions"] 574 for result in results 575 if result is not None 576 } 577 578 scores = { 579 "Model": names, 580 "Adjusted R-Squared": ADJR2, 581 "R-Squared": R2, 582 "RMSE": RMSE, 583 "Time Taken": TIME, 584 } 585 586 if self.custom_metric: 587 scores["Custom metric"] = CUSTOM_METRIC 588 589 scores = pd.DataFrame(scores) 590 scores = scores.sort_values(by=self.sort_by, ascending=True).set_index( 591 "Model" 592 ) 593 594 self.best_model_ = self.models_[scores.index[0]] 595 596 if self.predictions: 597 predictions_df = pd.DataFrame.from_dict(predictions) 598 return scores, predictions_df if self.predictions is True else scores 599 600 def get_best_model(self): 601 """ 602 This function returns the best model pipeline based on the sort_by metric. 603 604 Returns: 605 606 best_model: object, 607 Returns the best model pipeline based on the sort_by metric. 608 609 """ 610 return self.best_model_ 611 612 def provide_models(self, X_train, X_test, y_train, y_test): 613 """ 614 This function returns all the model objects trained in fit function. 615 If fit is not called already, then we call fit and then return the models. 616 617 Parameters: 618 619 X_train : array-like, 620 Training vectors, where rows is the number of samples 621 and columns is the number of features. 622 623 X_test : array-like, 624 Testing vectors, where rows is the number of samples 625 and columns is the number of features. 626 627 y_train : array-like, 628 Training vectors, where rows is the number of samples 629 and columns is the number of features. 630 631 y_test : array-like, 632 Testing vectors, where rows is the number of samples 633 and columns is the number of features. 634 635 Returns: 636 637 models: dict-object, 638 Returns a dictionary with each model pipeline as value 639 with key as name of models. 
640 641 """ 642 if len(self.models_.keys()) == 0: 643 self.fit(X_train, X_test, y_train.ravel(), y_test.values) 644 645 return self.models_ 646 647 def train_model( 648 self, 649 name, 650 regr, 651 X_train, 652 y_train, 653 X_test, 654 y_test, 655 use_preprocessing=False, 656 preprocessor=None, 657 hist=False, 658 **kwargs 659 ): 660 """ 661 Function to train a single regression model and return its results. 662 """ 663 start = time.time() 664 665 try: 666 if hist is False: 667 model = GenericBoostingRegressor( 668 base_model=regr(), verbose=self.verbose, **kwargs 669 ) 670 else: 671 model = HistGenericBoostingRegressor( 672 base_model=regr(), verbose=self.verbose, **kwargs 673 ) 674 675 if use_preprocessing and preprocessor is not None: 676 pipe = Pipeline( 677 steps=[ 678 ("preprocessor", preprocessor), 679 ("regressor", model), 680 ] 681 ) 682 if self.verbose > 0: 683 print( 684 "\n Fitting boosted " 685 + name 686 + " model with preprocessing..." 687 ) 688 pipe.fit(X_train, y_train.ravel()) 689 y_pred = pipe.predict(X_test) 690 fitted_model = pipe 691 else: 692 # Case with no preprocessing 693 if self.verbose > 0: 694 print( 695 "\n Fitting boosted " 696 + name 697 + " model without preprocessing..." 698 ) 699 model.fit(X_train, y_train.ravel()) 700 y_pred = model.predict(X_test) 701 fitted_model = model 702 703 r_squared = r2_score(y_test, y_pred) 704 adj_rsquared = adjusted_rsquared( 705 r_squared, X_test.shape[0], X_test.shape[1] 706 ) 707 rmse = root_mean_squared_error(y_test, y_pred) 708 709 custom_metric = None 710 if self.custom_metric: 711 custom_metric = self.custom_metric(y_test, y_pred) 712 713 return { 714 "name": name, 715 "model": fitted_model, 716 "r_squared": r_squared, 717 "adj_rsquared": adj_rsquared, 718 "rmse": rmse, 719 "custom_metric": custom_metric, 720 "time": time.time() - start, 721 "predictions": y_pred, 722 } 723 724 except Exception as exception: 725 if self.ignore_warnings is False: 726 print(name + " model failed to execute") 727 print(exception) 728 return None
Fitting -- almost -- all the regression algorithms and returning their scores.
Parameters:
verbose: int, optional (default=0)
Any positive number for verbosity.
ignore_warnings: bool, optional (default=True)
    When set to True, warnings related to algorithms that are not able to run are ignored.
custom_metric: function, optional (default=None)
    When a function is provided, models are also evaluated with this custom metric.
predictions: bool, optional (default=False)
    When set to True, the predictions of all the models are returned as a data frame.
sort_by: string, optional (default='RMSE')
    Sort models by a metric. Available options are 'R-Squared', 'Adjusted R-Squared',
    'RMSE', 'Time Taken', and 'Custom metric' (when a custom_metric is provided).
random_state: int, optional (default=42)
    Reproducibility seed.
estimators: list, optional (default='all')
    List of estimator names, or 'all'.
preprocess: bool
    When set to True, preprocessing is applied to the features.
n_jobs: int, optional
    When possible, run models in parallel. For now, only used by individual
    models that support it.
n_layers: int, optional (default=3)
Number of layers of CustomRegressors to be used.
All the other parameters are the same as CustomRegressor's.
Attributes:
models_: dict-object
    Dictionary with model names as keys and fitted model pipelines as values.
best_model_: object
Returns the best model pipeline based on the sort_by metric.
Examples:
```python
import os
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

data = load_diabetes()
X = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True,
                                custom_metric=None, preprocess=True)
models, predictions = regr.fit(X_train, X_test, y_train, y_test)
model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
print(models)
```
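As a complementary sketch (not part of the original example), the snippet below assumes that a scikit-learn regression metric with signature `metric(y_true, y_pred)` can be passed as `custom_metric`; it is reported in an extra 'Custom metric' column of the scores DataFrame.

```python
# Hedged sketch: mean_absolute_error used as custom_metric.
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

regr = ms.LazyBoostingRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=mean_absolute_error,  # evaluated as custom_metric(y_test, y_pred)
    preprocess=True,
)
models, predictions = regr.fit(X_train, X_test, y_train, y_test)
print(models)
```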
def fit(self, X_train, X_test, y_train, y_test, hist=False, **kwargs)
Fit Regression algorithms to X_train and y_train, predict and score on X_test, y_test.
Parameters:
X_train : array-like,
    Training vectors, where rows are samples and columns are features.
X_test : array-like,
    Testing vectors, where rows are samples and columns are features.
y_train : array-like,
    Training target values.
y_test : array-like,
    Testing target values.
hist: bool, optional (default=False)
When set to True, the model is a HistGenericBoostingRegressor.
**kwargs: dict,
Additional parameters to be passed to the GenericBoostingRegressor.
Returns:
scores: Pandas DataFrame
    Returns metrics of all the models in a Pandas DataFrame.
predictions: Pandas DataFrame
    Returns predictions of all the models in a Pandas DataFrame.
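A minimal sketch (not from the original documentation): with `predictions=True`, `fit()` also returns a DataFrame holding each model's test-set predictions.

```python
# Hedged sketch: requesting per-model predictions from fit().
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True, predictions=True)
scores, predictions_df = regr.fit(X_train, X_test, y_train, y_test)
print(scores.head())          # sorted by 'RMSE' (ascending) by default
print(predictions_df.shape)   # one column of test-set predictions per model
```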
def provide_models(self, X_train, X_test, y_train, y_test)
Returns all the model objects trained in the fit function. If fit has not been called yet, it is called first and the models are then returned.
Parameters:
X_train : array-like,
    Training vectors, where rows are samples and columns are features.
X_test : array-like,
    Testing vectors, where rows are samples and columns are features.
y_train : array-like,
    Training target values.
y_test : array-like,
    Testing target values.
Returns:
models: dict-object,
Returns a dictionary with each model pipeline as value
with key as name of models.
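A short usage sketch (not from the original documentation): after fitting, `provide_models` returns the dictionary of fitted models and pipelines keyed by model name; `get_best_model` returns the top model with respect to `sort_by`.

```python
# Hedged sketch: retrieving all fitted regressors and the best one.
import mlsauce as ms
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

regr = ms.LazyBoostingRegressor(verbose=0, ignore_warnings=True)
regr.fit(X_train, X_test, y_train, y_test)
model_dictionary = regr.provide_models(X_train, X_test, y_train, y_test)
print("RandomForestRegressor" in model_dictionary)  # baseline models are included
print(regr.get_best_model())
```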
```python
class MultiTaskRegressor(BaseEstimator, RegressorMixin):
    """
    A class for multi-task regression

    Parameters
    ----------
    regr: object
        A regressor object

    Attributes
    ----------
    objs: list
        A list containing the fitted regressor objects

    """

    def __init__(self, regr):
        assert (
            is_multitask_estimator(regr) == False
        ), "The regressor is already a multi-task regressor"
        self.regr = regr
        self.objs = []

    def fit(self, X, y):
        """
        Fit the regressor

        Parameters
        ----------
        X: array-like
            The input data
        y: array-like
            The target values

        """
        n_tasks = y.shape[1]
        assert n_tasks > 1, "The number of columns in y must be greater than 1"
        self.n_outputs_ = n_tasks
        try:
            for i in range(n_tasks):
                self.regr.fit(X, y.iloc[:, i].values)
                self.objs.append(deepcopy(self.regr))
        except Exception:
            for i in range(n_tasks):
                self.regr.fit(X, y[:, i])
                self.objs.append(deepcopy(self.regr))
        return self

    def predict(self, X):
        """
        Predict the target values

        Parameters
        ----------
        X: array-like
            The input data

        Returns
        -------
        y_pred: array-like
            The predicted target values

        """
        assert len(self.objs) > 0, "The regressor has not been fitted yet"
        y_pred = np.zeros((X.shape[0], self.n_outputs_))
        for i in range(self.n_outputs_):
            y_pred[:, i] = self.objs[i].predict(X)
        return y_pred
```
A class for multi-task regression
Parameters
regr: object
    A regressor object

Attributes

objs: list
    A list containing the fitted regressor objects
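A minimal sketch (not from the original documentation), assuming that `is_multitask_estimator()` treats SVR as a single-output regressor: one copy of the wrapped regressor is fitted per column of the target.

```python
# Hedged sketch: wrapping a single-output regressor for a 2-column target.
import numpy as np
from sklearn.svm import SVR
from mlsauce import MultiTaskRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
Y = np.column_stack([X @ rng.normal(size=5), X @ rng.normal(size=5)])  # two tasks

mtr = MultiTaskRegressor(regr=SVR())
mtr.fit(X, Y)                  # one copy of SVR is fitted per column of Y
print(mtr.predict(X).shape)    # (100, 2)
```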
def fit(self, X, y)
Fit the regressor
Parameters
X: array-like
    The input data

y: array-like
    The target values
def predict(self, X)
Predict the target values
Parameters
X: array-like
    The input data

Returns

y_pred: array-like
    The predicted target values
```python
def get_config():
    """Retrieve current values for configuration set by :func:`set_config`

    Returns
    -------
    config : dict
        Keys are parameter names that can be passed to :func:`set_config`.

    See Also
    --------
    config_context: Context manager for global mlsauce configuration
    set_config: Set global mlsauce configuration
    """
    return _global_config.copy()
```
Retrieve current values for configuration set by set_config()
Returns
config : dict
    Keys are parameter names that can be passed to set_config().
See Also
config_context: Context manager for global mlsauce configuration set_config: Set global mlsauce configuration
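In practice get_config() simply returns a copy of the configuration dict. A quick sketch; the keys and default values shown in the comment are indicative and may vary with the mlsauce version:

import mlsauce

cfg = mlsauce.get_config()
print(cfg)
# e.g. {'assume_finite': False, 'working_memory': 1024,
#       'print_changed_only': True, 'display': 'text'}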
def set_config(
    assume_finite=None,
    working_memory=None,
    print_changed_only=None,
    display=None,
):
    """Set global mlsauce configuration

    .. versionadded:: 0.3.0

    Parameters
    ----------
    assume_finite : bool, optional
        If True, validation for finiteness will be skipped,
        saving time, but leading to potential crashes. If
        False, validation for finiteness will be performed,
        avoiding error. Global default: False.

        .. versionadded:: 0.3.0

    working_memory : int, optional
        If set, mlsauce will attempt to limit the size of temporary arrays
        to this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. Global default: 1024.

        .. versionadded:: 0.3.0

    print_changed_only : bool, optional
        If True, only the parameters that were set to non-default
        values will be printed when printing an estimator. For example,
        ``print(SVC())`` while True will only print 'SVC()', while the default
        behaviour would be to print 'SVC(C=1.0, cache_size=200, ...)' with
        all the non-changed parameters.

        .. versionadded:: 0.3.0

    display : {'text', 'diagram'}, optional
        If 'diagram', estimators will be displayed as a diagram in a jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. Default is 'text'.

        .. versionadded:: 0.3.0

    See Also
    --------
    config_context: Context manager for global mlsauce configuration
    get_config: Retrieve current values of the global configuration
    """
    if assume_finite is not None:
        _global_config["assume_finite"] = assume_finite
    if working_memory is not None:
        _global_config["working_memory"] = working_memory
    if print_changed_only is not None:
        _global_config["print_changed_only"] = print_changed_only
    if display is not None:
        _global_config["display"] = display
Set global mlsauce configuration

*New in version 0.3.0.*

Parameters

    assume_finite : bool, optional
        If True, validation for finiteness will be skipped, saving time, but
        leading to potential crashes. If False, validation for finiteness
        will be performed, avoiding error. Global default: False.

        *New in version 0.3.0.*

    working_memory : int, optional
        If set, mlsauce will attempt to limit the size of temporary arrays to
        this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. Global default: 1024.

        *New in version 0.3.0.*

    print_changed_only : bool, optional
        If True, only the parameters that were set to non-default values will
        be printed when printing an estimator. For example, print(SVC()) while
        True will only print 'SVC()', while the default behaviour would be to
        print 'SVC(C=1.0, cache_size=200, ...)' with all the non-changed
        parameters.

        *New in version 0.3.0.*

    display : {'text', 'diagram'}, optional
        If 'diagram', estimators will be displayed as a diagram in a jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. Default is 'text'.

        *New in version 0.3.0.*

See Also

    config_context: Context manager for global mlsauce configuration
    get_config: Retrieve current values of the global configuration
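A short sketch combining set_config with get_config, using only the parameters documented above; the specific values are illustrative:

import mlsauce

mlsauce.set_config(assume_finite=True, working_memory=256)
assert mlsauce.get_config()["assume_finite"] is True
assert mlsauce.get_config()["working_memory"] == 256

# Changes made with set_config persist until overwritten; use
# config_context (below) when a temporary change is all that is needed.
mlsauce.set_config(assume_finite=False, working_memory=1024)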
@contextmanager
def config_context(**new_config):
    """Context manager for global mlsauce configuration

    Parameters
    ----------
    assume_finite : bool, optional
        If True, validation for finiteness will be skipped,
        saving time, but leading to potential crashes. If
        False, validation for finiteness will be performed,
        avoiding error. Global default: False.

    working_memory : int, optional
        If set, mlsauce will attempt to limit the size of temporary arrays
        to this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. Global default: 1024.

    print_changed_only : bool, optional
        If True, only the parameters that were set to non-default
        values will be printed when printing an estimator. For example,
        ``print(SVC())`` while True will only print 'SVC()', but would print
        'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters
        when False. Default is True.

        .. versionadded:: 0.3.0

    display : {'text', 'diagram'}, optional
        If 'diagram', estimators will be displayed as a diagram in a jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. Default is 'text'.

        .. versionadded:: 0.3.0

    Notes
    -----
    All settings, not just those presently modified, will be returned to
    their previous values when the context manager is exited. This is not
    thread-safe.

    Examples
    --------
    >>> import mlsauce
    >>> from mlsauce.utils.validation import assert_all_finite
    >>> with mlsauce.config_context(assume_finite=True):
    ...     assert_all_finite([float('nan')])
    >>> with mlsauce.config_context(assume_finite=True):
    ...     with mlsauce.config_context(assume_finite=False):
    ...         assert_all_finite([float('nan')])
    Traceback (most recent call last):
    ...
    ValueError: Input contains NaN, ...

    See Also
    --------
    set_config: Set global mlsauce configuration
    get_config: Retrieve current values of the global configuration
    """
    old_config = get_config().copy()
    set_config(**new_config)

    try:
        yield
    finally:
        set_config(**old_config)
Context manager for global mlsauce configuration

Parameters

    assume_finite : bool, optional
        If True, validation for finiteness will be skipped, saving time, but
        leading to potential crashes. If False, validation for finiteness
        will be performed, avoiding error. Global default: False.

    working_memory : int, optional
        If set, mlsauce will attempt to limit the size of temporary arrays to
        this number of MiB (per job when parallelised), often saving both
        computation time and memory on expensive operations that can be
        performed in chunks. Global default: 1024.

    print_changed_only : bool, optional
        If True, only the parameters that were set to non-default values will
        be printed when printing an estimator. For example, print(SVC()) while
        True will only print 'SVC()', but would print
        'SVC(C=1.0, cache_size=200, ...)' with all the non-changed parameters
        when False. Default is True.

        *New in version 0.3.0.*

    display : {'text', 'diagram'}, optional
        If 'diagram', estimators will be displayed as a diagram in a jupyter
        lab or notebook context. If 'text', estimators will be displayed as
        text. Default is 'text'.

        *New in version 0.3.0.*

Notes

    All settings, not just those presently modified, will be returned to their
    previous values when the context manager is exited. This is not
    thread-safe.

Examples

    >>> import mlsauce
    >>> from mlsauce.utils.validation import assert_all_finite
    >>> with mlsauce.config_context(assume_finite=True):
    ...     assert_all_finite([float('nan')])
    >>> with mlsauce.config_context(assume_finite=True):
    ...     with mlsauce.config_context(assume_finite=False):
    ...         assert_all_finite([float('nan')])
    Traceback (most recent call last):
    ...
    ValueError: Input contains NaN, ...

See Also

    set_config: Set global mlsauce configuration
    get_config: Retrieve current values of the global configuration
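Beyond the doctest above, a small sketch showing that the previous settings are restored when the context exits:

import mlsauce

print(mlsauce.get_config()["assume_finite"])      # False (global default)
with mlsauce.config_context(assume_finite=True):
    print(mlsauce.get_config()["assume_finite"])  # True inside the context
print(mlsauce.get_config()["assume_finite"])      # False again after exit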