unifiedbooster

from .gbdt import GBDT
from .gbdt_classification import GBDTClassifier
from .gbdt_regression import GBDTRegressor
from .gpoptimization import cross_val_optim, lazy_cross_val_optim

__all__ = [
    "GBDT",
    "GBDTClassifier",
    "GBDTRegressor",
    "cross_val_optim",
    "lazy_cross_val_optim",
]
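
The module exposes two scikit-learn-compatible estimators (`GBDTClassifier`, `GBDTRegressor`) and two hyperparameter-search helpers. A minimal sketch of the unified interface, assuming the corresponding boosting backends (xgboost, lightgbm, catboost) are installed; the dataset and split are only illustrative:

```python
import unifiedbooster as ub
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# the same code drives every supported backend; only model_type changes
for backend in ("xgboost", "lightgbm", "catboost", "gradientboosting"):
    clf = ub.GBDTClassifier(model_type=backend)
    clf.fit(X_train, y_train)
    print(backend, clf.predict(X_test)[:5])
```
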
class GBDT(sklearn.base.BaseEstimator):
class GBDT(BaseEstimator):
    """Gradient Boosted Decision Trees (GBDT) base class

    Attributes:

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        n_estimators: int
            maximum number of trees that can be built

        learning_rate: float
            shrinkage rate; used for reducing the gradient step

        max_depth: int
            maximum tree depth

        rowsample: float
            subsample ratio of the training instances

        colsample: float
            percentage of features to use at each node split

        level: float
            confidence level for prediction intervals (regression) or
            prediction sets (classification); default is None (no conformalization)

        pi_method: str
            method used for conformal prediction when `level` is provided

        verbose: int
            controls verbosity (default=0)

        seed: int
            reproducibility seed

        **kwargs: dict
            additional parameters to be passed to the class
    """

    def __init__(
        self,
        model_type="xgboost",
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        rowsample=1.0,
        colsample=1.0,
        level=None,
        pi_method=None,
        verbose=0,
        seed=123,
        **kwargs
    ):

        self.model_type = model_type
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.rowsample = rowsample
        self.colsample = colsample
        self.level = level
        self.pi_method = pi_method
        self.verbose = verbose
        self.seed = seed

        if self.model_type == "xgboost":
            self.params = {
                "n_estimators": self.n_estimators,
                "learning_rate": self.learning_rate,
                "subsample": self.rowsample,
                "colsample_bynode": self.colsample,
                "max_depth": self.max_depth,
                "verbosity": self.verbose,
                "seed": self.seed,
                **kwargs,
            }
        elif self.model_type == "lightgbm":
            verbose = self.verbose - 1 if self.verbose == 0 else self.verbose
            self.params = {
                "n_estimators": self.n_estimators,
                "learning_rate": self.learning_rate,
                "subsample": self.rowsample,
                "feature_fraction_bynode": self.colsample,
                "max_depth": self.max_depth,
                "verbose": verbose,  # keep this way
                "seed": self.seed,
                **kwargs,
            }
        elif self.model_type == "catboost":
            self.params = {
                "iterations": self.n_estimators,
                "learning_rate": self.learning_rate,
                "subsample": self.rowsample,
                "rsm": self.colsample,
                "depth": self.max_depth,
                "verbose": self.verbose,
                "random_seed": self.seed,
                "boosting_type": "Plain",
                "leaf_estimation_iterations": 1,
                "bootstrap_type": "Bernoulli",
                **kwargs,
            }
        elif self.model_type == "gradientboosting":
            self.params = {
                "n_estimators": self.n_estimators,
                "learning_rate": self.learning_rate,
                "subsample": self.rowsample,
                "max_features": self.colsample,
                "max_depth": self.max_depth,
                "verbose": self.verbose,
                "random_state": self.seed,
                **kwargs,
            }

    def fit(self, X, y, **kwargs):
        """Fit the selected gradient boosting model to training data (X, y).

        Parameters:

            X: {array-like}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: array-like, shape = [n_samples]
                Target values.

            **kwargs: additional parameters to be passed to
                the underlying model's `fit` method

        Returns:

            self: object
        """
        if getattr(self, "type_fit") == "classification":
            self.classes_ = np.unique(y)  # for compatibility with sklearn
            self.n_classes_ = len(
                self.classes_
            )  # for compatibility with sklearn
        if getattr(self, "model_type") == "gradientboosting":
            self.model.max_features = int(self.model.max_features * X.shape[1])
        return getattr(self, "model").fit(X, y, **kwargs)

    def predict(self, X):
        """Predict on test data X.

        Parameters:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors, where n_samples is the number
                of samples and n_features is the number of features.

        Returns:

            model predictions: {array-like}
        """
        if self.level is not None and self.type_fit == "regression":
            return getattr(self, "model").predict(X, return_pi=True)
        else:
            return getattr(self, "model").predict(X)
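
The constructor's only job is to translate the unified arguments (`n_estimators`, `learning_rate`, `max_depth`, `rowsample`, `colsample`, ...) into each backend's native parameter names, stored in `self.params`. A small sketch of that mapping through the `GBDTRegressor` subclass, assuming the CatBoost backend is installed; the printed dictionary follows directly from the branch above:

```python
import unifiedbooster as ub

# rowsample/colsample become 'subsample'/'rsm' for CatBoost,
# 'subsample'/'colsample_bynode' for XGBoost, and so on
reg = ub.GBDTRegressor(
    model_type="catboost",
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    rowsample=0.8,
    colsample=0.8,
)
print(reg.params)
# {'iterations': 200, 'learning_rate': 0.05, 'subsample': 0.8, 'rsm': 0.8,
#  'depth': 4, 'verbose': 0, 'random_seed': 123, 'boosting_type': 'Plain',
#  'leaf_estimation_iterations': 1, 'bootstrap_type': 'Bernoulli'}
```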

class GBDTClassifier(unifiedbooster.GBDT, sklearn.base.ClassifierMixin):
class GBDTClassifier(GBDT, ClassifierMixin):
    """GBDT Classification model

    Attributes:

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        n_estimators: int
            maximum number of trees that can be built

        learning_rate: float
            shrinkage rate; used for reducing the gradient step

        max_depth: int
            maximum tree depth

        rowsample: float
            subsample ratio of the training instances

        colsample: float
            percentage of features to use at each node split

        level: float
            confidence level for prediction sets

        pi_method: str
            method for constructing the prediction sets: 'icp' (inductive conformal), 'tcp' (transductive conformal)

        verbose: int
            controls verbosity (default=0)

        seed: int
            reproducibility seed

        **kwargs: dict
            additional parameters to be passed to the class

    Examples:

        ```python
        import unifiedbooster as ub
        from sklearn.datasets import load_iris
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import accuracy_score

        # Load dataset
        iris = load_iris()
        X, y = iris.data, iris.target

        # Split dataset into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Initialize the unified classifiers (example with XGBoost and LightGBM)
        clf1 = ub.GBDTClassifier(model_type='xgboost')
        #clf2 = ub.GBDTClassifier(model_type='catboost')
        clf3 = ub.GBDTClassifier(model_type='lightgbm')

        # Fit the models
        clf1.fit(X_train, y_train)
        #clf2.fit(X_train, y_train)
        clf3.fit(X_train, y_train)

        # Predict on the test set
        y_pred1 = clf1.predict(X_test)
        #y_pred2 = clf2.predict(X_test)
        y_pred3 = clf3.predict(X_test)

        # Evaluate the models
        accuracy1 = accuracy_score(y_test, y_pred1)
        #accuracy2 = accuracy_score(y_test, y_pred2)
        accuracy3 = accuracy_score(y_test, y_pred3)
        print(f"Classification Accuracy xgboost: {accuracy1:.2f}")
        #print(f"Classification Accuracy catboost: {accuracy2:.2f}")
        print(f"Classification Accuracy lightgbm: {accuracy3:.2f}")
        ```
    """

    def __init__(
        self,
        model_type="xgboost",
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        rowsample=1.0,
        colsample=1.0,
        level=None,
        pi_method="icp",
        verbose=0,
        seed=123,
        **kwargs,
    ):

        self.type_fit = "classification"

        super().__init__(
            model_type=model_type,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            rowsample=rowsample,
            colsample=colsample,
            level=level,
            pi_method=pi_method,
            verbose=verbose,
            seed=seed,
            **kwargs,
        )

        if self.level is not None:

            if model_type == "xgboost":
                self.model = PredictionSet(
                    XGBClassifier(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "catboost":
                self.model = PredictionSet(
                    CatBoostClassifier(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "lightgbm":
                self.model = PredictionSet(
                    LGBMClassifier(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "gradientboosting":
                self.model = PredictionSet(
                    GradientBoostingClassifier(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            else:
                raise ValueError(f"Unknown model_type: {model_type}")

        else:

            if model_type == "xgboost":
                self.model = XGBClassifier(**self.params)
            elif model_type == "catboost":
                self.model = CatBoostClassifier(**self.params)
            elif model_type == "lightgbm":
                self.model = LGBMClassifier(**self.params)
            elif model_type == "gradientboosting":
                self.model = GradientBoostingClassifier(**self.params)
            else:
                raise ValueError(f"Unknown model_type: {model_type}")

    def predict_proba(self, X):
        """Predict class probabilities for test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Test vectors, where n_samples is the number
                of samples and n_features is the number of features.

        Returns:

            probability estimates for test data: {array-like}
        """
        return self.model.predict_proba(X)
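
When `level` is set, the base classifier is wrapped in `PredictionSet` (conformal prediction sets) instead of being used directly. A hedged sketch of both modes; the exact structure of the conformal output is determined by the `PredictionSet` wrapper and is not specified here, and `level=95` (a percentage) is an assumption:

```python
import unifiedbooster as ub
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# plain point predictions and class probabilities
clf = ub.GBDTClassifier(model_type="lightgbm")
clf.fit(X_train, y_train)
probs = clf.predict_proba(X_test)  # shape (n_samples, n_classes)

# conformal prediction sets: the base LGBMClassifier is wrapped in
# PredictionSet with the chosen pi_method ('icp' by default)
clf_set = ub.GBDTClassifier(model_type="lightgbm", level=95, pi_method="icp")
clf_set.fit(X_train, y_train)
sets = clf_set.predict(X_test)  # output format defined by PredictionSet
```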

class GBDTRegressor(unifiedbooster.GBDT, sklearn.base.RegressorMixin):
class GBDTRegressor(GBDT, RegressorMixin):
    """GBDT Regression model

    Attributes:

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        n_estimators: int
            maximum number of trees that can be built

        learning_rate: float
            shrinkage rate; used for reducing the gradient step

        max_depth: int
            maximum tree depth

        rowsample: float
            subsample ratio of the training instances

        colsample: float
            percentage of features to use at each node split

        level: float
            confidence level for prediction intervals

        pi_method: str
            method for constructing the prediction intervals: 'splitconformal', 'localconformal'

        verbose: int
            controls verbosity (default=0)

        seed: int
            reproducibility seed

        **kwargs: dict
            additional parameters to be passed to the class

    Examples:

        ```python
        import unifiedbooster as ub
        from sklearn.datasets import fetch_california_housing
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error

        # Load dataset
        housing = fetch_california_housing()
        X, y = housing.data, housing.target

        # Split dataset into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Initialize the unified regressors (example with XGBoost and LightGBM)
        regressor1 = ub.GBDTRegressor(model_type='xgboost')
        #regressor2 = ub.GBDTRegressor(model_type='catboost')
        regressor3 = ub.GBDTRegressor(model_type='lightgbm')

        # Fit the models
        regressor1.fit(X_train, y_train)
        #regressor2.fit(X_train, y_train)
        regressor3.fit(X_train, y_train)

        # Predict on the test set
        y_pred1 = regressor1.predict(X_test)
        #y_pred2 = regressor2.predict(X_test)
        y_pred3 = regressor3.predict(X_test)

        # Evaluate the models
        mse1 = mean_squared_error(y_test, y_pred1)
        #mse2 = mean_squared_error(y_test, y_pred2)
        mse3 = mean_squared_error(y_test, y_pred3)
        print(f"Regression Mean Squared Error xgboost: {mse1:.2f}")
        #print(f"Regression Mean Squared Error catboost: {mse2:.2f}")
        print(f"Regression Mean Squared Error lightgbm: {mse3:.2f}")
        ```
    """

    def __init__(
        self,
        model_type="xgboost",
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        rowsample=1.0,
        colsample=1.0,
        level=None,
        pi_method="splitconformal",
        verbose=0,
        seed=123,
        **kwargs,
    ):

        self.type_fit = "regression"

        super().__init__(
            model_type=model_type,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            rowsample=rowsample,
            colsample=colsample,
            level=level,
            pi_method=pi_method,
            verbose=verbose,
            seed=seed,
            **kwargs,
        )

        if self.level is not None:

            if model_type == "xgboost":
                self.model = PredictionInterval(
                    XGBRegressor(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "catboost":
                self.model = PredictionInterval(
                    CatBoostRegressor(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "lightgbm":
                self.model = PredictionInterval(
                    LGBMRegressor(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "gradientboosting":
                self.model = PredictionInterval(
                    GradientBoostingRegressor(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            else:
                raise ValueError(f"Unknown model_type: {model_type}")

        else:

            if model_type == "xgboost":
                self.model = XGBRegressor(**self.params)
            elif model_type == "catboost":
                self.model = CatBoostRegressor(**self.params)
            elif model_type == "lightgbm":
                self.model = LGBMRegressor(**self.params)
            elif model_type == "gradientboosting":
                self.model = GradientBoostingRegressor(**self.params)
            else:
                raise ValueError(f"Unknown model_type: {model_type}")
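
For regression, setting `level` wraps the base model in `PredictionInterval`, and `predict` then forwards `return_pi=True` (see `GBDT.predict` above). A hedged sketch; the field names of the returned object (`mean`, `lower`, `upper`) are assumptions about the wrapper, not guaranteed by this code:

```python
import unifiedbooster as ub
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# split-conformal prediction intervals; level assumed to be a percentage
reg = ub.GBDTRegressor(model_type="xgboost", level=95, pi_method="splitconformal")
reg.fit(X_train, y_train)
preds = reg.predict(X_test)  # forwarded as self.model.predict(X, return_pi=True)
# the PredictionInterval wrapper typically returns point predictions plus
# lower/upper bounds, e.g. preds.mean, preds.lower, preds.upper (assumed names)
```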

def cross_val_optim( X_train, y_train, X_test=None, y_test=None, model_type='xgboost', type_fit='classification', scoring='accuracy', n_estimators=None, surrogate_obj=None, cv=5, n_jobs=None, n_init=10, n_iter=190, abs_tol=0.001, verbose=2, seed=123):
def cross_val_optim(
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    model_type="xgboost",
    type_fit="classification",
    scoring="accuracy",
    n_estimators=None,
    surrogate_obj=None,
    cv=5,
    n_jobs=None,
    n_init=10,
    n_iter=190,
    abs_tol=1e-3,
    verbose=2,
    seed=123,
):
    """Cross-validation function and hyperparameters' search

    Parameters:

        X_train: array-like,
            Training vectors, where the rows are the samples
            and the columns are the features.

        y_train: array-like,
            Training target values.

        X_test: array-like,
            Testing vectors, where the rows are the samples
            and the columns are the features.

        y_test: array-like,
            Testing target values.

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        type_fit: str
            "regression" or "classification"

        scoring: str
            scoring metric; see https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules

        n_estimators: int
            maximum number of trees that can be built (default is None; if None, this parameter is tuned)

        surrogate_obj: an object;
            An ML model for estimating the uncertainty around the objective function

        cv: int;
            number of cross-validation folds

        n_jobs: int;
            number of jobs for parallel execution

        n_init: an integer;
            number of points in the initial setting, when `x_init` and `y_init` are not provided

        n_iter: an integer;
            number of iterations of the minimization algorithm

        abs_tol: a float;
            tolerance for convergence of the optimizer (early stopping based on the acquisition function)

        verbose: int
            controls verbosity

        seed: int
            reproducibility seed

    Examples:

        ```python
        import unifiedbooster as ub
        from sklearn.datasets import load_breast_cancer
        from sklearn.model_selection import train_test_split

        dataset = load_breast_cancer()
        X, y = dataset.data, dataset.target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        res1 = ub.cross_val_optim(
            X_train,
            y_train,
            X_test=None,
            y_test=None,
            model_type="lightgbm",
            type_fit="classification",
            scoring="accuracy",
            n_estimators=100,
            surrogate_obj=None,
            cv=5,
            n_jobs=None,
            n_init=10,
            n_iter=190,
            abs_tol=1e-3,
            verbose=2,
            seed=123,
        )
        print(res1)
        ```
    """

    def gbdt_cv(
        X_train,
        y_train,
        model_type="xgboost",
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        rowsample=1.0,
        colsample=1.0,
        cv=5,
        n_jobs=None,
        type_fit="classification",
        scoring="accuracy",
        seed=123,
    ):
        if type_fit == "regression":
            estimator = GBDTRegressor(
                model_type=model_type,
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                rowsample=rowsample,
                colsample=colsample,
                verbose=0,
                seed=seed,
            )
        elif type_fit == "classification":
            estimator = GBDTClassifier(
                model_type=model_type,
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                rowsample=rowsample,
                colsample=colsample,
                verbose=0,
                seed=seed,
            )
        return -cross_val_score(
            estimator,
            X_train,
            y_train,
            scoring=scoring,
            cv=cv,
            n_jobs=n_jobs,
            verbose=0,
        ).mean()

    # objective function for hyperparams tuning
    if n_estimators is not None:

        def crossval_objective(xx):
            return gbdt_cv(
                X_train=X_train,
                y_train=y_train,
                model_type=model_type,
                n_estimators=n_estimators,
                learning_rate=10 ** xx[0],
                max_depth=int(xx[1]),
                rowsample=xx[2],
                colsample=xx[3],
                cv=cv,
                n_jobs=n_jobs,
                type_fit=type_fit,
                scoring=scoring,
                seed=seed,
            )

    else:  # n_estimators is None

        def crossval_objective(xx):
            return gbdt_cv(
                X_train=X_train,
                y_train=y_train,
                model_type=model_type,
                n_estimators=int(10 ** xx[4]),
                learning_rate=10 ** xx[0],
                max_depth=int(xx[1]),
                rowsample=xx[2],
                colsample=xx[3],
                cv=cv,
                n_jobs=n_jobs,
                type_fit=type_fit,
                scoring=scoring,
                seed=seed,
            )

    if n_estimators is not None:
        if surrogate_obj is None:
            gp_opt = gp.GPOpt(
                objective_func=crossval_objective,
                lower_bound=np.array([-6, 1, 0.5, 0.5]),
                upper_bound=np.array([0, 16, 1.0, 1.0]),
                params_names=[
                    "learning_rate",
                    "max_depth",
                    "rowsample",
                    "colsample",
                ],
                method="bayesian",
                n_init=n_init,
                n_iter=n_iter,
                seed=seed,
            )
        else:
            gp_opt = gp.GPOpt(
                objective_func=crossval_objective,
                lower_bound=np.array([-6, 1, 0.5, 0.5]),
                upper_bound=np.array([0, 16, 1.0, 1.0]),
                params_names=[
                    "learning_rate",
                    "max_depth",
                    "rowsample",
                    "colsample",
                ],
                acquisition="ucb",
                method="splitconformal",
                surrogate_obj=ns.PredictionInterval(
                    obj=surrogate_obj, method="splitconformal"
                ),
                n_init=n_init,
                n_iter=n_iter,
                seed=seed,
            )
    else:  # n_estimators is None
        if surrogate_obj is None:
            gp_opt = gp.GPOpt(
                objective_func=crossval_objective,
                lower_bound=np.array([-6, 1, 0.5, 0.5, 2]),
                upper_bound=np.array([0, 16, 1.0, 1.0, 3]),
                params_names=[
                    "learning_rate",
                    "max_depth",
                    "rowsample",
                    "colsample",
                    "n_estimators",
                ],
                method="bayesian",
                n_init=n_init,
                n_iter=n_iter,
                seed=seed,
            )
        else:
            gp_opt = gp.GPOpt(
                objective_func=crossval_objective,
                lower_bound=np.array([-6, 1, 0.5, 0.5, 2]),
                upper_bound=np.array([0, 16, 1.0, 1.0, 3]),
                params_names=[
                    "learning_rate",
                    "max_depth",
                    "rowsample",
                    "colsample",
                    "n_estimators",
                ],
                acquisition="ucb",
                method="splitconformal",
                surrogate_obj=ns.PredictionInterval(
                    obj=surrogate_obj, method="splitconformal"
                ),
                n_init=n_init,
                n_iter=n_iter,
                seed=seed,
            )

    res = gp_opt.optimize(verbose=verbose, abs_tol=abs_tol)
    res.best_params["model_type"] = model_type
    res.best_params["n_estimators"] = (
        int(n_estimators)
        if n_estimators is not None
        else int(10 ** res.best_params["n_estimators"])
    )
    res.best_params["learning_rate"] = 10 ** res.best_params["learning_rate"]
    res.best_params["max_depth"] = int(res.best_params["max_depth"])
    res.best_params["rowsample"] = res.best_params["rowsample"]
    res.best_params["colsample"] = res.best_params["colsample"]

    # out-of-sample error
    if X_test is not None and y_test is not None:
        if type_fit == "regression":
            estimator = GBDTRegressor(**res.best_params, verbose=0, seed=seed)
        elif type_fit == "classification":
            estimator = GBDTClassifier(**res.best_params, verbose=0, seed=seed)
        preds = estimator.fit(X_train, y_train).predict(X_test)
        # check error on y_test
        oos_err = getattr(metrics, scoring + "_score")(
            y_true=y_test, y_pred=preds
        )
        result = namedtuple("result", res._fields + ("test_" + scoring,))
        return result(*res, oos_err)
    else:
        return res
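
`best_params` is returned on the original hyperparameter scale (`learning_rate` and, when tuned, `n_estimators` are already back-transformed from their log-scale search space), so it can be passed straight back to `GBDTClassifier`/`GBDTRegressor`. A short sketch, reusing `X_train`/`y_train` from the docstring example; the reduced `n_iter` is only to keep the run short:

```python
import unifiedbooster as ub

res = ub.cross_val_optim(
    X_train,
    y_train,
    model_type="lightgbm",
    type_fit="classification",
    scoring="accuracy",
    n_estimators=100,
    n_init=10,
    n_iter=50,
)

# refit a final model on the full training set with the selected hyperparameters
best_clf = ub.GBDTClassifier(**res.best_params)
best_clf.fit(X_train, y_train)
```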

def lazy_cross_val_optim( X_train, y_train, X_test=None, y_test=None, model_type='xgboost', type_fit='classification', scoring='accuracy', customize=False, n_estimators=None, cv=5, n_jobs=None, n_init=10, n_iter=190, abs_tol=0.001, verbose=1, seed=123):
def lazy_cross_val_optim(
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    model_type="xgboost",
    type_fit="classification",
    scoring="accuracy",
    customize=False,
    n_estimators=None,
    cv=5,
    n_jobs=None,
    n_init=10,
    n_iter=190,
    abs_tol=1e-3,
    verbose=1,
    seed=123,
):
    """Automated cross-validation function and hyperparameters' search using multiple surrogates

    Parameters:

        X_train: array-like,
            Training vectors, where the rows are the samples
            and the columns are the features.

        y_train: array-like,
            Training target values.

        X_test: array-like,
            Testing vectors, where the rows are the samples
            and the columns are the features.

        y_test: array-like,
            Testing target values.

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        type_fit: str
            "regression" or "classification"

        scoring: str
            scoring metric; see https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules

        customize: boolean
            if True, the surrogate is transformed into a quasi-randomized network (default is False)

        n_estimators: int
            maximum number of trees that can be built (default is None; if None, this parameter is tuned)

        cv: int;
            number of cross-validation folds

        n_jobs: int;
            number of jobs for parallel execution

        n_init: an integer;
            number of points in the initial setting, when `x_init` and `y_init` are not provided

        n_iter: an integer;
            number of iterations of the minimization algorithm

        abs_tol: a float;
            tolerance for convergence of the optimizer (early stopping based on the acquisition function)

        verbose: int
            controls verbosity

        seed: int
            reproducibility seed

    Examples:

        ```python
        import os
        import unifiedbooster as ub
        from sklearn.datasets import load_breast_cancer
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import accuracy_score
        from time import time

        print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")

        dataset = load_breast_cancer()
        X, y = dataset.data, dataset.target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        start = time()
        res4 = ub.lazy_cross_val_optim(
            X_train,
            y_train,
            X_test=X_test,
            y_test=y_test,
            model_type="lightgbm",
            type_fit="classification",
            scoring="accuracy",
            n_estimators=100,
            cv=5,
            n_jobs=None,
            n_init=10,
            n_iter=190,
            abs_tol=1e-3,
            seed=123,
            customize=False
        )
        print(f"Elapsed: {time()-start}")
        print(res4)
        ```
    """

    removed_regressors = [
        "TheilSenRegressor",
        "ARDRegression",
        "CCA",
        "GaussianProcessRegressor",
        "GradientBoostingRegressor",
        "HistGradientBoostingRegressor",
        "IsotonicRegression",
        "MultiOutputRegressor",
        "MultiTaskElasticNet",
        "MultiTaskElasticNetCV",
        "MultiTaskLasso",
        "MultiTaskLassoCV",
        "OrthogonalMatchingPursuit",
        "OrthogonalMatchingPursuitCV",
        "PLSCanonical",
        "PLSRegression",
        "RadiusNeighborsRegressor",
        "RegressorChain",
        "StackingRegressor",
        "VotingRegressor",
    ]

    results = []

    for est in all_estimators():
        if issubclass(est[1], RegressorMixin) and (
            est[0] not in removed_regressors
        ):
            try:
                if customize:
                    print(f"\n surrogate: CustomRegressor({est[0]})")
                    surr_obj = ns.CustomRegressor(obj=est[1]())
                else:
                    print(f"\n surrogate: {est[0]}")
                    surr_obj = est[1]()
                res = cross_val_optim(
                    X_train=X_train,
                    y_train=y_train,
                    X_test=X_test,
                    y_test=y_test,
                    model_type=model_type,
                    n_estimators=n_estimators,
                    surrogate_obj=surr_obj,
                    cv=cv,
                    n_jobs=n_jobs,
                    type_fit=type_fit,
                    scoring=scoring,
                    n_init=n_init,
                    n_iter=n_iter,
                    abs_tol=abs_tol,
                    verbose=verbose,
                    seed=seed,
                )
                print(f"\n result: {res}")
                if customize:
                    results.append((f"CustomRegressor({est[0]})", res))
                else:
                    results.append((est[0], res))
            except Exception:
                pass

    return results
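
The return value is a list of `(surrogate_name, result)` tuples, one per scikit-learn regressor tried as a surrogate, where each `result` is the output of `cross_val_optim` and carries a `test_<scoring>` field when test data is supplied. A sketch of ranking the surrogates by held-out accuracy, assuming `scoring="accuracy"` and the train/test split from the docstring example:

```python
results = ub.lazy_cross_val_optim(
    X_train,
    y_train,
    X_test=X_test,
    y_test=y_test,
    model_type="lightgbm",
    type_fit="classification",
    scoring="accuracy",
    n_estimators=100,
    n_iter=50,
)

# rank surrogates by out-of-sample accuracy (highest first)
ranked = sorted(results, key=lambda item: item[1].test_accuracy, reverse=True)
for name, res in ranked[:3]:
    print(name, res.test_accuracy, res.best_params)
```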
