unifiedbooster
```python
from .gbdt import GBDT
from .gbdt_classification import GBDTClassifier
from .gbdt_regression import GBDTRegressor
from .gpoptimization import cross_val_optim, lazy_cross_val_optim

__all__ = [
    "GBDT",
    "GBDTClassifier",
    "GBDTRegressor",
    "cross_val_optim",
    "lazy_cross_val_optim",
]
```
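Everything listed in `__all__` is re-exported at the package root, so the estimators and the optimization helpers can be imported directly from `unifiedbooster`. A minimal sketch, assuming the package is installed:

```python
import unifiedbooster as ub

# Names exposed at the package root, as declared in __all__ above
print(ub.__all__)
# ['GBDT', 'GBDTClassifier', 'GBDTRegressor', 'cross_val_optim', 'lazy_cross_val_optim']
```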
class GBDT(sklearn.base.BaseEstimator):
```python
class GBDT(BaseEstimator):
    """Gradient Boosted Decision Trees (GBDT) base class

    Attributes:

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        n_estimators: int
            maximum number of trees that can be built

        learning_rate: float
            shrinkage rate; used for reducing the gradient step

        max_depth: int
            maximum tree depth

        rowsample: float
            subsample ratio of the training instances

        colsample: float
            percentage of features to use at each node split

        verbose: int
            controls verbosity (default=0)

        seed: int
            reproducibility seed

        **kwargs: dict
            additional parameters to be passed to the class
    """

    def __init__(
        self,
        model_type="xgboost",
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        rowsample=1.0,
        colsample=1.0,
        level=None,
        pi_method=None,
        verbose=0,
        seed=123,
        **kwargs
    ):

        self.model_type = model_type
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.rowsample = rowsample
        self.colsample = colsample
        self.level = level
        self.pi_method = pi_method
        self.verbose = verbose
        self.seed = seed

        if self.model_type == "xgboost":
            self.params = {
                "n_estimators": self.n_estimators,
                "learning_rate": self.learning_rate,
                "subsample": self.rowsample,
                "colsample_bynode": self.colsample,
                "max_depth": self.max_depth,
                "verbosity": self.verbose,
                "seed": self.seed,
                **kwargs,
            }
        elif self.model_type == "lightgbm":
            verbose = self.verbose - 1 if self.verbose == 0 else self.verbose
            self.params = {
                "n_estimators": self.n_estimators,
                "learning_rate": self.learning_rate,
                "subsample": self.rowsample,
                "feature_fraction_bynode": self.colsample,
                "max_depth": self.max_depth,
                "verbose": verbose,  # keep this way
                "seed": self.seed,
                **kwargs,
            }
        elif self.model_type == "catboost":
            self.params = {
                "iterations": self.n_estimators,
                "learning_rate": self.learning_rate,
                "subsample": self.rowsample,
                "rsm": self.colsample,
                "depth": self.max_depth,
                "verbose": self.verbose,
                "random_seed": self.seed,
                "boosting_type": "Plain",
                "leaf_estimation_iterations": 1,
                "bootstrap_type": "Bernoulli",
                **kwargs,
            }
        elif self.model_type == "gradientboosting":
            self.params = {
                "n_estimators": self.n_estimators,
                "learning_rate": self.learning_rate,
                "subsample": self.rowsample,
                "max_features": self.colsample,
                "max_depth": self.max_depth,
                "verbose": self.verbose,
                "random_state": self.seed,
                **kwargs,
            }

    def fit(self, X, y, **kwargs):
        """Fit custom model to training data (X, y).

        Parameters:

            X: {array-like}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            y: array-like, shape = [n_samples]
                Target values.

            **kwargs: additional parameters to be passed to
                self.cook_training_set or self.obj.fit

        Returns:

            self: object
        """
        if getattr(self, "type_fit") == "classification":
            self.classes_ = np.unique(y)  # for compatibility with sklearn
            self.n_classes_ = len(self.classes_)  # for compatibility with sklearn
        if getattr(self, "model_type") == "gradientboosting":
            self.model.max_features = int(self.model.max_features * X.shape[1])
        return getattr(self, "model").fit(X, y, **kwargs)

    def predict(self, X):
        """Predict test data X.

        Parameters:

            X: {array-like}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            **kwargs: additional parameters to be passed to
                self.cook_test_set

        Returns:

            model predictions: {array-like}
        """
        if self.level is not None and self.type_fit == "regression":
            return getattr(self, "model").predict(X, return_pi=True)
        else:
            return getattr(self, "model").predict(X)
```
Gradient Boosted Decision Trees (GBDT) base class

Attributes:

    model_type: str
        type of gradient boosting algorithm: 'xgboost', 'lightgbm', 'catboost', 'gradientboosting'

    n_estimators: int
        maximum number of trees that can be built

    learning_rate: float
        shrinkage rate; used for reducing the gradient step

    max_depth: int
        maximum tree depth

    rowsample: float
        subsample ratio of the training instances

    colsample: float
        percentage of features to use at each node split

    verbose: int
        controls verbosity (default=0)

    seed: int
        reproducibility seed

    **kwargs: dict
        additional parameters to be passed to the class
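The constructor translates these unified arguments into each backend's native parameter names and stores the result in the `params` attribute, as the source above shows. A minimal sketch of that mapping, assuming the xgboost backend is installed (the concrete values are illustrative only):

```python
import unifiedbooster as ub

# The same unified arguments are renamed per backend: for xgboost, `rowsample`
# becomes 'subsample' and `colsample` becomes 'colsample_bynode'.
clf = ub.GBDTClassifier(
    model_type="xgboost",
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    rowsample=0.8,
    colsample=0.8,
)
print(clf.params)
# expected keys: n_estimators, learning_rate, subsample, colsample_bynode,
# max_depth, verbosity, seed
```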
def fit(self, X, y, **kwargs):
```python
def fit(self, X, y, **kwargs):
    """Fit custom model to training data (X, y).

    Parameters:

        X: {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number
            of samples and n_features is the number of features.

        y: array-like, shape = [n_samples]
            Target values.

        **kwargs: additional parameters to be passed to
            self.cook_training_set or self.obj.fit

    Returns:

        self: object
    """
    if getattr(self, "type_fit") == "classification":
        self.classes_ = np.unique(y)  # for compatibility with sklearn
        self.n_classes_ = len(self.classes_)  # for compatibility with sklearn
    if getattr(self, "model_type") == "gradientboosting":
        self.model.max_features = int(self.model.max_features * X.shape[1])
    return getattr(self, "model").fit(X, y, **kwargs)
```
Fit custom model to training data (X, y).

Parameters:

    X: {array-like}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and n_features is the number of features.

    y: array-like, shape = [n_samples]
        Target values.

    **kwargs: additional parameters to be passed to self.cook_training_set or self.obj.fit

Returns:

    self: object
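One behavior worth noting from the source above: for the scikit-learn backend (`model_type='gradientboosting'`), the fractional `colsample` is converted into an absolute feature count at fit time. A minimal sketch, assuming scikit-learn is installed (the printed value is what the code above implies for a 4-feature dataset):

```python
import unifiedbooster as ub
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)  # 4 features

# For 'gradientboosting', fit() rescales max_features from the fractional
# colsample to an integer count: int(colsample * n_features).
clf = ub.GBDTClassifier(model_type="gradientboosting", colsample=0.5)
clf.fit(X, y)
print(clf.model.max_features)  # int(0.5 * 4) == 2
```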
def predict(self, X):
```python
def predict(self, X):
    """Predict test data X.

    Parameters:

        X: {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number
            of samples and n_features is the number of features.

        **kwargs: additional parameters to be passed to
            self.cook_test_set

    Returns:

        model predictions: {array-like}
    """
    if self.level is not None and self.type_fit == "regression":
        return getattr(self, "model").predict(X, return_pi=True)
    else:
        return getattr(self, "model").predict(X)
```
Predict test data X.

Parameters:

    X: {array-like}, shape = [n_samples, n_features]
        Test vectors, where n_samples is the number of samples and n_features is the number of features.

Returns:

    model predictions: {array-like}

When `level` is set on a regression model, the underlying conformal wrapper is asked to return prediction intervals as well (see the source above).
````python
class GBDTClassifier(GBDT, ClassifierMixin):
    """GBDT Classification model

    Attributes:

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        n_estimators: int
            maximum number of trees that can be built

        learning_rate: float
            shrinkage rate; used for reducing the gradient step

        max_depth: int
            maximum tree depth

        rowsample: float
            subsample ratio of the training instances

        colsample: float
            percentage of features to use at each node split

        level: float
            confidence level for prediction sets

        pi_method: str
            method for constructing the prediction intervals: 'icp' (inductive conformal), 'tcp' (transductive conformal)

        verbose: int
            controls verbosity (default=0)

        seed: int
            reproducibility seed

        **kwargs: dict
            additional parameters to be passed to the class

    Examples:

        ```python
        import unifiedbooster as ub
        from sklearn.datasets import load_iris
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import accuracy_score

        # Load dataset
        iris = load_iris()
        X, y = iris.data, iris.target

        # Split dataset into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Initialize the unified regressor (example with XGBoost)
        regressor1 = ub.GBDTClassifier(model_type='xgboost')
        #regressor2 = ub.GBDTClassifier(model_type='catboost')
        regressor3 = ub.GBDTClassifier(model_type='lightgbm')

        # Fit the model
        regressor1.fit(X_train, y_train)
        #regressor2.fit(X_train, y_train)
        regressor3.fit(X_train, y_train)

        # Predict on the test set
        y_pred1 = regressor1.predict(X_test)
        #y_pred2 = regressor2.predict(X_test)
        y_pred3 = regressor3.predict(X_test)

        # Evaluate the model
        accuracy1 = accuracy_score(y_test, y_pred1)
        #accuracy2 = accuracy_score(y_test, y_pred2)
        accuracy3 = accuracy_score(y_test, y_pred3)
        print(f"Classification Accuracy xgboost: {accuracy1:.2f}")
        #print(f"Classification Accuracy catboost: {accuracy2:.2f}")
        print(f"Classification Accuracy lightgbm: {accuracy3:.2f}")
        ```
    """

    def __init__(
        self,
        model_type="xgboost",
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        rowsample=1.0,
        colsample=1.0,
        level=None,
        pi_method="icp",
        verbose=0,
        seed=123,
        **kwargs,
    ):

        self.type_fit = "classification"

        super().__init__(
            model_type=model_type,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            rowsample=rowsample,
            colsample=colsample,
            level=level,
            pi_method=pi_method,
            verbose=verbose,
            seed=seed,
            **kwargs,
        )

        if self.level is not None:

            if model_type == "xgboost":
                self.model = PredictionSet(
                    XGBClassifier(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "catboost":
                self.model = PredictionSet(
                    CatBoostClassifier(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "lightgbm":
                self.model = PredictionSet(
                    LGBMClassifier(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "gradientboosting":
                self.model = PredictionSet(
                    GradientBoostingClassifier(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            else:
                raise ValueError(f"Unknown model_type: {model_type}")

        else:

            if model_type == "xgboost":
                self.model = XGBClassifier(**self.params)
            elif model_type == "catboost":
                self.model = CatBoostClassifier(**self.params)
            elif model_type == "lightgbm":
                self.model = LGBMClassifier(**self.params)
            elif model_type == "gradientboosting":
                self.model = GradientBoostingClassifier(**self.params)
            else:
                raise ValueError(f"Unknown model_type: {model_type}")

    def predict_proba(self, X):
        """Predict probabilities for test data X.

        Args:

            X: {array-like}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number
                of samples and n_features is the number of features.

            **kwargs: additional parameters to be passed to
                self.cook_test_set

        Returns:

            probability estimates for test data: {array-like}
        """
        return self.model.predict_proba(X)
````
GBDT Classification model

Attributes:

    model_type: str
        type of gradient boosting algorithm: 'xgboost', 'lightgbm', 'catboost', 'gradientboosting'

    n_estimators: int
        maximum number of trees that can be built

    learning_rate: float
        shrinkage rate; used for reducing the gradient step

    max_depth: int
        maximum tree depth

    rowsample: float
        subsample ratio of the training instances

    colsample: float
        percentage of features to use at each node split

    level: float
        confidence level for prediction sets

    pi_method: str
        method for constructing the prediction sets: 'icp' (inductive conformal), 'tcp' (transductive conformal)

    verbose: int
        controls verbosity (default=0)

    seed: int
        reproducibility seed

    **kwargs: dict
        additional parameters to be passed to the class
Examples:

```python
import unifiedbooster as ub
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load dataset
iris = load_iris()
X, y = iris.data, iris.target

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the unified classifiers (example with XGBoost and LightGBM)
regressor1 = ub.GBDTClassifier(model_type='xgboost')
#regressor2 = ub.GBDTClassifier(model_type='catboost')
regressor3 = ub.GBDTClassifier(model_type='lightgbm')

# Fit the models
regressor1.fit(X_train, y_train)
#regressor2.fit(X_train, y_train)
regressor3.fit(X_train, y_train)

# Predict on the test set
y_pred1 = regressor1.predict(X_test)
#y_pred2 = regressor2.predict(X_test)
y_pred3 = regressor3.predict(X_test)

# Evaluate the models
accuracy1 = accuracy_score(y_test, y_pred1)
#accuracy2 = accuracy_score(y_test, y_pred2)
accuracy3 = accuracy_score(y_test, y_pred3)
print(f"Classification Accuracy xgboost: {accuracy1:.2f}")
#print(f"Classification Accuracy catboost: {accuracy2:.2f}")
print(f"Classification Accuracy lightgbm: {accuracy3:.2f}")
```
def predict_proba(self, X):
```python
def predict_proba(self, X):
    """Predict probabilities for test data X.

    Args:

        X: {array-like}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number
            of samples and n_features is the number of features.

        **kwargs: additional parameters to be passed to
            self.cook_test_set

    Returns:

        probability estimates for test data: {array-like}
    """
    return self.model.predict_proba(X)
```
Predict probabilities for test data X.

Args:

    X: {array-like}, shape = [n_samples, n_features]
        Test vectors, where n_samples is the number of samples and n_features is the number of features.

Returns:

    probability estimates for test data: {array-like}
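A brief usage sketch, reusing the fitted iris classifier from the GBDTClassifier example above (the shapes shown are illustrative):

```python
# Class probabilities for the test set: one row per sample, one column per class.
proba = regressor1.predict_proba(X_test)
print(proba.shape)  # (n_test_samples, n_classes)
print(proba[:3])
```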
````python
class GBDTRegressor(GBDT, RegressorMixin):
    """GBDT Regression model

    Attributes:

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        n_estimators: int
            maximum number of trees that can be built

        learning_rate: float
            shrinkage rate; used for reducing the gradient step

        max_depth: int
            maximum tree depth

        rowsample: float
            subsample ratio of the training instances

        colsample: float
            percentage of features to use at each node split

        level: float
            confidence level for prediction sets

        pi_method: str
            method for constructing the prediction intervals: 'splitconformal', 'localconformal'

        verbose: int
            controls verbosity (default=0)

        seed: int
            reproducibility seed

        **kwargs: dict
            additional parameters to be passed to the class

    Examples:

        ```python
        import unifiedbooster as ub
        from sklearn.datasets import fetch_california_housing
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import mean_squared_error

        # Load dataset
        housing = fetch_california_housing()
        X, y = housing.data, housing.target

        # Split dataset into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Initialize the unified regressor (example with XGBoost)
        regressor1 = ub.GBDTRegressor(model_type='xgboost')
        #regressor2 = ub.GBDTRegressor(model_type='catboost')
        regressor3 = ub.GBDTRegressor(model_type='lightgbm')

        # Fit the model
        regressor1.fit(X_train, y_train)
        #regressor2.fit(X_train, y_train)
        regressor3.fit(X_train, y_train)

        # Predict on the test set
        y_pred1 = regressor1.predict(X_test)
        #y_pred2 = regressor2.predict(X_test)
        y_pred3 = regressor3.predict(X_test)

        # Evaluate the model
        mse1 = mean_squared_error(y_test, y_pred1)
        #mse2 = mean_squared_error(y_test, y_pred2)
        mse3 = mean_squared_error(y_test, y_pred3)
        print(f"Regression Mean Squared Error xgboost: {mse1:.2f}")
        #print(f"Regression Mean Squared Error catboost: {mse2:.2f}")
        print(f"Regression Mean Squared Error lightgbm: {mse3:.2f}")
        ```
    """

    def __init__(
        self,
        model_type="xgboost",
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        rowsample=1.0,
        colsample=1.0,
        level=None,
        pi_method="splitconformal",
        verbose=0,
        seed=123,
        **kwargs,
    ):

        self.type_fit = "regression"

        super().__init__(
            model_type=model_type,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            rowsample=rowsample,
            colsample=colsample,
            level=level,
            pi_method=pi_method,
            verbose=verbose,
            seed=seed,
            **kwargs,
        )

        if self.level is not None:

            if model_type == "xgboost":
                self.model = PredictionInterval(
                    XGBRegressor(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "catboost":
                self.model = PredictionInterval(
                    CatBoostRegressor(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "lightgbm":
                self.model = PredictionInterval(
                    LGBMRegressor(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            elif model_type == "gradientboosting":
                self.model = PredictionInterval(
                    GradientBoostingRegressor(**self.params),
                    level=self.level,
                    method=self.pi_method,
                )
            else:
                raise ValueError(f"Unknown model_type: {model_type}")

        else:

            if model_type == "xgboost":
                self.model = XGBRegressor(**self.params)
            elif model_type == "catboost":
                self.model = CatBoostRegressor(**self.params)
            elif model_type == "lightgbm":
                self.model = LGBMRegressor(**self.params)
            elif model_type == "gradientboosting":
                self.model = GradientBoostingRegressor(**self.params)
            else:
                raise ValueError(f"Unknown model_type: {model_type}")
````
GBDT Regression model

Attributes:

    model_type: str
        type of gradient boosting algorithm: 'xgboost', 'lightgbm', 'catboost', 'gradientboosting'

    n_estimators: int
        maximum number of trees that can be built

    learning_rate: float
        shrinkage rate; used for reducing the gradient step

    max_depth: int
        maximum tree depth

    rowsample: float
        subsample ratio of the training instances

    colsample: float
        percentage of features to use at each node split

    level: float
        confidence level for prediction intervals

    pi_method: str
        method for constructing the prediction intervals: 'splitconformal', 'localconformal'

    verbose: int
        controls verbosity (default=0)

    seed: int
        reproducibility seed

    **kwargs: dict
        additional parameters to be passed to the class
Examples:

```python
import unifiedbooster as ub
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load dataset
housing = fetch_california_housing()
X, y = housing.data, housing.target

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the unified regressor (example with XGBoost)
regressor1 = ub.GBDTRegressor(model_type='xgboost')
#regressor2 = ub.GBDTRegressor(model_type='catboost')
regressor3 = ub.GBDTRegressor(model_type='lightgbm')

# Fit the model
regressor1.fit(X_train, y_train)
#regressor2.fit(X_train, y_train)
regressor3.fit(X_train, y_train)

# Predict on the test set
y_pred1 = regressor1.predict(X_test)
#y_pred2 = regressor2.predict(X_test)
y_pred3 = regressor3.predict(X_test)

# Evaluate the model
mse1 = mean_squared_error(y_test, y_pred1)
#mse2 = mean_squared_error(y_test, y_pred2)
mse3 = mean_squared_error(y_test, y_pred3)
print(f"Regression Mean Squared Error xgboost: {mse1:.2f}")
#print(f"Regression Mean Squared Error catboost: {mse2:.2f}")
print(f"Regression Mean Squared Error lightgbm: {mse3:.2f}")
```
def cross_val_optim(X_train, y_train, X_test=None, y_test=None, model_type='xgboost', type_fit='classification', scoring='accuracy', n_estimators=None, surrogate_obj=None, cv=5, n_jobs=None, n_init=10, n_iter=190, abs_tol=0.001, verbose=2, seed=123):
````python
def cross_val_optim(
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    model_type="xgboost",
    type_fit="classification",
    scoring="accuracy",
    n_estimators=None,
    surrogate_obj=None,
    cv=5,
    n_jobs=None,
    n_init=10,
    n_iter=190,
    abs_tol=1e-3,
    verbose=2,
    seed=123,
):
    """Cross-validation function and hyperparameters' search

    Parameters:

        X_train: array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.

        y_train: array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.

        X_test: array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        y_test: array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        type_fit: str
            "regression" or "classification"

        scoring: str
            scoring metric; see https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules

        n_estimators: int
            maximum number of trees that can be built (default is None, and if None, then the parameter is tuned)

        surrogate_obj: an object;
            An ML model for estimating the uncertainty around the objective function

        cv: int;
            number of cross-validation folds

        n_jobs: int;
            number of jobs for parallel execution

        n_init: an integer;
            number of points in the initial setting, when `x_init` and `y_init` are not provided

        n_iter: an integer;
            number of iterations of the minimization algorithm

        abs_tol: a float;
            tolerance for convergence of the optimizer (early stopping based on acquisition function)

        verbose: int
            controls verbosity

        seed: int
            reproducibility seed

    Examples:

        ```python
        import unifiedbooster as ub
        from sklearn.datasets import load_breast_cancer
        from sklearn.model_selection import train_test_split

        dataset = load_breast_cancer()
        X, y = dataset.data, dataset.target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        res1 = ub.cross_val_optim(
            X_train,
            y_train,
            X_test=None,
            y_test=None,
            model_type="lightgbm",
            type_fit="classification",
            scoring="accuracy",
            n_estimators=100,
            surrogate_obj=None,
            cv=5,
            n_jobs=None,
            n_init=10,
            n_iter=190,
            abs_tol=1e-3,
            verbose=2,
            seed=123,
        )
        print(res1)
        ```
    """

    def gbdt_cv(
        X_train,
        y_train,
        model_type="xgboost",
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        rowsample=1.0,
        colsample=1.0,
        cv=5,
        n_jobs=None,
        type_fit="classification",
        scoring="accuracy",
        seed=123,
    ):
        if type_fit == "regression":
            estimator = GBDTRegressor(
                model_type=model_type,
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                rowsample=rowsample,
                colsample=colsample,
                verbose=0,
                seed=seed,
            )
        elif type_fit == "classification":
            estimator = GBDTClassifier(
                model_type=model_type,
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth,
                rowsample=rowsample,
                colsample=colsample,
                verbose=0,
                seed=seed,
            )
        return -cross_val_score(
            estimator,
            X_train,
            y_train,
            scoring=scoring,
            cv=cv,
            n_jobs=n_jobs,
            verbose=0,
        ).mean()

    # objective function for hyperparams tuning
    if n_estimators is not None:

        def crossval_objective(xx):
            return gbdt_cv(
                X_train=X_train,
                y_train=y_train,
                model_type=model_type,
                n_estimators=n_estimators,
                learning_rate=10 ** xx[0],
                max_depth=int(xx[1]),
                rowsample=xx[2],
                colsample=xx[3],
                cv=cv,
                n_jobs=n_jobs,
                type_fit=type_fit,
                scoring=scoring,
                seed=seed,
            )

    else:  # n_estimators is None

        def crossval_objective(xx):
            return gbdt_cv(
                X_train=X_train,
                y_train=y_train,
                model_type=model_type,
                n_estimators=int(10 ** xx[4]),
                learning_rate=10 ** xx[0],
                max_depth=int(xx[1]),
                rowsample=xx[2],
                colsample=xx[3],
                cv=cv,
                n_jobs=n_jobs,
                type_fit=type_fit,
                scoring=scoring,
                seed=seed,
            )

    if n_estimators is not None:
        if surrogate_obj is None:
            gp_opt = gp.GPOpt(
                objective_func=crossval_objective,
                lower_bound=np.array([-6, 1, 0.5, 0.5]),
                upper_bound=np.array([0, 16, 1.0, 1.0]),
                params_names=[
                    "learning_rate",
                    "max_depth",
                    "rowsample",
                    "colsample",
                ],
                method="bayesian",
                n_init=n_init,
                n_iter=n_iter,
                seed=seed,
            )
        else:
            gp_opt = gp.GPOpt(
                objective_func=crossval_objective,
                lower_bound=np.array([-6, 1, 0.5, 0.5]),
                upper_bound=np.array([0, 16, 1.0, 1.0]),
                params_names=[
                    "learning_rate",
                    "max_depth",
                    "rowsample",
                    "colsample",
                ],
                acquisition="ucb",
                method="splitconformal",
                surrogate_obj=ns.PredictionInterval(
                    obj=surrogate_obj, method="splitconformal"
                ),
                n_init=n_init,
                n_iter=n_iter,
                seed=seed,
            )
    else:  # n_estimators is None
        if surrogate_obj is None:
            gp_opt = gp.GPOpt(
                objective_func=crossval_objective,
                lower_bound=np.array([-6, 1, 0.5, 0.5, 2]),
                upper_bound=np.array([0, 16, 1.0, 1.0, 3]),
                params_names=[
                    "learning_rate",
                    "max_depth",
                    "rowsample",
                    "colsample",
                    "n_estimators",
                ],
                method="bayesian",
                n_init=n_init,
                n_iter=n_iter,
                seed=seed,
            )
        else:
            gp_opt = gp.GPOpt(
                objective_func=crossval_objective,
                lower_bound=np.array([-6, 1, 0.5, 0.5, 2]),
                upper_bound=np.array([0, 16, 1.0, 1.0, 3]),
                params_names=[
                    "learning_rate",
                    "max_depth",
                    "rowsample",
                    "colsample",
                    "n_estimators",
                ],
                acquisition="ucb",
                method="splitconformal",
                surrogate_obj=ns.PredictionInterval(
                    obj=surrogate_obj, method="splitconformal"
                ),
                n_init=n_init,
                n_iter=n_iter,
                seed=seed,
            )

    res = gp_opt.optimize(verbose=verbose, abs_tol=abs_tol)
    res.best_params["model_type"] = model_type
    res.best_params["n_estimators"] = (
        int(n_estimators)
        if n_estimators is not None
        else int(10 ** res.best_params["n_estimators"])
    )
    res.best_params["learning_rate"] = 10 ** res.best_params["learning_rate"]
    res.best_params["max_depth"] = int(res.best_params["max_depth"])
    res.best_params["rowsample"] = res.best_params["rowsample"]
    res.best_params["colsample"] = res.best_params["colsample"]

    # out-of-sample error
    if X_test is not None and y_test is not None:
        if type_fit == "regression":
            estimator = GBDTRegressor(**res.best_params, verbose=0, seed=seed)
        elif type_fit == "classification":
            estimator = GBDTClassifier(**res.best_params, verbose=0, seed=seed)
        preds = estimator.fit(X_train, y_train).predict(X_test)
        # check error on y_test
        oos_err = getattr(metrics, scoring + "_score")(
            y_true=y_test, y_pred=preds
        )
        result = namedtuple("result", res._fields + ("test_" + scoring,))
        return result(*res, oos_err)
    else:
        return res
````
Cross-validation function and hyperparameters' search

Parameters:

    X_train: array-like
        Training vectors, with samples in rows and features in columns.

    y_train: array-like
        Training target values.

    X_test: array-like
        Testing vectors, with samples in rows and features in columns.

    y_test: array-like
        Testing target values.

    model_type: str
        type of gradient boosting algorithm: 'xgboost', 'lightgbm', 'catboost', 'gradientboosting'

    type_fit: str
        "regression" or "classification"

    scoring: str
        scoring metric; see https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules

    n_estimators: int
        maximum number of trees that can be built (default is None; if None, the parameter is tuned)

    surrogate_obj: an object
        an ML model for estimating the uncertainty around the objective function

    cv: int
        number of cross-validation folds

    n_jobs: int
        number of jobs for parallel execution

    n_init: int
        number of points in the initial setting, when `x_init` and `y_init` are not provided

    n_iter: int
        number of iterations of the minimization algorithm

    abs_tol: float
        tolerance for convergence of the optimizer (early stopping based on the acquisition function)

    verbose: int
        controls verbosity

    seed: int
        reproducibility seed
Examples:

```python
import unifiedbooster as ub
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

dataset = load_breast_cancer()
X, y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

res1 = ub.cross_val_optim(
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    model_type="lightgbm",
    type_fit="classification",
    scoring="accuracy",
    n_estimators=100,
    surrogate_obj=None,
    cv=5,
    n_jobs=None,
    n_init=10,
    n_iter=190,
    abs_tol=1e-3,
    verbose=2,
    seed=123,
)
print(res1)
```
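The returned object exposes the tuned configuration in `best_params` (with `model_type` and `n_estimators` already filled in, as the source above shows), so it can be unpacked straight into a new estimator. A brief sketch continuing the breast-cancer example:

```python
# Refit a final model on the full training set with the tuned hyperparameters.
best_clf = ub.GBDTClassifier(**res1.best_params)
best_clf.fit(X_train, y_train)
print(best_clf.score(X_test, y_test))  # held-out accuracy, via the sklearn mixin
```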
def lazy_cross_val_optim(X_train, y_train, X_test=None, y_test=None, model_type='xgboost', type_fit='classification', scoring='accuracy', customize=False, n_estimators=None, cv=5, n_jobs=None, n_init=10, n_iter=190, abs_tol=0.001, verbose=1, seed=123):
````python
def lazy_cross_val_optim(
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    model_type="xgboost",
    type_fit="classification",
    scoring="accuracy",
    customize=False,
    n_estimators=None,
    cv=5,
    n_jobs=None,
    n_init=10,
    n_iter=190,
    abs_tol=1e-3,
    verbose=1,
    seed=123,
):
    """Automated Cross-validation function and hyperparameters' search using multiple surrogates

    Parameters:

        X_train: array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.

        y_train: array-like,
            Training vectors, where rows is the number of samples
            and columns is the number of features.

        X_test: array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        y_test: array-like,
            Testing vectors, where rows is the number of samples
            and columns is the number of features.

        model_type: str
            type of gradient boosting algorithm: 'xgboost', 'lightgbm',
            'catboost', 'gradientboosting'

        type_fit: str
            "regression" or "classification"

        scoring: str
            scoring metric; see https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules

        customize: boolean
            if True, the surrogate is transformed into a quasi-randomized network (default is False)

        n_estimators: int
            maximum number of trees that can be built (default is None, if None, the parameters is tuned)

        cv: int;
            number of cross-validation folds

        n_jobs: int;
            number of jobs for parallel execution

        n_init: an integer;
            number of points in the initial setting, when `x_init` and `y_init` are not provided

        n_iter: an integer;
            number of iterations of the minimization algorithm

        abs_tol: a float;
            tolerance for convergence of the optimizer (early stopping based on acquisition function)

        verbose: int
            controls verbosity

        seed: int
            reproducibility seed

    Examples:

        ```python
        import os
        import unifiedbooster as ub
        from sklearn.datasets import load_breast_cancer
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import accuracy_score
        from time import time

        print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")

        dataset = load_breast_cancer()
        X, y = dataset.data, dataset.target
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        start = time()
        res4 = ub.lazy_cross_val_optim(
            X_train,
            y_train,
            X_test=X_test,
            y_test=y_test,
            model_type="lightgbm",
            type_fit="classification",
            scoring="accuracy",
            n_estimators=100,
            cv=5,
            n_jobs=None,
            n_init=10,
            n_iter=190,
            abs_tol=1e-3,
            seed=123,
            customize=False
        )
        print(f"Elapsed: {time()-start}")
        print(res4)
        ```
    """

    removed_regressors = [
        "TheilSenRegressor",
        "ARDRegression",
        "CCA",
        "GaussianProcessRegressor",
        "GradientBoostingRegressor",
        "HistGradientBoostingRegressor",
        "IsotonicRegression",
        "MultiOutputRegressor",
        "MultiTaskElasticNet",
        "MultiTaskElasticNetCV",
        "MultiTaskLasso",
        "MultiTaskLassoCV",
        "OrthogonalMatchingPursuit",
        "OrthogonalMatchingPursuitCV",
        "PLSCanonical",
        "PLSRegression",
        "RadiusNeighborsRegressor",
        "RegressorChain",
        "StackingRegressor",
        "VotingRegressor",
    ]

    results = []

    for est in all_estimators():
        if issubclass(est[1], RegressorMixin) and (
            est[0] not in removed_regressors
        ):
            try:
                if customize == True:
                    print(f"\n surrogate: CustomRegressor({est[0]})")
                    surr_obj = ns.CustomRegressor(obj=est[1]())
                else:
                    print(f"\n surrogate: {est[0]}")
                    surr_obj = est[1]()
                res = cross_val_optim(
                    X_train=X_train,
                    y_train=y_train,
                    X_test=X_test,
                    y_test=y_test,
                    model_type=model_type,
                    n_estimators=n_estimators,
                    surrogate_obj=surr_obj,
                    cv=cv,
                    n_jobs=n_jobs,
                    type_fit=type_fit,
                    scoring=scoring,
                    n_init=n_init,
                    n_iter=n_iter,
                    abs_tol=abs_tol,
                    verbose=verbose,
                    seed=seed,
                )
                print(f"\n result: {res}")
                if customize == True:
                    results.append((f"CustomRegressor({est[0]})", res))
                else:
                    results.append((est[0], res))
            except:
                pass

    return results
````
Automated cross-validation function and hyperparameters' search using multiple surrogates

Parameters:

    X_train: array-like
        Training vectors, with samples in rows and features in columns.

    y_train: array-like
        Training target values.

    X_test: array-like
        Testing vectors, with samples in rows and features in columns.

    y_test: array-like
        Testing target values.

    model_type: str
        type of gradient boosting algorithm: 'xgboost', 'lightgbm', 'catboost', 'gradientboosting'

    type_fit: str
        "regression" or "classification"

    scoring: str
        scoring metric; see https://scikit-learn.org/stable/modules/model_evaluation.html#the-scoring-parameter-defining-model-evaluation-rules

    customize: boolean
        if True, the surrogate is transformed into a quasi-randomized network (default is False)

    n_estimators: int
        maximum number of trees that can be built (default is None; if None, the parameter is tuned)

    cv: int
        number of cross-validation folds

    n_jobs: int
        number of jobs for parallel execution

    n_init: int
        number of points in the initial setting, when `x_init` and `y_init` are not provided

    n_iter: int
        number of iterations of the minimization algorithm

    abs_tol: float
        tolerance for convergence of the optimizer (early stopping based on the acquisition function)

    verbose: int
        controls verbosity

    seed: int
        reproducibility seed
Examples:

```python
import os
import unifiedbooster as ub
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from time import time

print(f"\n ----- Running: {os.path.basename(__file__)}... ----- \n")

dataset = load_breast_cancer()
X, y = dataset.data, dataset.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

start = time()
res4 = ub.lazy_cross_val_optim(
    X_train,
    y_train,
    X_test=X_test,
    y_test=y_test,
    model_type="lightgbm",
    type_fit="classification",
    scoring="accuracy",
    n_estimators=100,
    cv=5,
    n_jobs=None,
    n_init=10,
    n_iter=190,
    abs_tol=1e-3,
    seed=123,
    customize=False
)
print(f"Elapsed: {time()-start}")
print(res4)
```
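`lazy_cross_val_optim` returns a list of `(surrogate_name, result)` pairs, and each result carries the out-of-sample score in a `test_<scoring>` field when test data are provided (see the source above). A brief sketch of ranking the surrogates, continuing the example (the `test_accuracy` and `best_params` field names are inferred from the source and `scoring="accuracy"`):

```python
# Rank surrogates by held-out accuracy (highest first) and inspect the top ones.
ranked = sorted(res4, key=lambda pair: pair[1].test_accuracy, reverse=True)
for name, res in ranked[:5]:
    print(name, res.test_accuracy, res.best_params)
```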