## Loading required package: Matrix
## Loaded glmnet 4.1-7
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
Examples for regression, time series, and classification.
Linear model
# 'X' contains the explanatory variables
# 'y' is the response
# 'k' is the number of folds in k-fold cross-validation
# 'repeats' is the number of repeats of the k-fold cross-validation procedure
# linear model example -----
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 3, show_progress = FALSE)
## $folds
## repeat_1 repeat_2 repeat_3
## fold_1 1.0657917 0.7188426 1.0338031
## fold_2 1.0253419 0.6820600 0.7632532
## fold_3 0.8726629 1.0294080 0.7830872
## fold_4 0.8899336 1.2441376 0.9776853
## fold_5 0.9724975 1.0578021 1.0921075
##
## $mean
## [1] 0.9472276
##
## $sd
## [1] 0.15799
##
## $median
## [1] 0.9776853
##
## $summary
## repeat_1 repeat_2 repeat_3
## Min. :0.8727 Min. :0.6821 Min. :0.7633
## 1st Qu.:0.8899 1st Qu.:0.7188 1st Qu.:0.7831
## Median :0.9725 Median :1.0294 Median :0.9777
## Mean :0.9652 Mean :0.9465 Mean :0.9300
## 3rd Qu.:1.0253 3rd Qu.:1.0578 3rd Qu.:1.0338
## Max. :1.0658 Max. :1.2441 Max. :1.0921
# linear model example, with validation set
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 3, p = 0.8, show_progress = FALSE)
## $folds
## repeat_1 repeat_2 repeat_3
## fold_training_1 0.8589887 0.6618209 0.8028635
## fold_validation_1 1.0689051 1.1206828 1.1719136
## fold_training_2 0.7093379 1.0424256 0.9199860
## fold_validation_2 1.0827333 1.1363834 1.0731990
## fold_training_3 1.0489754 1.1053316 1.1602932
## fold_validation_3 1.2271907 1.1456095 1.0881138
## fold_training_4 0.9703262 0.9576335 0.8823505
## fold_validation_4 1.1098708 1.0933379 1.0563444
## fold_training_5 1.0689683 0.9692094 1.1509957
## fold_validation_5 1.0925497 1.0641650 1.1304265
##
## $mean_training
## [1] 0.9539671
##
## $mean_validation
## [1] 1.110762
##
## $sd_training
## [1] 0.1507651
##
## $sd_validation
## [1] 0.04619929
##
## $median_training
## [1] 0.9692094
##
## $median_validation
## [1] 1.093338
##
## $summary_training
## repeat_1 repeat_2 repeat_3
## Min. :0.7093 Min. :0.6618 Min. :0.8029
## 1st Qu.:0.8590 1st Qu.:0.9576 1st Qu.:0.8824
## Median :0.9703 Median :0.9692 Median :0.9200
## Mean :0.9313 Mean :0.9473 Mean :0.9833
## 3rd Qu.:1.0490 3rd Qu.:1.0424 3rd Qu.:1.1510
## Max. :1.0690 Max. :1.1053 Max. :1.1603
##
## $summary_validation
## repeat_1 repeat_2 repeat_3
## Min. :1.069 Min. :1.064 Min. :1.056
## 1st Qu.:1.083 1st Qu.:1.093 1st Qu.:1.073
## Median :1.093 Median :1.121 Median :1.088
## Mean :1.116 Mean :1.112 Mean :1.104
## 3rd Qu.:1.110 3rd Qu.:1.136 3rd Qu.:1.130
## Max. :1.227 Max. :1.146 Max. :1.172
glmnet
# glmnet example -----
# fit glmnet, with alpha = 0.5, lambda = 0.1
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 3, show_progress = FALSE,
fit_func = glmnet::glmnet, predict_func = predict.glmnet,
packages = c("glmnet", "Matrix"), fit_params = list(alpha = 0.5, lambda = 0.1))
## $folds
## repeat_1 repeat_2 repeat_3
## fold_1 1.0514770 0.7505046 1.0142623
## fold_2 1.0195407 0.6996108 0.7214830
## fold_3 0.8858994 1.0033483 0.7838630
## fold_4 0.8101031 1.2110573 0.9747158
## fold_5 0.9562564 1.0257926 1.0952526
##
## $mean
## [1] 0.9335445
##
## $sd
## [1] 0.1511116
##
## $median
## [1] 0.9747158
##
## $summary
## repeat_1 repeat_2 repeat_3
## Min. :0.8101 Min. :0.6996 Min. :0.7215
## 1st Qu.:0.8859 1st Qu.:0.7505 1st Qu.:0.7839
## Median :0.9563 Median :1.0033 Median :0.9747
## Mean :0.9447 Mean :0.9381 Mean :0.9179
## 3rd Qu.:1.0195 3rd Qu.:1.0258 3rd Qu.:1.0143
## Max. :1.0515 Max. :1.2111 Max. :1.0953
# fit glmnet, with alpha = 0, lambda = 0.01
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 3, show_progress = FALSE,
fit_func = glmnet::glmnet, predict_func = predict.glmnet,
packages = c("glmnet", "Matrix"), fit_params = list(alpha = 0, lambda = 0.01))
## $folds
## repeat_1 repeat_2 repeat_3
## fold_1 1.0605785 0.7192835 1.0253503
## fold_2 1.0266807 0.6757645 0.7594241
## fold_3 0.8895544 1.0344944 0.7733901
## fold_4 0.8938092 1.2470820 1.0042933
## fold_5 0.9641163 1.0703400 1.0952784
##
## $mean
## [1] 0.949296
##
## $sd
## [1] 0.160505
##
## $median
## [1] 1.004293
##
## $summary
## repeat_1 repeat_2 repeat_3
## Min. :0.8896 Min. :0.6758 Min. :0.7594
## 1st Qu.:0.8938 1st Qu.:0.7193 1st Qu.:0.7734
## Median :0.9641 Median :1.0345 Median :1.0043
## Mean :0.9669 Mean :0.9494 Mean :0.9315
## 3rd Qu.:1.0267 3rd Qu.:1.0703 3rd Qu.:1.0254
## Max. :1.0606 Max. :1.2471 Max. :1.0953
# fit glmnet, with alpha = 0, lambda = 0.01, with validation set
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 2, p = 0.8,
show_progress = FALSE,
fit_func = glmnet::glmnet, predict_func = predict.glmnet,
packages = c("glmnet", "Matrix"), fit_params = list(alpha = 0, lambda = 0.01))
## $folds
## repeat_1 repeat_2
## fold_training_1 0.8581977 0.6617929
## fold_validation_1 1.0648167 1.1204975
## fold_training_2 0.7000929 1.0383779
## fold_validation_2 1.0683314 1.1451865
## fold_training_3 1.0515575 1.1083551
## fold_validation_3 1.1922896 1.1370166
## fold_training_4 0.9847907 0.9510162
## fold_validation_4 1.0888078 1.0477593
## fold_training_5 1.0600201 0.9665769
## fold_validation_5 1.0568172 1.0571621
##
## $mean_training
## [1] 0.9380778
##
## $mean_validation
## [1] 1.097868
##
## $sd_training
## [1] 0.1525031
##
## $sd_validation
## [1] 0.04841842
##
## $median_training
## [1] 0.9756838
##
## $median_validation
## [1] 1.07857
##
## $summary_training
## repeat_1 repeat_2
## Min. :0.7001 Min. :0.6618
## 1st Qu.:0.8582 1st Qu.:0.9510
## Median :0.9848 Median :0.9666
## Mean :0.9309 Mean :0.9452
## 3rd Qu.:1.0516 3rd Qu.:1.0384
## Max. :1.0600 Max. :1.1084
##
## $summary_validation
## repeat_1 repeat_2
## Min. :1.057 Min. :1.048
## 1st Qu.:1.065 1st Qu.:1.057
## Median :1.068 Median :1.120
## Mean :1.094 Mean :1.102
## 3rd Qu.:1.089 3rd Qu.:1.137
## Max. :1.192 Max. :1.145
Random Forest
# randomForest example -----
# fit randomForest with mtry = 2
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 3,
show_progress = FALSE,
fit_func = randomForest::randomForest, predict_func = predict,
packages = "randomForest", fit_params = list(mtry = 2))
## $folds
## repeat_1 repeat_2 repeat_3
## fold_1 1.0932266 0.8306226 1.0178042
## fold_2 1.0016687 0.8546975 0.8257244
## fold_3 0.8960080 1.1016916 0.7720573
## fold_4 0.7837822 1.2375085 1.0627761
## fold_5 0.9827901 1.0636167 1.1402827
##
## $mean
## [1] 0.9776172
##
## $sd
## [1] 0.1429053
##
## $median
## [1] 1.001669
##
## $summary
## repeat_1 repeat_2 repeat_3
## Min. :0.7838 Min. :0.8306 Min. :0.7721
## 1st Qu.:0.8960 1st Qu.:0.8547 1st Qu.:0.8257
## Median :0.9828 Median :1.0636 Median :1.0178
## Mean :0.9515 Mean :1.0176 Mean :0.9637
## 3rd Qu.:1.0017 3rd Qu.:1.1017 3rd Qu.:1.0628
## Max. :1.0932 Max. :1.2375 Max. :1.1403
# fit randomForest with mtry = 4
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 3,
show_progress = FALSE,
fit_func = randomForest::randomForest, predict_func = predict,
packages = "randomForest", fit_params = list(mtry = 4))
## $folds
## repeat_1 repeat_2 repeat_3
## fold_1 1.0938938 0.8294432 1.0274141
## fold_2 0.9948985 0.8837450 0.8508324
## fold_3 0.8830305 1.1455328 0.7690735
## fold_4 0.8326955 1.2709843 1.0781753
## fold_5 0.9802129 1.0722765 1.1277285
##
## $mean
## [1] 0.9893291
##
## $sd
## [1] 0.1440225
##
## $median
## [1] 0.9948985
##
## $summary
## repeat_1 repeat_2 repeat_3
## Min. :0.8327 Min. :0.8294 Min. :0.7691
## 1st Qu.:0.8830 1st Qu.:0.8837 1st Qu.:0.8508
## Median :0.9802 Median :1.0723 Median :1.0274
## Mean :0.9569 Mean :1.0404 Mean :0.9706
## 3rd Qu.:0.9949 3rd Qu.:1.1455 3rd Qu.:1.0782
## Max. :1.0939 Max. :1.2710 Max. :1.1277
# fit randomForest with mtry = 4, with validation set
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 2, p = 0.8,
show_progress = FALSE,
fit_func = randomForest::randomForest, predict_func = predict,
packages = "randomForest", fit_params = list(mtry = 4))
## $folds
## repeat_1 repeat_2
## fold_training_1 0.8159825 0.7779928
## fold_validation_1 0.6071914 0.7095150
## fold_training_2 0.8055798 0.9620625
## fold_validation_2 0.6124228 0.6857506
## fold_training_3 1.0334201 1.1550091
## fold_validation_3 0.9720091 0.9593153
## fold_training_4 0.9259389 1.0479121
## fold_validation_4 0.6447842 0.6833361
## fold_training_5 0.9795790 0.9959987
## fold_validation_5 0.8065167 0.6781677
##
## $mean_training
## [1] 0.9499476
##
## $mean_validation
## [1] 0.7359009
##
## $sd_training
## [1] 0.1205054
##
## $sd_validation
## [1] 0.1333626
##
## $median_training
## [1] 0.9708207
##
## $median_validation
## [1] 0.6845434
##
## $summary_training
## repeat_1 repeat_2
## Min. :0.8056 Min. :0.7780
## 1st Qu.:0.8160 1st Qu.:0.9621
## Median :0.9259 Median :0.9960
## Mean :0.9121 Mean :0.9878
## 3rd Qu.:0.9796 3rd Qu.:1.0479
## Max. :1.0334 Max. :1.1550
##
## $summary_validation
## repeat_1 repeat_2
## Min. :0.6072 Min. :0.6782
## 1st Qu.:0.6124 1st Qu.:0.6833
## Median :0.6448 Median :0.6858
## Mean :0.7286 Mean :0.7432
## 3rd Qu.:0.8065 3rd Qu.:0.7095
## Max. :0.9720 Max. :0.9593
xgboost
# xgboost example -----
# crossval_ml hands the covariates and the response to `fit_func` as `x` and
# `y`, whereas xgboost::xgboost names them `data` and `label`.
# This thin adapter maps one naming onto the other; any further arguments
# (e.g. nrounds, verbose) are forwarded untouched through `...`.
f_xgboost <- function(x, y, ...) {
  xgboost::xgboost(data = x, label = y, ...)
}
# fit xgboost with nrounds = 5
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 3,
show_progress = FALSE,
fit_func = f_xgboost, predict_func = predict,
packages = "xgboost", fit_params = list(nrounds = 5,
verbose = FALSE))
## $folds
## repeat_1 repeat_2 repeat_3
## fold_1 1.0618781 0.7934194 1.1900606
## fold_2 0.9843552 0.8976441 0.7811620
## fold_3 1.0453996 1.2422247 0.7457622
## fold_4 0.7264478 1.3522984 0.9835989
## fold_5 1.0235773 1.0813265 1.1978533
##
## $mean
## [1] 1.007134
##
## $sd
## [1] 0.1911201
##
## $median
## [1] 1.023577
##
## $summary
## repeat_1 repeat_2 repeat_3
## Min. :0.7264 Min. :0.7934 Min. :0.7458
## 1st Qu.:0.9844 1st Qu.:0.8976 1st Qu.:0.7812
## Median :1.0236 Median :1.0813 Median :0.9836
## Mean :0.9683 Mean :1.0734 Mean :0.9797
## 3rd Qu.:1.0454 3rd Qu.:1.2422 3rd Qu.:1.1901
## Max. :1.0619 Max. :1.3523 Max. :1.1979
# fit xgboost with nrounds = 10
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 3,
show_progress = FALSE,
fit_func = f_xgboost, predict_func = predict,
packages = "xgboost", fit_params = list(nrounds = 10,
verbose = FALSE))
## $folds
## repeat_1 repeat_2 repeat_3
## fold_1 1.0658784 0.7836507 1.2662420
## fold_2 0.9968874 0.9484756 0.7958286
## fold_3 1.0444959 1.2541543 0.7130956
## fold_4 0.8412503 1.3478651 1.1406648
## fold_5 1.0178454 1.1320386 1.2381301
##
## $mean
## [1] 1.0391
##
## $sd
## [1] 0.1950815
##
## $median
## [1] 1.044496
##
## $summary
## repeat_1 repeat_2 repeat_3
## Min. :0.8413 Min. :0.7837 Min. :0.7131
## 1st Qu.:0.9969 1st Qu.:0.9485 1st Qu.:0.7958
## Median :1.0178 Median :1.1320 Median :1.1407
## Mean :0.9933 Mean :1.0932 Mean :1.0308
## 3rd Qu.:1.0445 3rd Qu.:1.2542 3rd Qu.:1.2381
## Max. :1.0659 Max. :1.3479 Max. :1.2662
# fit xgboost with nrounds = 10, with validation set
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 2, p = 0.8,
show_progress = FALSE,
fit_func = f_xgboost, predict_func = predict,
packages = "xgboost", fit_params = list(nrounds = 10,
verbose = FALSE))
## $folds
## repeat_1 repeat_2
## fold_training_1 0.7448843 0.8539725
## fold_validation_1 0.3889456 0.7549903
## fold_training_2 0.7627843 1.1084452
## fold_validation_2 0.5865502 0.6016042
## fold_training_3 1.1758940 1.1011674
## fold_validation_3 1.0291986 0.8007670
## fold_training_4 1.0463162 1.1637320
## fold_validation_4 0.8800788 0.5981181
## fold_training_5 0.9523677 1.0585674
## fold_validation_5 0.6689611 0.6313575
##
## $mean_training
## [1] 0.9968131
##
## $mean_validation
## [1] 0.6940571
##
## $sd_training
## [1] 0.1599181
##
## $sd_validation
## [1] 0.1791918
##
## $median_training
## [1] 1.052442
##
## $median_validation
## [1] 0.6501593
##
## $summary_training
## repeat_1 repeat_2
## Min. :0.7449 Min. :0.854
## 1st Qu.:0.7628 1st Qu.:1.059
## Median :0.9524 Median :1.101
## Mean :0.9364 Mean :1.057
## 3rd Qu.:1.0463 3rd Qu.:1.108
## Max. :1.1759 Max. :1.164
##
## $summary_validation
## repeat_1 repeat_2
## Min. :0.3889 Min. :0.5981
## 1st Qu.:0.5866 1st Qu.:0.6016
## Median :0.6690 Median :0.6314
## Mean :0.7107 Mean :0.6774
## 3rd Qu.:0.8801 3rd Qu.:0.7550
## Max. :1.0292 Max. :0.8008
Theta method (time series)
res <- crossvalidation::crossval_ts(y=AirPassengers, initial_window = 10, fcast_func = thetaf, show_progress = FALSE)
print(colMeans(res))
## ME RMSE MAE MPE MAPE
## 2.657082195 51.427170382 46.511874693 0.003423843 0.155428590
# Input data
# Transforming model response into a factor
y <- as.factor(as.numeric(iris$Species))
# Explanatory variables
X <- as.matrix(iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")])
# 5-fold cross-validation repeated 3 times
# default error metric, when y is a factor: accuracy
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 3,
fit_func = randomForest::randomForest,
predict_func = predict,
fit_params = list(mtry = 2),
packages = "randomForest",
show_progress = FALSE)
## $folds
## repeat_1 repeat_2 repeat_3
## fold_1 0.9666667 0.9666667 1.0000000
## fold_2 0.9666667 0.9000000 0.9333333
## fold_3 1.0000000 0.9666667 0.9333333
## fold_4 0.9333333 1.0000000 0.9333333
## fold_5 0.9333333 0.9333333 0.9666667
##
## $mean
## [1] 0.9555556
##
## $sd
## [1] 0.02999118
##
## $median
## [1] 0.9666667
##
## $summary
## repeat_1 repeat_2 repeat_3
## Min. :0.9333 Min. :0.9000 Min. :0.9333
## 1st Qu.:0.9333 1st Qu.:0.9333 1st Qu.:0.9333
## Median :0.9667 Median :0.9667 Median :0.9333
## Mean :0.9600 Mean :0.9533 Mean :0.9533
## 3rd Qu.:0.9667 3rd Qu.:0.9667 3rd Qu.:0.9667
## Max. :1.0000 Max. :1.0000 Max. :1.0000
# We can specify custom error metrics for crossvalidation::crossval_ml
# here, the error rate
# Misclassification (error) rate between predicted and observed labels.
#
# preds:  vector or factor of predicted labels
# actual: vector or factor of observed labels, same length as `preds`
#
# Returns a length-1 numeric named "error rate": one minus the proportion of
# positions where `preds` equals `actual`. Stops if the lengths differ.
eval_metric <- function(preds, actual) {
  stopifnot(length(preds) == length(actual))
  # `==` on two factors whose level sets differ raises an error, so when both
  # inputs are factors compare their labels as character strings instead.
  # For factors sharing the same levels the result is unchanged.
  if (is.factor(preds) && is.factor(actual)) {
    preds <- as.character(preds)
    actual <- as.character(actual)
  }
  res <- 1 - mean(preds == actual)
  names(res) <- "error rate"
  res
}
# specify `eval_metric` argument for measuring the error rate
# instead of the (default) accuracy
crossvalidation::crossval_ml(x = X, y = y, k = 5, repeats = 3,
fit_func = randomForest::randomForest,
predict_func = predict,
fit_params = list(mtry = 2),
packages = "randomForest",
eval_metric=eval_metric,
show_progress = FALSE)
## $folds
## repeat_1 repeat_2 repeat_3
## fold_1 0.03333333 0.03333333 0.00000000
## fold_2 0.03333333 0.10000000 0.06666667
## fold_3 0.00000000 0.03333333 0.06666667
## fold_4 0.06666667 0.00000000 0.06666667
## fold_5 0.06666667 0.06666667 0.03333333
##
## $mean
## [1] 0.04444444
##
## $sd
## [1] 0.02999118
##
## $median
## [1] 0.03333333
##
## $summary
## repeat_1 repeat_2 repeat_3
## Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.03333 1st Qu.:0.03333 1st Qu.:0.03333
## Median :0.03333 Median :0.03333 Median :0.06667
## Mean :0.04000 Mean :0.04667 Mean :0.04667
## 3rd Qu.:0.06667 3rd Qu.:0.06667 3rd Qu.:0.06667
## Max. :0.06667 Max. :0.10000 Max. :0.06667