ADA - Predictive Analysis
24 December 2021
## Rows: 1,338
## Columns: 7
## $ age <int> 19, 18, 28, 33, 32, 31, 46, 37, 37, 60, 25, 62, 23, 56, 27, 1~
## $ sex <chr> "female", "male", "male", "male", "male", "female", "female",~
## $ bmi <dbl> 27.900, 33.770, 33.000, 22.705, 28.880, 25.740, 33.440, 27.74~
## $ children <int> 0, 1, 3, 0, 0, 0, 1, 3, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0~
## $ smoker <chr> "yes", "no", "no", "no", "no", "no", "no", "no", "no", "no", ~
## $ region <chr> "southwest", "southeast", "southeast", "northwest", "northwes~
## $ charges <dbl> 16884.924, 1725.552, 4449.462, 21984.471, 3866.855, 3756.622,~
## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -6425. 1744. -3.68 2.39e- 4
## 2 age 242. 22.3 10.8 2.42e-26
## 3 bmi 333. 51.4 6.48 1.28e-10
# Reproducible 75/25 train/test split of the insurance data.
set.seed(123)
ins_split <- initial_split(ins, prop = 0.75)
# Materialize the two partitions from the split object.
ins_train <- training(ins_split)
ins_test <- testing(ins_split)
## # A tibble: 7 x 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 age numeric predictor original
## 2 sex nominal predictor original
## 3 bmi numeric predictor original
## 4 children numeric predictor original
## 5 smoker nominal predictor original
## 6 region nominal predictor original
## 7 charges numeric outcome original
# NOTE(review): fragment — the head of this recipe pipeline (presumably
# `ins_rec <- recipe(charges ~ ., data = ins_train) %>%`) was lost in
# extraction. This final step one-hot encodes every nominal predictor.
step_dummy(all_nominal_predictors())
## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 6
##
## Operations:
##
## Dummy variables from all_nominal_predictors()
# Bundle the model spec (lm_mod2) and preprocessing recipe (ins_rec)
# into a single workflow object, then print it.
ins_workflow <- add_recipe(add_model(workflow(), lm_mod2), ins_rec)
ins_workflow
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: linear_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 1 Recipe Step
##
## * step_dummy()
##
## -- Model -----------------------------------------------------------------------
## Linear Regression Model Specification (regression)
##
## Computational engine: lm
# Pull the underlying parsnip fit out of the fitted workflow and
# show the coefficient table.
tidy(extract_fit_parsnip(lm_fit2))
## # A tibble: 9 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -12235. 1167. -10.5 1.78e- 24
## 2 age 244. 14.2 17.2 3.81e- 58
## 3 bmi 363. 33.7 10.8 1.06e- 25
## 4 children 608. 161. 3.77 1.74e- 4
## 5 sex_male -296. 393. -0.752 4.52e- 1
## 6 smoker_yes 24003. 478. 50.2 9.28e-275
## 7 region_northwest -766. 557. -1.37 1.70e- 1
## 8 region_southeast -898. 563. -1.59 1.11e- 1
## 9 region_southwest -1033. 562. -1.84 6.61e- 2
# Predicted charges for the held-out test set (name the argument for clarity).
predict(lm_fit2, new_data = ins_test)
## # A tibble: 335 x 1
## .pred
## <dbl>
## 1 25495.
## 2 2991.
## 3 10829.
## 4 35530.
## 5 14975.
## 6 32450.
## 7 15463.
## 8 6415.
## 9 11505.
## 10 4785.
## # ... with 325 more rows
# 5-fold cross-validation resamples of the training set; seeded so the
# fold assignment is reproducible.
set.seed(345)
ins_folds <- vfold_cv(ins_train, v = 5)
ins_folds
## # 5-fold cross-validation
## # A tibble: 5 x 2
## splits id
## <list> <chr>
## 1 <split [802/201]> Fold1
## 2 <split [802/201]> Fold2
## 3 <split [802/201]> Fold3
## 4 <split [803/200]> Fold4
## 5 <split [803/200]> Fold5
# Fit the workflow on each of the 5 CV folds and aggregate the
# out-of-fold metrics (rmse, rsq) across folds.
set.seed(456)
lm_fit_rs <- fit_resamples(ins_workflow, resamples = ins_folds)
collect_metrics(lm_fit_rs)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 6190. 5 285. Preprocessor1
## 2 rsq standard 0.746 5 0.00476 Preprocessor1
# German Credit data (UCI Statlog): 1,000 loans, 20 attributes plus a
# binary response (1 = good, 2 = bad). Use HTTPS — the original http URL
# downloads over an unencrypted channel.
# NOTE(review): `gc` shadows base::gc(); consider renaming (kept here
# because later chunks reference `gc`).
gc <- read.table("https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data") %>%
  # mutate_if() is superseded in dplyr; across(where()) is the current idiom.
  mutate(across(where(is.character), as.factor))
colnames(gc) <- c("chk_acct", "duration", "credit_his", "purpose", "amount",
                  "saving_acct", "present_emp", "installment_rate", "sex",
                  "other_debtor", "present_resid", "property", "age",
                  "other_install", "housing", "n_credits", "job", "n_people",
                  "telephone", "foreign", "response")
# The response column is read as integer; classification needs a factor.
gc$response <- factor(gc$response)
glimpse(gc)
## Rows: 1,000
## Columns: 21
## $ chk_acct <fct> A11, A12, A14, A11, A11, A14, A14, A12, A14, A12, A12~
## $ duration <int> 6, 48, 12, 42, 24, 36, 24, 36, 12, 30, 12, 48, 12, 24~
## $ credit_his <fct> A34, A32, A34, A32, A33, A32, A32, A32, A32, A34, A32~
## $ purpose <fct> A43, A43, A46, A42, A40, A46, A42, A41, A43, A40, A40~
## $ amount <int> 1169, 5951, 2096, 7882, 4870, 9055, 2835, 6948, 3059,~
## $ saving_acct <fct> A65, A61, A61, A61, A61, A65, A63, A61, A64, A61, A61~
## $ present_emp <fct> A75, A73, A74, A74, A73, A73, A75, A73, A74, A71, A72~
## $ installment_rate <int> 4, 2, 2, 2, 3, 2, 3, 2, 2, 4, 3, 3, 1, 4, 2, 4, 4, 2,~
## $ sex <fct> A93, A92, A93, A93, A93, A93, A93, A93, A91, A94, A92~
## $ other_debtor <fct> A101, A101, A101, A103, A101, A101, A101, A101, A101,~
## $ present_resid <int> 4, 2, 3, 4, 4, 4, 4, 2, 4, 2, 1, 4, 1, 4, 4, 2, 4, 3,~
## $ property <fct> A121, A121, A121, A122, A124, A124, A122, A123, A121,~
## $ age <int> 67, 22, 49, 45, 53, 35, 53, 35, 61, 28, 25, 24, 22, 6~
## $ other_install <fct> A143, A143, A143, A143, A143, A143, A143, A143, A143,~
## $ housing <fct> A152, A152, A152, A153, A153, A153, A152, A151, A152,~
## $ n_credits <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 3,~
## $ job <fct> A173, A173, A172, A173, A173, A172, A173, A174, A172,~
## $ n_people <int> 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ telephone <fct> A192, A191, A191, A191, A191, A192, A191, A192, A191,~
## $ foreign <fct> A201, A201, A201, A201, A201, A201, A201, A201, A201,~
## $ response <fct> 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1,~
## response n prop
## 1 1 700 0.7
## 2 2 300 0.3
## response n prop
## 1 1 525 0.7
## 2 2 225 0.3
## response n prop
## 1 1 175 0.7
## 2 2 75 0.3
Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 20 / 34
Validation Sets
# Single validation split (80% analysis / 20% assessment) of the
# training data, stratified on the response so both parts keep the
# 70/30 class balance. Seeded for reproducibility.
set.seed(234)
gc_fold <- validation_split(gc_train, strata = response, prop = 0.8)
gc_fold
# Lasso logistic regression (mixture = 1) with the penalty left to be tuned.
lr_mod <-
  logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")
# Preprocessing: dummy-encode nominal predictors, drop zero-variance
# columns, and normalize (glmnet is sensitive to predictor scale).
# all_nominal_predictors() replaces the older, equivalent
# `all_nominal(), -all_outcomes()` — consistent with the earlier recipe.
lr_recipe <-
  recipe(response ~ ., data = gc_train) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors())
# Bundle model and recipe into one tunable workflow.
lr_workflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(lr_recipe)
## Selecting by penalty
## # A tibble: 3 x 1
## penalty
## <dbl>
## 1 0.0001
## 2 0.000112
## 3 0.000126
# Three largest penalty values in the grid. top_n() is superseded in
# dplyr; slice_max() is the current idiom (and avoids the implicit
# "Selecting by penalty" message). arrange(penalty) restores the
# ascending grid order shown in the original output.
lr_reg_grid %>%
  slice_max(penalty, n = 3) %>%
  arrange(penalty)
## Selecting by penalty
## # A tibble: 3 x 1
## penalty
## <dbl>
## 1 7.92
## 2 8.90
## 3 10
Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 25 / 34
Tuning Penalty
# Tune the lasso penalty over the grid on the validation split,
# keeping the per-candidate predictions and scoring by ROC AUC.
lr_res <- tune_grid(
  lr_workflow,
  resamples = gc_fold,
  grid = lr_reg_grid,
  control = control_grid(save_pred = TRUE),
  metrics = metric_set(roc_auc)
)
# Validation AUC as a function of the (log-scaled) penalty.
lr_plot <- collect_metrics(lr_res) %>%
  ggplot(aes(penalty, mean)) +
  geom_point() +
  geom_line() +
  labs(y = "Area under the ROC Curve") +
  scale_x_log10(labels = scales::label_number())
0.7
Area under the ROC Curve
0.6
0.5
# The 15 best candidate penalties by validation-set ROC AUC, ordered
# from least to most regularized. The `metric` argument of show_best()
# must be named (positional use is deprecated in tune >= 1.0).
top_models <-
  lr_res %>%
  show_best(metric = "roc_auc", n = 15) %>%
  arrange(penalty)
top_models
## # A tibble: 15 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00129 roc_auc binary 0.769 1 NA Preprocessor1_Model023
## 2 0.00163 roc_auc binary 0.769 1 NA Preprocessor1_Model025
## 3 0.00183 roc_auc binary 0.769 1 NA Preprocessor1_Model026
## 4 0.00206 roc_auc binary 0.771 1 NA Preprocessor1_Model027
## 5 0.00231 roc_auc binary 0.771 1 NA Preprocessor1_Model028
## 6 0.00260 roc_auc binary 0.772 1 NA Preprocessor1_Model029
## 7 0.00292 roc_auc binary 0.773 1 NA Preprocessor1_Model030
## 8 0.00327 roc_auc binary 0.773 1 NA Preprocessor1_Model031
## 9 0.00368 roc_auc binary 0.771 1 NA Preprocessor1_Model032
## 10 0.00413 roc_auc binary 0.771 1 NA Preprocessor1_Model033
## 11 0.00464 roc_auc binary 0.770 1 NA Preprocessor1_Model034
## 12 0.00521 roc_auc binary 0.769 1 NA Preprocessor1_Model035
## 13 0.00586 roc_auc binary 0.770 1 NA Preprocessor1_Model036
## 14 0.00658 roc_auc binary 0.769 1 NA Preprocessor1_Model037
## 15 0.00739 roc_auc binary 0.770 1 NA Preprocessor1_Model038
## # A tibble: 1 x 2
## penalty .config
## <dbl> <chr>
## 1 0.00292 Preprocessor1_Model030
0.75
sensitivity
0.50
0.25
0.00
## Model Workflow
# Bundle the tunable lasso model and preprocessing recipe.
lr_workflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(lr_recipe)
## Creating Grid for Tuning
# 100 log-spaced penalty values from 1e-4 to 10.
# FIX: the extracted text used the Unicode modifier circumflex (U+02C6)
# instead of ASCII `^`, which is a syntax error in R.
lr_reg_grid <- tibble(penalty = 10^seq(-4, 1, length.out = 100))
## Tuning Penalty
# Evaluate every candidate penalty on the validation split by ROC AUC,
# saving per-candidate predictions.
lr_res <-
  lr_workflow %>%
  tune_grid(gc_fold,
            grid = lr_reg_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(roc_auc))
## Plotting AUC
# Validation AUC as a function of the (log-scaled) penalty.
lr_plot <-
  lr_res %>%
  collect_metrics() %>%
  ggplot(aes(x = penalty, y = mean)) +
  geom_point() +
  geom_line() +
  ylab("Area under the ROC Curve") +
  scale_x_log10(labels = scales::label_number())
lr_plot