ADA - Predictive Analysis

Download as pdf or txt
Download as pdf or txt
You are on page 1 of 34

Predictive Analysis

Advanced Data Analysis, AY 2021-22

24 December 2021

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 1 / 34


Insurance Dataset

# Load the insurance data from a CSV file in the working directory, then
# print a compact column-by-column overview of it.
ins <- read.csv(file = "insurance.csv")

ins %>% glimpse()

## Rows: 1,338
## Columns: 7
## $ age <int> 19, 18, 28, 33, 32, 31, 46, 37, 37, 60, 25, 62, 23, 56, 27, 1~
## $ sex <chr> "female", "male", "male", "male", "male", "female", "female",~
## $ bmi <dbl> 27.900, 33.770, 33.000, 22.705, 28.880, 25.740, 33.440, 27.74~
## $ children <int> 0, 1, 3, 0, 0, 0, 1, 3, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0~
## $ smoker <chr> "yes", "no", "no", "no", "no", "no", "no", "no", "no", "no", ~
## $ region <chr> "southwest", "southeast", "southeast", "northwest", "northwes~
## $ charges <dbl> 16884.924, 1725.552, 4449.462, 21984.471, 3866.855, 3756.622,~

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 2 / 34


Model Specification

# Linear regression specification fitted with the "lm" engine.
lm_mod <- set_engine(linear_reg(), "lm")
# Print the specification.
lm_mod

## Linear Regression Model Specification (regression)


##
## Computational engine: lm

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 3 / 34


Fitting the Linear Model

# Fit charges on age and bmi using the full insurance data, then show the
# coefficient table (estimate, std.error, statistic, p.value).
lm_fit <- fit(lm_mod, charges ~ age + bmi, data = ins)
lm_fit %>% tidy()

## # A tibble: 3 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -6425. 1744. -3.68 2.39e- 4
## 2 age 242. 22.3 10.8 2.42e-26
## 3 bmi 333. 51.4 6.48 1.28e-10

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 4 / 34


Prediction

# Two hypothetical customers to score with the age + bmi model.
new <- data.frame(
  age = c(21, 60),
  bmi = c(30.5, 24.6)
)
# Point predictions (.pred) and confidence-interval bounds
# (.pred_lower / .pred_upper) for the new rows.
pred <- lm_fit %>% predict(new_data = new)
ci_pred <- lm_fit %>% predict(new_data = new, type = "conf_int")
# Show inputs, predictions and interval bounds side by side.
bind_cols(new, pred, ci_pred)

## age bmi .pred .pred_lower .pred_upper


## 1 21 30.5 8811.177 7808.849 9813.505
## 2 60 24.6 16281.983 14980.008 17583.958

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 5 / 34


Splitting Data for Training & Testing

# Fix the RNG seed so the random split is reproducible.
set.seed(123)
# Hold out one quarter of the rows for testing; train on the rest.
ins_split <- ins %>% initial_split(prop = 3/4)
ins_train <- ins_split %>% training()
ins_test <- ins_split %>% testing()

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 6 / 34


Initiating Recipe

# Start a recipe on the training data: charges is the outcome and every
# other column is a predictor.
ins_rec <- recipe(charges ~ ., data = ins_train)
# List each variable with its type and role.
ins_rec %>% summary()

## # A tibble: 7 x 4
## variable type role source
## <chr> <chr> <chr> <chr>
## 1 age numeric predictor original
## 2 sex nominal predictor original
## 3 bmi numeric predictor original
## 4 children numeric predictor original
## 5 smoker nominal predictor original
## 6 region nominal predictor original
## 7 charges numeric outcome original

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 7 / 34


Preprocessing Steps

Create dummy variables for all nominal predictors (the outcome is not affected).

# Recipe step that dummy-encodes every nominal predictor. It is shown here in
# isolation; on the next slide it is piped onto a recipe() object.
step_dummy(all_nominal_predictors())

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 8 / 34


Preprocessing Steps

# Recipe on the training data with one preprocessing step: dummy-encode the
# nominal predictors (sex, smoker, region).
ins_rec <- step_dummy(
  recipe(charges ~ ., data = ins_train),
  all_nominal_predictors()
)
# Print the recipe summary.
ins_rec

## Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 6
##
## Operations:
##
## Dummy variables from all_nominal_predictors()

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 9 / 34


Model Specification

# Linear regression specification ("lm" engine) used inside the workflow.
lm_mod2 <- set_engine(linear_reg(), "lm")

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 10 / 34


Model Workflow

# Bundle the model specification and the preprocessing recipe into one
# workflow so they are fitted and applied together.
ins_workflow <- add_recipe(
  add_model(workflow(), lm_mod2),
  ins_rec
)
# Print the workflow summary.
ins_workflow

## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: linear_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 1 Recipe Step
##
## * step_dummy()
##
## -- Model -----------------------------------------------------------------------
## Linear Regression Model Specification (regression)
##
## Computational engine: lm

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 11 / 34


Train the Model
# Fit the workflow (recipe + model) on the training set.
lm_fit2 <- fit(ins_workflow, data = ins_train)

# Extract the fitted parsnip model from the workflow and show its
# coefficient table.
tidy(extract_fit_parsnip(lm_fit2))

## # A tibble: 9 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -12235. 1167. -10.5 1.78e- 24
## 2 age 244. 14.2 17.2 3.81e- 58
## 3 bmi 363. 33.7 10.8 1.06e- 25
## 4 children 608. 161. 3.77 1.74e- 4
## 5 sex_male -296. 393. -0.752 4.52e- 1
## 6 smoker_yes 24003. 478. 50.2 9.28e-275
## 7 region_northwest -766. 557. -1.37 1.70e- 1
## 8 region_southeast -898. 563. -1.59 1.11e- 1
## 9 region_southwest -1033. 562. -1.84 6.61e- 2

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 12 / 34


Prediction

# Score the held-out test rows with the fitted workflow.
lm_fit2 %>% predict(new_data = ins_test)

## # A tibble: 335 x 1
## .pred
## <dbl>
## 1 25495.
## 2 2991.
## 3 10829.
## 4 35530.
## 5 14975.
## 6 32450.
## 7 15463.
## 8 6415.
## 9 11505.
## 10 4785.
## # ... with 325 more rows

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 13 / 34


Full Code
# End-to-end insurance example: split the data, preprocess, fit a linear
# regression workflow, and predict on the test set.
# NOTE(review): no library() call appears in this listing; the functions used
# suggest tidymodels must be attached first - confirm.

# Load data ----
ins <- read.csv(file = "insurance.csv")

# Train / test split (seeded for reproducibility) ----
set.seed(123)
ins_split <- ins %>% initial_split(prop = 3/4)
ins_train <- ins_split %>% training()
ins_test <- ins_split %>% testing()

# Recipe: dummy-encode nominal predictors ----
ins_rec <- step_dummy(
  recipe(charges ~ ., data = ins_train),
  all_nominal_predictors()
)

# Model specification: linear regression via lm ----
lm_mod2 <- set_engine(linear_reg(), "lm")

# Workflow: model + recipe ----
ins_workflow <- add_recipe(
  add_model(workflow(), lm_mod2),
  ins_rec
)

# Fit on the training data ----
lm_fit2 <- fit(ins_workflow, data = ins_train)

# Coefficient table of the fitted model ----
tidy(extract_fit_parsnip(lm_fit2))

# Predictions for the test set ----
lm_fit2 %>% predict(new_data = ins_test)

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 14 / 34


Fit with Resampling

# Seed the RNG, then partition the training set into 5 cross-validation folds.
set.seed(345)
ins_folds <- ins_train %>% vfold_cv(v = 5)
# Print the fold table.
ins_folds

## # 5-fold cross-validation
## # A tibble: 5 x 2
## splits id
## <list> <chr>
## 1 <split [802/201]> Fold1
## 2 <split [802/201]> Fold2
## 3 <split [802/201]> Fold3
## 4 <split [803/200]> Fold4
## 5 <split [803/200]> Fold5

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 15 / 34


Fitting

# Fit the workflow on each of the cross-validation resamples.
set.seed(456)
lm_fit_rs <- fit_resamples(ins_workflow, resamples = ins_folds)

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 16 / 34


Fitting

# Summarise the performance metrics across the resamples.
lm_fit_rs %>% collect_metrics()

## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 6190. 5 285. Preprocessor1
## 2 rsq standard 0.746 5 0.00476 Preprocessor1

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 17 / 34


Classification Example: German Credit Dataset

# Read the German Credit data from the UCI repository and convert every
# character column to a factor.
# The URL is restored to the plain UCI address: the previous string carried a
# stray "fanyv88.com/" proxy prefix and a nested "http://", so it did not point
# at the UCI server.
gc <- read.table(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
) %>%
  mutate(across(where(is.character), as.factor))
# Assign descriptive names to the 21 columns.
colnames(gc) <- c(
  "chk_acct", "duration", "credit_his", "purpose", "amount",
  "saving_acct", "present_emp", "installment_rate", "sex",
  "other_debtor", "present_resid", "property", "age",
  "other_install", "housing", "n_credits", "job", "n_people",
  "telephone", "foreign", "response"
)
# Store the outcome as a factor so it is treated as a class label.
gc$response <- factor(gc$response)
glimpse(gc)

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 18 / 34


German Credit Dataset

## Rows: 1,000
## Columns: 21
## $ chk_acct <fct> A11, A12, A14, A11, A11, A14, A14, A12, A14, A12, A12~
## $ duration <int> 6, 48, 12, 42, 24, 36, 24, 36, 12, 30, 12, 48, 12, 24~
## $ credit_his <fct> A34, A32, A34, A32, A33, A32, A32, A32, A32, A34, A32~
## $ purpose <fct> A43, A43, A46, A42, A40, A46, A42, A41, A43, A40, A40~
## $ amount <int> 1169, 5951, 2096, 7882, 4870, 9055, 2835, 6948, 3059,~
## $ saving_acct <fct> A65, A61, A61, A61, A61, A65, A63, A61, A64, A61, A61~
## $ present_emp <fct> A75, A73, A74, A74, A73, A73, A75, A73, A74, A71, A72~
## $ installment_rate <int> 4, 2, 2, 2, 3, 2, 3, 2, 2, 4, 3, 3, 1, 4, 2, 4, 4, 2,~
## $ sex <fct> A93, A92, A93, A93, A93, A93, A93, A93, A91, A94, A92~
## $ other_debtor <fct> A101, A101, A101, A103, A101, A101, A101, A101, A101,~
## $ present_resid <int> 4, 2, 3, 4, 4, 4, 4, 2, 4, 2, 1, 4, 1, 4, 4, 2, 4, 3,~
## $ property <fct> A121, A121, A121, A122, A124, A124, A122, A123, A121,~
## $ age <int> 67, 22, 49, 45, 53, 35, 53, 35, 61, 28, 25, 24, 22, 6~
## $ other_install <fct> A143, A143, A143, A143, A143, A143, A143, A143, A143,~
## $ housing <fct> A152, A152, A152, A153, A153, A153, A152, A151, A152,~
## $ n_credits <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 3,~
## $ job <fct> A173, A173, A172, A173, A173, A172, A173, A174, A172,~
## $ n_people <int> 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ telephone <fct> A192, A191, A191, A191, A191, A192, A191, A192, A191,~
## $ foreign <fct> A201, A201, A201, A201, A201, A201, A201, A201, A201,~
## $ response <fct> 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 1,~

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 19 / 34


Splitting with Stratification
# Seeded train/test split, stratified on the outcome so both parts keep the
# same class proportions.
set.seed(123)
gc_splits <- gc %>% initial_split(strata = response)
gc_train <- gc_splits %>% training()
gc_test <- gc_splits %>% testing()
# Class counts and proportions in the full data.
mutate(count(gc, response), prop = n / sum(n))

## response n prop
## 1 1 700 0.7
## 2 2 300 0.3

# Class counts and proportions in the training set.
mutate(count(gc_train, response), prop = n / sum(n))

## response n prop
## 1 1 525 0.7
## 2 2 225 0.3

# Class counts and proportions in the test set.
mutate(count(gc_test, response), prop = n / sum(n))

## response n prop
## 1 1 175 0.7
## 2 2 75 0.3
Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 20 / 34
Validation Sets

# Carve a single stratified validation set out of the training data:
# 80% for fitting, 20% for validation.
set.seed(234)
gc_fold <- gc_train %>%
  validation_split(prop = 0.80, strata = response)
# Print the resulting split object.
gc_fold

## # Validation Set Split (0.8/0.2) using stratification


## # A tibble: 1 x 2
## splits id
## <list> <chr>
## 1 <split [600/150]> validation

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 21 / 34


Model Specification

# Penalised logistic regression fitted with glmnet. The penalty is left as a
# tuning placeholder and mixture is fixed at 1.
lr_mod <- set_engine(
  logistic_reg(penalty = tune(), mixture = 1),
  "glmnet"
)

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 22 / 34


Model Recipe

# Preprocessing for the penalised logistic regression:
#   1. dummy-encode the nominal predictors (the outcome is left untouched),
#   2. drop zero-variance predictors,
#   3. centre and scale all predictors.
# all_nominal_predictors() replaces `all_nominal(), -all_outcomes()` so this
# recipe uses the same selector as the insurance recipe earlier in the file.
lr_recipe <-
  recipe(response ~ ., data = gc_train) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors())

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 23 / 34


Model Workflow

# Combine the tunable logistic regression spec with its recipe.
lr_workflow <- add_recipe(
  add_model(workflow(), lr_mod),
  lr_recipe
)

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 24 / 34


Creating Grid for Tuning
# Candidate penalties: 100 values evenly spaced on the log10 scale between
# 1e-4 and 10. The exponent operator is the ASCII `^`; the previous text used
# the look-alike character U+02C6, which R cannot parse.
lr_reg_grid <- tibble(penalty = 10^seq(-4, 1, length.out = 100))
# Show the three smallest penalties.
lr_reg_grid %>% top_n(-3)

## Selecting by penalty

## # A tibble: 3 x 1
## penalty
## <dbl>
## 1 0.0001
## 2 0.000112
## 3 0.000126
# Show the three largest penalties.
top_n(lr_reg_grid, 3)

## Selecting by penalty

## # A tibble: 3 x 1
## penalty
## <dbl>
## 1 7.92
## 2 8.90
## 3 10
Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 25 / 34
Tuning Penalty

# Evaluate every candidate penalty on the validation split, scoring by ROC
# AUC. Predictions are saved so ROC curves can be drawn afterwards.
lr_res <- tune_grid(
  lr_workflow,
  resamples = gc_fold,
  grid = lr_reg_grid,
  metrics = metric_set(roc_auc),
  control = control_grid(save_pred = TRUE)
)

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 26 / 34


Plotting AUC

# Validation ROC AUC against the penalty, drawn on a log10 x-axis with plain
# (non-scientific) number labels.
lr_plot <- ggplot(
  collect_metrics(lr_res),
  aes(x = penalty, y = mean)
) +
  geom_point() +
  geom_line() +
  ylab("Area under the ROC Curve") +
  scale_x_log10(labels = scales::label_number())

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 27 / 34


Plotting AUC
# Render the AUC-versus-penalty plot.
print(lr_plot)

0.7
Area under the ROC Curve

0.6

0.5

0.001 0.100 10.000


penalty

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 28 / 34


Comparing Penalized Models

# The 15 best candidates by ROC AUC, re-ordered by increasing penalty.
# `metric` is passed by name: newer tune releases no longer accept it
# positionally, and the named form works on older releases too.
top_models <-
  lr_res %>%
  show_best(metric = "roc_auc", n = 15) %>%
  arrange(penalty)
top_models

## # A tibble: 15 x 7
## penalty .metric .estimator mean n std_err .config
## <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.00129 roc_auc binary 0.769 1 NA Preprocessor1_Model023
## 2 0.00163 roc_auc binary 0.769 1 NA Preprocessor1_Model025
## 3 0.00183 roc_auc binary 0.769 1 NA Preprocessor1_Model026
## 4 0.00206 roc_auc binary 0.771 1 NA Preprocessor1_Model027
## 5 0.00231 roc_auc binary 0.771 1 NA Preprocessor1_Model028
## 6 0.00260 roc_auc binary 0.772 1 NA Preprocessor1_Model029
## 7 0.00292 roc_auc binary 0.773 1 NA Preprocessor1_Model030
## 8 0.00327 roc_auc binary 0.773 1 NA Preprocessor1_Model031
## 9 0.00368 roc_auc binary 0.771 1 NA Preprocessor1_Model032
## 10 0.00413 roc_auc binary 0.771 1 NA Preprocessor1_Model033
## 11 0.00464 roc_auc binary 0.770 1 NA Preprocessor1_Model034
## 12 0.00521 roc_auc binary 0.769 1 NA Preprocessor1_Model035
## 13 0.00586 roc_auc binary 0.770 1 NA Preprocessor1_Model036
## 14 0.00658 roc_auc binary 0.769 1 NA Preprocessor1_Model037
## 15 0.00739 roc_auc binary 0.770 1 NA Preprocessor1_Model038

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 29 / 34


Choosing Best Penalty

# Pick the penalty with the highest validation ROC AUC.
# `metric` is passed by name: newer tune releases no longer accept it
# positionally, and the named form works on older releases too.
best_penalty <- lr_res %>%
  select_best(metric = "roc_auc")
best_penalty

## # A tibble: 1 x 2
## penalty .config
## <dbl> <chr>
## 1 0.00292 Preprocessor1_Model030

# ROC curve for the selected penalty: take the saved validation predictions
# for that penalty, compute the curve from the predicted probability of
# class 1, and label it with the model name.
lr_auc <- mutate(
  roc_curve(
    collect_predictions(lr_res, parameters = best_penalty),
    response, .pred_1
  ),
  model = "Logistic Regression"
)
# Plot the curve.
lr_auc %>% autoplot()

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 30 / 34


ROC Curve
1.00

0.75
sensitivity

0.50

0.25

0.00

0.00 0.25 0.50 0.75 1.00


1 − specificity

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 31 / 34


Full Code
## Reading Dataset
# The URL is restored to the plain UCI address: the previous string carried a
# stray "fanyv88.com/" proxy prefix and a nested "http://", so it did not point
# at the UCI server.
gc <- read.table(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
) %>%
  # Convert every character column to a factor.
  mutate(across(where(is.character), as.factor))
# Assign descriptive names to the 21 columns.
colnames(gc) <- c(
  "chk_acct", "duration", "credit_his", "purpose", "amount",
  "saving_acct", "present_emp", "installment_rate", "sex",
  "other_debtor", "present_resid", "property", "age",
  "other_install", "housing", "n_credits", "job", "n_people",
  "telephone", "foreign", "response"
)
# Store the outcome as a factor so it is treated as a class label.
gc$response <- factor(gc$response)
glimpse(gc)
## Splitting with Stratification
# Seeded split that keeps the class proportions of `response` in both parts.
set.seed(123)
gc_splits <- initial_split(gc, strata = response)
gc_train <- training(gc_splits)
gc_test <- testing(gc_splits)
## Validation Sets
# Single stratified validation split of the training data (80% / 20%).
set.seed(234)
gc_fold <- validation_split(gc_train,
  strata = response,
  prop = 0.80
)
## Model Specification
# glmnet logistic regression; the penalty is tuned and mixture is fixed at 1.
lr_mod <-
  logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")
## Model Recipe
# Dummy-encode nominal predictors, drop zero-variance predictors, then
# normalise. all_nominal_predictors() matches the selector used in the
# insurance recipe earlier in the file.
lr_recipe <-
  recipe(response ~ ., data = gc_train) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors())

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 32 / 34


Full Code

## Model Workflow
# Combine the tunable model spec with its preprocessing recipe.
lr_workflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(lr_recipe)
## Creating Grid for Tuning
# 100 penalties evenly spaced on the log10 scale between 1e-4 and 10.
# The exponent operator is the ASCII `^`; the previous text used the
# look-alike character U+02C6, which R cannot parse.
lr_reg_grid <- tibble(penalty = 10^seq(-4, 1, length.out = 100))
## Tuning Penalty
# Score every candidate on the validation split by ROC AUC, keeping the
# predictions for later ROC curves.
lr_res <-
  lr_workflow %>%
  tune_grid(gc_fold,
    grid = lr_reg_grid,
    control = control_grid(save_pred = TRUE),
    metrics = metric_set(roc_auc)
  )
## Plotting AUC
# Validation AUC against the penalty on a log10 x-axis.
lr_plot <-
  lr_res %>%
  collect_metrics() %>%
  ggplot(aes(x = penalty, y = mean)) +
  geom_point() +
  geom_line() +
  ylab("Area under the ROC Curve") +
  scale_x_log10(labels = scales::label_number())
lr_plot

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 33 / 34


Full Code

## Comparing Penalized Models
# The 15 best candidates by ROC AUC, ordered by increasing penalty.
# `metric` is passed by name: newer tune releases no longer accept it
# positionally, and the named form works on older releases too.
top_models <-
  lr_res %>%
  show_best(metric = "roc_auc", n = 15) %>%
  arrange(penalty)
top_models
## Choosing Best Penalty
# The candidate with the highest validation ROC AUC.
best_penalty <- lr_res %>%
  select_best(metric = "roc_auc")
best_penalty
## ROC Curve
# Build the ROC curve from the saved validation predictions for the chosen
# penalty, using the predicted probability of class 1, then plot it.
lr_auc <- lr_res %>%
  collect_predictions(parameters = best_penalty) %>%
  roc_curve(response, .pred_1) %>%
  mutate(model = "Logistic Regression")
autoplot(lr_auc)

Advanced Data Analysis, AY 2021-22 Predictive Analysis 24 December 2021 34 / 34

You might also like