Machine Learning

Author

Kelly Hatfield

Step 1: Opening Data and Loading Packages

library(here)
here() starts at /Users/kellymccormickhatfield/Documents/MADA 2023/kellyhatfield-MADA-portfolio
library(tidyverse)
── Attaching packages ──────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.3.0      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom        1.0.2     ✔ rsample      1.1.1
✔ dials        1.1.0     ✔ tune         1.0.1
✔ infer        1.0.4     ✔ workflows    1.1.2
✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
✔ parsnip      1.0.3     ✔ yardstick    1.1.0
✔ recipes      1.0.4     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Use suppressPackageStartupMessages() to eliminate package startup messages
library(rpart)

Attaching package: 'rpart'

The following object is masked from 'package:dials':

    prune
library("rpart.plot")
library("glmnet")
Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Loaded glmnet 4.1-7
library("ranger")
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi
FinalDataML <- readRDS(here("fluanalysis", "Data", "FinalDataML.Rds"))

Step 2: Machine Learning

Data Setup and Null Model Performance

# Fix the random numbers by setting the seed 
# This enables the analysis to be reproducible when random numbers are used 
set.seed(123)


# Put 70% of the data into the training set (a 70/30 train/test split)
data_split <- initial_split(FinalDataML, prop = 7/10)

# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

# 5-fold cross-validation, repeated 5 times, stratified by body temperature
fold_ds <- vfold_cv(train_data, v = 5, repeats = 5, strata = BodyTemp)

# Recipes: one for the full model (all predictors) and one for the null (intercept-only) model
data_recipe <- recipe(BodyTemp ~ ., data = train_data) %>%
  step_dummy(all_nominal(), -all_outcomes()) 

null_recipe <- recipe(BodyTemp ~ 1, data = train_data) %>%
  step_dummy(all_nominal(), -all_outcomes()) 
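As a quick check (a small sketch added here, not part of the original analysis), the recipe can be prepped on the training data and baked to preview the dummy-coded predictors the models will receive:

# Sketch: preview the processed training data produced by the recipe
data_recipe %>%
  prep(training = train_data) %>%
  bake(new_data = NULL) %>%
  glimpse()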

#linear model
ln_model <- linear_reg() %>% set_engine("lm") %>% set_mode("regression")

#Workflow
null_flow <- workflow() %>% add_model(ln_model) %>% add_recipe(null_recipe)

# Fit the null model with cross-validation
null_fit <- null_flow %>% fit_resamples(resamples = fold_ds)
! Fold1, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! (the same warning is repeated for all 25 resamples: R-squared cannot be computed because the null model predicts a constant value)
null_metrics<- collect_metrics(null_fit)
null_metrics
# A tibble: 2 × 6
  .metric .estimator   mean     n std_err .config             
  <chr>   <chr>       <dbl> <int>   <dbl> <chr>               
1 rmse    standard     1.22    25  0.0178 Preprocessor1_Model1
2 rsq     standard   NaN        0 NA      Preprocessor1_Model1
# Null model RMSE (cross-validated): ~1.22
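For later comparison (a small sketch, not in the original analysis), the null-model RMSE can be pulled out as a single baseline number:

# Sketch: keep the cross-validated null RMSE as a baseline for the tuned models below
null_rmse <- null_metrics %>% filter(.metric == "rmse") %>% pull(mean)
null_rmse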

Model tuning and fitting

Tree

#Model
tune_spec <- 
  decision_tree(
    cost_complexity = tune(),
    tree_depth = tune()
  ) %>% 
  set_engine("rpart") %>% 
  set_mode("regression")


#Grid
tree_grid <- grid_regular(cost_complexity(),
                          tree_depth(),
                          levels = 5)
#create workflow
tree_wf <- workflow() %>%
  add_model(tune_spec) %>%
  add_formula(BodyTemp ~ .)


#Tuning grid cross validation
tree_res <- 
  tree_wf %>% 
  tune_grid(
    resamples = fold_ds,
    grid = tree_grid
    )
! Fold1, Repeat1: internal: A correlation computation is required, but `estimate` is constant and ha...
! (the same warning is repeated for all 25 resamples; it occurs for parameter combinations where the pruned tree makes a constant prediction)
tree_res %>% 
  collect_metrics()
# A tibble: 50 × 8
   cost_complexity tree_depth .metric .estimator     mean     n  std_err .config
             <dbl>      <int> <chr>   <chr>         <dbl> <int>    <dbl> <chr>  
 1    0.0000000001          1 rmse    standard     1.20      25  0.0173  Prepro…
 2    0.0000000001          1 rsq     standard     0.0485    25  0.00488 Prepro…
 3    0.0000000178          1 rmse    standard     1.20      25  0.0173  Prepro…
 4    0.0000000178          1 rsq     standard     0.0485    25  0.00488 Prepro…
 5    0.00000316            1 rmse    standard     1.20      25  0.0173  Prepro…
 6    0.00000316            1 rsq     standard     0.0485    25  0.00488 Prepro…
 7    0.000562              1 rmse    standard     1.20      25  0.0173  Prepro…
 8    0.000562              1 rsq     standard     0.0485    25  0.00488 Prepro…
 9    0.1                   1 rmse    standard     1.22      25  0.0178  Prepro…
10    0.1                   1 rsq     standard   NaN          0 NA       Prepro…
# … with 40 more rows
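The tuning results can also be inspected visually; a minimal sketch using tune's autoplot() method:

# Sketch: plot performance across the cost_complexity / tree_depth grid
autoplot(tree_res)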
# Look at the best-performing tuning parameter combinations
tree_res %>%
  show_best()
Warning: No value of `metric` was given; metric 'rmse' will be used.
# A tibble: 5 × 8
  cost_complexity tree_depth .metric .estimator  mean     n std_err .config     
            <dbl>      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>       
1    0.0000000001          1 rmse    standard    1.20    25  0.0173 Preprocesso…
2    0.0000000178          1 rmse    standard    1.20    25  0.0173 Preprocesso…
3    0.00000316            1 rmse    standard    1.20    25  0.0173 Preprocesso…
4    0.000562              1 rmse    standard    1.20    25  0.0173 Preprocesso…
5    0.1                   1 rmse    standard    1.22    25  0.0178 Preprocesso…
#rmse = 1.199

#Select best tree
best_tree <- tree_res %>%
  select_best(metric = "rmse")
#Final model from best tree
final_wf <- 
  tree_wf %>% 
  finalize_workflow(best_tree)


#Fit 
final_fit <- 
  final_wf %>%
  fit(train_data) 


final_fit
══ Workflow [trained] ══════════════════════════════════════════════════════════
Preprocessor: Formula
Model: decision_tree()

── Preprocessor ────────────────────────────────────────────────────────────────
BodyTemp ~ .

── Model ───────────────────────────────────────────────────────────────────────
n= 510 

node), split, n, deviance, yval
      * denotes terminal node

1) root 510 767.6997 98.98529  
  2) SubjectiveFever=No 154 101.9304 98.55974 *
  3) SubjectiveFever=Yes 356 625.8163 99.16938 *
#Plot final fit
rpart.plot(extract_fit_parsnip(final_fit)$fit)
Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
To silence this warning:
    Call rpart.plot with roundint=FALSE,
    or rebuild the rpart model with model=TRUE.
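Following the warning's own suggestion, the plot call could instead be written with roundint = FALSE (a sketch, not rerun here):

# Sketch: same tree plot, silencing the roundint warning as suggested above
rpart.plot(extract_fit_parsnip(final_fit)$fit, roundint = FALSE)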

Lasso

# Steps: 1) model specification, 2) workflow definition, 3) tuning grid specification, 4) tuning with cross-validation via tune_grid()



# Build the LASSO model specification (penalized linear regression, mixture = 1)
lasso_mod <- 
  linear_reg(penalty = tune(), mixture = 1) %>% 
  set_engine("glmnet")

#Create workflow using data recipe from above
lasso_workflow <- 
  workflow() %>% 
  add_model(lasso_mod) %>% 
  add_recipe(data_recipe)

# tuning grid
lasso_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
#Bottom 5 penalty values
lasso_grid %>% top_n(-5)
Selecting by penalty
# A tibble: 5 × 1
   penalty
     <dbl>
1 0.0001  
2 0.000127
3 0.000161
4 0.000204
5 0.000259
#Top 5 penalty values
lasso_grid %>% top_n(5)
Selecting by penalty
# A tibble: 5 × 1
  penalty
    <dbl>
1  0.0386
2  0.0489
3  0.0621
4  0.0788
5  0.1   
# Tune the penalty with cross-validation over the grid
lr_res <- 
  lasso_workflow %>% 
  tune_grid(resamples = fold_ds,
            grid = lasso_grid,
            control = control_grid(verbose = FALSE, save_pred = TRUE),
            metrics = NULL)

lr_res %>% collect_metrics()
# A tibble: 60 × 7
    penalty .metric .estimator   mean     n std_err .config              
      <dbl> <chr>   <chr>       <dbl> <int>   <dbl> <chr>                
 1 0.0001   rmse    standard   1.21      25 0.0193  Preprocessor1_Model01
 2 0.0001   rsq     standard   0.0567    25 0.00650 Preprocessor1_Model01
 3 0.000127 rmse    standard   1.21      25 0.0193  Preprocessor1_Model02
 4 0.000127 rsq     standard   0.0567    25 0.00650 Preprocessor1_Model02
 5 0.000161 rmse    standard   1.21      25 0.0193  Preprocessor1_Model03
 6 0.000161 rsq     standard   0.0567    25 0.00650 Preprocessor1_Model03
 7 0.000204 rmse    standard   1.21      25 0.0193  Preprocessor1_Model04
 8 0.000204 rsq     standard   0.0567    25 0.00650 Preprocessor1_Model04
 9 0.000259 rmse    standard   1.21      25 0.0193  Preprocessor1_Model05
10 0.000259 rsq     standard   0.0567    25 0.00650 Preprocessor1_Model05
# … with 50 more rows
lr_res %>% show_best()
Warning: No value of `metric` was given; metric 'rmse' will be used.
# A tibble: 5 × 7
  penalty .metric .estimator  mean     n std_err .config              
    <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1  0.0621 rmse    standard    1.18    25  0.0179 Preprocessor1_Model28
2  0.0788 rmse    standard    1.18    25  0.0176 Preprocessor1_Model29
3  0.0489 rmse    standard    1.18    25  0.0182 Preprocessor1_Model27
4  0.0386 rmse    standard    1.18    25  0.0183 Preprocessor1_Model26
5  0.1    rmse    standard    1.19    25  0.0175 Preprocessor1_Model30
#Selects best performing model
best_lasso <- lr_res %>% select_best()
Warning: No value of `metric` was given; metric 'rmse' will be used.
# Best LASSO rmse ≈ 1.18

#Final Model
lasso_final_wf <- 
  lasso_workflow %>% finalize_workflow(best_lasso)
lasso_final_fit <- lasso_final_wf %>% fit(train_data) 

# Plot the coefficient paths as a function of lambda
x <- extract_fit_engine(lasso_final_fit)
plot(x, xvar = "lambda")
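To see which predictors the LASSO retains at the tuned penalty, one option (a sketch, assuming the standard parsnip/broom tidy() method for glmnet fits) is:

# Sketch: coefficients at the selected penalty; terms shrunk to exactly zero are dropped
lasso_final_fit %>%
  extract_fit_parsnip() %>%
  tidy() %>%
  filter(estimate != 0)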

Random Forest

# Steps: 1) model specification, 2) workflow definition, 3) tuning grid specification, 4) tuning with cross-validation via tune_grid()


# Build the random forest model specification (ranger engine, using all available cores)
cores <- parallel::detectCores()
cores
[1] 8
randomforest_mod <-  rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>% 
  set_engine("ranger", importance = "impurity", num.threads = cores) %>% set_mode("regression")

#Workflow
randomforest_wf <-  workflow() %>% add_model(randomforest_mod) %>% add_recipe(data_recipe)

# Inspect which parameters need tuning
extract_parameter_set_dials(randomforest_mod)
Collection of 2 parameters for tuning

 identifier  type    object
       mtry  mtry nparam[?]
      min_n min_n nparam[+]

Model parameters needing finalization:
   # Randomly Selected Predictors ('mtry')

See `?dials::finalize` or `?dials::update.parameters` for more information.
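The unfinalized mtry range could also be resolved explicitly against the training predictors (a sketch; below we simply let tune_grid() finalize it automatically):

# Sketch: finalize the mtry range using the number of predictors in the training data
rf_params <- extract_parameter_set_dials(randomforest_mod) %>%
  finalize(select(train_data, -BodyTemp))
rf_params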
#Tune grid
randomforest_res <- randomforest_wf %>%  tune_grid(fold_ds, grid = 25, control = control_grid(save_pred = TRUE), metrics = NULL)
i Creating pre-processing data to finalize unknown parameter: mtry
#Best forest
randomforest_res %>% show_best()
Warning: No value of `metric` was given; metric 'rmse' will be used.
# A tibble: 5 × 8
   mtry min_n .metric .estimator  mean     n std_err .config              
  <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
1     9    33 rmse    standard    1.19    25  0.0176 Preprocessor1_Model24
2     3    25 rmse    standard    1.19    25  0.0177 Preprocessor1_Model02
3     4    19 rmse    standard    1.19    25  0.0177 Preprocessor1_Model25
4    13    27 rmse    standard    1.20    25  0.0176 Preprocessor1_Model20
5    20    39 rmse    standard    1.20    25  0.0175 Preprocessor1_Model01
randomforest_best <- randomforest_res %>% select_best()
Warning: No value of `metric` was given; metric 'rmse' will be used.
# Finalize the workflow with the best parameters
randomforest_fwf<- randomforest_wf %>% finalize_workflow(randomforest_best)

#Final fit
ranforest_fin <- randomforest_fwf %>% fit(train_data)
ranforest_fin %>% extract_fit_parsnip() %>% vip(num_features = 28)

# Alternative: extract the underlying ranger fit and plot variable importance with vip
fun <- extract_fit_engine(ranforest_fin)
vip(fun)

# Random forest rmse ≈ 1.19
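Collecting the cross-validated RMSE values reported above into one small table (a sketch using the rounded numbers from collect_metrics()/show_best()) makes the comparison across models easier:

# Sketch: cross-validated RMSE for each candidate model, taken from the output above
tibble(
  model   = c("Null", "Decision tree", "LASSO", "Random forest"),
  cv_rmse = c(1.22, 1.20, 1.18, 1.19)
)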

Final Model

We select the LASSO model because it had the lowest cross-validated RMSE (≈ 1.18) of the tuned models.

We now fit the final LASSO workflow on the training data and evaluate it on the held-out test set with last_fit().

lasso_test_data <- 
  lasso_final_wf %>%
  last_fit(data_split) 
lasso_test_data %>%
   collect_metrics()
# A tibble: 2 × 4
  .metric .estimator .estimate .config             
  <chr>   <chr>          <dbl> <chr>               
1 rmse    standard      1.10   Preprocessor1_Model1
2 rsq     standard      0.0428 Preprocessor1_Model1
# Test-set RMSE: 1.10
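As a final check (a sketch), last_fit() also stores the test-set predictions, which can be plotted against the observed body temperatures:

# Sketch: observed vs. predicted body temperature on the test set
lasso_test_data %>%
  collect_predictions() %>%
  ggplot(aes(x = BodyTemp, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(linetype = "dashed") +
  labs(x = "Observed body temperature", y = "Predicted body temperature")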