Evaluation

Author

Kelly Hatfield

Part 0: Accessing Data

library(here)
here() starts at /Users/kellymccormickhatfield/Documents/MADA 2023/kellyhatfield-MADA-portfolio
library(tidyverse)
── Attaching packages
───────────────────────────────────────
tidyverse 1.3.2 ──
✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.3.0      ✔ stringr 1.5.0 
✔ readr   2.1.3      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
✔ broom        1.0.2     ✔ rsample      1.1.1
✔ dials        1.1.0     ✔ tune         1.0.1
✔ infer        1.0.4     ✔ workflows    1.1.2
✔ modeldata    1.0.1     ✔ workflowsets 1.0.0
✔ parsnip      1.0.3     ✔ yardstick    1.1.0
✔ recipes      1.0.4     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Learn how to get started at https://www.tidymodels.org/start/
library(performance)

Attaching package: 'performance'

The following objects are masked from 'package:yardstick':

    mae, rmse
#Accessing data
datalocation <- here::here("fluanalysis","Data", "CleanSymp.Rds")
fludata <- readRDS(datalocation)
ls(fludata)
 [1] "AbPain"            "BodyTemp"          "Breathless"       
 [4] "ChestCongestion"   "ChestPain"         "ChillsSweats"     
 [7] "CoughIntensity"    "CoughYN"           "CoughYN2"         
[10] "Diarrhea"          "EarPn"             "EyePn"            
[13] "Fatigue"           "Headache"          "Hearing"          
[16] "Insomnia"          "ItchyEye"          "Myalgia"          
[19] "MyalgiaYN"         "NasalCongestion"   "Nausea"           
[22] "Pharyngitis"       "RunnyNose"         "Sneeze"           
[25] "SubjectiveFever"   "SwollenLymphNodes" "ToothPn"          
[28] "Vision"            "Vomit"             "Weakness"         
[31] "WeaknessYN"        "Wheeze"           

Part 1: Data Splitting

# Fix the random numbers by setting the seed 
# This enables the analysis to be reproducible when random numbers are used 
set.seed(1234)
# Put 3/4 of the data into the training set 
data_split <- initial_split(fludata, prop = 3/4)

# Create data frames for the two sets:
train_data <- training(data_split)
test_data  <- testing(data_split)

Workflow creation and model fitting

Create a simple recipe for a logistic model to our categorical outcome of interest.

#recipe #1: Nausea predicted by all variables
flu_recipe <- recipe(Nausea ~ ., data= train_data)

#model: logistic regression using GLM engine
lr_mod <- logistic_reg() %>% set_engine ("glm")

#workflow: tying together model and recipe

flu_wflow <- workflow() %>% add_model (lr_mod) %>% add_recipe (flu_recipe)

#Printing workflow
flu_wflow
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: logistic_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────
Logistic Regression Model Specification (classification)

Computational engine: glm 
#Fitting the model to train dataset
flu_fit <- 
  flu_wflow %>% 
  fit(data = train_data)

#Looking at model output;
flu_fit %>% 
  extract_fit_parsnip() %>% 
  tidy()
# A tibble: 38 × 5
   term                 estimate std.error statistic p.value
   <chr>                   <dbl>     <dbl>     <dbl>   <dbl>
 1 (Intercept)           -4.01       8.97     -0.447  0.655 
 2 SwollenLymphNodesYes  -0.400      0.229    -1.75   0.0802
 3 ChestCongestionYes     0.187      0.247     0.755  0.450 
 4 ChillsSweatsYes        0.343      0.328     1.05   0.296 
 5 NasalCongestionYes     0.384      0.290     1.32   0.185 
 6 CoughYNYes            -0.256      0.590    -0.434  0.664 
 7 SneezeYes              0.0384     0.246     0.156  0.876 
 8 FatigueYes             0.148      0.448     0.331  0.741 
 9 SubjectiveFeverYes     0.185      0.257     0.719  0.472 
10 HeadacheYes            0.571      0.351     1.63   0.104 
# … with 28 more rows

Use the trained workflow to predict with unseen test data

#Applying the model to unseen test data
predict(flu_fit, test_data)
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
# A tibble: 183 × 1
   .pred_class
   <fct>      
 1 Yes        
 2 No         
 3 No         
 4 No         
 5 No         
 6 No         
 7 No         
 8 No         
 9 No         
10 No         
# … with 173 more rows
flu_aug <- 
  augment(flu_fit, test_data)
Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading

Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
prediction from a rank-deficient fit may be misleading
# The data look like: 
flu_aug
# A tibble: 183 × 35
   SwollenLymph…¹ Chest…² Chill…³ Nasal…⁴ CoughYN Sneeze Fatigue Subje…⁵ Heada…⁶
   <fct>          <fct>   <fct>   <fct>   <fct>   <fct>  <fct>   <fct>   <fct>  
 1 Yes            Yes     Yes     Yes     No      Yes    Yes     Yes     Yes    
 2 Yes            No      Yes     No      No      No     Yes     Yes     Yes    
 3 No             No      Yes     No      Yes     Yes    Yes     Yes     Yes    
 4 No             Yes     Yes     Yes     Yes     Yes    Yes     Yes     Yes    
 5 Yes            Yes     Yes     Yes     Yes     No     Yes     Yes     Yes    
 6 No             Yes     Yes     Yes     Yes     Yes    Yes     Yes     Yes    
 7 No             No      Yes     No      Yes     No     Yes     Yes     Yes    
 8 Yes            Yes     Yes     Yes     Yes     Yes    Yes     Yes     Yes    
 9 No             Yes     No      No      Yes     Yes    Yes     No      Yes    
10 Yes            Yes     Yes     Yes     Yes     Yes    Yes     Yes     Yes    
# … with 173 more rows, 26 more variables: Weakness <fct>, WeaknessYN <fct>,
#   CoughIntensity <fct>, CoughYN2 <fct>, Myalgia <fct>, MyalgiaYN <fct>,
#   RunnyNose <fct>, AbPain <fct>, ChestPain <fct>, Diarrhea <fct>,
#   EyePn <fct>, Insomnia <fct>, ItchyEye <fct>, Nausea <fct>, EarPn <fct>,
#   Hearing <fct>, Pharyngitis <fct>, Breathless <fct>, ToothPn <fct>,
#   Vision <fct>, Vomit <fct>, Wheeze <fct>, BodyTemp <dbl>, .pred_class <fct>,
#   .pred_No <dbl>, .pred_Yes <dbl>, and abbreviated variable names …
#Plotting ROC curve;
flu_aug %>% 
  roc_curve(truth = Nausea, .pred_No) %>% 
  autoplot()

#Getting estimates;
flu_aug %>% 
  roc_auc(truth = Nausea, .pred_No)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.740

Alternative Model

#New recipe: only predictor is RunnyNose; 
flu_recipe2 <- recipe(Nausea ~ RunnyNose, data= train_data)



#New workflow

flu_wflow2 <- workflow() %>% add_model (lr_mod) %>% add_recipe (flu_recipe2)

#New fit
flu_fit2 <- 
  flu_wflow2 %>% 
  fit(data = train_data)

#Print new fit; 
flu_fit2 %>% 
  extract_fit_parsnip() %>% 
  tidy()
# A tibble: 2 × 5
  term         estimate std.error statistic  p.value
  <chr>           <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)    -0.664     0.171    -3.89  0.000100
2 RunnyNoseYes    0.101     0.200     0.506 0.613   
#Conduct prediction on test data; 
predict(flu_fit2, test_data)
# A tibble: 183 × 1
   .pred_class
   <fct>      
 1 No         
 2 No         
 3 No         
 4 No         
 5 No         
 6 No         
 7 No         
 8 No         
 9 No         
10 No         
# … with 173 more rows
flu_aug2 <- 
  augment(flu_fit2, test_data)

# Output ROC curves and statistics; 
flu_aug2 %>% 
  roc_curve(truth = Nausea, .pred_No) %>% 
  autoplot()

flu_aug2 %>% 
  roc_auc(truth = Nausea, .pred_No)
# A tibble: 1 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 roc_auc binary         0.488

This Section Added by Annabella Hines

#recipe #2: Body Temperature predicted by all variables
flu_recipe2 <- recipe(BodyTemp ~ ., data= train_data)

#model: linear regression using GLM engine
ln_mod <- linear_reg() %>% set_engine ("lm")

#workflow: tying together linear model and recipe 2

flu_wflow2 <- workflow() %>% add_model(ln_mod) %>% add_recipe(flu_recipe2)

#Printing workflow
flu_wflow2
══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()

── Preprocessor ────────────────────────────────────────────────────────────────
0 Recipe Steps

── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)

Computational engine: lm 
#Fitting the model to train dataset
flu_fit2 <- 
  flu_wflow2 %>% 
  fit(data = train_data)

#Looking at model output;
flu_fit2 %>% 
  extract_fit_parsnip() %>% 
  tidy()
# A tibble: 38 × 5
   term                 estimate std.error statistic  p.value
   <chr>                   <dbl>     <dbl>     <dbl>    <dbl>
 1 (Intercept)           97.5        0.382   255.    0       
 2 SwollenLymphNodesYes  -0.289      0.108    -2.67  0.00781 
 3 ChestCongestionYes     0.135      0.115     1.18  0.239   
 4 ChillsSweatsYes        0.199      0.148     1.34  0.179   
 5 NasalCongestionYes    -0.199      0.130    -1.53  0.127   
 6 CoughYNYes             0.328      0.273     1.20  0.230   
 7 SneezeYes             -0.335      0.115    -2.91  0.00377 
 8 FatigueYes             0.267      0.197     1.36  0.176   
 9 SubjectiveFeverYes     0.458      0.121     3.77  0.000180
10 HeadacheYes           -0.0328     0.151    -0.217 0.828   
# … with 28 more rows

Use the trained workflow to predict with unseen test data

#Applying the model to unseen test data
predict(flu_fit2, test_data)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
# A tibble: 183 × 1
   .pred
   <dbl>
 1  98.7
 2  99.0
 3  99.1
 4  98.6
 5  98.7
 6  98.7
 7  99.1
 8  98.9
 9  99.2
10  99.1
# … with 173 more rows
#augment data
flu_aug2 <- augment(flu_fit2, test_data)
Warning in predict.lm(object = object$fit, newdata = new_data, type =
"response"): prediction from a rank-deficient fit may be misleading
#pull body temperatures and predictions
temps <- flu_aug2 %>% pull(BodyTemp)
pred <- flu_aug2 %>% pull(.pred)

#run rmse
rmse_vec(temps, pred)
[1] 1.153632

Model with only Runny Nose as predictor

#Recipe for body temperature and runny nose
flu_recipe_btrn<- recipe(BodyTemp ~ RunnyNose, data=train_data)

#Modified workflow
flu_btrn_workflow <- workflow() %>% add_model(ln_mod) %>% add_recipe(flu_recipe_btrn)

#Fitting
flu_btrn_fit<-flu_btrn_workflow %>% fit(data=train_data)
#Check fit characteristics
flu_btrn_fit %>% extract_fit_parsnip() %>% tidy()
# A tibble: 2 × 5
  term         estimate std.error statistic p.value
  <chr>           <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)    99.1      0.0978   1014.    0     
2 RunnyNoseYes   -0.229    0.115      -1.98  0.0477
#Predictions
predict(flu_btrn_fit, test_data)
# A tibble: 183 × 1
   .pred
   <dbl>
 1  98.9
 2  99.1
 3  99.1
 4  98.9
 5  98.9
 6  98.9
 7  99.1
 8  98.9
 9  99.1
10  98.9
# … with 173 more rows
#Augment data
btrn_aug<-augment(flu_btrn_fit, test_data) 

#Pull temperatures and predictions
temps2 <- btrn_aug %>% pull(BodyTemp)
pred2 <- btrn_aug %>% pull(.pred)

#Run RMSE
rmse(temps2, pred2)
Warning: Can't extract residuals from model.
Warning: Response residuals not available to calculate mean square error. (R)MSE
  is probably not reliable.
the condition has length > 1
[1] NA