Survival analysis with tidymodels

R/medicine 2024

Hannah Frick

Why survival analysis?

Our data situation

We want to analyse time-to-event data.

Side note: “time” can be any continuous variable but
most often it is indeed time.

If we don’t know the exact time, e.g., because the event
hasn’t happened yet, the observation is censored. It is
incomplete but not missing.

Time-to-event data inherently has two aspects: time
and event status. Regression and classification only
cover one aspect each.

Survival analysis is unique because it simultaneously considers if events happened (i.e. a binary outcome) and when events happened (e.g. a continuous outcome).¹

Why tidymodels?

tidymodels is a framework for modelling and

machine learning using tidyverse principles.

Core coverage

Extendable

Focus on the modelling question,

not the infrastructure for
empirical validation.

Focus on the modelling question,

not the syntax.

Case study

Building complaints in NYC

library(tidymodels)
library(censored)

building_complaints <- modeldatatoo::data_building_complaints()
glimpse(building_complaints)
#> Rows: 4,234
#> Columns: 11
#> $ days_to_disposition <dbl> 72, 1, 41, 45, 16, 62, 56, 11, 35, 38, 3…
#> $ status              <chr> "ACTIVE", "ACTIVE", "ACTIVE", "ACTIVE", …
#> $ year_entered        <fct> 2023, 2023, 2023, 2023, 2023, 2023, 2023…
#> $ latitude            <dbl> 40.66173, 40.57668, 40.73242, 40.68245, …
#> $ longitude           <dbl> -73.98297, -74.00453, -73.87630, -73.793…
#> $ borough             <fct> Brooklyn, Brooklyn, Queens, Queens, Broo…
#> $ special_district    <fct> None, None, None, None, None, None, None…
#> $ unit                <fct> Q-L, Q-L, SPOPS, Q-L, BKLYN, Q-L, Q-L, S…
#> $ community_board     <fct> 307, 313, 404, 412, 312, 406, 306, 306, …
#> $ complaint_category  <fct> 45, 45, 49, 45, 31, 45, 45, 49, 45, 45, …
#> $ complaint_priority  <fct> B, B, C, B, C, B, B, C, B, B, B, B, C, C…

Building complaints in NYC

building_complaints <- building_complaints %>% 
  mutate(
    disposition_surv = Surv(days_to_disposition, status == "CLOSED"), 
    .keep = "unused"
  )

Split the data

set.seed(403)
complaints_split <- initial_split(building_complaints)

complaints_train <- training(complaints_split)
complaints_test <- testing(complaints_split)

Building complaints in NYC

A first model

cox_spec <- proportional_hazards() %>% 
  set_engine("survival") %>% 
  set_mode("censored regression")

rec_other <- recipe(disposition_surv ~ ., data = complaints_train) %>% 
  step_unknown(complaint_priority) %>% 
  step_novel(all_nominal_predictors()) %>%
  step_other(community_board, unit, threshold = 0.02) %>%
  step_rm(complaint_category)

cox_wflow <- workflow() %>% 
  add_recipe(rec_other) %>% 
  add_model(cox_spec)

Resampling our first model

complaints_rset <- vfold_cv(complaints_train)

survival_metrics <- metric_set(brier_survival_integrated, brier_survival,
                               roc_auc_survival, concordance_survival)
evaluation_time_points <- seq(0, 300, 30)

Resampling our first model

set.seed(1)
cox_res <- fit_resamples(
  cox_wflow,
  resamples = complaints_rset,
  metrics = survival_metrics,
  eval_time = evaluation_time_points
)

Separation over evaluation time

Calibration over evaluation time

Integrated Brier score

collect_metrics(cox_res) %>% 
  filter(.metric == "brier_survival_integrated")
#> # A tibble: 1 × 7
#>   .metric           .estimator .eval_time   mean     n std_err .config
#>   <chr>             <chr>           <dbl>  <dbl> <int>   <dbl> <chr>  
#> 1 brier_survival_i… standard           NA 0.0552    10 0.00354 Prepro…

Let’s go further

Alternative modeling strategy

Instead of lumping factor levels together in a "other" category,

let’s try out regularization on dummy variables.

Switching to regularized model

cox_spec <- proportional_hazards() %>% 
  set_engine("survival") %>% 
  set_mode("censored regression")

rec_other <- recipe(disposition_surv ~ ., data = complaints_train) %>% 
  step_unknown(complaint_priority) %>% 
  step_novel(community_board, unit) %>%
  step_other(community_board, unit, threshold = 0.02) %>%
  step_rm(complaint_category)

cox_wflow <- workflow() %>% 
  add_recipe(rec_other) %>% 
  add_model(cox_spec)

Switching to regularized model

cox_spec <- proportional_hazards() %>% 
  set_engine("survival") %>% 
  set_mode("censored regression")

rec_dummies <- recipe(disposition_surv ~ ., data = complaints_train) %>% 
  step_unknown(complaint_priority) %>% 
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_numeric_predictors())

cox_wflow <- workflow() %>% 
  add_recipe(rec_dummies) %>% 
  add_model(cox_spec)

Switching to regularized model

coxnet_spec <- proportional_hazards(penalty = 0.1) %>% 
  set_engine("glmnet") %>% 
  set_mode("censored regression")

rec_dummies <- recipe(disposition_surv ~ ., data = complaints_train) %>% 
  step_unknown(complaint_priority) %>% 
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_numeric_predictors())

coxnet_wflow <- workflow() %>% 
  add_recipe(rec_dummies) %>% 
  add_model(coxnet_spec)

But how much regularization do we need?

coxnet_spec <- proportional_hazards(
    penalty = 0.1
  ) %>% 
  set_engine("glmnet") %>% 
  set_mode("censored regression")

rec_dummies <- recipe(disposition_surv ~ ., data = complaints_train) %>% 
  step_unknown(complaint_priority) %>% 
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_numeric_predictors())

coxnet_wflow <- workflow() %>% 
  add_recipe(rec_dummies) %>% 
  add_model(coxnet_spec)

But how much regularization do we need?

coxnet_spec <- proportional_hazards(
    penalty = tune()
  ) %>% 
  set_engine("glmnet") %>% 
  set_mode("censored regression")

rec_dummies <- recipe(disposition_surv ~ ., data = complaints_train) %>% 
  step_unknown(complaint_priority) %>% 
  step_novel(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>% 
  step_zv(all_predictors()) %>% 
  step_normalize(all_numeric_predictors())

coxnet_wflow <- workflow() %>% 
  add_recipe(rec_dummies) %>% 
  add_model(coxnet_spec)

Resample 1 model

set.seed(1)
cox_res <- fit_resamples(
  cox_wflow,
  resamples = complaints_rset,
  metrics = survival_metrics,
  eval_time = evaluation_time_points
)

Resample 1 model

set.seed(1)
coxnet_res <- fit_resamples(
  coxnet_wflow,
  resamples = complaints_rset,
  metrics = survival_metrics,
  eval_time = evaluation_time_points
)

Resample 10 models

set.seed(1)
coxnet_res <- tune_grid(
  coxnet_wflow,
  resamples = complaints_rset,
  grid = 10,
  metrics = survival_metrics,
  eval_time = evaluation_time_points
)

Compare preformance

show_best(cox_res, metric = "brier_survival_integrated", n = 1)
#> # A tibble: 1 × 7
#>   .metric           .estimator .eval_time   mean     n std_err .config
#>   <chr>             <chr>           <dbl>  <dbl> <int>   <dbl> <chr>  
#> 1 brier_survival_i… standard           NA 0.0552    10 0.00354 Prepro…

show_best(coxnet_res, metric = "brier_survival_integrated", n = 1)
#> # A tibble: 1 × 8
#>   penalty .metric   .estimator .eval_time   mean     n std_err .config
#>     <dbl> <chr>     <chr>           <dbl>  <dbl> <int>   <dbl> <chr>  
#> 1 0.00750 brier_su… standard           NA 0.0531    10 0.00354 Prepro…

tidymodels for survival analysis

Models:
parametric, semi-parametric, and tree-based
Predictions:
survival time, survival probability, hazard, and linear predictor
Metrics:
concordance index, Brier score, integrated Brier score, AUC ROC

tidymodels for survival analysis

Learn more on tidymodels.org

tidymodels.org