I. Load packages

library(readxl)     # read data from Excel files
library(tidyverse)   # clean & format data
library(ggplot2)     # visualize data
library(rstatix)     # basic statistical analysis
library(survival)    # fit and plot survival curves (base R)
library(survminer)   # beautiful survival plot

II. Estimate survival function

a. Kaplan Meier (KM) curves

The Kaplan-Meier estimate (nonparametric) is a step function with jumps at event times. The size of the steps depend on the number of events and the number of individuals at risk at the corresponding time. Note that if the last data is censored, the estimator will not reach the zero value.

1). Without grouping variable

fit0 <- survfit(Surv(time, status) ~ 1, data = lung) 
#time - survival time; 
#status - 1 (event occured)
#status - 0 (censored)

ggsurvplot(fit0, color = "#2E9FDF")

2). Add a grouping variable

fit1 <- survfit(Surv(time, status) ~ sex, data = lung)
ggsurvplot(fit1)

3). Understatnd the output of survfit() at a specific time point

  • the number of subjects at risk
  • the number of events
  • the survival time probability
  • the respective 95% confidence interval
summary(fit1, times = c(250, 750))
## Call: survfit(formula = Surv(time, status) ~ sex, data = lung)
## 
##                 sex=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   250     62      65   0.5192  0.0433        0.441        0.611
##   750      7      44   0.0781  0.0276        0.039        0.156
## 
##                 sex=2 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   250     53      24    0.719  0.0490       0.6287        0.821
##   750      3      28    0.125  0.0549       0.0527        0.295

4). Pros and Cons of the Kaplan-Meirs survival estimator

Pros:
- It is commonly used to describe survival. - It is commonly used to compare two study populations. - It is intuitive graphical presentation. (exploratory)

Cons:
- It is mainly descriptive. - It does not control for covariates. (only add categorical varaibles) - It can not accommodate time-dependent variables.

b. Accelerated Failure Time

This is a parametric survival model in which survival time (the outcome) is assumed to follow a known distribution. Examples of distributions that are commonly used for survival time are: the Weibull, the exponential (a special case of the Weibull), the log-logistic, the log-normal, etc.

1)Weibull distribution

fit_weibull=survreg(Surv(time, status) ~ as.factor(sex)+age, dist = "weibull",data = lung)
summary(fit_weibull)
## 
## Call:
## survreg(formula = Surv(time, status) ~ as.factor(sex) + age, 
##     data = lung, dist = "weibull")
##                    Value Std. Error     z       p
## (Intercept)      6.65694    0.44752 14.88 < 2e-16
## as.factor(sex)2  0.38209    0.12748  3.00  0.0027
## age             -0.01226    0.00696 -1.76  0.0781
## Log(scale)      -0.28230    0.06188 -4.56 5.1e-06
## 
## Scale= 0.754 
## 
## Weibull distribution
## Loglik(model)= -1147.1   Loglik(intercept only)= -1153.9
##  Chisq= 13.59 on 2 degrees of freedom, p= 0.0011 
## Number of Newton-Raphson Iterations: 5 
## n= 228

2) Lognormal distribution

fit_lognormal=survreg(Surv(time, status) ~ as.factor(sex)+age, dist = "lognormal",data = lung)
summary(fit_lognormal)
## 
## Call:
## survreg(formula = Surv(time, status) ~ as.factor(sex) + age, 
##     data = lung, dist = "lognormal")
##                    Value Std. Error     z       p
## (Intercept)      6.92724    0.54168 12.79 < 2e-16
## as.factor(sex)2  0.51925    0.15515  3.35 0.00082
## age             -0.02336    0.00839 -2.78 0.00536
## Log(scale)       0.05134    0.05602  0.92 0.35943
## 
## Scale= 1.05 
## 
## Log Normal distribution
## Loglik(model)= -1158.8   Loglik(intercept only)= -1169.3
##  Chisq= 21.04 on 2 degrees of freedom, p= 2.7e-05 
## Number of Newton-Raphson Iterations: 3 
## n= 228

3) Adjusted survival time

library(ggeffects)
test=ggeffect(fit_lognormal)$sex
test
## # Predicted values of time
## 
## sex | Predicted |           95% CI
## ----------------------------------
## 1   |    237.15 | [197.45, 284.84]
## 2   |    398.60 | [312.00, 509.22]
## 
## Adjusted for:
## * age = 62.45

III. Test Survival Curve Differences

The null hypothesis (\(H_0\)) is that there is no overall difference between the two (or k) survival curves

a. log-rank test

  • most widely used test
  • most powerful test when the proportional hazards (PH) assumption is satisfied (use cumulative hazard functions, see IV.b-c)
survdiff(Surv(time, status) ~ sex, data = lung, rho = 0) #rho controls the type of test
## Call:
## survdiff(formula = Surv(time, status) ~ sex, data = lung, rho = 0)
## 
##         N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=1 138      112     91.6      4.55      10.3
## sex=2  90       53     73.4      5.68      10.3
## 
##  Chisq= 10.3  on 1 degrees of freedom, p= 0.001

b. Peto & Peto Gehan-Wilcoxon test

  • most sensitive to early differences (or earlier time points) between survival curves
survdiff(Surv(time, status) ~ sex, data = lung, rho = 1) #rho controls the type of test
## Call:
## survdiff(formula = Surv(time, status) ~ sex, data = lung, rho = 1)
## 
##         N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=1 138     70.4     55.6      3.95      12.7
## sex=2  90     28.7     43.5      5.04      12.7
## 
##  Chisq= 12.7  on 1 degrees of freedom, p= 4e-04

IV. Customize suvival curves

a. Change line types and color palettes

ggsurvplot(fit1,  
           size = 1,  # change line size
           linetype = "strata", # change line type by groups
           break.time.by = 250, # break time axis by 250
           palette = c("#E7B800", "#2E9FDF"), # custom color palette
           conf.int = TRUE, # Add confidence interval
           pval = TRUE # Add p-value
           )

b. Use grey palette

ggsurvplot(fit1, 
           size = 1,  
           linetype = "strata", 
           conf.int = TRUE, 
           pval = TRUE,
           palette = "grey" # custom color palette - Grey
           )

c. Add risk table

ggsurvplot(fit1, 
           size = 1,  
           linetype = "strata", 
           pval = TRUE, 
           conf.int = TRUE,
           risk.table = TRUE) # Add risk table

d. Change risk.table color by strata

ggsurvplot(fit1, 
           linetype = "strata", 
           pval = TRUE, 
           conf.int = TRUE,
           palette = c("#E7B800", "#2E9FDF"),
           risk.table = TRUE, # Add risk table
           risk.table.col = "strata", # Change risk table color by groups
           ggtheme = theme_bw() # Change ggplot2 theme
           )

e. Change axis limits and breaks

ggsurvplot(fit1, 
           linetype = "strata", 
           pval = TRUE, 
           conf.int = TRUE,
           palette = c("#E7B800", "#2E9FDF"),
           risk.table = TRUE, # Add risk table
           risk.table.col = "strata", # Change risk table color by groups
           ggtheme = theme_bw(), # Change ggplot2 theme
           xlim=c(0,1100), # Change limits of x axis
           break.x.by=100) # Change breaks of x axis

IV. Transform survival curves

a. Plot cumulative events

ggsurvplot(fit1, 
           linetype = "strata", 
           pval = TRUE, 
           conf.int = TRUE,
           palette = c("#E7B800", "#2E9FDF"),
           ggtheme = theme_bw(),
           fun = "event") # Specify cumulative events

b. Plot the cumulative hazard function

  • When PH is satisfied, the two curves will be proportional to each other (i.e., the steadily grow away of each other).
ggsurvplot(fit1, 
           linetype = "strata", 
           pval = TRUE, 
           conf.int = TRUE,
           palette = c("#E7B800", "#2E9FDF"),
           ggtheme = theme_bw(),
           fun = "cumhaz") # Specify cumulative hazard function

c. Plot the log-log survival curves

  • When PH is satisfied, the two curves will be reasonably parallel to each other
  • Usually would assume the PH assumption is satisfied unless there is strong evidence of nonparallelism of the log–log curves (e.g. curves are crossing).
ggsurvplot(fit1, 
           linetype = "strata", 
           pval = TRUE, 
           conf.int = TRUE,
           palette = c("#E7B800", "#2E9FDF"),
           ggtheme = theme_bw(),
           fun = function (s) -log(-log(s)),
           ylab = "log-log Survival")

V. Survival curves with multiple grouping variable

a. Complex survival curve

The null hypothesis (H0) is that there is no overall difference between the six survival curves
- pairwise comparison = 15
- no more than 8 survival curves in one plot is better

fit2 <- survfit(Surv(time, status) ~ rx + sex, data = colon )
# rx represents the treatment group - Observation, Levamisole, Levamisole + 5-FU
# must be categorical variables for KM survival estimator

ggsurvplot(fit2, 
           pval = TRUE, 
           xlim=c(0,3000),
           break.time.by = 500,
           risk.table = TRUE,
           risk.table.col = "strata",
           risk.table.height = 0.35, # Adjust table height
           palette = "Dark2"
           )

b. Change legend labels

ggsurvplot(fit2, 
           pval = TRUE, 
           xlim=c(0,3000),
           break.time.by = 500,
           risk.table = TRUE,
           risk.table.col = "strata",
           risk.table.height = 0.35, # Adjust table height
           palette = "Dark2",
           legend.labs = c("A", "B", "C", "D", "E", "F")) # Change legend labels both in survival plot and risk table

VI. Case Study

# import data
KM = read_csv("../data/km.csv") %>%
  janitor::clean_names() %>%
  mutate(event=as.numeric(cnsr==0),
         trt02p=as.factor(ifelse(trt02pn==1,"Active","Placebo"))) %>%
         filter(event %in% c(0,1))

# fit KM curves
kM_fit <- survfit(Surv(aval, event) ~ trt02p,data = KM)

# plot KM curves
ggsurvplot(kM_fit, 
           data = KM,
           color= "strata",
           censor.shape="strata", 
           censor.size = 3, 
           #linetype=c(2,1),
           #ncensor.plot = TRUE, 
           #ncensor.plot.title="number of censored subjects",
           pval = TRUE, 
           pval.size = 4, 
           pval.coord=c(1,0.2),
           conf.int=FALSE,
           xlim=c(0,50),
           break.x.by=2,
           surv.median.line = "hv",
           legend.title="",
           ggtheme = theme_bw(),
           subtitle = "Figure XXX. Kaplan-Meier Curves for Time to Event Data \n
 ITT Analysis Set",
           ylab="1 - Probability of Event (%)",
           xlab="Time (weeks)")$plot +

           scale_y_continuous(breaks=seq(0,1,by=0.1),labels=seq(0,100,by=10)) +
           scale_shape_manual("",labels=c("Active","Placebo"),values = c(1,16)) +
           scale_color_manual("",labels=c("Active","Placebo"),values = c("blue","red")) +
           theme( plot.margin = margin(0.5,0.2,0.1,0.85, "cm"),
                  panel.grid.major.x = element_blank(),
                  panel.grid.minor = element_blank(),
                  panel.grid.major.y = element_blank(),
                  plot.title = element_text(size = rel(0.6),face = "bold", vjust = 5),
                  plot.subtitle = element_text(size = rel(0.8),face = "bold", hjust=0.5,vjust = 1.5),
                  strip.text.x = element_text(face = "bold"), #facet title
                  axis.title = element_text(face = "bold"),
                  legend.position = "top",
                  legend.title = element_text(face = "bold",size = rel(0.9),vjust= 1.8),
                  legend.key.size = unit(0.4, "cm"),
                  legend.background = element_rect(fill="transparent",size=0.5), # set background color to transparent
                  axis.title.y = element_text(size = rel(0.9),vjust= 5),
                  axis.title.x = element_text(size = rel(0.9),vjust= -0.5),
                  axis.text.x = element_text(face = "bold",color="black",size=rel(1.0)),
                  axis.text.y = element_text(face = "bold",color="black",size=rel(1.0)))