library(readxl) # read data from Excel files
library(tidyverse) # clean & format data
library(ggplot2) # visualize data
library(rstatix) # basic statistical analysis
library(survival) # fit and plot survival curves (base R)
library(survminer) # beautiful survival plot
The Kaplan-Meier estimate (nonparametric) is a step function with jumps at event times. The size of the steps depend on the number of events and the number of individuals at risk at the corresponding time. Note that if the last data is censored, the estimator will not reach the zero value.
fit0 <- survfit(Surv(time, status) ~ 1, data = lung)
#time - survival time;
#status - 1 (event occured)
#status - 0 (censored)
ggsurvplot(fit0, color = "#2E9FDF")
fit1 <- survfit(Surv(time, status) ~ sex, data = lung)
ggsurvplot(fit1)
summary(fit1, times = c(250, 750))
## Call: survfit(formula = Surv(time, status) ~ sex, data = lung)
##
## sex=1
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 250 62 65 0.5192 0.0433 0.441 0.611
## 750 7 44 0.0781 0.0276 0.039 0.156
##
## sex=2
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 250 53 24 0.719 0.0490 0.6287 0.821
## 750 3 28 0.125 0.0549 0.0527 0.295
Pros:
- It is commonly used to describe survival. - It is commonly used to compare two study populations. - It is intuitive graphical presentation. (exploratory)
Cons:
- It is mainly descriptive. - It does not control for covariates. (only add categorical varaibles) - It can not accommodate time-dependent variables.
This is a parametric survival model in which survival time (the outcome) is assumed to follow a known distribution. Examples of distributions that are commonly used for survival time are: the Weibull, the exponential (a special case of the Weibull), the log-logistic, the log-normal, etc.
fit_weibull=survreg(Surv(time, status) ~ as.factor(sex)+age, dist = "weibull",data = lung)
summary(fit_weibull)
##
## Call:
## survreg(formula = Surv(time, status) ~ as.factor(sex) + age,
## data = lung, dist = "weibull")
## Value Std. Error z p
## (Intercept) 6.65694 0.44752 14.88 < 2e-16
## as.factor(sex)2 0.38209 0.12748 3.00 0.0027
## age -0.01226 0.00696 -1.76 0.0781
## Log(scale) -0.28230 0.06188 -4.56 5.1e-06
##
## Scale= 0.754
##
## Weibull distribution
## Loglik(model)= -1147.1 Loglik(intercept only)= -1153.9
## Chisq= 13.59 on 2 degrees of freedom, p= 0.0011
## Number of Newton-Raphson Iterations: 5
## n= 228
fit_lognormal=survreg(Surv(time, status) ~ as.factor(sex)+age, dist = "lognormal",data = lung)
summary(fit_lognormal)
##
## Call:
## survreg(formula = Surv(time, status) ~ as.factor(sex) + age,
## data = lung, dist = "lognormal")
## Value Std. Error z p
## (Intercept) 6.92724 0.54168 12.79 < 2e-16
## as.factor(sex)2 0.51925 0.15515 3.35 0.00082
## age -0.02336 0.00839 -2.78 0.00536
## Log(scale) 0.05134 0.05602 0.92 0.35943
##
## Scale= 1.05
##
## Log Normal distribution
## Loglik(model)= -1158.8 Loglik(intercept only)= -1169.3
## Chisq= 21.04 on 2 degrees of freedom, p= 2.7e-05
## Number of Newton-Raphson Iterations: 3
## n= 228
library(ggeffects)
test=ggeffect(fit_lognormal)$sex
test
## # Predicted values of time
##
## sex | Predicted | 95% CI
## ----------------------------------
## 1 | 237.15 | [197.45, 284.84]
## 2 | 398.60 | [312.00, 509.22]
##
## Adjusted for:
## * age = 62.45
The null hypothesis (\(H_0\)) is that there is no overall difference between the two (or k) survival curves
survdiff(Surv(time, status) ~ sex, data = lung, rho = 0) #rho controls the type of test
## Call:
## survdiff(formula = Surv(time, status) ~ sex, data = lung, rho = 0)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=1 138 112 91.6 4.55 10.3
## sex=2 90 53 73.4 5.68 10.3
##
## Chisq= 10.3 on 1 degrees of freedom, p= 0.001
survdiff(Surv(time, status) ~ sex, data = lung, rho = 1) #rho controls the type of test
## Call:
## survdiff(formula = Surv(time, status) ~ sex, data = lung, rho = 1)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## sex=1 138 70.4 55.6 3.95 12.7
## sex=2 90 28.7 43.5 5.04 12.7
##
## Chisq= 12.7 on 1 degrees of freedom, p= 4e-04
ggsurvplot(fit1,
size = 1, # change line size
linetype = "strata", # change line type by groups
break.time.by = 250, # break time axis by 250
palette = c("#E7B800", "#2E9FDF"), # custom color palette
conf.int = TRUE, # Add confidence interval
pval = TRUE # Add p-value
)
ggsurvplot(fit1,
size = 1,
linetype = "strata",
conf.int = TRUE,
pval = TRUE,
palette = "grey" # custom color palette - Grey
)
ggsurvplot(fit1,
size = 1,
linetype = "strata",
pval = TRUE,
conf.int = TRUE,
risk.table = TRUE) # Add risk table
ggsurvplot(fit1,
linetype = "strata",
pval = TRUE,
conf.int = TRUE,
palette = c("#E7B800", "#2E9FDF"),
risk.table = TRUE, # Add risk table
risk.table.col = "strata", # Change risk table color by groups
ggtheme = theme_bw() # Change ggplot2 theme
)
ggsurvplot(fit1,
linetype = "strata",
pval = TRUE,
conf.int = TRUE,
palette = c("#E7B800", "#2E9FDF"),
risk.table = TRUE, # Add risk table
risk.table.col = "strata", # Change risk table color by groups
ggtheme = theme_bw(), # Change ggplot2 theme
xlim=c(0,1100), # Change limits of x axis
break.x.by=100) # Change breaks of x axis
ggsurvplot(fit1,
linetype = "strata",
pval = TRUE,
conf.int = TRUE,
palette = c("#E7B800", "#2E9FDF"),
ggtheme = theme_bw(),
fun = "event") # Specify cumulative events
ggsurvplot(fit1,
linetype = "strata",
pval = TRUE,
conf.int = TRUE,
palette = c("#E7B800", "#2E9FDF"),
ggtheme = theme_bw(),
fun = "cumhaz") # Specify cumulative hazard function
ggsurvplot(fit1,
linetype = "strata",
pval = TRUE,
conf.int = TRUE,
palette = c("#E7B800", "#2E9FDF"),
ggtheme = theme_bw(),
fun = function (s) -log(-log(s)),
ylab = "log-log Survival")
The null hypothesis (H0) is that there is no overall difference between the six survival curves
- pairwise comparison = 15
- no more than 8 survival curves in one plot is better
fit2 <- survfit(Surv(time, status) ~ rx + sex, data = colon )
# rx represents the treatment group - Observation, Levamisole, Levamisole + 5-FU
# must be categorical variables for KM survival estimator
ggsurvplot(fit2,
pval = TRUE,
xlim=c(0,3000),
break.time.by = 500,
risk.table = TRUE,
risk.table.col = "strata",
risk.table.height = 0.35, # Adjust table height
palette = "Dark2"
)
ggsurvplot(fit2,
pval = TRUE,
xlim=c(0,3000),
break.time.by = 500,
risk.table = TRUE,
risk.table.col = "strata",
risk.table.height = 0.35, # Adjust table height
palette = "Dark2",
legend.labs = c("A", "B", "C", "D", "E", "F")) # Change legend labels both in survival plot and risk table
# import data
KM = read_csv("../data/km.csv") %>%
janitor::clean_names() %>%
mutate(event=as.numeric(cnsr==0),
trt02p=as.factor(ifelse(trt02pn==1,"Active","Placebo"))) %>%
filter(event %in% c(0,1))
# fit KM curves
kM_fit <- survfit(Surv(aval, event) ~ trt02p,data = KM)
# plot KM curves
ggsurvplot(kM_fit,
data = KM,
color= "strata",
censor.shape="strata",
censor.size = 3,
#linetype=c(2,1),
#ncensor.plot = TRUE,
#ncensor.plot.title="number of censored subjects",
pval = TRUE,
pval.size = 4,
pval.coord=c(1,0.2),
conf.int=FALSE,
xlim=c(0,50),
break.x.by=2,
surv.median.line = "hv",
legend.title="",
ggtheme = theme_bw(),
subtitle = "Figure XXX. Kaplan-Meier Curves for Time to Event Data \n
ITT Analysis Set",
ylab="1 - Probability of Event (%)",
xlab="Time (weeks)")$plot +
scale_y_continuous(breaks=seq(0,1,by=0.1),labels=seq(0,100,by=10)) +
scale_shape_manual("",labels=c("Active","Placebo"),values = c(1,16)) +
scale_color_manual("",labels=c("Active","Placebo"),values = c("blue","red")) +
theme( plot.margin = margin(0.5,0.2,0.1,0.85, "cm"),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
panel.grid.major.y = element_blank(),
plot.title = element_text(size = rel(0.6),face = "bold", vjust = 5),
plot.subtitle = element_text(size = rel(0.8),face = "bold", hjust=0.5,vjust = 1.5),
strip.text.x = element_text(face = "bold"), #facet title
axis.title = element_text(face = "bold"),
legend.position = "top",
legend.title = element_text(face = "bold",size = rel(0.9),vjust= 1.8),
legend.key.size = unit(0.4, "cm"),
legend.background = element_rect(fill="transparent",size=0.5), # set background color to transparent
axis.title.y = element_text(size = rel(0.9),vjust= 5),
axis.title.x = element_text(size = rel(0.9),vjust= -0.5),
axis.text.x = element_text(face = "bold",color="black",size=rel(1.0)),
axis.text.y = element_text(face = "bold",color="black",size=rel(1.0)))