I. Load packages

library(readxl)     # read data from Excel files
library(tidyverse)   # clean & format data
library(ggplot2)     # visualize data
library(rstatix)     # basic statistical analysis
library(table1)     # build summary table
library(ggpubr)     # arrange different plots

II. Import dataset

data <- read_xlsx("../data/sample_complex.xlsx") 
str(data) #39 subjects & 30 characteristcs
## tibble [39 × 30] (S3: tbl_df/tbl/data.frame)
##  $ opioid   : num [1:39] 60 49.5 337.5 35 64 ...
##  $ nrs      : num [1:39] 3.01 6.37 6.6 6.05 5.83 ...
##  $ siteid   : chr [1:39] "101" "101" "101" "101" ...
##  $ ethnic   : chr [1:39] "NOT HISPANIC OR LATINO" "NOT HISPANIC OR LATINO" "NOT HISPANIC OR LATINO" "NOT HISPANIC OR LATINO" ...
##  $ age      : num [1:39] 62 65 56 75 47 57 64 60 59 72 ...
##  $ sex      : chr [1:39] "Female" "Male" "Male" "Male" ...
##  $ race     : chr [1:39] "WHITE" "BLACK" "WHITE" "WHITE" ...
##  $ cohort   : chr [1:39] "Cohort 1" "Cohort 1" "Cohort 1" "Cohort 1" ...
##  $ type     : chr [1:39] "Open Lumbar Posterior Spinal Fusion" "Open Lumbar Posterior Spinal Fusion" "Open Lumbar Posterior Spinal Fusion" "Open Lumbar Posterior Spinal Fusion" ...
##  $ level    : chr [1:39] "1-level" "2-level" "1-level" "1-level" ...
##  $ fusion   : chr [1:39] "Primary fusion" "Primary fusion" "Primary fusion" "Primary fusion" ...
##  $ techniq  : chr [1:39] "Open surgical technique" "Open surgical technique" "Open surgical technique" "Open surgical technique" ...
##  $ radicu   : chr [1:39] "N" "N" "N" "N" ...
##  $ spinal   : chr [1:39] "N" "N" "N" "N" ...
##  $ surgery  : chr [1:39] "Y" "Y" "Y" "Y" ...
##  $ complfl  : chr [1:39] "Y" "Y" NA "Y" ...
##  $ dcsreas  : chr [1:39] NA NA NA NA ...
##  $ basewt   : num [1:39] 93 91.6 88.9 77.1 62.6 ...
##  $ baseht   : num [1:39] 168 173 173 170 163 ...
##  $ basebmi  : num [1:39] 33.1 30.7 29.8 26.6 23.7 33.2 26.1 26.2 33.4 30.7 ...
##  $ basenrs  : num [1:39] 4 5 6 7 0 4 4 0 NA 8 ...
##  $ param    : chr [1:39] "AUC (0-72)" "AUC (0-72)" "AUC (0-72)" "AUC (0-72)" ...
##  $ aval     : num [1:39] 217 458 475 436 420 ...
##  $ agegroup : chr [1:39] "Age>=57" "Age>=57" "Age<57" "Age>=57" ...
##  $ label_age: chr [1:39] ">=57" ">=57" "<57" ">=57" ...
##  $ htgroup  : chr [1:39] "Height<180" "Height<180" "Height<180" "Height<180" ...
##  $ bmigroup : chr [1:39] "bmi>=30" "bmi>=30" "bmi<30" "bmi<30" ...
##  $ nrsgroup : chr [1:39] "NRS<6" "NRS<6" "NRS>=6" "NRS>=6" ...
##  $ cluster  : num [1:39] 1 2 3 2 2 2 2 2 2 1 ...
##  $ subject  : chr [1:39] "01-0002" "01-0003" "01-0005" "01-0010" ...
# Randomly assign treatment group
data$trt=sample(c("A","B"),size=nrow(data),replace = TRUE)

III. Mixed plots

a. “reflection” plot

1) Figure out the two variables:

  • primary variable as positive
  • secondary variable as negative
  • both variables can be scaled
ggplot(data=data, aes(x=subject)) +
  #pain-positive
  geom_bar(aes(y=nrs),stat="identity",fill="grey",alpha=0.8)+
  #opioid-negative-scaled
  geom_bar(aes(y=opioid*(-0.01)),stat="identity",fill="steelblue",alpha=0.8)+
  labs(x="") +
  theme(legend.position = "none",
        panel.grid.minor.x = element_blank(),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1,size = rel(0.9)))

2) Relabel y-axis to reflect variables’ true values

ggplot(data=data, aes(x=subject)) +
  #pain-positive
  geom_bar(aes(y=nrs),stat="identity",fill="grey",alpha=0.8)+
  #opioid-negative-scaled
  geom_bar(aes(y=opioid*(-0.01)),stat="identity",fill="steelblue",alpha=0.8)+
  #relabel y-axis
  scale_y_continuous("",limits=c(-3.5,10),
                        breaks=c(-3,-1.5,0,2,4,6,8,10),
                        labels=c("OMED 300mg","OMED 150mg","0","NRS 2","NRS 4","NRS 6","NRS 8","NRS 10"))+
  labs(x="") +
  theme_bw()+
  theme(legend.position = "none",
        panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(), 
        panel.grid.minor.x = element_blank(),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1,size = rel(0.9)))

3) Sort subjects by ascending/descending order of primary variable

data$subject=fct_reorder(data$subject,data$nrs,.fun=max)

ggplot(data=data, aes(x=subject)) +
  #pain-positive
  geom_bar(aes(y=nrs),stat="identity",fill="grey",alpha=0.8)+
  #opioid-negative-scaled
  geom_bar(aes(y=opioid*(-0.01)),stat="identity",fill="steelblue",alpha=0.8)+
  #relabel y-axis
  scale_y_continuous("",limits=c(-3.5,10),
                        breaks=c(-3,-1.5,0,2,4,6,8,10),
                        labels=c("OMED 300mg","OMED 150mg","0","NRS 2","NRS 4","NRS 6","NRS 8","NRS 10"))+
  labs(x="") +
  theme_bw()+
  theme(legend.position = "none",
        panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(), 
        panel.grid.minor.x = element_blank(),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1,size = rel(0.9)))

4) Color primary variable by treatment group

ggplot(data=data, aes(x=subject)) +
  #pain-positive
  geom_bar(aes(y=nrs,fill=trt),stat="identity",alpha=0.8)+
  #opioid-negative-scaled
  geom_bar(aes(y=opioid*(-0.01)),stat="identity",fill="steelblue",alpha=0.8)+
  #relabel y-axis
  scale_y_continuous("",limits=c(-3.5,10),
                        breaks=c(-3,-1.5,0,2,4,6,8,10),
                        labels=c("OMED 300mg","OMED 150mg","0","NRS 2","NRS 4","NRS 6","NRS 8","NRS 10")) +
  #color by different treatment group
  scale_fill_manual("Treatment",values=c("grey","brown3")) +
  geom_text(aes(label=subject,y=0),angle=90,hjust=-0.1,size = 3) +
  geom_hline(yintercept=0,size=1)+
  labs(x="") +
  theme_bw()+
  theme(legend.position = "top",
        panel.grid.minor = element_blank(),
        panel.grid.major.x = element_blank(), 
        panel.grid.minor.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank() )

b. “seesaw” plot

1) Figure out the two subgroups for an categorical variable (e.g. sex - Female and sex - Male)

trtA=data %>%
  filter(trt=="A") %>%
  select(sex,agegroup,nrsgroup,htgroup,bmigroup,radicu,spinal) %>%
  lapply(table) %>%
  unlist()

# Select a few categorical variables
group = rep(c("sex","age","baseline NRS","height","bmi","radicu","spinal"),each=2)

# Name the subgroups for each variable
label_raw = names(trtA)
label_raw
##  [1] "sex.Female"          "sex.Male"            "agegroup.Age<57"    
##  [4] "agegroup.Age>=57"    "nrsgroup.NRS<6"      "nrsgroup.NRS>=6"    
##  [7] "htgroup.Height<180"  "htgroup.Height>=180" "bmigroup.bmi<30"    
## [10] "bmigroup.bmi>=30"    "radicu.N"            "radicu.Y"           
## [13] "spinal.N"            "spinal.Y"
# Extract all the text after "." in raw label
label = substring(label_raw, first=regexpr(".", label_raw,fixed = TRUE) + 1) 
label
##  [1] "Female"      "Male"        "Age<57"      "Age>=57"     "NRS<6"      
##  [6] "NRS>=6"      "Height<180"  "Height>=180" "bmi<30"      "bmi>=30"    
## [11] "N"           "Y"           "N"           "Y"
subgroups = as.data.frame(cbind(group,label));subgroups
##           group       label
## 1           sex      Female
## 2           sex        Male
## 3           age      Age<57
## 4           age     Age>=57
## 5  baseline NRS       NRS<6
## 6  baseline NRS      NRS>=6
## 7        height  Height<180
## 8        height Height>=180
## 9           bmi      bmi<30
## 10          bmi     bmi>=30
## 11       radicu           N
## 12       radicu           Y
## 13       spinal           N
## 14       spinal           Y
# Percentage of each subgroup
cluster_data=cbind(subgroups,trtA) %>%
  group_by(group)%>%
  mutate( trtA=as.numeric(trtA),
          all=sum(trtA),
          perc=trtA/all,
          perc_label=paste0(round(100*perc, 2), "%", sep="")) %>%
  ungroup()
head(cluster_data,10) 
## # A tibble: 10 x 6
##    group        label        trtA   all  perc perc_label
##    <fct>        <fct>       <dbl> <dbl> <dbl> <chr>     
##  1 sex          Female          6    17 0.353 35.29%    
##  2 sex          Male           11    17 0.647 64.71%    
##  3 age          Age<57          7    17 0.412 41.18%    
##  4 age          Age>=57        10    17 0.588 58.82%    
##  5 baseline NRS NRS<6          13    16 0.812 81.25%    
##  6 baseline NRS NRS>=6          3    16 0.188 18.75%    
##  7 height       Height<180     10    17 0.588 58.82%    
##  8 height       Height>=180     7    17 0.412 41.18%    
##  9 bmi          bmi<30         12    17 0.706 70.59%    
## 10 bmi          bmi>=30         5    17 0.294 29.41%

2) Determine the sign for each subgroup

  • higher percentage as positive
  • lower percentage as negative
cluster_bar = cluster_data %>%
  arrange(group,perc) %>%
  mutate(coef=rep(c(-1,1),nrow(cluster_data)/2),
         perc_new=coef*perc)

head(cluster_bar,10)
## # A tibble: 10 x 8
##    group        label        trtA   all  perc perc_label  coef perc_new
##    <fct>        <fct>       <dbl> <dbl> <dbl> <chr>      <dbl>    <dbl>
##  1 age          Age<57          7    17 0.412 41.18%        -1   -0.412
##  2 age          Age>=57        10    17 0.588 58.82%         1    0.588
##  3 baseline NRS NRS>=6          3    16 0.188 18.75%        -1   -0.188
##  4 baseline NRS NRS<6          13    16 0.812 81.25%         1    0.812
##  5 bmi          bmi>=30         5    17 0.294 29.41%        -1   -0.294
##  6 bmi          bmi<30         12    17 0.706 70.59%         1    0.706
##  7 height       Height>=180     7    17 0.412 41.18%        -1   -0.412
##  8 height       Height<180     10    17 0.588 58.82%         1    0.588
##  9 radicu       Y               6    17 0.353 35.29%        -1   -0.353
## 10 radicu       N              11    17 0.647 64.71%         1    0.647
ggplot(cluster_bar, aes(x=group, y=perc_new)) + 
  
  geom_bar(stat='identity', aes(fill=as.factor(coef)),width=.3)  +
  geom_hline(yintercept=0,linetype=2,color="grey20") +

  geom_label(data=subset(cluster_bar,coef==1),aes(label=label),size=4,fontface=1,vjust=-0.8) +
  
  geom_label(data=subset(cluster_bar,coef==-1),aes(label=label),size=4,fontface=1,vjust=1.5) +
  
  scale_y_continuous(limits=c(-1,1),breaks=seq(-1,1,by=0.5),labels = c("100%","50%",0,"50%","100%"))+
  scale_fill_manual("", values = c("grey","tomato")) 

3) Flip the coordinate + add percentage label

ggplot(cluster_bar, aes(x=group, y=perc_new)) + 
  
  geom_bar(stat='identity', aes(fill=as.factor(coef)),width=.3)  +
  geom_hline(yintercept=0,linetype=2,color="grey20") +

  geom_label(data=subset(cluster_bar,coef==1),aes(label=label),hjust=-0.1,size=4,fontface=1) +
  geom_text(data=subset(cluster_bar,coef==1),aes(label=perc_label),fontface=2,size=3,hjust=1) +
  
  geom_label(data=subset(cluster_bar,coef==-1),aes(label=label),hjust=1.25,size=4,fontface=1) +
  geom_text(data=subset(cluster_bar,coef==-1),aes(label=perc_label),fontface=2,size=3,hjust=-0.1) +
  
  scale_y_continuous(limits=c(-1,1),breaks=seq(-1,1,by=0.5),labels = c("100%","50%",0,"50%","100%")) +
  scale_fill_manual("", values = c("grey","tomato")) +
  
  coord_flip()

4) More aesthetic improvement

ggplot(cluster_bar, aes(x=group, y=perc_new)) + 
  
  geom_bar(stat='identity', aes(fill=as.factor(coef)),width=.3)  +
  geom_hline(yintercept=0,linetype=2,color="grey20") +

  geom_label(data=subset(cluster_bar,coef==1),aes(label=label),hjust=-0.1,size=4,fontface=1) +
  geom_text(data=subset(cluster_bar,coef==1),aes(label=perc_label),fontface=2,size=3,hjust=1) +
  
  geom_label(data=subset(cluster_bar,coef==-1),aes(label=label),hjust=1.25,size=4,fontface=1) +
  geom_text(data=subset(cluster_bar,coef==-1),aes(label=perc_label),fontface=2,size=3,hjust=-0.1) +
  
  scale_y_continuous(limits=c(-1,1),breaks=seq(-1,1,by=0.5),labels = c("100%","50%",0,"50%","100%")) +
  scale_fill_manual("", values = c("grey","tomato")) +
  
  coord_flip() +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(size = rel(1.5),face = "bold", vjust = 1.5),
        strip.text.x = element_text(face = "bold"), #facet title
        axis.title = element_text(face = "bold"),
        legend.position = "none",
        #legend.title = element_text(size = rel(1.1),face = "bold"),
        #legend.key.size = unit(0.4, "cm"),
         axis.title.y = element_text(size = rel(1.1),vjust= 0.5),
         axis.title.x = element_text(size = rel(1.1),vjust= -1.5),
         axis.text.y = element_text(size = rel(1.4),face = "bold",color="black"),
         axis.text.x = element_text(size = rel(1.2),color="black")) 

5) Same procedure for treatment group B or other grouping variable - build a function

first_function=function(trt_selected,title_text){
  
subdata=data %>%
 filter(trt %in% trt_selected) %>%
 select(sex,agegroup,nrsgroup,htgroup,bmigroup,radicu,spinal) %>%
 lapply(table) %>%
  unlist()

  group = rep(c("sex","age","baseline NRS","height","bmi","radicu","spinal"),each=2)


cluster_data=as.data.frame(cbind(subdata,group)) %>%
  group_by(group)%>%
  mutate( subdata=as.numeric(subdata),
          all=sum(subdata)) %>%
  ungroup()

cluster_data$label=names(subdata)


cluster_bar = cluster_data %>%
 mutate(param=substring(label, first=regexpr(".", label,fixed = TRUE) + 1),
        perc=subdata/all,
        perc_label=paste0(round(100*perc, 2), "%", sep="")) %>%
 arrange(group,perc) %>%
 mutate(coef=rep(c(-1,1),nrow(cluster_data)/2),
        perc_new=coef*perc) %>%
 select(-label) 

ggplot(cluster_bar, aes(x=group, y=perc_new)) + 
  
  geom_bar(stat='identity', aes(fill=as.factor(coef)),width=.3)  +
  geom_hline(yintercept=0,linetype=2,color="grey20") +

  geom_label(data=subset(cluster_bar,coef==1),aes(label=param),hjust=-0.1,size=3,fontface=1) +
  geom_text(data=subset(cluster_bar,coef==1),aes(label=perc_label),fontface=2,size=3,hjust=1) +
  
  geom_label(data=subset(cluster_bar,coef==-1),aes(label=param),hjust=1.25,size=3,fontface=1) +
  geom_text(data=subset(cluster_bar,coef==-1),aes(label=perc_label),fontface=2,size=3,hjust=-0.1) +
  
  scale_y_continuous(limits=c(-1,1),breaks=seq(-1,1,by=0.5),labels = c("100%","50%",0,"50%","100%")) +
 
  scale_fill_manual("", values = c("grey","tomato")) +
  labs(title= title_text) + 
  coord_flip() +
  xlab("") +
  ylab("") +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(size = rel(1.2),face = "bold", vjust = 1.5,hjust=0.5),
        strip.text.x = element_text(face = "bold"), #facet title
        axis.title = element_text(face = "bold"),
        legend.position = "none",
        #legend.title = element_text(size = rel(1.1),face = "bold"),
        #legend.key.size = unit(0.4, "cm"),
         axis.title.y = element_text(size = rel(1.1),vjust= 0.5),
         axis.title.x = element_text(size = rel(1.1),vjust= -1.5),
         axis.text.y = element_text(size = rel(1.1),face = "bold",color="black"),
         axis.text.x = element_text(size = rel(1.1),color="black")) 
}

6) Check function output

p1=first_function(trt_selected = "A",title_text = "Treatment - A");p1

p2=first_function(trt_selected = "B",title_text = "Treatment - B");p2

p3=first_function(trt_selected = c("A","B"),title_text = "Overall");p3

7) Make multi-Panel plots - ggarrange() function

ggarrange(p1,p2,p3)

8) Add a summary table to the panel plot (optional)

summary_table=table1::table1(~sex+agegroup+nrsgroup+htgroup+bmigroup+radicu | trt, data=data)
datatable=as.data.frame(summary_table,row.names = NULL)
texttable=ggtexttable(datatable,theme = ttheme("blank",base_size=8),rows = NULL) %>%
  table_cell_font(row = c(3,6,9,13,17,21), column = 1, face = "bold.italic",size=9) %>%
  tab_add_hline(at.row = 1, row.side = "top", linewidth = 3, linetype = 1) %>%
  tab_add_hline(at.row = 3, row.side = "top", linewidth = 1, linetype = 1) %>%
  tab_add_hline(at.row = 23, row.side = "bottom", linewidth = 3, linetype = 1)%>%
  tab_add_title(text = "Demographic Characteristics",face="bold",size=9)
texttable

ggarrange(ggarrange(p1,p2,nrow =2),
          texttable,ncol=2,widths=c(0.55,0.45))

IV. Summary

  • Quick intro - bar and line plots
  • Data preprocession
  • Heatmap
  • Survival Curves
  • Panel plot