library(readxl)     # read data from Excel files
library(tidyverse)   # clean & format data
library(ggplot2)     # visualize data
library(rstatix)     # basic statistical analysis
setwd("/Users/zhouchuhan/introR/data")  #specify a folder path to locate data files
sample=read_xlsx("sample_nrs.xlsx") 

Quick reviwe of imported data set

a. Check variables, data type of each variable

str(sample)
## tibble [1,081 × 5] (S3: tbl_df/tbl/data.frame)
##  $ subjid : num [1:1081] 101002 102002 102003 200002 200003 ...
##  $ aval   : num [1:1081] 4 5 5 4 4 4 3 4 4 4 ...
##  $ avisitn: chr [1:1081] "100" "100" "100" "100" ...
##  $ vsn    : chr [1:1081] "Baseline" "Baseline" "Baseline" "Baseline" ...
##  $ siteid : chr [1:1081] "1" "1" "1" "2" ...

b. Change numerical variable to categorical variable

sample$subjid=as.character(sample$subjid)
sample$avisitn=as.numeric(sample$avisitn)

c. Get the number of subjects from longitudinal data

subj=unique(sample$subjid)
length(subj)
## [1] 86

d. Assign treatment group to each subject (randomization data)

set.seed(1)
trt=sample(c("A","B"),size=length(subj),replace = TRUE)
trt
##  [1] "A" "B" "A" "A" "B" "A" "A" "A" "B" "B" "A" "A" "A" "A" "A" "B" "B" "B" "B"
## [20] "A" "A" "A" "A" "A" "A" "A" "B" "A" "A" "B" "B" "B" "A" "B" "A" "A" "B" "A"
## [39] "B" "B" "B" "B" "A" "B" "B" "B" "B" "B" "A" "A" "B" "A" "B" "B" "A" "A" "B"
## [58] "B" "B" "A" "A" "B" "B" "B" "B" "B" "B" "A" "B" "B" "B" "B" "A" "A" "A" "B"
## [77] "B" "A" "A" "B" "B" "B" "A" "A" "A" "B"
effl=sample(c("Y","N"),size=length(subj),replace = TRUE)
effl
##  [1] "Y" "N" "Y" "N" "Y" "Y" "N" "N" "Y" "Y" "Y" "N" "N" "Y" "N" "N" "N" "Y" "Y"
## [20] "Y" "Y" "N" "Y" "Y" "Y" "Y" "Y" "N" "N" "N" "N" "Y" "N" "N" "N" "N" "Y" "Y"
## [39] "N" "Y" "Y" "N" "N" "Y" "Y" "Y" "N" "Y" "N" "Y" "N" "N" "N" "Y" "N" "Y" "Y"
## [58] "N" "Y" "N" "N" "N" "Y" "Y" "N" "Y" "Y" "Y" "Y" "N" "N" "N" "N" "N" "Y" "Y"
## [77] "Y" "N" "Y" "Y" "N" "Y" "Y" "N" "Y" "Y"
subj_trt=as_tibble(cbind(subj,trt))
head(subj_trt,10)
## # A tibble: 10 x 2
##    subj   trt  
##    <chr>  <chr>
##  1 101002 A    
##  2 102002 B    
##  3 102003 A    
##  4 200002 A    
##  5 200003 B    
##  6 200004 A    
##  7 200005 A    
##  8 200009 A    
##  9 200010 B    
## 10 200011 B

e. Specify levels of the grouping variable (e.g. treatment)

subj_trt$trt=factor(subj_trt$trt,levels=c("A","B"))

subj_trt$effl=effl
subj_trt$effl=factor(subj_trt$effl,levels=c("Y","N"))

str(subj_trt)
## tibble [86 × 3] (S3: tbl_df/tbl/data.frame)
##  $ subj: chr [1:86] "101002" "102002" "102003" "200002" ...
##  $ trt : Factor w/ 2 levels "A","B": 1 2 1 1 2 1 1 1 2 2 ...
##  $ effl: Factor w/ 2 levels "Y","N": 1 2 1 2 1 1 2 2 1 1 ...

f. Combine two data sets - longitudinal data and randomization data

sample_cm=left_join(sample, subj_trt, by=c("subjid"="subj")) %>%
  arrange(subjid,avisitn)
head(sample_cm,20)
## # A tibble: 20 x 7
##    subjid  aval avisitn vsn      siteid trt   effl 
##    <chr>  <dbl>   <dbl> <chr>    <chr>  <fct> <fct>
##  1 101002 4         100 Baseline 1      A     Y    
##  2 101002 3.14      101 1        1      A     Y    
##  3 101002 1.86      102 2        1      A     Y    
##  4 101002 1.29      103 3        1      A     Y    
##  5 101002 2         104 4        1      A     Y    
##  6 101002 2.14      105 5        1      A     Y    
##  7 101002 1.57      106 6        1      A     Y    
##  8 101002 1.14      107 7        1      A     Y    
##  9 101002 1         108 8        1      A     Y    
## 10 101002 0.714     109 9        1      A     Y    
## 11 101002 1         110 10       1      A     Y    
## 12 101002 1.33      111 11       1      A     Y    
## 13 101002 0.667     112 12       1      A     Y    
## 14 102002 5         100 Baseline 1      B     N    
## 15 102002 2         101 1        1      B     N    
## 16 102002 1.43      102 2        1      B     N    
## 17 102002 0.571     103 3        1      B     N    
## 18 102002 0.571     104 4        1      B     N    
## 19 102002 0.857     105 5        1      B     N    
## 20 102002 0.571     106 6        1      B     N

Summry statistics

a. Average NRS by treatment and visit

summary = sample_cm %>%
  group_by(trt,avisitn) %>%
  get_summary_stats(aval, type = "mean_sd") %>%
  #select(trt,avisitn,n,mean,sd)
  select(trt,avisitn,everything(),-variable)
summary
## # A tibble: 26 x 5
##    trt   avisitn     n  mean    sd
##    <fct>   <dbl> <dbl> <dbl> <dbl>
##  1 A         100    40 4.45  2.75 
##  2 A         101    39 2.01  1.50 
##  3 A         102    38 1.22  1.36 
##  4 A         103    38 1.09  1.28 
##  5 A         104    38 1.00  1.02 
##  6 A         105    38 0.949 1.32 
##  7 A         106    39 0.777 1.17 
##  8 A         107    38 0.631 0.875
##  9 A         108    37 0.718 1.09 
## 10 A         109    37 0.825 1.34 
## # … with 16 more rows

b. Build a formated table - Baseline NRS

library(table1) 
baseline=filter(sample_cm,vsn == "Baseline")
table1( ~ aval | trt, data=baseline)
A
(N=40)
B
(N=44)
Overall
(N=84)
aval
Mean (SD) 4.45 (2.75) 4.50 (2.30) 4.48 (2.51)
Median [Min, Max] 4.00 [0, 10.0] 4.00 [0, 9.00] 4.00 [0, 10.0]
table1( ~ aval | trt*effl, data=baseline)
A
B
Overall
Y
(N=18)
N
(N=22)
Y
(N=26)
N
(N=18)
Y
(N=44)
N
(N=40)
aval
Mean (SD) 4.39 (2.50) 4.50 (3.00) 4.23 (2.21) 4.89 (2.42) 4.30 (2.31) 4.68 (2.73)
Median [Min, Max] 4.00 [0, 10.0] 4.00 [0, 9.00] 4.00 [0, 9.00] 4.50 [0, 9.00] 4.00 [0, 10.0] 4.00 [0, 9.00]
table1( ~ aval | effl*trt, data=baseline)
Y
N
Overall
A
(N=18)
B
(N=26)
A
(N=22)
B
(N=18)
A
(N=40)
B
(N=44)
aval
Mean (SD) 4.39 (2.50) 4.23 (2.21) 4.50 (3.00) 4.89 (2.42) 4.45 (2.75) 4.50 (2.30)
Median [Min, Max] 4.00 [0, 10.0] 4.00 [0, 9.00] 4.00 [0, 9.00] 4.50 [0, 9.00] 4.00 [0, 10.0] 4.00 [0, 9.00]

Distribution plot

Basic Boxplot

sample_cm$vsn=factor(sample_cm$vsn,levels=c("Baseline",1:12))
ggplot(data=sample_cm,aes(x=vsn,y=aval)) +
  geom_boxplot(width=0.5) +
  scale_x_discrete("Visit (d)",labels=unique(sample_cm$vsn)) +
  scale_y_continuous("NRS pain [0-10]",limits = c(0,10),breaks = c(0,2,4,6,8,10)) +
  theme_classic()

Group by Treatment

ggplot(data=sample_cm,aes(x=vsn,y=aval)) +
  #geom_boxplot(aes(group=trt),width=0.5) +
  geom_boxplot(aes(fill=trt),width=0.5) +
  scale_x_discrete("Visit (d)",labels=unique(sample_cm$vsn)) +
  scale_y_continuous("NRS pain [0-10]",limits = c(0,10),breaks = c(0,2,4,6,8,10)) +
  theme_classic() 

Add errorbar

ggplot(data=sample_cm,aes(x=vsn,y=aval)) +
  stat_boxplot(aes(fill=trt),geom ='errorbar',width=0.15,position = position_dodge(0.5)) +
  geom_boxplot(aes(fill=trt),width=0.5,position = position_dodge(0.5)) +
  scale_x_discrete("Visit (d)",labels=unique(sample_cm$vsn)) +
  scale_y_continuous("NRS pain [0-10]",limits = c(0,10),breaks = c(0,2,4,6,8,10)) +
  theme_classic() +
  theme(legend.position = "top")

Case Study

Questions:

  1. two different scales on y-axis
  2. font changed to TIMES

Step 1 - Derive adjusted y-axis values - cutoff at 10

  • If a data point is in the range of [0,10], its y-axis value will remain: \[ Scale 1: [0,10] \ -> [0,10]\]
  • If a data point is in the range of (10,30], it will have a different y-axis value: \[ Scale 2: (10,30] \ -> [11,15]\]
setwd("/Users/zhouchuhan/introR/data")
KM3 = read_csv("f_14_2_4_1_crp_ri.csv") %>%
  janitor::clean_names() %>%
  mutate(avisitn=factor(avisitn,levels=c("2","4","5","6","7","8","10")),
         aval_ad=ifelse(aval<=10,aval,11+(aval-10)/5)) # use ifelse() to create the adjusted aval

Step 2 - Review the original single scale figure

Note: Boxplots are squeezed when a singe scale being used

Step 3 - Find the two different scales

Step 4 - Change to adjusted y-axis values

Note: Adjusted y-axis values are shown in the parenthesis

Step 5 - Final plot

  • Change to TIMES NEW ROMEN font by specifying family=“serif”
ggplot(data=KM3,aes(x=avisitn,y=aval_ad)) +
  
  stat_boxplot(geom = 'errorbar',width=0.1,color="blue") +
  geom_boxplot(width=0.15,color="blue",outlier.size=2,outlier.shape=1) +
  scale_x_discrete(labels=c("Baseline","D4","1","2","4","6","12")) +
  scale_y_continuous("CRP(mg/dL)", limits=c(0,15),breaks = c(0:10,12:15),labels = c(0:10,15,20,25,30)) +
  labs(subtitle="Figure XXX. Boxplot Longitudinal \n Safety Analysis Set",
       x="Weeks") +
  theme_classic() +
  theme(  plot.margin = margin(0.5,0.2,0.1,0.85, "cm"),
          panel.grid.major.x = element_blank(),
          panel.grid.minor = element_blank(),
          panel.grid.major.y = element_blank(),
          plot.title = element_text(size = rel(0.6),face = "bold", vjust = 5),
          plot.subtitle = element_text(size = rel(1),face = "bold",family="serif", hjust=0.5,vjust = 1.5),
          axis.title.y = element_text(#face = "bold",
                                      size = rel(1.2),family="serif",vjust= 5),
          axis.title.x = element_text(#face = "bold",
                                      size = rel(1.2),family="serif",vjust= -0.5),
          axis.text.x = element_text(color="black",family="serif",size=rel(1.2)),
          axis.text.y = element_text(color="black",family="serif",size=rel(1.2))
           ) 

Step 6 (Optional) - Build a table to display the count of observations at each visit

table=KM3 %>%
  dplyr::group_by(avisitn) %>%
  dplyr::summarise(n=n()) %>%
  spread(avisitn,n) %>%
  mutate(c1=c("Number of \n Subjects:")) %>%
  select(c1,everything())

library(ggpubr)
ggtexttable(table, rows = NULL, cols = NULL, theme = ttheme("blank"))