Training-2

library(readxl)     # read data from Excel files
library(tidyverse)   # clean & format data
library(ggplot2)     # visualize data
library(rstatix)     # basic statistical analysis

setwd("/Users/zhouchuhan/introR/data")  #specify a folder path to locate data files
sample=read_xlsx("sample_nrs.xlsx")

Quick reviwe of imported data set

a. Check variables, data type of each variable

str(sample)

## tibble [1,081 × 5] (S3: tbl_df/tbl/data.frame)
##  $ subjid : num [1:1081] 101002 102002 102003 200002 200003 ...
##  $ aval   : num [1:1081] 4 5 5 4 4 4 3 4 4 4 ...
##  $ avisitn: chr [1:1081] "100" "100" "100" "100" ...
##  $ vsn    : chr [1:1081] "Baseline" "Baseline" "Baseline" "Baseline" ...
##  $ siteid : chr [1:1081] "1" "1" "1" "2" ...

b. Change numerical variable to categorical variable

sample$subjid=as.character(sample$subjid)
sample$avisitn=as.numeric(sample$avisitn)

c. Get the number of subjects from longitudinal data

subj=unique(sample$subjid)
length(subj)

## [1] 86

d. Assign treatment group to each subject (randomization data)

set.seed(1)
trt=sample(c("A","B"),size=length(subj),replace = TRUE)
trt

##  [1] "A" "B" "A" "A" "B" "A" "A" "A" "B" "B" "A" "A" "A" "A" "A" "B" "B" "B" "B"
## [20] "A" "A" "A" "A" "A" "A" "A" "B" "A" "A" "B" "B" "B" "A" "B" "A" "A" "B" "A"
## [39] "B" "B" "B" "B" "A" "B" "B" "B" "B" "B" "A" "A" "B" "A" "B" "B" "A" "A" "B"
## [58] "B" "B" "A" "A" "B" "B" "B" "B" "B" "B" "A" "B" "B" "B" "B" "A" "A" "A" "B"
## [77] "B" "A" "A" "B" "B" "B" "A" "A" "A" "B"

effl=sample(c("Y","N"),size=length(subj),replace = TRUE)
effl

##  [1] "Y" "N" "Y" "N" "Y" "Y" "N" "N" "Y" "Y" "Y" "N" "N" "Y" "N" "N" "N" "Y" "Y"
## [20] "Y" "Y" "N" "Y" "Y" "Y" "Y" "Y" "N" "N" "N" "N" "Y" "N" "N" "N" "N" "Y" "Y"
## [39] "N" "Y" "Y" "N" "N" "Y" "Y" "Y" "N" "Y" "N" "Y" "N" "N" "N" "Y" "N" "Y" "Y"
## [58] "N" "Y" "N" "N" "N" "Y" "Y" "N" "Y" "Y" "Y" "Y" "N" "N" "N" "N" "N" "Y" "Y"
## [77] "Y" "N" "Y" "Y" "N" "Y" "Y" "N" "Y" "Y"

subj_trt=as_tibble(cbind(subj,trt))
head(subj_trt,10)

## # A tibble: 10 x 2
##    subj   trt  
##    <chr>  <chr>
##  1 101002 A    
##  2 102002 B    
##  3 102003 A    
##  4 200002 A    
##  5 200003 B    
##  6 200004 A    
##  7 200005 A    
##  8 200009 A    
##  9 200010 B    
## 10 200011 B

e. Specify levels of the grouping variable (e.g. treatment)

subj_trt$trt=factor(subj_trt$trt,levels=c("A","B"))

subj_trt$effl=effl
subj_trt$effl=factor(subj_trt$effl,levels=c("Y","N"))

str(subj_trt)

## tibble [86 × 3] (S3: tbl_df/tbl/data.frame)
##  $ subj: chr [1:86] "101002" "102002" "102003" "200002" ...
##  $ trt : Factor w/ 2 levels "A","B": 1 2 1 1 2 1 1 1 2 2 ...
##  $ effl: Factor w/ 2 levels "Y","N": 1 2 1 2 1 1 2 2 1 1 ...

f. Combine two data sets - longitudinal data and randomization data

sample_cm=left_join(sample, subj_trt, by=c("subjid"="subj")) %>%
  arrange(subjid,avisitn)
head(sample_cm,20)

## # A tibble: 20 x 7
##    subjid  aval avisitn vsn      siteid trt   effl 
##    <chr>  <dbl>   <dbl> <chr>    <chr>  <fct> <fct>
##  1 101002 4         100 Baseline 1      A     Y    
##  2 101002 3.14      101 1        1      A     Y    
##  3 101002 1.86      102 2        1      A     Y    
##  4 101002 1.29      103 3        1      A     Y    
##  5 101002 2         104 4        1      A     Y    
##  6 101002 2.14      105 5        1      A     Y    
##  7 101002 1.57      106 6        1      A     Y    
##  8 101002 1.14      107 7        1      A     Y    
##  9 101002 1         108 8        1      A     Y    
## 10 101002 0.714     109 9        1      A     Y    
## 11 101002 1         110 10       1      A     Y    
## 12 101002 1.33      111 11       1      A     Y    
## 13 101002 0.667     112 12       1      A     Y    
## 14 102002 5         100 Baseline 1      B     N    
## 15 102002 2         101 1        1      B     N    
## 16 102002 1.43      102 2        1      B     N    
## 17 102002 0.571     103 3        1      B     N    
## 18 102002 0.571     104 4        1      B     N    
## 19 102002 0.857     105 5        1      B     N    
## 20 102002 0.571     106 6        1      B     N

Summry statistics

a. Average NRS by treatment and visit

summary = sample_cm %>%
  group_by(trt,avisitn) %>%
  get_summary_stats(aval, type = "mean_sd") %>%
  #select(trt,avisitn,n,mean,sd)
  select(trt,avisitn,everything(),-variable)
summary

## # A tibble: 26 x 5
##    trt   avisitn     n  mean    sd
##    <fct>   <dbl> <dbl> <dbl> <dbl>
##  1 A         100    40 4.45  2.75 
##  2 A         101    39 2.01  1.50 
##  3 A         102    38 1.22  1.36 
##  4 A         103    38 1.09  1.28 
##  5 A         104    38 1.00  1.02 
##  6 A         105    38 0.949 1.32 
##  7 A         106    39 0.777 1.17 
##  8 A         107    38 0.631 0.875
##  9 A         108    37 0.718 1.09 
## 10 A         109    37 0.825 1.34 
## # … with 16 more rows

b. Build a formated table - Baseline NRS

library(table1) 
baseline=filter(sample_cm,vsn == "Baseline")
table1( ~ aval | trt, data=baseline)

	A (N=40)	B (N=44)	Overall (N=84)
aval
Mean (SD)	4.45 (2.75)	4.50 (2.30)	4.48 (2.51)
Median [Min, Max]	4.00 [0, 10.0]	4.00 [0, 9.00]	4.00 [0, 10.0]

table1( ~ aval | trt*effl, data=baseline)

	A		B		Overall
	Y (N=18)	N (N=22)	Y (N=26)	N (N=18)	Y (N=44)	N (N=40)
aval
Mean (SD)	4.39 (2.50)	4.50 (3.00)	4.23 (2.21)	4.89 (2.42)	4.30 (2.31)	4.68 (2.73)
Median [Min, Max]	4.00 [0, 10.0]	4.00 [0, 9.00]	4.00 [0, 9.00]	4.50 [0, 9.00]	4.00 [0, 10.0]	4.00 [0, 9.00]

table1( ~ aval | effl*trt, data=baseline)

	Y		N		Overall
	A (N=18)	B (N=26)	A (N=22)	B (N=18)	A (N=40)	B (N=44)
aval
Mean (SD)	4.39 (2.50)	4.23 (2.21)	4.50 (3.00)	4.89 (2.42)	4.45 (2.75)	4.50 (2.30)
Median [Min, Max]	4.00 [0, 10.0]	4.00 [0, 9.00]	4.00 [0, 9.00]	4.50 [0, 9.00]	4.00 [0, 10.0]	4.00 [0, 9.00]

Distribution plot

Basic Boxplot

sample_cm$vsn=factor(sample_cm$vsn,levels=c("Baseline",1:12))
ggplot(data=sample_cm,aes(x=vsn,y=aval)) +
  geom_boxplot(width=0.5) +
  scale_x_discrete("Visit (d)",labels=unique(sample_cm$vsn)) +
  scale_y_continuous("NRS pain [0-10]",limits = c(0,10),breaks = c(0,2,4,6,8,10)) +
  theme_classic()

Group by Treatment

ggplot(data=sample_cm,aes(x=vsn,y=aval)) +
  #geom_boxplot(aes(group=trt),width=0.5) +
  geom_boxplot(aes(fill=trt),width=0.5) +
  scale_x_discrete("Visit (d)",labels=unique(sample_cm$vsn)) +
  scale_y_continuous("NRS pain [0-10]",limits = c(0,10),breaks = c(0,2,4,6,8,10)) +
  theme_classic()

Add errorbar

ggplot(data=sample_cm,aes(x=vsn,y=aval)) +
  stat_boxplot(aes(fill=trt),geom ='errorbar',width=0.15,position = position_dodge(0.5)) +
  geom_boxplot(aes(fill=trt),width=0.5,position = position_dodge(0.5)) +
  scale_x_discrete("Visit (d)",labels=unique(sample_cm$vsn)) +
  scale_y_continuous("NRS pain [0-10]",limits = c(0,10),breaks = c(0,2,4,6,8,10)) +
  theme_classic() +
  theme(legend.position = "top")

Case Study

Questions:

two different scales on y-axis
font changed to TIMES

Step 1 - Derive adjusted y-axis values - cutoff at 10

If a data point is in the range of [0,10], its y-axis value will remain: \[ Scale 1: [0,10] \ -> [0,10]\]
If a data point is in the range of (10,30], it will have a different y-axis value: \[ Scale 2: (10,30] \ -> [11,15]\]

setwd("/Users/zhouchuhan/introR/data")
KM3 = read_csv("f_14_2_4_1_crp_ri.csv") %>%
  janitor::clean_names() %>%
  mutate(avisitn=factor(avisitn,levels=c("2","4","5","6","7","8","10")),
         aval_ad=ifelse(aval<=10,aval,11+(aval-10)/5)) # use ifelse() to create the adjusted aval

Step 2 - Review the original single scale figure

Note: Boxplots are squeezed when a singe scale being used

Step 3 - Find the two different scales

Step 4 - Change to adjusted y-axis values

Note: Adjusted y-axis values are shown in the parenthesis

Step 5 - Final plot

Change to TIMES NEW ROMEN font by specifying family=“serif”

ggplot(data=KM3,aes(x=avisitn,y=aval_ad)) +
  
  stat_boxplot(geom = 'errorbar',width=0.1,color="blue") +
  geom_boxplot(width=0.15,color="blue",outlier.size=2,outlier.shape=1) +
  scale_x_discrete(labels=c("Baseline","D4","1","2","4","6","12")) +
  scale_y_continuous("CRP(mg/dL)", limits=c(0,15),breaks = c(0:10,12:15),labels = c(0:10,15,20,25,30)) +
  labs(subtitle="Figure XXX. Boxplot Longitudinal \n Safety Analysis Set",
       x="Weeks") +
  theme_classic() +
  theme(  plot.margin = margin(0.5,0.2,0.1,0.85, "cm"),
          panel.grid.major.x = element_blank(),
          panel.grid.minor = element_blank(),
          panel.grid.major.y = element_blank(),
          plot.title = element_text(size = rel(0.6),face = "bold", vjust = 5),
          plot.subtitle = element_text(size = rel(1),face = "bold",family="serif", hjust=0.5,vjust = 1.5),
          axis.title.y = element_text(#face = "bold",
                                      size = rel(1.2),family="serif",vjust= 5),
          axis.title.x = element_text(#face = "bold",
                                      size = rel(1.2),family="serif",vjust= -0.5),
          axis.text.x = element_text(color="black",family="serif",size=rel(1.2)),
          axis.text.y = element_text(color="black",family="serif",size=rel(1.2))
           )

Step 6 (Optional) - Build a table to display the count of observations at each visit

table=KM3 %>%
  dplyr::group_by(avisitn) %>%
  dplyr::summarise(n=n()) %>%
  spread(avisitn,n) %>%
  mutate(c1=c("Number of \n Subjects:")) %>%
  select(c1,everything())

library(ggpubr)
ggtexttable(table, rows = NULL, cols = NULL, theme = ttheme("blank"))

Training-2

Chuhan Zhou

August 07, 2021

Quick reviwe of imported data set

a. Check variables, data type of each variable

b. Change numerical variable to categorical variable

c. Get the number of subjects from longitudinal data

d. Assign treatment group to each subject (randomization data)

e. Specify levels of the grouping variable (e.g. treatment)

f. Combine two data sets - longitudinal data and randomization data

Summry statistics

a. Average NRS by treatment and visit

b. Build a formated table - Baseline NRS

Distribution plot

Basic Boxplot

Group by Treatment

Add errorbar

Case Study

Questions:

Step 1 - Derive adjusted y-axis values - cutoff at 10

Step 2 - Review the original single scale figure

Step 3 - Find the two different scales

Step 4 - Change to adjusted y-axis values

Step 5 - Final plot

Step 6 (Optional) - Build a table to display the count of observations at each visit