This is the 2nd script used for data analysis and figure generation for the the manuscript by Masche et al. titled “Specific gut microbiome members are associated with distinct immune markers in allogeneic hematopoietic stem cell transplantation”.

This script and associated data are provided by Anna Cäcilia Masche, Susan Holmes, and Sünje Johanna Pamp.

These data and the associated script are licensed under the Creative Commons Attribution-ShareAlike 4.0 International License. To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.

Under the condition that appropriate credit is provided, you are free to: 1) Share, copy and redistribute the material in any medium or format 2) Adapt, remix, transform, and build upon the material for any purpose, even commercially.

To see the full license associated with attribution of this work, see the CC-By-CA license, see http://creativecommons.org/licenses/by-sa/4.0/.

The local filename is: Script2_diversity.Rmd.

Load required packages:

library("phyloseq")
library("dplyr")
## Warning: package 'dplyr' was built under R version 3.4.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("ggplot2")
library("gridExtra")
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library("reshape2")
#install.packages("PMCMR")
library("PMCMR")
## Warning: package 'PMCMR' was built under R version 3.4.4
## PMCMR is superseded by PMCMRplus and will be no longer maintained. You may wish to install PMCMRplus instead.

Load the phyloseq object resulting from Script 1:

The file phy_obj1_core_cleaned_vsd1.Rdata is provided.

d_30 <- readRDS(file = "M:/Documents/Publications/Masche_R_Scripts_and_Data/Data/phy_obj1_core_cleaned_vsd1.Rdata")

Boxplot of alpha-diversity per time point:

The following code produces the basic Figure 1C.

plot.div <- ggplot(subset(sample_data(d_30),timepoint != "NA"), aes(x = timepoint, y =  InvSimpson)) + geom_boxplot(outlier.color="NA") + geom_jitter(size=3, shape = 21, width = 0.2)  + theme(axis.title.y = element_text(size=12, margin=margin(0,25,0,0), face = "bold"), axis.text.y = element_text(size=16), axis.text.x = element_text(size=14, face = "bold", angle = 35, vjust = 1, hjust = 1), axis.title.x = element_blank(), legend.text = element_text(size=14, face = "bold"), legend.title = element_text(face = "bold", size = 20),panel.grid.minor = element_blank(), panel.grid.major = element_blank(), panel.background = element_blank(), panel.border = element_rect(color = "black", fill= NA)) + labs(y="Inverse Simpson") + scale_x_discrete(labels=c("pre-HSCT", "day of HSCT", "week +1", "week +2", "week +3", "week +4", "week +5")) + stat_boxplot(geom ='errorbar') 

p_median <- plot.div+stat_summary(fun.y=median, geom="line", colour="black", size = 1.4, aes(group = 1)) + coord_trans(y = "log10")


print(p_median)

Comparison of alpha diversity at different time points using a Friedman test with Benjamini-Hochberg correction and a post-hoc Conover test:

#subset h2:
div_fried <- as(sample_data(d_30)[,c("Patient_ID", "timepoint", "InvSimpson")], "data.frame")

#dcast to format the data as required for the Friedman test
div_fried_dcast <- dcast(div_fried, Patient_ID  ~ timepoint, value.var = "InvSimpson", mean, margins =TRUE)

#turn into a matrix: 
div_fried_dcast_ma <- as.matrix(div_fried_dcast)

#perfrom Friedman test.Week +5 needs to be excluded as there were too few observatons at this time point.
friedman.test(div_fried_dcast_ma[, c(2:7)], na.action = "na.omit", p.adjust.method = "BH") #no overall significant difference between timepoints
## 
##  Friedman rank sum test
## 
## data:  div_fried_dcast_ma[, c(2:7)]
## Friedman chi-squared = 8.8571, df = 5, p-value = 0.1149
#Post-hoc Conover test. Conover was chosen over Nemenyi as it allows for p-value correction for multiple testing.
#Therefore first convert the data to a numeric matrix:

div_fried_dcast_ma_num <- div_fried_dcast_ma

mode(div_fried_dcast_ma_num) <- "numeric" #convert character matrix to numeric matrix
## Warning in mde(x): NAs durch Umwandlung erzeugt
posthoc.friedman.conover.test(div_fried_dcast_ma_num[, c(2:7)],na.action = "na.omit", p.adjust.method = "BH")
## 
##  Pairwise comparisons using Conover's test for a two-way 
##                     balanced complete block design 
## 
## data:  div_fried_dcast_ma_num[, c(2:7)] 
## 
##    f01     w0      w1      w2      w3     
## w0 0.03520 -       -       -       -      
## w1 0.87249 0.04782 -       -       -      
## w2 0.00057 0.16067 0.00089 -       -      
## w3 2.4e-06 0.00653 4.1e-06 0.16067 -      
## w4 1.1e-14 1.4e-09 1.5e-14 1.4e-06 0.00036
## 
## P value adjustment method: BH

Rank abundance curve:

In the rank abundance curve, we want to show average abundance in percent. Therefore we would like to transform the OTU counts to relative abundance. This needs to be done from absolute counts (not from variance stabilized transformed counts). So we start with loading a version of the phyloseq object with absolute OTU counts (as created in Script 1).

The file phy_obj1_core_cleaned.Rdata is provided.

phy_dat <- readRDS(file = "M:/Documents/Publications/Masche_R_Scripts_and_Data/Data/phy_obj1_core_cleaned.Rdata")

families_1 = tax_glom(phy_dat, "Family")#agglom. on family level

#8 most abundant families:
Family8 = names(sort(taxa_sums(families_1), TRUE)[1:8])

#transform to relative abundance:
families_rel1 = transform_sample_counts(families_1, function(x) x/sum(x))

#get the family names
fam_names1 <- as.data.frame(tax_table(families_rel1)[,"Family"])

#extract the family-agglomerated otu table
otu_families_rel1 = as.data.frame(t(otu_table(families_rel1)))

#now join with the previous data:

otu_families_rel_11 <- tibble::rownames_to_column(otu_families_rel1, var = "feces_sample")#gives a name to the row.names column

#rename the columns to family names: 
colnames(otu_families_rel_11) <- fam_names1$Family[match(names(otu_families_rel_11), rownames(fam_names1))]
colnames(otu_families_rel_11)[1] <- "feces_sample"

The following code produces the basic Figure 1D.

otu_families_rel_12 <- reshape2::melt(otu_families_rel_11)
## Using feces_sample as id variables
names(otu_families_rel_12)[names(otu_families_rel_12) == 'variable'] <- 'Family'
names(otu_families_rel_12)[names(otu_families_rel_12) == 'value'] <- 'Relative_abundance'

otu_families_rel_13 <- otu_families_rel_12 %>% group_by(Family) %>% dplyr::summarize(Rel_abund_mean = mean(Relative_abundance, na.rm = TRUE))

levels(otu_families_rel_13$Family)[35]<- c("Family XI.1")

otu_families_rel_14 <- otu_families_rel_13[order(-otu_families_rel_13$Rel_abund_mean),][1:8,]

my.col_x <- c("Enterobacteriaceae" = "#a6cee3", "Lactobacillaceae" = "#1f78b4", "Streptococcaceae" = "#b2df8a", "Staphylococcaceae" = "#33a02c", "Enterococcaceae" = "#fb9a99", "Lachnospiraceae" = "#e31a1c", "Erysipelotrichaceae" = "#fdbf6f", "Ruminococcaceae" = "#ff7f00")

plot.rank <- ggplot(otu_families_rel_14, aes(y= Rel_abund_mean, x = reorder(Family, -Rel_abund_mean), fill = otu_families_rel_14$Family, levels=c("Enterobacteriaceae", "Lactobacillaceae", "Streptococcaceae", "Staphylococcaceae", "Enterococcaceae", "Lachnospiraceae", "Erysipelotrichaceae", "Ruminococcaceae"))) + geom_bar(stat = "identity") + scale_fill_manual(values = my.col_x) + theme(axis.title.y = element_text(size=12,  margin=margin(0,25,0,0), face = "bold"), axis.text.y = element_text(size=16), axis.text.x = element_text(size=14, face = "bold", angle = 0, vjust = 1, hjust = 1), axis.title.x = element_text(size=12,  margin=margin(25,0,0,0), face = "bold"), legend.text = element_text(size=14, face = "bold"), legend.title = element_text(face = "bold", size = 20),panel.grid.minor = element_blank(), panel.grid.major = element_blank(), panel.background = element_blank(), panel.border = element_rect(color = "black", fill= NA)) + labs(y="Average abundance", fill="Family", x = "Rank of family") + scale_x_discrete(labels=c("1", "2", "3", "4", "5", "6", "7", "8")) 

plot.rank