Personal Microbial Cloud

Taxon Analysis

Read in prepared workspace and load packages.

load("../data/pb_data.RData")
library(vegan)
## Loading required package: permute This is vegan 2.0-8
library(labdsv)
## Loading required package: mgcv This is mgcv 1.7-22. For overview type
## 'help("mgcv-package")'. Loading required package: MASS
## 
## Attaching package: 'labdsv'
## 
## The following object is masked from 'package:stats':
## 
## density
library(ape)
library(xtable)
source("../data/pb_functions_for_manuscript.R")

How do samples fall into groups? I.e., do we have balance in treatments?

table(pb.map$person, pb.map$sampleType)
##     
##      filter petri.dish
##   s1     48         24
##   s2     46         23
##   s3     46         24
table(pb.map$location)
## 
##   occ unocc 
##   108   103
table(pb.map$duration)
## 
##   2   4 
## 105 106
table(pb.map$sampleType)
## 
##     filter petri.dish 
##        140         71

Quick refrerence vector of names for figures and whatnot. Some genus, families, etc are empty, so search for the finest taxonomic level and use that name. cons is in functions script sourced above.

consensus <- apply(taxo[, 1:6], 1, cons)
consensus[1:10]
##                     2                     6                    12 
##      "Halomonadaceae" "Solirubrobacterales"         "Rhizobiales" 
##                    21                    24                    25 
##            "Bacteria" "Gammaproteobacteria"            "Bacteria" 
##                    35                    73                   120 
##          "Bacillales"  "Betaproteobacteria"            "Bacteria" 
##                   122 
##           "Ralstonia"

Generate a list of the 10 most abundant OTUs overall, but also tack on OTUs that are among the 10 most abundant in any particular group of samples. Then a matching version of the mapping table.

tops <- names(rev(sort(colSums(pb.1000))))[1:10]
for (i in 1:length(gr2)) {
    tops.plus <- names(rev(sort(colSums(pb.1000[gr2[[i]], ]))))[1:10]
    tops <- unique(c(tops, tops.plus))
}
tops <- tops[rev(order(taxo[tops, 7]))]
taxo.tops <- taxo[tops, ]
length(tops)
## [1] 36
consensus[tops]
##               234992                25506               121358 
##   "Methylobacterium"     "Staphylococcus"       "Tumebacillus" 
##               201443                94008                80526 
##   "Stenotrophomonas"    "Corynebacterium"    "Corynebacterium" 
##               151122                72518               136033 
## "Enterobacteriaceae"    "Corynebacterium"      "Acinetobacter" 
##               129988               172108               199152 
##    "Corynebacterium"        "Pseudomonas"      "Lactobacillus" 
##               129498               134501               229364 
##      "Streptococcus"          "Comamonas"        "Micrococcus" 
##               204422                57760                41465 
##       "Sphingomonas"  "Sphingomonadaceae"     "Dolosigranulum" 
##                63496               235029               205036 
##    "Actinomycetales"            "Kocuria"         "Finegoldia" 
##               164584               189711                92831 
##     "Alcaligenaceae"        "Cupriavidus"            "Pantoea" 
##                  264               133239               196388 
##   "Methylobacterium"            "Gemella"        "Leuconostoc" 
##               100717               187238                58876 
##          "Neisseria"           "Bacteria"          "Aeromonas" 
##                 4557               200842               102650 
##    "Cloacibacterium"  "Flavobacteriaceae"        "Rhizobiales" 
##                55224               169608               102857 
##        "Rhizobiales"         "Inquilinus"         "Pedobacter"

Create a reordered map and otu table.

pb.order <- data.frame(pb.1000[unlist(gr2), tops])
pb.order$gr.num <- c(rep(gr.name, c(unlist(lapply(gr2, length)))))
map.order <- pb.map[row.names(pb.order), ]
pb.order$location <- factor(map.order$location, levels = c("occ", "unocc"))

Make new trimmed OTU table and assess with boxplots. Then send the statistically interesting otus to figures and tables script.

pb.order <- pb.order[map.order$duration == "4", ]
# for (i in 1:(ncol(otus.to.barplot.tmp)-2)) {
# boxplot(otus.to.barplot.tmp[, i] ~ factor(otus.to.barplot.tmp$gr.num),
# las=2, main=consensus[gsub('X', '', names(otus.to.barplot.tmp)[i])],
# sub=i) readline('Press return for next plot') }
pb.order <- pb.order[, c(1, 2, 3, 4, 5, 6, 8, 10, 12, 18, 21, ncol(pb.order) - 
    1, ncol(pb.order))]
names(pb.order) <- gsub("X", "", names(pb.order))
consensus[names(pb.order)]
##             234992              25506             121358 
## "Methylobacterium"   "Staphylococcus"     "Tumebacillus" 
##             201443              94008              80526 
## "Stenotrophomonas"  "Corynebacterium"  "Corynebacterium" 
##              72518             129988             199152 
##  "Corynebacterium"  "Corynebacterium"    "Lactobacillus" 
##              41465             205036               <NA> 
##   "Dolosigranulum"       "Finegoldia"                 NA 
##               <NA> 
##                 NA
rm(cons, Evenness, gr.name, gr2, tops.plus, makeTaxo)

Now for indicator analysis - this takes a long time and lots of memory! This uses the indval() function in the labdsv package. This gives output for all OTUs, the vast majority of which are unimportant to analysis. So there are a few steps to cut the output down to just the statistically salient OTUs.

Unpack list of groups and pull out just 4 hour air filter dataset.

for (i in 1:length(names(groups))) {
    assign(names(groups)[i], unlist(groups[[i]]))
}

occ4f <- c(s14of, s24of, s34of, s14uf, s24uf, s34uf)
occ4f.table <- pb.1000[occ4f, ]
occ4f.table <- occ4f.table[, -which(colSums(occ4f.table) == 0)]
occ4f.map <- pb.map[occ4f, ]

Split this into two chunks - each takes a long time.

iv.location <- indval(occ4f.table, occ4f.map$location)
iv.person <- indval(occ4f.table, occ4f.map$occ.person2)

levels(occ4f.map$occ.person2)

Keep those that are significant before p-value adjustment.

p <- 0.05

these <- which(iv.location$pval <= p)
iv.location.table <- data.frame(iv.location$maxcls[these], round(iv.location$indcls[these], 
    4), iv.location$pval[these])
names(iv.location.table) <- c("cluster", "indicator_value", "probability")
iv.location.table$cluster[iv.location.table$cluster == "1"] <- "occupied"
iv.location.table$cluster[iv.location.table$cluster == "2"] <- "unoccupied"

these <- which(iv.person$pval <= p)
iv.person.table <- data.frame(iv.person$maxcls[these], round(iv.person$indcls[these], 
    4), iv.person$pval[these])
names(iv.person.table) <- c("cluster", "indicator_value", "probability")
iv.person.table$cluster[iv.person.table$cluster == "1"] <- "Subject1"
iv.person.table$cluster[iv.person.table$cluster == "2"] <- "Subject2"
iv.person.table$cluster[iv.person.table$cluster == "3"] <- "Subject3"
iv.person.table$cluster[iv.person.table$cluster == "4"] <- "unoccupied"

Order by indicator value and tack on taxonomic information.

ivL <- iv.location.table[rev(order(iv.location.table$indicator_value)), ]
ivP <- iv.person.table[rev(order(iv.person.table$indicator_value)), ]

head(ivP)
##          cluster indicator_value probability
## X41465  Subject1          0.9620       0.001
## X199152 Subject3          0.8948       0.001
## X154708 Subject3          0.8193       0.001
## X63496  Subject1          0.7999       0.001
## X172036 Subject1          0.7554       0.001
## X186045 Subject3          0.7500       0.001
ivL.names <- gsub("X", "", row.names(ivL))
ivL <- data.frame(cbind(ivL[, ], taxo[ivL.names, ]))

ivP.names <- gsub("X", "", row.names(ivP))
ivP <- cbind(ivP[, ], taxo[ivP.names, ])

ivL <- ivL[, c(2, 3, 10, 1, 9, 8, 7, 6, 5, 4)]
ivP <- ivP[, c(2, 3, 10, 1, 9, 8, 7, 6, 5, 4)]

Print out for later reference.

print(xtable(head(ivL, 20), digits = 3), type = "html")
indicator_value probability abundance cluster genus family order class phylum kingdom
X94008 0.830 0.001 10134.000 occupied Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X72518 0.795 0.001 5093.000 occupied Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X25506 0.770 0.001 19463.000 occupied Staphylococcus Staphylococcaceae Bacillales Bacilli Firmicutes Bacteria
X80526 0.758 0.001 8760.000 occupied Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X201443 0.751 0.001 12640.000 unoccupied Stenotrophomonas Xanthomonadaceae Xanthomonadales Gammaproteobacteria Proteobacteria Bacteria
X129498 0.692 0.001 2393.000 occupied Streptococcus Streptococcaceae Lactobacillales Bacilli Firmicutes Bacteria
X129988 0.650 0.010 2893.000 occupied Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X151122 0.619 0.006 7579.000 unoccupied Enterobacteriaceae Enterobacteriales Gammaproteobacteria Proteobacteria Bacteria
X121358 0.604 0.035 16283.000 unoccupied Tumebacillus Bacillaceae Bacillales Bacilli Firmicutes Bacteria
X13338 0.556 0.001 266.000 occupied Anaerococcus Incertae Sedis XI Clostridiales Clostridia Firmicutes Bacteria
X84956 0.551 0.001 251.000 occupied Peptoniphilus Incertae Sedis XI Clostridiales Clostridia Firmicutes Bacteria
X172949 0.546 0.001 242.000 occupied Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X234992 0.544 0.034 28147.000 unoccupied Methylobacterium Methylobacteriaceae Rhizobiales Alphaproteobacteria Proteobacteria Bacteria
X205036 0.540 0.001 793.000 occupied Finegoldia Incertae Sedis XI Clostridiales Clostridia Firmicutes Bacteria
X134501 0.538 0.003 1560.000 unoccupied Comamonas Comamonadaceae Burkholderiales Betaproteobacteria Proteobacteria Bacteria
X233136 0.532 0.001 277.000 unoccupied Halomonas Halomonadaceae Oceanospirillales Gammaproteobacteria Proteobacteria Bacteria
X230275 0.519 0.001 322.000 occupied Anaerococcus Incertae Sedis XI Clostridiales Clostridia Firmicutes Bacteria
X8516 0.505 0.001 337.000 occupied Anaerococcus Incertae Sedis XI Clostridiales Clostridia Firmicutes Bacteria
X63496 0.500 0.001 1061.000 occupied Actinomycetales Actinobacteria Actinobacteria Bacteria
X199152 0.489 0.001 2554.000 occupied Lactobacillus Lactobacillaceae Lactobacillales Bacilli Firmicutes Bacteria
print(xtable(head(ivP, 20), digits = 3), type = "html")
indicator_value probability abundance cluster genus family order class phylum kingdom
X41465 0.962 0.001 1089.000 Subject1 Dolosigranulum Carnobacteriaceae Lactobacillales Bacilli Firmicutes Bacteria
X199152 0.895 0.001 2554.000 Subject3 Lactobacillus Lactobacillaceae Lactobacillales Bacilli Firmicutes Bacteria
X154708 0.819 0.001 64.000 Subject3 Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X63496 0.800 0.001 1061.000 Subject1 Actinomycetales Actinobacteria Actinobacteria Bacteria
X172036 0.755 0.001 362.000 Subject1 Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X186045 0.750 0.001 154.000 Subject3 Facklamia Aerococcaceae Lactobacillales Bacilli Firmicutes Bacteria
X8516 0.744 0.001 337.000 Subject3 Anaerococcus Incertae Sedis XI Clostridiales Clostridia Firmicutes Bacteria
X9535 0.667 0.001 133.000 Subject1 Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X175349 0.665 0.001 99.000 Subject3 Peptoniphilus Incertae Sedis XI Clostridiales Clostridia Firmicutes Bacteria
X108686 0.652 0.001 205.000 Subject1 Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X129988 0.640 0.001 2893.000 Subject1 Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X196388 0.589 0.001 253.000 Subject2 Leuconostoc Leuconostocaceae Lactobacillales Bacilli Firmicutes Bacteria
X205036 0.583 0.002 793.000 Subject1 Finegoldia Incertae Sedis XI Clostridiales Clostridia Firmicutes Bacteria
X217189 0.576 0.001 37.000 Subject3 Incertae Sedis XI Clostridiales Clostridia Firmicutes Bacteria
X96641 0.564 0.001 283.000 Subject1 Anaerococcus Incertae Sedis XI Clostridiales Clostridia Firmicutes Bacteria
X63291 0.557 0.001 70.000 Subject1 Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X201443 0.554 0.002 12640.000 unoccupied Stenotrophomonas Xanthomonadaceae Xanthomonadales Gammaproteobacteria Proteobacteria Bacteria
X9604 0.540 0.001 260.000 Subject1 Aerococcaceae Lactobacillales Bacilli Firmicutes Bacteria
X94008 0.538 0.001 10134.000 Subject1 Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria
X72518 0.532 0.001 5093.000 Subject3 Corynebacterium Corynebacteriaceae Actinomycetales Actinobacteria Actinobacteria Bacteria

Clean up and save important bits to go to figures and tables script.

rm(list = ls()[which(ls() %in% names(groups))])
rm(occ4f, occ4f.table, occ4f.map, i, iv.location, iv.person, p, these, iv.location.table, 
    iv.person.table, ivL.names, ivP.names)
save(ivL, ivP, pb.order, taxo.tops, tops, consensus, file = "../data/taxon_analysis.RData")