Personal Microbial Cloud

Sequence Data Cleanup

library(vegan)
## Loading required package: permute This is vegan 2.0-8
library(labdsv)
## Loading required package: mgcv This is mgcv 1.7-22. For overview type
## 'help("mgcv-package")'. Loading required package: MASS
## 
## Attaching package: 'labdsv'
## 
## The following object is masked from 'package:stats':
## 
## density
library(ape)
source("../data/pb_functions_for_manuscript.R")

Load data:

pb <- read.delim("../data/otu_table_r.txt", head = TRUE, row.names = 1)
pb.tax <- pb[, ncol(pb)]
pb <- pb[, -ncol(pb)]
pb <- t(pb)

pb.map <- read.delim("../data/pb_map.txt", head = TRUE, row.names = 1)

all(row.names(pb.map) %in% row.names(pb))
## [1] TRUE
dim(pb.map)
## [1] 300  12
dim(pb)
## [1]    300 235688

pb.map <- pb.map[row.names(pb), ]
dim(pb)
## [1]    300 235688
dim(pb.map)
## [1] 300  12
length(pb.tax)
## [1] 235688
identical(row.names(pb), row.names(pb.map))
## [1] TRUE

Keep original copy before changing things.

pb.original <- pb
pb.tax.original <- pb.tax
pb.map.original <- pb.map
# saved just in case of screwup. 
pb <- pb.original
pb.tax <- pb.tax.original
pb.map <- pb.map.original

Take out plant sequences

plants <- grep("Streptophyt", pb.tax)
pb <- pb[, -plants]
pb.tax <- pb.tax[-plants]
rm(plants)

Set up taxonomy table

taxo <- makeTaxo(taxo.in = pb.tax, otu.table = pb)
## Warning: no non-missing arguments to max; returning -Inf
dim(taxo)
## [1] 234213      7
dim(pb)
## [1]    300 234213

The big dataset started with 1.0028 × 107 total sequences from 234213 OTUs and 300 samples.

Identify those samples not used in the study.

nonexp <- which(pb.map$location == "east" | pb.map$location == "west" | pb.map$location == 
    "out")
pb <- pb[-nonexp, ]
pb.map <- pb.map[-nonexp, ]
rm(nonexp)
dim(pb)
## [1]    243 234213
dim(pb.map)
## [1] 243  12

Index for controls, and clean up mapping table.

pb.map <- pb.map[, c("person", "location", "duration", "sampleType")]
control <- which(pb.map$location == "control")
pb <- pb[-c(control), ]
pb.map <- pb.map[-c(control), ]
dim(pb)
## [1]    216 234213
dim(pb.map)
## [1] 216   4
identical(row.names(pb), row.names(pb.map))
## [1] TRUE

Remove 4 contaminants that were introduced by PCR reagents. And remove control samples. This is detailed in pb_analysis_Redo.Rmd.

contaminants <- c("190873", "64285", "96443", "129992")
contaminants.index <- which(colnames(pb) %in% contaminants)
dim(pb)
## [1]    216 234213
dim(pb.map)
## [1] 216   4
pb <- pb[, -contaminants.index]
taxo <- taxo[-contaminants.index, ]  # pb
rm(control, contaminants, contaminants.index)
identical(row.names(pb.map), row.names(pb))
## [1] TRUE
dim(pb.map)
## [1] 216   4
dim(pb)
## [1]    216 234209
dim(taxo)
## [1] 234209      7

Rarefy to 1000 seqs per sample.

sort(rowSums(pb))
## uf04.2h.s3 uf05.2h.s3 uf09.2h.s2 uf05.4h.s2 up04.4h.s2 of04.4h.s2 
##         14         30        319        445        622       1035 
## uf07.2h.s1 uf08.2h.s2 up05.2h.s3 uf09.4h.s2 uf11.4h.s1 uf12.2h.s3 
##       1168       1209       1264       1298       1509       1832 
## of01.2h.s2 up05.2h.s2 uf05.2h.s1 of11.2h.s3 uf10.4h.s3 uf12.4h.s2 
##       1981       2100       2132       2313       2361       2411 
## up02.2h.s2 of04.4h.s1 uf03.4h.s3 uf03.2h.s1 up02.4h.s3 uf03.2h.s2 
##       2414       2563       2577       2690       2725       2755 
## op03.2h.s3 up06.4h.s3 of02.2h.s2 of03.4h.s2 up05.4h.s2 of11.2h.s1 
##       2780       2896       3276       3339       3429       3544 
## up04.2h.s1 uf02.2h.s3 of06.2h.s3 uf06.4h.s1 of05.2h.s2 uf09.2h.s1 
##       3688       3716       3770       3813       3929       3956 
## uf09.2h.s3 up04.4h.s3 uf08.4h.s2 up03.2h.s1 uf06.2h.s3 uf01.4h.s1 
##       3974       4049       4254       4255       4340       4379 
## of09.4h.s2 uf08.2h.s1 of09.2h.s2 up04.2h.s2 op06.4h.s1 op05.2h.s1 
##       4398       4431       4633       4664       4743       4774 
## uf03.4h.s2 uf07.2h.s3 uf12.4h.s3 uf06.4h.s2 uf10.4h.s2 uf02.4h.s1 
##       4858       5012       5403       5463       5465       5507 
## uf12.2h.s1 uf04.4h.s3 of07.4h.s2 op06.2h.s3 of02.2h.s3 up06.4h.s1 
##       5553       5567       5695       5766       5859       6039 
## uf01.2h.s3 uf12.4h.s1 of03.2h.s2 uf03.4h.s1 uf08.2h.s3 op05.2h.s2 
##       6122       6242       6471       6544       6603       6699 
## uf06.2h.s1 of12.4h.s2 uf03.2h.s3 up01.4h.s3 op04.4h.s2 uf10.2h.s3 
##       6853       6966       7024       7045       7058       7083 
## uf04.2h.s1 up03.4h.s3 uf11.4h.s2 up06.2h.s1 of03.2h.s3 uf01.2h.s1 
##       7331       7417       7708       8239       8321       8341 
## of12.2h.s3 of06.2h.s2 op05.2h.s3 uf09.4h.s1 up01.2h.s1 up06.4h.s2 
##       8599       8667       8832       8924       8944       9123 
## op05.4h.s3 op06.2h.s1 of10.2h.s3 uf11.2h.s1 of03.2h.s1 up05.4h.s3 
##       9728       9836      10098      10105      10287      10455 
## up03.2h.s2 of04.2h.s2 up03.4h.s1 uf07.4h.s2 of04.4h.s3 of03.4h.s3 
##      10719      10737      10874      10982      11224      11229 
## uf02.4h.s3 of03.4h.s1 up03.2h.s3 of10.4h.s3 of01.2h.s3 uf08.4h.s1 
##      11361      11362      11409      11440      11496      11595 
## uf07.4h.s1 up01.4h.s2 op04.4h.s3 uf01.4h.s2 uf04.2h.s2 of10.2h.s1 
##      11680      11794      11943      12079      12268      12565 
## of08.2h.s2 up02.4h.s2 uf04.4h.s1 op03.4h.s2 uf10.2h.s1 uf11.2h.s2 
##      12570      12576      12827      12839      12993      13104 
## op04.2h.s2 uf01.2h.s2 uf11.2h.s3 uf07.2h.s2 uf04.4h.s2 uf10.2h.s2 
##      14264      14296      14360      14393      15082      15325 
## of05.2h.s3 uf09.4h.s3 of04.2h.s3 uf02.2h.s2 uf06.4h.s3 of08.4h.s2 
##      15617      15760      15851      15868      16233      16497 
## up02.2h.s1 up01.4h.s1 op03.4h.s3 of11.4h.s2 of06.2h.s1 of09.2h.s3 
##      16667      17823      17842      18619      18804      19083 
## op04.2h.s1 of07.2h.s1 op03.2h.s1 of02.4h.s2 up04.4h.s1 uf02.2h.s1 
##      19087      19664      19949      20004      20114      20435 
## of05.4h.s1 of08.2h.s3 up06.2h.s3 uf05.4h.s1 op01.2h.s2 op02.2h.s1 
##      20546      21763      21847      22022      22716      22824 
## op01.4h.s3 of04.2h.s1 op01.2h.s3 uf05.2h.s2 uf10.4h.s1 of06.4h.s2 
##      23185      23190      23312      23526      23916      24057 
## of05.2h.s1 uf02.4h.s2 of09.4h.s3 op02.4h.s1 of05.4h.s2 of07.2h.s3 
##      24064      24119      26054      26422      26816      27521 
## of12.4h.s1 uf05.4h.s3 of11.4h.s3 up02.2h.s3 op01.4h.s1 up01.2h.s3 
##      27893      27964      28939      29254      29271      29317 
## of12.2h.s2 uf12.2h.s2 op04.4h.s1 of05.4h.s3 uf06.2h.s2 uf08.4h.s3 
##      30192      30442      30820      30826      31470      33236 
## of02.4h.s1 of12.2h.s1 of02.2h.s1 up02.4h.s1 of10.4h.s1 up05.4h.s1 
##      34169      34801      36046      37192      37617      37713 
## op02.2h.s2 of01.4h.s1 uf01.4h.s3 up05.2h.s1 of08.2h.s1 of11.4h.s1 
##      38675      38721      39095      39258      40386      41980 
## up04.2h.s3 of01.2h.s1 uf11.4h.s3 op06.2h.s2 of07.2h.s2 of10.4h.s2 
##      43128      43367      44531      44663      44820      45242 
## up01.2h.s2 up06.2h.s2 op02.4h.s2 up03.4h.s2 of06.4h.s1 of08.4h.s3 
##      45419      45429      45633      45687      45882      47273 
## of07.4h.s1 of09.2h.s1 op04.2h.s3 of11.2h.s2 of10.2h.s2 of02.4h.s3 
##      47494      48523      48552      48748      49228      55685 
## of12.4h.s3 op05.4h.s1 of09.4h.s1 op06.4h.s2 of08.4h.s1 op01.2h.s1 
##      56417      58353      62013      64385      64649      69957 
## uf07.4h.s3 of07.4h.s3 of01.4h.s2 op03.4h.s1 op03.2h.s2 op01.4h.s2 
##      76818      78604      82810      87558      87843      90912 
## op05.4h.s2 op06.4h.s3 op02.2h.s3 of06.4h.s3 of01.4h.s3 op02.4h.s3 
##     100535     104632     113890     144827     173563     298953
pb <- pb[-which(rowSums(pb) < 1000), ]
pb.1000 <- rrarefy(pb, 1000)
pb.1000 <- pb.1000[, -which(colSums(pb.1000) == 0)]
pb.map <- pb.map[row.names(pb.1000), ]
taxo <- taxo[colnames(pb.1000), ]
dim(pb.1000)
## [1]   211 19226
dim(pb.map)
## [1] 211   4
dim(taxo)
## [1] 19226     7

Put them all in alphabetical order, since that gives nice groupings. Yay for labelling schemes!

pb.map <- pb.map[order(row.names(pb.map)), ]
pb.1000 <- pb.1000[row.names(pb.map), ]

Fix the abundance column - it still has abundance from full dataset, but abundance from rarefied dataset more meaningful.

identical(row.names(taxo), colnames(pb.1000))  # just to make sure
## [1] TRUE
taxo$abundance <- colSums(pb.1000)

A few metrics to reference later.

total.seqs <- sum(pb)
total.samples <- nrow(pb)
total.otus <- ncol(pb)

rarefied.seqs <- sum(pb.1000)
rarefied.samples <- nrow(pb.1000)
rarefied.otus <- ncol(pb.1000)

Clean up factor levels.

pb.map$person <- factor(pb.map$person)
pb.map$location <- factor(pb.map$location)
pb.map$duration <- factor(pb.map$duration)
pb.map$sampleType <- factor(pb.map$sampleType)
pb.map$occ.person <- factor(paste(pb.map$location, pb.map$person, sep = ""))

pb.map$occ.person2 <- as.character(pb.map$occ.person)
pb.map$occ.person2[pb.map$location == "unocc"] <- "unocc"
pb.map$occ.person2 <- factor(pb.map$occ.person2)

How do samples fall into groups? I.e., do we have balance in treatments?

table(pb.map$person, pb.map$sampleType)
##     
##      filter petri.dish
##   s1     48         24
##   s2     46         23
##   s3     46         24
table(pb.map$location)
## 
##   occ unocc 
##   108   103
table(pb.map$duration)
## 
##   2   4 
## 105 106
table(pb.map$sampleType)
## 
##     filter petri.dish 
##        140         71

Indexing lists for quick parsing.

s22of <- c(1:nrow(pb.map))[pb.map$person == "s2" & pb.map$duration == 2 & pb.map$location == 
    "occ" & pb.map$sampleType == "filter"]
s24of <- c(1:nrow(pb.map))[pb.map$person == "s2" & pb.map$duration == 4 & pb.map$location == 
    "occ" & pb.map$sampleType == "filter"]
s22op <- c(1:nrow(pb.map))[pb.map$person == "s2" & pb.map$duration == 2 & pb.map$location == 
    "occ" & pb.map$sampleType == "petri.dish"]
s24op <- c(1:nrow(pb.map))[pb.map$person == "s2" & pb.map$duration == 4 & pb.map$location == 
    "occ" & pb.map$sampleType == "petri.dish"]
s22uf <- c(1:nrow(pb.map))[pb.map$person == "s2" & pb.map$duration == 2 & pb.map$location == 
    "unocc" & pb.map$sampleType == "filter"]
s24uf <- c(1:nrow(pb.map))[pb.map$person == "s2" & pb.map$duration == 4 & pb.map$location == 
    "unocc" & pb.map$sampleType == "filter"]
s22up <- c(1:nrow(pb.map))[pb.map$person == "s2" & pb.map$duration == 2 & pb.map$location == 
    "unocc" & pb.map$sampleType == "petri.dish"]
s24up <- c(1:nrow(pb.map))[pb.map$person == "s2" & pb.map$duration == 4 & pb.map$location == 
    "unocc" & pb.map$sampleType == "petri.dish"]

##
s32of <- c(1:nrow(pb.map))[pb.map$person == "s3" & pb.map$duration == 2 & pb.map$location == 
    "occ" & pb.map$sampleType == "filter"]
s34of <- c(1:nrow(pb.map))[pb.map$person == "s3" & pb.map$duration == 4 & pb.map$location == 
    "occ" & pb.map$sampleType == "filter"]
s32op <- c(1:nrow(pb.map))[pb.map$person == "s3" & pb.map$duration == 2 & pb.map$location == 
    "occ" & pb.map$sampleType == "petri.dish"]
s34op <- c(1:nrow(pb.map))[pb.map$person == "s3" & pb.map$duration == 4 & pb.map$location == 
    "occ" & pb.map$sampleType == "petri.dish"]
s32uf <- c(1:nrow(pb.map))[pb.map$person == "s3" & pb.map$duration == 2 & pb.map$location == 
    "unocc" & pb.map$sampleType == "filter"]
s34uf <- c(1:nrow(pb.map))[pb.map$person == "s3" & pb.map$duration == 4 & pb.map$location == 
    "unocc" & pb.map$sampleType == "filter"]
s32up <- c(1:nrow(pb.map))[pb.map$person == "s3" & pb.map$duration == 2 & pb.map$location == 
    "unocc" & pb.map$sampleType == "petri.dish"]
s34up <- c(1:nrow(pb.map))[pb.map$person == "s3" & pb.map$duration == 4 & pb.map$location == 
    "unocc" & pb.map$sampleType == "petri.dish"]

##
s12of <- c(1:nrow(pb.map))[pb.map$person == "s1" & pb.map$duration == 2 & pb.map$location == 
    "occ" & pb.map$sampleType == "filter"]
s14of <- c(1:nrow(pb.map))[pb.map$person == "s1" & pb.map$duration == 4 & pb.map$location == 
    "occ" & pb.map$sampleType == "filter"]
s12op <- c(1:nrow(pb.map))[pb.map$person == "s1" & pb.map$duration == 2 & pb.map$location == 
    "occ" & pb.map$sampleType == "petri.dish"]
s14op <- c(1:nrow(pb.map))[pb.map$person == "s1" & pb.map$duration == 4 & pb.map$location == 
    "occ" & pb.map$sampleType == "petri.dish"]
s12uf <- c(1:nrow(pb.map))[pb.map$person == "s1" & pb.map$duration == 2 & pb.map$location == 
    "unocc" & pb.map$sampleType == "filter"]
s14uf <- c(1:nrow(pb.map))[pb.map$person == "s1" & pb.map$duration == 4 & pb.map$location == 
    "unocc" & pb.map$sampleType == "filter"]
s12up <- c(1:nrow(pb.map))[pb.map$person == "s1" & pb.map$duration == 2 & pb.map$location == 
    "unocc" & pb.map$sampleType == "petri.dish"]
s14up <- c(1:nrow(pb.map))[pb.map$person == "s1" & pb.map$duration == 4 & pb.map$location == 
    "unocc" & pb.map$sampleType == "petri.dish"]


## A few groupings to call up later

groups <- list(s22of, s32of, s12of, s22op, s32op, s12op, s22uf, s32uf, s12uf, 
    s22up, s32up, s12up, s24of, s34of, s14of, s24op, s34op, s14op, s24uf, s34uf, 
    s14uf, s24up, s34up, s14up)

names(groups) <- c("s22of", "s32of", "s12of", "s22op", "s32op", "s12op", "s22uf", 
    "s32uf", "s12uf", "s22up", "s32up", "s12up", "s24of", "s34of", "s14of", 
    "s24op", "s34op", "s14op", "s24uf", "s34uf", "s14uf", "s24up", "s34up", 
    "s14up")

gr2 <- list(s14of, s12of, s14op, s12op, s14uf, s12uf, s14up, s12up, s34of, s32of, 
    s34op, s32op, s34uf, s32uf, s34up, s32up, s24of, s22of, s24op, s22op, s24uf, 
    s22uf, s24up, s22up)

gr.name <- c("s14of", "s12of", "s14op", "s12op", "s14uf", "s12uf", "s14up", 
    "s12up", "s34of", "s32of", "s34op", "s32op", "s34uf", "s32uf", "s34up", 
    "s32up", "s24of", "s22of", "s24op", "s22op", "s24uf", "s22uf", "s24up", 
    "s22up")

# clean up group names
rm(list = ls()[which(ls() %in% names(groups))])

Now save the image that can be used for analysis and figures.

rm(pb.original, pb.map.original, pb.tax.original, pb, pb.tax)
save.image("../data/pb_data.RData")
ls()
##  [1] "cons"             "Evenness"         "gr.name"         
##  [4] "gr2"              "groups"           "makeTaxo"        
##  [7] "pb.1000"          "pb.map"           "rarefied.otus"   
## [10] "rarefied.samples" "rarefied.seqs"    "se"              
## [13] "taxo"             "total.otus"       "total.samples"   
## [16] "total.seqs"