This file starts with the fastq files from the PacBio run and runs through the DADA2 workflow to produce a phyloseq object at the end. Two cells were used, with all samples run once on each cell. Fastq files from the different cells are then combined after error learning for the individual runs.

## https://benjjneb.github.io/LRASManuscript/LRASms_fecal.html

library(dada2)
library(Biostrings)
library(ShortRead)
library(ggplot2)
library(reshape2)
library(gridExtra)
library(phyloseq)
library(gtools)

Cell 1

# Cell 1 inputs: raw fastq location, primer sequences and plotting defaults.
path1 <- "/blue/mulligan/duttonc/Congo/Cell1"
# `pattern` is a regular expression, so every entry directly under path1 whose
# name contains "fastq" is listed.
fns1 <- list.files(path1, pattern = "fastq", full.names = TRUE)

# Primer sequences handed to removePrimers() further down.
F27 <- "AGRGTTYGATYMTGGCTCAG"
R1492 <- "RGYTACCTTGTTACGACTT"
rc <- dada2:::rc

theme_set(theme_bw())

# Restore previously computed cell 1 objects when the cache file exists; the
# later steps skip any object that is already defined.
# No `else {}` branch: at top level an empty branch evaluates to a visible
# NULL, which shows up as a stray "NULL" in the output when the cache is absent.
if (file.exists("/blue/mulligan/duttonc/Congo/output/cell1_output.Rdata")) {
  load("/blue/mulligan/duttonc/Congo/output/cell1_output.Rdata")
}

## remove primers
# Primer-trimmed reads go to a "noprimers" directory inside path1, one output
# file per input file (same base names as fns1).
nops1 <- file.path(path1, "noprimers", basename(fns1))
if (!exists("prim1")) {
  # Nothing cached: trim the forward primer and the reverse complement of the
  # reverse primer. `rc` is the alias of dada2:::rc defined in the setup above.
  prim1 <- removePrimers(
    fns1, nops1,
    primer.fwd = F27,
    primer.rev = rc(R1492),
    orient = TRUE
  )
} else {
  print("cached file loaded: prim1")
}
## [1] "cached file loaded: prim1"
## filter
# Collect the length of every primer-trimmed read, one vector per file, unless
# lens.fn1 is already defined (e.g. restored from the cache).
if (!exists("lens.fn1")) {
  lens.fn1 <- lapply(nops1, function(path) nchar(getSequences(path)))
} else {
  print("cached file loaded: lens.fn1")
}
## [1] "cached file loaded: lens.fn1"
# Pool the per-file lengths into one vector and plot their distribution.
lens <- do.call(c, lens.fn1)
hist(lens, breaks = 1000)

# Filtered reads are written to noprimers/filtered under path1, keeping the
# original file names.
filts1 <- file.path(path1, "noprimers", "filtered", basename(fns1))

# Run the quality/length filter only when track1 is not already defined.
if (!exists("track1")) {
  track1 <- filterAndTrim(
    nops1, filts1,
    minQ = 3,
    minLen = 1000,
    maxLen = 1600,
    maxN = 0,
    rm.phix = FALSE,
    maxEE = 2
  )
} else {
  print("cached file loaded: track1")
}
## [1] "cached file loaded: track1"
# Show the per-file reads.in / reads.out table.
track1
##                                              reads.in reads.out
## G1_6M-16S_For_bc1002-16S_Rev_bc1045.fastq        2585      2478
## G1_6W-16S_For_bc1006-16S_Rev_bc1045.fastq        4009      3853
## G10_3M-16S_For_bc1020-16S_Rev_bc1035.fastq       2201      2143
## G10_6M-16S_For_bc1022-16S_Rev_bc1033.fastq       2044      1943
## G10_6W-16S_For_bc1005-16S_Rev_bc1060.fastq       7191      6932
## G11_3M-16S_For_bc1012-16S_Rev_bc1035.fastq       2392      2321
## G11_6M-16S_For_bc1015-16S_Rev_bc1033.fastq       1755      1689
## G11_6W-16S_For_bc1012-16S_Rev_bc1057.fastq       5772      5595
## G12_3M-16S_For_bc1005-16S_Rev_bc1054.fastq       4878      4729
## G12_6W-16S_For_bc1006-16S_Rev_bc1033.fastq       6199      5984
## G12_MV1-16S_For_bc1024-16S_Rev_bc1056.fastq      4305      4128
## G12_PT1-16S_For_bc1002-16S_Rev_bc1044.fastq      2337      2105
## G13_3M_2-16S_For_bc1008-16S_Rev_bc1060.fastq     6238      6051
## G13_3M-16S_For_bc1024-16S_Rev_bc1059.fastq      11267     10870
## G13_6M-16S_For_bc1011-16S_Rev_bc1035.fastq       4325      4165
## G13_6W-16S_For_bc1020-16S_Rev_bc1054.fastq       4921      4727
## G14_3M-16S_For_bc1005-16S_Rev_bc1033.fastq       3211      3109
## G14_6M-16S_For_bc1015-16S_Rev_bc1075.fastq       2966      2873
## G14_6W-16S_For_bc1012-16S_Rev_bc1033.fastq       1852      1794
## G14_PT-16S_For_bc1002-16S_Rev_bc1057.fastq       4919      4759
## G15_3M-16S_For_bc1022-16S_Rev_bc1035.fastq       2848      2765
## G15_6W-16S_For_bc1024-16S_Rev_bc1035.fastq       3774      3663
## G15_MV1-16S_For_bc1024-16S_Rev_bc1033.fastq      3879      3749
## G16_6M-16S_For_bc1005-16S_Rev_bc1056.fastq       4285      4112
## G17_3M-16S_For_bc1015-16S_Rev_bc1065.fastq       3433      3320
## G17_6M-16S_For_bc1012-16S_Rev_bc1044.fastq       1945      1890
## G17_6W-16S_For_bc1005-16S_Rev_bc1045.fastq       3173      3054
## G19_3M-16S_For_bc1004-16S_Rev_bc1045.fastq       2485      2412
## G19_6W-16S_For_bc1010-16S_Rev_bc1035.fastq       1940      1875
## G19_6WR-16S_For_bc1022-16S_Rev_bc1062.fastq      5398      5218
## G19_MV3-16S_For_bc1005-16S_Rev_bc1075.fastq      5880      5718
## G2_3M-16S_For_bc1004-16S_Rev_bc1035.fastq        3112      2971
## G2_6M-16S_For_bc1011-16S_Rev_bc1044.fastq        3441      3243
## G2_6W-16S_For_bc1020-16S_Rev_bc1065.fastq        4094      3937
## G2_PT-16S_For_bc1024-16S_Rev_bc1065.fastq       14814     14360
## G20_6W-16S_For_bc1024-16S_Rev_bc1054.fastq       6238      6046
## G20_MV1-16S_For_bc1005-16S_Rev_bc1059.fastq      9035      8750
## G20_PT1-16S_For_bc1020-16S_Rev_bc1045.fastq      3340      3243
## G20_PT2-16S_For_bc1008-16S_Rev_bc1045.fastq      2637      2554
## G20_PT2R-16S_For_bc1007-16S_Rev_bc1045.fastq     3588      3460
## G21_3M-16S_For_bc1005-16S_Rev_bc1044.fastq       2304      2223
## G21_6W-16S_For_bc1007-16S_Rev_bc1057.fastq      10668     10283
## G22_3M-16S_For_bc1020-16S_Rev_bc1075.fastq       1981      1915
## G22_6W-16S_For_bc1006-16S_Rev_bc1044.fastq       4081      3958
## G22_MV-16S_For_bc1003-16S_Rev_bc1044.fastq       2954      2827
## G23_3M-16S_For_bc1008-16S_Rev_bc1035.fastq       1996      1912
## G24_3M-16S_For_bc1020-16S_Rev_bc1056.fastq       4708      4564
## G24_6M-16S_For_bc1007-16S_Rev_bc1075.fastq       8984      8629
## G24_6W-16S_For_bc1008-16S_Rev_bc1062.fastq       7179      6981
## G25_3M-16S_For_bc1012-16S_Rev_bc1054.fastq       4775      4629
## G25_6W-16S_For_bc1008-16S_Rev_bc1065.fastq       6931      6736
## G26_3M-16S_For_bc1024-16S_Rev_bc1060.fastq      10584     10267
## G26_6M-16S_For_bc1020-16S_Rev_bc1060.fastq       8743      8466
## G26_6W-16S_For_bc1006-16S_Rev_bc1035.fastq       3663      3511
## G27_6M-16S_For_bc1002-16S_Rev_bc1056.fastq       1888      1781
## G27_6W-16S_For_bc1008-16S_Rev_bc1057.fastq       5437      5224
## G28_6M-16S_For_bc1022-16S_Rev_bc1060.fastq       9963      9665
## G29_3M-16S_For_bc1005-16S_Rev_bc1065.fastq       3186      3103
## G29_6W-16S_For_bc1008-16S_Rev_bc1054.fastq       4180      4056
## G3_3M-16S_For_bc1015-16S_Rev_bc1060.fastq        3551      3395
## G3_6M-16S_For_bc1009-16S_Rev_bc1044.fastq        2306      2239
## G3_6W-16S_For_bc1012-16S_Rev_bc1059.fastq        9842      9530
## G30_3M-16S_For_bc1009-16S_Rev_bc1035.fastq       1718      1637
## G30_6M-16S_For_bc1015-16S_Rev_bc1059.fastq       4717      4493
## G30_6W-16S_For_bc1010-16S_Rev_bc1044.fastq       1511      1458
## G31_3M-16S_For_bc1007-16S_Rev_bc1044.fastq       2642      2511
## G31_6M-16S_For_bc1003-16S_Rev_bc1045.fastq       4076      3918
## G33_3M-16S_For_bc1012-16S_Rev_bc1075.fastq       3028      2889
## G33_6W-16S_For_bc1022-16S_Rev_bc1059.fastq      10059      9667
## G34_3M-16S_For_bc1015-16S_Rev_bc1054.fastq       4262      4156
## G34_6W-16S_For_bc1012-16S_Rev_bc1065.fastq       3370      3227
## G35_3M-16S_For_bc1002-16S_Rev_bc1059.fastq       4649      4437
## G35_6M-16S_For_bc1012-16S_Rev_bc1045.fastq       3361      3249
## G35_6W-16S_For_bc1020-16S_Rev_bc1057.fastq       6420      6233
## G36_3M-16S_For_bc1024-16S_Rev_bc1075.fastq        536       515
## G36_6M-16S_For_bc1002-16S_Rev_bc1035.fastq       3362      3259
## G37_6W-16S_For_bc1008-16S_Rev_bc1044.fastq       2305      2237
## G37_PT1-16S_For_bc1007-16S_Rev_bc1035.fastq      1944      1867
## G38_3M-16S_For_bc1008-16S_Rev_bc1059.fastq       6740      6535
## G38_6W-16S_For_bc1002-16S_Rev_bc1054.fastq       3050      2965
## G39_6M-16S_For_bc1012-16S_Rev_bc1062.fastq       3746      3627
## G4_3M-16S_For_bc1015-16S_Rev_bc1062.fastq        4391      4247
## G4_3MR-16S_For_bc1015-16S_Rev_bc1045.fastq       2633      2538
## G4_6W-16S_For_bc1009-16S_Rev_bc1033.fastq        4481      4315
## G40_3M-16S_For_bc1010-16S_Rev_bc1045.fastq       1863      1789
## G40_6M-16S_For_bc1002-16S_Rev_bc1033.fastq       2770      2664
## G40_6W-16S_For_bc1007-16S_Rev_bc1060.fastq      11413     10945
## G41_3M-16S_For_bc1022-16S_Rev_bc1056.fastq       4243      4082
## G42_3M-16S_For_bc1002-16S_Rev_bc1062.fastq       2850      2751
## G42_6M-16S_For_bc1005-16S_Rev_bc1035.fastq       3372      3276
## G43_6M-16S_For_bc1010-16S_Rev_bc1033.fastq       3618      3517
## G43_6W-16S_For_bc1008-16S_Rev_bc1075.fastq       2519      2421
## G45_3M-16S_For_bc1024-16S_Rev_bc1045.fastq       4727      4577
## G45_6W-16S_For_bc1008-16S_Rev_bc1056.fastq       4132      3985
## G45_MV-16S_For_bc1024-16S_Rev_bc1057.fastq      11545     11144
## G46_3M-16S_For_bc1015-16S_Rev_bc1044.fastq       2364      2282
## G46_3MR-16S_For_bc1020-16S_Rev_bc1044.fastq      2928      2837
## G46_6M-16S_For_bc1022-16S_Rev_bc1065.fastq       4054      3918
## G46_6W-16S_For_bc1007-16S_Rev_bc1062.fastq      13383     12918
## G47_3M-16S_For_bc1005-16S_Rev_bc1062.fastq      10701     10390
## G47_6W-16S_For_bc1011-16S_Rev_bc1045.fastq       2757      2648
## G48_3M-16S_For_bc1015-16S_Rev_bc1057.fastq       8984      8574
## G48_6M-16S_For_bc1004-16S_Rev_bc1033.fastq       1762      1703
## G48_6W-16S_For_bc1012-16S_Rev_bc1060.fastq       8682      8396
## G5_3M-16S_For_bc1007-16S_Rev_bc1054.fastq        4257      4022
## G5_MV1-16S_For_bc1003-16S_Rev_bc1033.fastq       3311      3181
## G50_3M-16S_For_bc1020-16S_Rev_bc1062.fastq       4013      3790
## G50_6M-16S_For_bc1007-16S_Rev_bc1059.fastq       7721      7411
## G50_6W-16S_For_bc1004-16S_Rev_bc1044.fastq       2450      2373
## G51_6W-16S_For_bc1024-16S_Rev_bc1044.fastq       3741      3602
## G52_3M-16S_For_bc1024-16S_Rev_bc1062.fastq       9802      9473
## G52_6M-16S_For_bc1022-16S_Rev_bc1057.fastq       6344      6166
## G52_6W-16S_For_bc1020-16S_Rev_bc1059.fastq       9859      9527
## G7_3M-16S_For_bc1008-16S_Rev_bc1033.fastq        2167      2086
## G7_3MR-16S_For_bc1015-16S_Rev_bc1056.fastq       3020      2873
## G7_6M-16S_For_bc1007-16S_Rev_bc1033.fastq        2584      2447
## G7_MV2-16S_For_bc1022-16S_Rev_bc1044.fastq       3241      3142
## G7_PT1-16S_For_bc1005-16S_Rev_bc1057.fastq       6692      6490
## G8_3M-16S_For_bc1002-16S_Rev_bc1060.fastq        3476      3351
## G8_6W-16S_For_bc1007-16S_Rev_bc1065.fastq        9434      9151
## G9_3M-16S_For_bc1012-16S_Rev_bc1056.fastq        4618      4467
## G9_6M-16S_For_bc1022-16S_Rev_bc1075.fastq        3450      3347
## G9_6W-16S_For_bc1011-16S_Rev_bc1033.fastq        4344      4199
## MC1-16S_For_bc1009-16S_Rev_bc1045.fastq          1779      1718
## MC2-16S_For_bc1022-16S_Rev_bc1045.fastq          3520      3435
## MC3-16S_For_bc1020-16S_Rev_bc1033.fastq          3283      3216
## dada2
# Dereplicate the filtered cell 1 reads unless drp1 is already defined.
if (!exists("drp1")) {
  drp1 <- derepFastq(filts1, verbose = TRUE)
} else {
  print("cached file loaded: drp1")
}
## [1] "cached file loaded: drp1"
## Learn errors
# Fit the cell 1 error model from the dereplicated reads, using PacBioErrfun as
# the error estimation function, unless err1 is already defined.
if (!exists("err1")) {
  err1 <- learnErrors(
    drp1,
    errorEstimationFunction = PacBioErrfun,
    BAND_SIZE = 32,
    multithread = TRUE
  )
} else {
  print("cached file loaded: err1")
}
## [1] "cached file loaded: err1"
## Plot errors
plotErrors(err1)

## Denoise

# Denoise the cell 1 reads with the cell 1 error model unless dd1 is already
# defined.
if (!exists("dd1")) {
  dd1 <- dada(drp1, err = err1, BAND_SIZE = 32, multithread = TRUE)
} else {
  print("cached file loaded: dd1")
}
## [1] "cached file loaded: dd1"
# Read-tracking table, one row per file: the two columns of prim1, the second
# column of track1, and the summed `denoised` counts of each dd1 entry.
cbind(
  ccs = prim1[, 1],
  primers = prim1[, 2],
  filtered = track1[, 2],
  denoised = sapply(dd1, function(res) sum(res$denoised))
)
##                                                ccs primers filtered denoised
## G1_6M-16S_For_bc1002-16S_Rev_bc1045.fastq     2973    2585     2478     2294
## G1_6W-16S_For_bc1006-16S_Rev_bc1045.fastq     4581    4009     3853     3771
## G10_3M-16S_For_bc1020-16S_Rev_bc1035.fastq    2276    2201     2143     2083
## G10_6M-16S_For_bc1022-16S_Rev_bc1033.fastq    2460    2044     1943     1795
## G10_6W-16S_For_bc1005-16S_Rev_bc1060.fastq    7981    7191     6932     6728
## G11_3M-16S_For_bc1012-16S_Rev_bc1035.fastq    2718    2392     2321     2282
## G11_6M-16S_For_bc1015-16S_Rev_bc1033.fastq    2016    1755     1689     1607
## G11_6W-16S_For_bc1012-16S_Rev_bc1057.fastq    6301    5772     5595     5527
## G12_3M-16S_For_bc1005-16S_Rev_bc1054.fastq    5211    4878     4729     4641
## G12_6W-16S_For_bc1006-16S_Rev_bc1033.fastq    7124    6199     5984     5917
## G12_MV1-16S_For_bc1024-16S_Rev_bc1056.fastq   4466    4305     4128     3987
## G12_PT1-16S_For_bc1002-16S_Rev_bc1044.fastq   2723    2337     2105     2021
## G13_3M_2-16S_For_bc1008-16S_Rev_bc1060.fastq  6944    6238     6051     5911
## G13_3M-16S_For_bc1024-16S_Rev_bc1059.fastq   11681   11267    10870    10687
## G13_6M-16S_For_bc1011-16S_Rev_bc1035.fastq    4655    4325     4165     3986
## G13_6W-16S_For_bc1020-16S_Rev_bc1054.fastq    5111    4921     4727     4682
## G14_3M-16S_For_bc1005-16S_Rev_bc1033.fastq    3402    3211     3109     3082
## G14_6M-16S_For_bc1015-16S_Rev_bc1075.fastq    3439    2966     2873     2810
## G14_6W-16S_For_bc1012-16S_Rev_bc1033.fastq    2117    1852     1794     1696
## G14_PT-16S_For_bc1002-16S_Rev_bc1057.fastq    5578    4919     4759     4562
## G15_3M-16S_For_bc1022-16S_Rev_bc1035.fastq    3388    2848     2765     2708
## G15_6W-16S_For_bc1024-16S_Rev_bc1035.fastq    3921    3774     3663     3631
## G15_MV1-16S_For_bc1024-16S_Rev_bc1033.fastq   4044    3879     3749     3688
## G16_6M-16S_For_bc1005-16S_Rev_bc1056.fastq    4731    4285     4112     4008
## G17_3M-16S_For_bc1015-16S_Rev_bc1065.fastq    3913    3433     3320     3228
## G17_6M-16S_For_bc1012-16S_Rev_bc1044.fastq    2328    1945     1890     1825
## G17_6W-16S_For_bc1005-16S_Rev_bc1045.fastq    3494    3173     3054     3019
## G19_3M-16S_For_bc1004-16S_Rev_bc1045.fastq    2740    2485     2412     2369
## G19_6W-16S_For_bc1010-16S_Rev_bc1035.fastq    2181    1940     1875     1844
## G19_6WR-16S_For_bc1022-16S_Rev_bc1062.fastq   6349    5398     5218     5079
## G19_MV3-16S_For_bc1005-16S_Rev_bc1075.fastq   6099    5880     5718     5600
## G2_3M-16S_For_bc1004-16S_Rev_bc1035.fastq     3552    3112     2971     2874
## G2_6M-16S_For_bc1011-16S_Rev_bc1044.fastq     3736    3441     3243     3029
## G2_6W-16S_For_bc1020-16S_Rev_bc1065.fastq     4182    4094     3937     3780
## G2_PT-16S_For_bc1024-16S_Rev_bc1065.fastq    15227   14814    14360    14248
## G20_6W-16S_For_bc1024-16S_Rev_bc1054.fastq    6450    6238     6046     5977
## G20_MV1-16S_For_bc1005-16S_Rev_bc1059.fastq   9746    9035     8750     8680
## G20_PT1-16S_For_bc1020-16S_Rev_bc1045.fastq   3415    3340     3243     3204
## G20_PT2-16S_For_bc1008-16S_Rev_bc1045.fastq   2832    2637     2554     2533
## G20_PT2R-16S_For_bc1007-16S_Rev_bc1045.fastq  3728    3588     3460     3422
## G21_3M-16S_For_bc1005-16S_Rev_bc1044.fastq    2592    2304     2223     2184
## G21_6W-16S_For_bc1007-16S_Rev_bc1057.fastq   11101   10668    10283    10162
## G22_3M-16S_For_bc1020-16S_Rev_bc1075.fastq    2053    1981     1915     1868
## G22_6W-16S_For_bc1006-16S_Rev_bc1044.fastq    4798    4081     3958     3866
## G22_MV-16S_For_bc1003-16S_Rev_bc1044.fastq    3273    2954     2827     2740
## G23_3M-16S_For_bc1008-16S_Rev_bc1035.fastq    2235    1996     1912     1884
## G24_3M-16S_For_bc1020-16S_Rev_bc1056.fastq    4896    4708     4564     4531
## G24_6M-16S_For_bc1007-16S_Rev_bc1075.fastq    9423    8984     8629     8539
## G24_6W-16S_For_bc1008-16S_Rev_bc1062.fastq    7769    7179     6981     6963
## G25_3M-16S_For_bc1012-16S_Rev_bc1054.fastq    5418    4775     4629     4585
## G25_6W-16S_For_bc1008-16S_Rev_bc1065.fastq    7435    6931     6736     6675
## G26_3M-16S_For_bc1024-16S_Rev_bc1060.fastq   10974   10584    10267    10116
## G26_6M-16S_For_bc1020-16S_Rev_bc1060.fastq    9072    8743     8466     8398
## G26_6W-16S_For_bc1006-16S_Rev_bc1035.fastq    4238    3663     3511     3302
## G27_6M-16S_For_bc1002-16S_Rev_bc1056.fastq    2234    1888     1781     1681
## G27_6W-16S_For_bc1008-16S_Rev_bc1057.fastq    6043    5437     5224     5203
## G28_6M-16S_For_bc1022-16S_Rev_bc1060.fastq   11243    9963     9665     9530
## G29_3M-16S_For_bc1005-16S_Rev_bc1065.fastq    3308    3186     3103     3050
## G29_6W-16S_For_bc1008-16S_Rev_bc1054.fastq    4558    4180     4056     3988
## G3_3M-16S_For_bc1015-16S_Rev_bc1060.fastq     4009    3551     3395     3192
## G3_6M-16S_For_bc1009-16S_Rev_bc1044.fastq     2541    2306     2239     2156
## G3_6W-16S_For_bc1012-16S_Rev_bc1059.fastq    10796    9842     9530     9411
## G30_3M-16S_For_bc1009-16S_Rev_bc1035.fastq    1860    1718     1637     1589
## G30_6M-16S_For_bc1015-16S_Rev_bc1059.fastq    5453    4717     4493     4384
## G30_6W-16S_For_bc1010-16S_Rev_bc1044.fastq    1743    1511     1458     1421
## G31_3M-16S_For_bc1007-16S_Rev_bc1044.fastq    2818    2642     2511     2387
## G31_6M-16S_For_bc1003-16S_Rev_bc1045.fastq    4379    4076     3918     3800
## G33_3M-16S_For_bc1012-16S_Rev_bc1075.fastq    3545    3028     2889     2783
## G33_6W-16S_For_bc1022-16S_Rev_bc1059.fastq   11852   10059     9667     9605
## G34_3M-16S_For_bc1015-16S_Rev_bc1054.fastq    4728    4262     4156     4116
## G34_6W-16S_For_bc1012-16S_Rev_bc1065.fastq    3983    3370     3227     3158
## G35_3M-16S_For_bc1002-16S_Rev_bc1059.fastq    5566    4649     4437     4387
## G35_6M-16S_For_bc1012-16S_Rev_bc1045.fastq    3599    3361     3249     3190
## G35_6W-16S_For_bc1020-16S_Rev_bc1057.fastq    6620    6420     6233     6179
## G36_3M-16S_For_bc1024-16S_Rev_bc1075.fastq     562     536      515      485
## G36_6M-16S_For_bc1002-16S_Rev_bc1035.fastq    3882    3362     3259     3184
## G37_6W-16S_For_bc1008-16S_Rev_bc1044.fastq    2604    2305     2237     2165
## G37_PT1-16S_For_bc1007-16S_Rev_bc1035.fastq   2040    1944     1867     1742
## G38_3M-16S_For_bc1008-16S_Rev_bc1059.fastq    7508    6740     6535     6406
## G38_6W-16S_For_bc1002-16S_Rev_bc1054.fastq    3400    3050     2965     2850
## G39_6M-16S_For_bc1012-16S_Rev_bc1062.fastq    4307    3746     3627     3455
## G4_3M-16S_For_bc1015-16S_Rev_bc1062.fastq     4891    4391     4247     4108
## G4_3MR-16S_For_bc1015-16S_Rev_bc1045.fastq    2931    2633     2538     2420
## G4_6W-16S_For_bc1009-16S_Rev_bc1033.fastq     4840    4481     4315     4180
## G40_3M-16S_For_bc1010-16S_Rev_bc1045.fastq    2113    1863     1789     1738
## G40_6M-16S_For_bc1002-16S_Rev_bc1033.fastq    3274    2770     2664     2479
## G40_6W-16S_For_bc1007-16S_Rev_bc1060.fastq   11920   11413    10945    10744
## G41_3M-16S_For_bc1022-16S_Rev_bc1056.fastq    5022    4243     4082     3980
## G42_3M-16S_For_bc1002-16S_Rev_bc1062.fastq    3144    2850     2751     2684
## G42_6M-16S_For_bc1005-16S_Rev_bc1035.fastq    3547    3372     3276     3190
## G43_6M-16S_For_bc1010-16S_Rev_bc1033.fastq    4103    3618     3517     3363
## G43_6W-16S_For_bc1008-16S_Rev_bc1075.fastq    2949    2519     2421     2307
## G45_3M-16S_For_bc1024-16S_Rev_bc1045.fastq    4839    4727     4577     4489
## G45_6W-16S_For_bc1008-16S_Rev_bc1056.fastq    4627    4132     3985     3948
## G45_MV-16S_For_bc1024-16S_Rev_bc1057.fastq   11880   11545    11144    11056
## G46_3M-16S_For_bc1015-16S_Rev_bc1044.fastq    2711    2364     2282     2205
## G46_3MR-16S_For_bc1020-16S_Rev_bc1044.fastq   3083    2928     2837     2795
## G46_6M-16S_For_bc1022-16S_Rev_bc1065.fastq    4556    4054     3918     3713
## G46_6W-16S_For_bc1007-16S_Rev_bc1062.fastq   13821   13383    12918    12856
## G47_3M-16S_For_bc1005-16S_Rev_bc1062.fastq   11193   10701    10390    10302
## G47_6W-16S_For_bc1011-16S_Rev_bc1045.fastq    2901    2757     2648     2536
## G48_3M-16S_For_bc1015-16S_Rev_bc1057.fastq   10175    8984     8574     8380
## G48_6M-16S_For_bc1004-16S_Rev_bc1033.fastq    1974    1762     1703     1594
## G48_6W-16S_For_bc1012-16S_Rev_bc1060.fastq    9915    8682     8396     8336
## G5_3M-16S_For_bc1007-16S_Rev_bc1054.fastq     4446    4257     4022     3831
## G5_MV1-16S_For_bc1003-16S_Rev_bc1033.fastq    3565    3311     3181     3067
## G50_3M-16S_For_bc1020-16S_Rev_bc1062.fastq    4101    4013     3790     3580
## G50_6M-16S_For_bc1007-16S_Rev_bc1059.fastq    8095    7721     7411     7054
## G50_6W-16S_For_bc1004-16S_Rev_bc1044.fastq    2816    2450     2373     2316
## G51_6W-16S_For_bc1024-16S_Rev_bc1044.fastq    3932    3741     3602     3520
## G52_3M-16S_For_bc1024-16S_Rev_bc1062.fastq   10081    9802     9473     9451
## G52_6M-16S_For_bc1022-16S_Rev_bc1057.fastq    7219    6344     6166     5907
## G52_6W-16S_For_bc1020-16S_Rev_bc1059.fastq   10244    9859     9527     9453
## G7_3M-16S_For_bc1008-16S_Rev_bc1033.fastq     2353    2167     2086     2061
## G7_3MR-16S_For_bc1015-16S_Rev_bc1056.fastq    3404    3020     2873     2809
## G7_6M-16S_For_bc1007-16S_Rev_bc1033.fastq     2687    2584     2447     2334
## G7_MV2-16S_For_bc1022-16S_Rev_bc1044.fastq    3774    3241     3142     3079
## G7_PT1-16S_For_bc1005-16S_Rev_bc1057.fastq    6986    6692     6490     6389
## G8_3M-16S_For_bc1002-16S_Rev_bc1060.fastq     4014    3476     3351     3216
## G8_6W-16S_For_bc1007-16S_Rev_bc1065.fastq     9803    9434     9151     9023
## G9_3M-16S_For_bc1012-16S_Rev_bc1056.fastq     5309    4618     4467     4425
## G9_6M-16S_For_bc1022-16S_Rev_bc1075.fastq     3852    3450     3347     3274
## G9_6W-16S_For_bc1011-16S_Rev_bc1033.fastq     4709    4344     4199     4099
## MC1-16S_For_bc1009-16S_Rev_bc1045.fastq       1890    1779     1718     1669
## MC2-16S_For_bc1022-16S_Rev_bc1045.fastq       3929    3520     3435     3406
## MC3-16S_For_bc1020-16S_Rev_bc1033.fastq       3394    3283     3216     3195
## Sequence table

# Build the cell 1 sequence table and write the cell 1 cache, unless st1 is
# already defined (e.g. restored from the cache file).
if (exists("st1")) {
  print("cached file loaded: st1")
} else {
  st1 <- makeSequenceTable(dd1)
  # Inside braces a bare `dim(st1)` is evaluated and discarded; print it
  # explicitly so the table dimensions are actually reported.
  print(dim(st1))

  # save() does not create missing directories; make sure the output directory
  # exists so the computed objects are not lost at the final step.
  dir.create("/blue/mulligan/duttonc/Congo/output", recursive = TRUE, showWarnings = FALSE)
  save(
    lens.fn1, dd1, drp1, err1, prim1, track1, st1,
    file = "/blue/mulligan/duttonc/Congo/output/cell1_output.Rdata"
  )
}
## [1] "cached file loaded: st1"

Cell 2

## CELL 2
# Restore previously computed cell 2 objects when the cache file exists; the
# later steps skip any object that is already defined.
# No `else {}` branch: at top level an empty branch evaluates to a visible
# NULL, which shows up as a stray "NULL" in the output when the cache is absent.
if (file.exists("/blue/mulligan/duttonc/Congo/output/cell2_output.Rdata")) {
  load("/blue/mulligan/duttonc/Congo/output/cell2_output.Rdata")
}

# Cell 2 raw fastq location. `pattern` is a regular expression, so every entry
# directly under path2 whose name contains "fastq" is listed.
path2 <- "/blue/mulligan/duttonc/Congo/Cell2"
fns2 <- list.files(path2, pattern = "fastq", full.names = TRUE)

## remove primers
# Primer-trimmed reads go to a "noprimers" directory inside path2, one output
# file per input file (same base names as fns2).
nops2 <- file.path(path2, "noprimers", basename(fns2))
if (!exists("prim2")) {
  # Nothing cached: trim the forward primer and the reverse complement of the
  # reverse primer. `rc` is the alias of dada2:::rc defined in the cell 1 setup.
  prim2 <- removePrimers(
    fns2, nops2,
    primer.fwd = F27,
    primer.rev = rc(R1492),
    orient = TRUE
  )
} else {
  print("cached file loaded: prim2")
}
## [1] "cached file loaded: prim2"
## filter
# Collect the length of every primer-trimmed read, one vector per file, unless
# lens.fn2 is already defined (e.g. restored from the cache).
if (!exists("lens.fn2")) {
  lens.fn2 <- lapply(nops2, function(path) nchar(getSequences(path)))
} else {
  print("cached file loaded: lens.fn2")
}
## [1] "cached file loaded: lens.fn2"
# Pool the per-file lengths into one vector (this overwrites the cell 1 `lens`)
# and plot their distribution.
lens <- do.call(c, lens.fn2)
hist(lens, breaks = 1000)

# Filtered reads are written to noprimers/filtered under path2, keeping the
# original file names.
filts2 <- file.path(path2, "noprimers", "filtered", basename(fns2))

# Run the quality/length filter only when track2 is not already defined.
if (!exists("track2")) {
  track2 <- filterAndTrim(
    nops2, filts2,
    minQ = 3,
    minLen = 1000,
    maxLen = 1600,
    maxN = 0,
    rm.phix = FALSE,
    maxEE = 2
  )
} else {
  print("cached file loaded: track2")
}
## [1] "cached file loaded: track2"
# Show the per-file reads.in / reads.out table.
track2
##                                              reads.in reads.out
## G1_6M-16S_For_bc1002-16S_Rev_bc1045.fastq        8573      8193
## G1_6W-16S_For_bc1006-16S_Rev_bc1045.fastq       13098     12556
## G10_3M-16S_For_bc1020-16S_Rev_bc1035.fastq       7449      7199
## G10_6M-16S_For_bc1022-16S_Rev_bc1033.fastq       6644      6358
## G10_6W-16S_For_bc1005-16S_Rev_bc1060.fastq      24270     23350
## G11_3M-16S_For_bc1012-16S_Rev_bc1035.fastq       7724      7469
## G11_6M-16S_For_bc1015-16S_Rev_bc1033.fastq       5374      5162
## G11_6W-16S_For_bc1012-16S_Rev_bc1057.fastq      18849     18281
## G12_3M-16S_For_bc1005-16S_Rev_bc1054.fastq      15761     15239
## G12_6W-16S_For_bc1006-16S_Rev_bc1033.fastq      19580     18906
## G12_MV1-16S_For_bc1024-16S_Rev_bc1056.fastq     14336     13723
## G12_PT1-16S_For_bc1002-16S_Rev_bc1044.fastq      7902      7069
## G13_3M_2-16S_For_bc1008-16S_Rev_bc1060.fastq    20751     20060
## G13_3M-16S_For_bc1024-16S_Rev_bc1059.fastq      36468     35328
## G13_6M-16S_For_bc1011-16S_Rev_bc1035.fastq      14776     14195
## G13_6W-16S_For_bc1020-16S_Rev_bc1054.fastq      15984     15327
## G14_3M-16S_For_bc1005-16S_Rev_bc1033.fastq      10283      9994
## G14_6M-16S_For_bc1015-16S_Rev_bc1075.fastq       9512      9209
## G14_6W-16S_For_bc1012-16S_Rev_bc1033.fastq       6117      5901
## G14_PT-16S_For_bc1002-16S_Rev_bc1057.fastq      16040     15508
## G15_3M-16S_For_bc1022-16S_Rev_bc1035.fastq       9524      9223
## G15_6W-16S_For_bc1024-16S_Rev_bc1035.fastq      12369     11982
## G15_MV1-16S_For_bc1024-16S_Rev_bc1033.fastq     12637     12239
## G16_6M-16S_For_bc1005-16S_Rev_bc1056.fastq      14144     13654
## G17_3M-16S_For_bc1015-16S_Rev_bc1065.fastq      11185     10819
## G17_6M-16S_For_bc1012-16S_Rev_bc1044.fastq       6461      6245
## G17_6W-16S_For_bc1005-16S_Rev_bc1045.fastq      10192      9804
## G19_3M-16S_For_bc1004-16S_Rev_bc1045.fastq       8370      8062
## G19_6W-16S_For_bc1010-16S_Rev_bc1035.fastq       6468      6276
## G19_6WR-16S_For_bc1022-16S_Rev_bc1062.fastq     17768     17149
## G19_MV3-16S_For_bc1005-16S_Rev_bc1075.fastq     19458     18893
## G2_3M-16S_For_bc1004-16S_Rev_bc1035.fastq       10049      9592
## G2_6M-16S_For_bc1011-16S_Rev_bc1044.fastq       11187     10634
## G2_6W-16S_For_bc1020-16S_Rev_bc1065.fastq       13688     13162
## G2_PT-16S_For_bc1024-16S_Rev_bc1065.fastq       48406     47058
## G20_6W-16S_For_bc1024-16S_Rev_bc1054.fastq      21082     20482
## G20_MV1-16S_For_bc1005-16S_Rev_bc1059.fastq     30639     29605
## G20_PT1-16S_For_bc1020-16S_Rev_bc1045.fastq     10940     10636
## G20_PT2-16S_For_bc1008-16S_Rev_bc1045.fastq      9184      8883
## G20_PT2R-16S_For_bc1007-16S_Rev_bc1045.fastq    12712     12231
## G21_3M-16S_For_bc1005-16S_Rev_bc1044.fastq       7586      7297
## G21_6W-16S_For_bc1007-16S_Rev_bc1057.fastq      34339     32955
## G22_3M-16S_For_bc1020-16S_Rev_bc1075.fastq       6207      5962
## G22_6W-16S_For_bc1006-16S_Rev_bc1044.fastq      12863     12421
## G22_MV-16S_For_bc1003-16S_Rev_bc1044.fastq       9481      9111
## G23_3M-16S_For_bc1008-16S_Rev_bc1035.fastq       6806      6610
## G24_3M-16S_For_bc1020-16S_Rev_bc1056.fastq      15702     15194
## G24_6M-16S_For_bc1007-16S_Rev_bc1075.fastq      29319     28069
## G24_6W-16S_For_bc1008-16S_Rev_bc1062.fastq      23589     22893
## G25_3M-16S_For_bc1012-16S_Rev_bc1054.fastq      14942     14470
## G25_6W-16S_For_bc1008-16S_Rev_bc1065.fastq      22253     21637
## G26_3M-16S_For_bc1024-16S_Rev_bc1060.fastq      35333     34289
## G26_6M-16S_For_bc1020-16S_Rev_bc1060.fastq      28500     27598
## G26_6W-16S_For_bc1006-16S_Rev_bc1035.fastq      11800     11357
## G27_6M-16S_For_bc1002-16S_Rev_bc1056.fastq       6288      5954
## G27_6W-16S_For_bc1008-16S_Rev_bc1057.fastq      17186     16559
## G28_6M-16S_For_bc1022-16S_Rev_bc1060.fastq      32341     31348
## G29_3M-16S_For_bc1005-16S_Rev_bc1065.fastq      10780     10478
## G29_6W-16S_For_bc1008-16S_Rev_bc1054.fastq      14314     13880
## G3_3M-16S_For_bc1015-16S_Rev_bc1060.fastq       11538     11038
## G3_6M-16S_For_bc1009-16S_Rev_bc1044.fastq        7708      7472
## G3_6W-16S_For_bc1012-16S_Rev_bc1059.fastq       30970     29990
## G30_3M-16S_For_bc1009-16S_Rev_bc1035.fastq       5396      5198
## G30_6M-16S_For_bc1015-16S_Rev_bc1059.fastq      15107     14399
## G30_6W-16S_For_bc1010-16S_Rev_bc1044.fastq       5063      4850
## G31_3M-16S_For_bc1007-16S_Rev_bc1044.fastq       8166      7808
## G31_6M-16S_For_bc1003-16S_Rev_bc1045.fastq      12766     12305
## G33_3M-16S_For_bc1012-16S_Rev_bc1075.fastq       9720      9250
## G33_6W-16S_For_bc1022-16S_Rev_bc1059.fastq      32018     30731
## G34_3M-16S_For_bc1015-16S_Rev_bc1054.fastq      13918     13460
## G34_6W-16S_For_bc1012-16S_Rev_bc1065.fastq      11252     10793
## G35_3M-16S_For_bc1002-16S_Rev_bc1059.fastq      14462     13816
## G35_6M-16S_For_bc1012-16S_Rev_bc1045.fastq      11166     10840
## G35_6W-16S_For_bc1020-16S_Rev_bc1057.fastq      20704     20052
## G36_3M-16S_For_bc1024-16S_Rev_bc1075.fastq       1742      1689
## G36_6M-16S_For_bc1002-16S_Rev_bc1035.fastq      10974     10593
## G37_6W-16S_For_bc1008-16S_Rev_bc1044.fastq       7774      7529
## G37_PT1-16S_For_bc1007-16S_Rev_bc1035.fastq      6586      6268
## G38_3M-16S_For_bc1008-16S_Rev_bc1059.fastq      22719     22104
## G38_6W-16S_For_bc1002-16S_Rev_bc1054.fastq      10531     10161
## G39_6M-16S_For_bc1012-16S_Rev_bc1062.fastq      12035     11607
## G4_3M-16S_For_bc1015-16S_Rev_bc1062.fastq       14319     13792
## G4_3MR-16S_For_bc1015-16S_Rev_bc1045.fastq       8382      8118
## G4_6W-16S_For_bc1009-16S_Rev_bc1033.fastq       15042     14464
## G40_3M-16S_For_bc1010-16S_Rev_bc1045.fastq       6145      5895
## G40_6M-16S_For_bc1002-16S_Rev_bc1033.fastq       9032      8590
## G40_6W-16S_For_bc1007-16S_Rev_bc1060.fastq      38493     36832
## G41_3M-16S_For_bc1022-16S_Rev_bc1056.fastq      13574     13092
## G42_3M-16S_For_bc1002-16S_Rev_bc1062.fastq       8966      8701
## G42_6M-16S_For_bc1005-16S_Rev_bc1035.fastq      10643     10319
## G43_6M-16S_For_bc1010-16S_Rev_bc1033.fastq      12052     11712
## G43_6W-16S_For_bc1008-16S_Rev_bc1075.fastq       8199      7906
## G45_3M-16S_For_bc1024-16S_Rev_bc1045.fastq      14892     14414
## G45_6W-16S_For_bc1008-16S_Rev_bc1056.fastq      13874     13409
## G45_MV-16S_For_bc1024-16S_Rev_bc1057.fastq      36903     35692
## G46_3M-16S_For_bc1015-16S_Rev_bc1044.fastq       8049      7787
## G46_3MR-16S_For_bc1020-16S_Rev_bc1044.fastq      9794      9508
## G46_6M-16S_For_bc1022-16S_Rev_bc1065.fastq      13687     13214
## G46_6W-16S_For_bc1007-16S_Rev_bc1062.fastq      43273     41786
## G47_3M-16S_For_bc1005-16S_Rev_bc1062.fastq      35224     34188
## G47_6W-16S_For_bc1011-16S_Rev_bc1045.fastq       8952      8553
## G48_3M-16S_For_bc1015-16S_Rev_bc1057.fastq      28975     27619
## G48_6M-16S_For_bc1004-16S_Rev_bc1033.fastq       5670      5458
## G48_6W-16S_For_bc1012-16S_Rev_bc1060.fastq      27476     26595
## G5_3M-16S_For_bc1007-16S_Rev_bc1054.fastq       13617     12959
## G5_MV1-16S_For_bc1003-16S_Rev_bc1033.fastq      11188     10758
## G50_3M-16S_For_bc1020-16S_Rev_bc1062.fastq      12819     12087
## G50_6M-16S_For_bc1007-16S_Rev_bc1059.fastq      25294     24303
## G50_6W-16S_For_bc1004-16S_Rev_bc1044.fastq       8042      7739
## G51_6W-16S_For_bc1024-16S_Rev_bc1044.fastq      12019     11606
## G52_3M-16S_For_bc1024-16S_Rev_bc1062.fastq      31224     30147
## G52_6M-16S_For_bc1022-16S_Rev_bc1057.fastq      20150     19490
## G52_6W-16S_For_bc1020-16S_Rev_bc1059.fastq      32570     31452
## G7_3M-16S_For_bc1008-16S_Rev_bc1033.fastq        7209      6948
## G7_3MR-16S_For_bc1015-16S_Rev_bc1056.fastq       9982      9473
## G7_6M-16S_For_bc1007-16S_Rev_bc1033.fastq        8280      7868
## G7_MV2-16S_For_bc1022-16S_Rev_bc1044.fastq      10865     10543
## G7_PT1-16S_For_bc1005-16S_Rev_bc1057.fastq      22347     21708
## G8_3M-16S_For_bc1002-16S_Rev_bc1060.fastq       11882     11480
## G8_6W-16S_For_bc1007-16S_Rev_bc1065.fastq       31661     30722
## G9_3M-16S_For_bc1012-16S_Rev_bc1056.fastq       14554     14101
## G9_6M-16S_For_bc1022-16S_Rev_bc1075.fastq       10922     10574
## G9_6W-16S_For_bc1011-16S_Rev_bc1033.fastq       14336     13857
## MC1-16S_For_bc1009-16S_Rev_bc1045.fastq          5825      5670
## MC2-16S_For_bc1022-16S_Rev_bc1045.fastq         11742     11456
## MC3-16S_For_bc1020-16S_Rev_bc1033.fastq         10879     10593
## dada2
# Dereplicate the filtered cell 2 reads unless drp2 is already defined.
if (!exists("drp2")) {
  drp2 <- derepFastq(filts2, verbose = TRUE)
} else {
  print("cached file loaded: drp2")
}
## [1] "cached file loaded: drp2"
## Learn errors
# Fit the cell 2 error model from the dereplicated reads, using PacBioErrfun as
# the error estimation function, unless err2 is already defined.
if (!exists("err2")) {
  err2 <- learnErrors(
    drp2,
    errorEstimationFunction = PacBioErrfun,
    BAND_SIZE = 32,
    multithread = TRUE
  )
} else {
  print("cached file loaded: err2")
}
## [1] "cached file loaded: err2"
## Plot errors
plotErrors(err2)

## Denoise
# Denoise the cell 2 reads with the cell 2 error model unless dd2 is already
# defined.
if (!exists("dd2")) {
  dd2 <- dada(drp2, err = err2, BAND_SIZE = 32, multithread = TRUE)
} else {
  print("cached file loaded: dd2")
}
## [1] "cached file loaded: dd2"
# Read-tracking table, one row per file: the two columns of prim2, the second
# column of track2, and the summed `denoised` counts of each dd2 entry.
cbind(
  ccs = prim2[, 1],
  primers = prim2[, 2],
  filtered = track2[, 2],
  denoised = sapply(dd2, function(res) sum(res$denoised))
)
##                                                ccs primers filtered denoised
## G1_6M-16S_For_bc1002-16S_Rev_bc1045.fastq     9956    8573     8193     7677
## G1_6W-16S_For_bc1006-16S_Rev_bc1045.fastq    15171   13098    12556    12312
## G10_3M-16S_For_bc1020-16S_Rev_bc1035.fastq    7690    7449     7199     7048
## G10_6M-16S_For_bc1022-16S_Rev_bc1033.fastq    8208    6644     6358     6119
## G10_6W-16S_For_bc1005-16S_Rev_bc1060.fastq   26955   24270    23350    22855
## G11_3M-16S_For_bc1012-16S_Rev_bc1035.fastq    8927    7724     7469     7390
## G11_6M-16S_For_bc1015-16S_Rev_bc1033.fastq    6308    5374     5162     4931
## G11_6W-16S_For_bc1012-16S_Rev_bc1057.fastq   20465   18849    18281    18134
## G12_3M-16S_For_bc1005-16S_Rev_bc1054.fastq   16845   15761    15239    15085
## G12_6W-16S_For_bc1006-16S_Rev_bc1033.fastq   22636   19580    18906    18564
## G12_MV1-16S_For_bc1024-16S_Rev_bc1056.fastq  14875   14336    13723    13388
## G12_PT1-16S_For_bc1002-16S_Rev_bc1044.fastq   9296    7902     7069     6798
## G13_3M_2-16S_For_bc1008-16S_Rev_bc1060.fastq 23067   20751    20060    19703
## G13_3M-16S_For_bc1024-16S_Rev_bc1059.fastq   38024   36468    35328    34848
## G13_6M-16S_For_bc1011-16S_Rev_bc1035.fastq   15874   14776    14195    13665
## G13_6W-16S_For_bc1020-16S_Rev_bc1054.fastq   16652   15984    15327    15210
## G14_3M-16S_For_bc1005-16S_Rev_bc1033.fastq   10963   10283     9994     9931
## G14_6M-16S_For_bc1015-16S_Rev_bc1075.fastq   11209    9512     9209     9049
## G14_6W-16S_For_bc1012-16S_Rev_bc1033.fastq    7114    6117     5901     5708
## G14_PT-16S_For_bc1002-16S_Rev_bc1057.fastq   18353   16040    15508    15084
## G15_3M-16S_For_bc1022-16S_Rev_bc1035.fastq   11263    9524     9223     9087
## G15_6W-16S_For_bc1024-16S_Rev_bc1035.fastq   12819   12369    11982    11895
## G15_MV1-16S_For_bc1024-16S_Rev_bc1033.fastq  13149   12637    12239    12125
## G16_6M-16S_For_bc1005-16S_Rev_bc1056.fastq   15686   14144    13654    13244
## G17_3M-16S_For_bc1015-16S_Rev_bc1065.fastq   12748   11185    10819    10629
## G17_6M-16S_For_bc1012-16S_Rev_bc1044.fastq    7852    6461     6245     6122
## G17_6W-16S_For_bc1005-16S_Rev_bc1045.fastq   11204   10192     9804     9713
## G19_3M-16S_For_bc1004-16S_Rev_bc1045.fastq    9260    8370     8062     7918
## G19_6W-16S_For_bc1010-16S_Rev_bc1035.fastq    7318    6468     6276     6167
## G19_6WR-16S_For_bc1022-16S_Rev_bc1062.fastq  20927   17768    17149    16882
## G19_MV3-16S_For_bc1005-16S_Rev_bc1075.fastq  20300   19458    18893    18693
## G2_3M-16S_For_bc1004-16S_Rev_bc1035.fastq    11529   10049     9592     9335
## G2_6M-16S_For_bc1011-16S_Rev_bc1044.fastq    12265   11187    10634    10229
## G2_6W-16S_For_bc1020-16S_Rev_bc1065.fastq    14000   13688    13162    12753
## G2_PT-16S_For_bc1024-16S_Rev_bc1065.fastq    49738   48406    47058    46837
## G20_6W-16S_For_bc1024-16S_Rev_bc1054.fastq   21802   21082    20482    20279
## G20_MV1-16S_For_bc1005-16S_Rev_bc1059.fastq  33009   30639    29605    29366
## G20_PT1-16S_For_bc1020-16S_Rev_bc1045.fastq  11211   10940    10636    10548
## G20_PT2-16S_For_bc1008-16S_Rev_bc1045.fastq   9842    9184     8883     8796
## G20_PT2R-16S_For_bc1007-16S_Rev_bc1045.fastq 13178   12712    12231    12101
## G21_3M-16S_For_bc1005-16S_Rev_bc1044.fastq    8592    7586     7297     7189
## G21_6W-16S_For_bc1007-16S_Rev_bc1057.fastq   35874   34339    32955    32573
## G22_3M-16S_For_bc1020-16S_Rev_bc1075.fastq    6466    6207     5962     5862
## G22_6W-16S_For_bc1006-16S_Rev_bc1044.fastq   15463   12863    12421    12204
## G22_MV-16S_For_bc1003-16S_Rev_bc1044.fastq   10515    9481     9111     8890
## G23_3M-16S_For_bc1008-16S_Rev_bc1035.fastq    7621    6806     6610     6548
## G24_3M-16S_For_bc1020-16S_Rev_bc1056.fastq   16376   15702    15194    15103
## G24_6M-16S_For_bc1007-16S_Rev_bc1075.fastq   30912   29319    28069    27861
## G24_6W-16S_For_bc1008-16S_Rev_bc1062.fastq   25686   23589    22893    22832
## G25_3M-16S_For_bc1012-16S_Rev_bc1054.fastq   17087   14942    14470    14385
## G25_6W-16S_For_bc1008-16S_Rev_bc1065.fastq   23858   22253    21637    21435
## G26_3M-16S_For_bc1024-16S_Rev_bc1060.fastq   36646   35333    34289    33868
## G26_6M-16S_For_bc1020-16S_Rev_bc1060.fastq   29700   28500    27598    27420
## G26_6W-16S_For_bc1006-16S_Rev_bc1035.fastq   13644   11800    11357    10833
## G27_6M-16S_For_bc1002-16S_Rev_bc1056.fastq    7346    6288     5954     5677
## G27_6W-16S_For_bc1008-16S_Rev_bc1057.fastq   19118   17186    16559    16477
## G28_6M-16S_For_bc1022-16S_Rev_bc1060.fastq   36575   32341    31348    31166
## G29_3M-16S_For_bc1005-16S_Rev_bc1065.fastq   11152   10780    10478    10419
## G29_6W-16S_For_bc1008-16S_Rev_bc1054.fastq   15797   14314    13880    13754
## G3_3M-16S_For_bc1015-16S_Rev_bc1060.fastq    13290   11538    11038    10508
## G3_6M-16S_For_bc1009-16S_Rev_bc1044.fastq     8601    7708     7472     7243
## G3_6W-16S_For_bc1012-16S_Rev_bc1059.fastq    34126   30970    29990    29814
## G30_3M-16S_For_bc1009-16S_Rev_bc1035.fastq    5893    5396     5198     5094
## G30_6M-16S_For_bc1015-16S_Rev_bc1059.fastq   17542   15107    14399    13828
## G30_6W-16S_For_bc1010-16S_Rev_bc1044.fastq    5950    5063     4850     4753
## G31_3M-16S_For_bc1007-16S_Rev_bc1044.fastq    8788    8166     7808     7512
## G31_6M-16S_For_bc1003-16S_Rev_bc1045.fastq   13890   12766    12305    12019
## G33_3M-16S_For_bc1012-16S_Rev_bc1075.fastq   11590    9720     9250     9054
## G33_6W-16S_For_bc1022-16S_Rev_bc1059.fastq   37586   32018    30731    30517
## G34_3M-16S_For_bc1015-16S_Rev_bc1054.fastq   15656   13918    13460    12827
## G34_6W-16S_For_bc1012-16S_Rev_bc1065.fastq   13309   11252    10793    10584
## G35_3M-16S_For_bc1002-16S_Rev_bc1059.fastq   17162   14462    13816    13724
## G35_6M-16S_For_bc1012-16S_Rev_bc1045.fastq   11984   11166    10840    10709
## G35_6W-16S_For_bc1020-16S_Rev_bc1057.fastq   21383   20704    20052    19967
## G36_3M-16S_For_bc1024-16S_Rev_bc1075.fastq    1846    1742     1689     1633
## G36_6M-16S_For_bc1002-16S_Rev_bc1035.fastq   12792   10974    10593    10414
## G37_6W-16S_For_bc1008-16S_Rev_bc1044.fastq    8773    7774     7529     7354
## G37_PT1-16S_For_bc1007-16S_Rev_bc1035.fastq   6874    6586     6268     5959
## G38_3M-16S_For_bc1008-16S_Rev_bc1059.fastq   25388   22719    22104    21842
## G38_6W-16S_For_bc1002-16S_Rev_bc1054.fastq   11917   10531    10161     9885
## G39_6M-16S_For_bc1012-16S_Rev_bc1062.fastq   13919   12035    11607    11185
## G4_3M-16S_For_bc1015-16S_Rev_bc1062.fastq    16094   14319    13792    13454
## G4_3MR-16S_For_bc1015-16S_Rev_bc1045.fastq    9347    8382     8118     7891
## G4_6W-16S_For_bc1009-16S_Rev_bc1033.fastq    16346   15042    14464    14173
## G40_3M-16S_For_bc1010-16S_Rev_bc1045.fastq    7039    6145     5895     5780
## G40_6M-16S_For_bc1002-16S_Rev_bc1033.fastq   10648    9032     8590     8139
## G40_6W-16S_For_bc1007-16S_Rev_bc1060.fastq   40598   38493    36832    36338
## G41_3M-16S_For_bc1022-16S_Rev_bc1056.fastq   16163   13574    13092    12899
## G42_3M-16S_For_bc1002-16S_Rev_bc1062.fastq    9886    8966     8701     8609
## G42_6M-16S_For_bc1005-16S_Rev_bc1035.fastq   11208   10643    10319    10176
## G43_6M-16S_For_bc1010-16S_Rev_bc1033.fastq   13646   12052    11712    11401
## G43_6W-16S_For_bc1008-16S_Rev_bc1075.fastq    9625    8199     7906     7657
## G45_3M-16S_For_bc1024-16S_Rev_bc1045.fastq   15322   14892    14414    14329
## G45_6W-16S_For_bc1008-16S_Rev_bc1056.fastq   15539   13874    13409    13309
## G45_MV-16S_For_bc1024-16S_Rev_bc1057.fastq   38060   36903    35692    35491
## G46_3M-16S_For_bc1015-16S_Rev_bc1044.fastq    9243    8049     7787     7603
## G46_3MR-16S_For_bc1020-16S_Rev_bc1044.fastq  10302    9794     9508     9447
## G46_6M-16S_For_bc1022-16S_Rev_bc1065.fastq   15449   13687    13214    12726
## G46_6W-16S_For_bc1007-16S_Rev_bc1062.fastq   44832   43273    41786    40924
## G47_3M-16S_For_bc1005-16S_Rev_bc1062.fastq   36984   35224    34188    33964
## G47_6W-16S_For_bc1011-16S_Rev_bc1045.fastq    9555    8952     8553     8411
## G48_3M-16S_For_bc1015-16S_Rev_bc1057.fastq   32941   28975    27619    27135
## G48_6M-16S_For_bc1004-16S_Rev_bc1033.fastq    6578    5670     5458     5223
## G48_6W-16S_For_bc1012-16S_Rev_bc1060.fastq   31592   27476    26595    26499
## G5_3M-16S_For_bc1007-16S_Rev_bc1054.fastq    14313   13617    12959    12465
## G5_MV1-16S_For_bc1003-16S_Rev_bc1033.fastq   12120   11188    10758    10440
## G50_3M-16S_For_bc1020-16S_Rev_bc1062.fastq   13138   12819    12087    11412
## G50_6M-16S_For_bc1007-16S_Rev_bc1059.fastq   26594   25294    24303    23350
## G50_6W-16S_For_bc1004-16S_Rev_bc1044.fastq    9464    8042     7739     7530
## G51_6W-16S_For_bc1024-16S_Rev_bc1044.fastq   12695   12019    11606    11501
## G52_3M-16S_For_bc1024-16S_Rev_bc1062.fastq   32075   31224    30147    30120
## G52_6M-16S_For_bc1022-16S_Rev_bc1057.fastq   23149   20150    19490    18902
## G52_6W-16S_For_bc1020-16S_Rev_bc1059.fastq   33901   32570    31452    31312
## G7_3M-16S_For_bc1008-16S_Rev_bc1033.fastq     7801    7209     6948     6878
## G7_3MR-16S_For_bc1015-16S_Rev_bc1056.fastq   11519    9982     9473     9301
## G7_6M-16S_For_bc1007-16S_Rev_bc1033.fastq     8624    8280     7868     7576
## G7_MV2-16S_For_bc1022-16S_Rev_bc1044.fastq   12752   10865    10543    10456
## G7_PT1-16S_For_bc1005-16S_Rev_bc1057.fastq   23346   22347    21708    21499
## G8_3M-16S_For_bc1002-16S_Rev_bc1060.fastq    13648   11882    11480    11189
## G8_6W-16S_For_bc1007-16S_Rev_bc1065.fastq    32853   31661    30722    30378
## G9_3M-16S_For_bc1012-16S_Rev_bc1056.fastq    16918   14554    14101    14024
## G9_6M-16S_For_bc1022-16S_Rev_bc1075.fastq    12430   10922    10574    10410
## G9_6W-16S_For_bc1011-16S_Rev_bc1033.fastq    15472   14336    13857    13650
## MC1-16S_For_bc1009-16S_Rev_bc1045.fastq       6280    5825     5670     5572
## MC2-16S_For_bc1022-16S_Rev_bc1045.fastq      13094   11742    11456    11359
## MC3-16S_For_bc1020-16S_Rev_bc1033.fastq      11274   10879    10593    10533
## Sequence table for cell 2; when it has to be built, all cell 2 intermediates
## are saved to disk so later runs can skip the expensive steps above
if (exists("st2")){
  print("cached file loaded: st2")
} else{
  st2 <- makeSequenceTable(dd2)
  ## Inside a braced branch only the final value can auto-print, so the table
  ## dimensions must be printed explicitly to show up in the output
  print(dim(st2))

  save(lens.fn2, dd2, drp2, err2, prim2, track2, st2, file="/blue/mulligan/duttonc/Congo/output/cell2_output.Rdata")
}
## [1] "cached file loaded: st2"

Combine sequence tables

## A labelling error at the sequencing facility means the sample delivered as
## "G13_3M_2" has to be renamed "G37_3M"; relabel it in both per-cell tables
## before they are merged
rownames(st1) <- gsub("G13_3M_2", "G37_3M", rownames(st1), fixed = TRUE)
rownames(st2) <- gsub("G13_3M_2", "G37_3M", rownames(st2), fixed = TRUE)

## Merge the two cells into one table; repeats = "sum" adds up the counts of
## samples that appear in both tables
stcomb <- mergeSequenceTables(st1, st2, repeats = "sum")

## List the merged sample names -- there should still be 126 of them
print(rownames(stcomb))
##   [1] "G1_6M-16S_For_bc1002-16S_Rev_bc1045.fastq"   
##   [2] "G1_6W-16S_For_bc1006-16S_Rev_bc1045.fastq"   
##   [3] "G10_3M-16S_For_bc1020-16S_Rev_bc1035.fastq"  
##   [4] "G10_6M-16S_For_bc1022-16S_Rev_bc1033.fastq"  
##   [5] "G10_6W-16S_For_bc1005-16S_Rev_bc1060.fastq"  
##   [6] "G11_3M-16S_For_bc1012-16S_Rev_bc1035.fastq"  
##   [7] "G11_6M-16S_For_bc1015-16S_Rev_bc1033.fastq"  
##   [8] "G11_6W-16S_For_bc1012-16S_Rev_bc1057.fastq"  
##   [9] "G12_3M-16S_For_bc1005-16S_Rev_bc1054.fastq"  
##  [10] "G12_6W-16S_For_bc1006-16S_Rev_bc1033.fastq"  
##  [11] "G12_MV1-16S_For_bc1024-16S_Rev_bc1056.fastq" 
##  [12] "G12_PT1-16S_For_bc1002-16S_Rev_bc1044.fastq" 
##  [13] "G37_3M-16S_For_bc1008-16S_Rev_bc1060.fastq"  
##  [14] "G13_3M-16S_For_bc1024-16S_Rev_bc1059.fastq"  
##  [15] "G13_6M-16S_For_bc1011-16S_Rev_bc1035.fastq"  
##  [16] "G13_6W-16S_For_bc1020-16S_Rev_bc1054.fastq"  
##  [17] "G14_3M-16S_For_bc1005-16S_Rev_bc1033.fastq"  
##  [18] "G14_6M-16S_For_bc1015-16S_Rev_bc1075.fastq"  
##  [19] "G14_6W-16S_For_bc1012-16S_Rev_bc1033.fastq"  
##  [20] "G14_PT-16S_For_bc1002-16S_Rev_bc1057.fastq"  
##  [21] "G15_3M-16S_For_bc1022-16S_Rev_bc1035.fastq"  
##  [22] "G15_6W-16S_For_bc1024-16S_Rev_bc1035.fastq"  
##  [23] "G15_MV1-16S_For_bc1024-16S_Rev_bc1033.fastq" 
##  [24] "G16_6M-16S_For_bc1005-16S_Rev_bc1056.fastq"  
##  [25] "G17_3M-16S_For_bc1015-16S_Rev_bc1065.fastq"  
##  [26] "G17_6M-16S_For_bc1012-16S_Rev_bc1044.fastq"  
##  [27] "G17_6W-16S_For_bc1005-16S_Rev_bc1045.fastq"  
##  [28] "G19_3M-16S_For_bc1004-16S_Rev_bc1045.fastq"  
##  [29] "G19_6W-16S_For_bc1010-16S_Rev_bc1035.fastq"  
##  [30] "G19_6WR-16S_For_bc1022-16S_Rev_bc1062.fastq" 
##  [31] "G19_MV3-16S_For_bc1005-16S_Rev_bc1075.fastq" 
##  [32] "G2_3M-16S_For_bc1004-16S_Rev_bc1035.fastq"   
##  [33] "G2_6M-16S_For_bc1011-16S_Rev_bc1044.fastq"   
##  [34] "G2_6W-16S_For_bc1020-16S_Rev_bc1065.fastq"   
##  [35] "G2_PT-16S_For_bc1024-16S_Rev_bc1065.fastq"   
##  [36] "G20_6W-16S_For_bc1024-16S_Rev_bc1054.fastq"  
##  [37] "G20_MV1-16S_For_bc1005-16S_Rev_bc1059.fastq" 
##  [38] "G20_PT1-16S_For_bc1020-16S_Rev_bc1045.fastq" 
##  [39] "G20_PT2-16S_For_bc1008-16S_Rev_bc1045.fastq" 
##  [40] "G20_PT2R-16S_For_bc1007-16S_Rev_bc1045.fastq"
##  [41] "G21_3M-16S_For_bc1005-16S_Rev_bc1044.fastq"  
##  [42] "G21_6W-16S_For_bc1007-16S_Rev_bc1057.fastq"  
##  [43] "G22_3M-16S_For_bc1020-16S_Rev_bc1075.fastq"  
##  [44] "G22_6W-16S_For_bc1006-16S_Rev_bc1044.fastq"  
##  [45] "G22_MV-16S_For_bc1003-16S_Rev_bc1044.fastq"  
##  [46] "G23_3M-16S_For_bc1008-16S_Rev_bc1035.fastq"  
##  [47] "G24_3M-16S_For_bc1020-16S_Rev_bc1056.fastq"  
##  [48] "G24_6M-16S_For_bc1007-16S_Rev_bc1075.fastq"  
##  [49] "G24_6W-16S_For_bc1008-16S_Rev_bc1062.fastq"  
##  [50] "G25_3M-16S_For_bc1012-16S_Rev_bc1054.fastq"  
##  [51] "G25_6W-16S_For_bc1008-16S_Rev_bc1065.fastq"  
##  [52] "G26_3M-16S_For_bc1024-16S_Rev_bc1060.fastq"  
##  [53] "G26_6M-16S_For_bc1020-16S_Rev_bc1060.fastq"  
##  [54] "G26_6W-16S_For_bc1006-16S_Rev_bc1035.fastq"  
##  [55] "G27_6M-16S_For_bc1002-16S_Rev_bc1056.fastq"  
##  [56] "G27_6W-16S_For_bc1008-16S_Rev_bc1057.fastq"  
##  [57] "G28_6M-16S_For_bc1022-16S_Rev_bc1060.fastq"  
##  [58] "G29_3M-16S_For_bc1005-16S_Rev_bc1065.fastq"  
##  [59] "G29_6W-16S_For_bc1008-16S_Rev_bc1054.fastq"  
##  [60] "G3_3M-16S_For_bc1015-16S_Rev_bc1060.fastq"   
##  [61] "G3_6M-16S_For_bc1009-16S_Rev_bc1044.fastq"   
##  [62] "G3_6W-16S_For_bc1012-16S_Rev_bc1059.fastq"   
##  [63] "G30_3M-16S_For_bc1009-16S_Rev_bc1035.fastq"  
##  [64] "G30_6M-16S_For_bc1015-16S_Rev_bc1059.fastq"  
##  [65] "G30_6W-16S_For_bc1010-16S_Rev_bc1044.fastq"  
##  [66] "G31_3M-16S_For_bc1007-16S_Rev_bc1044.fastq"  
##  [67] "G31_6M-16S_For_bc1003-16S_Rev_bc1045.fastq"  
##  [68] "G33_3M-16S_For_bc1012-16S_Rev_bc1075.fastq"  
##  [69] "G33_6W-16S_For_bc1022-16S_Rev_bc1059.fastq"  
##  [70] "G34_3M-16S_For_bc1015-16S_Rev_bc1054.fastq"  
##  [71] "G34_6W-16S_For_bc1012-16S_Rev_bc1065.fastq"  
##  [72] "G35_3M-16S_For_bc1002-16S_Rev_bc1059.fastq"  
##  [73] "G35_6M-16S_For_bc1012-16S_Rev_bc1045.fastq"  
##  [74] "G35_6W-16S_For_bc1020-16S_Rev_bc1057.fastq"  
##  [75] "G36_3M-16S_For_bc1024-16S_Rev_bc1075.fastq"  
##  [76] "G36_6M-16S_For_bc1002-16S_Rev_bc1035.fastq"  
##  [77] "G37_6W-16S_For_bc1008-16S_Rev_bc1044.fastq"  
##  [78] "G37_PT1-16S_For_bc1007-16S_Rev_bc1035.fastq" 
##  [79] "G38_3M-16S_For_bc1008-16S_Rev_bc1059.fastq"  
##  [80] "G38_6W-16S_For_bc1002-16S_Rev_bc1054.fastq"  
##  [81] "G39_6M-16S_For_bc1012-16S_Rev_bc1062.fastq"  
##  [82] "G4_3M-16S_For_bc1015-16S_Rev_bc1062.fastq"   
##  [83] "G4_3MR-16S_For_bc1015-16S_Rev_bc1045.fastq"  
##  [84] "G4_6W-16S_For_bc1009-16S_Rev_bc1033.fastq"   
##  [85] "G40_3M-16S_For_bc1010-16S_Rev_bc1045.fastq"  
##  [86] "G40_6M-16S_For_bc1002-16S_Rev_bc1033.fastq"  
##  [87] "G40_6W-16S_For_bc1007-16S_Rev_bc1060.fastq"  
##  [88] "G41_3M-16S_For_bc1022-16S_Rev_bc1056.fastq"  
##  [89] "G42_3M-16S_For_bc1002-16S_Rev_bc1062.fastq"  
##  [90] "G42_6M-16S_For_bc1005-16S_Rev_bc1035.fastq"  
##  [91] "G43_6M-16S_For_bc1010-16S_Rev_bc1033.fastq"  
##  [92] "G43_6W-16S_For_bc1008-16S_Rev_bc1075.fastq"  
##  [93] "G45_3M-16S_For_bc1024-16S_Rev_bc1045.fastq"  
##  [94] "G45_6W-16S_For_bc1008-16S_Rev_bc1056.fastq"  
##  [95] "G45_MV-16S_For_bc1024-16S_Rev_bc1057.fastq"  
##  [96] "G46_3M-16S_For_bc1015-16S_Rev_bc1044.fastq"  
##  [97] "G46_3MR-16S_For_bc1020-16S_Rev_bc1044.fastq" 
##  [98] "G46_6M-16S_For_bc1022-16S_Rev_bc1065.fastq"  
##  [99] "G46_6W-16S_For_bc1007-16S_Rev_bc1062.fastq"  
## [100] "G47_3M-16S_For_bc1005-16S_Rev_bc1062.fastq"  
## [101] "G47_6W-16S_For_bc1011-16S_Rev_bc1045.fastq"  
## [102] "G48_3M-16S_For_bc1015-16S_Rev_bc1057.fastq"  
## [103] "G48_6M-16S_For_bc1004-16S_Rev_bc1033.fastq"  
## [104] "G48_6W-16S_For_bc1012-16S_Rev_bc1060.fastq"  
## [105] "G5_3M-16S_For_bc1007-16S_Rev_bc1054.fastq"   
## [106] "G5_MV1-16S_For_bc1003-16S_Rev_bc1033.fastq"  
## [107] "G50_3M-16S_For_bc1020-16S_Rev_bc1062.fastq"  
## [108] "G50_6M-16S_For_bc1007-16S_Rev_bc1059.fastq"  
## [109] "G50_6W-16S_For_bc1004-16S_Rev_bc1044.fastq"  
## [110] "G51_6W-16S_For_bc1024-16S_Rev_bc1044.fastq"  
## [111] "G52_3M-16S_For_bc1024-16S_Rev_bc1062.fastq"  
## [112] "G52_6M-16S_For_bc1022-16S_Rev_bc1057.fastq"  
## [113] "G52_6W-16S_For_bc1020-16S_Rev_bc1059.fastq"  
## [114] "G7_3M-16S_For_bc1008-16S_Rev_bc1033.fastq"   
## [115] "G7_3MR-16S_For_bc1015-16S_Rev_bc1056.fastq"  
## [116] "G7_6M-16S_For_bc1007-16S_Rev_bc1033.fastq"   
## [117] "G7_MV2-16S_For_bc1022-16S_Rev_bc1044.fastq"  
## [118] "G7_PT1-16S_For_bc1005-16S_Rev_bc1057.fastq"  
## [119] "G8_3M-16S_For_bc1002-16S_Rev_bc1060.fastq"   
## [120] "G8_6W-16S_For_bc1007-16S_Rev_bc1065.fastq"   
## [121] "G9_3M-16S_For_bc1012-16S_Rev_bc1056.fastq"   
## [122] "G9_6M-16S_For_bc1022-16S_Rev_bc1075.fastq"   
## [123] "G9_6W-16S_For_bc1011-16S_Rev_bc1033.fastq"   
## [124] "MC1-16S_For_bc1009-16S_Rev_bc1045.fastq"     
## [125] "MC2-16S_For_bc1022-16S_Rev_bc1045.fastq"     
## [126] "MC3-16S_For_bc1020-16S_Rev_bc1033.fastq"

Assign Taxonomy

## Assign taxonomy, download appropriate reference database from here - https://benjjneb.github.io/dada2/training.html

## Please note, assignTaxonomy with the species training set is recommended over the other options.
## https://github.com/benjjneb/dada2/issues/1319#issuecomment-820659005

## Combined cells: restore the cached objects from a previous run, if the file
## exists (it is written further down together with bim, seqtab and tax)
if (file.exists("/blue/mulligan/duttonc/Congo/output/cellcombined_output.Rdata")) {
  load("/blue/mulligan/duttonc/Congo/output/cellcombined_output.Rdata")
}

## Assign taxonomy to every sequence in the combined table, unless tax is
## already available in the session
if (!exists("tax")) {
  tax <- assignTaxonomy(
    stcomb,
    "/blue/mulligan/duttonc/Congo/tax/silva_nr99_v138.1_wSpecies_train_set.fa.gz",
    multithread = TRUE
  ) # Slowest part
  # Reformat to be compatible with other data sources
  tax[, "Genus"] <- gsub("Escherichia-Shigella", "Escherichia", tax[, "Genus"])
} else {
  print("cached file loaded: tax")
}
## [1] "cached file loaded: tax"
## Preview the first assignments without the (long) sequence row names
head(unname(tax))
##      [,1]       [,2]               [,3]             [,4]               
## [1,] "Bacteria" "Actinobacteriota" "Actinobacteria" "Bifidobacteriales"
## [2,] "Bacteria" "Actinobacteriota" "Actinobacteria" "Bifidobacteriales"
## [3,] "Bacteria" "Firmicutes"       "Bacilli"        "Lactobacillales"  
## [4,] "Bacteria" "Actinobacteriota" "Actinobacteria" "Bifidobacteriales"
## [5,] "Bacteria" "Actinobacteriota" "Actinobacteria" "Bifidobacteriales"
## [6,] "Bacteria" "Actinobacteriota" "Actinobacteria" "Bifidobacteriales"
##      [,5]                 [,6]              [,7]        
## [1,] "Bifidobacteriaceae" "Bifidobacterium" "longum"    
## [2,] "Bifidobacteriaceae" "Bifidobacterium" "longum"    
## [3,] "Streptococcaceae"   "Streptococcus"   "salivarius"
## [4,] "Bifidobacteriaceae" "Bifidobacterium" "breve"     
## [5,] "Bifidobacteriaceae" "Bifidobacterium" "longum"    
## [6,] "Bifidobacteriaceae" "Bifidobacterium" "longum"
## Flag bimeric (chimeric) sequences in the combined table, unless bim is
## already available in the session
if (!exists("bim")) {
  bim <- isBimeraDenovo(
    stcomb,
    minFoldParentOverAbundance = 3.5,
    multithread = TRUE
  )
} else {
  print("cached file loaded: bim")
}
## [1] "cached file loaded: bim"

What proportion are chimeras?

## Check Chimeras
## `bim` is provided by the cache-aware step above; it is deliberately not
## recomputed here, since that would repeat the expensive isBimeraDenovo() call
## on every run and make the cached copy pointless.
## Number of sequences flagged (TRUE) / not flagged (FALSE) as bimeras
table(bim)
## bim
## FALSE  TRUE 
##  3449   884
## Fraction of all reads that belong to flagged sequences
sum(stcomb[,bim])/sum(stcomb)
## [1] 0.03357118
## Extract Sample Names
## Sample name = file name up to the first "-"
## (e.g. "G1_6M-16S_For_bc1002-16S_Rev_bc1045.fastq" -> "G1_6M").
## basename() drops the directory part, so the result no longer depends on how
## many folders deep the fastq files live (previously the 7th "/"-separated
## field was hard-coded) or on the folder names being free of "-".
sample.names <- sub("-.*$", "", basename(fns1))
## The names are applied to stcomb by position, so there must be one per row
stopifnot(length(sample.names) == nrow(stcomb))
## NOTE: the names come from the original cell 1 file names, so the facility's
## "G13_3M_2" label reappears here; it is corrected again on the OTU table
## when the phyloseq object is built.
rownames(stcomb) <- sample.names
sample.names
##   [1] "G1_6M"    "G1_6W"    "G10_3M"   "G10_6M"   "G10_6W"   "G11_3M"  
##   [7] "G11_6M"   "G11_6W"   "G12_3M"   "G12_6W"   "G12_MV1"  "G12_PT1" 
##  [13] "G13_3M_2" "G13_3M"   "G13_6M"   "G13_6W"   "G14_3M"   "G14_6M"  
##  [19] "G14_6W"   "G14_PT"   "G15_3M"   "G15_6W"   "G15_MV1"  "G16_6M"  
##  [25] "G17_3M"   "G17_6M"   "G17_6W"   "G19_3M"   "G19_6W"   "G19_6WR" 
##  [31] "G19_MV3"  "G2_3M"    "G2_6M"    "G2_6W"    "G2_PT"    "G20_6W"  
##  [37] "G20_MV1"  "G20_PT1"  "G20_PT2"  "G20_PT2R" "G21_3M"   "G21_6W"  
##  [43] "G22_3M"   "G22_6W"   "G22_MV"   "G23_3M"   "G24_3M"   "G24_6M"  
##  [49] "G24_6W"   "G25_3M"   "G25_6W"   "G26_3M"   "G26_6M"   "G26_6W"  
##  [55] "G27_6M"   "G27_6W"   "G28_6M"   "G29_3M"   "G29_6W"   "G3_3M"   
##  [61] "G3_6M"    "G3_6W"    "G30_3M"   "G30_6M"   "G30_6W"   "G31_3M"  
##  [67] "G31_6M"   "G33_3M"   "G33_6W"   "G34_3M"   "G34_6W"   "G35_3M"  
##  [73] "G35_6M"   "G35_6W"   "G36_3M"   "G36_6M"   "G37_6W"   "G37_PT1" 
##  [79] "G38_3M"   "G38_6W"   "G39_6M"   "G4_3M"    "G4_3MR"   "G4_6W"   
##  [85] "G40_3M"   "G40_6M"   "G40_6W"   "G41_3M"   "G42_3M"   "G42_6M"  
##  [91] "G43_6M"   "G43_6W"   "G45_3M"   "G45_6W"   "G45_MV"   "G46_3M"  
##  [97] "G46_3MR"  "G46_6M"   "G46_6W"   "G47_3M"   "G47_6W"   "G48_3M"  
## [103] "G48_6M"   "G48_6W"   "G5_3M"    "G5_MV1"   "G50_3M"   "G50_6M"  
## [109] "G50_6W"   "G51_6W"   "G52_3M"   "G52_6M"   "G52_6W"   "G7_3M"   
## [115] "G7_3MR"   "G7_6M"    "G7_MV2"   "G7_PT1"   "G8_3M"    "G8_6W"   
## [121] "G9_3M"    "G9_6M"    "G9_6W"    "MC1"      "MC2"      "MC3"
## Remove chimeras from the combined table (consensus method), unless seqtab is
## already available; a fresh result is saved together with bim and tax so that
## later runs can load all three from disk
if (!exists("seqtab")) {
  seqtab <- removeBimeraDenovo(stcomb, method = "consensus", multithread = TRUE)

  save(
    bim, seqtab, tax,
    file = "/blue/mulligan/duttonc/Congo/output/cellcombined_output.Rdata"
  )
} else {
  print("cached file loaded: seqtab")
}
## [1] "cached file loaded: seqtab"
## construct the phyloseq object

## import sample data
## The table is held in `metadata` rather than `sample_data`, so that the
## phyloseq::sample_data() constructor used below is not shadowed by a data
## frame of the same name.
metadata <- read.csv(file="/blue/mulligan/duttonc/Congo/Congo_metadata_V4.csv", header=TRUE, sep=",")
rownames(metadata) <- metadata$sampleId

otuforsamples <- otu_table(seqtab, taxa_are_rows = FALSE)

## fix the G13_3M_2 samples to G37_3M.
rownames(otuforsamples) <- gsub("G13_3M_2", "G37_3M", rownames(otuforsamples))

## Report samples that have counts but no metadata row, so they are noticed
## here rather than only as a smaller sample count in the final object
unmatched <- setdiff(rownames(otuforsamples), rownames(metadata))
if (length(unmatched) > 0) {
  warning(
    "Samples in the OTU table without a metadata row: ",
    paste(unmatched, collapse = ", "),
    call. = FALSE
  )
}

dataforsamples <- sample_data(metadata)

## Combine counts, sample metadata and taxonomy into one object
psCongo <- phyloseq(otuforsamples, dataforsamples, tax_table(tax))

saveRDS(psCongo, "/blue/mulligan/duttonc/Congo/output/psCongo_V4.rds")

psCongo
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3832 taxa and 126 samples ]
## sample_data() Sample Data:       [ 126 samples by 172 sample variables ]
## tax_table()   Taxonomy Table:    [ 3832 taxa by 7 taxonomic ranks ]
## Record the R version and the package versions in use for this run
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] gtools_3.9.3                phyloseq_1.40.0            
##  [3] gridExtra_2.3               reshape2_1.4.4             
##  [5] ggplot2_3.3.6               ShortRead_1.54.0           
##  [7] GenomicAlignments_1.32.1    SummarizedExperiment_1.26.1
##  [9] Biobase_2.56.0              MatrixGenerics_1.8.1       
## [11] matrixStats_0.62.0          Rsamtools_2.12.0           
## [13] GenomicRanges_1.48.0        BiocParallel_1.30.3        
## [15] Biostrings_2.64.1           GenomeInfoDb_1.32.4        
## [17] XVector_0.36.0              IRanges_2.30.1             
## [19] S4Vectors_0.34.0            BiocGenerics_0.42.0        
## [21] dada2_1.24.0                Rcpp_1.0.9                 
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-159           bitops_1.0-7           RColorBrewer_1.1-3    
##  [4] tools_4.2.1            bslib_0.4.0            vegan_2.6-2           
##  [7] utf8_1.2.2             R6_2.5.1               mgcv_1.8-40           
## [10] DBI_1.1.3              colorspace_2.0-3       permute_0.9-7         
## [13] rhdf5filters_1.8.0     ade4_1.7-19            withr_2.5.0           
## [16] tidyselect_1.2.0       compiler_4.2.1         cli_3.4.1             
## [19] DelayedArray_0.22.0    labeling_0.4.2         sass_0.4.2            
## [22] scales_1.2.1           stringr_1.4.1          digest_0.6.30         
## [25] rmarkdown_2.16         jpeg_0.1-9             pkgconfig_2.0.3       
## [28] htmltools_0.5.3        highr_0.9              fastmap_1.1.0         
## [31] rlang_1.0.6            rstudioapi_0.14        farver_2.1.1          
## [34] jquerylib_0.1.4        generics_0.1.3         hwriter_1.3.2.1       
## [37] jsonlite_1.8.2         dplyr_1.0.10           RCurl_1.98-1.9        
## [40] magrittr_2.0.3         GenomeInfoDbData_1.2.8 biomformat_1.24.0     
## [43] interp_1.1-3           Matrix_1.5-1           munsell_0.5.0         
## [46] Rhdf5lib_1.18.2        fansi_1.0.3            ape_5.6-2             
## [49] lifecycle_1.0.3        stringi_1.7.8          yaml_2.3.5            
## [52] MASS_7.3-58.1          zlibbioc_1.42.0        rhdf5_2.40.0          
## [55] plyr_1.8.7             grid_4.2.1             parallel_4.2.1        
## [58] crayon_1.5.2           deldir_1.0-6           lattice_0.20-45       
## [61] splines_4.2.1          multtest_2.52.0        knitr_1.40            
## [64] pillar_1.8.1           igraph_1.3.5           codetools_0.2-18      
## [67] glue_1.6.2             evaluate_0.16          latticeExtra_0.6-30   
## [70] data.table_1.14.4      RcppParallel_5.1.5     png_0.1-7             
## [73] vctrs_0.5.0            foreach_1.5.2          gtable_0.3.1          
## [76] assertthat_0.2.1       cachem_1.0.6           xfun_0.33             
## [79] survival_3.4-0         tibble_3.1.8           iterators_1.0.14      
## [82] cluster_2.1.4