Sylvain Monteux, Frida Keuper, Sébastien Fontaine, Konstantin Gavazov, Sara Hallin, Jaanis Juhanson, Eveline J. Krab, Sandrine Revaillot, Erik Verbruggen, Josefine Walz, James T. Weedon, Ellen Dorrepaal
Nature Geoscience (2020), https://dx.doi.org/10.1038/s41561-020-00662-4
This Jupyter notebook was run with the following software, which you will need to reproduce it:
# In this notebook, QIIME is installed in a conda environment called "qiime" and activated with "conda activate qiime".
# Feel free to replace that name with your own, or to remove that line if QIIME is installed without conda.
# The conda environment can be created from the environment.yml file as follows.
# This should install most packages, but some (e.g. the RDP classifier) may still need to be installed manually.
conda env create -n qiime -f ../environment.yml
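# Optional sanity check: confirm the environment exists before continuing
conda env list
# (Unverified suggestion:) the RDP classifier may be installable from bioconda, e.g.
# conda install -n qiime -c bioconda rdp_classifier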
# Download sequence data from ENA using enaGroupGet
mkdir ../reads/16s
mkdir ../reads/ITS
cd ../reads/
enaGroupGet -f submitted PRJEB29467
# If enaBrowserTools / enaGroupGet is not an option, or does not work, you can download all raw read files from ENA
# (project accession PRJEB29467) manually or through an FTP client. Note that ENA modifies the sequence headers when
# you download the fastq format, which would break the scripts below without tweaking; hence the "-f submitted" flag.
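# Rough manual alternative (a sketch assuming ENA's filereport API and its standard FTP paths;
# verify the exact links on the project page before relying on this):
wget -qO filereport.tsv 'https://www.ebi.ac.uk/ena/portal/api/filereport?accession=PRJEB29467&result=read_run&fields=submitted_ftp'
grep -o 'ftp\.sra\.ebi\.ac\.uk[^;[:space:]]*' filereport.tsv | sed 's|^|ftp://|' | xargs -n 1 wget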
mv PRJEB29467/ERR*/*.gz .
rmdir -p PRJEB29467/*
xargs mv -t 16s/ < 16s_namelist.txt
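# 16s_namelist.txt presumably lists the 16S read files, one file name per line; the ITS reads
# can be sorted into ITS/ the same way with a matching name list (ITS is not processed further here)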
cd ../16S # return to the 16S analysis folder (reads/ and 16S/ are siblings)
# The ../reads/16s/ folder should now contain all the fastq.gz files (R1 and R2 for each sample), flat, with no subdirectories
# Compressed read files should now be in the ../reads folder, separated into a 16s/ and an ITS/ folder
gunzip ../reads/16s/*
cat ../reads/16s/*R1* > Read1.fastq
cat ../reads/16s/*R2* > Read2.fastq
gzip ../reads/16s/*
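# Merge the paired reads: allow staggered overlaps, require merged reads of at least 100 bp,
# and tolerate at most 1 mismatch in the overlap region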
vsearch --fastq_mergepairs Read1.fastq --reverse Read2.fastq --fastq_allowmergestagger --fastq_minmergelen 100 --fastq_maxdiffs 1 --fastqout merged_vsearch.fastq --threads 3
# remove primers with awk script
awk -f scripts/remove_primers.awk merged_vsearch.fastq > noprimers.fastq
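# remove_primers.awk ships with the repository; purely as an illustration (the 19-nt forward
# primer length below is a placeholder, not the study's), a fixed-length trim could look like:
awk 'NR % 4 == 2 || NR % 4 == 0 { print substr($0, 20); next } { print }' merged_vsearch.fastq > noprimers_sketch.fastq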
# quality filtering with vsearch, maximum expected error 0.05
vsearch --fastq_filter noprimers.fastq --fastaout np_filtered_05.fasta --fastq_maxee 0.05
# dereplicate identical sequences; --sizeout records abundances for the size-sorted clustering below
vsearch --derep_fulllength np_filtered_05.fasta --output npf05_derep.fasta --sizeout --threads 3
# cluster the OTUs (abundance-sorted, 97% identity); this also acts as a first chimera check
vsearch --cluster_size npf05_derep.fasta --consout OTUs_Mammoth.fasta --id 0.97 --relabel 'OTU_' --threads 3
# reference-based chimera check with UCHIME against the gold database
vsearch --uchime_ref OTUs_Mammoth.fasta --db scripts/gold.fa --nonchimeras OTUs_uchimed_Mammoth.fasta --uchimeout uchimeout_Mammoth.uchime --relabel 'OTU_' --threads 6
# Relabel the sequences, giving each a unique sequence number and the label of its sample (barcode/tag)
awk -f scripts/relabel.awk np_filtered_05.fasta > npf_relabeled.fasta
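# relabel.awk is provided in the repository; as a rough sketch only (how the real script derives
# the sample tag from each header may differ), relabeling to the ';barcodelabel=' convention could look like:
awk '/^>/ { n++; print ">Seq_" n ";barcodelabel=" substr($1, 2) ";"; next } { print }' np_filtered_05.fasta > npf_relabeled_sketch.fasta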
# map the original reads back onto the OTUs
vsearch --usearch_global npf_relabeled.fasta -db OTUs_uchimed_Mammoth.fasta -id 0.97 -uc Mammoth_readmap.uc --threads 3
# convert the .uc read map into an OTU table (.otu)
python2 scripts/python_scripts/uc2otutab2.py Mammoth_readmap.uc > Mammoth_readmap.otu
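# For reference: .uc hit lines start with "H", with the query label in field 9 and the OTU in
# field 10; the converter essentially tallies reads per OTU and sample. A conceptual awk
# equivalent of that counting step (long format, assuming the barcodelabel convention above):
awk -F '\t' '$1 == "H" { split($9, q, "barcodelabel="); sub(/;.*/, "", q[2]); n[$10 "\t" q[2]]++ } END { for (k in n) print k "\t" n[k] }' Mammoth_readmap.uc | head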
# Use R to remove OTUs that are abundant in the controls
Rscript scripts/control_OTU_removal.R
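# (control_OTU_removal.R presumably reads Mammoth_readmap.otu, drops OTUs that are abundant in
# the extraction/PCR controls, and writes the cleaned table, Yedoma.otu, converted below)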
# (If using conda, activate your python2 environment with qiime installed, hereafter called 'qiime')
conda activate qiime
# validate mapping file
validate_mapping_file.py -m MonteuxKeuper_Mapping.txt -o mapvalid/ -b -p -j SampleID
# convert OTU table to biom format
biom convert -i Yedoma.otu -o Yedoma.biom --table-type="OTU table" --to-json
# prune OTUs present in fewer than 10% of samples
filter_otus_from_otu_table.py -i Yedoma.biom -o YedomaF.biom -s 4 # keep OTUs present in at least 4 of the 36 samples (~10%)
# filter the fasta file down to the OTUs retained in the pruned table
filter_fasta.py -f OTUs_uchimed_Mammoth.fasta -o Yedoma_OTUs.fasta -b YedomaF.biom
# align the sequences from the pruned OTU list
align_seqs.py -i Yedoma_OTUs.fasta -o Yedoma_pyNAST/
# Remove positions that are gaps in every sequence (common with PyNAST, since short reads are aligned against the full-length 16S gene)
filter_alignment.py -i Yedoma_pyNAST/Yedoma_OTUs_aligned.fasta
# create a new OTU table without the OTUs that could not be aligned
filter_otus_from_otu_table.py -i YedomaF.biom -o Yedoma_nofail.biom -e Yedoma_pyNAST/Yedoma_OTUs_failures.fasta
# remove the OTUs that failed alignment from the fasta file
filter_fasta.py -f OTUs_uchimed_Mammoth.fasta -o Yedoma_OTUs_nofails.fasta -b Yedoma_nofail.biom
# build a phylogenetic tree from the aligned, position-filtered OTUs
make_phylogeny.py -i Yedoma_OTUs_aligned_pfiltered.fasta
# assign taxonomy (RDP classifier, against Greengenes 13_8) to the OTUs that passed alignment
assign_taxonomy.py -m rdp -i Yedoma_OTUs_nofails.fasta -o Yedoma_gg13_8/ -t scripts/97_otu_taxonomy.txt -r scripts/97_otus.fasta --rdp_max_memory=3000
# Add taxonomy metadata to the biom tables
biom add-metadata -i Yedoma_nofail.biom -o Yedoma_nofail_ggtax.biom --observation-metadata-fp Yedoma_gg13_8/Yedoma_OTUs_nofails_tax_assignments.txt --sc-separated taxonomy --observation-header OTUID,taxonomy
# Summarize the tables
biom summarize-table -i Yedoma_nofail_ggtax.biom -o summary_Yedoma_gg.txt
# Alpha rarefaction analysis
alpha_rarefaction.py -i Yedoma_nofail_ggtax.biom -o alpha_rar_up_to_12k -m MonteuxKeuper_Mapping.txt -t Yedoma_OTUs_aligned_pfiltered.tre -f -e 12000 -O 3 -a -n 10
# Remove the control samples (keep Experiment:Mammoth), then rarefy to 5,000 reads (100 rarefactions)
filter_samples_from_otu_table.py -i Yedoma_nofail_ggtax.biom -o Yedoma_nofail_noctrl_ggtax.biom -m MonteuxKeuper_Mapping.txt -s 'Experiment:Mammoth'
multiple_rarefactions_even_depth.py -i Yedoma_nofail_noctrl_ggtax.biom -o Yedoma_rar_5000/ -d 5000 -n 100 -k
# convert biom tables to tsv
bash scripts/convert_table.sh
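# convert_table.sh presumably loops biom convert over the 100 rarefied tables; a minimal sketch
# (the output folder name is inferred from the consensus step below):
mkdir -p Yedoma_rarefied5000_tsv
for f in Yedoma_rar_5000/*.biom; do
    biom convert -i "$f" -o "Yedoma_rarefied5000_tsv/$(basename "${f%.biom}").txt" --to-tsv
done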
# clean converted tsv tables so they can be read by R
bash scripts/clean_tsv_from_biom_rarefied_tables.sh
# read all rarefactions, sum the data.frames, and divide by 100; write the consensus to consensus100_rarefaction5000.otu
Rscript scripts/make_consensus_from_rarefactions.R
biom convert -i Yedoma_rarefied5000_tsv/clean/consensus100_rarefaction5000.otu -o consensus100_rarefaction5000.biom --to-json
# Add taxonomy metadata to the consensus biom table
biom add-metadata -i consensus100_rarefaction5000.biom -o Yedoma_5000_nofail_ggtax.biom --observation-metadata-fp Yedoma_gg13_8/Yedoma_OTUs_nofails_tax_assignments.txt --sc-separated taxonomy --observation-header OTUID,taxonomy
summarize_taxa.py -i Yedoma_5000_nofail_ggtax.biom -o Yedoma_5000_taxasum/
alpha_diversity.py -i Yedoma_5000_nofail_ggtax.biom -o alpha_div_Yedoma.txt -m ace,chao1,chao1_ci,observed_otus,shannon,simpson
beta_diversity.py -i Yedoma_5000_nofail_ggtax.biom -o Yedoma_unifrac_5000 -t Yedoma_OTUs_aligned_pfiltered.tre
biom convert -i Yedoma_5000_nofail_ggtax.biom -o Yedoma_5000_nofail_ggtax.otu --table-type="OTU table" --to-tsv
# If differential_abundance.py cannot find DESeq2, check which library paths R uses (run .libPaths() in an R session)
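# DESeq2 (negative binomial) differential abundance between the two levels of the 'DESEQ'
# mapping category ('b' vs 'a'), with all harvests pooled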
differential_abundance.py -i Yedoma_nofail_ggtax.biom -o diff_abund_YvI_all_harvests.txt -m MonteuxKeuper_Mapping.txt -c DESEQ -x b -y a -a DESeq2_nbinom -d
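# Split the table by harvest and repeat the test for each time point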
filter_samples_from_otu_table.py -i Yedoma_nofail_ggtax.biom -o Harvest0_ggtax.biom -m MonteuxKeuper_Mapping.txt -s 'Harvest:0'
filter_samples_from_otu_table.py -i Yedoma_nofail_ggtax.biom -o Harvest1_ggtax.biom -m MonteuxKeuper_Mapping.txt -s 'Harvest:1'
filter_samples_from_otu_table.py -i Yedoma_nofail_ggtax.biom -o Harvest2_ggtax.biom -m MonteuxKeuper_Mapping.txt -s 'Harvest:2'
filter_samples_from_otu_table.py -i Yedoma_nofail_ggtax.biom -o Harvest3_ggtax.biom -m MonteuxKeuper_Mapping.txt -s 'Harvest:3'
filter_samples_from_otu_table.py -i Yedoma_nofail_ggtax.biom -o Harvest4_ggtax.biom -m MonteuxKeuper_Mapping.txt -s 'Harvest:4'
differential_abundance.py -i Harvest0_ggtax.biom -o diff_abund_YvI_H0.txt -m MonteuxKeuper_Mapping.txt -c DESEQ -x b -y a -a DESeq2_nbinom -d
differential_abundance.py -i Harvest1_ggtax.biom -o diff_abund_YvI_H1.txt -m MonteuxKeuper_Mapping.txt -c DESEQ -x b -y a -a DESeq2_nbinom -d
differential_abundance.py -i Harvest2_ggtax.biom -o diff_abund_YvI_H2.txt -m MonteuxKeuper_Mapping.txt -c DESEQ -x b -y a -a DESeq2_nbinom -d
differential_abundance.py -i Harvest3_ggtax.biom -o diff_abund_YvI_H3.txt -m MonteuxKeuper_Mapping.txt -c DESEQ -x b -y a -a DESeq2_nbinom -d
differential_abundance.py -i Harvest4_ggtax.biom -o diff_abund_YvI_H4.txt -m MonteuxKeuper_Mapping.txt -c DESEQ -x b -y a -a DESeq2_nbinom -d
# Copy result files to the ../Downstream/Rnew/ folder
mkdir ../Downstream/Rnew
cp Yedoma_unifrac_5000/weighted_unifrac_Yedoma_5000_nofail_ggtax.txt ../Downstream/Rnew
awk -f scripts/cleaner.awk Yedoma_5000_taxasum/Yedoma_5000_nofail_ggtax_L2.txt > ../Downstream/Rnew/Yedoma_5000_nofail_ggtax_L2.txt
awk -f scripts/cleaner.awk Yedoma_5000_taxasum/Yedoma_5000_nofail_ggtax_L3.txt > ../Downstream/Rnew/Yedoma_5000_nofail_ggtax_L3.txt
awk -f scripts/cleaner.awk Yedoma_5000_nofail_ggtax.otu > ../Downstream/Rnew/Yedoma_5000_nofail_ggtax.otu
cp alpha_rar_up_to_12k/alpha_div_collated/chao1.txt ../Downstream/Rnew/16schao1.txt
cp diff_abund_YvI_all_harvests.txt ../Downstream/Rnew/
cp diff_abund_YvI_H0.txt ../Downstream/Rnew/
cp diff_abund_YvI_H1.txt ../Downstream/Rnew/
cp diff_abund_YvI_H2.txt ../Downstream/Rnew/
cp diff_abund_YvI_H3.txt ../Downstream/Rnew/
cp diff_abund_YvI_H4.txt ../Downstream/Rnew/
cp alpha_div_Yedoma.txt ../Downstream/Rnew/