##################################################################################### #This is a forloop for trimming Illumina raw reads using trimmomatic ###################################################################################### #Software packages required to install: Trimmomatic # Path to folder containing the raw-reads: raw_reads="/path/to/raw_reads/" # HERE is the paired trimmed_read outputs trimmed="/path/to/raw_reads/trimmed_reads" # HERE is the unpaired trimmed_read outputs untrimmed="/path/to/raw_reads/untrimmed_reads" # Folder containg trimmomatic executable files and adaptor database: "/path/to/Trimmomatic/" mkdir -p $trimmed mkdir -p $untrimmed cd $raw_reads for read1 in *_R1.fastq.gz do read2=${read1//_R1.fastq.gz/_R2.fastq.gz} java -jar /path/to/Trimmomatic/trimmomatic-0.38.jar PE $read1 $read2 $trimmed/${read1%_R1.fastq.gz}_R1_paired.fq.gz $untrimmed/${read1%_R1.fastq.gz}_R1_unpaired.fq.gz $trimmed/${read2%_R2.fastq.gz}_R2_paired.fq.gz $untrimmed/${read2%_R2.fastq.gz}_R2_unpaired.fq.gz ILLUMINACLIP:/path/to/Trimmomatic/adapters/NEB_Costum.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50 > $trimmed/trimmomatic_log.txt done ##################################################################################### #This is a forloop for short-read assembling of genomes using Unicycler: ###################################################################################### #Software packages required to install: Unicycler # HERE IS THE folder where the trimmed reads are: trimmed_reads="/path/to/reads/" # Unicycler assembled genomes will be saved here Assembled_contigs="/path/to/reads/Assembled_contigs" mkdir -p $Assembled_contigs cd $trimmed_reads for read1 in *_R1_paired.fq.gz do read2=${read1//_R1_paired.fq.gz/_R2_paired.fq.gz} unicycler -1 $read1 -2 $read2 -o $Assembled_contigs/${read1%_R1_paired.fq.gz}_output_dir -t 10 done ##################################################################################### #Command used for long-read assembly of barcoded PacBio reads using Flye assembler: ###################################################################################### #Software packages required to install: Flye_assembler # genome size should be adjusted based on each genome: flye --pacbio-raw PacBio_Barcoded_subreads.fasta --out-dir flye_output_final --genome-size "XXX"m --threads 12 --plasmids --iterations 1 ############################################################################################################## #This is a forloop for pilon polishing of Flye assemblies (or other assembled contigs) using illumina reads: ############################################################################################################## # Software packages required to install: bwa, samtools, pilon # HERE IS THE folder where the Flye_assemblies and corresponding trimmed illumina reads are saved: Assemblies="/path/to/flye_assemblies" cd $Assemblies for read1 in *_R1_paired.fq.gz do read2=${read1//_R1_paired.fq.gz/_R2_paired.fq.gz} assembly=${read1//_R1_paired.fq.gz/_flye_assembly.fasta} echo $read1 echo $read2 echo $assembly echo "Pilon round 1" echo "indexing hybrid assembly" bwa index $assembly mkdir ${read1%_R1_paired.fq.gz}_pilon echo "using bwa mem to map illumina reads to the hybrid assembly followed by samtools to sort the bam file" bwa mem -t 12 $assembly $read1 $read2 | samtools view - -Sb | samtools sort - -@12 -o /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round1.bam echo "using samtools to index the sorted bam file" samtools index /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round1.bam echo "running pilon on hybrid assembly" pilon --genome $assembly --fix all --changes --frags /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round1.bam --threads 12 --output /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round1 | tee /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/round1.pilon echo "Pilon round 2" echo "indexing round 1 pilon fasta" bwa index /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round1.fasta echo "using bwa mem to map illumina reads to round 1 pilon fasta followed by samtools to sort the bam file" bwa mem -t 12 /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round1.fasta $read1 $read2 | samtools view - -Sb | samtools sort - -@12 -o /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round2.bam echo "using samtools to index the sorted bam file round 2" samtools index /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round2.bam echo "running pilon on hybrid assembly round 2" pilon --genome /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round1.fasta --fix all --changes --frags /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round2.bam --threads 12 --output /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round2 | tee /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/round2.pilon echo "Pilon round 3" echo "indexing round 2 pilon fasta" bwa index /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round2.fasta echo "using bwa mem to map illumina reads to round 2 pilon fasta followed by samtools to sort the bam file" bwa mem -t 12 /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round2.fasta $read1 $read2 | samtools view - -Sb | samtools sort - -@12 -o /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round3.bam echo "using samtools to index the sorted bam file round 3" samtools index /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round3.bam echo "running pilon on hybrid assembly round 3" pilon --genome /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round2.fasta --fix all --changes --frags /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round3.bam --threads 12 --output /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round3 | tee /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/round3.pilon echo "Pilon round 4" echo "indexing round 3 pilon fasta" bwa index /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round3.fasta echo "using bwa mem to map illumina reads to round 3 pilon fasta followed by samtools to sort the bam file" bwa mem -t 12 /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round3.fasta $read1 $read2 | samtools view - -Sb | samtools sort - -@12 -o /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round4.bam echo "using samtools to index the sorted bam file round 4" samtools index /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round4.bam echo "running pilon on hybrid assembly round 4" pilon --genome /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round3.fasta --fix all --changes --frags /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round4.bam --threads 12 --output /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round4 | tee /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/round4.pilon echo "Pilon round 5" echo "indexing round 4 pilon fasta" bwa index /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round4.fasta echo "using bwa mem to map illumina reads to round 4 pilon fasta followed by samtools to sort the bam file" bwa mem -t 12 /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round4.fasta $read1 $read2 | samtools view - -Sb | samtools sort - -@12 -o /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round5.bam echo "using samtools to index the sorted bam file round 5" samtools index /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round5.bam echo "running pilon on hybrid assembly round 5" pilon --genome /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round4.fasta --fix all --changes --frags /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/${read1%_R1_paired.fq.gz}_map_sorted_round5.bam --threads 12 --output /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/Pilon_round5 | tee /path/to/flye_assemblies/${read1%_R1_paired.fq.gz}_pilon/round5.pilon done ################################################################################################# #This is a forloop for barcoded short- and long-read hybrid assembly of genomes using Unicycler: ################################################################################################# # Software packages required to install: Unicycler # HERE IS THE folder where the barcoded illumina reads and PacBio subreads are: Reads="/path/to/reads" # Unicycler assembled genomes will be saved here Assembled_contigs="/path/to/reads/Assembled_contigs" mkdir -p $Assembled_contigs cd $Reads for read1 in *_R1_paired.fq.gz do read2=${read1//_R1_paired.fq.gz/_R2_paired.fq.gz} read3=${read1//_R1_paired.fq.gz/_PacBio_Barcoded_subreads.fasta} unicycler -1 $read1 -2 $read2 -l $read3 -o $Assembled_contigs/${read1%_R1_paired.fq.gz}_output_dir -t 32 done ################################################################################################# #This is the bioinformatic workflow for SML hybrid assembly (example for one genome "GC250") ################################################################################################# # Software packages required to install: bedtools, minimap2, samtools, seqtk, filtlong, Flye_assembler, Unicycler # Convert the format of PacBio subreads from bam to fastq bedtools bamtofastq -i SML.subreads.bam -fq pacbio_SML.fastq cp pacbio_SML.fastq /path/to/pacbio/SML/subread/ # Move to the folder containing Illumina reads and Unicycler short-read assembly of the target genome: cd /path/to/illumina/assembly/GC250 && mkdir GC250_flye && # alignemnt of raw PacBio subreads to the short-read assembly of the target genome: minimap2 -ax map-pb -t 24 /path/to/illumina/assembly/GC250/GC250_assembly.fasta /path/to/pacbio/SML/subread/pacbio_SML.fastq > GC250_flye/GC250_minimap_aligned_reads.sam && cd /path/to/illumina/assembly/GC250/GC250_flye && # Converting the Pacbio aligned reads' format from sam to bam samtools view -@ 8 -bS GC250_minimap_aligned_reads.sam -o GC250_minimap_aligned_reads.bam && # Removing sam file rm GC250_minimap_aligned_reads.sam && # Sorting the resulting bam file samtools sort -@ 8 GC250_minimap_aligned_reads.bam -o GC250_minimap_aligned_sorted.bam && # Indexing the sorted bam file samtools index GC250_minimap_aligned_sorted.bam && # Extracting only the mapped reads from the bam file samtools view -b -F 4 GC250_minimap_aligned_sorted.bam > GC250_minimap_aligned_sorted_filtered.bam && # Converting the bam file to fastq file as input for Flye Long-Read Assembler samtools bam2fq GC250_minimap_aligned_sorted_filtered.bam | seqtk seq -A > GC250_minimap_aligned_sorted_filtered.fasta && rm GC250_minimap_aligned_sorted.bam && rm GC250_minimap_aligned_reads.bam && # Using filtlong to align the sorted PacBio reads to Illumina short reads for selecting 90% of the highest quality long reads (up to a maximum of 500000000) that are longer than 2000bp filtlong -1 /path/to/illumina/assembly/GC250/GC250_R1_paired.fq.gz -2 /path/to/illumina/assembly/GC250/GC250_R2_paired.fq.gz --min_length 2000 --keep_percent 90 --target_bases 500000000 --trim --split 100 GC250_minimap_aligned_sorted_filtered.fasta > GC250_minimap_aligned_sorted_filtered_filtlong.fasta && cp /path/to/illumina/assembly/GC250/GC250_assembly.fasta /path/to/illumina/assembly/GC250/GC250_flye/ && # Adding the Illumina short-read assemblt to the pool of selected PacBio reads cat GC250_minimap_aligned_sorted_filtered_filtlong.fasta GC250_assembly.fasta > GC250_filtlong_unicycler_combined.fasta && # Using the Flye assembler for assembling sorted PacBio reads to long-read contigs flye --pacbio-raw GC250_filtlong_unicycler_combined.fasta --out-dir GC250_flye_output --genome-size 2.5m --threads 16 --plasmids && # Rename Flye assembly files mv /path/to/illumina/assembly/GC250/GC250_flye/GC250_flye_output/*.fasta /path/to/illumina/assembly/GC250/GC250_flye/GC250_flye_output/GC250_flye_round1_assembly.fasta && mv /path/to/illumina/assembly/GC250/GC250_flye/GC250_flye_output/*.gfa /path/to/illumina/assembly/GC250/GC250_flye/GC250_flye_output/GC250_flye_round1_assembly.gfa && # Copy the sorted PacBio reads to the output folder of the Flye assembler cp GC250_minimap_aligned_sorted_filtered_filtlong.fasta /path/to/illumina/assembly/GC250/GC250_flye/GC250_flye_output/ && # Copy Illumina reads to the output folder of the Flye assembler cp /path/to/illumina/assembly/GC250/GC250_R* /path/to/illumina/assembly/GC250/GC250_flye/GC250_flye_output/ && # Change the working directory to the output folder of the Flye assembler cd /path/to/illumina/assembly/GC250/GC250_flye/GC250_flye_output/ && # Perform SML hybrid assembly using Illumina reads, PacBio sorted long reads, and flye long-read contigs unicycler -1 GC250_R1_paired.fq.gz -2 GC250_R2_paired.fq.gz -l GC250_minimap_aligned_sorted_filtered_filtlong.fasta --existing_long_read_assembly GC250_flye_round1_assembly.fasta -o GC250_flye_unicycler_hybrid --mode bold -t 16 ##################################################################################### #This is a forloop for genome annotation using Prodigal: ###################################################################################### # Software packages required to install: Prodigal # HERE IS THE folder where the assembled genomes are: Assembled_contigs="/path/to/fasta_files/" # Prokka annotations will be saved here Prodigal_annot="/path/to/fasta_files/Prodigal_annot" mkdir -p $Prodigal_annot ####################### cd $Assembled_contigs for contigs in *.fasta do prodigal -i $contigs -o $Prodigal_annot/${contigs%.fasta}.genes -a $Prodigal_annot/${contigs%.fasta}_proteins.faa done ##################################################################################### #This is a forloop for balsting predicted CDSs against UniProt database: ###################################################################################### # Software packages required to install: Diamond # Database required to install: UniProtKB/TrEMBL # HERE IS THE folder where the Prodigal predicted protein codings are: Prodigal_annot="/path/to/fasta_files/Prodigal_annot" # Diamond hits will be saved here Diamond_hits="/path/to/fasta_files/Prodigal_annot/Diamond_hits" mkdir -p $Diamond_hits ####################### cd $Prodigal_annot for contigs in *.faa do diamond blastp --db /path/to/Uniprot_ref -q $contigs -o $Diamond_hits/${contigs%.faa}_diamond.txt --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send qlen slen evalue bitscore gaps --threads 8 --header --max-target-seqs 1 done