#PdTT Version 1.0 (January 2015) #This document describes workflow for dN/dS ratio analysis and transcriptome divergence index (TDI) calculation #Please cite: #Cheng X, Hui JHL, Lee YY, Kwan HS. 2015. A 'developmental hourglass' in fungi. #Please make sure the following software packages are properly installed: #1) BLAST: http://blast.ncbi.nlm.nih.gov/Blast.cgi #2) MAFFT: http://mafft.cbrc.jp/alignment/software/ #3) PAL2NAL: http://www.bork.embl.de/pal2nal/ #4) PAML: http://abacus.gene.ucl.ac.uk/software/ #5) Perl module: BioPerl (http://search.cpan.org/~cjfields/BioPerl/) #To calculate TDI and statistically assess the TDI profile, please also install the following: #6) Perl module: Statistics-R-0.33 (http://search.cpan.org/~fangly/Statistics-R-0.33/) #7) R: http://www.r-project.org/ #####Step 1##### #Prepare the following files: #1) The protein sequences of the genome of interest #2) The protein sequences of the reference genome #3) The nucleotide sequences of the genome of interest #4) The nucleotide sequences of the reference genome #####Step 2##### #Step 2.1# #Run reciprocal blastp. $in_pr_fasta is the protein sequences of the genome of interest, $db_pr_fasta is the protein sequences of the reference genome blastall -i $in_pr_fasta -d $db_pr_fasta -o A_B -m 8 -p blastp blastall -i $db_pr_fasta -d $in_pr_fasta -o B_A -m 8 -p blastp #Step 2.2# #Find Reciprocal Best Hits using BLASTP (RBHB) perl -e '$name_col=0; $e_col=10; $score_col=11; while(<>) {s/\r?\n//; @F=split /\t/, $_; ($n, $e, $s) = @F[$name_col, $e_col, $score_col]; if (! exists($max{$n})) {push @names, $n}; if (! exists($max{$n}) || $s > $max{$n} || $e < $max_e{$n}) {$max{$n} = $s; $max_e{$n} = $e; $best{$n} = ()}; if ($s == $max{$n}) {$best{$n} .= "$_\n"};} for $n (@names) {print $best{$n}}' A_B > A_B.best perl -e '$name_col=0; $score_col=11; while(<>) {s/\r?\n//; @F=split /\t/, $_; ($n, $s) = @F[$name_col, $score_col]; if (! exists($max{$n})) {push @names, $n}; if (! exists($max{$n}) || $s > $max{$n}) {$max{$n} = $s; $best{$n} = ()}; if ($s == $max{$n}) {$best{$n} .= "$_\n"};} for $n (@names) {print $best{$n}}' B_A > B_A.best perl -e '$col1=1; $col2=0;' -e '($f1,$f2)=@ARGV; open(F1,$f1); while () {s/\r?\n//; @F=split /\t/, $_; $line1{$F[$col1]} .= "$_\n"} open(F2,$f2); while () {s/\r?\n//;@F=split /\t/, $_; if ($x = $line1{$F[$col2]}) {$x =~ s/\n/\t$_\n/g; print $x}}' A_B.best B_A.best > A_B_A perl -e '$colm=0; $coln=13; $count=0; while(<>) {s/\r?\n//; @F=split /\t/, $_; if ($F[$colm] eq $F[$coln]) {print "$_\n"; $count++}} warn "\nChose $count lines out of $. where column $colm had same text as column $coln\n\n";' A_B_A > A_B_A.recip perl -e '@cols=(0, 1, 10); while(<>) {s/\r?\n//; @F=split /\t/, $_; print join("\t", @F[@cols]), "\n"} warn "\nJoined columns ", join(", ", @cols), " for $. lines\n\n"' A_B_A.recip > RBHB.out #####Step 3##### #Convert into protein and nucleotide pairs #Run 2fasta.pl under the directory. [RBHB.out] is the RBHB results of step 2.2; #[in_pr_fasta] and [db_pr_fasta] are the protein sequences of the genome of interest and the reference genome, respectively; #[in_nt_fasta] and [db_nt_fasta] are the nucleotide sequences of the genome of interest and the reference genome, respectively; #[e_value] is the blastp e-value cutoff; #[fasta_dir_pr] and [fasta_dir_nt] are the output directories for storing protein sequences and nucleotide sequences of each gene pair, respectively. perl 2fasta.pl [RBHB.out] [in_pr_fasta] [db_pr_fasta] [e_value] [fasta_dir_pr] perl 2fasta.pl [RBHB.out] [in_nt_fasta] [db_nt_fasta] [e_value] [fasta_dir_nt] #####Step 4##### #Compute dN/dS ratios #Run compute_dNdS.pl #[fasta_dir_pr] and [fasta_dir_nt] are the directories generated during Step 3, which store protein sequences and nucleotide sequences of each gene pair, respectively. #The results are stored in the output file "dNdS_ratios.tab" perl compute_dNdS.pl [fasta_dir_pr] [fasta_dir_nt] #####Step 5##### #Calculate the transcriptome divergence index (TDI) #Run calculate_TDI_p.pl #The [in_file] file should follow the following format: #Gene_id dNdS Intensity_stage1 Intensity_stage2 Intensity_stage3 ... #where #Column 1 = The gene id #Column 2 = The dN/dS ratio of that gene #Column 3-- = The expression levels of that gene across developmental stages perl calculate_TDI_p.pl [in_file]