#PdTT Version 1.0 (January 2015)
#This document describes workflow for dN/dS ratio analysis and transcriptome divergence index (TDI) calculation

#Please cite:
#Cheng X, Hui JHL, Lee YY, Kwan HS. 2015. A 'developmental hourglass' in fungi.

#Please make sure the following software packages are properly installed:
#1) BLAST: http://blast.ncbi.nlm.nih.gov/Blast.cgi
#2) MAFFT: http://mafft.cbrc.jp/alignment/software/
#3) PAL2NAL: http://www.bork.embl.de/pal2nal/
#4) PAML: http://abacus.gene.ucl.ac.uk/software/
#5) Perl module: BioPerl (http://search.cpan.org/~cjfields/BioPerl/)

#To calculate TDI and statistically assess the TDI profile, please also install the following:
#6) Perl module: Statistics-R-0.33 (http://search.cpan.org/~fangly/Statistics-R-0.33/)
#7) R: http://www.r-project.org/

#####Step 1#####
#Prepare the following files:
#1) The protein sequences of the genome of interest
#2) The protein sequences of the reference genome
#3) The nucleotide sequences of the genome of interest
#4) The nucleotide sequences of the reference genome

#####Step 2#####
#Step 2.1#
#Run reciprocal blastp. $in_pr_fasta is the protein sequences of the genome of interest, $db_pr_fasta is the protein sequences of the reference genome
blastall -i $in_pr_fasta -d $db_pr_fasta -o A_B -m 8 -p blastp
blastall -i $db_pr_fasta -d $in_pr_fasta -o B_A -m 8 -p blastp

#Step 2.2#
#Find Reciprocal Best Hits using BLASTP (RBHB)
perl -e '$name_col=0; $e_col=10; $score_col=11; while(<>) {s/\r?\n//; @F=split /\t/, $_; ($n, $e, $s) = @F[$name_col, $e_col, $score_col]; if (! exists($max{$n})) {push @names, $n}; if (! exists($max{$n}) || $s > $max{$n} || $e < $max_e{$n}) {$max{$n} = $s; $max_e{$n} = $e; $best{$n} = ()}; if ($s == $max{$n}) {$best{$n} .= "$_\n"};} for $n (@names) {print $best{$n}}' A_B > A_B.best
perl -e '$name_col=0; $score_col=11; while(<>) {s/\r?\n//; @F=split /\t/, $_; ($n, $s) = @F[$name_col, $score_col]; if (! exists($max{$n})) {push @names, $n}; if (! exists($max{$n}) || $s > $max{$n}) {$max{$n} = $s; $best{$n} = ()}; if ($s == $max{$n}) {$best{$n} .= "$_\n"};} for $n (@names) {print $best{$n}}' B_A > B_A.best
perl -e '$col1=1; $col2=0;' -e '($f1,$f2)=@ARGV; open(F1,$f1); while (<F1>) {s/\r?\n//; @F=split /\t/, $_; $line1{$F[$col1]} .= "$_\n"} open(F2,$f2); while (<F2>) {s/\r?\n//;@F=split /\t/, $_; if ($x = $line1{$F[$col2]}) {$x =~ s/\n/\t$_\n/g; print $x}}' A_B.best B_A.best > A_B_A
perl -e '$colm=0; $coln=13; $count=0; while(<>) {s/\r?\n//; @F=split /\t/, $_; if ($F[$colm] eq $F[$coln]) {print "$_\n"; $count++}} warn "\nChose $count lines out of $. where column $colm had same text as column $coln\n\n";' A_B_A > A_B_A.recip
perl -e '@cols=(0, 1, 10); while(<>) {s/\r?\n//; @F=split /\t/, $_; print join("\t", @F[@cols]), "\n"} warn "\nJoined columns ", join(", ", @cols), " for $. lines\n\n"' A_B_A.recip > RBHB.out

#####Step 3#####
#Convert into protein and nucleotide pairs
#Run 2fasta.pl under the directory. [RBHB.out] is the RBHB results of step 2.2;
#[in_pr_fasta] and [db_pr_fasta] are the protein sequences of the genome of interest and the reference genome, respectively;
#[in_nt_fasta] and [db_nt_fasta] are the nucleotide sequences of the genome of interest and the reference genome, respectively;
#[e_value] is the blastp e-value cutoff;
#[fasta_dir_pr] and [fasta_dir_nt] are the output directories for storing protein sequences and nucleotide sequences of each gene pair, respectively.
perl 2fasta.pl [RBHB.out] [in_pr_fasta] [db_pr_fasta] [e_value] [fasta_dir_pr]
perl 2fasta.pl [RBHB.out] [in_nt_fasta] [db_nt_fasta] [e_value] [fasta_dir_nt]

#####Step 4#####
#Compute dN/dS ratios
#Run compute_dNdS.pl
#[fasta_dir_pr] and [fasta_dir_nt] are the directories generated during Step 3, which store protein sequences and nucleotide sequences of each gene pair, respectively.
#The results are stored in the output file "dNdS_ratios.tab"
perl compute_dNdS.pl [fasta_dir_pr] [fasta_dir_nt]

#####Step 5#####
#Calculate the transcriptome divergence index (TDI)
#Run calculate_TDI_p.pl
#The [in_file] file should follow the following format:
#Gene_id	dNdS	Intensity_stage1	Intensity_stage2	Intensity_stage3	...
#where
#Column 1	=	The gene id
#Column 2	=	The dN/dS ratio of that gene
#Column 3--	=	The expression levels of that gene across developmental stages
perl calculate_TDI_p.pl [in_file]