#############################################
### C. reinhardtii RNASeq data processing ###
#############################################

# fastqc 0.11.3
# cutadapt v 1.14
# trimgalore! v0.4.1
# bbduk v37.32
# fastx toolkit v0.0.14
# seqtk 1.3
# STAR v2.5.3a
# samtools v1.1
# R v3.1.2


### Trimming and read cleanup ###

$ ./trim_galore --illumina --stringency 7 --gzip --path_to_cutadapt /Path/to/cutadapt /Path/to/sample.fastq.gz

# map to rRNA database (ribokmersdb.fa.gz), keep both the ribodepleted reads for downstream analysis (sample_clean.fq.gz), and the rRNA reads (riboreads.fq.gz)
$./bbduk.sh in=/Path/to/sample_trimmed.fq.gz out=/Path/to/output/sample_clean.fq.gz outm=/Path/to/riboreads.fq.gz ref=/Path/to/ribokmersdb.fa.gz

# optional: take a random subset of the rRNA reads for BLAST 
$./seqtk sample Path/to/riboreads.fq.gz 500 > /Path/to/subset.fq.gz 
$ fastq_to_fasta -i /Path/to/subset.fq.gz  -o subset.fa


### Mapping to genome ###

# generate indexes
$ STAR --runThreadN 12 --runMode genomeGenerate --genomeDir ./ --genomeFastaFiles ./Creinhardtii_281_v5.0.fa

# map to genome
$ STAR --runThreadN 8 --genomeDir /Path/to/genomeDir/ --sjdbGTFfile /Path/to/annotation/Creinhardtii_281_v5.5.gene_exons.gff3 --sjdbGTFtagExonParentTranscript Parent --sjdbOverhang 75 --alignIntronMax 3000 --readFilesIn /Path/to/sample_clean.fq.gz --readFilesCommand gunzip -c --outFileNamePrefix ./sample
$ cat sampleLog.final.out

# turn to bam file, sort, double check data
$ samtools view -bS sampleAligned.out.sam > sample.bam
$ samtools sort sample.bam sample_sort
$ samtools flagstat sample.bam
$ samtools flagstat sample_sort.bam

# files for IGV visualization
$ samtools index sample_sort.bam
$ STAR --runMode inputAlignmentsFromBAM --inputBAMfile PW1a_sort.bam --outWigType bedGraph --outWigStrand Stranded —outFileNamePrefix ./sample_


### Generate count tables ###

open R
$ R

> library(ggplot2)
> library(gplots)
> library(Rsubread)
> library(edgeR)
> library(plyr)
> library(ggrepel)
> library(Rmisc)
> library(data.table)
> library(RColorBrewer)

# if treatment 1 replicates = sample_a and sample_b
# if treatment 2 replicates = sample2_a and sample2_b

> bams <- c(‘sample_a.bam’, ‘sample_b.bam’, ‘sample2_a.bam’ ‘sample2_b.bam’)
> rawcounts <- featureCounts (bams, annot.ext=‘/Path/to/annotation/Creinhardtii_281_v5.5.gene_exons.gff3', isGTFAnnotationFile=TRUE, GTF.featureType='exon', GTF.attrType='Parent', strandSpecific=2)
> colnames(rawcounts$counts) <- c(‘sample_a’,’sample_b’, ‘sample2_a’, ‘sample2_b’)
> saveRDS(rawcounts, ‘rawcounts.rds’)

> pd <- read.delim(‘/Path/to/SampleList.txt', sep='\t')
> group <- factor(paste0(pd$Genotype, '.', pd$Treatment))
> cbind(pd, Group=group)
> x <- DGEList(rawcounts$counts, group=group, genes=rawcounts$annotation[,c('GeneID','Length')])

# remove genes with less than 0.5 counts per million in less than 2 samples
> keep <- rowSums(cpm(x)>0.5)>= 2
> summary(keep) 
> y <- x[keep, , keep.lib.sizes=FALSE]

# 'trimmed mean of m' (TMM) normalization with Mean-Difference (MD) plot of distances for quality control
> y <- calcNormFactors(y)
> y$samples
> saveRDS(y, "normalizedDGEListObject.rds")

# generate mean difference plot of each sample to check normalization efficiency
> png(filename='MDplot_sample.png', width=1600, height=1600, units='px', res=300)
> plotMD(cpm(y, log=TRUE), column=1) 
> abline(h=0, col='red', lty=2, lwd=2)
> dev.off()

# MDS plot
> png(filename='MDSplot.png', width=1600, height=1600, units='px', res=300)
> plotMDS(y)
> dev.off()

# calculate RPKM and log2(RPKM) values (automatically accounts for gene length)
> y_rpkm <- rpkm(y)
> y_rpkm_df <- as.data.frame(y_rpkm)
> saveRDS(y_rpkm_df, "RPKMvalues.rds")
> y_logrpkm <- rpkm(y, log=TRUE)
> y_logrpkm_df <- as.data.frame(y_logrpkm)
> saveRDS(y_logrpkm_df, "RPKMvalueslog2.rds")

# check replicability of biological replicate samples and plot
> cor.test(y_logrpkm_df$sample_a, y_logrpkm_df$sample_b, method='pearson')
> png(filename='Replicability.png', width=1600, height=1600, units='px', res=300)
> plot(y_logrpkm_df$sample_a, y_logrpkm_df$sample_b, pch=16, cex=0.5, xlim=c(-2,16), ylim=c(-2,16), xlab='Log2 RPKM sample_a, ylab='Log2 RPKM sample_b’, col='lightgrey')
> abline(a=0, b=1, col='red')
> dev.off()


### Differential expression analysis ###

> design <- model.matrix(~0+group)
> colnames(design) <- levels(group)

# estimate dispersion and the BCV
> y <- estimateDisp(y, design, robust=TRUE)
> y$common.dispersion 
> plotBCV(y)

> fit <- glmQLFit(y, design, robust=TRUE)
> head(fit$coefficients)
> plotQLDisp(fit)

> y_logrpkm_df <- cbind(names=rownames(y_logrpkm_df), y_logrpkm_df)
> colnames(y_logrpkm_df) <- c('GeneID’,’sample_a’, ’sample_b’, ‘sample2_a’ ‘sample2_b’)
> y_rpkm_df <- cbind(names=rownames(y_rpkm_df), y_rpkm_df)
> colnames(y_rpkm_df) <- c('GeneID’,’sample_a’, ’sample_b’, ‘sample2_a’ ‘sample2_b’)

# define all pairwise comparisons of interest
> my.contrasts <- makeContrasts(samplevssample2 = sample2-sample, levels=design)

# for each comparison defined
> qlf_samplevssample2 <- glmQLFTest(fit, contrast=my.contrasts[,’samplevssample2’])
> samplevssample2 <- topTags(qlf_samplevssample2, n=Inf, sort.by="PValue", adjust.method='BH') 
> samplevssample2_table <- join(samplevssample2$table, y_rpkm_df[,c('GeneID’,’sample_a’, ’sample_b’, ‘sample2_a’ ‘sample2_b’)], by='GeneID', type='inner', match='all')
> samplevssample2_table$AvRPKM.sample<- (samplevssample2_table$sample_a + samplevssample2$sample_b)/2
> samplevssample2_table$AvRPKM.sample2 <- (samplevssample2_table$sample2_a + samplevssample2_table$sample2_b)/2
> saveRDS(samplevssample2_table, "QLFandRPKM_samplevssample2.rds")

# define lists of up- and down regulated genes 
> down_samplevssample2 <- subset(samplevssample2_table, logFC <=(-1) & FDR <0.01)
> down <- down_samplevssample2$GeneID
> up_samplevssample2 <- subset(samplevssample2_table, logFC >=1 & FDR <0.01)
> up <- up_samplevssample2$GeneID
> change <- c(down, up)

# save file of up/down genes
> com1 <- rbind(down_samplevssample2, up_samplevssample2)
> write.table(com1, file="DEgenes.txt", sep="\t", quote=FALSE)

# generate p-value histogram
> png(filename='UnadjPValhistogram.png', width=1600, height=1600, units='px', res=300)
> ggplot(samplevssample2$table, aes(PValue)) + geom_histogram(fill='darkgrey') + theme(panel.background=element_blank(), panel.grid.major=element_line(colour='lightgrey', linetype='dotted'),panel.grid.minor=element_line(colour='lightgrey', linetype='dotted'), panel.border=element_rect(colour='black', fill=NA, size=1)) + xlab("unadjusted p-values sample vs sample2") 
> dev.off() 

# generate volcano plot
> samplevssample2_volcano <- subset(samplevssample2_table, select=c(GeneID,logFC,PValue, FDR))
> samplevssample2$minlog10p <- -log10(samplevssample2_volcano$PValue)
> samplevssample2updown_volcano <- subset(samplevssample2_volcano, abs(logFC)>=1 & FDR<0.01)
> png(filename='Volcano.png', width=1600, height=1600, units='px', res=300)
> ggplot() + geom_point(data=samplevssample2_volcano, aes(logFC, minlog10p), colour='grey', size=1) + geom_point(data=samplevssample2updown_volcano, aes(logFC, minlog10p), colour='#4DBBD5B2', size=1)+ theme(panel.background=element_blank(), panel.grid.major=element_line(colour='lightgrey',linetype='dotted'),panel.grid.minor=element_line(colour='lightgrey', linetype='dotted'), panel.border=element_rect(colour='black', fill=NA, size=1)) + xlab("log2FC") + ylab("-log10 p-values") + labs(title="sample vs sample2") + geom_vline(xintercept=0, colour="red")
> dev.off()

# generate MA plot to also include expression values
> samplevssample2_ma <- subset(samplevssample2_table, select=c(GeneID,logFC, logCPM, PValue, FDR))
> samplevssample2updown_ma <- subset(samplevssample2_ma, abs(logFC)>=1 & FDR<0.01)

> png(filename='MAplot_samplevssample2.png', width=1600, height=1600, units='px', res=300)
> ggplot() + geom_point(data=samplevssample2_ma, aes(logFC, logCPM), colour='grey', size=1) + geom_point(data=samplevssample2updown_ma, aes(logFC, logCPM), colour='#4DBBD5B2', size=1)+ theme(panel.background=element_blank(), panel.grid.major=element_line(colour='lightgrey',linetype='dotted'),panel.grid.minor=element_line(colour='lightgrey', linetype='dotted'), panel.border=element_rect(colour='black', fill=NA, size=1)) + xlab("log2FC") + ylab("log10CPM") + labs(title=“sample vs sample 2”) + geom_vline(xintercept=0, colour="red")
> dev.off()


### mars1 dependence ###

# remove genes with average RPKM =< 2.5 in less than 2 conditions
> y_rpkm_df$sample <- (y_rpkm_df$sample_a + y_rpkm_df$sample_b)/2
> y_rpkm_df$sample2 <- (y_rpkm_df$sample2_a + y_rpkm_df$sample2_b)/2
> avrpkm <- y_rpkm_df[,5:6]
> strong <- subset(avrpkm, rowSums(avrpkm > 2.5) >=2) 
> saveRDS(strong, "Avg_RPKMvalues_stringent.rds")
> TP <- rownames(strong)


# DE gene set with FDR<0.001 and abs(logFC)>=2. Calculate this for each condition
> colnames(samplevssample2_table) <- c("GeneID","Length","logFC","logCPM","F","PValue","FDR”,”sample_a”,”sample_b”,”sample2_a”,”sample2_b”,”AvRPKM_sample”,”AvRPKM_sample2")
> sub <- samplevssample2_table[which(samplevssample2_table$GeneID %in% TP), ]
> DE <- subset(sub, abs(logFC) >= 2 & FDR <0.001) 

# DE common in or unique to different treatments (ClpP, HL) in WT.
# Suppose DE_ClpP is for those differentially expressed upon ClpP, and DE_HL for those upon high light
> dt_ClpP <- data.table(DE_ClpP, key=c("GeneID","Length"))
> dt_HL <- data.table(DE_HL, key=c("GeneID","Length"))
> only_ClpP <- dt_ClpP[!dt_HL] 
> only_HL <- dt_HL[!dt_ClpP] 
> combo <- join(DE_ClpP, DE_HL,  by=c("GeneID","Length"), type="inner", match="all") 
> stress_up <- subset(combo, HL.logFC >= 2 & HL.FDR <0.001 & ClpP.logFC >= 2 & ClpP.FDR <0.001)
> stress_down <- subset(combo, HL.logFC <= -2 & HL.FDR <0.001 & ClpP.logFC <= -2 & ClpP.FDR <0.001)
> DE_all <- rbind(stress_up, stress_down) 

# define subsets of mars1-(in)dependent genes, eg for genes only DE in ClpP
# suppose DEgenes_mars1.t0vsClpP is the subset of DE genes in mars1.t0 vs mars1.ClpP
> temp1 <- join(only_ClpP, DEgenes_mars1.t0vsClpP,  by=c("GeneID","Length"), type="inner", match="all")
> colnames(temp1) <- c("GeneID","Length","WT.ClpP.logFC","WT.ClpP.logCPM","WT.ClpP.F","WT.ClpP.PValue","WT.ClpP.FDR","RPKM_WT.t0_a","RPKM_WT.t0_b","RPKM_WT.ClpP_a","RPKM_WT.ClpP_b","AvRPKM.WT.t0","AvRPKM.WT.ClpP", "mars1.ClpP.logFC","mars1.ClpP.logCPM","mars1.F","mars1.ClpP.PValue","mars1.ClpP.FDR","RPKM_mars1.t0_a","RPKM_mars1.t0_b","RPKM_mars1.ClpP_a","RPKM_mars1.ClpP_b","AvRPKM.mars1.t0","AvRPKM.mars1.ClpP")
> onlyClpP_upinboth <- subset(temp1, WT.ClpP.logFC >= 2 & WT.ClpP.FDR <0.001 & mars1.ClpP.logFC >=2 & mars1.ClpP.FDR < 0.001) 
> onlyClpP_downinboth <- subset(temp1, WT.ClpP.logFC <= -2 & WT.ClpP.FDR <0.001 & mars1.ClpP.logFC <= -2 & mars1.ClpP.FDR < 0.001) 
> onlyClpP_notmars1dep <- merge(onlyClpP_upinboth, onlyClpP_downinboth, all=TRUE) 
> saveRDS(onlyClpP_notmars1dep, "DEinClpPonly_mars1independent.rds")
> write.table(onlyClpP_notmars1dep, file="DEinClpPonly_mars1independent.txt", sep="\t", quote=F)
> onlyClpP_mars1dep_up <- subset(temp1, WT.ClpP.logFC >= 2 & WT.ClpP.FDR <0.001 & mars1.ClpP.logFC <1)
> onlyClpP_mars1dep_down <- subset(temp1, WT.ClpP.logFC <= -2 & WT.ClpP.FDR <0.001 & mars1.ClpP.logFC > -1) 
> onlyClpP_mars1dep <- merge(onlyClpP_mars1dep_up, onlyClpP_mars1dep_down, all=TRUE) 
> saveRDS(onlyClpP_mars1dep, "DEinClpPonly_mars1dependent.rds")
> write.table(onlyClpP_mars1dep, file="DEinClpPonly_mars1dependent.txt", sep="\t", quote=F)

# heatmaps
# create table with logFC only
> rownames(onlyClpP_notmars1dep) <- onlyClpP_notmars1dep$GeneID
> onlyClpP_indepFC <- onlyClpP_notmars1dep[,c(3,14)]
> colnames(onlyClpP_indepFC) <- c("WT.ClpP","mars1.ClpP")
> onlyClpP_indepFC$dependency <- rep(0, nrow(onlyClpP_indepFC))

> rownames(onlyClpP_mars1dep) <- onlyClpP_mars1dep$GeneID
> onlyClpP_depFC  <- onlyClpP_mars1dep[,c(3,14)]
> colnames(onlyClpP_depFC) <- c("WT.ClpP","mars1.ClpP")
> onlyClpP_depFC$dependency <- rep(1, nrow(onlyClpP_depFC))

> onlyClpP_FC <- rbind(onlyClpP_depFC, onlyClpP_indepFC)
> onlyClpP_FCmatrix <- as.matrix(onlyClpP_FC)
> palette <- rev(colorRampPalette(brewer.pal(10, "RdBu"))((256)))

> png(filename='Heatmap_onlyClpP_mars1dependence.png', width=1600, height=1600, units='px', res=300)
> heatmap.2(onlyClpP_FCmatrix, Colv=NA, trace="none", col=palette, scale="none", density.info="none", dendrogram="row", cexRow=0.2, cexCol=0.5, margins=c(5,20), keysize=0.75, key.par = list(cex=0.5))
> dev.off()

# heatmaps of defined pathway subsets, eg autophagy, downloaded from Phytozome v12.1
> PS_table <- read.table("Chlamy_GeneSetsByFunction_autophagy.txt", sep="\t", header=FALSE)
> PS_tablen <- as.vector(PS_table[,"V1"])
# double-check that it is class 'character'
> class(PS_tablen)
> PS_FC <- hm_FC2[PS_tablen,]
# remove rows without expression data (with NAs)
> PS_FCc <- PS_FC[complete.cases(PS_FC),] 
> write.table(PS_FCc, "logFC_table_autophagy.txt", sep="\t", quote=F)
> PS_FCm <- as.matrix(PS_FCc)

> png(filename='Heatmap_autophagy.png', width=1600, height=1600, units='px', res=300)
> heatmap.2(PS_FCm, Colv=NA, trace="none", col=palette, scale="none", density.info="none", dendrogram="row", cexRow=0.7, cexCol=0.7, margins=c(5,20), keysize=0.75, key.par = list(cex=0.5))
> dev.off()

# to find the order of the heatmap rows
> PS_heat <- heatmap.2(PS_FCm, Colv=NA, trace="none", col=palette, scale="none", density.info="none", dendrogram="row", cexRow=0.7, cexCol=0.7, margins=c(5,20), keysize=0.75, key.par = list(cex=0.5))
> PSrow<- PS_heat$rowInd
> PS_FCc$GeneID <- row.names(PS_FCc)
> PSorder <- PS_FCc[PSrow,7]
> PSorderrev <- rev(PSorder)
> write.table(PSorderrev, "autophagy_rowIDsclustered.txt", sep="\t", quote=F)

# adapt files for import in MapMan
> mapman <- readRDS("DEinClpPonly_mars1dependent.rds")
> mapman$GeneID <- gsub(".v5.5","",mapman$GeneID)
> mapman$GeneID <- gsub("C","c",mapman$GeneID)
> mapman2 <- mapman[,c(1,3,14,25,36,47,58)] 
> write.table(mapman2, "Mapman_DEinClpPonly_mars1dependent.txt", sep="\t", quote=F, row.names=F)