############################################# ### C. reinhardtii RNASeq data processing ### ############################################# # fastqc 0.11.3 # cutadapt v 1.14 # trimgalore! v0.4.1 # bbduk v37.32 # fastx toolkit v0.0.14 # seqtk 1.3 # STAR v2.5.3a # samtools v1.1 # R v3.1.2 ### Trimming and read cleanup ### $ ./trim_galore --illumina --stringency 7 --gzip --path_to_cutadapt /Path/to/cutadapt /Path/to/sample.fastq.gz # map to rRNA database (ribokmersdb.fa.gz), keep both the ribodepleted reads for downstream analysis (sample_clean.fq.gz), and the rRNA reads (riboreads.fq.gz) $./bbduk.sh in=/Path/to/sample_trimmed.fq.gz out=/Path/to/output/sample_clean.fq.gz outm=/Path/to/riboreads.fq.gz ref=/Path/to/ribokmersdb.fa.gz # optional: take a random subset of the rRNA reads for BLAST $./seqtk sample Path/to/riboreads.fq.gz 500 > /Path/to/subset.fq.gz $ fastq_to_fasta -i /Path/to/subset.fq.gz -o subset.fa ### Mapping to genome ### # generate indexes $ STAR --runThreadN 12 --runMode genomeGenerate --genomeDir ./ --genomeFastaFiles ./Creinhardtii_281_v5.0.fa # map to genome $ STAR --runThreadN 8 --genomeDir /Path/to/genomeDir/ --sjdbGTFfile /Path/to/annotation/Creinhardtii_281_v5.5.gene_exons.gff3 --sjdbGTFtagExonParentTranscript Parent --sjdbOverhang 75 --alignIntronMax 3000 --readFilesIn /Path/to/sample_clean.fq.gz --readFilesCommand gunzip -c --outFileNamePrefix ./sample $ cat sampleLog.final.out # turn to bam file, sort, double check data $ samtools view -bS sampleAligned.out.sam > sample.bam $ samtools sort sample.bam sample_sort $ samtools flagstat sample.bam $ samtools flagstat sample_sort.bam # files for IGV visualization $ samtools index sample_sort.bam $ STAR --runMode inputAlignmentsFromBAM --inputBAMfile PW1a_sort.bam --outWigType bedGraph --outWigStrand Stranded —outFileNamePrefix ./sample_ ### Generate count tables ### open R $ R > library(ggplot2) > library(gplots) > library(Rsubread) > library(edgeR) > library(plyr) > library(ggrepel) > library(Rmisc) > library(data.table) > library(RColorBrewer) # if treatment 1 replicates = sample_a and sample_b # if treatment 2 replicates = sample2_a and sample2_b > bams <- c(‘sample_a.bam’, ‘sample_b.bam’, ‘sample2_a.bam’ ‘sample2_b.bam’) > rawcounts <- featureCounts (bams, annot.ext=‘/Path/to/annotation/Creinhardtii_281_v5.5.gene_exons.gff3', isGTFAnnotationFile=TRUE, GTF.featureType='exon', GTF.attrType='Parent', strandSpecific=2) > colnames(rawcounts$counts) <- c(‘sample_a’,’sample_b’, ‘sample2_a’, ‘sample2_b’) > saveRDS(rawcounts, ‘rawcounts.rds’) > pd <- read.delim(‘/Path/to/SampleList.txt', sep='\t') > group <- factor(paste0(pd$Genotype, '.', pd$Treatment)) > cbind(pd, Group=group) > x <- DGEList(rawcounts$counts, group=group, genes=rawcounts$annotation[,c('GeneID','Length')]) # remove genes with less than 0.5 counts per million in less than 2 samples > keep <- rowSums(cpm(x)>0.5)>= 2 > summary(keep) > y <- x[keep, , keep.lib.sizes=FALSE] # 'trimmed mean of m' (TMM) normalization with Mean-Difference (MD) plot of distances for quality control > y <- calcNormFactors(y) > y$samples > saveRDS(y, "normalizedDGEListObject.rds") # generate mean difference plot of each sample to check normalization efficiency > png(filename='MDplot_sample.png', width=1600, height=1600, units='px', res=300) > plotMD(cpm(y, log=TRUE), column=1) > abline(h=0, col='red', lty=2, lwd=2) > dev.off() # MDS plot > png(filename='MDSplot.png', width=1600, height=1600, units='px', res=300) > plotMDS(y) > dev.off() # calculate RPKM and log2(RPKM) values (automatically accounts for gene length) > y_rpkm <- rpkm(y) > y_rpkm_df <- as.data.frame(y_rpkm) > saveRDS(y_rpkm_df, "RPKMvalues.rds") > y_logrpkm <- rpkm(y, log=TRUE) > y_logrpkm_df <- as.data.frame(y_logrpkm) > saveRDS(y_logrpkm_df, "RPKMvalueslog2.rds") # check replicability of biological replicate samples and plot > cor.test(y_logrpkm_df$sample_a, y_logrpkm_df$sample_b, method='pearson') > png(filename='Replicability.png', width=1600, height=1600, units='px', res=300) > plot(y_logrpkm_df$sample_a, y_logrpkm_df$sample_b, pch=16, cex=0.5, xlim=c(-2,16), ylim=c(-2,16), xlab='Log2 RPKM sample_a, ylab='Log2 RPKM sample_b’, col='lightgrey') > abline(a=0, b=1, col='red') > dev.off() ### Differential expression analysis ### > design <- model.matrix(~0+group) > colnames(design) <- levels(group) # estimate dispersion and the BCV > y <- estimateDisp(y, design, robust=TRUE) > y$common.dispersion > plotBCV(y) > fit <- glmQLFit(y, design, robust=TRUE) > head(fit$coefficients) > plotQLDisp(fit) > y_logrpkm_df <- cbind(names=rownames(y_logrpkm_df), y_logrpkm_df) > colnames(y_logrpkm_df) <- c('GeneID’,’sample_a’, ’sample_b’, ‘sample2_a’ ‘sample2_b’) > y_rpkm_df <- cbind(names=rownames(y_rpkm_df), y_rpkm_df) > colnames(y_rpkm_df) <- c('GeneID’,’sample_a’, ’sample_b’, ‘sample2_a’ ‘sample2_b’) # define all pairwise comparisons of interest > my.contrasts <- makeContrasts(samplevssample2 = sample2-sample, levels=design) # for each comparison defined > qlf_samplevssample2 <- glmQLFTest(fit, contrast=my.contrasts[,’samplevssample2’]) > samplevssample2 <- topTags(qlf_samplevssample2, n=Inf, sort.by="PValue", adjust.method='BH') > samplevssample2_table <- join(samplevssample2$table, y_rpkm_df[,c('GeneID’,’sample_a’, ’sample_b’, ‘sample2_a’ ‘sample2_b’)], by='GeneID', type='inner', match='all') > samplevssample2_table$AvRPKM.sample<- (samplevssample2_table$sample_a + samplevssample2$sample_b)/2 > samplevssample2_table$AvRPKM.sample2 <- (samplevssample2_table$sample2_a + samplevssample2_table$sample2_b)/2 > saveRDS(samplevssample2_table, "QLFandRPKM_samplevssample2.rds") # define lists of up- and down regulated genes > down_samplevssample2 <- subset(samplevssample2_table, logFC <=(-1) & FDR <0.01) > down <- down_samplevssample2$GeneID > up_samplevssample2 <- subset(samplevssample2_table, logFC >=1 & FDR <0.01) > up <- up_samplevssample2$GeneID > change <- c(down, up) # save file of up/down genes > com1 <- rbind(down_samplevssample2, up_samplevssample2) > write.table(com1, file="DEgenes.txt", sep="\t", quote=FALSE) # generate p-value histogram > png(filename='UnadjPValhistogram.png', width=1600, height=1600, units='px', res=300) > ggplot(samplevssample2$table, aes(PValue)) + geom_histogram(fill='darkgrey') + theme(panel.background=element_blank(), panel.grid.major=element_line(colour='lightgrey', linetype='dotted'),panel.grid.minor=element_line(colour='lightgrey', linetype='dotted'), panel.border=element_rect(colour='black', fill=NA, size=1)) + xlab("unadjusted p-values sample vs sample2") > dev.off() # generate volcano plot > samplevssample2_volcano <- subset(samplevssample2_table, select=c(GeneID,logFC,PValue, FDR)) > samplevssample2$minlog10p <- -log10(samplevssample2_volcano$PValue) > samplevssample2updown_volcano <- subset(samplevssample2_volcano, abs(logFC)>=1 & FDR<0.01) > png(filename='Volcano.png', width=1600, height=1600, units='px', res=300) > ggplot() + geom_point(data=samplevssample2_volcano, aes(logFC, minlog10p), colour='grey', size=1) + geom_point(data=samplevssample2updown_volcano, aes(logFC, minlog10p), colour='#4DBBD5B2', size=1)+ theme(panel.background=element_blank(), panel.grid.major=element_line(colour='lightgrey',linetype='dotted'),panel.grid.minor=element_line(colour='lightgrey', linetype='dotted'), panel.border=element_rect(colour='black', fill=NA, size=1)) + xlab("log2FC") + ylab("-log10 p-values") + labs(title="sample vs sample2") + geom_vline(xintercept=0, colour="red") > dev.off() # generate MA plot to also include expression values > samplevssample2_ma <- subset(samplevssample2_table, select=c(GeneID,logFC, logCPM, PValue, FDR)) > samplevssample2updown_ma <- subset(samplevssample2_ma, abs(logFC)>=1 & FDR<0.01) > png(filename='MAplot_samplevssample2.png', width=1600, height=1600, units='px', res=300) > ggplot() + geom_point(data=samplevssample2_ma, aes(logFC, logCPM), colour='grey', size=1) + geom_point(data=samplevssample2updown_ma, aes(logFC, logCPM), colour='#4DBBD5B2', size=1)+ theme(panel.background=element_blank(), panel.grid.major=element_line(colour='lightgrey',linetype='dotted'),panel.grid.minor=element_line(colour='lightgrey', linetype='dotted'), panel.border=element_rect(colour='black', fill=NA, size=1)) + xlab("log2FC") + ylab("log10CPM") + labs(title=“sample vs sample 2”) + geom_vline(xintercept=0, colour="red") > dev.off() ### mars1 dependence ### # remove genes with average RPKM =< 2.5 in less than 2 conditions > y_rpkm_df$sample <- (y_rpkm_df$sample_a + y_rpkm_df$sample_b)/2 > y_rpkm_df$sample2 <- (y_rpkm_df$sample2_a + y_rpkm_df$sample2_b)/2 > avrpkm <- y_rpkm_df[,5:6] > strong <- subset(avrpkm, rowSums(avrpkm > 2.5) >=2) > saveRDS(strong, "Avg_RPKMvalues_stringent.rds") > TP <- rownames(strong) # DE gene set with FDR<0.001 and abs(logFC)>=2. Calculate this for each condition > colnames(samplevssample2_table) <- c("GeneID","Length","logFC","logCPM","F","PValue","FDR”,”sample_a”,”sample_b”,”sample2_a”,”sample2_b”,”AvRPKM_sample”,”AvRPKM_sample2") > sub <- samplevssample2_table[which(samplevssample2_table$GeneID %in% TP), ] > DE <- subset(sub, abs(logFC) >= 2 & FDR <0.001) # DE common in or unique to different treatments (ClpP, HL) in WT. # Suppose DE_ClpP is for those differentially expressed upon ClpP, and DE_HL for those upon high light > dt_ClpP <- data.table(DE_ClpP, key=c("GeneID","Length")) > dt_HL <- data.table(DE_HL, key=c("GeneID","Length")) > only_ClpP <- dt_ClpP[!dt_HL] > only_HL <- dt_HL[!dt_ClpP] > combo <- join(DE_ClpP, DE_HL, by=c("GeneID","Length"), type="inner", match="all") > stress_up <- subset(combo, HL.logFC >= 2 & HL.FDR <0.001 & ClpP.logFC >= 2 & ClpP.FDR <0.001) > stress_down <- subset(combo, HL.logFC <= -2 & HL.FDR <0.001 & ClpP.logFC <= -2 & ClpP.FDR <0.001) > DE_all <- rbind(stress_up, stress_down) # define subsets of mars1-(in)dependent genes, eg for genes only DE in ClpP # suppose DEgenes_mars1.t0vsClpP is the subset of DE genes in mars1.t0 vs mars1.ClpP > temp1 <- join(only_ClpP, DEgenes_mars1.t0vsClpP, by=c("GeneID","Length"), type="inner", match="all") > colnames(temp1) <- c("GeneID","Length","WT.ClpP.logFC","WT.ClpP.logCPM","WT.ClpP.F","WT.ClpP.PValue","WT.ClpP.FDR","RPKM_WT.t0_a","RPKM_WT.t0_b","RPKM_WT.ClpP_a","RPKM_WT.ClpP_b","AvRPKM.WT.t0","AvRPKM.WT.ClpP", "mars1.ClpP.logFC","mars1.ClpP.logCPM","mars1.F","mars1.ClpP.PValue","mars1.ClpP.FDR","RPKM_mars1.t0_a","RPKM_mars1.t0_b","RPKM_mars1.ClpP_a","RPKM_mars1.ClpP_b","AvRPKM.mars1.t0","AvRPKM.mars1.ClpP") > onlyClpP_upinboth <- subset(temp1, WT.ClpP.logFC >= 2 & WT.ClpP.FDR <0.001 & mars1.ClpP.logFC >=2 & mars1.ClpP.FDR < 0.001) > onlyClpP_downinboth <- subset(temp1, WT.ClpP.logFC <= -2 & WT.ClpP.FDR <0.001 & mars1.ClpP.logFC <= -2 & mars1.ClpP.FDR < 0.001) > onlyClpP_notmars1dep <- merge(onlyClpP_upinboth, onlyClpP_downinboth, all=TRUE) > saveRDS(onlyClpP_notmars1dep, "DEinClpPonly_mars1independent.rds") > write.table(onlyClpP_notmars1dep, file="DEinClpPonly_mars1independent.txt", sep="\t", quote=F) > onlyClpP_mars1dep_up <- subset(temp1, WT.ClpP.logFC >= 2 & WT.ClpP.FDR <0.001 & mars1.ClpP.logFC <1) > onlyClpP_mars1dep_down <- subset(temp1, WT.ClpP.logFC <= -2 & WT.ClpP.FDR <0.001 & mars1.ClpP.logFC > -1) > onlyClpP_mars1dep <- merge(onlyClpP_mars1dep_up, onlyClpP_mars1dep_down, all=TRUE) > saveRDS(onlyClpP_mars1dep, "DEinClpPonly_mars1dependent.rds") > write.table(onlyClpP_mars1dep, file="DEinClpPonly_mars1dependent.txt", sep="\t", quote=F) # heatmaps # create table with logFC only > rownames(onlyClpP_notmars1dep) <- onlyClpP_notmars1dep$GeneID > onlyClpP_indepFC <- onlyClpP_notmars1dep[,c(3,14)] > colnames(onlyClpP_indepFC) <- c("WT.ClpP","mars1.ClpP") > onlyClpP_indepFC$dependency <- rep(0, nrow(onlyClpP_indepFC)) > rownames(onlyClpP_mars1dep) <- onlyClpP_mars1dep$GeneID > onlyClpP_depFC <- onlyClpP_mars1dep[,c(3,14)] > colnames(onlyClpP_depFC) <- c("WT.ClpP","mars1.ClpP") > onlyClpP_depFC$dependency <- rep(1, nrow(onlyClpP_depFC)) > onlyClpP_FC <- rbind(onlyClpP_depFC, onlyClpP_indepFC) > onlyClpP_FCmatrix <- as.matrix(onlyClpP_FC) > palette <- rev(colorRampPalette(brewer.pal(10, "RdBu"))((256))) > png(filename='Heatmap_onlyClpP_mars1dependence.png', width=1600, height=1600, units='px', res=300) > heatmap.2(onlyClpP_FCmatrix, Colv=NA, trace="none", col=palette, scale="none", density.info="none", dendrogram="row", cexRow=0.2, cexCol=0.5, margins=c(5,20), keysize=0.75, key.par = list(cex=0.5)) > dev.off() # heatmaps of defined pathway subsets, eg autophagy, downloaded from Phytozome v12.1 > PS_table <- read.table("Chlamy_GeneSetsByFunction_autophagy.txt", sep="\t", header=FALSE) > PS_tablen <- as.vector(PS_table[,"V1"]) # double-check that it is class 'character' > class(PS_tablen) > PS_FC <- hm_FC2[PS_tablen,] # remove rows without expression data (with NAs) > PS_FCc <- PS_FC[complete.cases(PS_FC),] > write.table(PS_FCc, "logFC_table_autophagy.txt", sep="\t", quote=F) > PS_FCm <- as.matrix(PS_FCc) > png(filename='Heatmap_autophagy.png', width=1600, height=1600, units='px', res=300) > heatmap.2(PS_FCm, Colv=NA, trace="none", col=palette, scale="none", density.info="none", dendrogram="row", cexRow=0.7, cexCol=0.7, margins=c(5,20), keysize=0.75, key.par = list(cex=0.5)) > dev.off() # to find the order of the heatmap rows > PS_heat <- heatmap.2(PS_FCm, Colv=NA, trace="none", col=palette, scale="none", density.info="none", dendrogram="row", cexRow=0.7, cexCol=0.7, margins=c(5,20), keysize=0.75, key.par = list(cex=0.5)) > PSrow<- PS_heat$rowInd > PS_FCc$GeneID <- row.names(PS_FCc) > PSorder <- PS_FCc[PSrow,7] > PSorderrev <- rev(PSorder) > write.table(PSorderrev, "autophagy_rowIDsclustered.txt", sep="\t", quote=F) # adapt files for import in MapMan > mapman <- readRDS("DEinClpPonly_mars1dependent.rds") > mapman$GeneID <- gsub(".v5.5","",mapman$GeneID) > mapman$GeneID <- gsub("C","c",mapman$GeneID) > mapman2 <- mapman[,c(1,3,14,25,36,47,58)] > write.table(mapman2, "Mapman_DEinClpPonly_mars1dependent.txt", sep="\t", quote=F, row.names=F)