WGCNA is an open source package for R available at https://horvath.genetics.ucla.edu/html/ Citation: Langfelder, P. and Horvath, S. WGCNA: an R package for weighted correlation network analysis. BMC Bioinformatics. 9:559 (2008). The following is the open source code from the website used by AMPEL BioSolutions. Log2 normalized microarray expression values for WB, PBMC, purified T cell, B cell or monocyte datasets were filtered using an IQR to remove saturated probes with low variability between samples and used as inputs to WGCNA (V1.51). WGCNA READ ME This folder contains operational (6 parts) and shared R scripts used for WGCNA analysis by AMPEL BioSolutions. Each time you start a new analysis, clone deseq/part0-eset.R, wgcna/active_vst/part1,part2 etc., and the /wgcna/active_vst/bigc... data. You can change the name of the active_vst folder depending on your analysis Part 0 -- eset: This script normalizes raw counts data via several methods and generates corresponding esets. Upload your raw counts and clinical data file to the deseq folder before you begin Part 1 -- inputs: This script runs some quality control and looks for outliers Part 1B -- clinical data: This scripts converts clinical traits to numerical values for downstream correlations Part 2 -- SFT: This script helps to identify the best soft threshold power for part 3 Part 3 -- TOM: This script generates TOM files (topological overlay matrix) for downstream analysis Part 4 -- cut trees: This script recuts the network generated in part 3 into 4 deep split options Part 5 -- correlate: This script correlates module eigengenes with clinical traits and annotates gene info for each deep split Part 6 -- excel results: This script takes the final deep split (chosen on the basis of agreement with dendrogram, clinical trait correlations, etc.) and generates a results spreadsheet ################################################################################################ ### WGCNA PART 1 # This script will create a datExpr data frame based on expression data from an eset. # For WGCNA the columns of datExpr will be probes and the rows will be arrays. The script # will also create a datTraits frame where columns are clinical traits and rows are arrays. # The row names of datExpr must match the row names of datTraits. # CURRENTLY, as of February 2017, AMPEL is using ONLY GCRMA normalized esets annotated with manufacturer CDFs (i.e., Affy, Brainarry or Illumina, etc.) ################################################################################################ ### LOAD AND CONFIGURE WGCNA LIBRARY library(Biobase) library(limma) library(WGCNA) options(stringsAsFactors = FALSE) allowWGCNAThreads() active.base <- "EXAMPLE_WGCNA" title.base <- "EXAMPLE_WGCNA" file.base <- "EXAMPLE_WGCNA" analysisDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/WGCNA_EXAMPLE/" esetDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/Expression_set_EXAMPLE/" globalDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/" projectDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/WGCNA_EXAMPLE/" scriptDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/WGCNA_EXAMPLE/Shared_R_Scripts/" setwd(analysisDir) save(globalDir,projectDir,scriptDir,esetDir,analysisDir,file.base,title.base,active.base, file=paste0(file.base,"_analysis_parameters.RData")) setwd(analysisDir) load(paste0(esetDir,"GSE88884_Batch-1_RMA-Normalized_ESET.Rdata")) eset <- eset_illum1_merged_BA rm(eset_illum1_merged_BA) pdata <- read.delim(paste0(esetDir,"GSE88884_Batch-1_RMA-Normalized_METADATA.txt"),stringsAsFactors = FALSE,sep = "\t",header = TRUE,row.names = 1) table(pdata$Batch) table(pdata$Cohort) table(pdata$Sex) table(pdata$SLEDAI) table(eset$batch) table(eset$group) table(eset$sex) sampleNames(eset) sampleNames(eset) <- substr(sampleNames(eset),1,nchar(sampleNames(eset))-4) #Remove the .CEL from the sample names # remove <- read.delim(paste0(projectDir,"GPL17586_control_probes.txt"),sep = "\t",header = FALSE) # index <- which(rownames(eset)%in%remove$V1) # eset <- eset[-index,] index <- which(pdata$Batch=="ILLUMINATE-2") pdata <- pdata[-index,] pdata[,"Cohort"] <- gsub("Normal","CTL",pdata[,"Cohort"]) index <- which(pdata$Cohort=="CTL") control <- pdata[index,] index <- which(control$Batch!="Additional_HCs") control <- control[index,] index <- which(pdata$Sex=="Male") pdata <- pdata[-index,] index <- which(!is.na(pdata$SLEDAI)) sle <- pdata[index,] pdata <- rbind(control,sle) table(pdata$Batch) table(pdata$Cohort) table(pdata$Sex) rownames(pdata) eset <- eset[,which(sampleNames(eset) %in% rownames(pdata))] pdata <- pdata[which(rownames(pdata) %in% sampleNames(eset)),] eset <- eset[,order(sampleNames(eset))] pdata <- pdata[order(rownames(pdata)),] identical(rownames(pdata),sampleNames(eset)) rownames(pdata) <- paste(pdata$Cohort, substr(pdata$Subject_ID,6,nchar(pdata$Subject_ID)), pdata$SLEDAI, sep = ".") sampleNames(eset) <- rownames(pdata) # outliers <- read.delim(paste0(analysisDir,"ILLUMINATE-1_Outliers.txt"),sep = "\t",header = TRUE) # index <- which(rownames(pdata)%in%outliers$ILLUMINATE.1) # pdata <- pdata[-index,] index <- which(sampleNames(eset)=="CTL.0744.NA"| sampleNames(eset)=="CTL.1561.NA"| sampleNames(eset)=="CTL.0515.NA"| sampleNames(eset)=="CTL.0055.NA"| sampleNames(eset)=="CTL.0236.NA") #Internal controls found to be males eset <- eset[,-index] index <- which(sampleNames(eset)=="CTL.1329.NA"| sampleNames(eset)=="CTL.0070.NA") #Additional internal controls to remove eset <- eset[,-index] # #Unnecessary for brainarray CDF eset # remove <- read.delim(paste0(projectDir,"GPL17586_control_probes.txt"),sep = "\t",header = FALSE) # index <- which(rownames(eset)%in%remove$V1) # eset <- eset[-index,] eset <- eset[,which(sampleNames(eset) %in% rownames(pdata))] pdata <- pdata[which(rownames(pdata) %in% sampleNames(eset)),] eset <- eset[,order(sampleNames(eset))] pdata <- pdata[order(rownames(pdata)),] identical(rownames(pdata),sampleNames(eset)) sampleNames(eset) <- rownames(pdata) pData(eset) <- pdata ###Create SYMBOL column required by some a4 functions colnames(fData(eset)) fData(eset)[,"SYMBOL"] <- as.character(fData(eset)[,"geneSymbol"]) #Take information from the featureData (fData) in the eset fData(eset)[,"ENTREZ"] <- as.character(fData(eset)[,"geneEntrezID"]) #Take information from the featureData (fData) in the eset fData(eset)[,"ENSEMBL"] <- as.character(fData(eset)[,"geneEnsembl"]) #Take information from the featureData (fData) in the eset fData(eset) <- fData(eset)[,c("probe","SYMBOL","ENTREZ","ENSEMBL")] ###REMOVE AFFY PROBES index <- which( substr(rownames(eset),1,4)!="AFFX" ) eset <- eset[index, ] rm(index) ####################################################################################### # Plot PCA setwd(analysisDir) library(affycoretools) # options(rgl.useNULL=TRUE) # library(rgl) eset$Cohort <- factor(eset$Cohort) pdf("PCA_Pre-filter.pdf",width = 16, height = 9) plotPCA(eset, groups=as.numeric(eset$Cohort), groupnames=levels(eset$Cohort), main="Pre-filter", pcs=c(1,2), plot3d=FALSE, col=c("red","blue"), pch=c(16,17), addtext=sampleNames(eset)) plotPCA(eset, groups=as.numeric(eset$Cohort), groupnames=levels(eset$Cohort), main="Pre-filter", pcs=c(1,3), plot3d=FALSE, col=c("red","blue"), pch=c(16,17), addtext=sampleNames(eset)) plotPCA(eset, groups=as.numeric(eset$Cohort), groupnames=levels(eset$Cohort), main="Pre-filter", pcs=c(3,2), plot3d=FALSE, col=c("red","blue"), pch=c(16,17), addtext=sampleNames(eset)) dev.off() # plotPCA(eset, groups=as.numeric(eset$Cohort), groupnames=levels(eset$Cohort), # pcs=c(1,2,3), plot3d=TRUE, col=c("red","blue","orange"), pch=c(16,17,18)) ####################################################################################### # Draw a quick dendrogram to look for outliers in the unfiltered set library(flashClust) sampleTree <- flashClust(dist(t(exprs(eset))), method = "average"); # Plot the sample tree: Open a graphic output window of size 12 by 9 inches # The user should change the dimensions if the window is too large or too small. # sizeGrWindow(12,9) # pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9); pdf("Dendrogram_Pre-filter.pdf",width = 16, height = 9) par(cex = 0.9); par(mar = c(0,4,2,0)) plot(sampleTree, main = "Pre-filter", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5) dev.off() ### STOP: IF OUTLIERS ARE DETECTED IN THE DENDROGRAM, USE THIS # Find clusters cut by the line and plot a line to show the cut # abline(h = 115, col = "red"); #Determine cluster under the line # clust <- cutreeStatic(sampleTree, cutHeight = 115, minSize = 10) # table(clust) # keepSamples <- (clust==1) #Cluster 1 contains the samples we want to keep. # eset <- eset[,keepSamples] #SLE-GSM260919 was removed as an outlier # nrow(eset) # ### ALTERNATIVELY: IF OUTLIERS ARE DETECTED IN THE DENDROGRAM, USE THIS # sampleNames(eset) # eset <- eset[,sampleNames(eset)!="SLE-GSM260919"] #Remove the outlier sample(s) by name ##### Determine the 50% intensity threshold nrow <- format(as.numeric(nrow(eset)),big.mark=",",scientific=F) #Make an object out of the number of rows that originally are in the eset WB <- hist(rowMeans(exprs(eset)), breaks=1000, xlab="Average log2 Probe Intensity", main="Illuminate-1 Active Female Reanalyzed WB SLE All Intensities", sub=paste(nrow, "total probes"), xlim=c(2,15), col="yellow") nrow(exprs(eset)) #70,492 (transcript cluster) 0.50*nrow(exprs(eset)) #33,764 wb.counts <- as.data.frame(WB$counts) sum(wb.counts[1:414,]) #Find the break where the lower 50% of probes are removed wb.breaks <- as.data.frame(WB$breaks) cut.off <- as.data.frame(wb.breaks[414,]) #5.66 dev.off() #Clear any graphs # par(mfrow=c(1,2)) #Setting graphics parameters A vector of length 2, where the first argument specifies the number of rows and the second the number of columns of plots nrow <- format(as.numeric(nrow(eset)),big.mark=",",scientific=F) #Make an object out of the number of rows that originally are in the eset pdf("Probe-Intensity-Threshold.pdf",height = 9,width = 16) hist(rowMeans(exprs(eset)), breaks=100, xlab="Average log2 Probe Intensity", main="GSE88884_Illuminate1-Active_ESET_RMA_Affy: WB, ALL Probes", sub=paste(nrow, "total probes"), xlim=c(2,15), col="yellow") abline(v=5.04, lty=1, lwd=3, col="red") dev.off() ##### Apply the low intensity threshold filter index <- which(rowMeans(exprs(eset))<5.66) #Get row numbers for probes with average expression less than 5.2 eset.genefiltered <- eset[-index,] #Remove the probes with average expression less than 5.2 nrow(eset.genefiltered) #Check the number of probes remaining 33855 eset <- eset.genefiltered # Interquartile range filtering, as proposed by PMID:27657141, entails selection of the maximal point on a histograme of IQR values across all probes # Here, that value for 12 SLE and 12 CTL is 0.265 # However, a universal IQR threshold for all datasets run on the same chip platform is optimal # pdf("probe IQR Filter Threshold.pdf",width = 16,height = 9) # iqr.df <- data.frame() # for(i in 1:nrow(exprs(eset))) { # row <- as.vector(exprs(eset)[i,]) # iqr.df[i,1] <- IQR(row) # } # rownames(iqr.df) <- rownames(exprs(eset)) # index <- which(iqr.df[,1]>0.05) # iqr.df <- iqr.df[index,,FALSE] # iqr.hist <- hist(iqr.df[,1], breaks=1000, xlab="probe IQRs", # main="probe Density at IQR Intervals", # sub=paste(nrow(iqr.df), "total probes"), xlim=c(0,1), col="yellow") # iqr.max <- which(iqr.hist$counts==max(iqr.hist$counts)) # iqr <- as.vector(iqr.hist$mids)[iqr.max] # abline(v=iqr, lty=2, lwd=2, col="red") # dev.off() # iqr.df <- iqr.df[which(iqr.df[,1]>iqr),,FALSE] # eset.filtered <- eset[which(rownames(eset)%in%rownames(iqr.df)),] # nrow(eset.filtered) # eset <- eset.filtered # rm(eset.filtered) ####################################################################################### ###Create the datExpr0 frame for WGCNA datExpr0 <- exprs(eset) #Take out the expression data ONLY from the eset dataframe nrow(datExpr0) #Make sure the number of rows/probes matches the filtered eset # IMPORTANT! TRANSPOSE THE ORIGINAL EXPRESSION MATRIX # WGCNA requires columns be probes and rows be arrays datExpr0 <- t(datExpr0) ### CHECK FOR MISSING EXCESSIVE VALUES AND IDENTIFY OUTLIER MICROARRAY SAMPLES # Check for gene entries with NAs and very low counts gsg <- goodSamplesGenes(datExpr0, verbose = 3); gsg$allOK # Display and remove these bad entries if (!gsg$allOK) { # Optionally, print the gene and sample names that were removed: if (sum(!gsg$goodGenes)>0) badGenes <- colnames(datExpr0)[!gsg$goodGenes]; printFlush(paste("Removing genes:", paste(colnames(datExpr0)[!gsg$goodGenes], collapse = ", "))); if (sum(!gsg$goodSamples)>0) printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", "))); # Remove the offending genes and samples from the data: datExpr0 <- datExpr0[gsg$goodSamples, gsg$goodGenes] } else { badGenes<-NULL } # We must remove probe columns containing only one value. Failure to do so breaks downstream TOM creation, # and it must be performed now before module colors are assigned. datExpr1 <- datExpr0[,apply(datExpr0,2,function(x) any(c(FALSE,x[-length(x)]!=x[-1])))] ncol(datExpr0) - ncol(datExpr1) datExpr0 <- datExpr1 rm(datExpr1,gsg) # ##Remove bad probes eliminated by good genes function datExpr1 <- t(datExpr0) eset.wgcna <- eset[rownames(eset)%in%rownames(datExpr1),] nrow(eset.wgcna)==nrow(datExpr1) nrow(eset.wgcna) #Check the number of probes remaining setwd(analysisDir) save(eset.wgcna,file=paste0(file.base,"_eset_WGCNA.RData")) ####################################################################################### # Plot PCA RECHECK setwd(analysisDir) library(affycoretools) # options(rgl.useNULL=TRUE) # library(rgl) eset.wgcna$Cohort <- factor(eset.wgcna$Cohort) pdf("PCA_Post-filter.pdf",width = 16, height = 9) plotPCA(eset.wgcna, groups=as.numeric(eset.wgcna$Cohort), groupnames=levels(eset.wgcna$Cohort), main="Post-Filter", pcs=c(1,2), plot3d=FALSE, col=c("red","blue"), pch=c(16,17), addtext=sampleNames(eset.wgcna)) dev.off() # plotPCA(eset.wgcna, groups=as.numeric(eset.wgcna$Cohort), groupnames=levels(eset.wgcna$Cohort), # pcs=c(1,2,3), plot3d=TRUE, col=c("red","blue"), pch=c(16,17,17)) ####################################################################################### # Draw a quick dendrogram to look for outliers in the filtered set library(flashClust) sampleTree <- flashClust(dist(datExpr0), method = "average"); # Plot the sample tree: Open a graphic output window of size 12 by 9 inches # The user should change the dimensions if the window is too large or too small. # sizeGrWindow(12,9) pdf("Dendrogram_Post-filter.pdf",width = 16, height = 9) par(cex = 0.9); par(mar = c(0,4,2,0)) plot(sampleTree, main = "Post-Filter", sub="", xlab="", cex.lab = 1.5, cex.axis = 1.5, cex.main = 1.5) dev.off() datExpr <- datExpr0 ncol(datExpr) nGenes <- ncol(datExpr) # number of columns (probes) nSamples <- nrow(datExpr) # number of rows (arrays) rm(sampleTree, datExpr0, datExpr1, index) ########################################################################################### ## LOAD CLINICAL TRAIT DATA (Applies to both filtered and unfiltered expression sets) # Create a patients data frame analogous to expression data that will hold binary designations # of trait membership, or a numerical measurement of a continuous clinical variable patients.data <- pData(eset.wgcna) rownames(patients.data) #Are they the same as the datExpr object, minus any outliers? patients.data[is.na(patients.data)] <- "" patients.data <- as.data.frame(patients.data) colnames(patients.data) datTraits <- patients.data[,c("Cohort","SLEDAI","Age", "dsDNA_Level","dsDNA_IU", "C3_Level","C3_GperL", "C4_Level","C4_GperL")] # Create numerical factors of binary designations # Cohort table(patients.data$Cohort) datTraits$Cohort <- gsub("CTL",0,datTraits$Cohort) datTraits$Cohort <- gsub("SLE",1,datTraits$Cohort) datTraits$Cohort <- as.numeric(datTraits$Cohort) # SLEDAI table(patients.data$SLEDAI) datTraits$SLEDAI <- as.numeric(datTraits$SLEDAI) # Age table(patients.data$Age) datTraits$Age <- as.numeric(datTraits$Age) #dsDNA_Level table(patients.data$dsDNA_Level) datTraits$dsDNA_Level <- gsub("Negative",0,datTraits$dsDNA_Level) datTraits$dsDNA_Level <- gsub("Positive",1,datTraits$dsDNA_Level) datTraits$dsDNA_Level <- as.numeric(datTraits$dsDNA_Level) #dsDNA_IU table(patients.data$dsDNA_IU) datTraits$dsDNA_IU <- as.numeric(datTraits$dsDNA_IU) #C3_Level table(patients.data$C3_Level) datTraits$C3_Level <- gsub("N",0,datTraits$C3_Level) datTraits$C3_Level <- gsub("L",1,datTraits$C3_Level) datTraits$C3_Level <- gsub("H",2,datTraits$C3_Level) datTraits$C3_Level <- as.numeric(datTraits$C3_Level) #C3_GperL table(patients.data$C3_GperL) datTraits$C3_GperL <- as.numeric(datTraits$C3_GperL) #C4_Level table(patients.data$C4_Level) datTraits$C4_Level <- gsub("N",0,datTraits$C4_Level) datTraits$C4_Level <- gsub("L",1,datTraits$C4_Level) datTraits$C4_Level <- gsub("H",2,datTraits$C4_Level) datTraits$C4_Level <- as.numeric(datTraits$C4_Level) #C4_GperL table(patients.data$C4_GperL) datTraits$C4_GperL <- as.numeric(datTraits$C4_GperL) #Race table(patients.data$Race) datTraits$Race_Black <- patients.data$Race datTraits$Race_Black <- gsub("afr_amer",1,datTraits$Race_Black) datTraits$Race_Black <- gsub("amer_ind_ak_native",0,datTraits$Race_Black) datTraits$Race_Black <- gsub("asian",0,datTraits$Race_Black) datTraits$Race_Black <- gsub("multiple",0,datTraits$Race_Black) datTraits$Race_Black <- gsub("native_hw_pac_island",0,datTraits$Race_Black) datTraits$Race_Black <- gsub("white",0,datTraits$Race_Black) datTraits$Race_Black <- as.numeric(datTraits$Race_Black) datTraits$Race_Native <- patients.data$Race datTraits$Race_Native <- gsub("afr_amer",0,datTraits$Race_Native) datTraits$Race_Native <- gsub("amer_ind_ak_native",1,datTraits$Race_Native) datTraits$Race_Native <- gsub("asian",0,datTraits$Race_Native) datTraits$Race_Native <- gsub("multiple",0,datTraits$Race_Native) datTraits$Race_Native <- gsub("native_hw_pac_island",0,datTraits$Race_Native) datTraits$Race_Native <- gsub("white",0,datTraits$Race_Native) datTraits$Race_Native <- as.numeric(datTraits$Race_Native) datTraits$Race_White <- patients.data$Race datTraits$Race_White <- gsub("afr_amer",0,datTraits$Race_White) datTraits$Race_White <- gsub("amer_ind_ak_native",0,datTraits$Race_White) datTraits$Race_White <- gsub("asian",0,datTraits$Race_White) datTraits$Race_White <- gsub("multiple",0,datTraits$Race_White) datTraits$Race_White <- gsub("native_hw_pac_island",0,datTraits$Race_White) datTraits$Race_White <- gsub("white",1,datTraits$Race_White) datTraits$Race_White <- as.numeric(datTraits$Race_White) #Region table(patients.data$Region) datTraits$Region_Europe <- patients.data$Region datTraits$Region_Europe <- gsub("europe",1,datTraits$Region_Europe) datTraits$Region_Europe <- gsub("mex_ca_sa",0,datTraits$Region_Europe) datTraits$Region_Europe <- gsub("rest_world",0,datTraits$Region_Europe) datTraits$Region_Europe <- gsub("us_can",0,datTraits$Region_Europe) datTraits$Region_Europe <- as.numeric(datTraits$Region_Europe) datTraits$Region_South <- patients.data$Region datTraits$Region_South <- gsub("europe",0,datTraits$Region_South) datTraits$Region_South <- gsub("mex_ca_sa",1,datTraits$Region_South) datTraits$Region_South <- gsub("rest_world",0,datTraits$Region_South) datTraits$Region_South <- gsub("us_can",0,datTraits$Region_South) datTraits$Region_South <- as.numeric(datTraits$Region_South) datTraits$Region_North <- patients.data$Region datTraits$Region_North <- gsub("europe",0,datTraits$Region_North) datTraits$Region_North <- gsub("mex_ca_sa",0,datTraits$Region_North) datTraits$Region_North <- gsub("rest_world",0,datTraits$Region_North) datTraits$Region_North <- gsub("us_can",1,datTraits$Region_North) datTraits$Region_North <- as.numeric(datTraits$Region_North) # Match row names of datExpr and datTraits datExpr <- datExpr[ order(rownames(datExpr)), ] datTraits <- datTraits[ order(rownames(datTraits)), ] identical( rownames(datExpr), rownames(datTraits) ) #####Clean up the dataset and display final dendrogram with corresponding clinical factors collectGarbage(); # Re-cluster samples after outlier removal and display dendrogram sampleTree <- flashClust(dist(datExpr), method = "average") # Convert traits to a color representation: white means low, red means high, grey means missing entry traitColors <- numbers2colors(datTraits, signed = FALSE); # Plot the sample dendrogram and the colors underneath pdf("Dendrogram_Traits.pdf",width = 16, height = 9) plotDendroAndColors(sampleTree, traitColors, groupLabels = names(datTraits), main = "Post-Filter" ) dev.off() ### SAVE OBJECTS save(datExpr, datTraits, eset.wgcna, badGenes, file="part1-final-objects.RData") ################################################################################################ ### WGCNA PART 2 # This script uses the datExpr and datTraits to establish a soft threshold ################################################################################################ ### LOAD AND CONFIGURE WGCNA LIBRARY analysisDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/WGCNA_EXAMPLE/" library(WGCNA) options(stringsAsFactors = FALSE); allowWGCNAThreads() ################################################################################################ # Load files setwd(analysisDir) load("EXAMPLE_WGCNA_analysis_parameters.RData") load("part1-final-objects.RData") # Choose a set of soft-thresholding powers # First generate a sequence of powers to apply powers <- c(c(1:10), seq(from = 12, to=30, by=2)) # Call the network topology analysis function # BlockSize of 10,000 quickly ran on 8 VCPUs and 52 Gb memory sft <- pickSoftThreshold(datExpr, dataIsExpr = TRUE, powerVector = powers, verbose = 5, networkType="signed", corFnc = "bicor", moreNetworkConcepts = TRUE, blockSize=10000) #bicor is the only option pdf("SFT.pdf",width = 16,height = 9) par(mfrow = c(1,2)); cex1 <- 0.8; # font size # Scale-free topology fit index as a function of the soft-thresholding power plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], xlab=paste(active.base,"STP Selection",sep=" "),ylab=paste(active.base,"Scale Free Topology Model Fit, signed",sep=" "),type="n", main = paste("Scale independence")); text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels=powers,cex=cex1,col="red"); # Mean connectivity as a function of the soft-thresholding power plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n", main = paste("Mean connectivity")) text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red") dev.off() ###SELECT THE THRESHOLD NUMBER AT WHICH THE SFT MODEL FIT PLATEAUS ###NOTE: If graphs are unusual, consider re-filtering, OR if sample numbers are low, use the SFT values on the WGCNA FAQ page table save(sft, file="part2-output-SFT.RData") sft.fits <- sft$fitIndices write.table(sft.fits,file="part2-output-STP.txt", quote=FALSE, sep = "\t",row.names=FALSE, col.names=TRUE) # You will need to change the power and update the saveTOMFileBase with the current date ### WGCNA PART 3 ADJACENCY MATRIX GENERATION AND TOM ENCODING # # In general, the only parameters to be changed here are file naming and the power selection. # Also, this script must be run from the VM terminal, i.e. $sudo Rscript part3-TOM.R library(WGCNA) options(stringsAsFactors = FALSE); allowWGCNAThreads() setwd("~/Catalina-et-al_Nat-Comm_Suppl-Scripts/WGCNA_EXAMPLE/") load("EXAMPLE_WGCNA_analysis_parameters.RData") load("part1-final-objects.RData") corType = "pearson" power = 10 networkType = "signed" TOMType = "signed" deepSplit = 3 minModuleSize = 100 numericLabels = TRUE detectCutHeight = 1 mergeCutHeight = 0.2 pamStage = TRUE pamRespectsDendro = TRUE saveTOMFileBase = paste0(file.base,"_SP10_10K_28Aug2018") TOM.settings <- c(power, networkType, TOMType, deepSplit, minModuleSize, numericLabels, detectCutHeight, mergeCutHeight, pamStage, pamRespectsDendro, saveTOMFileBase) names(TOM.settings) <- c("power", "networkType", "TOMType", "deepSplit", "minModuleSize", "numericLabels", "detectCutHeight", "mergeCutHeight", "pamStage", "pamRespectsDendro", "saveTOMFileBase") save(TOM.settings,file=paste0(saveTOMFileBase,"_TOM_settings.RData")) net = blockwiseModules(datExpr, power = power, corType=corType, deepSplit = deepSplit, networkType = "signed", # ALWAYS SIGNED TOMType = "signed", # ALWAYS SIGNED minModuleSize = minModuleSize, numericLabels = TRUE, detectCutHeight = detectCutHeight, mergeCutHeight = mergeCutHeight, pamStage = pamStage, pamRespectsDendro = pamRespectsDendro, saveTOMs = TRUE, saveTOMFileBase = saveTOMFileBase, verbose = 5, maxBlockSize = 10000) # ALWAYS 10,000 save(net,file=paste0(saveTOMFileBase,"_NET.RData")) ### PART 4 - Recutting dendrogram trees ############################################################################################################### ### CONCEPTUAL OVERVIEW # We first recut the blocks of clustered weighted, co-expression gene using all four levels # of deep split (DS) and generally inspect the per-probe module assignment along with color # bars of correlation to all clinical traits accompanying the dataset. It is these four DS # recuts we scrutinize using subsequent part5 figures and calculations. ################################################################################################ ### LOAD SUPPORTING AMPEL WGCNA SCRIPTS # Load all scripts in shared scripts WGCNA directory # scriptPaths <- list.files(pattern="[.]R$", path="/mnt/disks/rserver/projects/CopyOfshared_scripts/wgcna", full.names=TRUE) # scriptPaths # source(scriptPaths) # rm(scriptPaths) ################################################################################################ ### LOAD LIBRARIES AND CONFIGURE SESSION library(WGCNA) library(Hmisc) # contains capitalize function library(Biobase) options(stringsAsFactors = FALSE); allowWGCNAThreads() ################################################################################################ ### LOAD STUDY DATA setwd(analysisDir) load("part1-final-objects.RData") eset <- eset.wgcna # Helps confirm we're loading the eset resulting from WGCNA variance filtering rm(eset.wgcna) ################################################################################################ ### LOAD TOM SETTINGS & MODULE AND COLOR ASSIGNMENTS load(paste0(saveTOMFileBase,"_TOM_settings.RData")) load(paste0(saveTOMFileBase,"_NET.RData")) ################################################################################################ projectName <- active.base save(projectName,file="projectName.RData") pData <- pData(eset) fData <- fData(eset) minModuleSize.original <- as.numeric(TOM.settings["minModuleSize"]) projectDir <- getwd() # Save original working directory to return to after directory traversal TOMfiles <- net$TOMFiles ################################################################################################ ## CALL THE SCRIPT TO ADD COLOR NAMES TO WGNCA MODULES GENERATED DURING TOM CREATION ## Note we'll be regenerating these anyways during the recut, so this is only ## really used for dianostics source(paste0(scriptDir,"wgcna.net.colornames.R")) ############################################################################################################### ## RECUT THE TOM NETWORK BLOCKS USING FOUR LEVELS OF DEEP SPLIT ## This repeats blockwise module detection from the pre-calculated TOM ## Patience, these recuts take 2-3 minutes each, even on the cloud. ## SET POTENTIAL NEW RECUT PARAMETERS. SEE WGCNA DOCUMENTATION FOR ADDITIONAL INFORMATION # Here we have the opportunity to experiment specficially usage of the PAM stage, adjustment of # module cut height, minimum module size, and of course the four levels of deep split (DS1-4). # Note PAM stage is conducted as a second pass to assign modules not first assigned by the general recut, # and historically it's always been used. recut.corType <- "pearson" # pearson is safest. bicor is other common option but usually results in many unassigned probes recut.pamStage <- TRUE # Usually true recut.pamRespectsDendro <- TRUE # Always true if using PAM stage recut.detectCutHeight <- 1 # Usually 1 for most experimentss, adjust carefully!! recut.minModuleSize <- 100 # Experiment with this carefully! Raise as needed for sets with large numbers of probes/genes recut.mergeCutHeight <- 0.2 # Usually 0.2. Module merging can occur downstream experimentName.recut <- paste("recut.","STP",TOM.settings["power"],".pam",recut.pamStage,".minMod-", recut.minModuleSize,sep="") experimentName.recut save(experimentName.recut, file="experimentName.recut.RData") ############################################################################################################### ############################### ### CALL THE RECUT BLOCKS SCRIPT. THIS MAY TAKE UP TO ~10 MINUTES EVEN ON THE CLOUD source(paste0(scriptDir,"wgcna.blocks.recut.v2.R")) ############################### # SAVE THE RECUT OBJECTS. IMPORTANT!!! ALL OUTPUT AND FIGURES RELATED TO A RECUT OF THE # MASTER TOM ARE STORED IN A PROJECT SUBDIRECTORY!!! setStorageDir <- dget(paste0(scriptDir,"wgcna/set.storage.directory.R")) outputStorage <- setStorageDir( experimentName.recut ) save( moduleColors, moduleColors.DS1, moduleColors.DS2, moduleColors.DS3, moduleColors.DS4, moduleLabels, moduleLabels.DS1, moduleLabels.DS2, moduleLabels.DS3, moduleLabels.DS4, MEs, MEs.labels, MEs.DS1, MEs.DS1.labels, MEs.DS2, MEs.DS2.labels, MEs.DS3, MEs.DS3.labels, MEs.DS4, MEs.DS4.labels, colorLevels, colorLevels.DS1, colorLevels.DS2, colorLevels.DS3, colorLevels.DS4, file="recut.DS-DS1-DS4.modules.and.genes.RData") setwd(analysisDir) ############################################################################################################### ### PLOT THE BLOCK DENDROGRAMS OF RECUT PROBES outputStorage <- setStorageDir( c(experimentName.recut,"plots") ) pngWidth=1500 # width of output PNGs pngHeight=1000 # height of output PNGs marAll = c(1, 7, 3, 1) # bottom, left, top and right margins source(paste0(scriptDir,"wgcna.blocks.dendro.with.traitbars.R")) ## PART 5 Correlate modules to clinical traits and select final modules saveTOMFileBase = paste0(file.base,"_SP10_10K_28Aug2018") #### change sigTrait <- "SLEDAI" #### change # You will need to check the rownames of the deepsplits ~ line 100 # You will also need to change your final choice of deep split and other options ~ line 400 ############################################################################################################### ### CONCEPTUAL OVERVIEW # We first recut the blocks of clustered weighted, co-expression gene using all four levels # of deep split (DS) and generally inspect the per-probe module assignment along with color # bars of correlation to all clinical traits accompanying the dataset. It is these four DS # recuts we scrutinize using these subsequent figures and calculations in order to choose the # deep split that provides the most valuable modules. # # 1. We correlate the module eigengenes (MEs) of each DS to clinical traits. Here we're looking for the maximum # number of modules significantly correlated to our clinical trait of greatest interest, usually SLEDAI, but # all correlations are later stored and reported in heatmaps and data tables. # # 2. We annotate genes within the four DS modules using both GO and AMPEL BIG-C categories, # with the goal of looking for a DS that yields modules discretely capturing # single categories. # # 3. Dendrograms are generated for each deep split, which are used more to help establish if modules are too # small or need to be merged. The case to merge modules can also be built by inspecting how much # GO or BIGC annotation they share. # # 4. Heatmaps are automatically generated for every module in every deepsplit, and stored in a subdirectory. # # 5. We move from the module level to the gene level, calculating the kME & kIM and look at per-module # statistics and figures of these two. # # 6. Recall the ME is calculated using the average gene counts of all samples, so for each module we inspect # the contribution of each sample to that module's averaged counts. Historically we've found many modules are # generated due to the contribution of a single sample very strongly expressing those genes. Clearly these # modules must be disregarded because they contain genes that don't adequately describe the difference # between diseased and normal # # 7. Per ME we inspect the scatterplots of module membership (kIM) vs. correlation to significant clinical trait. # This is more a visual confirmation of "module quality", a characteristic that one day needs to be quantified. # # 8. We include a similar scatterplot of the grey module, ensuring interesting genes weren't lost to the dump module. # # 9. The genes with gene annotation, module color assignment, gene-level correlations to clinical traits, # kMEs and kIMs are bound into the final geneLists. # # 10. geneLists are output to a Microsoft Excel workbook, including a summary sheet, room for figures, # and workseet for each module gene list. # ############################################################################################################### library(WGCNA) library(Hmisc) # contains capitalize function library(a4) options(stringsAsFactors = FALSE); allowWGCNAThreads() # Load project level output setwd(analysisDir) load("part1-final-objects.RData") eset <- eset.wgcna rm(eset.wgcna) pData <- pData(eset) fData <- fData(eset) load(paste0(saveTOMFileBase,"_NET.RData")) load(paste0(saveTOMFileBase,"_TOM_settings.RData")) load("projectName.RData") # Load recut module output load("experimentName.recut.RData") experimentDir<-paste0(analysisDir,experimentName.recut) setwd(experimentDir) load("recut.DS-DS1-DS4.modules.and.genes.RData") ######################################################################################## # SET CLINICAL TRAIT OF MOST INTEREST, USUALLY SLEDAI. ALSO SET GLOBAL P-VAL p_val <- 0.2 moduleColors.batch <- c("moduleColors", "moduleColors.DS1", "moduleColors.DS2", "moduleColors.DS3", "moduleColors.DS4") moduleCor.batch <- c("moduleCor","moduleCor.DS1","moduleCor.DS2","moduleCor.DS3","moduleCor.DS4") MEs.batch <- c("MEs", "MEs.DS1", "MEs.DS2", "MEs.DS3", "MEs.DS4") ######################################################################################## # DS-DS1-DS4 Calculate module correlations to clinical traits # as the modules originally created during TOM generation. The "DS-orig" is almost always a DS of 3, # but I do refer to it here as simply "DS", followed by the four recuts of DS1, DS2, DS3, DS4 (DS1-DS4) # In theory the deepsplit function within the TOM() function should generate results identical to # those from the recut() function! colnames(eset) rownames(datTraits) rownames(datExpr) rownames(MEs) rownames(MEs.DS1) rownames(MEs.DS2) rownames(MEs.DS3) rownames(MEs.DS4) #MEs.DS1<-MEs.DS1[order(rownames(MEs.DS1),decreasing=TRUE),] #MEs.DS2<-MEs.DS2[order(rownames(MEs.DS2),decreasing=TRUE),] #MEs.DS3<-MEs.DS3[order(rownames(MEs.DS3),decreasing=TRUE),] #MEs.DS4<-MEs.DS4[order(rownames(MEs.DS4),decreasing=TRUE),] # Graphics parameters for module corr saveFile <- TRUE # Specify if output plots will be saved plotMar <- c(8, 6, 4, 2) xLabels <- colnames(datTraits) xColorOffset <- 0 xLabelsAngle <- 45 cex.lab.x <- 1 cex.lab.y <- 0.5 text.size <- 0.2 pngWidth <- 1200 pngHeight <- 1000 setwd(analysisDir) setStorageDir <- dget(paste0(scriptDir,"wgcna/set.storage.directory.R")) outputStorage <- setStorageDir( c(experimentName.recut,"plots" ) ) source(paste0(scriptDir,"wgcna.cor.modules.traits.R")) rm(saveFile,plotMar,xLabels,xColorOffset,xLabelsAngle,cex.lab.x,cex.lab.y,text.size) setwd(analysisDir) ######################################################################################## # GET A QUICK SNAPSHOT OF SIGNIFICANT CORRELATION TO CLINICAL TRAIT OF MOST INTEREST sig.orig <- length(which(moduleCor[,paste(sigTrait,".p",sep="")]% group_by(ENTREZ) %>% top_n(1,kWithin)}) library (plyr) df <- ldply (data.list, data.frame) # Turn list back into a dataframe colnames(df)[4] <-"probe" df <- df[,-1] colnames(geneInfo)[3] <- "probe" rownames(geneInfo) <- geneInfo$probe rownames(df) <- df$probe #Now it looks just like geneInfo df <- df[!is.na(df$ENTREZ),] # Remove all unannotated probes df<-df[!is.na(df$SYMBOL),] ws <- createSheet(wb=wb, sheetName="Correlations") moduleCor <- as.data.frame(moduleCor) moduleCor <- moduleCor[order(rownames(moduleCor)),] df$moduleColor <- as.character(df$moduleColor) mod.counts <- as.data.frame(table(df$moduleColor)) mod.counts <- mod.counts[order(mod.counts$Var1),] rownames(mod.counts) <- mod.counts$Var1 moduleCor <- moduleCor[which(rownames(moduleCor)!="grey"),] identical(rownames(moduleCor),rownames(mod.counts)) moduleCor$moduleCount <- mod.counts$Freq addDataFrame(x=moduleCor, sheet=ws, row.names=TRUE,col.names=TRUE) ###################################################################### # NOW ADD INDIVIDUAL MODULE SHEETS IN WORKBOOK addDataFrame(x=moduleCor, sheet=ws, row.names=TRUE,col.names=TRUE) for (i in 1:length(color_index) ) { single.mods <- df[ df$moduleColor==color_index[i], ] single.mods <- single.mods[ !is.na(single.mods$moduleColor), ] single.mods <- single.mods[ order(single.mods$SLEDAI.p), ] ######################MODIFY the primary clinical trait nrow(single.mods) ws <- createSheet(wb=wb, sheetName=paste(toupper(color_index[i]))) addDataFrame(x=single.mods, sheet=ws, row.names=FALSE) } ###################################################################### # SAVE THE WORKBOOK setwd(experimentDir) saveWorkbook(wb, paste0(title.stem,".xlsx")) write.table(df, paste0(title.stem,".txt")) ################################################################## WGCNA SHARED R SCRIPTS ################################################################################################### # WGCNA PLOT RECUT DEEPSPLIT MODULES ALONG WITH CLINICAL TRAIT CORRELATIONS ################################################################################################### # Correlate probe intensities / normalized gene counts to binary or continuous clinical traits # and convert correlation values to a blue-white-red color palette for a color bar #colnames(datTraits) # Display clinical traits that will be plotted as correlation bars traitBar <- colnames(datExpr) for(i in 1:ncol(datTraits)) { trait <- as.data.frame(datTraits[,i]) names(trait) <- colnames(datTraits)[i] trait.corr <- as.numeric(cor(datExpr, trait, use = "p")) # Here is the correlation work trait.corr.color <- data.frame(numbers2colors(trait.corr, signed = T)) names(trait.corr.color) <- colnames(datTraits)[i] traitBar <- cbind(traitBar,trait.corr.color) } print(paste("Genes correlated to clinical traits have been encoded as color bars.")) rm(trait,trait.corr,trait.corr.color,i) traitBar.recut <- data.frame(cbind(moduleColors.DS1,moduleColors.DS2, moduleColors.DS3,moduleColors.DS4)) names(traitBar.recut) <- c("DS1","DS2","DS3","DS4") traitBar.recut <- cbind( traitBar.recut, traitBar[,2:ncol(traitBar)]) ################################################################################################### # Loop through the blocks generated by TOM network encoding of the adjacency network and # generate a dendrogram with module colors and clinical traits bars for (i in 1:length(net$TOMFiles)) { blockNumber = i title <- paste(experimentName.recut," DS 1-4 Block ",i,"/",length(net$TOMFiles),sep="") # Combine the gene correlation colors into a single frame datColors = traitBar.recut[net$blockGenes[[blockNumber]],] # Plot the dendrogram and the module colors underneath png(paste(experimentName.recut," block ",i," of ",length(net$TOMFiles),".png",sep=""), pngWidth, pngHeight, pointsize=20) datColors = traitBar.recut[net$blockGenes[[blockNumber]],] plotDendroAndColors(net$dendrograms[[blockNumber]], colors = datColors, groupLabels = colnames(traitBar.recut), dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05, marAll = marAll, saveMar = TRUE, # bottom, left, top and right main=title) print(paste("Block",i,"dendrogram with trait bars figure saved")) dev.off() } setwd(projectDir) #rm(traitBar, i) ## Fix up maximal allowed permissions in the file tree # Sys.chmod(list.dirs("."), "777") # f <- list.files(".", all.files = TRUE, full.names = TRUE, recursive = TRUE) # Sys.chmod(f, (file.info(f)$mode | "664")) # rm(f) ############################################################################################## WGCNA SET STORAGE DIRECTORY # Set directory path to save these plots in, formatted as # /mnt/disks/rserver/projects/projectName/wgcna/DSpower.pamBool.minMod-Num/plots/blocks/ # The function will traverse into the final directory, creating them along the way as needed setStorageDir <- function (storageDir) { # Create directories if they don't exist. This is a primitive script that works by moving into or creating # directories using dir.create. Note I'm not able to pass a directory path to this function, such as # dir.create("plots/blocks/DS10.pamTRUE.minMod-100"), so have to generate these one at time. newDir <- vector('character') for (i in 1:length(storageDir)) { newDir <- storageDir[i] if (!dir.exists(newDir)) { dir.create(newDir, showWarnings=TRUE, mode="0777") print(paste("Sub-directory",newDir,"created.")) } else { print(paste("Sub-directory",paste(getwd(),"/",newDir," exists.",sep="") )) } setwd( paste(getwd(),newDir,sep="/") ) } print(paste("Current storage directory: ",getwd(),sep="")) rm(storageDir,newDir,i) } ############################################################################################ ################################################################################################### # PLOT A HEATMAP OF TRANSCRIPT EXPRESSION PER MODULE # of correlation of most interesting clinical trait to module eigengene. Currently I'm # not adding a color bar, which we once used for manually-set kmeans number of clusters ################################################################################################### library(Biobase) heatmap.module <- function( moduleNames, moduleCor, geneInfo, eset, saveFile, deepSplit, cexRow, cexCol, margins, pngWidth, pngHeight) { for ( i in 1:length(moduleNames) ) { geneInfo.subset <- geneInfo[ geneInfo$moduleColor==moduleNames[i], ] index <- which(rownames(eset)%in%geneInfo.subset$probe) m.use <- exprs(eset)[index,] distance <- dist(m.use,method="euclidean") hc <- hclust(distance) hc.cut = cutree(hc,k=6) # k = number of clusters. #table(hc.cut) # show cluster cut results hc.cut.levels = levels(as.factor(hc.cut)) # factor cluster level numbers for coloring #hc.cut.levels # review cluster level factors colors = rainbow(length(hc.cut.levels)); cluster.patient.colors = c() #for (i in 1:length(hc.cut.levels)) { #cluster.patient.colors[names(which(hc.cut==hc.cut.levels[i]))] = colors[i] #} #cluster.patient.colors dist.euclidean = dist.euclidean <- function(x, ...) dist(x,method="euclidean") label.size = 1 samples = colnames(m.use) hmcol <- colorRampPalette(c("blue","white","red"))(256) #FOR THE COLORBLIND AMONG US ## MAKE SURE TO MAKE THE R STUDIO PLOTS FRAME AS LARGE AS POSSIBLE BEFORE RUNNING (SHRINK THE OTHER FRAMES) # Plot the result if (!.Device=="null device") { dev.off() } # Catches the "cannot shut down device 1 (the null device)" error # Grab the module's correlation significance to sigTrait module.sig <- round(moduleCor[moduleNames[i],paste(sigTrait,".p",sep="")],3) if (saveFile==TRUE) { # Generate a PNG of the figure png( paste(deepSplit,".",moduleNames[i],".",sigTrait,".p",module.sig,".png",sep=""), pngWidth, pngHeight, pointsize=20) } # genes.heat = heatmap.2(data.matrix(m.use), col=hmcol, scale="row", distfun = dist2, key=TRUE, # symkey=TRUE, density.info="none", trace="none", # ColSideColors=cluster.patient.colors[samples],cexCol=1.2, # cexRow = 0.1,margins=c(12,9), # main=paste(moduleNames[i]," module: ",projectName,".", deepSplit, " gene expression", sep="")) # Plot heatmap without side color bar genes.heat = heatmap.2(data.matrix(m.use), col=hmcol, scale="row", distfun = dist.euclidean, key=TRUE, symkey=TRUE, density.info="none", trace="none", cexRow = cexRow, cexCol=cexCol, margins=margins, main=paste(moduleNames[i]," ", sigTrait," p",module.sig, " module ",deepSplit,".",projectName, sep="")) #return(m.use) if (saveFile==TRUE) { # Complete PNG save dev.off() print(paste(deepSplit,moduleNames[i], "heatmap generated",sep=" ")) } } #print(paste(deepSplit," heatmaps of ",length(moduleNames)," modules generated.",sep="")) } ####################################################################################################### WGCNA CORRELATE MODULES TO TRAITS traitCorr <- function( MEs, datTraits, moduleColors, sigTrait, p_val, title.heatmap, plotMar, cex.lab.x, cex.lab.y, text.size, xLabels, xColorOffset, xLabelsAngle, maxRows, chart.filename, pngWidth, pngHeight, saveFile, deepSplit.cor) { # Define numbers of genes and samples nGenes = ncol(datExpr); nSamples = nrow(datExpr); datTraits <- data.matrix(datTraits) # Use the R cor() function to correlate the module eigengenes to clinical traits along with an associated p-value moduleTraitCor = cor(MEs, datTraits, use = "p"); moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples); moduleTraitCor <- data.frame(moduleTraitCor) moduleTraitPvalue <- data.frame(moduleTraitPvalue) corNames <- vector('character') moduleCor <- vector('numeric') for (i in 1:ncol(moduleTraitCor)) { moduleCor <- cbind( moduleCor, moduleTraitCor[,i],moduleTraitPvalue[,i] ) corNames <- c( corNames, colnames(moduleTraitCor)[i], paste(colnames(moduleTraitPvalue)[i],".p",sep="") ) } colnames(moduleCor) <- corNames rownames(moduleCor) <- rownames(moduleTraitCor) # Order the correlations frames by decreasing significance of correlation to the primary clinical variable of choice, # usually SLEDAI or cohort moduleCor <- moduleCor[ order(moduleCor[, paste(sigTrait,".p",sep="")] ), ] index <- match( rownames(moduleCor),rownames(moduleTraitCor) ) moduleTraitCor <- moduleTraitCor[index,,drop=F] index <- match( rownames(moduleCor),rownames(moduleTraitPvalue) ) moduleTraitPvalue <- moduleTraitPvalue[ index,,drop=F] moduleTraitCor <- data.matrix(moduleTraitCor) moduleTraitPvalue <- data.matrix(moduleTraitPvalue) # Sum the number of probes in each module and bind this to our correlation matrix #table(moduleColors) colorFreq <- data.frame(table(factor(moduleColors))) rownames(colorFreq) <- colorFreq[,1] index <- match(rownames(moduleCor),rownames(colorFreq) ) colorFreq <- colorFreq[index,] moduleCor <- cbind( moduleCor, "moduleCount"=colorFreq$Freq ) #colnames(moduleCor) moduleCor <- moduleCor[ ,c(ncol(moduleCor),1:ncol(moduleCor)-1) ] index <- which(colnames(moduleCor)==paste(sigTrait,".p",sep="")) sig.count <- length(which(moduleCor[,index] < p_val)) # Prepare plot window sizeGrWindow(10,6) # Color code each association by the correlation value: # Display the correlations and their p-values textMatrix = paste(signif(moduleTraitCor, 2), " (", signif(moduleTraitPvalue, 2), ")", sep = ""); if (saveFile==TRUE) { # Generate a PNG of the figure png(chart.filename,pngWidth, pngHeight, pointsize=20) } # dev.off() dim(textMatrix) = dim(moduleTraitCor) par(mar = plotMar); # bottom, left, top, right margins # Display the correlation values using a heatmap plot yLabels <- paste(rownames(moduleTraitCor)," (",moduleCor[,1], ")", sep="") ySymbols <- rownames(moduleTraitCor) labeledHeatmap(Matrix = moduleTraitCor, xLabels = xLabels, xLabelsAngle = xLabelsAngle, xColorOffset = xColorOffset, yLabels = yLabels, ySymbols = ySymbols, colorLabels = FALSE, colors = blueWhiteRed(50), textMatrix = textMatrix, setStdMargins = FALSE, cex.text = text.size, zlim = c(-1,1), cex.lab.x = cex.lab.x, cex.lab.y = cex.lab.y, main = paste(title.heatmap, ", ", sig.count, " module(s) sig by ",sigTrait," p_val < ",p_val, " (", round(sig.count/nrow(moduleCor)*100,0),"%)", sep="")) if (saveFile==TRUE) { # Complete PNG save dev.off() print(paste( deepSplit.cor,"dendrogram with trait bars saved.")) } moduleTraitCor <- data.matrix(moduleTraitCor) moduleTraitPvalue <- data.matrix(moduleTraitPvalue) moduleCor.list <- list(moduleTraitCor,moduleTraitPvalue, moduleCor ) return(moduleCor.list) plotMar <- c(8, 6, 4, 2) # bottom, left, top, and right figure margins #rm(plotMar, text.size, p_val, x.size, y.size, maxRows, title.heatmap, chart.filename) } # Correlate modules created during TOM generation moduleCor <- traitCorr( MEs=MEs, datTraits=datTraits, moduleColors=moduleColors, sigTrait=sigTrait, p_val=p_val, title.heatmap=paste("DS Original",experimentName.recut,sep=" "), plotMar, cex.lab.x, cex.lab.y, text.size, xLabels, xColorOffset, xLabelsAngle, maxRows=maxRows,chart.filename=paste("DS-orig.corrMap.",experimentName.recut,".png",sep=""), pngWidth, pngHeight,saveFile, deepSplit.cor="DS-orig" ) moduleCor.val <- moduleCor[[1]] moduleCor.pval <- moduleCor[[2]] moduleCor <- moduleCor[[3]] # Correlate deep split 1 modules moduleCor.DS1 <- traitCorr( MEs=MEs.DS1, datTraits=datTraits, moduleColors=moduleColors.DS1, sigTrait=sigTrait, p_val=p_val, title.heatmap=paste("DS1",experimentName.recut,sep=" "), plotMar, cex.lab.x, cex.lab.y, text.size, xLabels, xColorOffset, xLabelsAngle, maxRows=maxRows,chart.filename=paste("DS1.corrMap.",experimentName.recut,".png",sep=""), pngWidth, pngHeight,saveFile, deepSplit.cor="DS1" ) moduleCor.DS1.val <- moduleCor.DS1[[1]] moduleCor.DS1.pval <- moduleCor.DS1[[2]] moduleCor.DS1 <- moduleCor.DS1[[3]] # Correlate deep split 2 modules moduleCor.DS2 <- traitCorr( MEs=MEs.DS2, datTraits=datTraits, moduleColors=moduleColors.DS2, sigTrait=sigTrait, p_val=p_val, title.heatmap=paste("DS2",experimentName.recut,sep=" "), plotMar=plotMar, cex.lab.x=cex.lab.x, cex.lab.y = cex.lab.y, text.size=text.size, xLabels=xLabels, xColorOffset=xColorOffset, xLabelsAngle=xLabelsAngle, maxRows=maxRows, chart.filename=paste("DS2.corrMap.",experimentName.recut,".png",sep=""), pngWidth=pngWidth, pngHeight=pngHeight, saveFile=saveFile, deepSplit.cor="DS2" ) moduleCor.DS2.val <- moduleCor.DS2[[1]] moduleCor.DS2.pval <- moduleCor.DS2[[2]] moduleCor.DS2 <- moduleCor.DS2[[3]] # Correlate deep split 3 modules moduleCor.DS3 <- traitCorr( MEs=MEs.DS3, datTraits=datTraits, moduleColors=moduleColors.DS3, sigTrait=sigTrait, p_val=p_val, title.heatmap=paste("DS3",experimentName.recut,sep=" "), plotMar=plotMar, cex.lab.x=cex.lab.x, cex.lab.y = cex.lab.y, text.size=text.size, xLabels=xLabels, xColorOffset=xColorOffset, xLabelsAngle=xLabelsAngle, maxRows=maxRows, chart.filename=paste("DS3.corrMap.",experimentName.recut,".png",sep=""), pngWidth=pngWidth, pngHeight=pngHeight, saveFile=saveFile, deepSplit.cor="DS3" ) moduleCor.DS3.val <- moduleCor.DS3[[1]] moduleCor.DS3.pval <- moduleCor.DS3[[2]] moduleCor.DS3 <- moduleCor.DS3[[3]] # Correlate deep split 4 modules moduleCor.DS4 <- traitCorr( MEs=MEs.DS4, datTraits=datTraits, moduleColors=moduleColors.DS4, sigTrait=sigTrait, p_val=p_val, title.heatmap=paste("DS4",experimentName.recut,sep=" "), plotMar=plotMar, cex.lab.x=cex.lab.x, cex.lab.y = cex.lab.y, text.size=text.size, xLabels=xLabels, xColorOffset=xColorOffset, xLabelsAngle=xLabelsAngle, maxRows=maxRows, chart.filename=paste("DS4.corrMap.",experimentName.recut,".png",sep=""), pngWidth=pngWidth, pngHeight=pngHeight, saveFile=saveFile, deepSplit.cor="DS4" ) moduleCor.DS4.val <- moduleCor.DS4[[1]] moduleCor.DS4.pval <- moduleCor.DS4[[2]] moduleCor.DS4 <- moduleCor.DS4[[3]] # Set the proper save directory first. script needs fixing. save(moduleCor.DS1, moduleCor.DS1.pval, moduleCor.DS1.val, moduleCor.DS2, moduleCor.DS2.pval, moduleCor.DS2.val, moduleCor.DS3, moduleCor.DS3.pval, moduleCor.DS3.val, moduleCor.DS4, moduleCor.DS4.pval, moduleCor.DS4.val, file="recut.DS1-DS4.module.correlations.RData") ############################################################################################ WGCNA kIMs per block ######################################################################################## ## CALCULATE KIMs FOR ALL DEEP SPLITS # Calculate kIM, or intramodular connectivity, i.e. connectivity of nodes to other nodes # within the same module. This requires the adjaceny matrix be recreated, so make sure # to use the same soft power and unsigned or signed designation originally used to create the network setwd(dir) kIM = matrix(nrow=ncol(datExpr),ncol=4) for (i in 1:length(net$TOMFiles)) { datExpr.block <- datExpr[ , net$blockGenes[[i]] ] adjacency.block = adjacency(datExpr.block, power= as.integer(TOM.settings["power"]), type="signed") print( paste("Calculating adjacencies for genes within block ",i," of ",length(net$TOMFiles),sep="")) moduleColors.block <- moduleColors[ net$blockGenes[[i]] ] print( paste("Calculating scaled kIM intramodular connectivity for adjacencies within block ",i," of ",length(net$TOMFiles),sep="")) kIM.block = intramodularConnectivity(adjacency.block, moduleColors.block, scaleByMax = TRUE) # Scale to 1 kIM[1:nrow(kIM.block),1:4] <- kIM.block # FIX } ############################################################################################ WGCNA KME PRIMARY ############################################################################################### # DS1 KMEs KME.corr.DS1 <- KME.MEs.DS1[,seq(1, ncol(KME.MEs.DS1), 2)] colnames(KME.corr.DS1) <- substr( colnames(KME.corr.DS1), 5, nchar(colnames(KME.corr.DS1))) KME.corr.p.DS1 <- KME.MEs.DS1[,seq(2, ncol(KME.MEs.DS1), 2)] colnames(KME.corr.p.DS1) <- colnames(KME.corr.DS1) KME.primary.DS1 <- matrix(nrow=nrow(KME.MEs.DS1),ncol=3) print(paste("Retrieving primary KME values for DS1...")) for (i in 1:nrow(geneList.DS1)) { for (j in 1:ncol(KME.corr.DS1)) { if (geneList.DS1[i,"moduleColors.DS1"]==colnames(KME.corr.DS1)[j] ) { KME.primary.DS1[i,1] <- geneList.DS1[i,"moduleColors.DS1"] KME.primary.DS1[i,2] <- KME.corr.DS1[i,j] KME.primary.DS1[i,3] <- KME.corr.p.DS1[i,j] } } } colnames(KME.primary.DS1) <- c("KME.primary","KME.color","KME.p.color") rm(KME.corr.DS1,KME.corr.p.DS1) ############################################################################################### # DS2 KMEs KME.corr.DS2 <- KME.MEs.DS2[,seq(1, ncol(KME.MEs.DS2), 2)] colnames(KME.corr.DS2) <- substr( colnames(KME.corr.DS2), 5, nchar(colnames(KME.corr.DS2))) KME.corr.p.DS2 <- KME.MEs.DS2[,seq(2, ncol(KME.MEs.DS2), 2)] colnames(KME.corr.p.DS2) <- colnames(KME.corr.DS2) KME.primary.DS2 <- matrix(nrow=nrow(KME.MEs.DS2),ncol=3) print(paste("Retrieving primary KME values for DS2...")) for (i in 1:nrow(geneList.DS2)) { for (j in 1:ncol(KME.corr.DS2)) { if (geneList.DS2[i,"moduleColors.DS2"]==colnames(KME.corr.DS2)[j] ) { KME.primary.DS2[i,1] <- geneList.DS2[i,"moduleColors.DS2"] KME.primary.DS2[i,2] <- KME.corr.DS2[i,j] KME.primary.DS2[i,3] <- KME.corr.p.DS2[i,j] } } } colnames(KME.primary.DS2) <- c("KME.primary","KME.color","KME.p.color") rm(KME.corr.DS2,KME.corr.p.DS2) ############################################################################################### # DS3 KMEs KME.corr.DS3 <- KME.MEs.DS3[,seq(1, ncol(KME.MEs.DS3), 2)] colnames(KME.corr.DS3) <- substr( colnames(KME.corr.DS3), 5, nchar(colnames(KME.corr.DS3))) KME.corr.p.DS3 <- KME.MEs.DS3[,seq(2, ncol(KME.MEs.DS3), 2)] colnames(KME.corr.p.DS3) <- colnames(KME.corr.DS3) KME.primary.DS3 <- matrix(nrow=nrow(KME.MEs.DS3),ncol=3) # print(paste("Retrieving primary KME values for DS3...")) for (i in 1:nrow(geneList.DS3)) { for (j in 1:ncol(KME.corr.DS3)) { if (geneList.DS3[i,"moduleColors.DS3"]==colnames(KME.corr.DS3)[j] ) { KME.primary.DS3[i,1] <- geneList.DS3[i,"moduleColors.DS3"] KME.primary.DS3[i,2] <- KME.corr.DS3[i,j] KME.primary.DS3[i,3] <- KME.corr.p.DS3[i,j] } } } colnames(KME.primary.DS3) <- c("KME.primary","KME.color","KME.p.color") rm(KME.corr.DS3,KME.corr.p.DS3) ############################################################################################### # DS4 KMEs KME.corr.DS4 <- KME.MEs.DS4[,seq(1, ncol(KME.MEs.DS4), 2)] colnames(KME.corr.DS4) <- substr( colnames(KME.corr.DS4), 5, nchar(colnames(KME.corr.DS4))) KME.corr.p.DS4 <- KME.MEs.DS4[,seq(2, ncol(KME.MEs.DS4), 2)] colnames(KME.corr.p.DS4) <- colnames(KME.corr.DS4) KME.primary.DS4 <- matrix(nrow=nrow(KME.MEs.DS4),ncol=3) # print(paste("Retrieving primary KME values for DS4...")) for (i in 1:nrow(geneList.DS4)) { for (j in 1:ncol(KME.corr.DS4)) { if (geneList.DS4[i,"moduleColors.DS4"]==colnames(KME.corr.DS4)[j] ) { KME.primary.DS4[i,1] <- geneList.DS4[i,"moduleColors.DS4"] KME.primary.DS4[i,2] <- KME.corr.DS4[i,j] KME.primary.DS4[i,3] <- KME.corr.p.DS4[i,j] } } } colnames(KME.primary.DS4) <- c("KME.primary","KME.color","KME.p.color") rm(KME.corr.DS4,KME.corr.p.DS4) ############################################################################################### save(KME.primary.DS1,KME.primary.DS2,KME.primary.DS3,KME.primary.DS4,file="gene_KMEs.RData") ############################################################################################## WGCNA DEEPSPLITS DENDROGRAMS plotDendro <- function( datExpr, moduleColors, moduleCor, sigTrait, parMar, p_val, cex, deepSplit, chart.filename, pngWidth, pngHeight, saveFile ) { ###################################################### # Prepare p-values of correlation between modules eigengenes and clinical traits sigTrait.p <-paste(sigTrait,".p",sep="") # Prepare module eigengene names MEList = moduleEigengenes(datExpr, colors = moduleColors) MEs = MEList$eigengenes ###################################################### # Calculate dissimilarity of module eigengenes MEDiss = 1-cor(MEs); # Cluster module eigengene dissimilarity METree = hclust(as.dist(MEDiss), method = "average"); ###################################################### # Format module names and include members and significance METree$labels <- substr( METree$labels, 3, nchar(METree$labels) ) index <- match(METree$labels, rownames(moduleCor) ) tree.labels <- data.frame(moduleCor[index,c("moduleCount",sigTrait.p,sigTrait)]) tree.labels$color <- rownames(moduleCor) tree.labels[sigTrait.p] <- round(tree.labels[sigTrait.p],5) tree.labels$sign <- "[+]" index <- which(tree.labels[sigTrait]<=0) tree.labels[index,"sign"] <- "[-]" tree.labels$label <- paste("* ",tree.labels$sign," ", rownames(tree.labels)," (",tree.labels$moduleCount," p",tree.labels[,2],")",sep="") index <- which(tree.labels[sigTrait.p] > p_val) #tree.labels[index,"label"] <- paste(rownames(tree.labels)[index],"(",tree.labels[index,"moduleCount"],")",sep="") tree.labels[index,"label"] <- paste( tree.labels[index,"sign"]," ", rownames(tree.labels)[index]," (",tree.labels[index,"moduleCount"],")", sep="" ) METree$labels <- tree.labels$label sigNum <- length(which(tree.labels[,2]0) { if (nExtraPlots==5) { par(mfrow=c(2,3)) } else { par(mfrow=c(2,2)) } startIndex<-nplots*6+1 endIndex<-nmods for (k in startIndex:endIndex) { myModName<-colors.mods[k] plotModulesME(moduleName=myModName,moduleColors=moduleColors,KME.primary=KME.primary) } } dev.off() pdf(paste0("kMEgrey",DS.tag,".pdf"),width=16,height=9) par(mfrow=c(1,1)) myModName <- "grey" plotModulesME(moduleName=myModName,moduleColors=moduleColors,KME.primary=KME.primary) dev.off() graphics.off() } ################################################################################################### WGCNA ANNOTATE ENSEMBL GENES ################################################################################################### # ANNOTATE ENSEMBL GENES ################################################################################################### biomart.annotate <- function (IDs, filter, attributes) { # Query biomaRt for annotation information keyed to IDs (usually Ensembl) library('biomaRt') mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl")) # View available marts, inputs and outputs #datasets <- listDatasets(mart) #filters <- listFilters(mart) #attribs <- listAttributes(mart) #ensemblID <- colnames(datExpr) # Give this a minute or so. Also, remember not all Ensembl areas are actually transcribed and thus assigned an Entrez ID print(paste("Querying biomaRt...",sep="")) biomart.output <- getBM(filters=filter, attributes=attributes, values=IDs, mart=mart) print(paste("Query complete.",sep="")) return(biomart.output) } #################################################################################################### WGCNA BLOCKS RECUT VERSION 1 ################################################################################################### # RECUT WGCNA TOM BLOCKS INTO NEW MODULES USING FOUR LEVELS OF DEEP SPLIT ################################################################################################### # Load and execute the function to set the storage directory for the recut blocks # The function will leave you in the endpoint directory, so return to the main project directory as needed setStorageDir <- dget("/mnt/disks/rserver/projects/shared_scripts/wgcna/set.storage.directory.R") outputStorage <- setStorageDir( paste("recut",experimentName.recut,sep=".") ) # Each element of this vector is a subsequent subdirectory # setwd(projectDir) # Don't call until end of job!! #################################################################################################################### ###RECUT THE NETWORK BLOCKS WITH A DETECTION CUT HEIGHT OF 1, MERGE HEIGHT OF 0.2, DEEP SPLIT OF 1 TOMFiles <- paste(projectDir,"/TOM/",net$TOMFiles,sep="") print(paste("Recutting blocks using deepsplit ","1",sep="")) recutBlocks = recutBlockwiseTrees(datExpr, net$goodSamples, net$goodGenes, net$blocks, TOMFiles, net$dendrograms, networkType="signed", corType = "pearson", pamStage=TRUE, pamRespectsDendro = TRUE, detectCutHeight = 1, deepSplit = 1, minModuleSize=100, mergeCutHeight=0.2, numericLabels = TRUE) print(paste("Deepsplit ",i," cuts complete.",sep="")) MEs.DS1 <- recutBlocks$MEs rownames(MEs.DS1) <- rownames(datExpr) MEs.DS1.labels <- as.numeric(substr( colnames(MEs.DS1), 3, nchar(colnames(MEs.DS1)) )) colnames(MEs.DS1) <- labels2colors(MEs.DS1.labels) moduleLabels.DS1 <- recutBlocks$colors moduleColors.DS1 <- labels2colors(moduleLabels.DS1) colorLevels.DS1 <- data.frame(table(moduleColors.DS1)) colorLevels.DS1 <- colorLevels.DS1[order(colorLevels.DS1[,2], decreasing=TRUE), ] #################################################################################################################### ## RECUT THE NETWORK BLOCKS WITH A DETECTION CUT HEIGHT OF 1, MERGE HEIGHT OF 0.2, DEEP SPLIT OF 2 print(paste("Recutting blocks using deepsplit ","2",sep="")) recutBlocks = recutBlockwiseTrees(datExpr, net$goodSamples, net$goodGenes, net$blocks, TOMFiles, net$dendrograms, networkType="signed", corType="pearson", pamStage=TRUE, pamRespectsDendro = TRUE, detectCutHeight = 1, deepSplit=2, minModuleSize=100, mergeCutHeight=0.2, numericLabels = TRUE) print(paste("Deepsplit ",i," cuts complete.",sep="")) MEs.DS2 <- recutBlocks$MEs rownames(MEs.DS2) <- rownames(datExpr) MEs.DS2.labels <- as.numeric(substr( colnames(MEs.DS2), 3, nchar(colnames(MEs.DS2)) )) colnames(MEs.DS2) <- labels2colors(MEs.DS2.labels) moduleLabels.DS2 <- recutBlocks$colors moduleColors.DS2 = labels2colors(moduleLabels.DS2) moduleColors.DS2 <- matchLabels(moduleColors.DS2, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE, ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey", extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() ) # Regenerate the MEs now that we have new color names MEs.DS2 <- moduleEigengenes( datExpr, moduleColors.DS2, verbose=2) MEs.DS2 <- MEs.DS2$eigengenes rownames(MEs.DS2) <- rownames(datExpr) colnames(MEs.DS2) <- substr( colnames(MEs.DS2), 3, nchar(colnames(MEs.DS2)) ) colorLevels.DS2 <- data.frame(table(moduleColors.DS2)) colorLevels.DS2 <- colorLevels.DS2[order(colorLevels.DS2[,2], decreasing=TRUE), ] #################################################################################################################### ## RECUT THE NETWORK BLOCKS WITH A DETECTION CUT HEIGHT OF 1, MERGE HEIGHT OF 0.2, DEEP SPLIT OF 3 print(paste("Recutting blocks using deepsplit ","3",sep="")) recutBlocks = recutBlockwiseTrees(datExpr, net$goodSamples, net$goodGenes, net$blocks, TOMFiles, net$dendrograms, networkType="signed", corType="pearson", pamStage=TRUE, pamRespectsDendro = TRUE, detectCutHeight = 1, deepSplit=3, minModuleSize=100, mergeCutHeight=0.2, numericLabels = TRUE) print(paste("Deepsplit ",i," cuts complete.",sep="")) MEs.DS3 <- recutBlocks$MEs rownames(MEs.DS3) <- rownames(datExpr) MEs.DS3.labels <- as.numeric(substr( colnames(MEs.DS3), 3, nchar(colnames(MEs.DS3)) )) colnames(MEs.DS3) <- labels2colors(MEs.DS3.labels) moduleLabels.DS3 <- recutBlocks$colors moduleColors.DS3 = labels2colors(moduleLabels.DS3) moduleColors.DS3 <- matchLabels(moduleColors.DS3, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE, ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey", extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() ) # Regenerate the MEs now that we have new color names MEs.DS3 <- moduleEigengenes( datExpr, moduleColors.DS3, verbose=2) MEs.DS3 <- MEs.DS3$eigengenes rownames(MEs.DS3) <- rownames(datExpr) colnames(MEs.DS3) <- substr( colnames(MEs.DS3), 3, nchar(colnames(MEs.DS3)) ) colorLevels.DS3 <- data.frame(table(moduleColors.DS3)) colorLevels.DS3 <- colorLevels.DS3[order(colorLevels.DS3[,2], decreasing=TRUE), ] #################################################################################################################### ## RECUT THE NETWORK BLOCKS WITH A DETECTION CUT HEIGHT OF 1, MERGE HEIGHT OF 0.2, DEEP SPLIT OF 4 print(paste("Recutting blocks using deepsplit ","4",sep="")) recutBlocks = recutBlockwiseTrees(datExpr, net$goodSamples, net$goodGenes, net$blocks, TOMFiles, net$dendrograms, networkType="signed", corType="pearson", pamStage=TRUE, pamRespectsDendro = TRUE, detectCutHeight = 1, deepSplit=4, minModuleSize=100, mergeCutHeight=0.2, numericLabels = TRUE) print(paste("Deepsplit ",i," cuts complete.",sep="")) MEs.DS4 <- recutBlocks$MEs rownames(MEs.DS4) <- rownames(datExpr) MEs.DS4.labels <- as.numeric(substr( colnames(MEs.DS4), 3, nchar(colnames(MEs.DS4)) )) colnames(MEs.DS4) <- labels2colors(MEs.DS4.labels) moduleLabels.DS4 <- recutBlocks$colors moduleColors.DS4 = labels2colors(moduleLabels.DS4) moduleColors.DS4 <- matchLabels(moduleColors.DS4, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE, ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey", extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() ) # Regenerate the MEs now that we have new color names MEs.DS4 <- moduleEigengenes( datExpr, moduleColors.DS4, verbose=2) MEs.DS4 <- MEs.DS4$eigengenes rownames(MEs.DS4) <- rownames(datExpr) colnames(MEs.DS4) <- substr( colnames(MEs.DS4), 3, nchar(colnames(MEs.DS4)) ) colorLevels.DS4 <- data.frame(table(moduleColors.DS4)) colorLevels.DS4 <- colorLevels.DS4[order(colorLevels.DS4[,2], decreasing=TRUE), ] ####### # Plot a cut as needed diagnostically # plotDendroAndColors(net$dendrograms[[1]], moduleColors.DS1[net$blockGenes[[1]]], # c("DS1"), main = "TSOKOS CD4 Active SLE Females DS3 PAM SP10 Cut Height 0.995 Block 1/4", # dendroLabels = FALSE, hang = 0.03, # addGuide = TRUE, guideHang = 0.05) ####### ###################################################################################################################### WGCNA BLOCKS RECUT V2 ################################################################################################### # RECUT WGCNA TOM BLOCKS INTO NEW MODULES USING FOUR LEVELS OF DEEP SPLIT ################################################################################################### # Load and execute the function to set the storage directory for the recut blocks # The function will leave you in the endpoint directory, so return to the main project directory as needed # setStorageDir <- dget("/mnt/disks/rserver/projects/shared_scripts/wgcna/set.storage.directory.R") # outputStorage <- setStorageDir( experimentName.recut ) # Each element of this vector is a subsequent subdirectory # setwd(projectDir) # Don't call until end of job!! for (i in 1:4) { ### CUT WITH DEEPSPLIT 1 THROUGH 4 recutBlocks = recutBlockwiseTrees(datExpr, net$goodSamples, net$goodGenes, net$blocks, TOMFiles=paste(projectDir,"/",net$TOMFiles,sep=""), net$dendrograms, networkType="signed",TOMType = "signed", corType = recut.corType, pamStage=recut.pamStage, pamRespectsDendro=recut.pamRespectsDendro, detectCutHeight=recut.detectCutHeight, deepSplit=i, minModuleSize=recut.minModuleSize, mergeCutHeight=recut.mergeCutHeight, numericLabels = TRUE) print(paste("Deepsplit ",i," cuts complete.",sep="")) # Use these in a potentially simplified for-loop instead of the four if(i==..) blocks. # Note usage of assign() and as.name() functions # assign( paste("MEs.DS",i,sep=""),recutBlocks$MEs ) # rownames(as.name( paste("MEs.DS",i,sep=""))) <- rownames(datExpr) if (i==1) { print(paste("Recutting blocks using deepsplit ",i,sep="")) MEs.DS1 <- recutBlocks$MEs rownames(MEs.DS1) <- rownames(datExpr) MEs.DS1.labels <- as.numeric(substr( colnames(MEs.DS1), 3, nchar(colnames(MEs.DS1)) )) colnames(MEs.DS1) <- labels2colors(MEs.DS1.labels) moduleLabels.DS1 <- recutBlocks$colors moduleColors.DS1 <- labels2colors(moduleLabels.DS1) colorLevels.DS1 <- data.frame(table(moduleColors.DS1)) colorLevels.DS1 <- colorLevels.DS1[order(colorLevels.DS1[,2], decreasing=TRUE), ] print(paste("Block ",i," recut modules regenerated.",sep="")) } if (i==2) { print(paste("Recutting blocks using deepsplit ",i,sep="")) MEs.DS2 <- recutBlocks$MEs rownames(MEs.DS2) <- rownames(datExpr) MEs.DS2.labels <- as.numeric(substr( colnames(MEs.DS2), 3, nchar(colnames(MEs.DS2)) )) colnames(MEs.DS2) <- labels2colors(MEs.DS2.labels) moduleLabels.DS2 <- recutBlocks$colors moduleColors.DS2 <- labels2colors(moduleLabels.DS2) moduleColors.DS2 <- matchLabels(moduleColors.DS2, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE, ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey", extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() ) # Regenerate the MEs now that we have new color names MEs.DS2 <- moduleEigengenes( datExpr, moduleColors.DS2, verbose=2) MEs.DS2 <- MEs.DS2$eigengenes rownames(MEs.DS2) <- rownames(datExpr) colnames(MEs.DS2) <- substr( colnames(MEs.DS2), 3, nchar(colnames(MEs.DS2)) ) colorLevels.DS2 <- data.frame(table(moduleColors.DS2)) colorLevels.DS2 <- colorLevels.DS2[order(colorLevels.DS2[,2], decreasing=TRUE), ] print(paste("Block ",i," recut modules regenerated.",sep="")) } if (i==3) { print(paste("Recutting blocks using deepsplit ",i,sep="")) MEs.DS3 <- recutBlocks$MEs rownames(MEs.DS3) <- rownames(datExpr) MEs.DS3.labels <- as.numeric(substr( colnames(MEs.DS3), 3, nchar(colnames(MEs.DS3)) )) colnames(MEs.DS3) <- labels2colors(MEs.DS3.labels) moduleLabels.DS3 <- recutBlocks$colors moduleColors.DS3 = labels2colors(moduleLabels.DS3) moduleColors.DS3 <- matchLabels(moduleColors.DS3, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE, ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey", extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() ) # Regenerate the MEs now that we have new color names MEs.DS3 <- moduleEigengenes( datExpr, moduleColors.DS3, verbose=2) MEs.DS3 <- MEs.DS3$eigengenes rownames(MEs.DS3) <- rownames(datExpr) colnames(MEs.DS3) <- substr( colnames(MEs.DS3), 3, nchar(colnames(MEs.DS3)) ) colorLevels.DS3 <- data.frame(table(moduleColors.DS3)) colorLevels.DS3 <- colorLevels.DS3[order(colorLevels.DS3[,2], decreasing=TRUE), ] print(paste("Block ",i," recut modules regenerated.",sep="")) } if (i==4) { print(paste("Recutting blocks using deepsplit ",i,sep="")) MEs.DS4 <- recutBlocks$MEs rownames(MEs.DS4) <- rownames(datExpr) MEs.DS4.labels <- as.numeric(substr( colnames(MEs.DS4), 3, nchar(colnames(MEs.DS4)) )) colnames(MEs.DS4) <- labels2colors(MEs.DS4.labels) moduleLabels.DS4 <- recutBlocks$colors moduleColors.DS4 = labels2colors(moduleLabels.DS4) moduleColors.DS4 <- matchLabels(moduleColors.DS4, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE, ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey", extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() ) # Regenerate the MEs now that we have new color names MEs.DS4 <- moduleEigengenes( datExpr, moduleColors.DS4, verbose=2) MEs.DS4 <- MEs.DS4$eigengenes rownames(MEs.DS4) <- rownames(datExpr) colnames(MEs.DS4) <- substr( colnames(MEs.DS4), 3, nchar(colnames(MEs.DS4)) ) colorLevels.DS4 <- data.frame(table(moduleColors.DS4)) colorLevels.DS4 <- colorLevels.DS4[order(colorLevels.DS4[,2], decreasing=TRUE), ] print(paste("Block ",i," recut modules regenerated.",sep="")) } } setwd(projectDir) ####### # Plot a cut as needed diagnostically # plotDendroAndColors(net$dendrograms[[1]], moduleColors.DS1[net$blockGenes[[1]]], # c("DS1"), main = "TSOKOS CD4 Active SLE Females DS3 PAM SP10 Cut Height 0.995 Block 1/4", # dendroLabels = FALSE, hang = 0.03, # addGuide = TRUE, guideHang = 0.05) ####### ############################################################################################################ ################################################################################################### # ANNOTATE WGCNA MODULES WITH TOP GO PATHWAYS ################################################################################################### # Join some fData to genes with their module color names. Note the gene order in the moduleColors vector is # still in the same order as rownames of the original datExpr from whence adjacency and subsequent # TOM encoding was calculated. geneNames = colnames(datExpr) geneInfo = data.frame(moduleColor=moduleColors.DS2) # Annotate the modules with GO functional enrichment entrezIDs = as.character(geneInfo$geneEntrezID) GOenr = GOenrichmentAnalysis(geneInfo$moduleColor, allLLIDs, organism = "human", nBestP = 10); GOenr.terms.all = GOenr$bestPTerms[[4]]$enrichment ################################################################################################### # ADD COLOR NAMES TO WGCNA MODULES ################################################################################################### # Load probe/gene numeric module assignments moduleLabels <- net$colors # Convert numeric module numbers to color names moduleColors <- labels2colors(net$colors) # Extract module eigengenes per sample MEs <- net$MEs rownames(MEs) <- rownames(datExpr) # Rename module numbers to colors # CRITICAL!!! You MUST pass a numeric vector to labels2colors MEs.labels <- as.numeric(substr( colnames(MEs), 3, nchar(colnames(MEs)) )) colnames(MEs) <- labels2colors(MEs.labels) # Create frequency table of probes in modules colorLevels <- data.frame(table(moduleColors)) colorLevels <- colorLevels[order(colorLevels[,2], decreasing=TRUE), ] #################################################################################################### WGCNA PLOT Module Eigenegenes and Boxplots ME.boxplot<-function(MEs,index) { quantile.ME<-quantile(MEs[,index]) median.ME<-round(quantile.ME[3],2) max.ME<-round(quantile.ME[5],2) diff.ME<-max.ME-median.ME par(mar=c(8,3,4,3)) boxplot(MEs[,index],main=paste0(colnames(MEs)[index], "\n","Median=", median.ME,". Max=", max.ME, ". Diff=", diff.ME)) } ME.barplot<-function(MEs,index) { barplot(MEs[,index],col=colnames(MEs)[index],names.arg=rownames(MEs),las=2,ylim=c(-1,1),cex.names=0.6) } ME_plots <- function(MEs,DS.tag) { nmods<-ncol(MEs) nplots<-floor(nmods/6) pdf(paste("ME_plots.",DS.tag,".pdf"),width=16,height=9) for (i in 1:nplots) { par(mfrow=c(2,6)) startIndex<-6*i-5 endIndex<-6*i for (j in startIndex:endIndex) { ME.boxplot(MEs=MEs,index=j) } for (j in startIndex:endIndex) { ME.barplot(MEs=MEs,index=j) } } nExtraPlots<-nmods-nplots*6 if (nExtraPlots>0) { par(mfrow=c(2,nExtraPlots)) startIndex<-nplots*6+1 endIndex<-nmods for (k in startIndex:endIndex) { ME.boxplot(MEs=MEs,index=k) } for (k in startIndex:endIndex) { ME.barplot(MEs=MEs,index=k) } } dev.off() graphics.off() } ####################################################################################################### WGCNA PLOT MODULES VS SIGNATURE TRAIT library(Hmisc) plotModulesIM<-function(moduleName,moduleColors,kIM) { module<-moduleName col<-module moduleGenesIndex<-which(moduleColors==module) moduleGenes.kIM<-kIM[moduleGenesIndex,"kWithin"] moduleGenes.sigTrait<-gene.corr.r_p[moduleGenesIndex,sigTrait] if (module=="ivory") {col="gray"} ###If you have very light colors, substitute them for a darker color to be seen on graphs if (module=="floralwhite") {col="gray"} if (module=="lightcyan") {col="cyan"} if (module=="lightcyan1") {col="cyan"} if (module=="honeydew1") {col="gray"} if (module=="white") {col="gray"} if (module=="lightyellow") {col="yellow"} if (module=="aliceblue") {col="blue"} if (module=="antiquewhite1") {col="gray"} if (module=="lavenderblush") {col="gray"} if (module=="lavenderblush1") {col="gray"} if (module=="lavenderblush2") {col="gray"} if (module=="lavenderblush3") {col="gray"} if (module=="mistyrose") {col="gray"} if (module=="moccasin") {col="gray"} if (module=="navajowhite") {col="gray"} if (module=="navajowhite1") {col="gray"} if (module=="navajowhite2") {col="gray"} if (module=="paleturquoise") {col="turquoise"} if (module=="thistle1") {col="gray"} if (module=="thistle2") {col="gray"} if (module=="whitesmoke") {col="gray"} verboseScatterplot(moduleGenes.kIM,moduleGenes.sigTrait, xlab = paste0("Probe intramodular membership in ", module, " module"), ylab = paste0("Probe correlation to ",sigTrait), main = paste(capitalize(module), " kIM vs. gene significance\n", sep=""), cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = col, ylim=c(-1,1), xlim=c(-1,1)) abline(h=0,lty=2,lwd=1,col="red") abline(v=0,lty=2,lwd=1,col="red") } kIMvsCorrPlot<-function(moduleCor,moduleColors,kIM,DS.tag) { colors.mods <- rownames(moduleCor) colors.mods <- colors.mods[colors.mods!="grey"] colors.mods <- sort(colors.mods) nmods<-length(colors.mods) nplots<-floor(nmods/6) pdf(paste0("kIMvsCorrPlots",DS.tag,".pdf"),width=16,height=9) for (i in 1:nplots) { par(mfrow=c(2,3)) startIndex<-6*i-5 endIndex<-6*i for(j in startIndex:endIndex) { myModName<-colors.mods[j] plotModulesIM(moduleName=myModName,moduleColors=moduleColors,kIM=kIM) } } nExtraPlots<-nmods-nplots*6 if (nExtraPlots>0) { if (nExtraPlots==5) { par(mfrow=c(2,3)) } else { par(mfrow=c(2,2)) } startIndex<-nplots*6+1 endIndex<-nmods for (k in startIndex:endIndex) { myModName<-colors.mods[k] plotModulesIM(moduleName=myModName,moduleColors=moduleColors,kIM=kIM) } } dev.off() graphics.off() }