WGCNA is an open source package for R available at https://horvath.genetics.ucla.edu/html/

Citation: Langfelder, P. and Horvath, S. WGCNA: an R package for weighted correlation network analysis. BMC Bioinformatics. 9:559 (2008).

The following is the open source code from the website used by AMPEL BioSolutions. 

Log2 normalized microarray expression values for WB, PBMC, purified T cell, B cell or monocyte datasets were filtered using an IQR to remove saturated probes with low variability between samples and used as inputs to WGCNA (V1.51). 

WGCNA READ ME


This folder contains operational (6 parts) and shared R scripts used for WGCNA analysis by AMPEL BioSolutions. 
Each time you start a new analysis, clone deseq/part0-eset.R, wgcna/active_vst/part1,part2 etc., and the /wgcna/active_vst/bigc... data.  You can change the name of the active_vst folder depending on your analysis

Part 0 -- eset:  This script normalizes raw counts data via several methods and generates corresponding esets.  Upload your raw counts and clinical data file to the deseq folder before you begin

Part 1 -- inputs: This script runs some quality control and looks for outliers

Part 1B -- clinical data: This scripts converts clinical traits to numerical values for downstream correlations

Part 2 -- SFT: This script helps to identify the best soft threshold power for part 3

Part 3 -- TOM: This script generates TOM files (topological overlay matrix) for downstream analysis

Part 4 -- cut trees: This script recuts the network generated in part 3 into 4 deep split options

Part 5 -- correlate: This script correlates module eigengenes with clinical traits and annotates gene info for each deep split

Part 6 -- excel results: This script takes the final deep split (chosen on the basis of agreement with dendrogram, clinical trait correlations, etc.) and generates a results spreadsheet

################################################################################################
### WGCNA PART 1
# This script will create a datExpr data frame based on expression data from an eset.
# For WGCNA the columns of datExpr will be probes and the rows will be arrays. The script
# will also create a datTraits frame where columns are clinical traits and rows are arrays.
# The row names of datExpr must match the row names of datTraits.
# CURRENTLY, as of February 2017, AMPEL is using ONLY GCRMA normalized esets annotated with manufacturer CDFs (i.e., Affy, Brainarry or Illumina, etc.)
################################################################################################

### LOAD AND CONFIGURE WGCNA LIBRARY
library(Biobase)
library(limma)
library(WGCNA)
options(stringsAsFactors = FALSE)
allowWGCNAThreads()

active.base <- "EXAMPLE_WGCNA"
title.base <- "EXAMPLE_WGCNA"
file.base <- "EXAMPLE_WGCNA"
analysisDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/WGCNA_EXAMPLE/"
esetDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/Expression_set_EXAMPLE/"
globalDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/"
projectDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/WGCNA_EXAMPLE/"
scriptDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/WGCNA_EXAMPLE/Shared_R_Scripts/"
setwd(analysisDir)
save(globalDir,projectDir,scriptDir,esetDir,analysisDir,file.base,title.base,active.base,
     file=paste0(file.base,"_analysis_parameters.RData"))

setwd(analysisDir)
load(paste0(esetDir,"GSE88884_Batch-1_RMA-Normalized_ESET.Rdata"))
eset <- eset_illum1_merged_BA
rm(eset_illum1_merged_BA)
pdata <- read.delim(paste0(esetDir,"GSE88884_Batch-1_RMA-Normalized_METADATA.txt"),stringsAsFactors = FALSE,sep = "\t",header = TRUE,row.names = 1)

table(pdata$Batch)
table(pdata$Cohort)
table(pdata$Sex)
table(pdata$SLEDAI)
table(eset$batch) 
table(eset$group) 
table(eset$sex)

sampleNames(eset)
sampleNames(eset) <- substr(sampleNames(eset),1,nchar(sampleNames(eset))-4) #Remove the .CEL from the sample names

# remove <- read.delim(paste0(projectDir,"GPL17586_control_probes.txt"),sep = "\t",header = FALSE)
# index <- which(rownames(eset)%in%remove$V1)
# eset <- eset[-index,]

index <- which(pdata$Batch=="ILLUMINATE-2")
pdata <- pdata[-index,]

pdata[,"Cohort"] <- gsub("Normal","CTL",pdata[,"Cohort"])
index <- which(pdata$Cohort=="CTL")
control <- pdata[index,]

index <- which(control$Batch!="Additional_HCs")
control <- control[index,]

index <- which(pdata$Sex=="Male")
pdata <- pdata[-index,]

index <- which(!is.na(pdata$SLEDAI))
sle <- pdata[index,]

pdata <- rbind(control,sle)

table(pdata$Batch)
table(pdata$Cohort)
table(pdata$Sex)

rownames(pdata)
eset <- eset[,which(sampleNames(eset) %in% rownames(pdata))]
pdata <- pdata[which(rownames(pdata) %in% sampleNames(eset)),]
eset <- eset[,order(sampleNames(eset))]
pdata <- pdata[order(rownames(pdata)),]
identical(rownames(pdata),sampleNames(eset))

rownames(pdata) <- paste(pdata$Cohort,
                         substr(pdata$Subject_ID,6,nchar(pdata$Subject_ID)),
                         pdata$SLEDAI,
                         sep = ".")
sampleNames(eset) <- rownames(pdata)

# outliers <- read.delim(paste0(analysisDir,"ILLUMINATE-1_Outliers.txt"),sep = "\t",header = TRUE)
# index <- which(rownames(pdata)%in%outliers$ILLUMINATE.1)
# pdata <- pdata[-index,]

index <- which(sampleNames(eset)=="CTL.0744.NA"|
                 sampleNames(eset)=="CTL.1561.NA"|
                 sampleNames(eset)=="CTL.0515.NA"|
                 sampleNames(eset)=="CTL.0055.NA"|
                 sampleNames(eset)=="CTL.0236.NA") #Internal controls found to be males
eset <- eset[,-index]

index <- which(sampleNames(eset)=="CTL.1329.NA"|
                 sampleNames(eset)=="CTL.0070.NA") #Additional internal controls to remove
eset <- eset[,-index]

# #Unnecessary for brainarray CDF eset
# remove <- read.delim(paste0(projectDir,"GPL17586_control_probes.txt"),sep = "\t",header = FALSE)
# index <- which(rownames(eset)%in%remove$V1)
# eset <- eset[-index,]

eset <- eset[,which(sampleNames(eset) %in% rownames(pdata))]
pdata <- pdata[which(rownames(pdata) %in% sampleNames(eset)),]
eset <- eset[,order(sampleNames(eset))]
pdata <- pdata[order(rownames(pdata)),]
identical(rownames(pdata),sampleNames(eset))
sampleNames(eset) <- rownames(pdata)
pData(eset) <- pdata

###Create SYMBOL column required by some a4 functions
colnames(fData(eset))
fData(eset)[,"SYMBOL"] <- as.character(fData(eset)[,"geneSymbol"]) #Take information from the featureData (fData) in the eset
fData(eset)[,"ENTREZ"] <- as.character(fData(eset)[,"geneEntrezID"]) #Take information from the featureData (fData) in the eset
fData(eset)[,"ENSEMBL"] <- as.character(fData(eset)[,"geneEnsembl"]) #Take information from the featureData (fData) in the eset
fData(eset) <- fData(eset)[,c("probe","SYMBOL","ENTREZ","ENSEMBL")]

###REMOVE AFFY PROBES
index <- which( substr(rownames(eset),1,4)!="AFFX" )
eset <- eset[index, ]
rm(index)

#######################################################################################
# Plot PCA

setwd(analysisDir)
library(affycoretools)
# options(rgl.useNULL=TRUE)
# library(rgl)

eset$Cohort <- factor(eset$Cohort)
pdf("PCA_Pre-filter.pdf",width = 16, height = 9)
plotPCA(eset, groups=as.numeric(eset$Cohort), groupnames=levels(eset$Cohort), main="Pre-filter",
        pcs=c(1,2), plot3d=FALSE, col=c("red","blue"), pch=c(16,17), addtext=sampleNames(eset))
plotPCA(eset, groups=as.numeric(eset$Cohort), groupnames=levels(eset$Cohort), main="Pre-filter",
        pcs=c(1,3), plot3d=FALSE, col=c("red","blue"), pch=c(16,17), addtext=sampleNames(eset))
plotPCA(eset, groups=as.numeric(eset$Cohort), groupnames=levels(eset$Cohort), main="Pre-filter",
        pcs=c(3,2), plot3d=FALSE, col=c("red","blue"), pch=c(16,17), addtext=sampleNames(eset))
dev.off()
# plotPCA(eset, groups=as.numeric(eset$Cohort), groupnames=levels(eset$Cohort),
#         pcs=c(1,2,3), plot3d=TRUE, col=c("red","blue","orange"), pch=c(16,17,18))

#######################################################################################
# Draw a quick dendrogram to look for outliers in the unfiltered set
library(flashClust)
sampleTree <- flashClust(dist(t(exprs(eset))), method = "average");
# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.
# sizeGrWindow(12,9)
# pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
pdf("Dendrogram_Pre-filter.pdf",width = 16, height = 9)
par(cex = 0.9);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Pre-filter", sub="", xlab="", cex.lab = 1.5,
     cex.axis = 1.5, cex.main = 1.5)
dev.off()

### STOP: IF OUTLIERS ARE DETECTED IN THE DENDROGRAM, USE THIS
# Find clusters cut by the line and plot a line to show the cut
# abline(h = 115, col = "red"); #Determine cluster under the line
# clust <- cutreeStatic(sampleTree, cutHeight = 115, minSize = 10)
# table(clust)
# keepSamples <- (clust==1) #Cluster 1 contains the samples we want to keep.
# eset <- eset[,keepSamples] #SLE-GSM260919 was removed as an outlier
# nrow(eset)

# ### ALTERNATIVELY: IF OUTLIERS ARE DETECTED IN THE DENDROGRAM, USE THIS
# sampleNames(eset)
# eset <- eset[,sampleNames(eset)!="SLE-GSM260919"] #Remove the outlier sample(s) by name

##### Determine the 50% intensity threshold
nrow <- format(as.numeric(nrow(eset)),big.mark=",",scientific=F) #Make an object out of the number of rows that originally are in the eset
WB <- hist(rowMeans(exprs(eset)), breaks=1000, xlab="Average log2 Probe Intensity",
           main="Illuminate-1 Active Female Reanalyzed WB SLE All Intensities",
           sub=paste(nrow, "total probes"), xlim=c(2,15), col="yellow")

nrow(exprs(eset)) #70,492 (transcript cluster)
0.50*nrow(exprs(eset)) #33,764
wb.counts <- as.data.frame(WB$counts)
sum(wb.counts[1:414,]) #Find the break where the lower 50% of probes are removed
wb.breaks <- as.data.frame(WB$breaks)
cut.off <- as.data.frame(wb.breaks[414,]) #5.66
dev.off() #Clear any graphs

# par(mfrow=c(1,2)) #Setting graphics parameters A vector of length 2, where the first argument specifies the number of rows and the second the number of columns of plots

nrow <- format(as.numeric(nrow(eset)),big.mark=",",scientific=F) #Make an object out of the number of rows that originally are in the eset
pdf("Probe-Intensity-Threshold.pdf",height = 9,width = 16)
hist(rowMeans(exprs(eset)), breaks=100, xlab="Average log2 Probe Intensity",
     main="GSE88884_Illuminate1-Active_ESET_RMA_Affy: WB, ALL Probes",
     sub=paste(nrow, "total probes"), xlim=c(2,15), col="yellow")
abline(v=5.04, lty=1, lwd=3, col="red")
dev.off()

##### Apply the low intensity threshold filter
index <- which(rowMeans(exprs(eset))<5.66) #Get row numbers for probes with average expression less than 5.2
eset.genefiltered <- eset[-index,] #Remove the probes with average expression less than 5.2
nrow(eset.genefiltered) #Check the number of probes remaining 33855
eset <- eset.genefiltered

# Interquartile range filtering, as proposed by PMID:27657141, entails selection of the maximal point on a histograme of IQR values across all probes
# Here, that value for 12 SLE and 12 CTL is 0.265
# However, a universal IQR threshold for all datasets run on the same chip platform is optimal
# pdf("probe IQR Filter Threshold.pdf",width = 16,height = 9)
# iqr.df <- data.frame()
# for(i in 1:nrow(exprs(eset))) {
#   row <- as.vector(exprs(eset)[i,])
#   iqr.df[i,1] <- IQR(row)
# }
# rownames(iqr.df) <- rownames(exprs(eset))
# index <- which(iqr.df[,1]>0.05)
# iqr.df <- iqr.df[index,,FALSE]
# iqr.hist <- hist(iqr.df[,1], breaks=1000, xlab="probe IQRs",
#                  main="probe Density at IQR Intervals",
#                  sub=paste(nrow(iqr.df), "total probes"), xlim=c(0,1), col="yellow")
# iqr.max <- which(iqr.hist$counts==max(iqr.hist$counts))
# iqr <- as.vector(iqr.hist$mids)[iqr.max]
# abline(v=iqr, lty=2, lwd=2, col="red")
# dev.off()
# iqr.df <- iqr.df[which(iqr.df[,1]>iqr),,FALSE]
# eset.filtered <- eset[which(rownames(eset)%in%rownames(iqr.df)),]
# nrow(eset.filtered)
# eset <- eset.filtered
# rm(eset.filtered)

#######################################################################################
###Create the datExpr0 frame for WGCNA
datExpr0 <- exprs(eset) #Take out the expression data ONLY from the eset dataframe
nrow(datExpr0) #Make sure the number of rows/probes matches the filtered eset

# IMPORTANT! TRANSPOSE THE ORIGINAL EXPRESSION MATRIX
# WGCNA requires columns be probes and rows be arrays
datExpr0 <- t(datExpr0) 

### CHECK FOR MISSING EXCESSIVE VALUES AND IDENTIFY OUTLIER MICROARRAY SAMPLES

# Check for gene entries with NAs and very low counts
gsg <- goodSamplesGenes(datExpr0, verbose = 3);
gsg$allOK

# Display and remove these bad entries
if (!gsg$allOK)
{
  # Optionally, print the gene and sample names that were removed:
  if (sum(!gsg$goodGenes)>0)
    badGenes <- colnames(datExpr0)[!gsg$goodGenes];
  printFlush(paste("Removing genes:", paste(colnames(datExpr0)[!gsg$goodGenes], collapse = ", ")));
  if (sum(!gsg$goodSamples)>0)
    printFlush(paste("Removing samples:", paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));
  # Remove the offending genes and samples from the data:
  datExpr0 <- datExpr0[gsg$goodSamples, gsg$goodGenes]
} else {
  badGenes<-NULL
}

# We must remove probe columns containing only one value. Failure to do so breaks downstream TOM creation,
# and it must be performed now before module colors are assigned.
datExpr1 <- datExpr0[,apply(datExpr0,2,function(x) any(c(FALSE,x[-length(x)]!=x[-1])))]
ncol(datExpr0) - ncol(datExpr1)
datExpr0 <- datExpr1
rm(datExpr1,gsg)

# ##Remove bad probes eliminated by good genes function
datExpr1 <- t(datExpr0)
eset.wgcna <- eset[rownames(eset)%in%rownames(datExpr1),]
nrow(eset.wgcna)==nrow(datExpr1)
nrow(eset.wgcna) #Check the number of probes remaining
setwd(analysisDir)
save(eset.wgcna,file=paste0(file.base,"_eset_WGCNA.RData"))

#######################################################################################
# Plot PCA RECHECK
setwd(analysisDir)
library(affycoretools)
# options(rgl.useNULL=TRUE)
# library(rgl)

eset.wgcna$Cohort <- factor(eset.wgcna$Cohort)
pdf("PCA_Post-filter.pdf",width = 16, height = 9)
plotPCA(eset.wgcna, groups=as.numeric(eset.wgcna$Cohort), groupnames=levels(eset.wgcna$Cohort), main="Post-Filter",
        pcs=c(1,2), plot3d=FALSE, col=c("red","blue"), pch=c(16,17), addtext=sampleNames(eset.wgcna))
dev.off()
# plotPCA(eset.wgcna, groups=as.numeric(eset.wgcna$Cohort), groupnames=levels(eset.wgcna$Cohort),
#         pcs=c(1,2,3), plot3d=TRUE, col=c("red","blue"), pch=c(16,17,17))

#######################################################################################
# Draw a quick dendrogram to look for outliers in the filtered set
library(flashClust)
sampleTree <- flashClust(dist(datExpr0), method = "average");
# Plot the sample tree: Open a graphic output window of size 12 by 9 inches
# The user should change the dimensions if the window is too large or too small.
# sizeGrWindow(12,9)
pdf("Dendrogram_Post-filter.pdf",width = 16, height = 9)
par(cex = 0.9);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Post-Filter", sub="", xlab="", cex.lab = 1.5,
     cex.axis = 1.5, cex.main = 1.5)
dev.off()

datExpr <- datExpr0
ncol(datExpr)
nGenes <- ncol(datExpr) # number of columns (probes)
nSamples <- nrow(datExpr) # number of rows (arrays)
rm(sampleTree, datExpr0, datExpr1, index)

###########################################################################################
## LOAD CLINICAL TRAIT DATA (Applies to both filtered and unfiltered expression sets)

# Create a patients data frame analogous to expression data that will hold binary designations
# of trait membership, or a numerical measurement of a continuous clinical variable
patients.data <- pData(eset.wgcna)
rownames(patients.data) #Are they the same as the datExpr object, minus any outliers?
patients.data[is.na(patients.data)] <- ""
patients.data <- as.data.frame(patients.data)

colnames(patients.data)
datTraits <- patients.data[,c("Cohort","SLEDAI","Age",
                          "dsDNA_Level","dsDNA_IU",
                          "C3_Level","C3_GperL",
                          "C4_Level","C4_GperL")]

# Create numerical factors of binary designations

# Cohort
table(patients.data$Cohort)
datTraits$Cohort <- gsub("CTL",0,datTraits$Cohort)
datTraits$Cohort <- gsub("SLE",1,datTraits$Cohort)
datTraits$Cohort <- as.numeric(datTraits$Cohort)

# SLEDAI
table(patients.data$SLEDAI)
datTraits$SLEDAI <- as.numeric(datTraits$SLEDAI)

# Age
table(patients.data$Age)
datTraits$Age <- as.numeric(datTraits$Age)

#dsDNA_Level
table(patients.data$dsDNA_Level)
datTraits$dsDNA_Level <- gsub("Negative",0,datTraits$dsDNA_Level)
datTraits$dsDNA_Level <- gsub("Positive",1,datTraits$dsDNA_Level)
datTraits$dsDNA_Level <- as.numeric(datTraits$dsDNA_Level)

#dsDNA_IU
table(patients.data$dsDNA_IU)
datTraits$dsDNA_IU <- as.numeric(datTraits$dsDNA_IU)

#C3_Level
table(patients.data$C3_Level)
datTraits$C3_Level <- gsub("N",0,datTraits$C3_Level)
datTraits$C3_Level <- gsub("L",1,datTraits$C3_Level)
datTraits$C3_Level <- gsub("H",2,datTraits$C3_Level)
datTraits$C3_Level <- as.numeric(datTraits$C3_Level)

#C3_GperL
table(patients.data$C3_GperL)
datTraits$C3_GperL <- as.numeric(datTraits$C3_GperL)

#C4_Level
table(patients.data$C4_Level)
datTraits$C4_Level <- gsub("N",0,datTraits$C4_Level)
datTraits$C4_Level <- gsub("L",1,datTraits$C4_Level)
datTraits$C4_Level <- gsub("H",2,datTraits$C4_Level)
datTraits$C4_Level <- as.numeric(datTraits$C4_Level)

#C4_GperL
table(patients.data$C4_GperL)
datTraits$C4_GperL <- as.numeric(datTraits$C4_GperL)

#Race
table(patients.data$Race)

datTraits$Race_Black <- patients.data$Race
datTraits$Race_Black <- gsub("afr_amer",1,datTraits$Race_Black)
datTraits$Race_Black <- gsub("amer_ind_ak_native",0,datTraits$Race_Black)
datTraits$Race_Black <- gsub("asian",0,datTraits$Race_Black)
datTraits$Race_Black <- gsub("multiple",0,datTraits$Race_Black)
datTraits$Race_Black <- gsub("native_hw_pac_island",0,datTraits$Race_Black)
datTraits$Race_Black <- gsub("white",0,datTraits$Race_Black)
datTraits$Race_Black <- as.numeric(datTraits$Race_Black)

datTraits$Race_Native <- patients.data$Race
datTraits$Race_Native <- gsub("afr_amer",0,datTraits$Race_Native)
datTraits$Race_Native <- gsub("amer_ind_ak_native",1,datTraits$Race_Native)
datTraits$Race_Native <- gsub("asian",0,datTraits$Race_Native)
datTraits$Race_Native <- gsub("multiple",0,datTraits$Race_Native)
datTraits$Race_Native <- gsub("native_hw_pac_island",0,datTraits$Race_Native)
datTraits$Race_Native <- gsub("white",0,datTraits$Race_Native)
datTraits$Race_Native <- as.numeric(datTraits$Race_Native)

datTraits$Race_White <- patients.data$Race
datTraits$Race_White <- gsub("afr_amer",0,datTraits$Race_White)
datTraits$Race_White <- gsub("amer_ind_ak_native",0,datTraits$Race_White)
datTraits$Race_White <- gsub("asian",0,datTraits$Race_White)
datTraits$Race_White <- gsub("multiple",0,datTraits$Race_White)
datTraits$Race_White <- gsub("native_hw_pac_island",0,datTraits$Race_White)
datTraits$Race_White <- gsub("white",1,datTraits$Race_White)
datTraits$Race_White <- as.numeric(datTraits$Race_White)

#Region
table(patients.data$Region)

datTraits$Region_Europe <- patients.data$Region
datTraits$Region_Europe <- gsub("europe",1,datTraits$Region_Europe)
datTraits$Region_Europe <- gsub("mex_ca_sa",0,datTraits$Region_Europe)
datTraits$Region_Europe <- gsub("rest_world",0,datTraits$Region_Europe)
datTraits$Region_Europe <- gsub("us_can",0,datTraits$Region_Europe)
datTraits$Region_Europe <- as.numeric(datTraits$Region_Europe)

datTraits$Region_South <- patients.data$Region
datTraits$Region_South <- gsub("europe",0,datTraits$Region_South)
datTraits$Region_South <- gsub("mex_ca_sa",1,datTraits$Region_South)
datTraits$Region_South <- gsub("rest_world",0,datTraits$Region_South)
datTraits$Region_South <- gsub("us_can",0,datTraits$Region_South)
datTraits$Region_South <- as.numeric(datTraits$Region_South)

datTraits$Region_North <- patients.data$Region
datTraits$Region_North <- gsub("europe",0,datTraits$Region_North)
datTraits$Region_North <- gsub("mex_ca_sa",0,datTraits$Region_North)
datTraits$Region_North <- gsub("rest_world",0,datTraits$Region_North)
datTraits$Region_North <- gsub("us_can",1,datTraits$Region_North)
datTraits$Region_North <- as.numeric(datTraits$Region_North)

# Match row names of datExpr and datTraits
datExpr <- datExpr[ order(rownames(datExpr)), ]
datTraits <- datTraits[ order(rownames(datTraits)), ]
identical( rownames(datExpr), rownames(datTraits) )

#####Clean up the dataset and display final dendrogram with corresponding clinical factors
collectGarbage();

# Re-cluster samples after outlier removal and display dendrogram
sampleTree <- flashClust(dist(datExpr), method = "average")
# Convert traits to a color representation: white means low, red means high, grey means missing entry
traitColors <- numbers2colors(datTraits, signed = FALSE);
# Plot the sample dendrogram and the colors underneath
pdf("Dendrogram_Traits.pdf",width = 16, height = 9)
plotDendroAndColors(sampleTree, traitColors,
                    groupLabels = names(datTraits),
                    main = "Post-Filter" )
dev.off()
### SAVE OBJECTS 
save(datExpr, datTraits, eset.wgcna, badGenes, file="part1-final-objects.RData")

################################################################################################
### WGCNA PART 2

# This script uses the datExpr and datTraits to establish a soft threshold

################################################################################################
### LOAD AND CONFIGURE WGCNA LIBRARY
analysisDir <- "~/Catalina-et-al_Nat-Comm_Suppl-Scripts/WGCNA_EXAMPLE/"
library(WGCNA)
options(stringsAsFactors = FALSE);
allowWGCNAThreads()

################################################################################################

# Load files
setwd(analysisDir)
load("EXAMPLE_WGCNA_analysis_parameters.RData")
load("part1-final-objects.RData")
# Choose a set of soft-thresholding powers
# First generate a sequence of powers to apply
powers <- c(c(1:10), seq(from = 12, to=30, by=2))

# Call the network topology analysis function

# BlockSize of 10,000 quickly ran on 8 VCPUs and 52 Gb memory
sft <- pickSoftThreshold(datExpr, dataIsExpr = TRUE, powerVector = powers, verbose = 5, networkType="signed", corFnc = "bicor",
                         moreNetworkConcepts = TRUE, blockSize=10000) #bicor is the only option

pdf("SFT.pdf",width = 16,height = 9)
par(mfrow = c(1,2));
cex1 <- 0.8; # font size
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab=paste(active.base,"STP Selection",sep=" "),ylab=paste(active.base,"Scale Free Topology Model Fit, signed",sep=" "),type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");

# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")
dev.off()
###SELECT THE THRESHOLD NUMBER AT WHICH THE SFT MODEL FIT PLATEAUS
###NOTE: If graphs are unusual, consider re-filtering, OR if sample numbers are low, use the SFT values on the WGCNA FAQ page table

save(sft, file="part2-output-SFT.RData")

sft.fits <- sft$fitIndices
write.table(sft.fits,file="part2-output-STP.txt", quote=FALSE, sep = "\t",row.names=FALSE, col.names=TRUE)

# You will need to change the power and update the saveTOMFileBase with the current date

### WGCNA PART 3 ADJACENCY MATRIX GENERATION AND TOM ENCODING
#

# In general, the only parameters to be changed here are file naming and the power selection.
# Also, this script must be run from the VM terminal, i.e. $sudo Rscript part3-TOM.R

library(WGCNA)
options(stringsAsFactors = FALSE);
allowWGCNAThreads()
setwd("~/Catalina-et-al_Nat-Comm_Suppl-Scripts/WGCNA_EXAMPLE/")
load("EXAMPLE_WGCNA_analysis_parameters.RData")
load("part1-final-objects.RData")

corType = "pearson"
power = 10
networkType = "signed"
TOMType = "signed"
deepSplit = 3
minModuleSize = 100
numericLabels = TRUE
detectCutHeight = 1
mergeCutHeight = 0.2
pamStage = TRUE
pamRespectsDendro = TRUE
saveTOMFileBase = paste0(file.base,"_SP10_10K_28Aug2018")

TOM.settings <- c(power, networkType, TOMType, deepSplit, minModuleSize, numericLabels, detectCutHeight, mergeCutHeight, 
                  pamStage, pamRespectsDendro, saveTOMFileBase)
names(TOM.settings) <- c("power", "networkType", "TOMType", "deepSplit", "minModuleSize", "numericLabels", "detectCutHeight",
                         "mergeCutHeight", "pamStage", "pamRespectsDendro", "saveTOMFileBase")
save(TOM.settings,file=paste0(saveTOMFileBase,"_TOM_settings.RData"))

net = blockwiseModules(datExpr,
                       power = power,
                       corType=corType,
                       deepSplit = deepSplit,
                       networkType = "signed", # ALWAYS SIGNED
                       TOMType = "signed", # ALWAYS SIGNED
                       minModuleSize = minModuleSize,
                       numericLabels = TRUE,
                       detectCutHeight = detectCutHeight,
                       mergeCutHeight = mergeCutHeight,
                       pamStage = pamStage,
                       pamRespectsDendro = pamRespectsDendro, 
                       saveTOMs = TRUE,
                       saveTOMFileBase = saveTOMFileBase,
                       verbose = 5,
                       maxBlockSize = 10000) # ALWAYS 10,000

save(net,file=paste0(saveTOMFileBase,"_NET.RData"))

### PART 4 - Recutting dendrogram trees

###############################################################################################################
### CONCEPTUAL OVERVIEW
# We first recut the blocks of clustered weighted, co-expression gene using all four levels
# of deep split (DS) and generally inspect the per-probe module assignment along with color
# bars of correlation to all clinical traits accompanying the dataset. It is these four DS
# recuts we scrutinize using subsequent part5 figures and calculations.

################################################################################################
### LOAD SUPPORTING AMPEL WGCNA SCRIPTS
# Load all scripts in shared scripts WGCNA directory
# scriptPaths <- list.files(pattern="[.]R$", path="/mnt/disks/rserver/projects/CopyOfshared_scripts/wgcna", full.names=TRUE)
# scriptPaths
# source(scriptPaths)
# rm(scriptPaths)

################################################################################################
### LOAD LIBRARIES AND CONFIGURE SESSION
library(WGCNA)
library(Hmisc) # contains capitalize function
library(Biobase)
options(stringsAsFactors = FALSE);
allowWGCNAThreads()

################################################################################################
### LOAD STUDY DATA
setwd(analysisDir)
load("part1-final-objects.RData")
eset <- eset.wgcna # Helps confirm we're loading the eset resulting from WGCNA variance filtering
rm(eset.wgcna)

################################################################################################
### LOAD TOM SETTINGS & MODULE AND COLOR ASSIGNMENTS
load(paste0(saveTOMFileBase,"_TOM_settings.RData"))
load(paste0(saveTOMFileBase,"_NET.RData"))

################################################################################################

projectName <- active.base
save(projectName,file="projectName.RData")
pData <- pData(eset)
fData <- fData(eset)
minModuleSize.original <- as.numeric(TOM.settings["minModuleSize"])
projectDir <- getwd() # Save original working directory to return to after directory traversal
TOMfiles <- net$TOMFiles

################################################################################################
## CALL THE SCRIPT TO ADD COLOR NAMES TO WGNCA MODULES GENERATED DURING TOM CREATION
## Note we'll be regenerating these anyways during the recut, so this is only
## really used for dianostics
source(paste0(scriptDir,"wgcna.net.colornames.R"))

###############################################################################################################
## RECUT THE TOM NETWORK BLOCKS USING FOUR LEVELS OF DEEP SPLIT
## This repeats blockwise module detection from the pre-calculated TOM
## Patience, these recuts take 2-3 minutes each, even on the cloud.

## SET POTENTIAL NEW RECUT PARAMETERS. SEE WGCNA DOCUMENTATION FOR ADDITIONAL INFORMATION
# Here we have the opportunity to experiment specficially usage of the PAM stage, adjustment of
# module cut height, minimum module size, and of course the four levels of deep split (DS1-4).
# Note PAM stage is conducted as a second pass to assign modules not first assigned by the general recut,
# and historically it's always been used.

recut.corType <- "pearson" # pearson is safest. bicor is other common option but usually results in many unassigned probes
recut.pamStage <- TRUE # Usually true
recut.pamRespectsDendro <- TRUE # Always true if using PAM stage
recut.detectCutHeight <- 1 # Usually 1 for most experimentss, adjust carefully!!
recut.minModuleSize <- 100 # Experiment with this carefully! Raise as needed for sets with large numbers of probes/genes
recut.mergeCutHeight <- 0.2 # Usually 0.2. Module merging can occur downstream

experimentName.recut <- paste("recut.","STP",TOM.settings["power"],".pam",recut.pamStage,".minMod-", recut.minModuleSize,sep="")
experimentName.recut
save(experimentName.recut, file="experimentName.recut.RData")

###############################################################################################################

###############################
### CALL THE RECUT BLOCKS SCRIPT. THIS MAY TAKE UP TO ~10 MINUTES EVEN ON THE CLOUD
source(paste0(scriptDir,"wgcna.blocks.recut.v2.R"))
###############################
# SAVE THE RECUT OBJECTS. IMPORTANT!!! ALL OUTPUT AND FIGURES RELATED TO A RECUT OF THE
# MASTER TOM ARE STORED IN A PROJECT SUBDIRECTORY!!!

setStorageDir <- dget(paste0(scriptDir,"wgcna/set.storage.directory.R"))
outputStorage <- setStorageDir( experimentName.recut )


save( moduleColors, moduleColors.DS1, moduleColors.DS2, moduleColors.DS3, moduleColors.DS4,
      moduleLabels, moduleLabels.DS1, moduleLabels.DS2, moduleLabels.DS3, moduleLabels.DS4,
      MEs, MEs.labels, MEs.DS1, MEs.DS1.labels, MEs.DS2, MEs.DS2.labels, MEs.DS3, MEs.DS3.labels, MEs.DS4, MEs.DS4.labels,
      colorLevels, colorLevels.DS1, colorLevels.DS2, colorLevels.DS3, colorLevels.DS4,
      file="recut.DS-DS1-DS4.modules.and.genes.RData")

setwd(analysisDir)

###############################################################################################################
### PLOT THE BLOCK DENDROGRAMS OF RECUT PROBES

outputStorage <- setStorageDir( c(experimentName.recut,"plots") )

pngWidth=1500 # width of output PNGs
pngHeight=1000 # height of output PNGs
marAll = c(1, 7, 3, 1) # bottom, left, top and right margins
source(paste0(scriptDir,"wgcna.blocks.dendro.with.traitbars.R"))

## PART 5 Correlate modules to clinical traits and select final modules
saveTOMFileBase = paste0(file.base,"_SP10_10K_28Aug2018") #### change
sigTrait <- "SLEDAI" #### change

# You will need to check the rownames of the deepsplits ~ line 100
# You will also need to change your final choice of deep split and other options ~ line 400

###############################################################################################################
### CONCEPTUAL OVERVIEW
# We first recut the blocks of clustered weighted, co-expression gene using all four levels
# of deep split (DS) and generally inspect the per-probe module assignment along with color
# bars of correlation to all clinical traits accompanying the dataset. It is these four DS
# recuts we scrutinize using these subsequent figures and calculations in order to choose the
# deep split that provides the most valuable modules.
#
# 1. We correlate the module eigengenes (MEs) of each DS to clinical traits. Here we're looking for the maximum 
# number of modules significantly correlated to our clinical trait of greatest interest, usually SLEDAI, but
# all correlations are later stored and reported in heatmaps and data tables.
#
# 2. We annotate genes within the four DS modules using both GO and AMPEL BIG-C categories,
# with the goal of looking for a DS that yields modules discretely capturing
# single categories.
#
# 3. Dendrograms are generated for each deep split, which are used more to help establish if modules are too
# small or need to be merged. The case to merge modules can also be built by inspecting how much
# GO or BIGC annotation they share. 
#
# 4. Heatmaps are automatically generated for every module in every deepsplit, and stored in a subdirectory.
#
# 5. We move from the module level to the gene level, calculating the kME & kIM and look at per-module
# statistics and figures of these two.
#
# 6. Recall the ME is calculated using the average gene counts of all samples, so for each module we inspect
# the contribution of each sample to that module's averaged counts. Historically we've found many modules are
# generated due to the contribution of a single sample very strongly expressing those genes. Clearly these
# modules must be disregarded because they contain genes that don't adequately describe the difference
# between diseased and normal
#
# 7. Per ME we inspect the scatterplots of module membership (kIM) vs. correlation to significant clinical trait.
# This is more a visual confirmation of "module quality", a characteristic that one day needs to be quantified.
#
# 8. We include a similar scatterplot of the grey module, ensuring interesting genes weren't lost to the dump module.
#
# 9. The genes with gene annotation, module color assignment, gene-level correlations to clinical traits,
# kMEs and kIMs are bound into the final geneLists.
#
# 10. geneLists are output to a Microsoft Excel workbook, including a summary sheet, room for figures, 
# and workseet for each module gene list.
#
###############################################################################################################

library(WGCNA)
library(Hmisc) # contains capitalize function
library(a4)
options(stringsAsFactors = FALSE);
allowWGCNAThreads()

# Load project level output
setwd(analysisDir)
load("part1-final-objects.RData")
eset <- eset.wgcna
rm(eset.wgcna)
pData <- pData(eset)
fData <- fData(eset)

load(paste0(saveTOMFileBase,"_NET.RData"))
load(paste0(saveTOMFileBase,"_TOM_settings.RData"))
load("projectName.RData")

# Load recut module output
load("experimentName.recut.RData")
experimentDir<-paste0(analysisDir,experimentName.recut)
setwd(experimentDir)
load("recut.DS-DS1-DS4.modules.and.genes.RData")

########################################################################################
# SET CLINICAL TRAIT OF MOST INTEREST, USUALLY SLEDAI. ALSO SET GLOBAL P-VAL

p_val <- 0.2
moduleColors.batch <- c("moduleColors", "moduleColors.DS1", "moduleColors.DS2",
                        "moduleColors.DS3", "moduleColors.DS4")
moduleCor.batch <- c("moduleCor","moduleCor.DS1","moduleCor.DS2","moduleCor.DS3","moduleCor.DS4")
MEs.batch <- c("MEs", "MEs.DS1", "MEs.DS2", "MEs.DS3", "MEs.DS4")

########################################################################################
# DS-DS1-DS4 Calculate module correlations to clinical traits
# as the modules originally created during TOM generation. The "DS-orig" is almost always a DS of 3,
# but I do refer to it here as simply "DS", followed by the four recuts of DS1, DS2, DS3, DS4 (DS1-DS4)
# In theory the deepsplit function within the TOM() function should generate results identical to
# those from the recut() function!


colnames(eset)
rownames(datTraits)
rownames(datExpr)
rownames(MEs)
rownames(MEs.DS1)
rownames(MEs.DS2)
rownames(MEs.DS3)
rownames(MEs.DS4)
#MEs.DS1<-MEs.DS1[order(rownames(MEs.DS1),decreasing=TRUE),]
#MEs.DS2<-MEs.DS2[order(rownames(MEs.DS2),decreasing=TRUE),]
#MEs.DS3<-MEs.DS3[order(rownames(MEs.DS3),decreasing=TRUE),]
#MEs.DS4<-MEs.DS4[order(rownames(MEs.DS4),decreasing=TRUE),]

# Graphics parameters for module corr
saveFile <-  TRUE # Specify if output plots will be saved
plotMar <- c(8, 6, 4, 2)
xLabels <- colnames(datTraits)
xColorOffset <- 0
xLabelsAngle <- 45
cex.lab.x <- 1
cex.lab.y <- 0.5
text.size <- 0.2
pngWidth <- 1200
pngHeight <- 1000

setwd(analysisDir)
setStorageDir <- dget(paste0(scriptDir,"wgcna/set.storage.directory.R"))
outputStorage <- setStorageDir( c(experimentName.recut,"plots" ) )

source(paste0(scriptDir,"wgcna.cor.modules.traits.R"))

rm(saveFile,plotMar,xLabels,xColorOffset,xLabelsAngle,cex.lab.x,cex.lab.y,text.size)

setwd(analysisDir)

########################################################################################
# GET A QUICK SNAPSHOT OF SIGNIFICANT CORRELATION TO CLINICAL TRAIT OF MOST INTEREST

sig.orig <- length(which(moduleCor[,paste(sigTrait,".p",sep="")]<p_val))
sig.DS1 <- length(which(moduleCor.DS1[,paste(sigTrait,".p",sep="")]<p_val)) 
sig.DS2 <- length(which(moduleCor.DS2[,paste(sigTrait,".p",sep="")]<p_val))
sig.DS3 <- length(which(moduleCor.DS3[,paste(sigTrait,".p",sep="")]<p_val))
sig.DS4 <- length(which(moduleCor.DS4[,paste(sigTrait,".p",sep="")]<p_val))

print(paste(sig.orig, " modules sig to ",sigTrait,": DS",TOM.settings["deepSplit"]," TOM cut",sep=""))
print(paste(sig.DS1, " modules sig to ",sigTrait,": DS1 cut",sep=""))
print(paste(sig.DS2, " modules sig to ",sigTrait,": DS2 cut",sep=""))
print(paste(sig.DS3, " modules sig to ",sigTrait,": DS3 cut",sep=""))
print(paste(sig.DS4, " modules sig to ",sigTrait,": DS4 cut",sep=""))

rm(sig.orig,sig.DS1,sig.DS2,sig.DS3,sig.DS4)

########################################################################################
# BEGIN CREATION OF GENE LISTS

fData$fData.order <- rep(1:nrow(fData))
geneList.DS <- cbind( moduleColors, fData )
geneList.DS1 <- cbind( moduleColors.DS1, fData )
geneList.DS2 <- cbind( moduleColors.DS2, fData )
geneList.DS3 <- cbind( moduleColors.DS3, fData )
geneList.DS4 <- cbind( moduleColors.DS4, fData )

##############################################################################
# ANNOTATE WITH BIGC
# 
# setwd(analysisDir)
# bigc <- read.delim(paste0(esetDir,"BIGCv4-3_NSG_17Aug2017.txt"), header=TRUE, sep="\t")
# 
# geneList.DS.1 <- merge( geneList.DS, bigc[,1:3], by.x="ENTREZ", by.y="Entrez_ID", all.x=TRUE, all.y=FALSE )
# geneList.DS.1 <- geneList.DS.1[order(geneList.DS.1$fData.order),]
# geneList.DS <- geneList.DS.1
# rm(geneList.DS.1)
# 
# geneList.DS1.1 <- merge( geneList.DS1, bigc[,1:3], by.x="ENTREZ", by.y="Entrez_ID", all.x=TRUE, all.y=FALSE )
# geneList.DS1.1 <- geneList.DS1.1[order(geneList.DS1.1$fData.order),]
# geneList.DS1 <- geneList.DS1.1
# rm(geneList.DS1.1)
# 
# geneList.DS2.1 <- merge( geneList.DS2, bigc[,1:3], by.x="ENTREZ", by.y="Entrez_ID", all.x=TRUE, all.y=FALSE )
# geneList.DS2.1 <- geneList.DS2.1[order(geneList.DS2.1$fData.order),]
# geneList.DS2 <- geneList.DS2.1
# rm(geneList.DS2.1)
# 
# geneList.DS3.1 <- merge( geneList.DS3, bigc[,1:3], by.x="ENTREZ", by.y="Entrez_ID", all.x=TRUE, all.y=FALSE )
# geneList.DS3.1 <- geneList.DS3.1[order(geneList.DS3.1$fData.order),]
# geneList.DS3 <- geneList.DS3.1
# rm(geneList.DS3.1)
# 
# geneList.DS4.1 <- merge( geneList.DS4, bigc[,1:3], by.x="ENTREZ", by.y="Entrez_ID", all.x=TRUE, all.y=FALSE )
# geneList.DS4.1 <- geneList.DS4.1[order(geneList.DS4.1$fData.order),]
# geneList.DS4 <- geneList.DS4.1
# rm(geneList.DS4.1)
# 
# geneList.batch <- c("geneList.DS","geneList.DS1","geneList.DS2","geneList.DS3","geneList.DS4")


########################################################################################
# SAVE EARLY GENE LISTS

outputStorage <- setStorageDir( c(experimentName.recut ) )
save(geneList.DS, geneList.DS1,geneList.DS2,geneList.DS3,geneList.DS4,file="geneLists.RData")
setwd(analysisDir)

########################################################################################


########################################################################################
## PLOT DENDROGRAM OF ALL MODULES FOR VARIOUS DEEPSPLITS

outputStorage <- setStorageDir( c(experimentName.recut,"plots" ) )

# Load the plot dendrogram function
source(paste0(scriptDir,"wgcna.deepsplits.dendros.R"))

# parMar sets bottom, left, top, right margins

# Original DS3 cut
plotDendro(datExpr, moduleColors, moduleCor, sigTrait, parMar=c(6,10,2,14), p_val, cex=0.7, deepSplit="DS3",
           chart.filename="dendro.TOMDS3.png", pngWidth=1400, pngHeight=1000, saveFile=TRUE)

# DS1 recut
plotDendro(datExpr, moduleColors.DS1, moduleCor.DS1, sigTrait, parMar=c(6,10,2,14), p_val, cex=0.7, deepSplit="DS1",
           chart.filename="dendro.DS1.png", pngWidth=1400, pngHeight=1000, saveFile=TRUE)

# DS2 recut
plotDendro(datExpr, moduleColors.DS2, moduleCor.DS2, sigTrait, parMar=c(6,10,2,14), p_val, cex=0.7, deepSplit="DS2",
           chart.filename="dendro.DS2.png", pngWidth=1400, pngHeight=1000, saveFile=TRUE)

# DS3 recut
plotDendro(datExpr, moduleColors.DS3, moduleCor.DS3, sigTrait, parMar=c(6,10,2,14), p_val, cex=0.7, deepSplit="DS3",
           chart.filename="dendro.DS3.png", pngWidth=1400, pngHeight=1000, saveFile=TRUE)

# DS4 recut
plotDendro(datExpr, moduleColors.DS4, moduleCor.DS4, sigTrait, parMar=c(6,10,2,14), p_val, cex=0.7, deepSplit="DS4",
           chart.filename="dendro.DS4.png", pngWidth=1400, pngHeight=1000, saveFile=TRUE)

setwd(analysisDir)

########################################################################################
## GENERATE HEATMAPS OF probe/GENE EXPRESSION IN ALL DEEPSPLIT MODULES

# Load the heatmap function
source(paste0(scriptDir,"wgcna.module.genes.heatmap.R"))

# Save the many heatmaps in their own directory
setStorageDir <- dget(paste0(scriptDir,"wgcna/set.storage.directory.R"))
outputStorage <- setStorageDir( c(experimentName.recut,"plots","heatmaps" ) )

library(gplots)
# Original DS3 cut
heatmap.module( moduleNames=colnames(MEs), moduleCor=moduleCor, geneInfo=geneList.DS, eset=eset, saveFile=TRUE, 
                deepSplit="DS3-TOM", cexRow = 0.1, cexCol=1.2, margins=c(7,7),
                pngWidth=1800, pngHeight=800)
# DS1 recut
heatmap.module( moduleNames=colnames(MEs.DS1), moduleCor=moduleCor.DS1, geneInfo=geneList.DS1, eset=eset, saveFile=TRUE, 
                deepSplit="DS1", cexRow = 0.1, cexCol=1.2, margins=c(7,7),
                pngWidth=1800, pngHeight=800)

# DS2 recut
heatmap.module( moduleNames=colnames(MEs.DS2), moduleCor=moduleCor.DS2, geneInfo=geneList.DS2, eset=eset, saveFile=TRUE, 
                deepSplit="DS2", cexRow = 0.1, cexCol=1.2, margins=c(7,7),
                pngWidth=1800, pngHeight=800)

# DS3 recut
heatmap.module( moduleNames=colnames(MEs.DS3), moduleCor=moduleCor.DS3, geneInfo=geneList.DS3, eset=eset, saveFile=TRUE, 
                deepSplit="DS3", cexRow = 0.1, cexCol=1.2, margins=c(7,7),
                pngWidth=1800, pngHeight=800)

# DS4 recut
heatmap.module( moduleNames=colnames(MEs.DS4), moduleCor=moduleCor.DS4, geneInfo=geneList.DS4, eset=eset, saveFile=TRUE, 
                deepSplit="DS4", cexRow = 0.1, cexCol=1.2, margins=c(7,7),
                pngWidth=1800, pngHeight=800)

setwd(analysisDir)

########################################################################################
## CORRELATE probeS/GENES TO ALL CLINICAL TRAITS AND BIND TO GENE LISTS

# Inspect the probes with 0 standard deviation!!!
gene.corr.r <- cor(datExpr, datTraits, use = "p")
gene.corr.p <- corPvalueStudent(gene.corr.r, nrow(datExpr))

# Bind correlations to their pvals
colnames(gene.corr.p) <- paste( colnames(gene.corr.p), ".p", sep="" )

alternate.cols <- function(m1, m2) {
  cbind(m1, m2)[, order(c(seq(ncol(m1)), seq(ncol(m2))))]
}
gene.corr.r_p <- alternate.cols(gene.corr.r, gene.corr.p)

setwd(experimentDir)
save(gene.corr.r_p, file="gene_correlations_datTraits.RData")

########################################################################################
## CALCULATE KMEs FOR ALL DEEP SPLITS
# Signed kME is essentially modular membership, meaning the correlation of each probe to the eigengene
# of it's containing module, calculated as MM = as.data.frame(cor(datExpr, MEs, use = "p"));
# Dr. Peter Langfelder writes "There's also a function called signedKME which essentially does the same 
# as your cor() call, but adds a bunch of checks for correct data formatting and for non-varying genes
# or genes with too many missing values."
# See https://support.bioconductor.org/p/69858/

for (i in 1:length(MEs.batch)) {
  print(paste("Calculating kMEs for",MEs.batch[i]))
  KME.DS <- signedKME( datExpr, get(MEs.batch[i]), outputColumnName="KME")
  KME.DS.p <- corPvalueStudent(as.matrix(KME.DS), nSamples=ncol(datTraits))
  
  colnames(KME.DS) <- paste("",colnames(get(MEs.batch[i])),sep="")
  colnames(KME.DS.p) <- paste( colnames(KME.DS), ".p", sep="" )
  
  KME.DS.final <- alternate.cols(KME.DS,KME.DS.p)
  
  assign(paste("KME.",MEs.batch[i],sep=""), KME.DS.final)
  
}

save(KME.MEs,KME.MEs.DS1,KME.MEs.DS2,KME.MEs.DS3,KME.MEs.DS4, file="gene_KMEs.RData")

KME.batch <- c("KME.MEs", "KME.MEs.DS1", "KME.MEs.DS2", "KME.MEs.DS3", "KME.MEs.DS4")

###############
# Generate a frame of the KME the genes were primarily assigned to


for (i in 1:length(KME.batch)) {
  KME.primary <- matrix(nrow=nrow(get(KME.batch[i])),ncol=1)
  KME.corr <- get(KME.batch[i])[,seq(1, ncol(get(KME.batch[i])), 2)]
  print(paste("Calculating kMEs for ",KME.batch[i],sep=""))
  for (j in 1:nrow(geneList.DS)) {
    if (!is.null(KME.corr[i, geneList.DS[j,"moduleColors"] ])) {
      KME.primary[j] <- KME.corr[j, geneList.DS[j,"moduleColors"] ]
      assign(paste("KME.primary",KME.batch[i],sep=""), KME.corr)
    }
  }
}


save(KME.primaryKME.MEs,KME.primaryKME.MEs.DS1,KME.primaryKME.MEs.DS2,KME.primaryKME.MEs.DS3,KME.primaryKME.MEs.DS4, file="gene_primaryKMEs.RData")

########################################################################################
## CALCULATE KIMs FOR ALL DEEP SPLITS
# Calculate kIM, or intramodular connectivity, i.e. connectivity of nodes to other nodes
# within the same module. Note kME and kIM values should be fairly similar, as a strong kME
# correlation of a gene to the eigengene, or strong modular membership, should suggest
# a high degree of connectedness to other intramodular genes, as a hub gene.
# Again, see https://support.bioconductor.org/p/69858/.
# source(paste0(scriptDir,"wgcna.kIMs.per.block.R"))

for (i in 1:length(moduleColors.batch)) {
  
  print(paste("Calculating intramodular connectivities for ",moduleColors.batch[i],sep=""))
  
  kIM=intramodularConnectivity.fromExpr(datExpr, get(moduleColors.batch[i]), corFnc = "cor",
                                        corOptions = "use = 'p'",
                                        distFnc = "dist", distOptions = "method = 'euclidean'",
                                        networkType = "signed",
                                        power = as.integer(TOM.settings["power"]),
                                        scaleByMax = TRUE,
                                        ignoreColors = if (is.numeric(colors)) 0 else "grey",
                                        getWholeNetworkConnectivity = TRUE)
  
  assign(paste("kIM.",moduleColors.batch[i],sep=""), kIM)
}

save(kIM.moduleColors,kIM.moduleColors.DS1,kIM.moduleColors.DS2,kIM.moduleColors.DS3,kIM.moduleColors.DS4,
     file="genes_kIMs.RData")

########################################################################################
## PLOT INTRAMODULAR MEMBERSHIP VS. CORRELATION TO CLINICAL TRAIT OF INTEREST

setStorageDir <- dget(paste0(scriptDir,"wgcna/set.storage.directory.R"))
outputStorage <- setStorageDir( c(experimentName.recut,"plots") )

source(paste0(scriptDir,"wgcna.plot.module.im.vs.sigtrait-corr.R"))
# n.b. this doesn't plot grey module because there is no kIM data
kIMvsCorrPlot(moduleCor.DS4,moduleColors.DS4,kIM.moduleColors.DS4,DS.tag="DS4")
source(paste0(scriptDir,"wgcna.plot.module.me.vs.sigtrait-corr.R"))
kMEvsCorrPlot(moduleCor.DS4,moduleColors.DS4,KME.primaryKME.MEs.DS4,DS.tag="DS4")
source(paste0(scriptDir,"wgcna.plot.mes.and.boxplot.R"))
ME_plots(MEs=MEs.DS4,DS.tag="DS4")

########################################################################################
## BRING IT ALL TOGETHER BY BINDING GENEINFO, GENE.CORR.R_P, KME, AND KIM

setwd(experimentDir)

# choose a final deep split to go with

geneList.DS4.final <- cbind(geneList.DS4, gene.corr.r_p, kIM.moduleColors.DS4, KME.MEs.DS4)

write.table(geneList.DS4.final,file="geneList.DS4.final.txt",row.names=TRUE, quote=FALSE, sep="\t")

colnames(geneList.DS4.final)
geneList.DS4.final.simple <- geneList.DS4.final[,1:40] # remove all module name columns

index <- which(!is.na(geneList.DS4.final.simple[,"ENTREZ"]))
geneList.DS4.final.simple <- geneList.DS4.final.simple[index,]

write.table(geneList.DS4.final.simple,file="geneList.DS4.final.simple.txt",row.names=FALSE, quote=FALSE, sep="\t")

# index <- which(geneList.DS.final.simple[,"moduleColors"]==c("lightyellow","brown"))
# 
# geneList.DS.final.simple.lightyellow.brown <- geneList.DS.final.simple[index,]
# 
# write.table(geneList.DS.final.simple.lightyellow.brown,file="geneList.DS.final.simple.lightyellow.brown.txt",row.names=TRUE, quote=FALSE, sep="\t")

save(moduleCor.DS4,geneList.DS4.final.simple,file="final_results.DS4.RData")
write.table(moduleCor.DS4,file="moduleCor.DS4.txt",row.names=TRUE, quote=FALSE, sep="\t")

WGCNA PART 6 EXPORT RESULTS TO EXCEL

load("experimentName.recut.RData")
experimentDir<-paste0(analysisDir,experimentName.recut)
setwd(experimentDir)
load("final_results.DS4.RData")
geneInfo<-geneList.DS4.final.simple
colnames(geneInfo)[2]<-"moduleColor"
moduleCor<-moduleCor.DS4
rm(geneList.DS4.final.simple,moduleCor.DS4)

# you will need to make changes throughout this script

###############################################################################################################
# EXPORT MODULES TO EXCEL
options(java.parameters = "-Xmx16000m")
library(xlsx)
wb <- createWorkbook()

title.stem<-paste0(file.base,"_WGCNA_Final-Report_NSG_28Aug2018")

##############################################
# CREATE SUMMARY PAGE

# Include at least:
# 1. GSE number / ArrayExpress number
# 2. Study title
# 3. PubMed ID (if referenced in a publication)
# 4. Type of sample
# 5. How sample was isolated
# 6. Timepoints in the study if there are more than one
# 7. What is being compared (e.g., Disease to healthy; disease to disease)
# 8. Whether or not there is in vitro stimulation of the cells
# 9. Numbers of samples: divided into healthy control and disease type if applicable
# 10. Ethnicity of patients
# 11. Clinical data (e.g., cohort, SLEDAI, drugs, etc.)
# 12. Platform used (e.g., Affy hgU133plus2)
# 13. ID of outliers (ie GSMxxxx)
# 14. Data normalization steps
# 15. Analysis files location
# 16. Version control (e.g., date of analysis and by whom)
# 17. Deepsplit used (i.e., 1, 2, 3, or 4)
# 18. Soft threshold used (e.g., 20)
# 19. Count of starting probes
# 20. Count of retained, analyzed probes
# 21. Correlation function type (i.e., Pearson, Bicor [biweight midcorrelation], or Spearman)
# 22. Signed or unsigned network
# 23. Were duplicates removed post-modularization, and if so, using what collapseRow method
# 24. Block size used in generating SFT and making TOMs

study <- data.frame(nrow=33, ncol=2)
study[1,] <- c("Study Number","GSE88884")
study[2,] <- c("Analysis_Type","WGCNA")
study[3,] <- c("Cell/Tissue Type","Whole Blood")
study[4,] <- c("Cell Stimulation","None")
study[5,] <- c("Common_Name","Lilly Illuminate-1")
study[6,] <- c("Disease Comparison","SLE vs Healthy Controls")
study[7,] <- c("Cohorts","SLE;CTL")
study[8,] <- c("SLE Females","816")
study[9,] <- c("SLE Males","0")
study[10,] <- c("SLE Unknown Sex","0")
study[11,] <- c("Control Females","10")
study[12,] <- c("Control Males","0")
study[13,] <- c("Control Unknown Sex","0")
study[14,] <- c("SLEDAI Range","4 to 27")
study[15,] <- c("Race(s)","Multiple")
study[16,] <- c("Clinical Variables","Cohort, SLEDAI, Age, C3, C4, dsDNA, Race, Region")
study[17,] <- c("Microarray Type","Affymetrix Human Transcriptome Array 2.0 (HTA2.0)")
study[18,] <- c("Chip Definition Files used","BA")
study[19,] <- c("Input ESET File","eset_illum1_merged_all_samples_entrez_rma_ampel.Rdata")
study[20,] <- c("Outlier Accessions","See patient data Excel file (GSE88884_Patient-Data_Trimmed_NSG_2017-04-07.xlsx)")
study[21,] <- c("Starting Probes","32500")
study[22,] <- c("Probe Intensity Threshold","50%, 5.66 intensity")
study[23,] <- c("Analyzed Probes","16327")
study[24,] <- c("Network Type","Signed")
study[25,] <- c("Correlation Type","Pearson")
study[26,] <- c("Soft Threshold Power","10")
study[27,] <- c("Block Size","10000")
study[28,] <- c("Block Count","2")
study[29,] <- c("Deep Split","4")
study[30,] <- c("Analysis Performed by","NSG")
study[31,] <- c("Analysis Conducted on","28Aug2018")
study[32,] <- c("Source Publication PMID(s)","")
study[33,] <- c("Notes","Female SLE versus Female Controls as determined by key gene expression from Illuminate-1 batch")

ws <- createSheet(wb=wb, sheetName="WGCNA Summary")
addDataFrame(x=study, sheet=ws, row.names=FALSE,col.names=FALSE)
######################################################################
# NOW ADD Eigengene values from the desired deepsplit of choice
load(paste0(experimentDir,"/recut.DS-DS1-DS4.modules.and.genes.RData"))

me.table <- t(MEs.DS4)

ws <- createSheet(wb=wb, sheetName="Module Eigengenes")
addDataFrame(x=me.table, sheet=ws)
######################################################################
# NOW ADD Clinical values from the desired deepsplit of choice
load(paste0(analysisDir,"part1-final-objects.RData"))

ws <- createSheet(wb=wb, sheetName="Clinical Traits")
addDataFrame(x=datTraits, sheet=ws)
######################################################################
# ADD A TABLE OF CORRELATIONS FOR MODULE EIGENGENES AND CLINICAL TRAITS

color_index <- levels(factor(geneInfo$moduleColor))
length(color_index)

library(dplyr)

geneInfo.list <- split(geneInfo, f = geneInfo$moduleColor) # Convert to a list object with elements based on module colors
data.list <- lapply(geneInfo.list,function(x){ # Remove all within module duplicates by taking the ones with highest kWithin
  as.data.frame(x) %>%
    group_by(ENTREZ) %>%
    top_n(1,kWithin)})

library (plyr)

df <- ldply (data.list, data.frame) # Turn list back into a dataframe
colnames(df)[4] <-"probe"
df <- df[,-1]
colnames(geneInfo)[3] <- "probe"
rownames(geneInfo) <- geneInfo$probe
rownames(df) <- df$probe #Now it looks just like geneInfo
df <- df[!is.na(df$ENTREZ),] # Remove all unannotated probes
df<-df[!is.na(df$SYMBOL),]

ws <- createSheet(wb=wb, sheetName="Correlations")
moduleCor <- as.data.frame(moduleCor)
moduleCor <- moduleCor[order(rownames(moduleCor)),]
df$moduleColor <- as.character(df$moduleColor)
mod.counts <- as.data.frame(table(df$moduleColor))
mod.counts <- mod.counts[order(mod.counts$Var1),]
rownames(mod.counts) <- mod.counts$Var1
moduleCor <- moduleCor[which(rownames(moduleCor)!="grey"),]
identical(rownames(moduleCor),rownames(mod.counts))
moduleCor$moduleCount <- mod.counts$Freq

addDataFrame(x=moduleCor, sheet=ws, row.names=TRUE,col.names=TRUE)

######################################################################
# NOW ADD INDIVIDUAL MODULE SHEETS IN WORKBOOK

addDataFrame(x=moduleCor, sheet=ws, row.names=TRUE,col.names=TRUE)
for (i in 1:length(color_index) ) {
  single.mods <- df[ df$moduleColor==color_index[i], ]
  single.mods <- single.mods[ !is.na(single.mods$moduleColor), ]
  single.mods <- single.mods[ order(single.mods$SLEDAI.p), ] ######################MODIFY the primary clinical trait
  nrow(single.mods)
  ws <- createSheet(wb=wb, sheetName=paste(toupper(color_index[i])))
  addDataFrame(x=single.mods, sheet=ws, row.names=FALSE)
}

######################################################################
# SAVE THE WORKBOOK
setwd(experimentDir)
saveWorkbook(wb, paste0(title.stem,".xlsx"))
write.table(df, paste0(title.stem,".txt"))


##################################################################
WGCNA SHARED R SCRIPTS

###################################################################################################
# WGCNA PLOT RECUT DEEPSPLIT MODULES ALONG WITH CLINICAL TRAIT CORRELATIONS
###################################################################################################

# Correlate probe intensities / normalized gene counts to binary or continuous clinical traits
# and convert correlation values to a blue-white-red color palette for a color bar

#colnames(datTraits) # Display clinical traits that will be plotted as correlation bars 

traitBar <- colnames(datExpr)

for(i in 1:ncol(datTraits)) {
  trait <- as.data.frame(datTraits[,i])
  names(trait) <- colnames(datTraits)[i]
  trait.corr <- as.numeric(cor(datExpr, trait, use = "p")) # Here is the correlation work
  trait.corr.color <- data.frame(numbers2colors(trait.corr, signed = T))
  names(trait.corr.color) <- colnames(datTraits)[i]
  traitBar <- cbind(traitBar,trait.corr.color)
}

print(paste("Genes correlated to clinical traits have been encoded as color bars."))
rm(trait,trait.corr,trait.corr.color,i)

traitBar.recut <- data.frame(cbind(moduleColors.DS1,moduleColors.DS2,
                                   moduleColors.DS3,moduleColors.DS4))
names(traitBar.recut) <- c("DS1","DS2","DS3","DS4")
traitBar.recut <- cbind( traitBar.recut, traitBar[,2:ncol(traitBar)])


###################################################################################################
# Loop through the blocks generated by TOM network encoding of the adjacency network and
# generate a dendrogram with module colors and clinical traits bars
for (i in 1:length(net$TOMFiles)) {
  blockNumber = i
  title <- paste(experimentName.recut," DS 1-4 Block ",i,"/",length(net$TOMFiles),sep="")
  
  # Combine the gene correlation colors into a single frame
  datColors = traitBar.recut[net$blockGenes[[blockNumber]],]
  
  # Plot the dendrogram and the module colors underneath
  png(paste(experimentName.recut," block ",i," of ",length(net$TOMFiles),".png",sep=""),
      pngWidth, pngHeight, pointsize=20)
  
  datColors = traitBar.recut[net$blockGenes[[blockNumber]],]
  
  plotDendroAndColors(net$dendrograms[[blockNumber]], colors = datColors,
                      groupLabels = colnames(traitBar.recut),
                      dendroLabels = FALSE, hang = 0.03,
                      addGuide = TRUE, guideHang = 0.05,
                      marAll = marAll, saveMar = TRUE, # bottom, left, top and right 
                      main=title)
  print(paste("Block",i,"dendrogram with trait bars figure saved"))
  dev.off()
}
setwd(projectDir)
#rm(traitBar, i)

## Fix up maximal allowed permissions in the file tree
# Sys.chmod(list.dirs("."), "777")
# f <- list.files(".", all.files = TRUE, full.names = TRUE, recursive = TRUE)
# Sys.chmod(f, (file.info(f)$mode | "664"))
# rm(f)

##############################################################################################
WGCNA SET STORAGE DIRECTORY
# Set directory path to save these plots in, formatted as 
# /mnt/disks/rserver/projects/projectName/wgcna/DSpower.pamBool.minMod-Num/plots/blocks/
# The function will traverse into the final directory, creating them along the way as needed

setStorageDir <- function (storageDir) {
  # Create directories if they don't exist. This is a primitive script that works by moving into or creating 
  # directories using dir.create. Note I'm not able to pass a directory path to this function, such as
  # dir.create("plots/blocks/DS10.pamTRUE.minMod-100"), so have to generate these one at time.
  newDir <- vector('character')
  for (i in 1:length(storageDir)) {
    newDir <- storageDir[i]
    if (!dir.exists(newDir)) {
      dir.create(newDir, showWarnings=TRUE, mode="0777")
      print(paste("Sub-directory",newDir,"created."))
    } else {
      print(paste("Sub-directory",paste(getwd(),"/",newDir," exists.",sep="") ))
    }
    setwd( paste(getwd(),newDir,sep="/") )
  }
  print(paste("Current storage directory: ",getwd(),sep=""))
  rm(storageDir,newDir,i)
}

############################################################################################
###################################################################################################
# PLOT A HEATMAP OF TRANSCRIPT EXPRESSION PER MODULE
# of correlation of most interesting clinical trait to module eigengene. Currently I'm
# not adding a color bar, which we once used for manually-set kmeans number of clusters
###################################################################################################

library(Biobase)

heatmap.module <- function( moduleNames, moduleCor, geneInfo, eset, saveFile, deepSplit,
                            cexRow, cexCol, margins, pngWidth, pngHeight) {
  
  for ( i in 1:length(moduleNames) ) {
    geneInfo.subset <- geneInfo[ geneInfo$moduleColor==moduleNames[i], ]
    index <- which(rownames(eset)%in%geneInfo.subset$probe)
    m.use <- exprs(eset)[index,]
    distance <- dist(m.use,method="euclidean")
    hc <- hclust(distance)
    
    hc.cut = cutree(hc,k=6) # k = number of clusters. 
    #table(hc.cut) # show cluster cut results
    hc.cut.levels = levels(as.factor(hc.cut)) # factor cluster level numbers for coloring
    #hc.cut.levels # review cluster level factors
    
    colors = rainbow(length(hc.cut.levels));
    cluster.patient.colors = c()
    
    #for (i in 1:length(hc.cut.levels)) {
      #cluster.patient.colors[names(which(hc.cut==hc.cut.levels[i]))] = colors[i] 
      #}
    #cluster.patient.colors
    
    dist.euclidean = dist.euclidean <- function(x, ...)
      dist(x,method="euclidean")
    label.size = 1
    
    samples = colnames(m.use)
    
    hmcol <- colorRampPalette(c("blue","white","red"))(256) #FOR THE COLORBLIND AMONG US
    
    ## MAKE SURE TO MAKE THE R STUDIO PLOTS FRAME AS LARGE AS POSSIBLE BEFORE RUNNING (SHRINK THE OTHER FRAMES)
    # Plot the result
    if (!.Device=="null device") { dev.off() } # Catches the "cannot shut down device 1 (the null device)" error
    
    # Grab the module's correlation significance to sigTrait
    module.sig <- round(moduleCor[moduleNames[i],paste(sigTrait,".p",sep="")],3)
    
    if (saveFile==TRUE) {
      # Generate a PNG of the figure
      png( paste(deepSplit,".",moduleNames[i],".",sigTrait,".p",module.sig,".png",sep=""), pngWidth, pngHeight, pointsize=20)
    }
    
    # genes.heat = heatmap.2(data.matrix(m.use), col=hmcol, scale="row", distfun = dist2, key=TRUE,
    #                        symkey=TRUE, density.info="none", trace="none",
    #                        ColSideColors=cluster.patient.colors[samples],cexCol=1.2,
    #                        cexRow = 0.1,margins=c(12,9),
    #                        main=paste(moduleNames[i]," module: ",projectName,".", deepSplit, " gene expression", sep=""))
    
    # Plot heatmap without side color bar
    genes.heat = heatmap.2(data.matrix(m.use), col=hmcol, scale="row", distfun = dist.euclidean, key=TRUE,
                           symkey=TRUE, density.info="none", trace="none",
                           cexRow = cexRow, cexCol=cexCol, margins=margins,
                           main=paste(moduleNames[i]," ", sigTrait," p",module.sig,
                                      " module ",deepSplit,".",projectName, sep=""))
    #return(m.use)
    
    if (saveFile==TRUE) {
      # Complete PNG save
      dev.off()
      print(paste(deepSplit,moduleNames[i], "heatmap generated",sep=" "))
    }
  }
  #print(paste(deepSplit," heatmaps of ",length(moduleNames)," modules generated.",sep=""))
}

#######################################################################################################
WGCNA CORRELATE MODULES TO TRAITS

traitCorr <- function( MEs, datTraits, moduleColors, sigTrait, p_val,
                       title.heatmap,
                       plotMar, cex.lab.x, cex.lab.y, text.size,
                       xLabels, xColorOffset, xLabelsAngle,
                       maxRows,
                       chart.filename, pngWidth, pngHeight, saveFile, deepSplit.cor) {
  # Define numbers of genes and samples
  nGenes = ncol(datExpr);
  nSamples = nrow(datExpr);
  datTraits <- data.matrix(datTraits)
  
  # Use the R cor() function to correlate the module eigengenes to clinical traits along with an associated p-value
  moduleTraitCor = cor(MEs, datTraits, use = "p");
  moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);
  
  moduleTraitCor <- data.frame(moduleTraitCor)
  moduleTraitPvalue <- data.frame(moduleTraitPvalue)
  
  corNames <- vector('character')
  moduleCor <- vector('numeric')
  
  for (i in 1:ncol(moduleTraitCor)) {
    moduleCor <- cbind( moduleCor, moduleTraitCor[,i],moduleTraitPvalue[,i] )
    corNames <- c( corNames, colnames(moduleTraitCor)[i], paste(colnames(moduleTraitPvalue)[i],".p",sep="") )
  }
  
  colnames(moduleCor) <- corNames
  rownames(moduleCor) <- rownames(moduleTraitCor)
  
  # Order the correlations frames by decreasing significance of correlation to the primary clinical variable of choice,
  # usually SLEDAI or cohort
  moduleCor <- moduleCor[ order(moduleCor[, paste(sigTrait,".p",sep="")] ), ]
  
  index <- match( rownames(moduleCor),rownames(moduleTraitCor) )
  moduleTraitCor <- moduleTraitCor[index,,drop=F]
  
  index <- match( rownames(moduleCor),rownames(moduleTraitPvalue) )
  moduleTraitPvalue <- moduleTraitPvalue[ index,,drop=F]
  
  moduleTraitCor <- data.matrix(moduleTraitCor)
  moduleTraitPvalue <- data.matrix(moduleTraitPvalue)
  
  # Sum the number of probes in each module and bind this to our correlation matrix
  #table(moduleColors)
  colorFreq <- data.frame(table(factor(moduleColors)))
  rownames(colorFreq) <- colorFreq[,1]
  index <- match(rownames(moduleCor),rownames(colorFreq) )
  colorFreq <- colorFreq[index,]
  moduleCor <- cbind( moduleCor, "moduleCount"=colorFreq$Freq )
  #colnames(moduleCor)
  moduleCor <- moduleCor[ ,c(ncol(moduleCor),1:ncol(moduleCor)-1) ]
  
  index <- which(colnames(moduleCor)==paste(sigTrait,".p",sep=""))
  sig.count <- length(which(moduleCor[,index] < p_val))
  
  # Prepare plot window
  sizeGrWindow(10,6)
  # Color code each association by the correlation value:
  # Display the correlations and their p-values
  textMatrix = paste(signif(moduleTraitCor, 2), " (",
                     signif(moduleTraitPvalue, 2), ")", sep = "");
  
  if (saveFile==TRUE) {
    # Generate a PNG of the figure
    png(chart.filename,pngWidth, pngHeight, pointsize=20)
  }
  # dev.off()
  dim(textMatrix) = dim(moduleTraitCor)
  par(mar = plotMar); # bottom, left, top, right margins
  
  # Display the correlation values using a heatmap plot
  yLabels <- paste(rownames(moduleTraitCor)," (",moduleCor[,1], ")", sep="")
  ySymbols <- rownames(moduleTraitCor)
  
  labeledHeatmap(Matrix = moduleTraitCor,
                 xLabels = xLabels,
                 xLabelsAngle = xLabelsAngle,
                 xColorOffset = xColorOffset,
                 yLabels = yLabels,
                 ySymbols = ySymbols,
                 colorLabels = FALSE,
                 colors = blueWhiteRed(50),
                 textMatrix = textMatrix,
                 setStdMargins = FALSE,
                 cex.text = text.size,
                 zlim = c(-1,1),
                 cex.lab.x = cex.lab.x,
                 cex.lab.y = cex.lab.y,
                 main = paste(title.heatmap,
                              ", ", sig.count, " module(s) sig by ",sigTrait," p_val < ",p_val, " (", 
                              round(sig.count/nrow(moduleCor)*100,0),"%)", sep=""))
  if (saveFile==TRUE) {
    # Complete PNG save
    dev.off()
    print(paste( deepSplit.cor,"dendrogram with trait bars saved."))
  }
  
  moduleTraitCor <- data.matrix(moduleTraitCor)
  moduleTraitPvalue <- data.matrix(moduleTraitPvalue)
  
  moduleCor.list <- list(moduleTraitCor,moduleTraitPvalue, moduleCor )
  
  return(moduleCor.list)
  
  plotMar <- c(8, 6, 4, 2) # bottom, left, top, and right figure margins
  
  #rm(plotMar, text.size, p_val, x.size, y.size, maxRows, title.heatmap, chart.filename)
  
}

# Correlate modules created during TOM generation
moduleCor <- traitCorr( MEs=MEs, datTraits=datTraits, moduleColors=moduleColors, sigTrait=sigTrait, p_val=p_val,
                            title.heatmap=paste("DS Original",experimentName.recut,sep=" "),
                            plotMar, cex.lab.x, cex.lab.y, text.size,
                            xLabels, xColorOffset, xLabelsAngle,
                            maxRows=maxRows,chart.filename=paste("DS-orig.corrMap.",experimentName.recut,".png",sep=""),
                            pngWidth, pngHeight,saveFile, deepSplit.cor="DS-orig" )
moduleCor.val <- moduleCor[[1]]
moduleCor.pval <- moduleCor[[2]]
moduleCor <- moduleCor[[3]]

# Correlate deep split 1 modules
moduleCor.DS1 <- traitCorr( MEs=MEs.DS1, datTraits=datTraits, moduleColors=moduleColors.DS1, sigTrait=sigTrait, p_val=p_val,
                            title.heatmap=paste("DS1",experimentName.recut,sep=" "),
                            plotMar, cex.lab.x, cex.lab.y, text.size,
                            xLabels, xColorOffset, xLabelsAngle,
                            maxRows=maxRows,chart.filename=paste("DS1.corrMap.",experimentName.recut,".png",sep=""),
                            pngWidth, pngHeight,saveFile, deepSplit.cor="DS1" )
moduleCor.DS1.val <- moduleCor.DS1[[1]]
moduleCor.DS1.pval <- moduleCor.DS1[[2]]
moduleCor.DS1 <- moduleCor.DS1[[3]]

# Correlate deep split 2 modules
moduleCor.DS2 <- traitCorr( MEs=MEs.DS2, datTraits=datTraits, moduleColors=moduleColors.DS2, sigTrait=sigTrait, p_val=p_val,
                            title.heatmap=paste("DS2",experimentName.recut,sep=" "),
                            plotMar=plotMar, cex.lab.x=cex.lab.x, cex.lab.y = cex.lab.y, text.size=text.size,
                            xLabels=xLabels, xColorOffset=xColorOffset, xLabelsAngle=xLabelsAngle,
                            maxRows=maxRows,
                            chart.filename=paste("DS2.corrMap.",experimentName.recut,".png",sep=""), pngWidth=pngWidth, pngHeight=pngHeight,
                            saveFile=saveFile, deepSplit.cor="DS2"  )
moduleCor.DS2.val <- moduleCor.DS2[[1]]
moduleCor.DS2.pval <- moduleCor.DS2[[2]]
moduleCor.DS2 <- moduleCor.DS2[[3]]

# Correlate deep split 3 modules
moduleCor.DS3 <- traitCorr( MEs=MEs.DS3, datTraits=datTraits, moduleColors=moduleColors.DS3, sigTrait=sigTrait, p_val=p_val,
                            title.heatmap=paste("DS3",experimentName.recut,sep=" "),
                            plotMar=plotMar, cex.lab.x=cex.lab.x, cex.lab.y = cex.lab.y, text.size=text.size,
                            xLabels=xLabels, xColorOffset=xColorOffset, xLabelsAngle=xLabelsAngle,
                            maxRows=maxRows,
                            chart.filename=paste("DS3.corrMap.",experimentName.recut,".png",sep=""), pngWidth=pngWidth, pngHeight=pngHeight,
                            saveFile=saveFile, deepSplit.cor="DS3"  )
moduleCor.DS3.val <- moduleCor.DS3[[1]]
moduleCor.DS3.pval <- moduleCor.DS3[[2]]
moduleCor.DS3 <- moduleCor.DS3[[3]]

# Correlate deep split 4 modules
moduleCor.DS4 <- traitCorr( MEs=MEs.DS4, datTraits=datTraits, moduleColors=moduleColors.DS4, sigTrait=sigTrait, p_val=p_val,
                            title.heatmap=paste("DS4",experimentName.recut,sep=" "),
                            plotMar=plotMar, cex.lab.x=cex.lab.x, cex.lab.y = cex.lab.y, text.size=text.size,
                            xLabels=xLabels, xColorOffset=xColorOffset, xLabelsAngle=xLabelsAngle,
                            maxRows=maxRows,
                            chart.filename=paste("DS4.corrMap.",experimentName.recut,".png",sep=""), pngWidth=pngWidth, pngHeight=pngHeight,
                            saveFile=saveFile, deepSplit.cor="DS4"  )
moduleCor.DS4.val <- moduleCor.DS4[[1]]
moduleCor.DS4.pval <- moduleCor.DS4[[2]]
moduleCor.DS4 <- moduleCor.DS4[[3]]

# Set the proper save directory first. script needs fixing. 
save(moduleCor.DS1, moduleCor.DS1.pval, moduleCor.DS1.val,
     moduleCor.DS2, moduleCor.DS2.pval, moduleCor.DS2.val,
     moduleCor.DS3, moduleCor.DS3.pval, moduleCor.DS3.val,
     moduleCor.DS4, moduleCor.DS4.pval, moduleCor.DS4.val,
     file="recut.DS1-DS4.module.correlations.RData")

############################################################################################
WGCNA kIMs per block

########################################################################################
## CALCULATE KIMs FOR ALL DEEP SPLITS
# Calculate kIM, or intramodular connectivity, i.e. connectivity of nodes to other nodes
# within the same module. This requires the adjaceny matrix be recreated, so make sure
# to use the same soft power and unsigned or signed designation originally used to create the network

setwd(dir)
kIM = matrix(nrow=ncol(datExpr),ncol=4)
for (i in 1:length(net$TOMFiles)) {
  datExpr.block <- datExpr[ , net$blockGenes[[i]] ]
  adjacency.block = adjacency(datExpr.block, power= as.integer(TOM.settings["power"]), type="signed")
  print( paste("Calculating adjacencies for genes within block ",i," of ",length(net$TOMFiles),sep=""))
  moduleColors.block <- moduleColors[ net$blockGenes[[i]] ]
  print( paste("Calculating scaled kIM intramodular connectivity for adjacencies within block ",i," of ",length(net$TOMFiles),sep=""))
  kIM.block = intramodularConnectivity(adjacency.block, moduleColors.block, scaleByMax = TRUE) # Scale to 1
  kIM[1:nrow(kIM.block),1:4] <- kIM.block # FIX
  
}

############################################################################################
WGCNA KME PRIMARY

###############################################################################################
# DS1 KMEs
KME.corr.DS1 <- KME.MEs.DS1[,seq(1, ncol(KME.MEs.DS1), 2)]
colnames(KME.corr.DS1) <- substr( colnames(KME.corr.DS1), 5, nchar(colnames(KME.corr.DS1)))
KME.corr.p.DS1 <- KME.MEs.DS1[,seq(2, ncol(KME.MEs.DS1), 2)]
colnames(KME.corr.p.DS1) <- colnames(KME.corr.DS1)

KME.primary.DS1 <- matrix(nrow=nrow(KME.MEs.DS1),ncol=3)

print(paste("Retrieving primary KME values for DS1..."))
for (i in 1:nrow(geneList.DS1)) {
  for (j in 1:ncol(KME.corr.DS1)) {
    if (geneList.DS1[i,"moduleColors.DS1"]==colnames(KME.corr.DS1)[j] ) { 
      KME.primary.DS1[i,1] <- geneList.DS1[i,"moduleColors.DS1"]
      KME.primary.DS1[i,2] <- KME.corr.DS1[i,j]
      KME.primary.DS1[i,3] <- KME.corr.p.DS1[i,j]
    }
  }
}
colnames(KME.primary.DS1) <- c("KME.primary","KME.color","KME.p.color")
rm(KME.corr.DS1,KME.corr.p.DS1)

###############################################################################################
# DS2 KMEs
KME.corr.DS2 <- KME.MEs.DS2[,seq(1, ncol(KME.MEs.DS2), 2)]
colnames(KME.corr.DS2) <- substr( colnames(KME.corr.DS2), 5, nchar(colnames(KME.corr.DS2)))
KME.corr.p.DS2 <- KME.MEs.DS2[,seq(2, ncol(KME.MEs.DS2), 2)]
colnames(KME.corr.p.DS2) <- colnames(KME.corr.DS2)

KME.primary.DS2 <- matrix(nrow=nrow(KME.MEs.DS2),ncol=3)


print(paste("Retrieving primary KME values for DS2..."))
for (i in 1:nrow(geneList.DS2)) {
  for (j in 1:ncol(KME.corr.DS2)) {
    if (geneList.DS2[i,"moduleColors.DS2"]==colnames(KME.corr.DS2)[j] ) { 
      KME.primary.DS2[i,1] <- geneList.DS2[i,"moduleColors.DS2"]
      KME.primary.DS2[i,2] <- KME.corr.DS2[i,j]
      KME.primary.DS2[i,3] <- KME.corr.p.DS2[i,j]
    }
  }
}
colnames(KME.primary.DS2) <- c("KME.primary","KME.color","KME.p.color")
rm(KME.corr.DS2,KME.corr.p.DS2)

###############################################################################################
# DS3 KMEs
KME.corr.DS3 <- KME.MEs.DS3[,seq(1, ncol(KME.MEs.DS3), 2)]
colnames(KME.corr.DS3) <- substr( colnames(KME.corr.DS3), 5, nchar(colnames(KME.corr.DS3)))
KME.corr.p.DS3 <- KME.MEs.DS3[,seq(2, ncol(KME.MEs.DS3), 2)]
colnames(KME.corr.p.DS3) <- colnames(KME.corr.DS3)

KME.primary.DS3 <- matrix(nrow=nrow(KME.MEs.DS3),ncol=3)

# 
print(paste("Retrieving primary KME values for DS3..."))
for (i in 1:nrow(geneList.DS3)) {
  for (j in 1:ncol(KME.corr.DS3)) {
    if (geneList.DS3[i,"moduleColors.DS3"]==colnames(KME.corr.DS3)[j] ) { 
      KME.primary.DS3[i,1] <- geneList.DS3[i,"moduleColors.DS3"]
      KME.primary.DS3[i,2] <- KME.corr.DS3[i,j]
      KME.primary.DS3[i,3] <- KME.corr.p.DS3[i,j]
    }
  }
}
colnames(KME.primary.DS3) <- c("KME.primary","KME.color","KME.p.color")
rm(KME.corr.DS3,KME.corr.p.DS3)

###############################################################################################
# DS4 KMEs
KME.corr.DS4 <- KME.MEs.DS4[,seq(1, ncol(KME.MEs.DS4), 2)]
colnames(KME.corr.DS4) <- substr( colnames(KME.corr.DS4), 5, nchar(colnames(KME.corr.DS4)))
KME.corr.p.DS4 <- KME.MEs.DS4[,seq(2, ncol(KME.MEs.DS4), 2)]
colnames(KME.corr.p.DS4) <- colnames(KME.corr.DS4)

KME.primary.DS4 <- matrix(nrow=nrow(KME.MEs.DS4),ncol=3)

# 
print(paste("Retrieving primary KME values for DS4..."))
for (i in 1:nrow(geneList.DS4)) {
  for (j in 1:ncol(KME.corr.DS4)) {
    if (geneList.DS4[i,"moduleColors.DS4"]==colnames(KME.corr.DS4)[j] ) { 
      KME.primary.DS4[i,1] <- geneList.DS4[i,"moduleColors.DS4"]
      KME.primary.DS4[i,2] <- KME.corr.DS4[i,j]
      KME.primary.DS4[i,3] <- KME.corr.p.DS4[i,j]
    }
  }
}
colnames(KME.primary.DS4) <- c("KME.primary","KME.color","KME.p.color")
rm(KME.corr.DS4,KME.corr.p.DS4)

###############################################################################################

save(KME.primary.DS1,KME.primary.DS2,KME.primary.DS3,KME.primary.DS4,file="gene_KMEs.RData")


##############################################################################################
WGCNA DEEPSPLITS DENDROGRAMS
plotDendro <- function( datExpr, moduleColors, moduleCor, sigTrait, parMar, p_val, cex, deepSplit,
                        chart.filename, pngWidth, pngHeight, saveFile ) {
  
  ######################################################
  # Prepare p-values of correlation between modules eigengenes and clinical traits
  sigTrait.p <-paste(sigTrait,".p",sep="")
  
  # Prepare module eigengene names
  MEList = moduleEigengenes(datExpr, colors = moduleColors)
  MEs = MEList$eigengenes
  
  ######################################################
  # Calculate dissimilarity of module eigengenes
  MEDiss = 1-cor(MEs);
  
  # Cluster module eigengene dissimilarity
  METree = hclust(as.dist(MEDiss), method = "average");
  
  ######################################################
  # Format module names and include members and significance
  METree$labels <- substr( METree$labels, 3, nchar(METree$labels) )
  index <- match(METree$labels, rownames(moduleCor) )
  tree.labels <- data.frame(moduleCor[index,c("moduleCount",sigTrait.p,sigTrait)])
  tree.labels$color <- rownames(moduleCor)
  tree.labels[sigTrait.p] <- round(tree.labels[sigTrait.p],5)
  tree.labels$sign <- "[+]"
  index <- which(tree.labels[sigTrait]<=0)
  tree.labels[index,"sign"] <- "[-]"
  tree.labels$label <- paste("* ",tree.labels$sign," ", rownames(tree.labels)," (",tree.labels$moduleCount," p",tree.labels[,2],")",sep="")
  index <- which(tree.labels[sigTrait.p] > p_val)
  #tree.labels[index,"label"] <- paste(rownames(tree.labels)[index],"(",tree.labels[index,"moduleCount"],")",sep="")
  tree.labels[index,"label"] <- paste( tree.labels[index,"sign"]," ", rownames(tree.labels)[index]," (",tree.labels[index,"moduleCount"],")", sep="" )
  METree$labels <- tree.labels$label
  sigNum <- length(which(tree.labels[,2]<p_val))
  
  ######################################################
  # Plot the result
  if (!.Device=="null device") { dev.off() } # Catches the "cannot shut down device 1 (the null device)" error
  if (saveFile==TRUE) {
    # Generate a PNG of the figure
    png(chart.filename,pngWidth, pngHeight, pointsize=20)
  }
  
  par(oma=c(0,0,0,0),mar=parMar) # bottom, left, top, right margins
  #sizeGrWindow(7, 6)
  dendro.title <- paste(projectName," ",deepSplit," ",experimentName.recut," p_val<",p_val," *",sigNum," sig ",sigTrait,sep="")
  
  # Old vertical style denrogram
  #plot(METree, main = dendro.title, xlab = "", sub = "", cex=cex)
  
  # Plot a nice horizontal plot
  hcd <- as.dendrogram(METree)
  par(oma=c(0,0,0,0),mar=parMar)
  nodePar <- list(lab.cex = cex, pch = c(NA, 21), cex = cex, col = "black")
  plot(hcd,  xlab = "Module Dissimilarity", nodePar = nodePar, horiz = TRUE, main = dendro.title, cex=cex)
  
  # Complete PNG save
  if (!.Device=="null device") { dev.off() }
  
  print(paste(deepSplit," dendrogram of ",ncol(MEs)," modules generated.",sep=""))
  
  ######################################################
  # NEW DENDROGRAM APPROACHES
  # See http://www.sthda.com/english/wiki/beautiful-dendrogram-visualizations-in-r-5-must-known-methods-unsupervised-machine-learning
  
  # Compute distances and hierarchical clustering
  #dd <- dist(MEDiss, method = "euclidean") # note we've scaled this
  # dd <- dist(scale(MEDiss), method = "euclidean") # note we've scaled this
  # hc <- hclust(dd, method = "ward.D2") # consider using other methods such as euclidean, etc.
  # 
  # dev.off()
  # plot(hc)
  # 
  # hc$labels <- tree.labels$label
  # plot(hc)
  # 
  # # Put the labels at the same height: hang = -1
  # plot(hc, hang = -1, cex = 0.6)
  # 
  # # Convert hclust into a dendrogram and plot
  # hcd <- as.dendrogram(hc)
  # 
  # # Default plot
  # dev.off()
  # par(oma=c(0,0,0,0),mar=c(8,0,0,0)) # bottom, left, top, right margins
  # plot(hcd, type = "rectangle", ylab = "Height")
  # 
  # # Triangle plot
  # dev.off()
  # par(oma=c(0,0,0,0),mar=c(8,0,0,0))
  # plot(hcd, type = "triangle", ylab = "Height")
  # 
  # # Zoom in to the first dendrogram
  # plot(hcd, xlim = c(1, 20), ylim = c(1,8))
  # 
  # # Define nodePar
  # nodePar <- list(lab.cex = 0.8, pch = c(NA, 19), cex = 0.9, col = "black")
  # # Customized plot; remove labels
  # plot(hcd, ylab = "Height", nodePar = nodePar, leaflab = "none")
  # 
  # # Horizontal plot
  # library(dendextend)
  # hcd <- as.dendrogram(hc)
  # 
  # dev.off()
  # par(oma=c(0,0,0,0),mar=c(6,4,2,14))
  # nodePar <- list(lab.cex = 0.85, pch = c(NA, 21), cex = 0.5, col = "black")
  # 
  # plot(hcd,  xlab = "Module Dissimilarity", nodePar = nodePar, horiz = TRUE, main = dendro.title)
  # 
  # # labels_colors(hcd) <- tree.labels$color # Not working yet, need to match colors before reorganizing
  # 
  # # The package ape (Analyses of Phylogenetics and Evolution) can be used to produce a more sophisticated dendrogram.
  # library("ape")
  # # Default plot
  # dev.off()
  # par(oma=c(0,0,0,0),mar=c(4,2,2,2))
  # plot(as.phylo(hc), cex = 0.8, label.offset = 0.5)
  # plot(as.phylo(hc), cex = 0.8, label.offset = 0.5, type="cladogram")
  # 
  # # Unrooted phylogenetic tree
  # plot(as.phylo(hc), type = "unrooted", cex = 0.6, no.margin = TRUE)
  # 
  # # Fan
  # plot(as.phylo(hc), type = "fan")
  # 
  # # GGplot2 dendrograms
  # library("ggplot2", "ggdendro")
  # # Visualization using the default theme named theme_dendro()
  # ggdendrogram(hc)
  # # Rotate the plot and remove default theme
  # ggdendrogram(hc, rotate = TRUE, theme_dendro = FALSE)
  # # Build dendrogram object from hclust results
  # dend <- as.dendrogram(hc)
  # # Extract the data (for rectangular lines)
  # # Type can be "rectangle" or "triangle"
  # dend_data <- dendro_data(dend, type = "rectangle")
  # # What contains dend_data
  # names(dend_data)
  # # Extract data for line segments
  # head(dend_data$segments)
  # # Extract data for labels
  # head(dend_data$labels)
  # # Plot line segments and add labels
  # p <- ggplot(dend_data$segments) + 
  #   geom_segment(aes(x = x, y = y, xend = xend, yend = yend))+
  #   geom_text(data = dend_data$labels, aes(x, y, label = label),
  #             hjust = 1, angle = 90, size = 3)+
  #   ylim(-3, 15)
  # print(p)
  
  # Phylotools package
  # See http://www.phytools.org/ and https://cran.r-project.org/web/packages/phytools/
  # Unsure of relevancy to distance dendrograms
  
}

#############################################################################################
WGCNA PLOT MODULE EIGENGENE VS SIG TRAIT CORRELATION
library(Hmisc)

plotModulesME<-function(moduleName,moduleColors,KME.primary) {
  module<-moduleName
  col<-module
  moduleGenesIndex<-which(moduleColors==module)
  moduleGenes.kME<-KME.primary[moduleGenesIndex,which(colnames(KME.primary)==module)]
  moduleGenes.sigTrait<-gene.corr.r_p[moduleGenesIndex,sigTrait]
  if (module=="ivory") {col="gray"} ###If you have very light colors, substitute them for a darker color to be seen on graphs
  if (module=="floralwhite") {col="gray"}
  if (module=="lightcyan") {col="cyan"}
  if (module=="lightcyan1") {col="cyan"}
  if (module=="honeydew1") {col="gray"}
  if (module=="white") {col="gray"}
  if (module=="lightyellow") {col="yellow"}
  if (module=="aliceblue") {col="blue"}
  if (module=="antiquewhite1") {col="gray"}
  if (module=="lavenderblush") {col="gray"}
  if (module=="lavenderblush1") {col="gray"}
  if (module=="lavenderblush2") {col="gray"}
  if (module=="lavenderblush3") {col="gray"}
  if (module=="mistyrose") {col="gray"}
  if (module=="moccasin") {col="gray"}
  if (module=="navajowhite") {col="gray"}
  if (module=="navajowhite1") {col="gray"}
  if (module=="navajowhite2") {col="gray"}
  if (module=="paleturquoise") {col="turquoise"}
  if (module=="thistle1") {col="gray"}
  if (module=="thistle2") {col="gray"}
  if (module=="whitesmoke") {col="gray"}
  
  verboseScatterplot(moduleGenes.kME,moduleGenes.sigTrait,
       xlab = paste0("Mean probe Pearson correlation to ", module, " module\neigengene (first principal component of module)"),
       ylab = paste0("Probe correlation to ",sigTrait),
       main = paste(capitalize(module), " kME vs. gene significance\n", sep=""),
       cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = col, ylim=c(-1,1), xlim=c(-1,1))
  abline(h=0,lty=2,lwd=1,col="red")
  abline(v=0,lty=2,lwd=1,col="red")
}

kMEvsCorrPlot<-function(moduleCor,moduleColors,KME.primary,DS.tag) {
  
  colors.mods <- rownames(moduleCor)
  colors.mods <- colors.mods[colors.mods!="grey"]
  colors.mods <- sort(colors.mods)
  
  nmods<-length(colors.mods)
  nplots<-floor(nmods/6)
  pdf(paste("kMEvsCorrPlots",DS.tag,".pdf"),width=16,height=9)
  for (i in 1:nplots) {
    par(mfrow=c(2,3))
    startIndex<-6*i-5
    endIndex<-6*i
    for(j in startIndex:endIndex) {
      myModName<-colors.mods[j]
      plotModulesME(moduleName=myModName,moduleColors=moduleColors,KME.primary=KME.primary)
    }
  }
  nExtraPlots<-nmods-nplots*6
  if (nExtraPlots>0) {
    if (nExtraPlots==5) {
      par(mfrow=c(2,3))
    } else {
      par(mfrow=c(2,2))
    }
    startIndex<-nplots*6+1
    endIndex<-nmods
    for (k in startIndex:endIndex) {
      myModName<-colors.mods[k]
      plotModulesME(moduleName=myModName,moduleColors=moduleColors,KME.primary=KME.primary)
    }
  }
  dev.off()
  pdf(paste0("kMEgrey",DS.tag,".pdf"),width=16,height=9)
  par(mfrow=c(1,1))
  myModName <- "grey"
  plotModulesME(moduleName=myModName,moduleColors=moduleColors,KME.primary=KME.primary)
  dev.off()
  graphics.off()
}

###################################################################################################
WGCNA ANNOTATE ENSEMBL GENES
###################################################################################################
# ANNOTATE ENSEMBL GENES
###################################################################################################

biomart.annotate <- function (IDs, filter, attributes) {
  
  # Query biomaRt for annotation information keyed to IDs (usually Ensembl)
  library('biomaRt')
  mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
  
  # View available marts, inputs and outputs
  #datasets <- listDatasets(mart)
  #filters <- listFilters(mart)
  #attribs <- listAttributes(mart)
  #ensemblID <- colnames(datExpr)
  
  # Give this a minute or so. Also, remember not all Ensembl areas are actually transcribed and thus assigned an Entrez ID
  print(paste("Querying biomaRt...",sep=""))
  biomart.output <- getBM(filters=filter, attributes=attributes, values=IDs, mart=mart)
  print(paste("Query complete.",sep=""))
  
  return(biomart.output)
}
####################################################################################################
WGCNA BLOCKS RECUT VERSION 1
###################################################################################################
# RECUT WGCNA TOM BLOCKS INTO NEW MODULES USING FOUR LEVELS OF DEEP SPLIT
###################################################################################################

# Load and execute the function to set the storage directory for the recut blocks
# The function will leave you in the endpoint directory, so return to the main project directory as needed
setStorageDir <- dget("/mnt/disks/rserver/projects/shared_scripts/wgcna/set.storage.directory.R")
outputStorage <- setStorageDir( paste("recut",experimentName.recut,sep=".") ) # Each element of this vector is a subsequent subdirectory
# setwd(projectDir) # Don't call until end of job!!

####################################################################################################################
###RECUT THE NETWORK BLOCKS WITH A DETECTION CUT HEIGHT OF 1, MERGE HEIGHT OF 0.2, DEEP SPLIT OF 1
TOMFiles <- paste(projectDir,"/TOM/",net$TOMFiles,sep="")

print(paste("Recutting blocks using deepsplit ","1",sep=""))
recutBlocks = recutBlockwiseTrees(datExpr, net$goodSamples, net$goodGenes, net$blocks, TOMFiles, net$dendrograms,
                                  networkType="signed",
                                  corType = "pearson", pamStage=TRUE, pamRespectsDendro = TRUE,
                                  detectCutHeight = 1,
                                  deepSplit = 1, minModuleSize=100, mergeCutHeight=0.2, numericLabels = TRUE)

print(paste("Deepsplit ",i," cuts complete.",sep=""))

MEs.DS1 <- recutBlocks$MEs
rownames(MEs.DS1) <- rownames(datExpr)

MEs.DS1.labels <- as.numeric(substr( colnames(MEs.DS1), 3, nchar(colnames(MEs.DS1)) ))
colnames(MEs.DS1) <- labels2colors(MEs.DS1.labels)

moduleLabels.DS1 <- recutBlocks$colors
moduleColors.DS1 <- labels2colors(moduleLabels.DS1)

colorLevels.DS1 <- data.frame(table(moduleColors.DS1))
colorLevels.DS1 <- colorLevels.DS1[order(colorLevels.DS1[,2], decreasing=TRUE), ]

####################################################################################################################
## RECUT THE NETWORK BLOCKS WITH A DETECTION CUT HEIGHT OF 1, MERGE HEIGHT OF 0.2, DEEP SPLIT OF 2

print(paste("Recutting blocks using deepsplit ","2",sep=""))
recutBlocks = recutBlockwiseTrees(datExpr, net$goodSamples, net$goodGenes, net$blocks, TOMFiles, net$dendrograms,
                                  networkType="signed",
                                  corType="pearson", pamStage=TRUE, pamRespectsDendro = TRUE,
                                  detectCutHeight = 1,
                                  deepSplit=2, minModuleSize=100, mergeCutHeight=0.2, numericLabels = TRUE)

print(paste("Deepsplit ",i," cuts complete.",sep=""))

MEs.DS2 <- recutBlocks$MEs
rownames(MEs.DS2) <- rownames(datExpr)

MEs.DS2.labels <- as.numeric(substr( colnames(MEs.DS2), 3, nchar(colnames(MEs.DS2)) ))
colnames(MEs.DS2) <- labels2colors(MEs.DS2.labels)

moduleLabels.DS2 <- recutBlocks$colors
moduleColors.DS2 = labels2colors(moduleLabels.DS2)

moduleColors.DS2 <- matchLabels(moduleColors.DS2, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE,
                                ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey",
                                extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() )

# Regenerate the MEs now that we have new color names
MEs.DS2 <- moduleEigengenes( datExpr, moduleColors.DS2, verbose=2)
MEs.DS2 <- MEs.DS2$eigengenes
rownames(MEs.DS2) <- rownames(datExpr)
colnames(MEs.DS2) <- substr( colnames(MEs.DS2), 3, nchar(colnames(MEs.DS2)) )

colorLevels.DS2 <- data.frame(table(moduleColors.DS2))
colorLevels.DS2 <- colorLevels.DS2[order(colorLevels.DS2[,2], decreasing=TRUE), ]


####################################################################################################################
## RECUT THE NETWORK BLOCKS WITH A DETECTION CUT HEIGHT OF 1, MERGE HEIGHT OF 0.2, DEEP SPLIT OF 3

print(paste("Recutting blocks using deepsplit ","3",sep=""))
recutBlocks = recutBlockwiseTrees(datExpr, net$goodSamples, net$goodGenes, net$blocks, TOMFiles, net$dendrograms,
                                  networkType="signed",
                                  corType="pearson", pamStage=TRUE, pamRespectsDendro = TRUE,
                                  detectCutHeight = 1,
                                  deepSplit=3, minModuleSize=100, mergeCutHeight=0.2, numericLabels = TRUE)

print(paste("Deepsplit ",i," cuts complete.",sep=""))

MEs.DS3 <- recutBlocks$MEs
rownames(MEs.DS3) <- rownames(datExpr)

MEs.DS3.labels <- as.numeric(substr( colnames(MEs.DS3), 3, nchar(colnames(MEs.DS3)) ))
colnames(MEs.DS3) <- labels2colors(MEs.DS3.labels)

moduleLabels.DS3 <- recutBlocks$colors
moduleColors.DS3 = labels2colors(moduleLabels.DS3)

moduleColors.DS3 <- matchLabels(moduleColors.DS3, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE,
                                ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey",
                                extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() )

# Regenerate the MEs now that we have new color names
MEs.DS3 <- moduleEigengenes( datExpr, moduleColors.DS3, verbose=2)
MEs.DS3 <- MEs.DS3$eigengenes
rownames(MEs.DS3) <- rownames(datExpr)
colnames(MEs.DS3) <- substr( colnames(MEs.DS3), 3, nchar(colnames(MEs.DS3)) )

colorLevels.DS3 <- data.frame(table(moduleColors.DS3))
colorLevels.DS3 <- colorLevels.DS3[order(colorLevels.DS3[,2], decreasing=TRUE), ]

####################################################################################################################
## RECUT THE NETWORK BLOCKS WITH A DETECTION CUT HEIGHT OF 1, MERGE HEIGHT OF 0.2, DEEP SPLIT OF 4

print(paste("Recutting blocks using deepsplit ","4",sep=""))
recutBlocks = recutBlockwiseTrees(datExpr, net$goodSamples, net$goodGenes, net$blocks, TOMFiles, net$dendrograms,
                                  networkType="signed",
                                  corType="pearson", pamStage=TRUE, pamRespectsDendro = TRUE,
                                  detectCutHeight = 1,
                                  deepSplit=4, minModuleSize=100, mergeCutHeight=0.2, numericLabels = TRUE)

print(paste("Deepsplit ",i," cuts complete.",sep=""))

MEs.DS4 <- recutBlocks$MEs
rownames(MEs.DS4) <- rownames(datExpr)

MEs.DS4.labels <- as.numeric(substr( colnames(MEs.DS4), 3, nchar(colnames(MEs.DS4)) ))
colnames(MEs.DS4) <- labels2colors(MEs.DS4.labels)

moduleLabels.DS4 <- recutBlocks$colors
moduleColors.DS4 = labels2colors(moduleLabels.DS4)

moduleColors.DS4 <- matchLabels(moduleColors.DS4, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE,
                                ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey",
                                extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() )

# Regenerate the MEs now that we have new color names
MEs.DS4 <- moduleEigengenes( datExpr, moduleColors.DS4, verbose=2)
MEs.DS4 <- MEs.DS4$eigengenes
rownames(MEs.DS4) <- rownames(datExpr)
colnames(MEs.DS4) <- substr( colnames(MEs.DS4), 3, nchar(colnames(MEs.DS4)) )

colorLevels.DS4 <- data.frame(table(moduleColors.DS4))
colorLevels.DS4 <- colorLevels.DS4[order(colorLevels.DS4[,2], decreasing=TRUE), ]


#######
# Plot a cut as needed diagnostically
# plotDendroAndColors(net$dendrograms[[1]], moduleColors.DS1[net$blockGenes[[1]]],
#                     c("DS1"), main = "TSOKOS CD4 Active SLE Females DS3 PAM SP10 Cut Height 0.995 Block 1/4",
#                     dendroLabels = FALSE, hang = 0.03,
#                     addGuide = TRUE, guideHang = 0.05)
#######

######################################################################################################################
WGCNA BLOCKS RECUT V2
###################################################################################################
# RECUT WGCNA TOM BLOCKS INTO NEW MODULES USING FOUR LEVELS OF DEEP SPLIT
###################################################################################################

# Load and execute the function to set the storage directory for the recut blocks
# The function will leave you in the endpoint directory, so return to the main project directory as needed
# setStorageDir <- dget("/mnt/disks/rserver/projects/shared_scripts/wgcna/set.storage.directory.R")
# outputStorage <- setStorageDir( experimentName.recut ) # Each element of this vector is a subsequent subdirectory
# setwd(projectDir) # Don't call until end of job!!

for (i in 1:4) {
  ### CUT WITH DEEPSPLIT 1 THROUGH 4
  recutBlocks = recutBlockwiseTrees(datExpr, net$goodSamples, net$goodGenes, net$blocks,
                                    TOMFiles=paste(projectDir,"/",net$TOMFiles,sep=""), net$dendrograms,
                                    networkType="signed",TOMType = "signed",
                                    corType = recut.corType,
                                    pamStage=recut.pamStage, pamRespectsDendro=recut.pamRespectsDendro,
                                    detectCutHeight=recut.detectCutHeight,
                                    deepSplit=i, minModuleSize=recut.minModuleSize, mergeCutHeight=recut.mergeCutHeight,
                                    numericLabels = TRUE)
  print(paste("Deepsplit ",i," cuts complete.",sep=""))
  
  # Use these in a potentially simplified for-loop instead of the four if(i==..) blocks. 
  # Note usage of assign() and as.name() functions
  # assign( paste("MEs.DS",i,sep=""),recutBlocks$MEs )
  # rownames(as.name( paste("MEs.DS",i,sep="")))  <- rownames(datExpr)
  
  if (i==1) {
    print(paste("Recutting blocks using deepsplit ",i,sep=""))
    MEs.DS1 <- recutBlocks$MEs
    rownames(MEs.DS1) <- rownames(datExpr)
    MEs.DS1.labels <- as.numeric(substr( colnames(MEs.DS1), 3, nchar(colnames(MEs.DS1)) ))
    colnames(MEs.DS1) <- labels2colors(MEs.DS1.labels)
    moduleLabels.DS1 <- recutBlocks$colors
    moduleColors.DS1 <- labels2colors(moduleLabels.DS1)
    
    colorLevels.DS1 <- data.frame(table(moduleColors.DS1))
    colorLevels.DS1 <- colorLevels.DS1[order(colorLevels.DS1[,2], decreasing=TRUE), ]
    print(paste("Block ",i," recut modules regenerated.",sep=""))
  }
  
  if (i==2) {
    print(paste("Recutting blocks using deepsplit ",i,sep=""))
    MEs.DS2 <- recutBlocks$MEs
    rownames(MEs.DS2) <- rownames(datExpr)
    
    MEs.DS2.labels <- as.numeric(substr( colnames(MEs.DS2), 3, nchar(colnames(MEs.DS2)) ))
    colnames(MEs.DS2) <- labels2colors(MEs.DS2.labels)
    
    moduleLabels.DS2 <- recutBlocks$colors
    moduleColors.DS2 <- labels2colors(moduleLabels.DS2)
    
    moduleColors.DS2 <- matchLabels(moduleColors.DS2, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE,
                                    ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey",
                                    extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() )
    
    # Regenerate the MEs now that we have new color names
    MEs.DS2 <- moduleEigengenes( datExpr, moduleColors.DS2, verbose=2)
    MEs.DS2 <- MEs.DS2$eigengenes
    rownames(MEs.DS2) <- rownames(datExpr)
    colnames(MEs.DS2) <- substr( colnames(MEs.DS2), 3, nchar(colnames(MEs.DS2)) )
    
    colorLevels.DS2 <- data.frame(table(moduleColors.DS2))
    colorLevels.DS2 <- colorLevels.DS2[order(colorLevels.DS2[,2], decreasing=TRUE), ]
    print(paste("Block ",i," recut modules regenerated.",sep=""))
  }
  
  if (i==3) {
    print(paste("Recutting blocks using deepsplit ",i,sep=""))
    MEs.DS3 <- recutBlocks$MEs
    rownames(MEs.DS3) <- rownames(datExpr)
    
    MEs.DS3.labels <- as.numeric(substr( colnames(MEs.DS3), 3, nchar(colnames(MEs.DS3)) ))
    colnames(MEs.DS3) <- labels2colors(MEs.DS3.labels)
    
    moduleLabels.DS3 <- recutBlocks$colors
    moduleColors.DS3 = labels2colors(moduleLabels.DS3)
    
    moduleColors.DS3 <- matchLabels(moduleColors.DS3, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE,
                                    ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey",
                                    extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() )
    
    # Regenerate the MEs now that we have new color names
    MEs.DS3 <- moduleEigengenes( datExpr, moduleColors.DS3, verbose=2)
    MEs.DS3 <- MEs.DS3$eigengenes
    rownames(MEs.DS3) <- rownames(datExpr)
    colnames(MEs.DS3) <- substr( colnames(MEs.DS3), 3, nchar(colnames(MEs.DS3)) )
    
    colorLevels.DS3 <- data.frame(table(moduleColors.DS3))
    colorLevels.DS3 <- colorLevels.DS3[order(colorLevels.DS3[,2], decreasing=TRUE), ]
    print(paste("Block ",i," recut modules regenerated.",sep=""))
  }
  
  if (i==4) {
    print(paste("Recutting blocks using deepsplit ",i,sep=""))
    MEs.DS4 <- recutBlocks$MEs
    rownames(MEs.DS4) <- rownames(datExpr)
    
    MEs.DS4.labels <- as.numeric(substr( colnames(MEs.DS4), 3, nchar(colnames(MEs.DS4)) ))
    colnames(MEs.DS4) <- labels2colors(MEs.DS4.labels)
    
    moduleLabels.DS4 <- recutBlocks$colors
    moduleColors.DS4 = labels2colors(moduleLabels.DS4)
    
    moduleColors.DS4 <- matchLabels(moduleColors.DS4, moduleColors.DS1, pThreshold = 5e-2, na.rm = TRUE,
                                    ignoreLabels = if (is.numeric(moduleColors.DS2)) 0 else "grey",
                                    extraLabels = if (is.numeric(moduleColors.DS2)) c(1:1000) else standardColors() )
    
    # Regenerate the MEs now that we have new color names
    MEs.DS4 <- moduleEigengenes( datExpr, moduleColors.DS4, verbose=2)
    MEs.DS4 <- MEs.DS4$eigengenes
    rownames(MEs.DS4) <- rownames(datExpr)
    colnames(MEs.DS4) <- substr( colnames(MEs.DS4), 3, nchar(colnames(MEs.DS4)) )
    
    colorLevels.DS4 <- data.frame(table(moduleColors.DS4))
    colorLevels.DS4 <- colorLevels.DS4[order(colorLevels.DS4[,2], decreasing=TRUE), ]
    print(paste("Block ",i," recut modules regenerated.",sep=""))
  }
}

setwd(projectDir)

#######
# Plot a cut as needed diagnostically
# plotDendroAndColors(net$dendrograms[[1]], moduleColors.DS1[net$blockGenes[[1]]],
#                     c("DS1"), main = "TSOKOS CD4 Active SLE Females DS3 PAM SP10 Cut Height 0.995 Block 1/4",
#                     dendroLabels = FALSE, hang = 0.03,
#                     addGuide = TRUE, guideHang = 0.05)
#######

############################################################################################################
###################################################################################################
# ANNOTATE WGCNA MODULES WITH TOP GO PATHWAYS
###################################################################################################

# Join some fData to genes with their module color names. Note the gene order in the moduleColors vector is
# still in the same order as rownames of the original datExpr from whence adjacency and subsequent
# TOM encoding was calculated.

geneNames = colnames(datExpr)
geneInfo = data.frame(moduleColor=moduleColors.DS2)

# Annotate the modules with GO functional enrichment
entrezIDs = as.character(geneInfo$geneEntrezID)
GOenr = GOenrichmentAnalysis(geneInfo$moduleColor, allLLIDs, organism = "human", nBestP = 10);
GOenr.terms.all = GOenr$bestPTerms[[4]]$enrichment

###################################################################################################
# ADD COLOR NAMES TO WGCNA MODULES
###################################################################################################

# Load probe/gene numeric module assignments
moduleLabels <- net$colors

# Convert numeric module numbers to color names
moduleColors <- labels2colors(net$colors)

# Extract module eigengenes per sample
MEs <- net$MEs
rownames(MEs) <- rownames(datExpr)

# Rename module numbers to colors 
# CRITICAL!!! You MUST pass a numeric vector to labels2colors
MEs.labels <- as.numeric(substr( colnames(MEs), 3, nchar(colnames(MEs)) ))
colnames(MEs) <- labels2colors(MEs.labels)

# Create frequency table of probes in modules
colorLevels <- data.frame(table(moduleColors))
colorLevels <- colorLevels[order(colorLevels[,2], decreasing=TRUE), ]

####################################################################################################
WGCNA PLOT Module Eigenegenes and Boxplots
ME.boxplot<-function(MEs,index) {
  quantile.ME<-quantile(MEs[,index])
  median.ME<-round(quantile.ME[3],2)
  max.ME<-round(quantile.ME[5],2)
  diff.ME<-max.ME-median.ME
  par(mar=c(8,3,4,3))
  boxplot(MEs[,index],main=paste0(colnames(MEs)[index], "\n","Median=", median.ME,". Max=", max.ME, ". Diff=", diff.ME))
}
ME.barplot<-function(MEs,index) {
  barplot(MEs[,index],col=colnames(MEs)[index],names.arg=rownames(MEs),las=2,ylim=c(-1,1),cex.names=0.6)
}

ME_plots <- function(MEs,DS.tag) {
  nmods<-ncol(MEs)
  nplots<-floor(nmods/6)
  pdf(paste("ME_plots.",DS.tag,".pdf"),width=16,height=9)
  for (i in 1:nplots) {
    par(mfrow=c(2,6))
    startIndex<-6*i-5
    endIndex<-6*i
    for (j in startIndex:endIndex) {
      ME.boxplot(MEs=MEs,index=j)
    }
    for (j in startIndex:endIndex) {
      ME.barplot(MEs=MEs,index=j)
    }
  }
  nExtraPlots<-nmods-nplots*6
  if (nExtraPlots>0) {
    par(mfrow=c(2,nExtraPlots))
    startIndex<-nplots*6+1
    endIndex<-nmods
    for (k in startIndex:endIndex) {
      ME.boxplot(MEs=MEs,index=k)
    }
    for (k in startIndex:endIndex) {
      ME.barplot(MEs=MEs,index=k)
    }
  }
  dev.off()
  graphics.off()
}

#######################################################################################################
WGCNA PLOT MODULES VS SIGNATURE TRAIT
library(Hmisc)

plotModulesIM<-function(moduleName,moduleColors,kIM) {
  module<-moduleName
  col<-module
  moduleGenesIndex<-which(moduleColors==module)
  moduleGenes.kIM<-kIM[moduleGenesIndex,"kWithin"]
  moduleGenes.sigTrait<-gene.corr.r_p[moduleGenesIndex,sigTrait]
  if (module=="ivory") {col="gray"} ###If you have very light colors, substitute them for a darker color to be seen on graphs
  if (module=="floralwhite") {col="gray"}
  if (module=="lightcyan") {col="cyan"}
  if (module=="lightcyan1") {col="cyan"}
  if (module=="honeydew1") {col="gray"}
  if (module=="white") {col="gray"}
  if (module=="lightyellow") {col="yellow"}
  if (module=="aliceblue") {col="blue"}
  if (module=="antiquewhite1") {col="gray"}
  if (module=="lavenderblush") {col="gray"}
  if (module=="lavenderblush1") {col="gray"}
  if (module=="lavenderblush2") {col="gray"}
  if (module=="lavenderblush3") {col="gray"}
  if (module=="mistyrose") {col="gray"}
  if (module=="moccasin") {col="gray"}
  if (module=="navajowhite") {col="gray"}
  if (module=="navajowhite1") {col="gray"}
  if (module=="navajowhite2") {col="gray"}
  if (module=="paleturquoise") {col="turquoise"}
  if (module=="thistle1") {col="gray"}
  if (module=="thistle2") {col="gray"}
  if (module=="whitesmoke") {col="gray"}
  
  verboseScatterplot(moduleGenes.kIM,moduleGenes.sigTrait,
       xlab = paste0("Probe intramodular membership in ", module, " module"),
       ylab = paste0("Probe correlation to ",sigTrait),
       main = paste(capitalize(module), " kIM vs. gene significance\n", sep=""),
       cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = col, ylim=c(-1,1), xlim=c(-1,1))
  abline(h=0,lty=2,lwd=1,col="red")
  abline(v=0,lty=2,lwd=1,col="red")
}

kIMvsCorrPlot<-function(moduleCor,moduleColors,kIM,DS.tag) {
  
  colors.mods <- rownames(moduleCor)
  colors.mods <- colors.mods[colors.mods!="grey"]
  colors.mods <- sort(colors.mods)
  
  nmods<-length(colors.mods)
  nplots<-floor(nmods/6)
  pdf(paste0("kIMvsCorrPlots",DS.tag,".pdf"),width=16,height=9)
  for (i in 1:nplots) {
    par(mfrow=c(2,3))
    startIndex<-6*i-5
    endIndex<-6*i
    for(j in startIndex:endIndex) {
      myModName<-colors.mods[j]
      plotModulesIM(moduleName=myModName,moduleColors=moduleColors,kIM=kIM)
    }
  }
  nExtraPlots<-nmods-nplots*6
  if (nExtraPlots>0) {
    if (nExtraPlots==5) {
      par(mfrow=c(2,3))
    } else {
      par(mfrow=c(2,2))
    }
    startIndex<-nplots*6+1
    endIndex<-nmods
    for (k in startIndex:endIndex) {
      myModName<-colors.mods[k]
      plotModulesIM(moduleName=myModName,moduleColors=moduleColors,kIM=kIM)
    }
  }
  dev.off()
  graphics.off()
}