GSVA (V1.25.0) software package is an open source package available from R/Bioconductor and is used as a non-parametric, unsupervised method for estimating the variation of pre-defined gene sets in patient and control samples of microarray expression data sets www.bioconductor.org/packages/release/bioc/html/GSVA.html Hänzelmann, S. et al. GSVA: gene set variation analysis for microarray and RNA-seq data. BMC Bioinformatics. 14:7(2013). Example of Gene Set Variation Analysis Used by AMPLE BioSolutions ########GSVA script###### #######Geneset Variation analysis (GSVA) script###### #######Load the eset generated using the brain array CDF######## if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("GSVA", version = "3.8") library(GSVA) load("~/Catalina-et-al_Nat-Comm_Suppl-Scripts/Expression_set_EXAMPLE/GSE88884_Batch-1_RMA-Normalized_ESET.Rdata") ls() eset_ba <- eset pDATA <- pData(eset_ba) table(eset_ba$Cohort) #10 CTL, 813 SLE table(eset_ba$Race) #####Removing the control probe#### index <- which( substr(rownames(eset_ba),1,4)=="AFFX" ) # 0 probes have been removed eset_ba <- eset_ba[ -index, ] fData(eset_ba)[,"SYMBOL"] <- as.character(fData(eset_ba)[,"geneSymbol"]) cohort = as.character(eset_ba$Cohort) eset_ba$friendlyName <- paste(cohort, substr(eset_ba$Subject_ID,6,9),eset_ba$SLEDAI, sep="_") sampleNames(eset_ba) <- eset_ba$friendlyName FD_ba <- fData(eset_ba) #####Exactly has to match with filtered eset (above one)###### IQR_BA <- data.frame() for(i in 1:nrow(exprs(eset_ba))) { row <- as.vector(exprs(eset_ba)[i,]) IQR_BA[i,1] <- IQR(row) } rownames(IQR_BA) <- rownames(exprs(eset_ba)) IQR_BA$probe <- rownames(IQR_BA) colnames(IQR_BA)[1] <- "IQR" index <- which(IQR_BA$IQR>0.05) #32,500 IQR_ba_filtered <- IQR_BA[index,] IQR_hist <- hist(IQR_ba_filtered$IQR, breaks=1000, xlab="probe IQRs", main="probe Density at IQR Intervals", sub=paste(nrow(IQR_ba_filtered), "total probes"), xlim=c(0,1), col="yellow") IQR_max <- which(IQR_hist$counts==max(IQR_hist$counts)) iqr <- as.vector(IQR_hist$mids)[IQR_max] abline(v=iqr, lty=2, lwd=2, col="red") IQR_ba_filtered <- IQR_ba_filtered[which(IQR_ba_filtered$IQR>iqr),,FALSE] #21,945 index <- which(rownames(eset_ba)%in%IQR_ba_filtered$probe)#21945 eset.iqr.filtered <- eset_ba[index,] FD_ba_filtered <- fData(eset.iqr.filtered) eset_ba_iqr_filtered <- exprs(eset.iqr.filtered) identical(rownames(FD_ba_filtered),rownames(eset_ba_iqr_filtered)) identical(rownames(FD_ba_filtered),IQR_ba_filtered$probe) eset_ba_iqr_filtered <- cbind(eset_ba_iqr_filtered,IQR_ba_filtered,FD_ba_filtered[,c(2,4)]) eset_ba_iqr_filtered[,828] = "BA_CDF" colnames(eset_ba_iqr_filtered)[828] = "CDF" #index <- which(is.na(eset_ba_iqr_filtered)[,"geneEntrezID"]) # 52 probes removed without gene mapping index <- which(is.na(eset_ba_iqr_filtered)[,"geneSymbol"]) #866 eset_ba_iqr_filtered_NA <- eset_ba_iqr_filtered[-index,] #22,106 eset_ba_iqr_filtered_NA = eset_ba_iqr_filtered_NA[ order(eset_ba_iqr_filtered_NA$IQR), ] #21079 setwd("~/Catalina-et-al_Nat-Comm_Suppl-Scripts/GSVA_EXAMPLE") save(eset_ba_iqr_filtered_NA, file="eset_ba_probes_filtered_IQR.Rdata") #######Now trying to remove duplicate gene symbols based on the IQR.#### eset_ba_iqr_filtered_NA = eset_ba_iqr_filtered_NA[ order(-eset_ba_iqr_filtered_NA$IQR), ] eset_ba_filtered_iqr_genes <- eset_ba_iqr_filtered_NA[ !duplicated(eset_ba_iqr_filtered_NA$geneSymbol), ]#21,078 eset_ba_filtered_iqr_entrezid <- eset_ba_iqr_filtered_NA[ !duplicated(eset_ba_iqr_filtered_NA$geneEntrezID), ]#21,079 setwd("~/Catalina-et-al_Nat-Comm_Suppl-Scripts/GSVA_EXAMPLE") save(eset_ba_iqr_filtered_NA,file="ba_iqr_filtered.Rdata") save(eset_ba_filtered_iqr_genes,eset_ba_filtered_iqr_entrezid,file="eset_ba_iqr_filtered_sig.Rdata") ####Removing duplicate gene symbols based on the highest IQR vales. Retaining the probes(genesymbol) with high IQR values#### eset_genes <- eset_ba_filtered_iqr_genes colnames(eset_genes) rownames(eset_genes) <- eset_genes$geneSymbol eset_genes <- eset_genes[,c(1:823)] #colnames(eset_genes) <- sampleNames(eset_ba) eset_genes <- as.matrix(eset_genes) ####GSVA script Load the genesets###### IFN_GSVA <- read.delim("~/Catalina-et-al_Nat-Comm_Suppl-Scripts/GSVA_EXAMPLE/IFN_Signatues.txt", sep="\t", dec=",") IFN_GSVA[,] <- lapply(IFN_GSVA[,],as.character) IFN_GSVA_stacked <- stack(IFN_GSVA) IFN_GSVA_stacked <- IFN_GSVA_stacked[-which(IFN_GSVA_stacked$values == ""),] colnames(IFN_GSVA_stacked)[1] <- "geneSymbol" colnames(IFN_GSVA_stacked)[2] <- "Category" testList <- tapply(IFN_GSVA_stacked$geneSymbol,IFN_GSVA_stacked$Category,c) n <- names(testList) uniqueList <- lapply(testList, unique) makeSet <- function(geneIds, n) { GeneSet(geneIds, geneIdType=SymbolIdentifier(), setName=n) } library(dplyr) library(GSEABase) library(GSVA) gsList <- gsc <- mapply(makeSet, uniqueList[], n) newlist = gsList %>% lapply(function(x) trimws(x@geneIds)) #gsc <- GeneSetCollection(gsc) #### AFTER CREATING THE GENESET COLLECTION OBJECT NOW WE CAN PERFORM GSVA FUNCTION AND #### GET EXPRESSION SET WHICH IS TRANSFORMED FROM GENE BY SAMPLE TO GENESET BY SAMPLE eset <- eset_genes ###computing GSVA enrichment scores##### gse88884_illum1_SLE_CTL_gsva <- gsva(eset, newlist, method = c("gsva"), rnaseq=FALSE, min.sz=1, max.sz=Inf, verbose=TRUE)$es.obs overlap_genes <- merge(eset_ba_filtered_iqr_genes,IFN_GSVA_stacked,by.x="geneSymbol",by.y="geneSymbol") #####Save the enrichment scores object####### setwd("~/Catalina-et-al_Nat-Comm_Suppl-Scripts/GSVA_EXAMPLE") write.csv(gse88884_illum1_SLE_CTL_gsva,file="GSE88884_Lilly_Illuminate-1_Female_MixedRace_SLEDAI6-27_813SLE_10CTL_BA_Probes_IFN-Paper-IQR_GSVA_Enrichment_scores.csv") write.csv(overlap_genes,file="GSE88884_Lilly_Illuminate-1_Female_MixedRace_SLEDAI6-27_813SLE_10CTL_BA_Probes_IFN-Paper-IQR_GSVA_Overlapping_genes.csv") ########Script to generate heat map using gsva enrichment scores################# exprs_hm <- gse88884_illum1_SLE_CTL_gsva ######Generating heatmap with log2 expression values for each genes that is overlapping with each category#### # We must remove rows where all columns are NA index <- which(is.na(rowSums(exprs_hm))) if (length(index) > 0) { exprs_hm <- exprs_hm[-index,] } # Here we'll use Euclidean distances to figure cluster colors distance <- dist(t(exprs_hm),method="euclidean") hc <- hclust(distance) hc.cut = cutree(hc,k=3) # k = number of clusters table(hc.cut) # show cluster cut results hc.cut.levels = levels(as.factor(hc.cut)) # factor cluster level numbers for coloring hc.cut.levels # review cluster level factors colors = rainbow(length(hc.cut.levels));colors cluster.patient.colors = c() for (i in 1:length(hc.cut.levels)){ cluster.patient.colors[names(which(hc.cut==hc.cut.levels[i]))] = colors[i] } cluster.patient.colors # Euclidean distances for both probes and arrays dist.euclidean = dist.euclidean <- function(x, ...) dist(x,method="euclidean") label.size = 0.8 samples = colnames(exprs_hm) hmcol <- colorRampPalette(c("blue","grey","red"))(256) ## MAKE SURE TO MAKE THE R STUDIO PLOTS FRAME AS LARGE AS POSSIBLE BEFORE RUNNING (SHRINK THE OTHER FRAMES) dev.off() library(gplots) nrow(exprs_hm) genes.heat = heatmap.2(data.matrix(exprs_hm), col=hmcol, Rowv = FALSE, scale="none", distfun = dist.euclidean,key=TRUE, symkey=FALSE, density.info="none", trace="none", ColSideColors=cluster.patient.colors[samples],cexCol=label.size, cexRow = label.size,margins=c(9,9))