## Project: LPEseq ## Purpose: PLoS ONE Revision ## Revision version: 2nd version ## Written by: Jungsoo Gim (iedenkim@gmail.com) ## The 1st reviewer satisfied all the responses ## The 2nd reviewer is not satisfied about our responses and need to be argued ## The 3rd reviewer asked several meaningfule questions ## This is for the response to the reviewer 3 ## This work has done at desktop at Won's lab ## Set-ups ## Comment # "What I am missing is the attempt to falsify this! How, very simple. You have data with more #than one sample. Estimate from these the ‘true’ variance conditioned on the expression levels #of the genes. Then overlay this line into Fig 1C (with removed outliers) and perform a KS test. #If the test is not significant I am willing to entertain the idea. Do it for different sample sizes to #demonstrate this is a robust finding." ## The core idea is to compare two variance curves: see below tmp.x1 <- sort(rnorm(1000)) tmp.x2 <- sort(rnorm(1000)) plot(tmp.x1, type="l", main=paste("p-value using KS test is: ", round(ks.test(tmp.x1, tmp.x2)$p.value, 5), sep=""), xlab="X1", ylab="X2"); lines(tmp.x2, col=2) ks.test(tmp.x1, tmp.x2) tmp.x1 <- sort(rnorm(1000)) tmp.x2 <- sort(rnorm(1000, sd=1.1)) plot(tmp.x1, type="l", main=paste("p-value using KS test is: ", round(ks.test(tmp.x1, tmp.x2)$p.value, 5), sep="")); lines(tmp.x2, col=2) ks.test(tmp.x1, tmp.x2) ## Plan to response ## 1. Estimate variance using total samples ## 2. Estimate variance using selected two samples ## 3. Perform KS-test load('/LPEseq_20140714_real_data_analysis.Rdata') ## Data included is head(sultan) head(gilad) head(fly5) head(maqc) head(katz) library(LPEseq) LPEseq.outlier.edit <- function(expr_dat, d=d, o.mode="FC", trim=F){ if(o.mode=="FC"){ ix.outlier <- which(abs(apply(expr_dat, 1, diff)) >= d) expr_dat_sub <- expr_dat[-ix.outlier,] if(trim==T){ expr_dat_sub <- expr_dat expr_dat_sub[ix.outlier,2] <- d } } if(o.mode=="Empirical"){ tmp.vec <- cbind(apply(expr_dat, 1, mean),abs(apply(expr_dat, 1, diff))) ix.grp1 <- which(tmp.vec[,1] < 3) # the value was changed from 2 to 3 ix.grp2 <- which(tmp.vec[,1] >= 3) ix.outlier1 <- intersect(ix.grp1, which(tmp.vec[,2] >= 2)) ix.outlier2 <- intersect(ix.grp2, which(tmp.vec[,2] >= 1.2)) ix.outlier <- c(ix.outlier1, ix.outlier2) expr_dat_sub <- expr_dat[-ix.outlier,] if(trim==T){ expr_dat_sub <- expr_dat expr_dat_sub[ix.outlier1,] <- 2 expr_dat_sub[ix.outlier2,] <- 1.2 } } return(list(y=expr_dat_sub, outlier=ix.outlier)) } tmp.expr <- LPEseq.normalise(sultan[,c(1,3)]) tmp.vec <- cbind(apply(tmp.expr, 1, mean), apply(tmp.expr, 1, diff), apply(tmp.expr, 1, var)) plot(tmp.vec[,1:2]) plot(tmp.vec[,c(1,3)]) ix.zero.grp1 <- which(tmp.expr[,1]==0) ix.zero.grp2 <- which(tmp.expr[,2]==0) ix.zero.both <- unique(c(ix.zero.grp1, ix.zero.grp2)) tmp.expr.nonzero <- tmp.vec[-ix.zero.both,] par(mfrow=c(2,2)) plot(tmp.vec[,1:2]) plot(tmp.expr.nonzero[,1:2]) plot(tmp.vec[,c(1,3)]) plot(tmp.expr.nonzero[,c(1,3)]) LPEseq.var.edit <- function(y, n.bin=n.bin, df=df, d=d, o.mode="FC", trim=F, chisq.filter=TRUE){ library(outliers) y <- LPEseq.outlier.edit(y, d=d, o.mode=o.mode)$y qnt <- 1/n.bin AM <- am.trans(y) A <- AM[,1] M <- AM[,2] median.y <- apply(y, 1, median) quantile.A <- quantile(A, probs = seq(0,1,qnt), na.rm=T) quan.n <- length(quantile.A)-1 var.M <- rep(0, length=quan.n-1) medianAs <- rep(0, length=quan.n-1) if(sum(A==min(A)) > (qnt*length(A))){ tmpA <- A[!(A==min(A))] quantile.A <- c(min(A), quantile(tmpA, probs=seq(qnt, 1, qnt), na.rm=T)) } for(i in 2:(quan.n+1)){ n.i <- length(!is.na(M[A>=quantile.A[i-1]&A 1){ mult.factor <- 0.5*((n.i-0.5)/(n.i-1)) tmp.M <- M[A>=quantile.A[i-1] & A=quantile.A[i-1] & A (qnt*length(A))){ tmpA <- A[!(A==min(A))] quantile.A <- c(min(A), quantile(tmpA, probs=seq(qnt, 1, qnt), na.rm=T)) } for(i in 2:(quan.n+1)){ n.i <- length(!is.na(M[A>=quantile.A[i-1]&A 1){ mult.factor <- 0.5*((n.i-0.5)/(n.i-1)) var.M[i-1] <- mult.factor * var(M[A>=quantile.A[i-1] & A=quantile.A[i-1] & A (qnt*length(A1))){ tmpA1 <- A1[!(A1==min(A1))] quantile.A1 <- c(min(A1), quantile(tmpA1, probs=seq(qnt, 1, qnt), na.rm=T)) } if(sum(A2==min(A2)) > (qnt*length(A2))){ tmpA2 <- A2[!(A2==min(A2))] quantile.A2 <- c(min(A2), quantile(tmpA2, probs=seq(qnt, 1, qnt), na.rm=T)) } for(i in 2:(quan.n+1)){ n.i <- length(!is.na(M1[A1>=quantile.A1[i-1]&A1 1){ mult.factor <- 0.5*((n.i-0.5)/(n.i-1)) var.M1[i-1] <- mult.factor * var(M1[A1>=quantile.A1[i-1] & A1=quantile.A1[i-1] & A1=quantile.A1[i-1]&A2 1){ mult.factor <- 0.5*((n.i-0.5)/(n.i-1)) var.M2[i-1] <- mult.factor * var(M2[A2>=quantile.A2[i-1] & A2=quantile.A2[i-1] & A2 res.hapmap.fc #### HAPMAP #### hapmap.mont.f <- hapmap.mont[,which(hapmap.mont.sex=="Female")] set.seed(123456) doKS(hapmap.mont.f, df=10, d=3.0, mode="total") -> res.hapmap.female.fc3.0.total doKS4multi <- function(indata, grp, n.rep = n.rep, df=df, d=d, n.bin=100, n.smooth=10, mode="x", o.mode="FC", trim=F){ in.norm <- LPEseq.normalise(indata) grp.names <- names(table(grp)) ix.grp1 <- which(grp == grp.names[1]) ix.grp2 <- which(grp == grp.names[2]) tmp.lpe.var <- as.list(NA) if(mode=="x"){ for(i in 1:length(n.rep)){ tmp.lpe.var[[i]] <- lpe.var.ks(in.norm[,sample(ix.grp1, n.rep[i])], df=df, n.bin=n.bin) } } if(mode=="y"){ tmp.lpe.var <- lpe.var.ks(in.norm[,ix.grp2], df=df, n.bin=n.bin) } if(mode=="total"){ tmp.lpe.var <- lpe.var.ks(in.norm, df=df, n.bin=n.bin) } if(mode=="pool"){ tmp.lpe.var <- lpe.var.ks.pool(in.norm, df=df, n.bin=n.bin) } tmp.lpeseq.var <- as.list(NA) for(i in 1:10){ ix.used <- c(sample(ix.grp1, 1), sample(ix.grp2, 1)) in.norm.norep <- in.norm[,ix.used] tmp.lpeseq.var[[i]] <- LPEseq.var.edit(in.norm.norep, n.bin=100, df=df, d=d, o.mode=o.mode, trim=trim) } #ix.used <- c(sample(1:n.rep, 1), sample((n.rep+1):ncol(indata), 1)) #in.norm.norep <- in.norm[,ix.used] #tmp.lpeseq.var <- LPEseq.var.edit(in.norm.norep, n.bin=100, df=df, d=d) res.lpeseq.var <- as.list(NA) res.lpe.var <- as.list(NA) bin.vec <- quantile(1:10, seq(0,1,1/n.smooth)) for(i in 1:4){ res.lpe.var[[i]] <- fixbounds.predict.smooth.spline(tmp.lpe.var[[i]], bin.vec)$y } for(i in 1:10){ res.lpeseq.var[[i]] <- fixbounds.predict.smooth.spline(tmp.lpeseq.var[[i]], bin.vec)$y } # if(any(res.lpe.var < min(res.lpe.var))){ # res.lpe.var[res.lpe.var < min(res.lpe.var)] <- min(res.lpe.var) # } for(i in 1:4){ if(any(res.lpe.var[[i]] < 0)){ res.lpe.var[res.lpe.var[[i]] < 0] <- min(res.lpe.var[[i]]) } } # if(any(res.lpeseq.var < min(res.lpeseq.var))){ # res.lpeseq.var[res.lpeseq.var < min(res.lpeseq.var)] <- min(res.lpeseq.var) # } for(i in 1:10){ if(any(res.lpeseq.var[[i]] < 0)){ res.lpeseq.var[[i]][res.lpeseq.var[[i]] < 0] <- min(res.lpeseq.var[[i]]) } } res.ks <- numeric(10) for(i in 1:10){ res.ks[i] <- ks.test(res.lpe.var[[4]], res.lpeseq.var[[i]])$p.value } plot(res.lpe.var[[4]], type="l", col="royalblue", main=paste("Average p-value using KS test is: ", round(mean(res.ks), 5), sep=""), lwd=3, ylab="Est. Var", xlab="Intensity") lines(res.lpe.var[[3]], col="deepskyblue", lty=2 ,lwd=3) lines(res.lpe.var[[2]], col="dodgerblue", lty=3, lwd=3) lines(res.lpe.var[[1]], col="lightskyblue", lty=4, lwd=3) for(i in 1:10){ lines(res.lpeseq.var[[i]], col="grey50", lty=5, lwd=3) } legend("topright", legend=c("Total samples", "20 samples", "10 samples", "5 samples", "1 sample"), lty=1:5, col=c("royalblue", "deepskyblue", "dodgerblue", "lightskyblue", "grey50"), lwd=3) return(list(ks.pvalue=res.ks, lpe.var=res.lpe.var, lpeseq.var=res.lpeseq.var)) #print(ix.used) } set.seed(123456) doKS4multi(hapmap.mont, hapmap.mont.sex, n.rep=c(5,10,20,33), df=10, d=5.0, mode="x") -> res.hapmap.female.fc3.0.x cdfs.multi.plot <- function(x){ plot(sort(x[[2]][[4]]), (1:length(x[[2]][[4]]))/length(x[[2]][[4]]), type="s", lwd=2, col="royalblue", ylab="", xlab="estimated variance", main="Empirical cumulative distribution") for(i in 1:10){ points(sort(x[[3]][[i]]), (1:length(x[[3]][[i]]))/length(x[[3]][[i]]), lwd=2, lty=2, col="grey50", type="s") } mtext(text = expression(hat(F)[n](x)), side=2, line=2.5) legend("bottomright", legend=c("With replicate", "W/O replicate"), lty=c(1,2), col=c("royalblue", "grey50"), lwd=3) } cdfs.plot(res.hapmap.female.fc3.0.x) result_hapmap <- matrix(0, nrow=nrow(hapmap), ncol=5) colnames(result_hapmap) <- c("LPEseq", "edgeR", "DESeq", "DEseq2", "NBPSeq") set.seed(1004) result_hapmap[,1] <- testLPEseq(hapmap.mont, hapmap.mont.sex)[,2] result_hapmap[,2] <- testEdgeR(hapmap.mont, hapmap.mont.sex)[,2] result_hapmap[,3] <- testDESeq(hapmap.mont, hapmap.mont.sex)[,2] result_hapmap[,4] <- testDESeq2(hapmap.mont, hapmap.mont.sex)[,2] result_hapmap[,5] <- testNBPSeq(as.matrix(hapmap.mont), hapmap.mont.sex)[,2] makeSummary <- function(result_file, value){ tmp <- numeric(ncol(result_file)) names(tmp) <- colnames(result_file) for(i in 1:length(tmp)){ tmp[i] <- length(which(result_file[,i] res.hapmap.female.fc3 set.seed(123456) doKS4hapmap(hapmap.mont, hapmap.mont.sex, df=10, d=4) -> res.hapmap.female.fc4 set.seed(123456) doKS4hapmap(hapmap.mont, hapmap.mont.sex, df=10, d=5) -> res.hapmap.female.fc5 set.seed(123456) doKS4hapmap(hapmap.mont, hapmap.mont.sex, df=10, d=7) -> res.hapmap.female.fc7 boxplot(res.hapmap.female.fc7$ks.pvalue, ylim=c(0,1)) mean(res.hapmap.female.fc7$ks.pvalue); sd(res.hapmap.female.fc7$ks.pvalue) ############################# ## RECONSIDERING OVERLAPS ############################# result_sultan_new <- matrix(NA, nrow=nrow(sultan), ncol=6) colnames(result_sultan_new) <- c("LPE", "LPEseq", "edgeR", "DESeq", "DEseq2", "NBPSeq") head(sultan) set.seed(1004) result_sultan_new[,1] <- testLPEseq(sultan, hapmap.mont.sex)[,2] result_sultan_new[,2] <- testLPEseq(sultan, hapmap.mont.sex)[,2] result_sultan_new[,3] <- testEdgeR(sultan, hapmap.mont.sex)[,2] result_sultan_new[,4] <- testDESeq(sultan, hapmap.mont.sex)[,2] result_sultan_new[,5] <- testDESeq2(sultan, hapmap.mont.sex)[,2] result_sultan_new[,6] <- testNBPSeq(as.matrix(sultan), hapmap.mont.sex)[,2] library(VennDiagram) venn.plpot <- venn.diagram(list( LPEseq = hapmap.alias[which(result_hapmap[,1] < 0.05),1], edgeR = hapmap.alias[which(result_hapmap[,2] < 0.05),1], DESeq = hapmap.alias[which(result_hapmap[,3] < 0.05),1], DESeq2 = hapmap.alias[which(result_hapmap[,4] < 0.05),1], NBPSeq = hapmap.alias[which(result_hapmap[,5] < 0.05),1] ), "VennDiagram_Hapmap.tiff", col = "transparent", fill = 1:5 ) venn.plot <- venn.diagram(list(A=1:10, B=5:20, C=4:30), "tmp.tiff") ############################### ## MAQC OVERLAPS ############################### result_maqc <- matrix(0, nrow=nrow(maqc), ncol=5) colnames(result_maqc) <- c("LPEseq", "edgeR", "DESeq", "DEseq2", "NBPSeq") set.seed(1004) result_maqc[,1] <- testLPEseq(maqc, grp7)[,2] result_maqc[,2] <- testEdgeR(maqc, grp7)[,2] result_maqc[,3] <- testDESeq(maqc, grp7)[,2] result_maqc[,4] <- testDESeq2(maqc, grp7)[,2] result_maqc[,5] <- testNBPSeq(as.matrix(maqc), grp7)[,2] makeSummary <- function(result_file, value){ tmp <- numeric(ncol(result_file)) names(tmp) <- colnames(result_file) for(i in 1:length(tmp)){ tmp[i] <- length(which(result_file[,i]