OTAcluster is an R package for mean partition of ensemble of partitions by optimal transport alignment (OTA) and both partition and cluster-wise uncertainty measures, including topological relationship between clusters in a partition, Confidence Point Set (CPS), Cluster Alignment and Points based (CAP) separability, and Wasserstein distance between partitions.
OTAcluster utilizes C codes for its main optimal transport alignment(OTA). SetOTA() function set the directory for the C codes, and compile them.
library(OTAcluster)
# set OTA package directory and compile C code.
SetOTA("./package5")
OTAcluster provides a function GenData() generating partition on p dimmensional space.
set.seed(2070)
dat = GenData(n=5000,p=2,C=4)
OTAcluster provides two functions GenBsSamps() and GenNoiseSamps() for generating perturbed ensemble of partition.
# set parameters for 'kmeans' method in GenBsSamps.
C=4
bs.samps = GenBsSamps(dat$X, nb=100, method="kmeans")
noise.samps = GenNoiseSamps(dat$X, nb=100, method="kmeans")
For method, both functions can take already defined methods strings(ex. “kmeans”, “Mclust”, “hclust”, “dbscan”, “PCAreduce”, “HMM-VB”) or user specified funtions which take inputs and output as follows.
kcca.user <- function(X, newdata){
kcl = flexclust::kcca(X, k=C, flexclust::kccaFamily("kmeans"))
res = flexclust::predict(kcl, newdata=newdata)
return(res)
}
# set parameters for kcca method.
C=4
bs.samps = GenBsSamps(dat$X, nb=100, method=kcca.user)
To find mean partition, first, reference partition (median partition) index should be obtained by pairwise Wasserstein distance of all partitions. This work is done by align2(). Next by alignment of all partition to the reference partition, mean partition and its uncertainty statistics are obtained by MeanPart().
# find mean partition and uncertainty statistics.
idx = align2(bs.samps$Z,100)
res = MeanPart(bs.samps$Z,100,idx)
In the case that the ground truth is known, by plotting each partition and comparing Wasserstein distances of each partition, accuracy of clustering result can be checked.
# plot the result on two dimensional space.
plot_part(dat$X,dat$z) # ground truth
plot_part(dat$X,kmeans(dat$X,C)$cluster) # baseline method
plot_part(dat$X,res$repre) # OTA
# distance between ground truth and each partition
WassDist(c(dat$z,kmeans(dat$X,C)$cluster,res$repre),3)
#> V1 V2
#> 1 4 0.000000
#> 2 4 0.252062
#> 3 4 0.251448
# Topological relationships between mean partition and ensemble clusters
TRS = res$summary[,c(3,5,7,9)]
rownames(TRS) = c("C1","C2","C3","C4")
colnames(TRS) = c("match","split","merge","lack of correspondence")
t(TRS)
#> C1 C2 C3 C4
#> match 85 84 84 97
#> split 0 0 0 1
#> merge 0 1 1 0
#> lack of correspondence 15 15 15 2
# Cluster Alignment and Points based (CAP) separability
CAP = res$jaccard_mat
rownames(CAP) = colnames(CAP) = c("C1","C2","C3","C4")
CAP
#> C1 C2 C3 C4
#> C1 0.000 0.999 1.000 0.912
#> C2 0.999 0.000 0.931 1.000
#> C3 1.000 0.931 0.000 0.992
#> C4 0.912 1.000 0.992 0.000
# Confidence Point Set(CPS)
plot_part(dat$X,res$confset[[1]])
plot_part(dat$X,res$confset[[2]])
plot_part(dat$X,res$confset[[3]])
plot_part(dat$X,res$confset[[4]])
Here we apply OTAcluster on single-cell genomic data. This data set consists of \(300\) cells and \(8,686\) genes, which is available from the Github repository:
# Pre-processing
b <-read.table("Pollen2014.txt", sep=',', header = T,row.names=1)
lb <-read.table("SupplementaryLabels.txt", sep=',', header = T)
D <- log2(as.matrix(b) + 1) # log transformation of count data
cd <- b
cd <- cd[, colSums(cd>0)>1.8e3]
# remove genes that don't have many reads
cd <- cd[rowSums(cd)>10, ]
# remove genes that are not seen in a sufficient number of cells
cd <- cd[rowSums(cd>0)>5, ]
# transform to make more data normal
mat <- log10(as.matrix(cd)+1)
v <- apply(mat, 1, var)
vi <- names(sort(v, decreasing = T)[1:500])
Input <- t(mat[vi,]) # data
Ensemble of partitions is generated by bootstrap resampling method through GenBsSamps().
X = Input
# set parameters for 'PCAreduce' and "HMM-VB" methods in GenBsSamps.
C=14;
NVB=5;DSQ=rep(dim(X)[2]/NVB,NVB);CSQ=rep(40,NVB)
# set HMM-VB package directory.
directory_HMMVB = "./HMMVB"
#Train_data.txt, model_binary.dat, hyperparam_HMMVB.dat, Test_data.txt should be ready in ./HMMVB.
bs.samps.p = GenBsSamps(X,100,'PCAreduce')
bs.samps.v = GenBsSamps(X,100,'HMM-VB')
Mean partition and uncertainty measures for HMM-VB method are computed through align2() and MeanPart().
# for individual cluster study
idx.v = align2(bs.samps.v$Z,100)
res.v = MeanPart(bs.samps.v$Z,100,idx.v)
# Topological relationships between mean partition and ensemble clusters
# Empty clusters are deleted
TRS.v = res.v$summary[c(1,3,4,5,6,8,11,12,13,18),c(3,5,7,9)]
rownames(TRS.v) = c("HL60","Cl1","2338","K562","NPC","Kera","BJ","Cl2","iPS","2339")
colnames(TRS.v) = c("match","split","merge","lack of correspondence")
t(TRS.v)
#> HL60 Cl1 2338 K562 NPC Kera BJ Cl2 iPS 2339
#> match 96 23 71 94 60 98 40 25 33 56
#> split 4 34 25 6 1 1 58 8 35 33
#> merge 0 0 0 0 0 0 0 0 0 0
#> lack of correspondence 0 43 4 0 39 1 2 67 32 11
# Cluster Alignment and Points based (CAP) separability
CAP.v = res.v$jaccard_mat[c(1,3,4,5,6,8,11,12,13,18),c(1,3,4,5,6,8,11,12,13,18)]
rownames(CAP.v) = colnames(CAP.v) = c("HL60","Cl1","2338","K562","NPC","Kera","BJ","Cl2","iPS","2339")
CAP.v*100
#> HL60 Cl1 2338 K562 NPC Kera BJ Cl2 iPS 2339
#> HL60 0.0 95.4 99.0 94.6 99.0 97.3 97.0 95.2 99.1 98.8
#> Cl1 95.4 0.0 96.2 99.0 81.2 100.0 100.0 16.0 91.8 100.0
#> 2338 99.0 96.2 0.0 97.6 96.9 94.8 100.0 95.9 94.5 93.6
#> K562 94.6 99.0 97.6 0.0 100.0 100.0 100.0 98.9 97.9 93.8
#> NPC 99.0 81.2 96.9 100.0 0.0 100.0 100.0 78.1 67.8 100.0
#> Kera 97.3 100.0 94.8 100.0 100.0 0.0 89.6 100.0 100.0 100.0
#> BJ 97.0 100.0 100.0 100.0 100.0 89.6 0.0 100.0 100.0 100.0
#> Cl2 95.2 16.0 95.9 98.9 78.1 100.0 100.0 0.0 90.0 100.0
#> iPS 99.1 91.8 94.5 97.9 67.8 100.0 100.0 90.0 0.0 96.6
#> 2339 98.8 100.0 93.6 93.8 100.0 100.0 100.0 100.0 96.6 0.0
# 10 - 90% Confidence Point Set(CPS) for cell iPS
for(i in (1:9)/10){
assign(paste("res.v.",i,sep=""),
MeanPart(bs.samps.v$Z,100,idx.v,alpha=i))
}
library(dplyr)
Ch <- function(X,A){
s = cbind(X,A) %>% split(as.numeric(A))
ch = s %>% lapply(., function(el) chull(el$PC1, el$PC2))
ch = lapply(names(ch), function(x) s[[x]][ch[[x]],]) %>% do.call(rbind, .)
return(ch)
}
draw.obs <- function(X,true_cell_cls){
X = data.frame(PC1=X[,1],PC2=X[,2])
A = as.character(res.v.0.1$confset[[13]])
A1 = as.character(res.v.0.1$confset[[13]])
A2 = as.character(res.v.0.2$confset[[13]])
A3 = as.character(res.v.0.3$confset[[13]])
A4 = as.character(res.v.0.4$confset[[13]])
A5 = as.character(res.v.0.5$confset[[13]])
A6 = as.character(res.v.0.6$confset[[13]])
A7 = as.character(res.v.0.7$confset[[13]])
A8 = as.character(res.v.0.8$confset[[13]])
A9 = as.character(res.v.0.9$confset[[13]])
cha1 = Ch(X,A1)
cha2 = Ch(X,A2)
cha3 = Ch(X,A3)
cha4 = Ch(X,A4)
cha5 = Ch(X,A5)
cha6 = Ch(X,A6)
cha7 = Ch(X,A7)
cha8 = Ch(X,A8)
cha9 = Ch(X,A9)
cha1 = cha1[cha1$A==1,]
cha2 = cha2[cha2$A==1,]
cha3 = cha3[cha3$A==1,]
cha4 = cha4[cha4$A==1,]
cha5 = cha5[cha5$A==1,]
cha6 = cha6[cha6$A==1,]
cha7 = cha7[cha7$A==1,]
cha8 = cha8[cha8$A==1,]
cha9 = cha9[cha9$A==1,]
library(ggplot2)
ggplot(as.data.frame(X), aes(x=PC1, y=PC2, color = factor(A, level=c("1","0")))) +
geom_point(shape=19, size=1, alpha=0.5) +
geom_polygon(data = cha1, aes(fill = A), alpha = 1/5, color = NA) +
geom_polygon(data = cha2, aes(fill = A), alpha = 0.2/5, color = NA) +
geom_polygon(data = cha3, aes(fill = A), alpha = 0.3/5, color = NA) +
geom_polygon(data = cha4, aes(fill = A), alpha = 0.4/5, color = NA) +
geom_polygon(data = cha5, aes(fill = A), alpha = 0.5/5, color = NA) +
geom_polygon(data = cha6, aes(fill = A), alpha = 0.6/5, color = NA) +
geom_polygon(data = cha7, aes(fill = A), alpha = 0.7/5, color = NA) +
geom_polygon(data = cha8, aes(fill = A), alpha = 0.8/5, color = NA) +
geom_polygon(data = cha9, aes(fill = A), alpha = 0.9/5, color = NA) +
guides(color = guide_legend(title="Cluster"), fill=guide_legend(title="Cluster")) +
theme_classic() +
theme(legend.position="none")
}
draw.obs(prcomp(t(D))$x[,1:2],res.v$repre)