Quick tour of OTAcluster

Beomseok Seo

2018-04-24

OTAcluster is an R package for mean partition of ensemble of partitions by optimal transport alignment (OTA) and both partition and cluster-wise uncertainty measures, including topological relationship between clusters in a partition, Confidence Point Set (CPS), Cluster Alignment and Points based (CAP) separability, and Wasserstein distance between partitions.

How to use OTAcluster functions

1. Preset directory and compile C codes

OTAcluster utilizes C codes for its main optimal transport alignment(OTA). SetOTA() function set the directory for the C codes, and compile them.

library(OTAcluster)
# set OTA package directory and compile C code.
SetOTA("./package5")

2. Generate ensemble of partitions

OTAcluster provides a function GenData() generating partition on p dimmensional space.

set.seed(2070)
dat = GenData(n=5000,p=2,C=4)

OTAcluster provides two functions GenBsSamps() and GenNoiseSamps() for generating perturbed ensemble of partition.

# set parameters for 'kmeans' method in GenBsSamps.
C=4 
bs.samps = GenBsSamps(dat$X, nb=100, method="kmeans")
noise.samps = GenNoiseSamps(dat$X, nb=100, method="kmeans")

For method, both functions can take already defined methods strings(ex. “kmeans”, “Mclust”, “hclust”, “dbscan”, “PCAreduce”, “HMM-VB”) or user specified funtions which take inputs and output as follows.

kcca.user <- function(X, newdata){
  kcl = flexclust::kcca(X, k=C, flexclust::kccaFamily("kmeans"))
  res = flexclust::predict(kcl, newdata=newdata)
  return(res)
}
# set parameters for kcca method.
C=4
bs.samps = GenBsSamps(dat$X, nb=100, method=kcca.user)

3. Align ensemble and find mean partition

To find mean partition, first, reference partition (median partition) index should be obtained by pairwise Wasserstein distance of all partitions. This work is done by align2(). Next by alignment of all partition to the reference partition, mean partition and its uncertainty statistics are obtained by MeanPart().

# find mean partition and uncertainty statistics.
idx = align2(bs.samps$Z,100)
res = MeanPart(bs.samps$Z,100,idx)

4. The results

In the case that the ground truth is known, by plotting each partition and comparing Wasserstein distances of each partition, accuracy of clustering result can be checked.

# plot the result on two dimensional space.
plot_part(dat$X,dat$z)          # ground truth

plot_part(dat$X,kmeans(dat$X,C)$cluster)  # baseline method

plot_part(dat$X,res$repre)      # OTA


# distance between ground truth and each partition
WassDist(c(dat$z,kmeans(dat$X,C)$cluster,res$repre),3)
#>   V1       V2
#> 1  4 0.000000
#> 2  4 0.252062
#> 3  4 0.251448

# Topological relationships between mean partition and ensemble clusters
TRS = res$summary[,c(3,5,7,9)]
rownames(TRS) = c("C1","C2","C3","C4")
colnames(TRS) = c("match","split","merge","lack of correspondence")
t(TRS)
#>                        C1 C2 C3 C4
#> match                  85 84 84 97
#> split                   0  0  0  1
#> merge                   0  1  1  0
#> lack of correspondence 15 15 15  2

# Cluster Alignment and Points based (CAP) separability
CAP = res$jaccard_mat
rownames(CAP) = colnames(CAP) = c("C1","C2","C3","C4")
CAP
#>       C1    C2    C3    C4
#> C1 0.000 0.999 1.000 0.912
#> C2 0.999 0.000 0.931 1.000
#> C3 1.000 0.931 0.000 0.992
#> C4 0.912 1.000 0.992 0.000

# Confidence Point Set(CPS)
plot_part(dat$X,res$confset[[1]])

plot_part(dat$X,res$confset[[2]])

plot_part(dat$X,res$confset[[3]])

plot_part(dat$X,res$confset[[4]])

Experiment on Single-cell data

Here we apply OTAcluster on single-cell genomic data. This data set consists of \(300\) cells and \(8,686\) genes, which is available from the Github repository: https://github.com/JustinaZ/PCAreduce (Zurauskiene and Yau, 2016) We followed Lin and Li(2017)’s preprocessing steps.

# Pre-processing
  b <-read.table("Pollen2014.txt", sep=',', header = T,row.names=1)
  lb <-read.table("SupplementaryLabels.txt", sep=',', header = T)
  D <- log2(as.matrix(b) + 1) # log transformation of count data
  
  cd <- b
  cd <- cd[, colSums(cd>0)>1.8e3]
# remove genes that don't have many reads
  cd <- cd[rowSums(cd)>10, ]
# remove genes that are not seen in a sufficient number of cells
  cd <- cd[rowSums(cd>0)>5, ]
# transform to make more data normal
  mat <- log10(as.matrix(cd)+1)
  v <- apply(mat, 1, var)
  vi <- names(sort(v, decreasing = T)[1:500])
  Input <- t(mat[vi,]) # data

Ensemble of partitions is generated by bootstrap resampling method through GenBsSamps().

X = Input

# set parameters for 'PCAreduce' and "HMM-VB" methods in GenBsSamps.
C=14;
NVB=5;DSQ=rep(dim(X)[2]/NVB,NVB);CSQ=rep(40,NVB)
# set HMM-VB package directory.
directory_HMMVB = "./HMMVB"
#Train_data.txt, model_binary.dat, hyperparam_HMMVB.dat, Test_data.txt should be ready in ./HMMVB.

bs.samps.p = GenBsSamps(X,100,'PCAreduce')
bs.samps.v = GenBsSamps(X,100,'HMM-VB')

Mean partition and uncertainty measures for HMM-VB method are computed through align2() and MeanPart().

# for individual cluster study
idx.v = align2(bs.samps.v$Z,100)
res.v = MeanPart(bs.samps.v$Z,100,idx.v)
# Topological relationships between mean partition and ensemble clusters
# Empty clusters are deleted
TRS.v = res.v$summary[c(1,3,4,5,6,8,11,12,13,18),c(3,5,7,9)]
rownames(TRS.v) = c("HL60","Cl1","2338","K562","NPC","Kera","BJ","Cl2","iPS","2339")
colnames(TRS.v) = c("match","split","merge","lack of correspondence")
t(TRS.v)
#>                        HL60 Cl1 2338 K562 NPC Kera BJ Cl2 iPS 2339
#> match                    96  23   71   94  60   98 40  25  33   56
#> split                     4  34   25    6   1    1 58   8  35   33
#> merge                     0   0    0    0   0    0  0   0   0    0
#> lack of correspondence    0  43    4    0  39    1  2  67  32   11

# Cluster Alignment and Points based (CAP) separability
CAP.v = res.v$jaccard_mat[c(1,3,4,5,6,8,11,12,13,18),c(1,3,4,5,6,8,11,12,13,18)]
rownames(CAP.v) = colnames(CAP.v) = c("HL60","Cl1","2338","K562","NPC","Kera","BJ","Cl2","iPS","2339")
CAP.v*100
#>      HL60   Cl1  2338  K562   NPC  Kera    BJ   Cl2   iPS  2339
#> HL60  0.0  95.4  99.0  94.6  99.0  97.3  97.0  95.2  99.1  98.8
#> Cl1  95.4   0.0  96.2  99.0  81.2 100.0 100.0  16.0  91.8 100.0
#> 2338 99.0  96.2   0.0  97.6  96.9  94.8 100.0  95.9  94.5  93.6
#> K562 94.6  99.0  97.6   0.0 100.0 100.0 100.0  98.9  97.9  93.8
#> NPC  99.0  81.2  96.9 100.0   0.0 100.0 100.0  78.1  67.8 100.0
#> Kera 97.3 100.0  94.8 100.0 100.0   0.0  89.6 100.0 100.0 100.0
#> BJ   97.0 100.0 100.0 100.0 100.0  89.6   0.0 100.0 100.0 100.0
#> Cl2  95.2  16.0  95.9  98.9  78.1 100.0 100.0   0.0  90.0 100.0
#> iPS  99.1  91.8  94.5  97.9  67.8 100.0 100.0  90.0   0.0  96.6
#> 2339 98.8 100.0  93.6  93.8 100.0 100.0 100.0 100.0  96.6   0.0
# 10 - 90% Confidence Point Set(CPS) for cell iPS
for(i in (1:9)/10){
  assign(paste("res.v.",i,sep=""),
         MeanPart(bs.samps.v$Z,100,idx.v,alpha=i))
}

library(dplyr)
Ch <- function(X,A){
  s = cbind(X,A) %>% split(as.numeric(A))
  ch = s %>% lapply(., function(el) chull(el$PC1, el$PC2))
  ch = lapply(names(ch), function(x) s[[x]][ch[[x]],]) %>% do.call(rbind, .) 
  return(ch)
}

draw.obs <- function(X,true_cell_cls){
  X = data.frame(PC1=X[,1],PC2=X[,2])
  A = as.character(res.v.0.1$confset[[13]])
  A1 = as.character(res.v.0.1$confset[[13]])
  A2 = as.character(res.v.0.2$confset[[13]])
  A3 = as.character(res.v.0.3$confset[[13]])
  A4 = as.character(res.v.0.4$confset[[13]])
  A5 = as.character(res.v.0.5$confset[[13]])
  A6 = as.character(res.v.0.6$confset[[13]])
  A7 = as.character(res.v.0.7$confset[[13]])
  A8 = as.character(res.v.0.8$confset[[13]])
  A9 = as.character(res.v.0.9$confset[[13]])
  
  cha1 = Ch(X,A1)
  cha2 = Ch(X,A2)
  cha3 = Ch(X,A3)
  cha4 = Ch(X,A4)
  cha5 = Ch(X,A5)
  cha6 = Ch(X,A6)
  cha7 = Ch(X,A7)
  cha8 = Ch(X,A8)
  cha9 = Ch(X,A9)
  
  cha1 = cha1[cha1$A==1,]
  cha2 = cha2[cha2$A==1,]
  cha3 = cha3[cha3$A==1,]
  cha4 = cha4[cha4$A==1,]
  cha5 = cha5[cha5$A==1,]
  cha6 = cha6[cha6$A==1,]
  cha7 = cha7[cha7$A==1,]
  cha8 = cha8[cha8$A==1,]
  cha9 = cha9[cha9$A==1,]
  
  library(ggplot2)
  ggplot(as.data.frame(X), aes(x=PC1, y=PC2, color = factor(A, level=c("1","0")))) +
    geom_point(shape=19, size=1, alpha=0.5) +
    geom_polygon(data = cha1, aes(fill = A), alpha = 1/5, color = NA) +
    geom_polygon(data = cha2, aes(fill = A), alpha = 0.2/5, color = NA) +
    geom_polygon(data = cha3, aes(fill = A), alpha = 0.3/5, color = NA) +
    geom_polygon(data = cha4, aes(fill = A), alpha = 0.4/5, color = NA) +
    geom_polygon(data = cha5, aes(fill = A), alpha = 0.5/5, color = NA) +
    geom_polygon(data = cha6, aes(fill = A), alpha = 0.6/5, color = NA) +
    geom_polygon(data = cha7, aes(fill = A), alpha = 0.7/5, color = NA) +
    geom_polygon(data = cha8, aes(fill = A), alpha = 0.8/5, color = NA) +
    geom_polygon(data = cha9, aes(fill = A), alpha = 0.9/5, color = NA) +
    guides(color = guide_legend(title="Cluster"), fill=guide_legend(title="Cluster")) +
    theme_classic() +
    theme(legend.position="none")
}  

draw.obs(prcomp(t(D))$x[,1:2],res.v$repre)