library(atom2seq)
library(Biobase)
library(xtable)
library(igraph)
library(graph)
library(networkTools)
library(RCytoscape)
library(Biostrings)
library(magrittr)
library(IRanges)
library(ggplot2)

Sequence alignment of SEQRES and ATOM records

The problem:

PDB says that ATOM records (amino acid coordinates) should line up with the SEQRES records (amino acid sequence of the protein), but in many instances it appears that they do not. We need to be able to translate from the ATOM AA coordinates to the SEQRES AA coordinates.

Testing

Lets set up some mock instances to test our algorithm. This algorithm was used to generate the alignments used later.

# need to add a test for non-standards existing as insertion character.
atom <- AAStringSet(c(rmFrontAddBack="MDDTEKMSMKLT", addFrontRmBack="TSMDDTEKMSMK", insertion="MDDAATEKMSMKL", deletion="SMTEKMSMKL", indel="SMTEKMSMAAKL", nonAAInsert="SMDD--TEKMSMKL", nonAAInsertRmFront="MDD--TEKMSMKL"))

seq <- AAStringSet(c(rep("SMDDTEKMSMKL", 5), nonAAInsert="SMDDTEKMSMKL", nonAAInsertRmFront="SMDDTEKMSMKL"))

refNum <- list(rmFrontAddBack=c(2,3,4,5,6,7,8,9,10,11,12,0),
               addFrontRmBack=c(0,1,2,3,4,5,6,7,8,9,10,11),
               insertion=c(2,3,4,0,0,5,6,7,8,9,10,11,12),
               deletion=c(1,2,5,6,7,8,9,10,11,12),
               indel=c(1,2,5,6,7,8,9,10,0,0,11,12),
               nonAAInsert=c(1,2,3,4,0,0,5,6,7,8,9,10,11,12),
               nonAAInsertRmFront=c(2,3,4,0,0,5,6,7,8,9,10,11,12))
names(seq) <- names(atom)
useNames <- names(atom) # do a small sample for testing
outTest <- lapply(useNames, function(x){genAlign(atom[[x]], seq[[x]])})
outNum <- lapply(outTest, function(x){x$atomNum})
names(outNum) <- useNames
all.equal(outNum, refNum)
## [1] TRUE

Yes, our test appears to work, so lets start doing some work with the real data. Here is the code used to align the SEQRES and ATOM records. This is not run in the vignette (it takes a while).

useDir <- "/mlab/data/rmflight/Documents/projects/work/sen/coordination_families"
library(Biostrings)
seq <- readAAStringSet(file.path(useDir, "sequences.SEQRES.multi.txt"), use.names=TRUE)
atom <- readAAStringSet(file.path(useDir, "sequences.ATOM.multi.txt"), use.names=TRUE)

Lets do a sanity check, we should have the same PDB ID for all the names.

splitDot <- function(inNames){
  allSplit <- strsplit(inNames, ".", fixed=TRUE)
  outNames <- sapply(allSplit, function(x){x[[1]]})
}
seqNames <- splitDot(names(seq))
atomNames <- splitDot(names(atom))

all.equal(seqNames, atomNames)

seqSort <- sort(names(seq))
atomSort <- sort(names(atom))

all.equal(seqSort, atomSort)

OK, everything seems to match up nicely between the two. Lets see if we can successfully run on a subset of the sequences.

useNames <- sample(names(atom), 100)
outAlign <- mclapply(useNames, function(x){genAlign(atom[[x]], seq[[x]])}, mc.cores=6)

And it runs! Now lets run everything!

useNames <- names(atom)
outNum <- mclapply(useNames, function(x){
  tmpRes <- genAlign(atom[[x]], seq[[x]])
  tmpRes$atomNum
}, mc.cores=6)
names(outNum) <- useNames

save(outNum, file=file.path(useDir, "atom2seq.RData"))

Run InterProScan on Non-Redundant Sites

After passing the mapping results back to Sen, she has supplied the non-redundant zinc sites with the number of chains and which chain, as well as the ligands, positions, and sequences. We will generate a fasta file to run through InterProScan.

non_red <- read.table(file.path(useDir, "znlist.nonredundant.txt"), header = TRUE, sep = "\t", row.names = 1, stringsAsFactors = FALSE)
non_red$chain.id <- paste(non_red$zinc.id, non_red$chain.counts, sep = ":")
single_chains <- !duplicated(non_red$chain.id)
non_red <- non_red[single_chains,]
non_red_seq <- AAStringSet(non_red$sequence)
names(non_red_seq) <- non_red$chain.id
writeXStringSet(non_red_seq, file = file.path(useDir, "non_redundant_zinc.fa"))

We want to make sure that the ligands and locations actually match the locations in the sequences.

ligand_locs <- strsplit(non_red$ligand.position.combo, ".", fixed = TRUE) %>% lapply(., as.integer)
upper_code <- toupper(AMINO_ACID_CODE)
single_code <- names(upper_code)
ligand_combo <- strsplit(non_red$ligand.combo, ".", fixed = TRUE) %>% lapply(., function(x){paste(single_code[match(x, upper_code)], sep="", collapse="")})
seq_combo <- lapply(seq(1, nrow(non_red)), function(x){as.character(non_red_seq[[x]][ligand_locs[[x]]])})
all.equal(ligand_combo, seq_combo)
## [1] TRUE

Yes, everything matches up nicely. So we should be good to go for the next step in examining InterProScan result overlaps with the ligands.

InterProScan Run

InterProScan v 5.7-48.0 was run:

interpro -i non_redundant_zinc.fa -f tsv -iprlookup

With the following hmm databases:

Get Ranges of Active Sites

For each chain, we will record the location of the ligands as positions in an IRanges instance that can be queried for overlap with the results from Interproscan.

ligand_ranges <- lapply(ligand_locs, function(x){IRanges(x, width = 1)})
names(ligand_ranges) <- non_red$chain.id

InterProScan Results

Now we need to get the data from the InterProScan data.

interproRes <- read.table(file.path(useDir, "non_redundant_zinc.fa_1.tsv"), sep="\t", header=FALSE, stringsAsFactors=FALSE, fill=TRUE, quote="", comment.char="")
names(interproRes) <- c("id", "md5", "length", "analysis", "sigAccession", "sigDescription", "start", "stop", "score", "status", "date", "iprAccession", "iprDescription")

iprData <- unique(interproRes[, c("iprAccession", "iprDescription")])
rownames(iprData) <- iprData$iprAccession

Next we will split this into a list based on the ID string, and then transform to ranges with associated meta data,

splitInter <- split(interproRes, interproRes$id)
splitRanges <- lapply(splitInter, interpro2ranges)

Before we proceed, lets first check how many of the original non-redundant entries actually have any results from InterProScan.

interpro_names <- unique(interproRes$id)
ligand_names <- unique(non_red$chain.id)

length(ligand_names)
## [1] 6859
length(interpro_names)
## [1] 6778
sum(ligand_names %in% interpro_names)
## [1] 6778
sum(interpro_names %in% ligand_names)
## [1] 6778
no_ipr <- length(ligand_names) - sum(ligand_names %in% interpro_names)

So, for whatever reason we had 81 chains out of the 6859 that did not have any InterProScan results.

Finally, we will subset the InterPro sites by overlap with the binding sites previously defined.

overlapRanges <- lapply(interpro_names, function(x){
  #print(x)
  subsetByOverlaps(splitRanges[[x]], ligand_ranges[[x]])
})
names(overlapRanges) <- interpro_names

Check how many actually subsetted at all:

splitRanges <- splitRanges[interpro_names]
orgLength <- sapply(splitRanges[interpro_names], length)
subsetLength <- sapply(overlapRanges[interpro_names], length)
sum(subsetLength < orgLength)
## [1] 4143

OK, so stuff did actually get subsetted. This is good.

Is there anything that had no InterProScan results intersecting the ligands?

has_no_intersect <- interpro_names[subsetLength == 0]
length(has_no_intersect)
## [1] 36
splitRanges[has_no_intersect[1:3]]
## $`3U79.H.110:2`
## IRanges of length 7
##     start end width
## [1]     4 109   106
## [2]     5 110   106
## [3]     4  91    88
## [4]    20  92    73
## [5]     4 108   105
## [6]     3 111   109
## [7]     3 111   109
## 
## $`2FAE.A.408:1`
## IRanges of length 9
##     start end width
## [1]    31  46    16
## [2]     3  73    71
## [3]     2  76    75
## [4]     3  76    74
## [5]     2  76    75
## [6]     2  76    75
## [7]     2  76    75
## [8]    40  75    36
## [9]     5  72    68
## 
## $`2FAE.A.406:1`
## IRanges of length 9
##     start end width
## [1]    31  46    16
## [2]     3  73    71
## [3]     2  76    75
## [4]     3  76    74
## [5]     2  76    75
## [6]     2  76    75
## [7]     2  76    75
## [8]    40  75    36
## [9]     5  72    68
ligand_ranges[has_no_intersect[1:3]]
## $`3U79.H.110:2`
## IRanges of length 2
##     start end width
## [1]     1   1     1
## [2]     1   1     1
## 
## $`2FAE.A.408:1`
## IRanges of length 2
##     start end width
## [1]     1   1     1
## [2]     1   1     1
## 
## $`2FAE.A.406:1`
## IRanges of length 1
##     start end width
## [1]    77  77     1

There is 36 chains where there is no intersection of the InterProScan results with the ligands. Lets remove those from our list.

has_intersect <- interpro_names[subsetLength > 0]
splitRanges <- splitRanges[has_intersect]
ligand_ranges <- ligand_ranges[has_intersect]

This leaves us with 6742 chains to work with.

Finally, all of the clustering is based on individual zinc sites. The functional analysis was done on the chains, so we need to go back through by site and collapse together any that have multiple chains.

non_red <- non_red[(non_red$chain.id %in% has_intersect),] # filter down to what we have
split_chain_id <- split(non_red$chain.id, non_red$zinc.id)
collapsed_regions <- lapply(split_chain_id, function(x){
  unlist(RangesList(splitRanges[x]))
})

Functional Analysis of Clusters

We are really looking for some association between the clusters by k-means on the angles and the functionality. So lets analyze each functional annotation and see what clusters are present.

We will start by generating the annotation to site information.

site_to_sigAccession <- lapply(collapsed_regions, function(x){
  tmpData <- mcols(x)
  tmpAcc <- unique(tmpData$sigAccession)
  tmpAcc[nchar(tmpAcc) > 0]
})
site_to_sigAccession <- (sapply(site_to_sigAccession, length) > 0) %>% site_to_sigAccession[.]
sig_to_site <- reverseSplit(site_to_sigAccession) %>% lapply(., unique)

site_to_iprAccession <- lapply(collapsed_regions, function(x){
  tmpData <- mcols(x)
  tmpIPR <- unique(tmpData$iprAccession)
  tmpIPR[nchar(tmpIPR) > 0]
})
site_to_iprAccession <- (sapply(site_to_iprAccession, length) > 0) %>% site_to_iprAccession[.]

ipr_to_site <- reverseSplit(site_to_iprAccession) %>% lapply(., unique)

Now for each cluster, we will calculate the percentage of each annotation in each cluster.

normal_clusters <- read.table(file.path(useDir, "normal.cluster.txt"), header = TRUE, sep = ",", stringsAsFactors = FALSE)
names(normal_clusters) <- c("row_id", "zinc_site", "cluster")
ipr_normal <- filterListFrame(ipr_to_site, normal_clusters, "zinc_site")

normal_cluster_count <- clusterContributions(ipr_normal$filterList, ipr_normal$filterFrame, "cluster", "zinc_site")

cluster_sizes <- sapply(split(normal_clusters$zinc_site, normal_clusters$cluster), length)
cluster_sizes2 <- sapply(split(ipr_normal$filterFrame$zinc_site, ipr_normal$filterFrame$cluster), length)

Hiearchical Clustering Of Functional Cluster Co-Variance

Using the cluster-percentages (calculated by clusterContributions above (nGene in Cluster / clusterSize / sum(proportions))), we can calculate a co-variance for each cluster pair (multiply), sum across all annotations. This sum becomes the value for the pair in a matrix, and normalize co-variances to 1, take 1-co-variance as a distance, and do hierarchical clustering on the result. This will result in clustering the clusters by annotation-cluster contribution.

normalCovariance <- clusterCovariance(normal_cluster_count$percentage)
normalDist <- 1 - normalCovariance
normalDist <- as.dist(normalDist)
normalHierarch <- hclust(normalDist)
plot(normalHierarch, main = "Normal Zinc Binding Sites", ylab = "Functional Cluster Distances", xlab = "Cluster", sub = "")

plot of chunk normalClustersByFunctional

Save the distances so that they can be compared with other stuff.

save(normalDist, file = file.path(useDir, "normal_functional_dist_16.RData"))

Compare with Structural

Comparison with the distances from the structural comparison.

load(file.path(useDir, "structural.dist.RData"))
dist.mat2 <- as.dist(dist.mat)

distances <- data.frame(func = as.vector(normalDist), struc = as.vector(dist.mat2))

corVal <- cor(distances$struc, distances$func, method = "spearman")
corVal
## [1] 0.6533995
pVal <- cor.test(distances$struc, distances$func, method = "spearman")$p.value
ggplot(distances, aes(x = struc, y = func)) + geom_point(size = 4) + xlab("Structural") + ylab("Functional") + ggtitle("Cluster Distances") + geom_text(data = NULL, aes(family="serif"), x = 50, y = 0.985, label = paste("rho = ", substr(as.character(corVal), 1, 4), " \n ", "p-value < 2.2e-16", sep = ""), size = 10) + theme(axis.text = element_text(size = 14), axis.title = element_text(size = 18), plot.title = element_text(size = 20))

plot of chunk compare_structural

structHierarch <- hclust(dist.mat2)
plot(structHierarch, main = "Normal Zinc Binding Sites", ylab = "Structural Cluster Distances", xlab = "Cluster", sub = "")

plot of chunk plot_structural

Other Clusterings

The above was an initial comparison using the normal sites and 16 clusters, the most stable grouping. However, there is also the compressed sites, and other stable clusters. So we are going to generate the functional distances for these other clusterings, and save them so that Sen can do some comparisons.

Here are the other normal clusters (10 and 6).

normal_10 <- read.table(file.path(useDir, "normal.cluster.10.txt"), header = TRUE, sep = " ", stringsAsFactors = FALSE)
names(normal_10) <- c("zinc_site", "cluster")
ipr_normal_10 <- filterListFrame(ipr_to_site, normal_10, "zinc_site")

normal_count_10 <- clusterContributions(ipr_normal_10$filterList, ipr_normal_10$filterFrame, "cluster", "zinc_site")
normal_cov_10 <- clusterCovariance(normal_count_10$percentage)
normal_dist_10 <- (1 - normal_cov_10) %>% as.dist(.)
plot(hclust(normal_dist_10))

plot of chunk normal_10

save(normal_dist_10, file = file.path(useDir, "normal_funct_dist_10.RData"))
normal_6 <- read.table(file.path(useDir, "normal.cluster.6.txt"), header = TRUE, sep = " ", stringsAsFactors = FALSE)
names(normal_6) <- c("zinc_site", "cluster")
ipr_normal_6 <- filterListFrame(ipr_to_site, normal_6, "zinc_site")

normal_count_6 <- clusterContributions(ipr_normal_6$filterList, ipr_normal_6$filterFrame, "cluster", "zinc_site")
normal_cov_6 <- clusterCovariance(normal_count_6$percentage)
normal_dist_6 <- (1 - normal_cov_6) %>% as.dist(.)
plot(hclust(normal_dist_6))

plot of chunk normal_6

save(normal_dist_6, file = file.path(useDir, "normal_funct_dist_6.RData"))

Now the compressed clusters.

compressed_13 <- read.table(file.path(useDir, "compressed.cluster.txt"), header = TRUE, sep = " ", stringsAsFactors = FALSE)
names(compressed_13) <- c("zinc_site", "cluster")
ipr_compressed_13 <- filterListFrame(ipr_to_site, compressed_13, "zinc_site")

compressed_count_13 <- clusterContributions(ipr_compressed_13$filterList, ipr_compressed_13$filterFrame, "cluster", "zinc_site")
compressed_cov_13 <- clusterCovariance(compressed_count_13$percentage)
compressed_dist_13 <- (1 - compressed_cov_13) %>% as.dist(.)
plot(hclust(compressed_dist_13))

plot of chunk compressed_13

save(compressed_dist_13, file = file.path(useDir, "compressed_funct_dist_13.RData"))
compressed_8 <- read.table(file.path(useDir, "compressed.cluster.8.txt"), header = TRUE, sep = " ", stringsAsFactors = FALSE)
names(compressed_8) <- c("zinc_site", "cluster")
ipr_compressed_8 <- filterListFrame(ipr_to_site, compressed_8, "zinc_site")

compressed_count_8 <- clusterContributions(ipr_compressed_8$filterList, ipr_compressed_8$filterFrame, "cluster", "zinc_site")
compressed_cov_8 <- clusterCovariance(compressed_count_8$percentage)
compressed_dist_8 <- (1 - compressed_cov_8) %>% as.dist(.)
plot(hclust(compressed_dist_8))

plot of chunk compressed_8

save(compressed_dist_8, file = file.path(useDir, "compressed_funct_dist_8.RData"))
library(rmfNotifier)
jobNotify("comparison done")
## [1] "rmf_notifier: @rmflight comparison done 2014-10-30 14:53:05"