library(atom2seq)
library(Biobase)
library(xtable)
library(igraph)
library(graph)
library(networkTools)
library(RCytoscape)
library(Biostrings)
library(magrittr)
library(IRanges)
library(ggplot2)
The problem:
PDB says that ATOM records (amino acid coordinates) should line up with the SEQRES records (amino acid sequence of the protein), but in many instances it appears that they do not. We need to be able to translate from the ATOM AA coordinates to the SEQRES AA coordinates.
Lets set up some mock instances to test our algorithm. This algorithm was used to generate the alignments used later.
# need to add a test for non-standards existing as insertion character.
atom <- AAStringSet(c(rmFrontAddBack="MDDTEKMSMKLT", addFrontRmBack="TSMDDTEKMSMK", insertion="MDDAATEKMSMKL", deletion="SMTEKMSMKL", indel="SMTEKMSMAAKL", nonAAInsert="SMDD--TEKMSMKL", nonAAInsertRmFront="MDD--TEKMSMKL"))
seq <- AAStringSet(c(rep("SMDDTEKMSMKL", 5), nonAAInsert="SMDDTEKMSMKL", nonAAInsertRmFront="SMDDTEKMSMKL"))
refNum <- list(rmFrontAddBack=c(2,3,4,5,6,7,8,9,10,11,12,0),
addFrontRmBack=c(0,1,2,3,4,5,6,7,8,9,10,11),
insertion=c(2,3,4,0,0,5,6,7,8,9,10,11,12),
deletion=c(1,2,5,6,7,8,9,10,11,12),
indel=c(1,2,5,6,7,8,9,10,0,0,11,12),
nonAAInsert=c(1,2,3,4,0,0,5,6,7,8,9,10,11,12),
nonAAInsertRmFront=c(2,3,4,0,0,5,6,7,8,9,10,11,12))
names(seq) <- names(atom)
useNames <- names(atom) # do a small sample for testing
outTest <- lapply(useNames, function(x){genAlign(atom[[x]], seq[[x]])})
outNum <- lapply(outTest, function(x){x$atomNum})
names(outNum) <- useNames
all.equal(outNum, refNum)
## [1] TRUE
Yes, our test appears to work, so lets start doing some work with the real data. Here is the code used to align the SEQRES and ATOM records. This is not run in the vignette (it takes a while).
useDir <- "/mlab/data/rmflight/Documents/projects/work/sen/coordination_families"
library(Biostrings)
seq <- readAAStringSet(file.path(useDir, "sequences.SEQRES.multi.txt"), use.names=TRUE)
atom <- readAAStringSet(file.path(useDir, "sequences.ATOM.multi.txt"), use.names=TRUE)
Lets do a sanity check, we should have the same PDB ID for all the names.
splitDot <- function(inNames){
allSplit <- strsplit(inNames, ".", fixed=TRUE)
outNames <- sapply(allSplit, function(x){x[[1]]})
}
seqNames <- splitDot(names(seq))
atomNames <- splitDot(names(atom))
all.equal(seqNames, atomNames)
seqSort <- sort(names(seq))
atomSort <- sort(names(atom))
all.equal(seqSort, atomSort)
OK, everything seems to match up nicely between the two. Lets see if we can successfully run on a subset of the sequences.
useNames <- sample(names(atom), 100)
outAlign <- mclapply(useNames, function(x){genAlign(atom[[x]], seq[[x]])}, mc.cores=6)
And it runs! Now lets run everything!
useNames <- names(atom)
outNum <- mclapply(useNames, function(x){
tmpRes <- genAlign(atom[[x]], seq[[x]])
tmpRes$atomNum
}, mc.cores=6)
names(outNum) <- useNames
save(outNum, file=file.path(useDir, "atom2seq.RData"))
After passing the mapping results back to Sen, she has supplied the non-redundant zinc sites with the number of chains and which chain, as well as the ligands, positions, and sequences. We will generate a fasta
file to run through InterProScan.
non_red <- read.table(file.path(useDir, "znlist.nonredundant.txt"), header = TRUE, sep = "\t", row.names = 1, stringsAsFactors = FALSE)
non_red$chain.id <- paste(non_red$zinc.id, non_red$chain.counts, sep = ":")
single_chains <- !duplicated(non_red$chain.id)
non_red <- non_red[single_chains,]
non_red_seq <- AAStringSet(non_red$sequence)
names(non_red_seq) <- non_red$chain.id
writeXStringSet(non_red_seq, file = file.path(useDir, "non_redundant_zinc.fa"))
We want to make sure that the ligands and locations actually match the locations in the sequences.
ligand_locs <- strsplit(non_red$ligand.position.combo, ".", fixed = TRUE) %>% lapply(., as.integer)
upper_code <- toupper(AMINO_ACID_CODE)
single_code <- names(upper_code)
ligand_combo <- strsplit(non_red$ligand.combo, ".", fixed = TRUE) %>% lapply(., function(x){paste(single_code[match(x, upper_code)], sep="", collapse="")})
seq_combo <- lapply(seq(1, nrow(non_red)), function(x){as.character(non_red_seq[[x]][ligand_locs[[x]]])})
all.equal(ligand_combo, seq_combo)
## [1] TRUE
Yes, everything matches up nicely. So we should be good to go for the next step in examining InterProScan result overlaps with the ligands.
InterProScan v 5.7-48.0 was run:
interpro -i non_redundant_zinc.fa -f tsv -iprlookup
With the following hmm databases:
For each chain, we will record the location of the ligands as positions in an IRanges
instance that can be queried for overlap with the results from Interproscan
.
ligand_ranges <- lapply(ligand_locs, function(x){IRanges(x, width = 1)})
names(ligand_ranges) <- non_red$chain.id
Now we need to get the data from the InterProScan data.
interproRes <- read.table(file.path(useDir, "non_redundant_zinc.fa_1.tsv"), sep="\t", header=FALSE, stringsAsFactors=FALSE, fill=TRUE, quote="", comment.char="")
names(interproRes) <- c("id", "md5", "length", "analysis", "sigAccession", "sigDescription", "start", "stop", "score", "status", "date", "iprAccession", "iprDescription")
iprData <- unique(interproRes[, c("iprAccession", "iprDescription")])
rownames(iprData) <- iprData$iprAccession
Next we will split this into a list based on the ID string, and then transform to ranges with associated meta data,
splitInter <- split(interproRes, interproRes$id)
splitRanges <- lapply(splitInter, interpro2ranges)
Before we proceed, lets first check how many of the original non-redundant entries actually have any results from InterProScan
.
interpro_names <- unique(interproRes$id)
ligand_names <- unique(non_red$chain.id)
length(ligand_names)
## [1] 6859
length(interpro_names)
## [1] 6778
sum(ligand_names %in% interpro_names)
## [1] 6778
sum(interpro_names %in% ligand_names)
## [1] 6778
no_ipr <- length(ligand_names) - sum(ligand_names %in% interpro_names)
So, for whatever reason we had 81 chains out of the 6859 that did not have any InterProScan
results.
Finally, we will subset the InterPro sites by overlap with the binding sites previously defined.
overlapRanges <- lapply(interpro_names, function(x){
#print(x)
subsetByOverlaps(splitRanges[[x]], ligand_ranges[[x]])
})
names(overlapRanges) <- interpro_names
Check how many actually subsetted at all:
splitRanges <- splitRanges[interpro_names]
orgLength <- sapply(splitRanges[interpro_names], length)
subsetLength <- sapply(overlapRanges[interpro_names], length)
sum(subsetLength < orgLength)
## [1] 4143
OK, so stuff did actually get subsetted. This is good.
Is there anything that had no InterProScan
results intersecting the ligands?
has_no_intersect <- interpro_names[subsetLength == 0]
length(has_no_intersect)
## [1] 36
splitRanges[has_no_intersect[1:3]]
## $`3U79.H.110:2`
## IRanges of length 7
## start end width
## [1] 4 109 106
## [2] 5 110 106
## [3] 4 91 88
## [4] 20 92 73
## [5] 4 108 105
## [6] 3 111 109
## [7] 3 111 109
##
## $`2FAE.A.408:1`
## IRanges of length 9
## start end width
## [1] 31 46 16
## [2] 3 73 71
## [3] 2 76 75
## [4] 3 76 74
## [5] 2 76 75
## [6] 2 76 75
## [7] 2 76 75
## [8] 40 75 36
## [9] 5 72 68
##
## $`2FAE.A.406:1`
## IRanges of length 9
## start end width
## [1] 31 46 16
## [2] 3 73 71
## [3] 2 76 75
## [4] 3 76 74
## [5] 2 76 75
## [6] 2 76 75
## [7] 2 76 75
## [8] 40 75 36
## [9] 5 72 68
ligand_ranges[has_no_intersect[1:3]]
## $`3U79.H.110:2`
## IRanges of length 2
## start end width
## [1] 1 1 1
## [2] 1 1 1
##
## $`2FAE.A.408:1`
## IRanges of length 2
## start end width
## [1] 1 1 1
## [2] 1 1 1
##
## $`2FAE.A.406:1`
## IRanges of length 1
## start end width
## [1] 77 77 1
There is 36 chains where there is no intersection of the InterProScan
results with the ligands. Lets remove those from our list.
has_intersect <- interpro_names[subsetLength > 0]
splitRanges <- splitRanges[has_intersect]
ligand_ranges <- ligand_ranges[has_intersect]
This leaves us with 6742 chains to work with.
Finally, all of the clustering is based on individual zinc sites. The functional analysis was done on the chains, so we need to go back through by site and collapse together any that have multiple chains.
non_red <- non_red[(non_red$chain.id %in% has_intersect),] # filter down to what we have
split_chain_id <- split(non_red$chain.id, non_red$zinc.id)
collapsed_regions <- lapply(split_chain_id, function(x){
unlist(RangesList(splitRanges[x]))
})
We are really looking for some association between the clusters by k-means on the angles and the functionality. So lets analyze each functional annotation and see what clusters are present.
We will start by generating the annotation to site information.
site_to_sigAccession <- lapply(collapsed_regions, function(x){
tmpData <- mcols(x)
tmpAcc <- unique(tmpData$sigAccession)
tmpAcc[nchar(tmpAcc) > 0]
})
site_to_sigAccession <- (sapply(site_to_sigAccession, length) > 0) %>% site_to_sigAccession[.]
sig_to_site <- reverseSplit(site_to_sigAccession) %>% lapply(., unique)
site_to_iprAccession <- lapply(collapsed_regions, function(x){
tmpData <- mcols(x)
tmpIPR <- unique(tmpData$iprAccession)
tmpIPR[nchar(tmpIPR) > 0]
})
site_to_iprAccession <- (sapply(site_to_iprAccession, length) > 0) %>% site_to_iprAccession[.]
ipr_to_site <- reverseSplit(site_to_iprAccession) %>% lapply(., unique)
Now for each cluster, we will calculate the percentage of each annotation in each cluster.
normal_clusters <- read.table(file.path(useDir, "normal.cluster.txt"), header = TRUE, sep = ",", stringsAsFactors = FALSE)
names(normal_clusters) <- c("row_id", "zinc_site", "cluster")
ipr_normal <- filterListFrame(ipr_to_site, normal_clusters, "zinc_site")
normal_cluster_count <- clusterContributions(ipr_normal$filterList, ipr_normal$filterFrame, "cluster", "zinc_site")
cluster_sizes <- sapply(split(normal_clusters$zinc_site, normal_clusters$cluster), length)
cluster_sizes2 <- sapply(split(ipr_normal$filterFrame$zinc_site, ipr_normal$filterFrame$cluster), length)
Using the cluster-percentages (calculated by clusterContributions
above (nGene in Cluster / clusterSize / sum(proportions))), we can calculate a co-variance for each cluster pair (multiply), sum across all annotations. This sum becomes the value for the pair in a matrix, and normalize co-variances to 1, take 1-co-variance as a distance, and do hierarchical clustering on the result. This will result in clustering the clusters by annotation-cluster contribution.
normalCovariance <- clusterCovariance(normal_cluster_count$percentage)
normalDist <- 1 - normalCovariance
normalDist <- as.dist(normalDist)
normalHierarch <- hclust(normalDist)
plot(normalHierarch, main = "Normal Zinc Binding Sites", ylab = "Functional Cluster Distances", xlab = "Cluster", sub = "")
Save the distances so that they can be compared with other stuff.
save(normalDist, file = file.path(useDir, "normal_functional_dist_16.RData"))
Comparison with the distances from the structural comparison.
load(file.path(useDir, "structural.dist.RData"))
dist.mat2 <- as.dist(dist.mat)
distances <- data.frame(func = as.vector(normalDist), struc = as.vector(dist.mat2))
corVal <- cor(distances$struc, distances$func, method = "spearman")
corVal
## [1] 0.6533995
pVal <- cor.test(distances$struc, distances$func, method = "spearman")$p.value
ggplot(distances, aes(x = struc, y = func)) + geom_point(size = 4) + xlab("Structural") + ylab("Functional") + ggtitle("Cluster Distances") + geom_text(data = NULL, aes(family="serif"), x = 50, y = 0.985, label = paste("rho = ", substr(as.character(corVal), 1, 4), " \n ", "p-value < 2.2e-16", sep = ""), size = 10) + theme(axis.text = element_text(size = 14), axis.title = element_text(size = 18), plot.title = element_text(size = 20))
structHierarch <- hclust(dist.mat2)
plot(structHierarch, main = "Normal Zinc Binding Sites", ylab = "Structural Cluster Distances", xlab = "Cluster", sub = "")
The above was an initial comparison using the normal sites and 16 clusters, the most stable grouping. However, there is also the compressed sites, and other stable clusters. So we are going to generate the functional distances for these other clusterings, and save them so that Sen can do some comparisons.
Here are the other normal clusters (10 and 6).
normal_10 <- read.table(file.path(useDir, "normal.cluster.10.txt"), header = TRUE, sep = " ", stringsAsFactors = FALSE)
names(normal_10) <- c("zinc_site", "cluster")
ipr_normal_10 <- filterListFrame(ipr_to_site, normal_10, "zinc_site")
normal_count_10 <- clusterContributions(ipr_normal_10$filterList, ipr_normal_10$filterFrame, "cluster", "zinc_site")
normal_cov_10 <- clusterCovariance(normal_count_10$percentage)
normal_dist_10 <- (1 - normal_cov_10) %>% as.dist(.)
plot(hclust(normal_dist_10))
save(normal_dist_10, file = file.path(useDir, "normal_funct_dist_10.RData"))
normal_6 <- read.table(file.path(useDir, "normal.cluster.6.txt"), header = TRUE, sep = " ", stringsAsFactors = FALSE)
names(normal_6) <- c("zinc_site", "cluster")
ipr_normal_6 <- filterListFrame(ipr_to_site, normal_6, "zinc_site")
normal_count_6 <- clusterContributions(ipr_normal_6$filterList, ipr_normal_6$filterFrame, "cluster", "zinc_site")
normal_cov_6 <- clusterCovariance(normal_count_6$percentage)
normal_dist_6 <- (1 - normal_cov_6) %>% as.dist(.)
plot(hclust(normal_dist_6))
save(normal_dist_6, file = file.path(useDir, "normal_funct_dist_6.RData"))
Now the compressed clusters.
compressed_13 <- read.table(file.path(useDir, "compressed.cluster.txt"), header = TRUE, sep = " ", stringsAsFactors = FALSE)
names(compressed_13) <- c("zinc_site", "cluster")
ipr_compressed_13 <- filterListFrame(ipr_to_site, compressed_13, "zinc_site")
compressed_count_13 <- clusterContributions(ipr_compressed_13$filterList, ipr_compressed_13$filterFrame, "cluster", "zinc_site")
compressed_cov_13 <- clusterCovariance(compressed_count_13$percentage)
compressed_dist_13 <- (1 - compressed_cov_13) %>% as.dist(.)
plot(hclust(compressed_dist_13))
save(compressed_dist_13, file = file.path(useDir, "compressed_funct_dist_13.RData"))
compressed_8 <- read.table(file.path(useDir, "compressed.cluster.8.txt"), header = TRUE, sep = " ", stringsAsFactors = FALSE)
names(compressed_8) <- c("zinc_site", "cluster")
ipr_compressed_8 <- filterListFrame(ipr_to_site, compressed_8, "zinc_site")
compressed_count_8 <- clusterContributions(ipr_compressed_8$filterList, ipr_compressed_8$filterFrame, "cluster", "zinc_site")
compressed_cov_8 <- clusterCovariance(compressed_count_8$percentage)
compressed_dist_8 <- (1 - compressed_cov_8) %>% as.dist(.)
plot(hclust(compressed_dist_8))
save(compressed_dist_8, file = file.path(useDir, "compressed_funct_dist_8.RData"))
library(rmfNotifier)
jobNotify("comparison done")
## [1] "rmf_notifier: @rmflight comparison done 2014-10-30 14:53:05"