library(atom2seq)
library(Biobase)
library(xtable)
library(igraph)
library(graph)
library(networkTools)
library(RCytoscape)
library(Biostrings)
library(ggplot2)

Binding Site Size

Based on the three and four AA ligand cases, how big (long) are the Zn binding sites?

useDir <- "/mlab/data/rmflight/Documents/projects/work/sen/coordination_families"
seq <- readAAStringSet(file.path(useDir, "sequences.SEQRES.3.txt"), use.names=TRUE)
atom <- readAAStringSet(file.path(useDir, "sequences.ATOM.4.txt"), use.names=TRUE)
load((file.path(useDir, "atom2seq.RData")))
length(seq)
## [1] 19787
length(atom)
## [1] 19787

Check Chains

Before we do any further processing, we need to check the chain IDs in the header to make sure that the chains are consistent across which sequence is output, and which chain the ligands belong to. If these chains don’t match across all, then it is not going to be easy (perhaps not impossible) to do the subsequent sequence analysis, and therefore we want to filter them out.

Note that we do not penalize those things where some of the ligands are non-AA and on a different chain.

chainStatus <- sapply(names(atom), checkChains)
keepRecords <- names(atom)[chainStatus]
atom <- atom[keepRecords]
seq <- seq[keepRecords]
outNum <- outNum[keepRecords]
length(atom)
## [1] 18199
length(seq)
## [1] 18199

Get the ligands and locations

Parse out the ligands and their locations.

ligands <- mclapply(names(outNum), parseFHeader, mc.cores=6)
names(ligands) <- names(outNum)
numFile <- file.path(useDir, "sequences.ATOM.num.txt")
atomnum <- readATOMNum(numFile)
atomWithNum <- names(atomnum)
keepAtom <- intersect(atomWithNum, keepRecords) # some atom records have no numbering??
atomnum <- atomnum[keepAtom]

atom <- atom[keepAtom]
seq <- seq[keepAtom]
numLength <- sapply(atomnum, length)
atomLength <- sapply(atom, length)
atomGR0 <- names(atomLength)[atomLength > 0]

atomnum <- atomnum[atomGR0]
atom <- atom[atomGR0]
seq <- seq[atomGR0]

outNum <- outNum[atomGR0]
ligands <- ligands[atomGR0]
length(atom)
## [1] 17253

Now start checking that the lengths of things are the same, and checking that locations of ligands are correct.

lengthMatch <- sapply(atomnum, length) == sapply(atom, length)
atomnum <- atomnum[lengthMatch]
atom <- atom[lengthMatch]
seq <- seq[lengthMatch]
outNum <- outNum[lengthMatch]
ligands <- ligands[lengthMatch]
length(atom)
## [1] 16963
useName <- "4A7Y.A.951|A343.ASP.OD1.O|A345.ASP.OD1.O|A347.ASP.OD1.O|A349.GLU.O.O|A351.GLU.OE2.O|A15"
checkLigands(ligands[[useName]], atomnum[[useName]], atom[[useName]])
## [1] TRUE
testNames <- names(ligands)
outCheck <- sapply(testNames, function(x){
  checkLigands(ligands[[x]], atomnum[[x]], atom[[x]])
})

sum(outCheck) / length(testNames)
## [1] 0.9958
sum(outCheck)
## [1] 16891

We are losing 72 things from not having matching ligands, these appear to be largely due to having locations completely outside the range of the ATOM numbering.

keepRecords <- testNames[outCheck]
atom <- atom[keepRecords]
seq <- seq[keepRecords]
atomnum <- atomnum[keepRecords]
outNum <- outNum[keepRecords]
ligands <- ligands[keepRecords]
countData <- countLigands(ligands)

Ranges!

From those things that all match up, we can easily define ranges from the ATOM ligands that define the binding site on the SEQRES sequence, and then define ranges for the sites that were found by InterProScan, and do a nice intersection between the two.

4 AA Ligands

nLigand <- 4
useData4 <- countData[(countData$count == nLigand) & (countData$nAA == nLigand),]
nrow(useData4)
## [1] 8948
seqRanges4 <- lapply(useData4$id, function(x){
  #print(x)
  ligandRanges(ligands[[x]], atomnum[[x]], outNum[[x]])
})
seqRanges4 <- do.call(c, seqRanges4)

3 AA Ligands

useData3 <- countData[(countData$count == nLigand) & (countData$nAA == 3),]
nrow(useData3)
## [1] 4083
seqRanges3 <- lapply(useData3$id, function(x){
  #print(x)
  ligandRanges(ligands[[x]], atomnum[[x]], outNum[[x]])
})
seqRanges3 <- do.call(c, seqRanges3)

Distribution

widthDist <- data.frame(width = c(width(seqRanges3), width(seqRanges4)),
                        nAA = rep(c("3", "4"), c(length(seqRanges3), length(seqRanges4))))
ggplot(widthDist, aes(x = width, fill = nAA)) + geom_histogram(position="identity", alpha=0.5)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk widthDist

ggplot(widthDist, aes(x = width, fill = nAA)) + geom_density(alpha=0.5)

plot of chunk widthDist

From these plots, it looks like we should definitely allow a binding site to be up to 200 AA long.