So, at the request of Punya’s lab I am going to reanalyze the existing .hmp file and do a hierarchical clustering analysis. I am going to post what was previously done directly under this post and comment on what was done previously.
#library
library(ggplot2)
library(rrBLUP)
library(parallel)
library(ape)
library(data.table)
library(phangorn)
#remove directory
rm(list = ls(all.names = TRUE))
#set wd
setwd("C:/Users/zwinn/OneDrive/Collaborator Research/CSU/CSU_WSS_Reanalysis")
#read hmp file
hap <- fread(file = "CSU.hmp.txt", header = T, check.names = F, stringsAsFactors = F)
dim(hap)
colnames(hap)[6:7]=c('alleleA', 'alleleB')
#getting allele states
a = substring(hap$alleles, 1, 1)
b = substring(hap$alleles, 3, 3)
#compute number of homozygous individuals for each allele
hap$alleleA = rowSums(hap[, 12:ncol(hap)] == a, na.rm = T)
hap$alleleB = rowSums(hap[, 12:ncol(hap)] == b, na.rm = T)
#convert to phyDat
sf<-as.phyDat(hap[, 12:ncol(hap)])
#Distance trees
dm <- dist.ml(sf)
treeUPGMA <- upgma(dm)
plot(treeUPGMA)
#Bootstrap
fun <- function(x) upgma(dist.ml(x))
sf_upgma <- bootstrap.phyDat(sf, fun)
plotBS(treeUPGMA, sf_upgma)
#parsimony
parsimony(treeUPGMA,sf)
treeRatchet <- pratchet(sf, trace = 0, minit=100)
parsimony(treeRatchet, sf)
treeRatchet <- acctran(treeRatchet, sf)
treeRatchet <- di2multi(treeRatchet)
if(inherits(treeRatchet, "multiPhylo")){
treeRatchet <- unique(treeRatchet)
}
jpeg('CSU_Sawfly.jpg',width=1400,height=1250)
plotBS(midpoint(treeRatchet), type="phylogram")
dev.off()
a
add.scale.bar()
plotBS(midpoint(treeRatchet), type="phylogram")
add.scale.bar()
So, from what I can read it looks like whoever wrote this code read in the “CSU.hmp.txt” file and then converted the matrix into a numeric coded matrix and then used the package “phangorn” to parse the genetic information to then produce a cladiogram. A few major concerns:
So I am going to threshold only individuals who share the A genome. Then I am going to only keep the A genome. I will output this new hapmap file and I will then take that hapmap, fitler it, and then use Beagle to impute missing data
#library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#set wd
setwd("C:/Users/zwinn/OneDrive/Collaborator Research/CSU/CSU_WSS_Reanalysis")
#read hmp file
hap<-data.table::fread(file = "CSU.hmp.txt", header = T, check.names = F, stringsAsFactors = F)
#pull data
hap_head<-hap[,1:11]
hap_lines<-hap[,12:ncol(hap)]
#read in list of lines
check<-read.csv("phylo_tree_spp.csv") %>%
filter(Genome_A=="Yes")
#pull lines
hap_lines<-hap_lines %>%
select(all_of(check$name))
#bind back together
hap_sub<-cbind(hap_head, hap_lines)
#filter monomorphic sites amd only keep the A genome
hap_sub<-hap_sub %>%
filter(alleles %in% alleles[grep("/", alleles)]) %>%
filter(chrom %in% chrom[grep("A", chrom)])
#write out table
write.table(hap_sub,
"CSU_Hapmap_Sub.hmp.txt",
sep = "\t",
quote = FALSE,
row.names = FALSE)
I have made a fixed VCF and now I am going to do HC with the following data
#read geno
geno<-gaston::read.vcf("CSU_Sub_Filt_Imp.vcf.gz",
convert.chr = FALSE)
## ped stats and snps stats have been set.
## 'p' has been set.
## 'mu' and 'sigma' have been set.
#pull marker matrix
geno<-gaston::as.matrix(geno)
#geno
geno<-as.data.frame(geno) %>%
rownames_to_column("name")
#check
geno<-check %>%
select(name,species_sub) %>%
left_join(geno, by="name") %>%
mutate(ID=paste(name, " (", species_sub, ")", sep = "")) %>%
select(-name, -species_sub) %>%
select(ID, everything())
#names
rownames(geno)<-geno$ID
geno<-as.matrix(geno[,-1])
#calculate distance matrix
geno_dist<-dist(geno, method = "euclidean")
#make graph
clust<-hclust(geno_dist, method = "ward.D2")
thresh<-cutree(clust,k=2)
#plot
plot(clust,
main="Hierarchical Clustering of A Genome Progenitors",
ylab="Height",
xlab="Accession")
rect.hclust(clust,
k=2,
border = c("green4",
"red4"))
#plot
pdf("HC_of_A_Progenitors.pdf",
width = 15,
height = 8)
plot(clust,
main="Hierarchical Clustering of A Genome Progenitors",
ylab="Height",
xlab="Accession")
# rect.hclust(clust,
# k=2,
# border = c("green4",
# "red4"))
dev.off()
## png
## 2