Introduction

So, at the request of Punya’s lab I am going to reanalyze the existing .hmp file and do a hierarchical clustering analysis. I am going to post what was previously done directly under this post and comment on what was done previously.

#library
library(ggplot2)
library(rrBLUP)
library(parallel)
library(ape)
library(data.table)
library(phangorn)

#remove directory
rm(list = ls(all.names = TRUE))

#set wd
setwd("C:/Users/zwinn/OneDrive/Collaborator Research/CSU/CSU_WSS_Reanalysis")

#read hmp file
hap <- fread(file = "CSU.hmp.txt", header = T, check.names = F, stringsAsFactors = F)
dim(hap)

colnames(hap)[6:7]=c('alleleA', 'alleleB')

#getting allele states
a = substring(hap$alleles, 1, 1)
b = substring(hap$alleles, 3, 3)

#compute number of homozygous individuals for each allele
hap$alleleA = rowSums(hap[, 12:ncol(hap)] == a, na.rm = T)
hap$alleleB = rowSums(hap[, 12:ncol(hap)] == b, na.rm = T)

#convert to phyDat

sf<-as.phyDat(hap[, 12:ncol(hap)])

#Distance trees
dm  <- dist.ml(sf)
treeUPGMA  <- upgma(dm)
plot(treeUPGMA)

#Bootstrap
fun <- function(x) upgma(dist.ml(x))
sf_upgma <- bootstrap.phyDat(sf,  fun)
plotBS(treeUPGMA, sf_upgma)

#parsimony
parsimony(treeUPGMA,sf)
treeRatchet  <- pratchet(sf, trace = 0, minit=100)
parsimony(treeRatchet, sf)

treeRatchet  <- acctran(treeRatchet, sf)
treeRatchet  <- di2multi(treeRatchet)

if(inherits(treeRatchet, "multiPhylo")){
  treeRatchet <- unique(treeRatchet)
}
jpeg('CSU_Sawfly.jpg',width=1400,height=1250)
plotBS(midpoint(treeRatchet), type="phylogram")
dev.off()
a
add.scale.bar()

plotBS(midpoint(treeRatchet), type="phylogram")
add.scale.bar()

Commentary

So, from what I can read it looks like whoever wrote this code read in the “CSU.hmp.txt” file and then converted the matrix into a numeric coded matrix and then used the package “phangorn” to parse the genetic information to then produce a cladiogram. A few major concerns:

  1. Monomorphic markers
  2. Incomplete data
  3. Trying to compare genomes A, B, and D when some individuals only have A, some have A and B, and some only have D

So I am going to threshold only individuals who share the A genome. Then I am going to only keep the A genome. I will output this new hapmap file and I will then take that hapmap, fitler it, and then use Beagle to impute missing data

#library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#set wd
setwd("C:/Users/zwinn/OneDrive/Collaborator Research/CSU/CSU_WSS_Reanalysis")

#read hmp file
hap<-data.table::fread(file = "CSU.hmp.txt", header = T, check.names = F, stringsAsFactors = F)

#pull data
hap_head<-hap[,1:11]
hap_lines<-hap[,12:ncol(hap)]

#read in list of lines
check<-read.csv("phylo_tree_spp.csv") %>%
  filter(Genome_A=="Yes")

#pull lines
hap_lines<-hap_lines %>%
  select(all_of(check$name))

#bind back together
hap_sub<-cbind(hap_head, hap_lines)

#filter monomorphic sites amd only keep the A genome
hap_sub<-hap_sub %>%
  filter(alleles %in% alleles[grep("/", alleles)]) %>%
  filter(chrom %in% chrom[grep("A", chrom)])

#write out table
write.table(hap_sub,
            "CSU_Hapmap_Sub.hmp.txt",
            sep = "\t",
            quote = FALSE,
            row.names = FALSE)

Read in fixed VCF and do Hirarchical clustering

I have made a fixed VCF and now I am going to do HC with the following data

#read geno
geno<-gaston::read.vcf("CSU_Sub_Filt_Imp.vcf.gz",
                       convert.chr = FALSE)
## ped stats and snps stats have been set. 
## 'p' has been set. 
## 'mu' and 'sigma' have been set.
#pull marker matrix
geno<-gaston::as.matrix(geno)

#geno
geno<-as.data.frame(geno) %>%
  rownames_to_column("name")

#check
geno<-check %>%
  select(name,species_sub) %>%
  left_join(geno, by="name") %>%
  mutate(ID=paste(name, " (", species_sub, ")", sep = "")) %>%
  select(-name, -species_sub) %>%
  select(ID, everything())

#names
rownames(geno)<-geno$ID
geno<-as.matrix(geno[,-1])

#calculate distance matrix
geno_dist<-dist(geno, method = "euclidean")

#make graph
clust<-hclust(geno_dist, method = "ward.D2")
thresh<-cutree(clust,k=2)

#plot
plot(clust,
     main="Hierarchical Clustering of A Genome Progenitors",
     ylab="Height",
     xlab="Accession")
rect.hclust(clust,
            k=2,
            border = c("green4",
                       "red4"))

#plot
pdf("HC_of_A_Progenitors.pdf",
    width = 15,
    height = 8)

plot(clust,
     main="Hierarchical Clustering of A Genome Progenitors",
     ylab="Height",
     xlab="Accession")
# rect.hclust(clust, 
#             k=2,
#             border = c("green4",
#                        "red4"))
dev.off()
## png 
##   2