This Markdown will go through the Toy example used, but will highlight potential pitfalls DetHet can fall into, and why they occur (and thus how to avoid them)
Let’s set the seed first:
set.seed(12345)
And load necesarry packages:
library(lhs)
library(gridExtra)
library(ggplot2)
And load in required functions (for fitting and predicting HetGP and DetHetGP)
source("HetGP Fit.R")
source("HetGP Predict.R")
source("DetGP Fit.R")
source("DetGP Predict.R")
source("DetHetGP Fit.R")
source("DetHetGP Predict.R")
Here is the toy simulator:
dimen <- 1 #1D problem
#toy simulator
simulator <- function(x){
sin(pi+6*pi*x[,1])*(1-x[,1]) + log(0.2+x[,1]) + rnorm(length(x[,1]),0,1)*(1.2-x[,1])
}
And here is the deterministic approximation
#deterministic simulator
detsimulator <- function(x){
sin(pi+6*pi*x[,1])*(1-x[,1]) + log(0.2+x[,1]) + 1*(1.2-x[,1])
}
And what points do we (eventually) want to predict for?
Npred <- 100 #how many prediction points
xpred <-as.matrix(seq(0,1,len = Npred)) #x locations
Let’s create a useless deterministic approximation
#useless deterministic simulator
bad_detsimulator <- function(x){
log(0.1+4*x[,1])
}
Let’s generate the stochastic training data
N <- 50 #number of input points
x <- maximinLHS(N, dimen) #x values
#output data
yclean <- simulator(x)
#standardise:
meany <- mean(c(yclean)) #mean of data sample
sdy <- sd(c(yclean)) #standard deviation
y <- (yclean-meany)/sdy #standardised output
And the deterministic training data
Ndet <- 12 #number of input points
xdet <- maximinLHS(Ndet, dimen) #x values
#output data
ydetclean <- bad_detsimulator(xdet)
#standardise:
ydet <- (ydetclean-meany)/sdy
We can also simulate many, many, times to get the ``truth’’ (we could’ve also just get the truth analytically)
#########
Rep <- 100000 #number of repeats
Ntruth <- Npred*Rep #total number of inputs
xtruth <- do.call("rbind",replicate(Rep, xpred, simplify = FALSE)) #repeated x values
#output data
ytruth <- simulator(xtruth)
#standardise:
ytruth <- (ytruth-meany)/sdy
#means and variance of output (for each unique input value)
ytruth_mean <- rep(0, Npred)
ytruth_var <- rep(0, Npred)
for (i in 1:Npred){
ytruth_mean[i] <- mean(ytruth[(1:Rep)*Npred - (Npred-i)]) #get mean
ytruth_var[i] <- var(ytruth[(1:Rep)*Npred - (Npred-i)]) #get variance
}
Truefunc <- data.frame(x=xpred, mean = ytruth_mean, var = ytruth_var) #put into data frame
And now let’s fit DetHetGP
Lets fit DetHetGP and make predictions
parameterestimates <- DetHetGPFit(x, y, xdet, ydet) #Fit DetHetGP
samples <- DetHetGPPredict(x, y, xdet, ydet, parameterestimates, xpred, M = 1000) #make predictions (increasing M will reduce `spikeyness' of plot)
ysamples <- samples[[1]] #extract the DetHetGP predictions
And we can generate the plots as used in the paper
#the plot of just the data
p1 <- ggplot() +
geom_point(aes(x=x, y=y)) +
geom_point(aes(x=xdet, y=ydet), shape = 3) +
theme_minimal(base_size = 16) +
ylab("y") +
xlab("x")+
ylim(-5.5,2.5) +
xlim(0,1)
#The plot of the DetHetGP fit
GPinterDetHet <- data.frame(x=xpred, postmean = apply(ysamples,2,mean), postvar = apply(ysamples,2,var)) #create data frames for ggplot to use
GPdataDetHeth <- data.frame(x = x, y = y)
GPdataDetHetd <- data.frame(xdet=xdet, ydet=ydet)
#plot it
p2 <- ggplot(data = GPinterDetHet) +
geom_ribbon(data = GPinterDetHet, aes(x=x, ymin = (postmean -2*sqrt(postvar)), ymax = postmean + 2*sqrt(postvar), fill = "DetHet"), alpha = 0.15) +
geom_ribbon(data = Truefunc, aes(x=x, ymin = (mean -2*sqrt(var)), ymax = mean + 2*sqrt(var), fill = "True"), alpha = 0.15) +
geom_line(data = Truefunc, inherit.aes = FALSE, aes(x=x, y=mean, colour = "True")) +
geom_line(data = GPinterDetHet, aes(x=x, y=postmean, colour = "DetHet")) +
geom_point(data = GPdataDetHeth, aes(x=x, y=y, colour = "True")) +
geom_point(data = GPdataDetHetd, aes(x=xdet, y=ydet, colour = "True"), shape = 3) +
geom_line(data = GPinterDetHet, aes(x=x, y = apply(samples[[4]],2, mean), colour = "Det")) +
geom_hline(yintercept = 0, linetype="dashed", colour = "black") +
theme_minimal(base_size = 16) +
ylab("y") +
xlim(0,1)+
ylim(-5.5,2.5) +
scale_fill_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Det" = "#D55E00"), guide=FALSE) +
scale_colour_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Adjust" = "#CC79A7", "Det" = "#D55E00"), guide=FALSE)
grid.arrange(p1, p2, nrow = 1) #plot both plots
Let’s generate the stochastic training data
set.seed(54321) # set the seed again (adjusting M earlier would change results here otherwise)
N <- 58 #number of input points (more than before, because fewer deterministic points)
x <- maximinLHS(N, dimen) #x values
#output data
yclean <- simulator(x)
#standardise:
meany <- mean(c(yclean)) #mean of data sample
sdy <- sd(c(yclean)) #standard deviation
y <- (yclean-meany)/sdy #standardised output
And the deterministic training data
Ndet <- 4 #number of input points (much less than before)
xdet <- maximinLHS(Ndet, dimen) #x values
#output data
ydetclean <- detsimulator(xdet)
#standardise:
ydet <- (ydetclean-meany)/sdy
We can also simulate many, many, times to get the ``truth’’ (we could’ve also just get the truth analytically)
#########
Rep <- 100000 #number of repeats
Ntruth <- Npred*Rep #total number of inputs
xtruth <- do.call("rbind",replicate(Rep, xpred, simplify = FALSE)) #repeated x values
#output data
ytruth <- simulator(xtruth)
#standardise:
ytruth <- (ytruth-meany)/sdy
#means and variance of output (for each unique input value)
ytruth_mean <- rep(0, Npred)
ytruth_var <- rep(0, Npred)
for (i in 1:Npred){
ytruth_mean[i] <- mean(ytruth[(1:Rep)*Npred - (Npred-i)]) #get mean
ytruth_var[i] <- var(ytruth[(1:Rep)*Npred - (Npred-i)]) #get variance
}
Truefunc <- data.frame(x=xpred, mean = ytruth_mean, var = ytruth_var) #put into data frame
And now let’s fit DetHetGP
Lets fit DetHetGP and make predictions
parameterestimates <- DetHetGPFit(x, y, xdet, ydet) #Fit DetHetGP
samples <- DetHetGPPredict(x, y, xdet, ydet, parameterestimates, xpred, M = 1000) #make predictions (increasing M will reduce `spikeyness' of plot)
ysamples <- samples[[1]] #extract the DetHetGP predictions
And we can generate the plots as used in the paper
#the plot of just the data
p3 <- ggplot() +
geom_point(aes(x=x, y=y)) +
geom_point(aes(x=xdet, y=ydet), shape = 3) +
theme_minimal(base_size = 16) +
ylab("y") +
xlab("x")+
ylim(-5,2.5) +
xlim(0,1)
#The plot of the DetHetGP fit
GPinterDetHet <- data.frame(x=xpred, postmean = apply(ysamples,2,mean), postvar = apply(ysamples,2,var)) #create data frames for ggplot to use
GPdataDetHeth <- data.frame(x = x, y = y)
GPdataDetHetd <- data.frame(xdet=xdet, ydet=ydet)
#plot it
p4 <- ggplot(data = GPinterDetHet) +
geom_ribbon(data = GPinterDetHet, aes(x=x, ymin = (postmean -2*sqrt(postvar)), ymax = postmean + 2*sqrt(postvar), fill = "DetHet"), alpha = 0.15) +
geom_ribbon(data = Truefunc, aes(x=x, ymin = (mean -2*sqrt(var)), ymax = mean + 2*sqrt(var), fill = "True"), alpha = 0.15) +
geom_line(data = Truefunc, inherit.aes = FALSE, aes(x=x, y=mean, colour = "True")) +
geom_line(data = GPinterDetHet, aes(x=x, y=postmean, colour = "DetHet")) +
geom_point(data = GPdataDetHeth, aes(x=x, y=y, colour = "True")) +
geom_point(data = GPdataDetHetd, aes(x=xdet, y=ydet, colour = "True"), shape = 3) +
geom_line(data = GPinterDetHet, aes(x=x, y = apply(samples[[4]],2, mean), colour = "Det")) +
geom_hline(yintercept = 0, linetype="dashed", colour = "black") +
theme_minimal(base_size = 16) +
ylab("y") +
xlim(0,1)+
ylim(-5,2.5) +
scale_fill_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Det" = "#D55E00"), guide=FALSE) +
scale_colour_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Adjust" = "#CC79A7", "Det" = "#D55E00"), guide=FALSE)
grid.arrange(p3, p4, nrow = 1) #plot both plots
Let’s generate the stochastic training
set.seed(12321) # set the seed again (adjusting M earlier would change results here otherwise)
N <- 6 #number of input points (much less than before)
x <- maximinLHS(N, dimen) #x values
#output data
yclean <- simulator(x)
#standardise:
meany <- mean(c(yclean)) #mean of data sample
sdy <- sd(c(yclean)) #standard deviation
y <- (yclean-meany)/sdy #standardised output
And the deterministic training data
Ndet <- 56 #number of input points (more than before, because fewer stochastic points)
xdet <- maximinLHS(Ndet, dimen) #x values
#output data
ydetclean <- detsimulator(xdet)
#standardise:
ydet <- (ydetclean-meany)/sdy
We can also simulate many, many, times to get the ``truth’’ (we could’ve also just get the truth analytically)
#########
Rep <- 100000 #number of repeats
Ntruth <- Npred*Rep #total number of inputs
xtruth <- do.call("rbind",replicate(Rep, xpred, simplify = FALSE)) #repeated x values
#output data
ytruth <- simulator(xtruth)
#standardise:
ytruth <- (ytruth-meany)/sdy
#means and variance of output (for each unique input value)
ytruth_mean <- rep(0, Npred)
ytruth_var <- rep(0, Npred)
for (i in 1:Npred){
ytruth_mean[i] <- mean(ytruth[(1:Rep)*Npred - (Npred-i)]) #get mean
ytruth_var[i] <- var(ytruth[(1:Rep)*Npred - (Npred-i)]) #get variance
}
Truefunc <- data.frame(x=xpred, mean = ytruth_mean, var = ytruth_var) #put into data frame
And now let’s fit DetHetGP
Lets fit DetHetGP and make predictions
parameterestimates <- DetHetGPFit(x, y, xdet, ydet) #Fit DetHetGP
samples <- DetHetGPPredict(x, y, xdet, ydet, parameterestimates, xpred, M = 1000) #make predictions (increasing M will reduce `spikeyness' of plot)
ysamples <- samples[[1]] #extract the DetHetGP predictions
And we can generate the plots as used in the paper
#the plot of just the data
p5 <- ggplot() +
geom_point(aes(x=x, y=y)) +
geom_point(aes(x=xdet, y=ydet), shape = 3) +
theme_minimal(base_size = 16) +
ylab("y") +
xlab("x")+
ylim(-12.5,7.5) +
xlim(0,1)
#The plot of the DetHetGP fit
GPinterDetHet <- data.frame(x=xpred, postmean = apply(ysamples,2,mean), postvar = apply(ysamples,2,var)) #create data frames for ggplot to use
GPdataDetHeth <- data.frame(x = x, y = y)
GPdataDetHetd <- data.frame(xdet=xdet, ydet=ydet)
#plot it
p6 <- ggplot(data = GPinterDetHet) +
geom_ribbon(data = GPinterDetHet, aes(x=x, ymin = (postmean -2*sqrt(postvar)), ymax = postmean + 2*sqrt(postvar), fill = "DetHet"), alpha = 0.15) +
geom_ribbon(data = Truefunc, aes(x=x, ymin = (mean -2*sqrt(var)), ymax = mean + 2*sqrt(var), fill = "True"), alpha = 0.15) +
geom_line(data = Truefunc, inherit.aes = FALSE, aes(x=x, y=mean, colour = "True")) +
geom_line(data = GPinterDetHet, aes(x=x, y=postmean, colour = "DetHet")) +
geom_point(data = GPdataDetHeth, aes(x=x, y=y, colour = "True")) +
geom_point(data = GPdataDetHetd, aes(x=xdet, y=ydet, colour = "True"), shape = 3) +
geom_line(data = GPinterDetHet, aes(x=x, y = apply(samples[[4]],2, mean), colour = "Det")) +
geom_hline(yintercept = 0, linetype="dashed", colour = "black") +
theme_minimal(base_size = 16) +
ylab("y") +
xlim(0,1)+
ylim(-12.5,7.5) +
scale_fill_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Det" = "#D55E00"), guide=FALSE) +
scale_colour_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Adjust" = "#CC79A7", "Det" = "#D55E00"), guide=FALSE)
grid.arrange(p5, p6, nrow = 1) #plot both plots