This Markdown will go through the Toy example used, but will highlight potential pitfalls DetHet can fall into, and why they occur (and thus how to avoid them)

Setup

Let’s set the seed first:

set.seed(12345)

And load necesarry packages:

library(lhs)
library(gridExtra)
library(ggplot2)

And load in required functions (for fitting and predicting HetGP and DetHetGP)

source("HetGP Fit.R")
source("HetGP Predict.R")
source("DetGP Fit.R")
source("DetGP Predict.R")
source("DetHetGP Fit.R")
source("DetHetGP Predict.R")

Simulator

Here is the toy simulator:

dimen <- 1 #1D problem

#toy simulator
simulator <- function(x){
  sin(pi+6*pi*x[,1])*(1-x[,1]) + log(0.2+x[,1])  + rnorm(length(x[,1]),0,1)*(1.2-x[,1])
}

And here is the deterministic approximation

#deterministic simulator
detsimulator <- function(x){
  sin(pi+6*pi*x[,1])*(1-x[,1]) + log(0.2+x[,1])  + 1*(1.2-x[,1])
}

And what points do we (eventually) want to predict for?

Npred <- 100 #how many prediction points
xpred <-as.matrix(seq(0,1,len = Npred)) #x locations

The first problem occurs if the Deterministic approximation is not informative

Let’s create a useless deterministic approximation

#useless deterministic simulator
bad_detsimulator <- function(x){
  log(0.1+4*x[,1])
}

Let’s generate the stochastic training data

N <- 50 #number of input points
x <- maximinLHS(N, dimen) #x values

#output data
yclean <- simulator(x)

#standardise:
meany <- mean(c(yclean)) #mean of data sample
sdy <- sd(c(yclean)) #standard deviation
y <- (yclean-meany)/sdy #standardised output

And the deterministic training data

Ndet <- 12 #number of input points
xdet <- maximinLHS(Ndet, dimen) #x values

#output data
ydetclean <- bad_detsimulator(xdet)

#standardise:
ydet <- (ydetclean-meany)/sdy

We can also simulate many, many, times to get the ``truth’’ (we could’ve also just get the truth analytically)

#########

Rep <- 100000 #number of repeats
Ntruth <- Npred*Rep #total number of inputs

xtruth <- do.call("rbind",replicate(Rep, xpred, simplify = FALSE)) #repeated x values

#output data
ytruth <- simulator(xtruth)

#standardise:
ytruth <- (ytruth-meany)/sdy

#means and variance of output (for each unique input value)
ytruth_mean <- rep(0, Npred)
ytruth_var <- rep(0, Npred)
for (i in 1:Npred){
  ytruth_mean[i] <- mean(ytruth[(1:Rep)*Npred - (Npred-i)]) #get mean
  ytruth_var[i] <- var(ytruth[(1:Rep)*Npred - (Npred-i)])   #get variance
}

Truefunc <-  data.frame(x=xpred, mean = ytruth_mean, var = ytruth_var) #put into data frame

DetHetGP

And now let’s fit DetHetGP

Fitting and Prediction

Lets fit DetHetGP and make predictions

parameterestimates <- DetHetGPFit(x, y, xdet, ydet) #Fit DetHetGP
samples <- DetHetGPPredict(x, y, xdet, ydet, parameterestimates, xpred, M = 1000) #make predictions (increasing M will reduce `spikeyness' of plot)
ysamples <- samples[[1]] #extract the DetHetGP predictions 

And we can generate the plots as used in the paper

#the plot of just the data
p1 <- ggplot() +
  geom_point(aes(x=x, y=y)) +
  geom_point(aes(x=xdet, y=ydet), shape = 3) +
  theme_minimal(base_size = 16) +
  ylab("y") +
  xlab("x")+
  ylim(-5.5,2.5) +
  xlim(0,1)


#The plot of the DetHetGP fit

GPinterDetHet <- data.frame(x=xpred, postmean = apply(ysamples,2,mean), postvar = apply(ysamples,2,var)) #create data frames for ggplot to use
GPdataDetHeth <- data.frame(x = x, y = y)
GPdataDetHetd <- data.frame(xdet=xdet, ydet=ydet)

#plot it 
p2 <- ggplot(data = GPinterDetHet) +
  geom_ribbon(data = GPinterDetHet, aes(x=x, ymin = (postmean -2*sqrt(postvar)), ymax = postmean + 2*sqrt(postvar), fill = "DetHet"), alpha = 0.15) +
  geom_ribbon(data = Truefunc, aes(x=x, ymin = (mean -2*sqrt(var)), ymax = mean + 2*sqrt(var), fill = "True"), alpha = 0.15) +
  geom_line(data = Truefunc, inherit.aes = FALSE, aes(x=x, y=mean, colour = "True")) +
  geom_line(data = GPinterDetHet, aes(x=x, y=postmean, colour = "DetHet")) +
  geom_point(data = GPdataDetHeth, aes(x=x, y=y, colour = "True")) +
  geom_point(data = GPdataDetHetd, aes(x=xdet, y=ydet, colour = "True"), shape = 3) +
  geom_line(data = GPinterDetHet, aes(x=x, y = apply(samples[[4]],2, mean), colour = "Det")) +
  geom_hline(yintercept = 0, linetype="dashed", colour = "black") +
  theme_minimal(base_size = 16) +
  ylab("y") +
  xlim(0,1)+
  ylim(-5.5,2.5) +
  scale_fill_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Det" = "#D55E00"), guide=FALSE) +
  scale_colour_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Adjust" = "#CC79A7", "Det" = "#D55E00"), guide=FALSE)


grid.arrange(p1, p2, nrow = 1) #plot both plots

The second problem occurs if too few data points are assigned to the deterministic approximation

Let’s generate the stochastic training data

set.seed(54321) # set the seed again (adjusting M earlier would change results here otherwise)
N <- 58 #number of input points (more than before, because fewer deterministic points)
x <- maximinLHS(N, dimen) #x values

#output data
yclean <- simulator(x)

#standardise:
meany <- mean(c(yclean)) #mean of data sample
sdy <- sd(c(yclean)) #standard deviation
y <- (yclean-meany)/sdy #standardised output

And the deterministic training data

Ndet <- 4 #number of input points (much less than before)
xdet <- maximinLHS(Ndet, dimen) #x values

#output data
ydetclean <- detsimulator(xdet)

#standardise:
ydet <- (ydetclean-meany)/sdy

We can also simulate many, many, times to get the ``truth’’ (we could’ve also just get the truth analytically)

#########

Rep <- 100000 #number of repeats
Ntruth <- Npred*Rep #total number of inputs

xtruth <- do.call("rbind",replicate(Rep, xpred, simplify = FALSE)) #repeated x values

#output data
ytruth <- simulator(xtruth)

#standardise:
ytruth <- (ytruth-meany)/sdy

#means and variance of output (for each unique input value)
ytruth_mean <- rep(0, Npred)
ytruth_var <- rep(0, Npred)
for (i in 1:Npred){
  ytruth_mean[i] <- mean(ytruth[(1:Rep)*Npred - (Npred-i)]) #get mean
  ytruth_var[i] <- var(ytruth[(1:Rep)*Npred - (Npred-i)])   #get variance
}

Truefunc <-  data.frame(x=xpred, mean = ytruth_mean, var = ytruth_var) #put into data frame

DetHetGP

And now let’s fit DetHetGP

Fitting and Prediction

Lets fit DetHetGP and make predictions

parameterestimates <- DetHetGPFit(x, y, xdet, ydet) #Fit DetHetGP
samples <- DetHetGPPredict(x, y, xdet, ydet, parameterestimates, xpred, M = 1000) #make predictions (increasing M will reduce `spikeyness' of plot)
ysamples <- samples[[1]] #extract the DetHetGP predictions 

And we can generate the plots as used in the paper

#the plot of just the data
p3 <- ggplot() +
  geom_point(aes(x=x, y=y)) +
  geom_point(aes(x=xdet, y=ydet), shape = 3) +
  theme_minimal(base_size = 16) +
  ylab("y") +
  xlab("x")+
  ylim(-5,2.5) +
  xlim(0,1)


#The plot of the DetHetGP fit

GPinterDetHet <- data.frame(x=xpred, postmean = apply(ysamples,2,mean), postvar = apply(ysamples,2,var)) #create data frames for ggplot to use
GPdataDetHeth <- data.frame(x = x, y = y)
GPdataDetHetd <- data.frame(xdet=xdet, ydet=ydet)

#plot it 
p4 <- ggplot(data = GPinterDetHet) +
  geom_ribbon(data = GPinterDetHet, aes(x=x, ymin = (postmean -2*sqrt(postvar)), ymax = postmean + 2*sqrt(postvar), fill = "DetHet"), alpha = 0.15) +
  geom_ribbon(data = Truefunc, aes(x=x, ymin = (mean -2*sqrt(var)), ymax = mean + 2*sqrt(var), fill = "True"), alpha = 0.15) +
  geom_line(data = Truefunc, inherit.aes = FALSE, aes(x=x, y=mean, colour = "True")) +
  geom_line(data = GPinterDetHet, aes(x=x, y=postmean, colour = "DetHet")) +
  geom_point(data = GPdataDetHeth, aes(x=x, y=y, colour = "True")) +
  geom_point(data = GPdataDetHetd, aes(x=xdet, y=ydet, colour = "True"), shape = 3) +
  geom_line(data = GPinterDetHet, aes(x=x, y = apply(samples[[4]],2, mean), colour = "Det")) +
  geom_hline(yintercept = 0, linetype="dashed", colour = "black") +
  theme_minimal(base_size = 16) +
  ylab("y") +
  xlim(0,1)+
  ylim(-5,2.5) +
  scale_fill_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Det" = "#D55E00"), guide=FALSE) +
  scale_colour_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Adjust" = "#CC79A7", "Det" = "#D55E00"), guide=FALSE)


grid.arrange(p3, p4, nrow = 1) #plot both plots

The third problem occurs if too few data points are assigned to the stochastic simulator

Let’s generate the stochastic training

set.seed(12321) # set the seed again (adjusting M earlier would change results here otherwise)
N <- 6 #number of input points (much less than before)
x <- maximinLHS(N, dimen) #x values

#output data
yclean <- simulator(x)

#standardise:
meany <- mean(c(yclean)) #mean of data sample
sdy <- sd(c(yclean)) #standard deviation
y <- (yclean-meany)/sdy #standardised output

And the deterministic training data

Ndet <- 56 #number of input points (more than before, because fewer stochastic points)
xdet <- maximinLHS(Ndet, dimen) #x values

#output data
ydetclean <- detsimulator(xdet)

#standardise:
ydet <- (ydetclean-meany)/sdy

We can also simulate many, many, times to get the ``truth’’ (we could’ve also just get the truth analytically)

#########

Rep <- 100000 #number of repeats
Ntruth <- Npred*Rep #total number of inputs

xtruth <- do.call("rbind",replicate(Rep, xpred, simplify = FALSE)) #repeated x values

#output data
ytruth <- simulator(xtruth)

#standardise:
ytruth <- (ytruth-meany)/sdy

#means and variance of output (for each unique input value)
ytruth_mean <- rep(0, Npred)
ytruth_var <- rep(0, Npred)
for (i in 1:Npred){
  ytruth_mean[i] <- mean(ytruth[(1:Rep)*Npred - (Npred-i)]) #get mean
  ytruth_var[i] <- var(ytruth[(1:Rep)*Npred - (Npred-i)])   #get variance
}

Truefunc <-  data.frame(x=xpred, mean = ytruth_mean, var = ytruth_var) #put into data frame

DetHetGP

And now let’s fit DetHetGP

Fitting and Prediction

Lets fit DetHetGP and make predictions

parameterestimates <- DetHetGPFit(x, y, xdet, ydet) #Fit DetHetGP
samples <- DetHetGPPredict(x, y, xdet, ydet, parameterestimates, xpred, M = 1000) #make predictions (increasing M will reduce `spikeyness' of plot)
ysamples <- samples[[1]] #extract the DetHetGP predictions 

And we can generate the plots as used in the paper

#the plot of just the data
p5 <- ggplot() +
  geom_point(aes(x=x, y=y)) +
  geom_point(aes(x=xdet, y=ydet), shape = 3) +
  theme_minimal(base_size = 16) +
  ylab("y") +
  xlab("x")+
  ylim(-12.5,7.5) +
  xlim(0,1)


#The plot of the DetHetGP fit

GPinterDetHet <- data.frame(x=xpred, postmean = apply(ysamples,2,mean), postvar = apply(ysamples,2,var)) #create data frames for ggplot to use
GPdataDetHeth <- data.frame(x = x, y = y)
GPdataDetHetd <- data.frame(xdet=xdet, ydet=ydet)

#plot it 
p6 <- ggplot(data = GPinterDetHet) +
  geom_ribbon(data = GPinterDetHet, aes(x=x, ymin = (postmean -2*sqrt(postvar)), ymax = postmean + 2*sqrt(postvar), fill = "DetHet"), alpha = 0.15) +
  geom_ribbon(data = Truefunc, aes(x=x, ymin = (mean -2*sqrt(var)), ymax = mean + 2*sqrt(var), fill = "True"), alpha = 0.15) +
  geom_line(data = Truefunc, inherit.aes = FALSE, aes(x=x, y=mean, colour = "True")) +
  geom_line(data = GPinterDetHet, aes(x=x, y=postmean, colour = "DetHet")) +
  geom_point(data = GPdataDetHeth, aes(x=x, y=y, colour = "True")) +
  geom_point(data = GPdataDetHetd, aes(x=xdet, y=ydet, colour = "True"), shape = 3) +
  geom_line(data = GPinterDetHet, aes(x=x, y = apply(samples[[4]],2, mean), colour = "Det")) +
  geom_hline(yintercept = 0, linetype="dashed", colour = "black") +
  theme_minimal(base_size = 16) +
  ylab("y") +
  xlim(0,1)+
  ylim(-12.5,7.5) +
  scale_fill_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Det" = "#D55E00"), guide=FALSE) +
  scale_colour_manual(values = c("True" = "#000000", "DetHet" = "#0072B2", "Het" = "#0072B2", "Adjust" = "#CC79A7", "Det" = "#D55E00"), guide=FALSE)


grid.arrange(p5, p6, nrow = 1) #plot both plots