--- title: "GPvecchia tutorial" author: "Marcin Jurek, Daniel Zilber, Matthias Katzfuss" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{GPvecchia} % \VignetteDepends{mvtnorm} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.dim=c(7,5) ) ``` In this vignette, we will demonstrate the main capabilities of the `GPvecchia` package. These include estimating parameters and making spatial predictions. We also show how to use the package for processing non-linear, non-Gaussian data by combining the Vecchia with a Laplace approximation. We start by importing the `GPvecchia` library. ```{r} library(GPvecchia) library(Matrix) library(fields) ``` ### Simulating data for illustration To illustrate our method, we simulate a small dataset. First, consider a unit square and randomly select observation locations. (Set spatial.dim=1 to consider data on the one-dimensional unit interval.) ```{r} set.seed(1988) spatial.dim=2 n=50 if(spatial.dim==1){ locs=matrix(runif(n),ncol=1) } else { locs <- cbind(runif(n),runif(n)) } ``` Next, we define the covariance function of the field as well as the scale of the measurement error ```{r} beta=2 sig2=1; range=.1; smooth=1.5 covparms =c(sig2,range,smooth) covfun <- function(locs) sig2*MaternFun(fields::rdist(locs),covparms) nuggets=rep(.1,n) ``` We are now ready to simulate the field and visualize it as a sanity check. ```{r fig4, out.width = '400px'} Om0 <- covfun(locs)+diag(nuggets) z=as.numeric(t(chol(Om0))%*%rnorm(n)) data=z+beta # plot simulated data if(spatial.dim==1) { plot(locs,data) } else { fields::quilt.plot(locs,data, nx=n, ny=n) } ``` We also create a grid of $n.p$ locations at which we would like to make predictions. ```{r} n.p=100 if(spatial.dim==1){ # 1-D case locs.pred=matrix(seq(0,1,length=n.p),ncol=1) } else { # 2-D case grid.oneside=seq(0,1,length=round(sqrt(n.p))) locs.pred=as.matrix(expand.grid(grid.oneside,grid.oneside)) # grid of pred.locs } n.p=nrow(locs.pred) ``` ### Basic functions for parameter estimation and prediction Let us now estimate the mean and covariance parameters using the default settings, which assume a spatially constant mean or trend, and a Matern covariance structure. Note that the following code might take a minute or so to run. ```{r paramm-est, cache=TRUE} vecchia.est=vecchia_estimate(data,locs,,output.level=0) ``` Based on these parameter estimates, we can then make predictions at the grid of locations we had specified above. ```{r} preds=vecchia_pred(vecchia.est,locs.pred) ``` Finally, we compare the approximate predictions with the best possible ones (i.e. those obtained using analytic expressions for conditional mean in the Gaussian distribution). 
```{r}
## exact prediction
mu.exact=as.numeric(covfun(rbind(locs,locs.pred))[,1:n]%*%solve(Om0,data-beta))+beta
cov.exact=covfun(rbind(locs,locs.pred))-
  covfun(rbind(locs,locs.pred))[,1:n]%*%solve(Om0,t(covfun(rbind(locs,locs.pred))[,1:n]))
var.exact=diag(cov.exact)
cov.exact.pred=cov.exact[n+(1:n.p),n+(1:n.p)]

### plot Vecchia and exact predictions
if(spatial.dim==1) {
  plot(locs,z)
  lines(locs.pred,preds$mean.pred,col='blue')
  lines(locs.pred,preds$mean.pred-1.96*sqrt(preds$var.pred),col='blue',lty=2)
  lines(locs.pred,preds$mean.pred+1.96*sqrt(preds$var.pred),col='blue',lty=2)
  lines(locs.pred,mu.exact[n+(1:n.p)],col='red')
  lines(locs.pred,mu.exact[n+(1:n.p)]-1.96*sqrt(var.exact[n+(1:n.p)]),col='red',lty=2)
  lines(locs.pred,mu.exact[n+(1:n.p)]+1.96*sqrt(var.exact[n+(1:n.p)]),col='red',lty=2)
} else {
  sdrange=range(sqrt(c(preds$var.pred,var.exact[n+(1:n.p)])))
  defpar = par(mfrow=c(2,3))
  fields::quilt.plot(locs,z, nx=sqrt(n.p), ny=sqrt(n.p))
  fields::quilt.plot(locs.pred,preds$mean.pred, nx=sqrt(n.p), ny=sqrt(n.p))
  fields::quilt.plot(locs.pred,sqrt(preds$var.pred),zlim=sdrange, nx=sqrt(n.p), ny=sqrt(n.p))
  fields::quilt.plot(locs,z, nx=sqrt(n.p), ny=sqrt(n.p))
  fields::quilt.plot(locs.pred,mu.exact[n+(1:n.p)], nx=sqrt(n.p), ny=sqrt(n.p))
  fields::quilt.plot(locs.pred,sqrt(var.exact[n+(1:n.p)]),zlim=sdrange, nx=sqrt(n.p), ny=sqrt(n.p))
  par(defpar)
}
```

### More details on likelihood evaluation

Let us take a closer look at how the likelihood is evaluated using the Vecchia approximation. Most importantly, we can specify the parameter $m$. Its value determines the number of "neighbours" of each point, or, in other words, how many other points a given point conditions on. The larger this parameter, the more accurate, but also the more expensive, the approximation will be.

```{r}
m=20
vecchia.approx=vecchia_specify(locs,m)
vecchia_likelihood(z,vecchia.approx,covparms,nuggets)
```

Note that the function `vecchia_specify` determines the general properties of the approximation, but it does not depend on the data or on the specific parameter values. Hence, it does not have to be re-run when searching over different parameter values in an estimation procedure.

We can also compare the results to the exact likelihood:

```{r}
library(mvtnorm)
dmvnorm(z,mean=rep(0,n),sigma=Om0,log=TRUE)
```

In this case the approximation is very good. In general, $m=20$ is a good value, and $m$ should usually be between 10 and 40. For one-dimensional space, we can get good approximations even with $m=5$ or smaller.

### More details on spatial prediction

As in the previous section, we first specify the approximation and indicate at which locations predictions are desired.

```{r}
m=30
vecchia.approx=vecchia_specify(locs,m,locs.pred=locs.pred)
preds=vecchia_prediction(z,vecchia.approx,covparms,nuggets)
# returns a list with elements mu.pred,mu.obs,var.pred,var.obs,V.ord
```

It is also possible to compute and plot the entire predictive covariance matrix. We do this here only for the purpose of illustration; if $n.p$ is very large, this matrix might use up a lot of memory, and we generally do not recommend computing it directly.

```{r}
Sigma=V2covmat(preds)$Sigma.pred
cov.range=quantile(rbind(Sigma,cov.exact.pred),c(.01,.99))
defpar = par(mfrow=c(1,2))
fields::image.plot(cov.exact.pred,zlim=cov.range)
fields::image.plot(Sigma,zlim=cov.range)
par(defpar)
```

#### Linear combinations

We might sometimes be interested in a linear combination of the predicted values. In particular, we can limit our attention to only a subset of our predictions.
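
For a matrix of coefficients $H$ (one row per linear combination), the quantities of interest are the posterior mean and covariance of $Hy$, where $y$ is the joint latent vector at the observed and prediction locations with posterior mean $\mu$ and posterior covariance $\Sigma^{\mathrm{post}}$:

$$
\mathrm{E}(Hy) = H\mu, \qquad \mathrm{Var}(Hy) = H\,\Sigma^{\mathrm{post}}H^\top.
$$

The `vecchia_lincomb()` function evaluates these quantities directly from the objects `preds$U.obj` and `preds$V.ord` returned by `vecchia_prediction()`.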
This can be accomplished by specifying the linear combination coefficients as a matrix. As an example, we assume we are only interested in predictions at the unobserved prediction locations (not at the first $n$ observed locations):

```{r}
H=Matrix::sparseMatrix(i=1:(n+n.p),j=1:(n+n.p),x=1)[(n+1):(n+n.p),]

# compute variances of Hy
lincomb.vars=vecchia_lincomb(H,preds$U.obj,preds$V.ord)
plot(preds$var.pred,lincomb.vars)
```

As another example, we consider the overall mean of the process at all prediction locations. Using the `vecchia_lincomb()` function enables us to get the variance estimate easily.

```{r}
mean(preds$mu.pred)

# compute entire covariance matrix of Hy (here, 1x1)
H=Matrix::sparseMatrix(i=rep(1,n.p),j=n+(1:n.p),x=1/n.p)
lincomb.cov=vecchia_lincomb(H,preds$U.obj,preds$V.ord,cov.mat=TRUE)
```

### Other GP approximations as special cases

By specifying appropriate options in `vecchia_specify`, we can do everything described above for several other GP approximations, including the modified predictive process, the full-scale approximation (FSA), the multi-resolution approximation (MRA), and latent and standard Vecchia approximations.

Setting $M=1$ results in a **block full-scale approximation**, specifically one with $r_0 = \frac{m}{2}$ knots spread over the entire domain and the remaining locations partitioned into blocks of size $<\frac{m}{2}+1$.

```{r}
m=20
mra.options.fulls=list(M=1)
blockFS = vecchia_specify(locs, m, 'maxmin', conditioning='mra',
                          mra.options=mra.options.fulls, verbose=TRUE)
```

Another popular existing approximation method, the **modified predictive process (MPP)**, can also be obtained by specifying appropriate parameter settings:

```{r}
mra.options.mpproc=list(r=c(m,1))
MPP = vecchia_specify(locs, m, 'maxmin', conditioning='mra',
                      mra.options=mra.options.mpproc, verbose=TRUE)
```

As we can see, the MPP can be viewed as a special case of the **multi-resolution approximation (MRA)**. A general MRA is obtained by specifying all three of its parameters:

```{r}
mra.options.mra = list(r=c(10, 5, 5), M=2, J=2)
MRA_rJM = vecchia_specify(locs, m, 'maxmin', conditioning='mra',
                          mra.options=mra.options.mra, verbose=TRUE)
```

There are two things to note about this full specification of an MRA. First, providing all three of $r$, $J$ and $M$ overrides whatever value of $m$ was provided. Second, in order to be able to place a knot at each point of the grid, the provided parameters might need to be adjusted.

Finally, we can also use the `GPvecchia` package to specify a **Nearest Neighbour Gaussian Process (NNGP)** approximation. This can be accomplished as shown below.

```{r}
NNGP = vecchia_specify(locs, m, cond.yz='y')
```

We can now easily compare the log-likelihoods obtained with these different approximation methods to the standard (SGV) specification and to the exact likelihood.

```{r}
vecchia_likelihood(z,blockFS,covparms,nuggets)
vecchia_likelihood(z,MPP,covparms,nuggets)
vecchia_likelihood(z,MRA_rJM,covparms,nuggets)
vecchia_likelihood(z,NNGP,covparms,nuggets)
vecchia_likelihood(z, vecchia_specify(locs, m), covparms, nuggets)
dmvnorm(z,mean=rep(0,n),sigma=Om0,log=TRUE)
```

### Non-Gaussian data

Here we demonstrate how `GPvecchia` can fit a latent model to non-Gaussian data using the Vecchia-Laplace method. We simulate data by first generating a correlated latent field without noise, assuming the same covariance and locations as generated earlier:

```{r}
# simulate latent process
y=as.numeric(t(chol(Om0))%*%rnorm(n))
```

Then we sample a single non-Gaussian value for each latent value. The variability introduced by this sampling induces heteroskedasticity, in contrast to the constant noise added in the Gaussian case.
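
In terms of the latent field $y$, the data models simulated in the chunk below correspond to

$$
z_i \mid y_i \sim \mathrm{Poisson}\left(e^{y_i}\right),
\qquad
z_i \mid y_i \sim \mathrm{Bernoulli}\left(\frac{e^{y_i}}{1+e^{y_i}}\right),
\qquad
z_i \mid y_i \sim \mathrm{Gamma}\left(\alpha,\ \alpha e^{-y_i}\right),
$$

for count, binary (logistic) and right-skewed positive data, respectively; the gamma distribution is parameterized by shape and rate, with the shape $\alpha$ taken from `default_lh_params`.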
Below we use a logistic model for binary data, but there are implementations for count data and continuous positive (right-skewed) data as well.

```{r}
data.model = "logistic"

# simulate data
if(data.model=='poisson'){
  z = rpois(n, exp(y))
} else if(data.model=='logistic'){
  z = rbinom(n,1,prob = exp(y)/(1+exp(y)))
} else if(data.model=='gamma'){
  z = rgamma(n, shape = default_lh_params$alpha, rate = default_lh_params$alpha*exp(-y))
} else {
  print('Error: Distribution not implemented yet.')
}

# plot simulated data, 1 or 2D
defpar = par(mfrow=c(1,2))
if(spatial.dim==1) {
  plot(locs,y, main = "latent")
  plot(locs,z, main = "observed")
} else {
  fields::quilt.plot(locs,y, main = "Latent")
  fields::quilt.plot(locs,z, main = "Observed")
}
par(defpar)
```

Given the simulated data, we can now efficiently estimate the latent field by specifying the number of conditioning points $m$ described earlier. Interweaved ordering is best for 1-D data, while response-first ('zy') ordering is best for higher dimensions.

```{r}
m=10
if(spatial.dim==1){
  vecchia.approx=vecchia_specify(locs,m) #IW ordering
} else {
  vecchia.approx=vecchia_specify(locs,m,cond.yz='zy') #RF ordering
}
```

With the approximated covariance structure, we can calculate the posterior estimate of the latent field using the Vecchia-Laplace method and plot the result. Even with a small value of $m$, the result is very close to that of a pure Laplace approximation, but it is obtained at a much lower cost.

```{r}
posterior = calculate_posterior_VL(z,vecchia.approx,likelihood_model=data.model,
                                   covparms = covparms)
if (spatial.dim==1){
  par(mfrow=c(1,1))
  ord = order(locs) # order so that lines appear correctly
  y_limits = c(min(y, posterior$mean[ord]), max(y, posterior$mean[ord]))
  plot(locs[ord], y[ord], type = "l", ylim = y_limits)
  lines(locs[ord], posterior$mean[ord], type = "l", col=3, lwd=3)
  legend("bottomright", legend = c("Latent", "VL"), col= c(1,3), lwd=c(1,3))
} else if (spatial.dim==2){
  defpar = par(mfrow=c(1,2))
  # ordering unnecessary; we are using a scatter plot rather than lines
  quilt.plot(locs, y, main= "Truth")
  quilt.plot(locs, posterior$mean, main= "VL m=10")
  par(defpar)
}
```

### Non-Gaussian predictions

Predictions are computed as before, using Vecchia-Laplace methods where needed.

```{r}
###### specify prediction locations #######
n.p=30^2
if(spatial.dim==1){  # 1-D case
  locs.pred=matrix(seq(0,1,length=n.p),ncol=1)
} else {             # 2-D case
  grid.oneside=seq(0,1,length=round(sqrt(n.p)))
  locs.pred=as.matrix(expand.grid(grid.oneside,grid.oneside)) # grid of pred.locs
}
n.p=nrow(locs.pred)

###### specify Vecchia approximation #######
vecchia.approx.pred = vecchia_specify(locs, m=10, locs.pred=locs.pred)

### carry out prediction
preds = vecchia_laplace_prediction(posterior, vecchia.approx.pred, covparms)

# plotting predictions
if (spatial.dim==1){
  defpar = par(mfrow=c(1,1))
  ord = order(locs) # order so that lines appear correctly
  plot(locs[ord], y[ord], type = "l", xlim=c(0,1.2), ylim = c(-1,3))
  lines(locs, posterior$mean, type = "p", col=4, lwd=3, lty=1)
  lines(locs.pred, preds$mu.pred, type = "l", col=3, lwd=3, lty=1)
  lines(locs.pred,preds$mu.pred+sqrt(preds$var.pred), type = "l", lty = 3, col=3)
  lines(locs.pred,preds$mu.pred-sqrt(preds$var.pred), type = "l", lty = 3, col=3)
  legend("topleft", legend = c("Latent", "VL: Pred", "VL: 1 stdev"),
         col= c(1,3,3), lwd=c(1,2,1), lty = c(1,1,3))
  par(defpar)
} else if (spatial.dim==2){
  defpar = par(mfrow=c(1,2))
  # ordering unnecessary; we are using a scatter plot rather than lines
  quilt.plot(locs, y,
main= "True Latent", xlim = c(0,1), ylim = c(0,1), nx=64, ny=64) quilt.plot(locs.pred, preds$mu.pred, main= "VL Prediction",nx = 30, ny=30) par(defpar) } ``` ### Parameter estimation The likelihood of the data for a set of parameters can be computed efficiently using the command below. ```{r} vecchia_laplace_likelihood(z,vecchia.approx,likelihood_model=data.model,covparms = covparms) ``` This can be used for parameter estimation by evaluating the likelihood over a grid of parameter values or in an iterative optimization method such as Nelder-Mead. Reparameterizing the parameters improves performance. ```{r, eval = FALSE} # currently set up for covariance estimation vecchia.approx=vecchia_specify(locs, m=10, cond.yz = "zy") # for posterior vecchia.approx.IW = vecchia_specify(locs, m=10) # for integrated likelihood if (spatial.dim==1) vecchia.approx=vecchia.approx.IW vl_likelihood = function(x0){ theta = exp(x0) covparms=c(theta[1], theta[2], theta[3]) # sigma range smoothness prior_mean = 0 # can be a parameter as well # Perform inference on latent mean with Vecchia Laplace approximation vll = vecchia_laplace_likelihood(z,vecchia.approx, likelihood_model=data.model, covparms, return_all = FALSE, likparms = default_lh_params, prior_mean = prior_mean, vecchia.approx.IW = vecchia.approx.IW) return(-vll) } x0 = log(c(.07,1.88, 1.9)) vl_likelihood(x0) # Issues with R aborting, maxit set to 1 res = optim(x0, vl_likelihood, method = "Nelder-Mead", control = list("trace" = 1, "maxit" = 1)) exp(res$par[1:3]) vl_likelihood(x0) ```