--- title: "Introduction to SLGP Package" author: "Athénaïs Gautier" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to SLGP Package} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` This vignette serves as a startup guide to SLGP modeling, providing a practical introduction to the implementation of Spatial Logistic Gaussian Processes (SLGPs). # Dataset We illustrate the model's capabilities using the Boston Housing dataset @harrison_hedonic_1978, a widely used benchmark in statistical modeling and regression analysis. For this vignette, we focus on modeling the distribution of median home values (med) as a function of the proportion of pre-1940 owner-occupied units (age). This example highlights the ability of SLGPs to capture complex, spatially dependent distributions in data that exhibit heterogeneity and multi-modality. ```{r loadHousing, warning=FALSE} library(dplyr) # Load the dataset (available in MASS package) require(MASS) data("Boston", package = "MASS") df <- Boston %>% mutate(age_bin = cut(age, breaks = seq(0, 100, by = 10), include.lowest = FALSE)) %>% group_by(age_bin) %>% mutate(age_bin = paste0(age_bin, "\nn=", n()))%>% ungroup()%>% mutate(age_bin = factor(age_bin, levels = sort(unique(age_bin), decreasing = FALSE))) %>% data.frame() range_response <- c(0, 50) # Can use range(df$medv), or user defined range as we do here range_x <- c(0, 100) # Can use range(df$age), or user defined range as we do here ``` We represent the data. ```{r figureHousing, fig.cap = "A visual representation of the dependency of the median value of owner-occupied homes on proportion of owner-occupied units constructed before 1940 in the Boston Housing dataset.", fig.fullwidth=TRUE, fig.height=4, fig.width=10, fig.align='center',fig.pos="H"} library(ggplot2) library(ggpubr) library(viridis) # Scatterplot: med vs. age scatter_plot <- ggplot(df, aes(x = age, y = medv)) + geom_point(alpha = 0.5, color = "navy") + labs(x = "Proportion of owner-occupied units\nbuilt prior to 1940 [AGE, %]", y = "Median value of owner-occupied homes [MEDV, k$]", title = "Median value vs. age of homes") + theme_bw()+ coord_cartesian(xlim=range_x, ylim=range_response) # Histogram: Distribution of med by age bin hist_plot <- ggplot(df, aes(x = medv)) + geom_histogram(mapping=aes(y=after_stat(density)), position = "identity", breaks = seq(0, 50, 2.5), fill="darkgrey", col="grey50", lwd=0.2, alpha=0.7) + geom_rug(sides = "b", color = "navy", alpha = 0.5)+ facet_wrap(~ age_bin, scales = "free_y", nrow=2) + labs(x = "Median value of owner-occupied homes [MEDV, k$]", y = "Probability density", title = "Histogram of median housing values by AGE group") + theme_bw()+ coord_cartesian(xlim=range_response, ylim=c(0, 0.25)) ggarrange(scatter_plot, hist_plot, ncol = 2, nrow = 1, widths = c(0.3, 0.7)) # ggsave("./Figures/scatter.pdf", width=10, height=3.5) ``` We see that there is a general trend where older homes tend to have lower values, with exceptions likely due to survivor bias: older homes that remain tend to be of higher structural quality. This dataset provides an test case for SLGP modeling, offering a compact, one-dimensional covariate space, heterogeneously distributed data, and shifting distributional shapes. # SLGP model specifications To model the distributional changes observed in the Boston Housing dataset, we now introduce the Spatial Logistic Gaussian Process (SLGP) model. SLGPs provide a flexible non-parametric framework for modeling spatially dependent probability densities. By transforming a Gaussian Process (GP) through exponentiation and normalization, SLGPs ensure positivity and integration to one, making them well-suited for density estimation. ## Maximum a posteriori estimate For a quick approximation, MAP estimation provides a point estimate, offering a balance between computational efficiency and the depth of inference. It is the fastest estimation scheme we propose, however MAP does not facilitate uncertainty quantification because it yields a non-probabilistic estimate of the underlying density field, focusing instead on identifying the mode of the posterior distribution. ```{r SLGPfitting00} library(SLGP) modelMAP <- slgp(medv~age, # Use a formula to specify predictors VS response # Can use medv~. for all variables, # Or medv ~ age + var2 + var3 for more variables data=df, method="MAP", #Maximum a posteriori estimation scheme basisFunctionsUsed = "RFF", interpolateBasisFun="WNN", # Accelerate inference hyperparams = list(lengthscale=c(0.15, 0.15), # Applied to normalised data # So 0.15 is 15% of the range of values sigma2=1), # Will be re-selected with sigmaEstimationMethod sigmaEstimationMethod = "heuristic", # Set to heuristic for numerical stability predictorsLower= c(range_x[1]), predictorsUpper= c(range_x[2]), responseRange= range_response, opts_BasisFun = list(nFreq=100, MatParam=5/2), seed=1) ``` We can represent the conditional densities over slices: ```{r SLGPplotting2, fig.cap = "Predictive probability density of medv at age, seen over slices.", fig.fullwidth=TRUE, fig.height=5, fig.width=10, fig.align='center',fig.pos="H"} library(SLGP) library(viridis) dfGrid <- data.frame(expand.grid(seq(range_x[1], range_x[2], 5), seq(range_response[1], range_response[2],, 101))) colnames(dfGrid) <- c("age", "medv") pred <- predictSLGP_newNode(SLGPmodel=modelMAP, newNodes = dfGrid) scale_factor <- 100 ggplot() + labs(y = "Proportion of owner-occupied units\nbuilt prior to 1940 [%]", x = "Median value of owner-occupied homes [k$]", title = "Median value vs. age of homes") + theme_bw()+ geom_ribbon(data=pred, mapping=aes(x=medv, ymax=scale_factor*pdf_1+age, ymin=age, group=-age, fill=age), col="grey", alpha=0.9)+ geom_point(data=df, mapping=aes(x = medv, y = age), alpha = 0.5, color = "navy")+ scale_fill_viridis(option = "plasma", guide = guide_colorbar(nrow = 1, title = "Indexing variable: Proportion of owner-occupied units built prior to 1940", barheight = unit(2, units = "mm"), barwidth = unit(55, units = "mm"), title.position = 'top', label.position = "bottom", title.hjust = 0.5))+ theme(legend.position = "bottom")+ coord_flip() ``` ## Laplace approximation estimate By integrating the MAP approach with Laplace approximation, we refine our estimation strategy by approximating the posterior distribution with a multivariate Gaussian. This method strikes a balance between the full Bayesian inference of MCMC and the computational efficiency of MAP estimation. By leveraging both the gradient and Hessian of the posterior, it captures essential curvature information, providing a more informed approximation of the posterior landscape. ```{r SLGPfitting2} modelLaplace <- retrainSLGP(SLGPmodel=modelMAP, newdata = df, method="Laplace") ``` ```{r SLGPfitting22, eval=FALSE} # Or equivalent, more explicit in the re-using of the elements # From the SLGP prior modelLaplace <- slgp(medv~age, data=df, method="MAP", #Maximum a posteriori estimation scheme basisFunctionsUsed = "RFF", interpolateBasisFun="WNN", # Accelerate inference hyperparams = modelMAP@hyperparams, sigmaEstimationMethod = "none",# Already selected in the prior predictorsLower= c(range_x[1]), predictorsUpper= c(range_x[2]), responseRange= range_response, opts_BasisFun = modelMAP@opts_BasisFun, BasisFunParam = modelMAP@BasisFunParam, seed=1) ``` ```{r SLGPLaplaceplot, fig.cap = "Predictive probability density (and draws from a Laplace approximation) of medv at age, seen over 3 slices.", fig.fullwidth=TRUE, fig.height=4, fig.width=8, fig.align='center',fig.pos="H"} # Define the three selected values selected_values <- c(20, 50, 95) gap <- 2.5 dfGrid <- data.frame(expand.grid(seq(3), seq(range_response[1], range_response[2],, 101))) colnames(dfGrid) <- c("ID", "medv") dfGrid$age <- selected_values[dfGrid$ID] pred <- predictSLGP_newNode(SLGPmodel=modelLaplace, newNodes = dfGrid) pred$meanpdf <- rowMeans(pred[, -c(1:3)]) library(tidyr) # Filter the data: keep values within ±5 of the selected ones df_filtered <- df %>% mutate(interval=findInterval(age, c(0, selected_values[1]-gap, selected_values[1]+gap, selected_values[2]-gap, selected_values[2]+gap, selected_values[3]-gap, selected_values[3]+gap)))%>% filter(interval %in% c(2, 4, 6))%>% group_by(interval)%>% mutate(category = paste0("Age close to ", c("", selected_values[1], "", selected_values[2], "", selected_values[3])[interval], "\nn=", n())) names <- sort(unique(df_filtered$category)) pred$category <- names[pred$ID] set.seed(1) selected_cols <- sample(seq(1000), size=10, replace=FALSE) df_plot <- pred %>% dplyr::select(c("age", "medv", "category", paste0("pdf_", selected_cols)))%>% pivot_longer(-c("age", "medv", "category")) ggplot(mapping=aes(x = medv)) + geom_histogram(df_filtered, mapping=aes(y=after_stat(density)), position = "identity", breaks = seq(0, 50, 2.5), fill="darkgrey", col="grey50", lwd=0.2, alpha=0.7) + geom_rug(data=df_filtered, sides = "b", color = "navy", alpha = 0.5)+ geom_line(data=df_plot, mapping=aes(y=value, group=name), color = "black", lwd=0.1, alpha=0.5)+ geom_line(data=pred, mapping=aes(y=meanpdf, group=category), color = "red")+ facet_wrap(~ category, scales = "free_y", nrow=1) + labs(x = "Median value of owner-occupied homes [k$]", y = "Probability density", title = "Histogram of median value at bins centered at several 'age' values, with width 5\nversus SLGP draws from a Laplace approximation (black curves) and average (red curve)") + theme_bw()+ coord_cartesian(xlim=range_response, ylim=c(0, 0.25)) ``` ## MCMC estimate This method allows us to explore the posterior distribution by drawing samples from it. It enables precise, exact inference of the posterior distribution of the underlying density field knowing the data. The main drawback of this approach being its higher computational cost ```{r SLGPfitting3, eval=FALSE} modelMCMC <- slgp(medv~age, # Use a formula to specify predictors VS response # Can use medv~. for all variables, # Or medv ~ age + var2 + var3 for more variables data=df, method="MCMC", #MCMC basisFunctionsUsed = "RFF", interpolateBasisFun="WNN", # Accelerate inference hyperparams = list(lengthscale=c(0.15, 0.15), # Applied to normalised data # So 0.15 is 15% of the range of values sigma2=1), # Will be re-selected with sigmaEstimationMethod sigmaEstimationMethod = "heuristic", # Set to heuristic for numerical stability predictorsLower= c(range_x[1]), predictorsUpper= c(range_x[2]), responseRange= range_response, opts_BasisFun = list(nFreq=100, MatParam=5/2), opts = list(stan_chains=2, stan_iter=1000)) ``` # By-products of the estimation One of the advantages of the SLGP framework is that it predicts entire probability density functions (PDFs) over space. This opens the door to a wide range of nonlinear inferences on the estimated field. In particular, we can compute and visualize functionals of the predicted densities, such as moments (mean, variance, skewness, etc.) or quantiles. In our current implementation, we provide predictions for both centered and uncentered moments of the estimated PDFs at each location. These derived quantities can themselves be interpreted as spatial fields and are thus informative summaries of the underlying random process. Importantly, when using probabilistic inference schemes like Laplace approximation or MCMC, the uncertainty in the SLGP predictions naturally propagates to these functionals. As a result, we can also quantify uncertainty on the functionals. For example, we compute credible intervals for the mean or variance field. This is illustrated in the upcoming figure, where we use the MCMC-trained SLGP model to compute moments. ```{r SLGPMCMCplotMoments, fig.cap = "Simultaneous prediction of the fields moments (and associated uncertainty) using a SLGP model", fig.fullwidth=TRUE, fig.height=4, fig.width=10, fig.align='center',fig.pos="H"} dfX <- data.frame(age=seq(range_x[1], range_x[2], 2)) predMean <- predictSLGP_moments(SLGPmodel=modelLaplace, newNodes = dfX, power=c(1), centered=FALSE) # Uncentered moments # For the mean predVar <- predictSLGP_moments(SLGPmodel=modelLaplace, newNodes = dfX, power=c(2), centered=TRUE) # Centered moments # For the variance, Kurtosis and Skewness pred <- rbind(predMean, predVar) pred <- pred %>% pivot_longer(-c("age", "power"))%>% mutate(value=ifelse(power==2, sqrt(value), value))%>% # Define std data.frame() pred$power <- factor(c("Expected value", "Standard deviation")[as.numeric(pred$power)], levels=c("Expected value", "Standard deviation")) df_plot <- pred %>% group_by(age, power)%>% summarise(q10 = quantile(value, probs=c(0.1)), q50 = quantile(value, probs=c(0.5)), q90 = quantile(value, probs=c(0.9)), mean = mean(value), .groups="keep")%>% ungroup() # summarise uncertainty ggplot(df_plot, mapping=aes(x = age)) + geom_ribbon(mapping = aes(ymin=q10, ymax=q90), alpha = 0.25, lty=2, col="black", fill="cornflowerblue")+ geom_line(mapping=aes(y=q50))+ facet_grid(.~power)+ labs(x = "Proportion of owner-occupied units built prior to 1940 [AGE, %]", y = "Moment value") + theme_bw()+ coord_cartesian(xlim=range_x, ylim=range_response) ``` We also support the joint prediction of quantiles at arbitrary levels. Because quantiles are derived directly from the estimated densities, they are guaranteed to be consistent and non-crossing - a property that cannot always be ensured in standard quantile regression. This makes them reliable tools for summarizing distributional shape (e.g., asymmetry, spread) at each location. ```{r SLGPMCMCplotQuantiles, fig.cap = "Simultaneous quantile prediction using a SLGP model, with estimation performed by MAP", fig.fullwidth=TRUE, fig.height=5, fig.width=10, fig.align='center',fig.pos="H"} probsL <- c(5, 25, 50, 75, 95)/100 dfX <- data.frame(age=seq(range_x[1], range_x[2], 2)) pred <- predictSLGP_quantiles(SLGPmodel=modelLaplace, newNodes = dfX, probs=probsL) df_plot <- pred %>% pivot_longer(-c("age", "probs"))%>% group_by(age, probs)%>% summarise(q10 = quantile(value, probs=c(0.1)), q50 = quantile(value, probs=c(0.5)), q90 = quantile(value, probs=c(0.9)), mean = mean(value), .groups="keep")%>% ungroup()%>% mutate(probs=factor(paste0("Quantile: ", 100*probs, "%"), levels=paste0("Quantile: ", 100*probsL, "%"))) plot1 <- ggplot(df_plot, mapping=aes(x = age)) + geom_ribbon(mapping = aes(ymin=q10, ymax=q90, col=probs, fill=probs, group=probs), alpha = 0.25, lty=3)+ geom_line(mapping=aes(y=q50, lty="SLGP", col=probs,group=probs), lwd=0.75)+ labs(x = "Proportion of owner-occupied units built prior to 1940 [AGE, %]", y = "Median value of owner-occupied homes [MEDV, k$]", fill = "Quantile levels", col = "Quantile levels", lty="Quantile estimation method") + theme_bw()+ coord_cartesian(xlim=range_x, ylim=range_response)+ scale_linetype_manual(values=c(2, 1)) plot2 <- ggplot(df, mapping=aes(x=age))+ geom_histogram(col="navy", fill="grey", bins = 30, alpha = 0.3, breaks=seq(0, 100, 5))+ theme_minimal()+ labs(x = NULL, y = "Sample count") library(ggpubr) p_with_marginal <- ggarrange( plot2, plot1, ncol = 1, heights = c(1, 3), # adjust height ratio align = "v" ) print(p_with_marginal) ```