---
title: "Introduction to SLGP Package"
author: "Athénaïs Gautier"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Introduction to SLGP Package}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---
  
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)
```

This vignette serves as a startup guide to SLGP modeling, providing a practical introduction to the implementation of Spatial Logistic Gaussian Processes (SLGPs). 

# Dataset
We illustrate the model's capabilities using the Boston Housing dataset @harrison_hedonic_1978, a widely used benchmark in statistical modeling and regression analysis. 

For this vignette, we focus on modeling the distribution of median home values (med) as a function of the proportion of pre-1940 owner-occupied units (age). This example highlights the ability of SLGPs to capture complex, spatially dependent distributions in data that exhibit heterogeneity and multi-modality. 

```{r loadHousing, warning=FALSE}
library(dplyr)
# Load the dataset (available in MASS package)
require(MASS)
data("Boston", package = "MASS")
df <- Boston %>%
  mutate(age_bin = cut(age, breaks = seq(0, 100, by = 10), include.lowest = FALSE)) %>%
  group_by(age_bin) %>%
  mutate(age_bin = paste0(age_bin, "\nn=", n()))%>%
  ungroup()%>%
  mutate(age_bin = factor(age_bin, 
                          levels = sort(unique(age_bin), decreasing = FALSE))) %>%
  data.frame()

range_response <- c(0, 50) # Can use range(df$medv), or user defined range as we do here
range_x <- c(0, 100) # Can use range(df$age), or user defined range as we do here
```

We represent the data.
```{r figureHousing, fig.cap = "A visual representation of the dependency of the median value of owner-occupied homes on proportion of owner-occupied units constructed before 1940 in the Boston Housing dataset.", fig.fullwidth=TRUE, fig.height=4, fig.width=10, fig.align='center',fig.pos="H"}
library(ggplot2)
library(ggpubr)
library(viridis)
# Scatterplot: med vs. age
scatter_plot <- ggplot(df, aes(x = age, y = medv)) +
  geom_point(alpha = 0.5, color = "navy") +
  labs(x = "Proportion of owner-occupied units\nbuilt prior to 1940 [AGE, %]", 
       y = "Median value of owner-occupied homes [MEDV, k$]",
       title = "Median value vs. age of homes") +
  theme_bw()+
  coord_cartesian(xlim=range_x,
                  ylim=range_response)

# Histogram: Distribution of med by age bin
hist_plot <- ggplot(df, aes(x = medv)) +
  geom_histogram(mapping=aes(y=after_stat(density)),
                 position = "identity", breaks = seq(0, 50, 2.5),
                 fill="darkgrey", col="grey50", lwd=0.2, alpha=0.7) +
  geom_rug(sides = "b", color = "navy", alpha = 0.5)+
  facet_wrap(~ age_bin, scales = "free_y", nrow=2) +
  labs(x = "Median value of owner-occupied homes [MEDV, k$]", 
       y = "Probability density", 
       title = "Histogram of median housing values by AGE group") +
  theme_bw()+
  coord_cartesian(xlim=range_response,
                  ylim=c(0, 0.25))
ggarrange(scatter_plot, hist_plot, ncol = 2, nrow = 1,
          widths = c(0.3, 0.7))
# ggsave("./Figures/scatter.pdf", width=10, height=3.5)
```


We see that there is a general trend where older homes tend to have lower values, with exceptions likely due to survivor bias: older homes that remain tend to be of higher structural quality. This dataset provides an test case for SLGP modeling, offering a compact, one-dimensional covariate space, heterogeneously distributed data, and shifting distributional shapes.

# SLGP model specifications

To model the distributional changes observed in the Boston Housing dataset, we now introduce the Spatial Logistic Gaussian Process (SLGP) model. SLGPs provide a flexible non-parametric framework for modeling spatially dependent probability densities. By transforming a Gaussian Process (GP) through exponentiation and normalization, SLGPs ensure positivity and integration to one, making them well-suited for density estimation.

## Maximum a posteriori estimate

For a quick approximation, MAP estimation provides a point estimate, offering a balance between computational efficiency and the depth of inference. It is the fastest estimation scheme we propose, however MAP does not facilitate uncertainty quantification because it yields a non-probabilistic estimate of the underlying density field, focusing instead on identifying the mode of the posterior distribution. 

```{r SLGPfitting00}
library(SLGP)
modelMAP <- slgp(medv~age, # Use a formula to specify predictors VS response
                 # Can use medv~. for all variables,
                 # Or medv ~ age + var2 + var3 for more variables
                 data=df,
                 method="MAP", #Maximum a posteriori estimation scheme
                 basisFunctionsUsed = "RFF",
                 interpolateBasisFun="WNN", # Accelerate inference
                 hyperparams = list(lengthscale=c(0.15, 0.15), 
                                    # Applied to normalised data
                                    # So 0.15 is 15% of the range of values
                                    sigma2=1), 
                 # Will be re-selected with sigmaEstimationMethod
                 sigmaEstimationMethod = "heuristic", 
                 # Set to heuristic for numerical stability                 
                 predictorsLower= c(range_x[1]),
                 predictorsUpper= c(range_x[2]),
                 responseRange= range_response,
                 opts_BasisFun = list(nFreq=100,
                                      MatParam=5/2),
                 seed=1)
```
We can represent the conditional densities over slices:


```{r SLGPplotting2, fig.cap = "Predictive probability density of medv at age, seen over slices.", fig.fullwidth=TRUE, fig.height=5, fig.width=10, fig.align='center',fig.pos="H"}


library(SLGP)
library(viridis)
dfGrid <- data.frame(expand.grid(seq(range_x[1], range_x[2], 5), 
                                 seq(range_response[1], range_response[2],, 101)))
colnames(dfGrid) <- c("age", "medv")
pred <- predictSLGP_newNode(SLGPmodel=modelMAP,
                            newNodes = dfGrid)

scale_factor <- 100
ggplot()  +
  labs(y = "Proportion of owner-occupied units\nbuilt prior to 1940 [%]", 
       x = "Median value of owner-occupied homes [k$]",
       title = "Median value vs. age of homes") +
  theme_bw()+
  geom_ribbon(data=pred,
              mapping=aes(x=medv, ymax=scale_factor*pdf_1+age, 
                          ymin=age, group=-age, fill=age),
              col="grey", alpha=0.9)+
  geom_point(data=df,
             mapping=aes(x = medv, y = age), alpha = 0.5, color = "navy")+
  scale_fill_viridis(option = "plasma",
                     guide = guide_colorbar(nrow = 1,
                                            title = "Indexing variable: Proportion of owner-occupied units built prior to 1940",
                                            barheight = unit(2, units = "mm"),
                                            barwidth = unit(55, units = "mm"),
                                            title.position = 'top',
                                            label.position = "bottom",
                                            title.hjust = 0.5))+
  theme(legend.position = "bottom")+
  coord_flip()

```


## Laplace approximation estimate

By integrating the MAP approach with Laplace approximation, we refine our estimation strategy by approximating the posterior distribution with a multivariate Gaussian. This method strikes a balance between the full Bayesian inference of MCMC and the computational efficiency of MAP estimation. By leveraging both the gradient and Hessian of the posterior, it captures essential curvature information, providing a more informed approximation of the posterior landscape.

```{r SLGPfitting2}
modelLaplace <- retrainSLGP(SLGPmodel=modelMAP, 
                            newdata = df, 
                            method="Laplace")
```

```{r SLGPfitting22, eval=FALSE}
# Or equivalent, more explicit in the re-using of the elements
# From the SLGP prior

modelLaplace <- slgp(medv~age, 
                 data=df,
                 method="MAP", #Maximum a posteriori estimation scheme
                 basisFunctionsUsed = "RFF",
                 interpolateBasisFun="WNN", # Accelerate inference
                 hyperparams = modelMAP@hyperparams, 
                 sigmaEstimationMethod = "none",# Already selected in the prior
                 predictorsLower= c(range_x[1]),
                 predictorsUpper= c(range_x[2]),
                 responseRange= range_response,
                 opts_BasisFun = modelMAP@opts_BasisFun,
                 BasisFunParam = modelMAP@BasisFunParam,
                 seed=1)
```


```{r SLGPLaplaceplot, fig.cap = "Predictive probability density (and draws from a Laplace approximation) of medv at age, seen over 3 slices.", fig.fullwidth=TRUE, fig.height=4, fig.width=8, fig.align='center',fig.pos="H"}
# Define the three selected values
selected_values <- c(20, 50, 95)
gap <- 2.5

dfGrid <- data.frame(expand.grid(seq(3), 
                                 seq(range_response[1], range_response[2],, 101)))
colnames(dfGrid) <- c("ID", "medv")
dfGrid$age <- selected_values[dfGrid$ID]
pred <- predictSLGP_newNode(SLGPmodel=modelLaplace,
                            newNodes = dfGrid)
pred$meanpdf <- rowMeans(pred[, -c(1:3)])

library(tidyr)
# Filter the data: keep values within ±5 of the selected ones
df_filtered <- df %>%
  mutate(interval=findInterval(age, c(0, 
                                      selected_values[1]-gap, 
                                      selected_values[1]+gap, 
                                      selected_values[2]-gap, 
                                      selected_values[2]+gap, 
                                      selected_values[3]-gap, 
                                      selected_values[3]+gap)))%>%
  filter(interval %in% c(2, 4, 6))%>%
  group_by(interval)%>%
  mutate(category = paste0("Age close to ", c("", selected_values[1],
                                              "", selected_values[2],
                                              "", selected_values[3])[interval], 
                           "\nn=", n()))
names <- sort(unique(df_filtered$category))
pred$category <- names[pred$ID]

set.seed(1)
selected_cols <- sample(seq(1000), size=10, replace=FALSE)
df_plot <- pred %>%
  dplyr::select(c("age", "medv", "category", 
                  paste0("pdf_", selected_cols)))%>%
  pivot_longer(-c("age", "medv", "category"))


ggplot(mapping=aes(x = medv)) +
  geom_histogram(df_filtered,
                 mapping=aes(y=after_stat(density)),
                 position = "identity", breaks = seq(0, 50, 2.5),
                 fill="darkgrey", col="grey50", lwd=0.2, alpha=0.7) +
  geom_rug(data=df_filtered, sides = "b", color = "navy", alpha = 0.5)+
  geom_line(data=df_plot, mapping=aes(y=value, group=name), 
            color = "black", lwd=0.1, alpha=0.5)+
  geom_line(data=pred, mapping=aes(y=meanpdf, group=category), color = "red")+
  facet_wrap(~ category, scales = "free_y", nrow=1) +
  labs(x = "Median value of owner-occupied homes [k$]", 
       y = "Probability density", 
       title = "Histogram of median value at bins centered at several 'age' values, with width 5\nversus SLGP draws from a Laplace approximation (black curves) and average (red curve)") +
  theme_bw()+
  coord_cartesian(xlim=range_response,
                  ylim=c(0, 0.25))

```


## MCMC estimate

This method allows us to explore the posterior distribution by drawing samples from it. It enables precise, exact inference of the posterior distribution of the underlying density field knowing the data. The main drawback of this approach being its higher computational cost

```{r SLGPfitting3, eval=FALSE}
modelMCMC <- slgp(medv~age, # Use a formula to specify predictors VS response
                  # Can use medv~. for all variables,
                  # Or medv ~ age + var2 + var3 for more variables
                  data=df,
                  method="MCMC", #MCMC
                  basisFunctionsUsed = "RFF",
                  interpolateBasisFun="WNN", # Accelerate inference
                  hyperparams = list(lengthscale=c(0.15, 0.15), 
                                     # Applied to normalised data
                                     # So 0.15 is 15% of the range of values
                                     sigma2=1), 
                  # Will be re-selected with sigmaEstimationMethod
                  sigmaEstimationMethod = "heuristic", # Set to heuristic for numerical stability                 
                  predictorsLower= c(range_x[1]),
                  predictorsUpper= c(range_x[2]),
                  responseRange= range_response,
                  opts_BasisFun = list(nFreq=100,
                                       MatParam=5/2),
                  opts = list(stan_chains=2, stan_iter=1000))
```

# By-products of the estimation

One of the advantages of the SLGP framework is that it predicts entire probability density functions (PDFs) over space. This opens the door to a wide range of nonlinear inferences on the estimated field. In particular, we can compute and visualize functionals of the predicted densities, such as moments (mean, variance, skewness, etc.) or quantiles.

In our current implementation, we provide predictions for both centered and uncentered moments of the estimated PDFs at each location. These derived quantities can themselves be interpreted as spatial fields and are thus informative summaries of the underlying random process.

Importantly, when using probabilistic inference schemes like Laplace approximation or MCMC, the uncertainty in the SLGP predictions naturally propagates to these functionals. As a result, we can also quantify uncertainty on the functionals. For example, we compute credible intervals for the mean or variance field. This is illustrated in the upcoming figure, where we use the MCMC-trained SLGP model to compute moments.


```{r SLGPMCMCplotMoments, fig.cap = "Simultaneous prediction of the fields moments (and associated uncertainty) using a SLGP model", fig.fullwidth=TRUE, fig.height=4, fig.width=10, fig.align='center',fig.pos="H"}
dfX <- data.frame(age=seq(range_x[1], range_x[2], 2))

predMean <- predictSLGP_moments(SLGPmodel=modelLaplace,
                                newNodes = dfX, 
                                power=c(1),
                                centered=FALSE) # Uncentered moments
# For the mean
predVar <- predictSLGP_moments(SLGPmodel=modelLaplace,
                               newNodes = dfX, 
                               power=c(2), 
                               centered=TRUE) # Centered moments
# For the variance, Kurtosis and Skewness

pred <- rbind(predMean, predVar)
pred <- pred %>%
  pivot_longer(-c("age", "power"))%>%
  mutate(value=ifelse(power==2, sqrt(value), value))%>% # Define std
  data.frame()

pred$power <- factor(c("Expected value", 
                       "Standard deviation")[as.numeric(pred$power)],
                     levels=c("Expected value", "Standard deviation"))
df_plot <- pred %>%
  group_by(age, power)%>%
  summarise(q10 = quantile(value, probs=c(0.1)),
            q50 = quantile(value, probs=c(0.5)),
            q90 = quantile(value, probs=c(0.9)),
            mean = mean(value), .groups="keep")%>%
  ungroup() # summarise uncertainty


ggplot(df_plot, mapping=aes(x = age)) +
  geom_ribbon(mapping = aes(ymin=q10, ymax=q90),
              alpha = 0.25, lty=2, col="black", fill="cornflowerblue")+
  geom_line(mapping=aes(y=q50))+
  facet_grid(.~power)+
  labs(x = "Proportion of owner-occupied units built prior to 1940 [AGE, %]", 
       y = "Moment value") +
  theme_bw()+
  coord_cartesian(xlim=range_x,
                  ylim=range_response)

```

We also support the joint prediction of quantiles at arbitrary levels. Because quantiles are derived directly from the estimated densities, they are guaranteed to be consistent and non-crossing - a property that cannot always be ensured in standard quantile regression. This makes them reliable tools for summarizing distributional shape (e.g., asymmetry, spread) at each location.


```{r SLGPMCMCplotQuantiles, fig.cap = "Simultaneous quantile prediction using a SLGP model, with estimation performed by MAP", fig.fullwidth=TRUE, fig.height=5, fig.width=10, fig.align='center',fig.pos="H"}
probsL <- c(5, 25, 50, 75, 95)/100
dfX <- data.frame(age=seq(range_x[1], range_x[2], 2))
pred <- predictSLGP_quantiles(SLGPmodel=modelLaplace,
                              newNodes = dfX, 
                              probs=probsL)

df_plot <- pred %>%
  pivot_longer(-c("age", "probs"))%>%
  group_by(age, probs)%>%
  summarise(q10 = quantile(value, probs=c(0.1)),
            q50 = quantile(value, probs=c(0.5)),
            q90 = quantile(value, probs=c(0.9)),
            mean = mean(value), .groups="keep")%>%
  ungroup()%>%
  mutate(probs=factor(paste0("Quantile: ", 100*probs, "%"),
                      levels=paste0("Quantile: ", 100*probsL, "%")))


plot1 <- ggplot(df_plot, mapping=aes(x = age)) +
  geom_ribbon(mapping = aes(ymin=q10, ymax=q90, 
                            col=probs, fill=probs, group=probs),
              alpha = 0.25, lty=3)+
  geom_line(mapping=aes(y=q50, lty="SLGP", 
                        col=probs,group=probs), lwd=0.75)+
  labs(x = "Proportion of owner-occupied units built prior to 1940 [AGE, %]", 
       y = "Median value of owner-occupied homes [MEDV, k$]",
       fill = "Quantile levels",
       col = "Quantile levels", 
       lty="Quantile estimation method") +
  theme_bw()+
  coord_cartesian(xlim=range_x,
                  ylim=range_response)+
  scale_linetype_manual(values=c(2, 1))

plot2 <- ggplot(df, mapping=aes(x=age))+
  geom_histogram(col="navy", fill="grey", bins = 30, alpha = 0.3,
                 breaks=seq(0, 100, 5))+
  theme_minimal()+
  labs(x = NULL, 
       y = "Sample count") 

library(ggpubr)
p_with_marginal <- ggarrange(
  plot2, plot1,
  ncol = 1, heights = c(1, 3),  # adjust height ratio
  align = "v"
)

print(p_with_marginal)
```