---
title: "SLGP with integer outputs"
author: "Athénaïs Gautier"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{SLGP with integer outputs}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
```

This vignette serves as a quick guide to Spatial Logistic Gaussian Process (SLGP) modeling, with a focus on predicting distributions for integer outputs. 

# Dataset
We illustrate the model's capabilities using the Boston Housing dataset @harrison_hedonic_1978, a widely used benchmark in statistical modeling and regression analysis. 

In this vignette, we demonstrate how to model the distribution of rad (index of accessibility to radial highways, between 1 and 24 where 24 indicates the best accessibility and 1 the worst) as a function of medv (median value of owner-occupied homes) and dis (weighted distance to employment centers) using Spatial Logistic Gaussian Processes (SLGPs). Since rad is an integer-valued variable, our goal is to adjust the implementation for this.


```{r loadHousing}
library(dplyr)
# Load the dataset (available in MASS package)
require(MASS)
data("Boston", package = "MASS")
df <- Boston 
range_response <- range(df$rad)
range_x <- matrix(c(c(1, 13),
                    range(df$medv)), # Use c(1, 13) instead of range = c(1.1296 12.1265)
                  # For easier chunks
                  ncol=2, byrow=TRUE)
```

We represent the data.
```{r figureHousing, fig.cap = "Figure 1: Index of Accessibility to Radial Highways as a Function of Home Value and Distance to Employment Centers", fig.fullwidth=TRUE, fig.height=6, fig.width=6, fig.align='center',fig.pos="H"}
library(ggplot2)
library(ggpubr)

# Histogram: Distribution of med by age bin
ggplot(df, aes(x = dis, y=medv, fill=as.factor(rad))) +
  geom_point(pch=21)+
  labs(x = "Median value of owner-occupied homes [k$]", 
       y = "weighted distance to employment centers", 
       title = "Scatter-plot of accessibility to radial highways") +
  theme_minimal()+
  scale_fill_viridis_d(option = "viridis",
                       guide = guide_legend(nrow = 3,
                                            title = "Index of accessibility",
                                            barheight = unit(2, units = "mm"),
                                            barwidth = unit(55, units = "mm"),
                                            title.position = 'top',
                                            label.position = "bottom",
                                            title.hjust = 0.5))+
  theme(legend.position = "bottom")

```

We propose mapping the value 24 to 9 for rad, resulting in a more compact domain


```{r figureHousing2, fig.cap ="Figure 2: Index of Accessibility to Radial Highways as a Function of Home Value and Distance to Employment Centers, with rescaled indices", fig.fullwidth=TRUE, fig.height=6, fig.width=6, fig.align='center',fig.pos="H"}
df$rad <- ifelse(df$rad==24, 9, df$rad) 
range_response <- range(df$rad)

# Histogram: Distribution of med by age bin
ggplot(df, aes(x = dis, y=medv, fill=factor(rad, levels=seq(9)))) +
  geom_point(pch=21)+
  labs(x = "Median value of owner-occupied homes [k$]", 
       y = "weighted distance to employment centers", 
       title = "Scatter-plot of accessibility to radial highways") +
  theme_minimal()+
  scale_fill_viridis_d(option = "viridis",
                       guide = guide_legend(nrow = 3,
                                            title = "Index of accessibility",
                                            barheight = unit(2, units = "mm"),
                                            barwidth = unit(55, units = "mm"),
                                            title.position = 'top',
                                            label.position = "bottom",
                                            title.hjust = 0.5))+
  theme(legend.position = "bottom")
```

We can also display the empirical probabilities. For that, we use "bins" for the samples, as there are no replicates.

```{r figureHousing3, fig.cap ="Figure 3: Distribution of RAD across bins of various home value and distance to employment centers", fig.fullwidth=TRUE, fig.height=6, fig.width=8, fig.align='center',fig.pos="H"}

# Create interval variables for medv and dis
df <- df %>%
  mutate(medv_bin = factor(paste0("medv in ",
                                  cut(medv, breaks = seq(0, 50, by = 10), 
                                      include.lowest = FALSE, right = TRUE)),
                           levels = paste0("medv in ", 
                                           cut(seq(50, 1, -10), 
                                               breaks = seq(0, 50, by = 10),
                                               include.lowest = FALSE, right = TRUE))),
         dis_bin = factor(paste0("dis in ",
                                 cut(dis, breaks = seq(1, 13, by = 2), 
                                     include.lowest = FALSE, right = TRUE)),
                          levels = paste0("dis in ", 
                                          cut(seq(2, 13, 2), 
                                              breaks = seq(1, 13, by = 2),
                                              include.lowest = FALSE, right = TRUE))))

ggplot(df, aes(x = rad, y = after_stat(prop))) +
  geom_bar(fill="cornflowerblue", col="navy", lwd=0.2, alpha=0.7, 
           width=0.4)+
  facet_grid(medv_bin ~ dis_bin) +
  labs(x = "Index of accessibility to radial highways (RAD)", 
       y = "Proportion", 
       title = "Distribution of RAD across bins of various home value and distance to employment centers") +
  theme_minimal() +
  theme(legend.position = "bottom")+
  scale_x_continuous(breaks = c(1:9)) 

```


# SLGP model specifications


## Maximum a posteriori estimate


```{r SLGPfitting}
library(SLGP)

modelMAP <- slgp(rad~dis+medv, # Use a formula with two indexing variables
                 data=df,
                 method="MAP", #Maximum a posteriori estimation scheme
                 basisFunctionsUsed = "RFF",
                 interpolateBasisFun="WNN", # Accelerate inference
                 hyperparams = list(lengthscale=c(0.1, 0.15, 0.15), 
                                    # Applied to normalised data
                                    # So 0.15 is 15% of the range of values
                                    sigma2=1), 
                 nIntegral = 9, #or length(seq(seq(range_response[1], range_response[2])))
                 sigmaEstimationMethod = "heuristic", # Set to heuristic for numerical stability                 
                 predictorsLower= c(range_x[,1]),
                 predictorsUpper= c(range_x[,2]),
                 responseRange= range_response,
                 opts_BasisFun = list(nFreq=150,
                                      MatParam=5/2))
```

We can represent the conditional probabilities. For that, we use "bins" for the samples, as there are no replicates. We display the histograms of values in these bins compared to SLGP predictions of the probabilities at the center of the bins.

```{r SLGPplotting1, fig.cap = "Figure 2: Predictive probabilities of rad at medv and dis, as predicted by a SLGP.", fig.fullwidth=TRUE, fig.height=6, fig.width=8, fig.align='center',fig.pos="H"}
library(viridis)
dfGrid <- data.frame(expand.grid(seq(range_x[1, 1], range_x[1, 2], 0.5), 
                                 seq(range_x[2, 1], range_x[2, 2], 1), 
                                 seq(range_response[1], range_response[2])))
colnames(dfGrid) <- c("dis", "medv", "rad")
pred <- predictSLGP_newNode(SLGPmodel=modelMAP,
                            newNodes = dfGrid,
                            nIntegral = 9)
pred[, -c(1:3)] <- pred[, -c(1:3)] * diff(range_response) /(diff(range_response) +1) 
# Goes from values to integrate over a domain to discrete probabilities
df_plot <- pred %>%
  filter(dis %in% seq(2, 12, 2))%>%
  filter(medv %in% seq(5, 45, 10)) %>%
  mutate(medv_bin=paste0("medv in (", medv-5,",", medv+5,"]"))%>%
  mutate(dis_bin=paste0("dis in (", dis-1,",", dis+1,"]"))
df_counts <- df %>%
  group_by(medv_bin, dis_bin) %>%
  summarise(count = paste0("n=", n()), .groups = "keep")

ggplot() +
  geom_bar(data=df, mapping=aes(x = rad-0.1, y = after_stat(prop)),
           fill="cornflowerblue", col="navy", lwd=0.2, alpha=0.7, 
           width=0.4) +
  geom_col(data=df_plot, mapping=aes(x = rad+0.1, y = pdf_1),
           col="red", fill="grey", lwd=0.2, alpha=0.7, lty=2, 
           width=0.4) +
  geom_text(data=df_counts, mapping=aes(x = 1.5, y = 1, label=count), 
            col="grey10", size = 3, vjust = 1)+ 
  facet_grid(medv_bin ~ dis_bin) +
  labs(x = "Index of accessibility to radial highways (RAD)", 
       y = "Proportion", 
       title = "Distribution of RAD across bins of various medv and dis values (blue histogram)\nVS SLGP at the center of these bins") +
  theme_minimal() +
  theme(legend.position = "bottom")+
  scale_x_continuous(breaks = c(1:9)) 
```