## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>"
)

## ----loadHousing, warning=FALSE-----------------------------------------------
library(dplyr)
# Load the dataset (available in MASS package)
require(MASS)
data("Boston", package = "MASS")
df <- Boston %>%
  mutate(age_bin = cut(age, breaks = seq(0, 100, by = 10), include.lowest = FALSE)) %>%
  group_by(age_bin) %>%
  mutate(age_bin = paste0(age_bin, "\nn=", n()))%>%
  ungroup()%>%
  mutate(age_bin = factor(age_bin, 
                          levels = sort(unique(age_bin), decreasing = FALSE))) %>%
  data.frame()

range_response <- c(0, 50) # Can use range(df$medv), or user defined range as we do here
range_x <- c(0, 100) # Can use range(df$age), or user defined range as we do here

## ----figureHousing, fig.cap = "A visual representation of the dependency of the median value of owner-occupied homes on proportion of owner-occupied units constructed before 1940 in the Boston Housing dataset.", fig.fullwidth=TRUE, fig.height=4, fig.width=10, fig.align='center',fig.pos="H"----
library(ggplot2)
library(ggpubr)
library(viridis)
# Scatterplot: med vs. age
scatter_plot <- ggplot(df, aes(x = age, y = medv)) +
  geom_point(alpha = 0.5, color = "navy") +
  labs(x = "Proportion of owner-occupied units\nbuilt prior to 1940 [AGE, %]", 
       y = "Median value of owner-occupied homes [MEDV, k$]",
       title = "Median value vs. age of homes") +
  theme_bw()+
  coord_cartesian(xlim=range_x,
                  ylim=range_response)

# Histogram: Distribution of med by age bin
hist_plot <- ggplot(df, aes(x = medv)) +
  geom_histogram(mapping=aes(y=after_stat(density)),
                 position = "identity", breaks = seq(0, 50, 2.5),
                 fill="darkgrey", col="grey50", lwd=0.2, alpha=0.7) +
  geom_rug(sides = "b", color = "navy", alpha = 0.5)+
  facet_wrap(~ age_bin, scales = "free_y", nrow=2) +
  labs(x = "Median value of owner-occupied homes [MEDV, k$]", 
       y = "Probability density", 
       title = "Histogram of median housing values by AGE group") +
  theme_bw()+
  coord_cartesian(xlim=range_response,
                  ylim=c(0, 0.25))
ggarrange(scatter_plot, hist_plot, ncol = 2, nrow = 1,
          widths = c(0.3, 0.7))
# ggsave("./Figures/scatter.pdf", width=10, height=3.5)

## ----SLGPfitting00------------------------------------------------------------
library(SLGP)
modelMAP <- slgp(medv~age, # Use a formula to specify predictors VS response
                 # Can use medv~. for all variables,
                 # Or medv ~ age + var2 + var3 for more variables
                 data=df,
                 method="MAP", #Maximum a posteriori estimation scheme
                 basisFunctionsUsed = "RFF",
                 interpolateBasisFun="WNN", # Accelerate inference
                 hyperparams = list(lengthscale=c(0.15, 0.15), 
                                    # Applied to normalised data
                                    # So 0.15 is 15% of the range of values
                                    sigma2=1), 
                 # Will be re-selected with sigmaEstimationMethod
                 sigmaEstimationMethod = "heuristic", 
                 # Set to heuristic for numerical stability                 
                 predictorsLower= c(range_x[1]),
                 predictorsUpper= c(range_x[2]),
                 responseRange= range_response,
                 opts_BasisFun = list(nFreq=100,
                                      MatParam=5/2),
                 seed=1)

## ----SLGPplotting2, fig.cap = "Predictive probability density of medv at age, seen over slices.", fig.fullwidth=TRUE, fig.height=5, fig.width=10, fig.align='center',fig.pos="H"----


library(SLGP)
library(viridis)
dfGrid <- data.frame(expand.grid(seq(range_x[1], range_x[2], 5), 
                                 seq(range_response[1], range_response[2],, 101)))
colnames(dfGrid) <- c("age", "medv")
pred <- predictSLGP_newNode(SLGPmodel=modelMAP,
                            newNodes = dfGrid)

scale_factor <- 100
ggplot()  +
  labs(y = "Proportion of owner-occupied units\nbuilt prior to 1940 [%]", 
       x = "Median value of owner-occupied homes [k$]",
       title = "Median value vs. age of homes") +
  theme_bw()+
  geom_ribbon(data=pred,
              mapping=aes(x=medv, ymax=scale_factor*pdf_1+age, 
                          ymin=age, group=-age, fill=age),
              col="grey", alpha=0.9)+
  geom_point(data=df,
             mapping=aes(x = medv, y = age), alpha = 0.5, color = "navy")+
  scale_fill_viridis(option = "plasma",
                     guide = guide_colorbar(nrow = 1,
                                            title = "Indexing variable: Proportion of owner-occupied units built prior to 1940",
                                            barheight = unit(2, units = "mm"),
                                            barwidth = unit(55, units = "mm"),
                                            title.position = 'top',
                                            label.position = "bottom",
                                            title.hjust = 0.5))+
  theme(legend.position = "bottom")+
  coord_flip()


## ----SLGPfitting2-------------------------------------------------------------
modelLaplace <- retrainSLGP(SLGPmodel=modelMAP, 
                            newdata = df, 
                            method="Laplace")

## ----SLGPfitting22, eval=FALSE------------------------------------------------
#  # Or equivalent, more explicit in the re-using of the elements
#  # From the SLGP prior
#  
#  modelLaplace <- slgp(medv~age,
#                   data=df,
#                   method="MAP", #Maximum a posteriori estimation scheme
#                   basisFunctionsUsed = "RFF",
#                   interpolateBasisFun="WNN", # Accelerate inference
#                   hyperparams = modelMAP@hyperparams,
#                   sigmaEstimationMethod = "none",# Already selected in the prior
#                   predictorsLower= c(range_x[1]),
#                   predictorsUpper= c(range_x[2]),
#                   responseRange= range_response,
#                   opts_BasisFun = modelMAP@opts_BasisFun,
#                   BasisFunParam = modelMAP@BasisFunParam,
#                   seed=1)

## ----SLGPLaplaceplot, fig.cap = "Predictive probability density (and draws from a Laplace approximation) of medv at age, seen over 3 slices.", fig.fullwidth=TRUE, fig.height=4, fig.width=8, fig.align='center',fig.pos="H"----
# Define the three selected values
selected_values <- c(20, 50, 95)
gap <- 2.5

dfGrid <- data.frame(expand.grid(seq(3), 
                                 seq(range_response[1], range_response[2],, 101)))
colnames(dfGrid) <- c("ID", "medv")
dfGrid$age <- selected_values[dfGrid$ID]
pred <- predictSLGP_newNode(SLGPmodel=modelLaplace,
                            newNodes = dfGrid)
pred$meanpdf <- rowMeans(pred[, -c(1:3)])

library(tidyr)
# Filter the data: keep values within ±5 of the selected ones
df_filtered <- df %>%
  mutate(interval=findInterval(age, c(0, 
                                      selected_values[1]-gap, 
                                      selected_values[1]+gap, 
                                      selected_values[2]-gap, 
                                      selected_values[2]+gap, 
                                      selected_values[3]-gap, 
                                      selected_values[3]+gap)))%>%
  filter(interval %in% c(2, 4, 6))%>%
  group_by(interval)%>%
  mutate(category = paste0("Age close to ", c("", selected_values[1],
                                              "", selected_values[2],
                                              "", selected_values[3])[interval], 
                           "\nn=", n()))
names <- sort(unique(df_filtered$category))
pred$category <- names[pred$ID]

set.seed(1)
selected_cols <- sample(seq(1000), size=10, replace=FALSE)
df_plot <- pred %>%
  dplyr::select(c("age", "medv", "category", 
                  paste0("pdf_", selected_cols)))%>%
  pivot_longer(-c("age", "medv", "category"))


ggplot(mapping=aes(x = medv)) +
  geom_histogram(df_filtered,
                 mapping=aes(y=after_stat(density)),
                 position = "identity", breaks = seq(0, 50, 2.5),
                 fill="darkgrey", col="grey50", lwd=0.2, alpha=0.7) +
  geom_rug(data=df_filtered, sides = "b", color = "navy", alpha = 0.5)+
  geom_line(data=df_plot, mapping=aes(y=value, group=name), 
            color = "black", lwd=0.1, alpha=0.5)+
  geom_line(data=pred, mapping=aes(y=meanpdf, group=category), color = "red")+
  facet_wrap(~ category, scales = "free_y", nrow=1) +
  labs(x = "Median value of owner-occupied homes [k$]", 
       y = "Probability density", 
       title = "Histogram of median value at bins centered at several 'age' values, with width 5\nversus SLGP draws from a Laplace approximation (black curves) and average (red curve)") +
  theme_bw()+
  coord_cartesian(xlim=range_response,
                  ylim=c(0, 0.25))


## ----SLGPfitting3, eval=FALSE-------------------------------------------------
#  modelMCMC <- slgp(medv~age, # Use a formula to specify predictors VS response
#                    # Can use medv~. for all variables,
#                    # Or medv ~ age + var2 + var3 for more variables
#                    data=df,
#                    method="MCMC", #MCMC
#                    basisFunctionsUsed = "RFF",
#                    interpolateBasisFun="WNN", # Accelerate inference
#                    hyperparams = list(lengthscale=c(0.15, 0.15),
#                                       # Applied to normalised data
#                                       # So 0.15 is 15% of the range of values
#                                       sigma2=1),
#                    # Will be re-selected with sigmaEstimationMethod
#                    sigmaEstimationMethod = "heuristic", # Set to heuristic for numerical stability
#                    predictorsLower= c(range_x[1]),
#                    predictorsUpper= c(range_x[2]),
#                    responseRange= range_response,
#                    opts_BasisFun = list(nFreq=100,
#                                         MatParam=5/2),
#                    opts = list(stan_chains=2, stan_iter=1000))

## ----SLGPMCMCplotMoments, fig.cap = "Simultaneous prediction of the fields moments (and associated uncertainty) using a SLGP model", fig.fullwidth=TRUE, fig.height=4, fig.width=10, fig.align='center',fig.pos="H"----
dfX <- data.frame(age=seq(range_x[1], range_x[2], 2))

predMean <- predictSLGP_moments(SLGPmodel=modelLaplace,
                                newNodes = dfX, 
                                power=c(1),
                                centered=FALSE) # Uncentered moments
# For the mean
predVar <- predictSLGP_moments(SLGPmodel=modelLaplace,
                               newNodes = dfX, 
                               power=c(2), 
                               centered=TRUE) # Centered moments
# For the variance, Kurtosis and Skewness

pred <- rbind(predMean, predVar)
pred <- pred %>%
  pivot_longer(-c("age", "power"))%>%
  mutate(value=ifelse(power==2, sqrt(value), value))%>% # Define std
  data.frame()

pred$power <- factor(c("Expected value", 
                       "Standard deviation")[as.numeric(pred$power)],
                     levels=c("Expected value", "Standard deviation"))
df_plot <- pred %>%
  group_by(age, power)%>%
  summarise(q10 = quantile(value, probs=c(0.1)),
            q50 = quantile(value, probs=c(0.5)),
            q90 = quantile(value, probs=c(0.9)),
            mean = mean(value), .groups="keep")%>%
  ungroup() # summarise uncertainty


ggplot(df_plot, mapping=aes(x = age)) +
  geom_ribbon(mapping = aes(ymin=q10, ymax=q90),
              alpha = 0.25, lty=2, col="black", fill="cornflowerblue")+
  geom_line(mapping=aes(y=q50))+
  facet_grid(.~power)+
  labs(x = "Proportion of owner-occupied units built prior to 1940 [AGE, %]", 
       y = "Moment value") +
  theme_bw()+
  coord_cartesian(xlim=range_x,
                  ylim=range_response)


## ----SLGPMCMCplotQuantiles, fig.cap = "Simultaneous quantile prediction using a SLGP model, with estimation performed by MAP", fig.fullwidth=TRUE, fig.height=5, fig.width=10, fig.align='center',fig.pos="H"----
probsL <- c(5, 25, 50, 75, 95)/100
dfX <- data.frame(age=seq(range_x[1], range_x[2], 2))
pred <- predictSLGP_quantiles(SLGPmodel=modelLaplace,
                              newNodes = dfX, 
                              probs=probsL)

df_plot <- pred %>%
  pivot_longer(-c("age", "probs"))%>%
  group_by(age, probs)%>%
  summarise(q10 = quantile(value, probs=c(0.1)),
            q50 = quantile(value, probs=c(0.5)),
            q90 = quantile(value, probs=c(0.9)),
            mean = mean(value), .groups="keep")%>%
  ungroup()%>%
  mutate(probs=factor(paste0("Quantile: ", 100*probs, "%"),
                      levels=paste0("Quantile: ", 100*probsL, "%")))


plot1 <- ggplot(df_plot, mapping=aes(x = age)) +
  geom_ribbon(mapping = aes(ymin=q10, ymax=q90, 
                            col=probs, fill=probs, group=probs),
              alpha = 0.25, lty=3)+
  geom_line(mapping=aes(y=q50, lty="SLGP", 
                        col=probs,group=probs), lwd=0.75)+
  labs(x = "Proportion of owner-occupied units built prior to 1940 [AGE, %]", 
       y = "Median value of owner-occupied homes [MEDV, k$]",
       fill = "Quantile levels",
       col = "Quantile levels", 
       lty="Quantile estimation method") +
  theme_bw()+
  coord_cartesian(xlim=range_x,
                  ylim=range_response)+
  scale_linetype_manual(values=c(2, 1))

plot2 <- ggplot(df, mapping=aes(x=age))+
  geom_histogram(col="navy", fill="grey", bins = 30, alpha = 0.3,
                 breaks=seq(0, 100, 5))+
  theme_minimal()+
  labs(x = NULL, 
       y = "Sample count") 

library(ggpubr)
p_with_marginal <- ggarrange(
  plot2, plot1,
  ncol = 1, heights = c(1, 3),  # adjust height ratio
  align = "v"
)

print(p_with_marginal)