\documentclass[11pt]{article}
\usepackage{Sweave}
\usepackage{fullpage}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{bm}
\usepackage{mathabx}
\usepackage{xspace}
\usepackage{natbib}
\bibliographystyle{abbrvnat}
\newcommand{\susie}{\textsl{SuSiE}\xspace}
\def\vb{\bm b}
\def\vy{\bm y}
\def\vx{\bm x}

\begin{document}

\title{Implementation of SuSiE trend filtering}
\author{Kaiqian Zhang}
\maketitle

%\VignetteIndexEntry{Implementation of SuSiE trend filtering}

\section{Notation}

We first describe the notation used in this text. We denote matrices by boldface uppercase letters ($\mathbf{A}$), vectors by boldface lowercase letters ($\mathbf{a}$), and scalars by non-boldface letters ($a$ or $A$). All vectors are column vectors. Lowercase letters with subscripts denote elements of a vector or matrix. For example, $a_{ij}$ is the $(i,j)$th element of $\mathbf{A}$, $a_i$ is the $i$th element of $\mathbf{a}$, and $\mathbf{a}_{i}$ is either the $i$th row or $i$th column of $\mathbf{A}$. For indexing, we generally use capital non-boldface letters to denote the total number of elements and their lowercase versions to denote the index; for example, $i = 1,\ldots,I$. We let $\mathbf{A}_{n \times p}$ denote that $\mathbf{A} \in \mathbb{R}^{n \times p}$. We denote the matrix transpose by $\mathbf{A}^T$, the matrix inverse by $\mathbf{A}^{-1}$, and the matrix determinant by $\det(\mathbf{A})$. Finally, sets are denoted by calligraphic letters ($\mathcal{A}$).

\section{Overview}

Trend filtering is a useful statistical tool for nonparametric regression. \cite{Kim07l1trend} first proposed $\ell_1$ trend filtering for estimating underlying piecewise linear trends in time series data. This idea can be further extended to fit a piecewise polynomial of degree $k$ to the data. In their paper, Kim et al.\ showed the equivalence between $\ell_1$ trend filtering and an $\ell_1$-regularized least squares problem. This motivates us to think about the connection between trend filtering and sparse approximation in general.

\section{Trend filtering and sparse regression}

The trend filtering problem is defined mathematically as follows. For a given integer $k \geq 0$, the $k$th order trend filtering estimate is defined by the penalized least squares problem
\begin{align}
\hat{\vb} = \underset{\vb}{\mathrm{argmin}} \; \frac{1}{2}|| \vy - \vb||_2^2 + \frac{n^k}{k!}\lambda||D_{k+1}\vb||_1,
\end{align}
where $\vy = [y_1 \dots y_n]^T$ is an $n$-vector of observations, $\lambda$ is a tuning parameter, and $D_{k+1}$ is the discrete difference operator of order $k+1$. When $k=0$, the operator is
\begin{align}\label{D1}
D_{1} =
\begin{bmatrix}
-1 & 1 & 0 & \dots & 0 & 0\\
0 & -1 & 1 & \dots & 0 & 0\\
\vdots & \ddots & \\
0 & 0 & 0 & \dots & -1 & 1\\
\end{bmatrix}
\in \mathbb{R}^{(n-1)\times n}.
\end{align}
In this case, the components of the trend filtering estimate form a piecewise constant structure, with break points corresponding to the nonzero entries of $D_{1}\hat{\vb} = (\hat{b}_2 - \hat{b}_1, \dots, \hat{b}_n - \hat{b}_{n-1})$ \citep{Tibshirani2014}. When $k\geq 1$, the operator $D_{k+1}$ is defined recursively,
\begin{align}\label{D1_2}
D_{k+1} = D_{1} \cdot D_{k} \in \mathbb{R}^{(n-k-1)\times n},
\end{align}
where $\cdot$ denotes matrix multiplication. Notice that the $D_1$ factor in \ref{D1_2} is the $(n-k-1)\times (n-k)$ version of the $D_1$ described in \ref{D1}.
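To make the recursion concrete, here is a small R sketch that builds $D_{k+1}$ directly from \ref{D1_2}. The helper names (\texttt{make\_D1}, \texttt{make\_Dk}) are ours, purely for illustration; this is not part of any package.
\begin{verbatim}
# Rectangular first-order difference operator of size (m-1) x m.
make_D1 <- function(m) {
  D <- matrix(0, m - 1, m)
  D[cbind(1:(m - 1), 1:(m - 1))] <- -1  # diagonal entries
  D[cbind(1:(m - 1), 2:m)]       <- 1   # superdiagonal entries
  D
}

# D_{k+1} in R^{(n-k-1) x n}, built by the recursion D_{k+1} = D_1 %*% D_k,
# where each D_1 factor has the appropriate (shrinking) number of rows.
make_Dk <- function(n, k) {
  D <- make_D1(n)
  if (k > 0) for (j in 1:k) D <- make_D1(nrow(D)) %*% D
  D
}

make_Dk(5, 1)  # 3 x 5 second-order difference operator; first row (1, -2, 1, 0, 0)
\end{verbatim}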
Now we want to transform the trend filtering problem into a sparse regression problem. Let $\bm{\beta}=D_{k+1}\vb$. If $D_{k+1}$ were invertible, we could write $\vb = (D_{k+1})^{-1}\bm{\beta}$ and the above problem would become
\begin{align}
\hat{\bm{\beta}} = \underset{\bm{\beta}}{\mathrm{argmin}} \; \frac{1}{2}||\vy - (D_{k+1})^{-1}\bm{\beta}||_2^2 + \frac{n^k}{k!}\lambda||\bm{\beta}||_1.
\end{align}
We can view this as a sparse regression problem with $\ell_1$ regularization, where the design matrix is $X_{k+1} = (D_{k+1})^{-1}$.

\section{Modification of $D$}

As we have seen, the trend filtering problem becomes a sparse regression with $\ell_1$ regularization if we take the design matrix to be $X_{k+1} = (D_{k+1})^{-1}$. However, $D_{1} \in \mathbb{R}^{(n-1)\times n}$ is not invertible, and neither is $D_{k+1}$ for $k=1,2,\dots$ We therefore complete $D_{1}$ to the square (and invertible) matrix
\begin{align}
\hat{D}_{1} =
\begin{bmatrix}
-1 & 1 & 0 & \dots & 0 & 0\\
0 & -1 & 1 & \dots & 0 & 0\\
\vdots & \ddots & \\
0 & 0 & 0 & \dots & -1 & 1\\
0 & 0 & 0 & \dots & 0 & -1
\end{bmatrix}
\in \mathbb{R}^{n\times n}.
\end{align}
And for $k\geq 1$, we obtain
\begin{align}
\hat{D}_{k+1} = \hat{D}_{1} \cdot \hat{D}_{k} \in \mathbb{R}^{n\times n}.
\end{align}
We notice that, by this modification, $\hat{D}_{k+1}$ is simply $D_{k+1}$ with $k+1$ more rows appended at the bottom; none of the original entries change. With this modification, we are able to invert $\hat{D}_{k+1}$ and use the inverse matrix as the design matrix in the sparse regression.

\section{Special structure of $\hat{D}^{-1}$}

After determining the design matrix $X$ in the sparse regression problem, we can apply the \susie algorithm to fit the model. Here, we denote $X_{k+1} = (\hat{D}_{k+1})^{-1}$, where $k$ is the order of trend filtering. Rather than forming $\hat{D}^{-1}$ explicitly and passing it as the $X$ input, we exploit the special structure of $\hat{D}^{-1}$ so that the matrix computations \susie needs for this trend filtering problem have $O(n)$ complexity. The following sections describe how each of these computations can be made linear in $n$ by exploiting this structure.

\section{Computation of $Xb$}
\label{computation-on-Xb}

In the trend filtering application, since $X_{k+1} = (\hat{D}_{k+1})^{-1}$, we obtain
\begin{align}
X_{k+1} \vb & = (\hat{D}_{k+1})^{-1} \vb = (\underbrace{\hat{D}_{1}\dots \hat{D}_{1}}_{k+1})^{-1} \vb \\
&= \underbrace{(\hat{D}_{1})^{-1} \dots (\hat{D}_{1})^{-1}}_{k+1} \vb \\
&= \underbrace{X_1 \dots X_1}_{k+1} \vb.
\end{align}
Since
\begin{align}
X_1 =
\begin{bmatrix}
-1 & -1 & -1 & \dots & -1 & -1\\
0 & -1 & -1 & \dots & -1 & -1\\
\vdots & \ddots & \\
0 & 0 & 0 & \dots & -1 & -1\\
0 & 0 & 0 & \dots & 0 & -1
\end{bmatrix}
\in \mathbb{R}^{n\times n},
\end{align}
we have
\begin{align}
X_1 \vb & = -1 \cdot [b_1+b_2+\dots+b_n, \; b_2+\dots+b_n, \; \dots, \; b_{n-1}+b_n, \; b_n]^T \\
& = -1 \cdot \text{reverse}(\text{cumsum}(\text{reverse}(\vb))) .
\end{align}
Let $f: \mathbb{R}^n \to \mathbb{R}^n$ be defined by $f(\vx)= -\text{reverse}(\text{cumsum}(\text{reverse}(\vx)))$ for any $\vx \in \mathbb{R}^n$. Then
\begin{align}
X_{k+1} \vb & = \underbrace{X_1 \dots X_1}_{k+1} \vb = f^{(k+1)}(\vb),
\end{align}
where $k$ is the order of trend filtering.
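Applying $f$ a total of $k+1$ times takes only a few lines of R. The sketch below is for illustration only (the name \texttt{tf\_Xb} is ours, and this is not necessarily how the package implements it).
\begin{verbatim}
# Compute X_{k+1} %*% b in O(n(k+1)) time without forming X_{k+1}:
# apply f(x) = -rev(cumsum(rev(x))) a total of (k+1) times.
tf_Xb <- function(b, k) {
  for (i in seq_len(k + 1)) {
    b <- -rev(cumsum(rev(b)))
  }
  b
}

# Small check against the dense matrix for k = 0:
# Dhat1 <- diag(-1, 5); Dhat1[cbind(1:4, 2:5)] <- 1   # 5 x 5 version of D-hat_1
# b <- rnorm(5)
# all.equal(drop(solve(Dhat1) %*% b), tf_Xb(b, 0))     # TRUE
\end{verbatim}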
\section{Computation of $X^Ty$}
\label{computation-on-Xty}

We now consider $X_{k+1}^T\vy$, where $X_{k+1} = (\hat{D}_{k+1})^{-1}$ as before and $\vy$ is an $n$-vector. We have
\begin{align}
X_{k+1}^T \vy & = ((\hat{D}_{k+1})^{-1})^T \vy = ((\underbrace{\hat{D}_{1}\dots \hat{D}_{1}}_{k+1})^{-1})^T \vy \\
& = \underbrace{((\hat{D}_{1})^{-1})^T \dots ((\hat{D}_{1})^{-1})^T}_{k+1} \vy \\
& = \underbrace{X_1^T \dots X_1^T}_{k+1} \vy.
\end{align}
Similarly, since
\begin{align}
X_1^T =
\begin{bmatrix}
-1 & 0 & 0 & \dots & 0 & 0\\
-1 & -1 & 0 & \dots & 0 & 0\\
\vdots & \ddots & \\
-1 & -1 & -1 & \dots & -1 & 0\\
-1 & -1 & -1 & \dots & -1 & -1
\end{bmatrix}
\in \mathbb{R}^{n\times n},
\end{align}
we have
\begin{align}
X_1^T \vy & = -1 \cdot [y_1, \; y_1+y_2, \; \dots, \; y_1+y_2+\dots+y_n]^T \\
& = -1 \cdot \text{cumsum}(\vy).
\end{align}
Let $g: \mathbb{R}^n \to \mathbb{R}^n$ be defined by $g(\vx)= -\text{cumsum}(\vx)$ for any $\vx \in \mathbb{R}^n$. Then
\begin{align}
X_{k+1}^T \vy & = \underbrace{X_1^T \dots X_1^T}_{k+1} \vy = g^{(k+1)}(\vy),
\end{align}
where $k$ is the order of trend filtering.

\section{Computation of $(X^2)^T 1$; i.e., colSums($X^2$)}
\label{computation-on-d}

Here $X^2$ denotes the elementwise square of $X$. To compute $(X_{k+1}^2)^T \bm{1}$, let us first look at the special structure of $X_{k+1}=(\hat{D}_{k+1})^{-1}$ for $k=0,1,2$:
\begin{align}
X_1 =
\begin{bmatrix}
-1 & -1 & -1 & -1 & -1 & \dots \\
0 & -1 & -1 & -1 & -1 & \dots \\
0 & 0 & -1 & -1 & -1 & \dots \\
0 & 0 & 0 & -1 & -1 & \dots \\
\vdots & \ddots & \\
\end{bmatrix}
\in \mathbb{R}^{n\times n},
\end{align}
\begin{align}
X_2 =
\begin{bmatrix}
1 & 2 & 3 & 4 & 5 & 6 & \dots \\
0 & 1 & 2 & 3 & 4 & 5 & \dots \\
0 & 0 & 1 & 2 & 3 & 4 &\dots \\
0 & 0 & 0 & 1 & 2 & 3 &\dots \\
\vdots & \ddots & \\
\end{bmatrix}
\in \mathbb{R}^{n\times n},
\end{align}
\begin{align}
X_3 =
\begin{bmatrix}
-1 & -3 & -6 & -10 & -15 & \dots \\
0 & -1 & -3 & -6 & -10 & \dots \\
0 & 0 & -1 & -3 & -6 & \dots \\
0 & 0 & 0 & -1 & -3 & \dots \\
\vdots & \ddots & \\
\end{bmatrix}
\in \mathbb{R}^{n\times n}.
\end{align}
Define a triangular rotate matrix $Q \in \mathbb{R}^{n\times n}$ to be a matrix such that (i) for any $i, j \leq n$, $Q_{ij} = 0$ if $i>j$, and (ii) for any $k < n$, $Q_{ab} = Q_{cd}$ if $b-a = d-c = k$. In other words, $Q$ is upper triangular and constant along each diagonal (an upper triangular Toeplitz matrix). If $X$ is a triangular rotate matrix, then column $j$ of $X$ contains exactly the first $j$ entries of the first row $X_{1.}$, so
\begin{align}
X^T \bm{1} = \text{cumsum}(X_{1.}).
\end{align}
Since the elementwise square $X^2$ is still a triangular rotate matrix, we obtain
\begin{align}
(X^2)^T \bm{1} = \text{cumsum}(X^2_{1.}).
\end{align}
Because $X_{k+1} = (\hat{D}_{k+1})^{-1}$ is a triangular rotate matrix,
\begin{align}
(X_{k+1}^2)^T \bm{1} = \text{cumsum}((X_{k+1})_{1.}^2).
\end{align}
Finally, the first row of $X_{k+1} = X_1 X_k$ is the first row of $X_1$ (a row of $-1$'s) multiplied by $X_k$, i.e., the negated column sums of $X_k$, which by the identity above equals $g((X_k)_{1.})$. Iterating this argument gives
\begin{equation}
(X_{k+1})_{1.} =
\begin{cases}
-\bm{1} & \text{if } k = 0 \\
g^{(k)}(-\bm{1}) & \text{if } k > 0.
\end{cases}
\end{equation}

\section{Computation of $\mu$; i.e., column means}
\label{computation-on-cm}

For each column $j = 1,2,\dots,n$,
\begin{equation}
\mu_j = E[X_{.j}],
\end{equation}
and $\bm{\mu} = [\mu_1, \mu_2, \dots, \mu_n]^T \in \mathbb{R}^n$. Hence we get
\begin{equation}
\bm{\mu} = \frac{1}{n} X_{k+1}^T \bm{1} = \frac{1}{n} \text{cumsum}((X_{k+1})_{1.}),
\end{equation}
where $(X_{k+1})_{1.}$ is computed as in \ref{computation-on-d}.
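These identities translate directly into code. The R sketch below (hypothetical helper names, for illustration only) computes $X_{k+1}^T\vy$, the first row of $X_{k+1}$, colSums($X_{k+1}^2$), and the column means without ever forming $X_{k+1}$.
\begin{verbatim}
# g(x) = -cumsum(x), applied (k+1) times, gives X_{k+1}^T y.
tf_Xty <- function(y, k) {
  for (i in seq_len(k + 1)) y <- -cumsum(y)
  y
}

# First row of X_{k+1}: g applied k times to a vector of -1's.
tf_first_row <- function(n, k) {
  r <- rep(-1, n)
  if (k > 0) for (i in 1:k) r <- -cumsum(r)
  r
}

# colSums of the elementwise square, via the triangular rotate (Toeplitz) structure.
tf_colSums_X2 <- function(n, k) cumsum(tf_first_row(n, k)^2)

# Column means: (1/n) * cumsum of the first row.
tf_colMeans <- function(n, k) cumsum(tf_first_row(n, k)) / n
\end{verbatim}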
\section{Computation of $\sigma$; i.e., column standard deviations}
\label{computation-on-csd}

For each column $j=1,2,\dots, n$,
\begin{align}
\sigma_j & = \sqrt{E[X_{.j}^2] - E[X_{.j}]^2} \\
& = \sqrt{\frac{1}{n}\sum_{i=1}^{n}X_{ij}^2 - \Big(\frac{1}{n}\sum_{i=1}^{n} X_{ij}\Big)^2}.
\end{align}
Hence, $\bm{\sigma} = [\sigma_1, \sigma_2, \dots, \sigma_n]^T \in \mathbb{R}^n$ becomes
\begin{align}
\bm{\sigma} & = \sqrt{E[X^2] - E[X]^2} \\
& = \sqrt{\frac{1}{n}\text{colSums}(X^2) - \Big(\frac{1}{n}\text{colSums}(X)\Big)^2} \\
& = \sqrt{\frac{1}{n}(X^2)^T \bm{1} - \Big(\frac{1}{n} X^T \bm{1}\Big)^2},
\end{align}
where all operations are elementwise, the first term uses \ref{computation-on-d}, and the second term is computed in \ref{computation-on-cm}. Note that in the algorithm, for computational convenience, we set the column standard deviation to 1 when the column has variance 0.

\section{Computation of $(\hat{X}^2)^T 1$; i.e., colSums($\hat{X}^2$)}
\label{computation-on-std-d}

We want to compute colSums($\hat{X}^2$), where $\hat{X}$ is the standardized version of $X$: each column is centered by its mean and scaled by its standard deviation, with $\bm{\mu} = [\mu_1, \mu_2, \ldots, \mu_n]^T \in \mathbb{R}^n$ and $\bm{\sigma} = [\sigma_1, \sigma_2, \dots, \sigma_n]^T \in \mathbb{R}^n$ as above. That is, $\hat{X} \in \mathbb{R}^{n \times n}$ is defined by
\begin{equation*}
\hat{X}_{ij} = (X_{ij} - \mu_j)/\sigma_j
\end{equation*}
for each $i = 1,2,\ldots,n$ and $j=1,2,\dots,n$, where $X = X_{k+1} = (\hat{D}_{k+1})^{-1}$ if the order is $k$. Expanding the square and summing over rows gives
\begin{align}
\sum_{i=1}^{n}\hat{X}_{ij}^2 = \frac{1}{\sigma_j^2}\sum_{i=1}^{n}(X_{ij} - \mu_j)^2 = \frac{1}{\sigma_j^2}\Big(\sum_{i=1}^{n}X_{ij}^2 - n\mu_j^2\Big),
\end{align}
so, with elementwise operations,
\begin{align}
(\hat{X}^2)^T \bm{1} = \frac{(X^2)^T \bm{1} - n\bm{\mu}^2}{\bm{\sigma}^2},
\end{align}
where the two ingredients are computed as in \ref{computation-on-d} and \ref{computation-on-cm}.

\section{Conclusion}

The computation details in sections \ref{computation-on-Xb} through \ref{computation-on-std-d} explain how we benefit from the special structure of the matrices arising in the trend filtering problem. With these formulas, we never need to form the design matrix explicitly, and the matrix computations required by the \susie algorithm can be carried out with $O(n)$ complexity.

\bibliography{trendfiltering_derivations}

\end{document}