Portfolio Optimization
Chapter 13: Index Tracking Portfolios

R code

Published

October 18, 2024

R code examples for Chapter 13 of the book:

Daniel P. Palomar (2024). Portfolio Optimization: Theory and Application. Cambridge University Press.

Loading packages

First load the packages used in the examples:

# basic finance
library(xts)                    # to manipulate time series of stock data
library(portfolioBacktest)      # to conduct backtests
library(pob)                    # book package with financial data
library(quantmod)               # to download market data from the Internet
 
# plotting
library(ggplot2)                # for nice plots
library(RcppRoll)               # fast rolling means (roll_meanr)

# optimization
library(quadprog)
library(CVXR)
library(sparseIndexTracking)    # to design index tracking portfolios
library(TRexSelector)           # T-Rex method for sparse regression with FDR control

What is index tracking?

Tracking of the S&P 500 index by the SPDR S&P 500 ETF:

# get data from Yahoo!Finance
library(quantmod)
SP500_prices  <- Ad(getSymbols("^GSPC", from = "2007-01-01", auto.assign = FALSE))
SPY_prices  <- Ad(getSymbols("SPY", from = "2007-01-01", auto.assign = FALSE))
SP500_SPY_prices <- cbind(SP500_prices, SPY_prices)
colnames(SP500_SPY_prices) <- c("S&P 500", "SPY")

ggplot(fortify(SP500_SPY_prices, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
  geom_line(linewidth = 1.0) +
  scale_y_log10() +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
  theme(legend.title = element_blank()) +
  labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")

Sparse index tracking

Sparse index tracking is a type of sparse regression problem formulated as \[ \begin{array}{ll} \underset{\w}{\textm{minimize}} & \frac{1}{T}\left\|\bm{r}^\textm{b} - \bm{X}\w\right\|^2_2 + \lambda \|\w\|_0\\ \textm{subject to} & \w \in \mathcal{W}, \end{array} \] where the matrix \(\bm{X}\) contains the returns of the constituent assets, the vector \(\bm{r}^\textm{b}\) contains the returns of the index to replicate, the parameter \(\lambda\) controls the level of sparsity in the portfolio, and \(\mathcal{W}\) is the constraint set which could be \(\{\w \mid \bm{1}^\T\w=1, \w\ge\bm{0} \}\).

We define some auxiliary functions:

Code
colCumProd <- function(X) {
  if (!is.xts(X))
    stop("Argument has to be an xts object.")
  xts(apply(X, 2, cumprod), index(X))
}

colCumSum <- function(X) {
  if (!is.xts(X))
    stop("Argument has to be an xts object.")
  xts(apply(X, 2, cumsum), index(X))
}

list2xts <- function(list, names = NULL) {
    n <- length(list)
    res <- NULL
    for (i in seq(n)) res <- cbind(res, list[[i]])
    res <- xts(res, index(list[[1]]))
    if (is.null(names))
      colnames(res) <- sapply(list, colnames)
    else
      colnames(res) <- names
    return(res)
}

To design sparse tracking portfolios, we can simply use the R package spIndexTrack. For completeness, here we have the simple iterative code based on the MM algorithm:

Code
library(quadprog)

# Recall that quadprog::solve.QP() solves:
#
#   minimize     -d'*x + 0.5*x'*D*x
#   subject to   A'*x =/>= b
#
sparse_tracking_via_MM <- function(X, r, lambda, w0 = NULL, max_iter = 5, thres = 1e-9, p = 1e-3, u = 1, verbose = FALSE) {
  T <- nrow(X)
  N <- ncol(X)
  X <- as.matrix(X)
  r <- as.matrix(r)
  
  # MM loop
  if (is.null(w0))
    w0 <- rep(1/N, N)
  w <- w0
  if (verbose)
    obj_value <- (1/T)*norm(r - X %*% w, "2")^2 + lambda * sum(log(1 + abs(w)/p)/log(1 + u/p))
  k <- 1
  while (k <= max_iter) {
    d <- 1/(log(1 + u/p)*(p + abs(w)))
    w_prev <- w
    # # Using CVXR:
    # w <- Variable(N)
    # prob <- Problem(Minimize((1/T)*sum((r - X %*% w)^2) + lambda * t(d) %*% w), 
    #                 constraints = list(w >= 0, w <= 1, sum(w)==1))
    # result <- solve(prob)
    # w <- as.vector(result$getValue(w))
    # Using quadprog:
    D <- 2* (1/T) * t(X) %*% X
    D <- D + 1e-8*mean(diag(D))*diag(N)
    dd <- (2/T)* as.vector(t(X) %*% r) - lambda*d
    tA  <- rbind(rep(-1, N), diag(N))  # sum(w) = 1 (-sum(w) >= -1) and w >= 0
    b   <- c(-1, rep(0, N))
    w <- tryCatch(solve.QP(Dmat = D, dvec = dd, Amat = t(tA), bvec = b, meq = 1)$solution,
                  warning = function(w) {if (verbose) message(w); return(NULL)}, 
                  error   = function(e) {if (verbose) message(e); return(NULL)})
    if (verbose)
      obj_value <- c(obj_value, (1/T)*norm(r - X %*% w, "2")^2 + lambda * sum(log(1 + abs(w)/p)/log(1 + u/p)))

    if(norm(w - w_prev, "2")/norm(w_prev, "2") < 1e-3) break
    k <- k + 1
  }
  # thresholding
  w[w < thres] <- 0
  w <- w / sum(w)
  if (!verbose)
    return(w)
  else
    return(list(w = w, obj_value = obj_value))
}

Fixed versus time-varying portfolio

The tracking error (TE) of a fixed portfolio \(\w_t=\w\) is \[ \textm{TE} = \frac{1}{T}\left\|\bm{r}^\textm{b} - \bm{X}\w\right\|^2_2, \] where \(\bm{X} \in \R^{T\times N}\) is the matrix containing the \(T\) return vectors \(\bm{r}_t\) of the \(N\) constituent assets along the rows and \(\bm{r}^\textm{b} \in \R^{T}\) is the vector containing the \(T\) returns of the index.

Now, it the portfolio is not rebalanced, it will naturally evolve over time as \[ \w_t = \frac{\w_{t-1} \odot \left(\bm{1} + \bm{r}_t\right)}{\w_{t-1}^\T \left(\bm{1} + \bm{r}_t\right)} \approx \w_{0} \odot \bm{\alpha}_t, \] where \(\w_{0}\) is the initial portfolio and \[ \bm{\alpha}_t = \prod_{t'=1}^{t}\frac{\bm{1} + \bm{r}_{t'}}{1 + r_{t'}^\textm{b}} \] denotes weights based on the cumulative returns.The resulting tracking error can be conveniently written as \[ \textm{TE}^\textm{time-varying} = \frac{1}{T}\left\|\bm{r}^\textm{b} - \tilde{\bm{X}}\w\right\|^2_2, \] where now \(\tilde{\bm{X}}\) contains the weighted returns \(\tilde{\bm{r}}_t = \bm{r}_t \odot \bm{\alpha}_{t-1}\) along the rows and \(\w\) denotes the initial portfolio \(\w_0\).

The code to compute \(\bm{\alpha}_{t}\) is straightforward:

Code
compute_alpha <- function(X, r) {
  X_cum <- colCumProd(1 + X)
  r_cum <- colCumProd(1 + r)
  alpha <- X_cum / as.vector(r_cum)
  alpha_lagged <- lag(alpha)
  alpha_lagged[1, ] <- 1
  return(list(alpha             = alpha, 
              alpha_lagged      = alpha_lagged, 
              last_alpha_lagged = as.vector(last(alpha_lagged)), 
              X_tilde           = X * alpha_lagged))  
}

We now perform a backtest to assess the tracking error over time with \(K=20\) active assets under the approximation (or assumption) of a fixed portfolio \(\w\) and with the more accurate approximation of a time-varying portfolio, which shows improved results.

Code
library(pob)
library(sparseIndexTracking)
library(RcppRoll)  # fast rolling means (roll_meanr)
data(SP500_2015to2020)
T <- nrow(SP500_2015to2020$stocks)  # ["::2020-02", ] excluded the COVID19 crisis
N <- ncol(SP500_2015to2020$stocks)
T_trn <- 2*252


tracking_using_plain_returns_wfixed <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  #spIndexTrack(X, r, lambda = 7e-7)
  sparse_tracking_via_MM(X, r, lambda = 5e-7, max_iter = 10)
}

tracking_using_plain_returns_wt <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  alpha <- compute_alpha(X, r)

  # design portfolio
  #w0 <- spIndexTrack(X_cumweighted, r, lambda = 8e-7, p_neg_exp = 5, max_iter = 100) # either 8e-7 or 8.5e-7
  w0 <- sparse_tracking_via_MM(alpha$X_tilde, r, lambda = 5e-7, max_iter = 10)
  w <- w0 * alpha$last_alpha_lagged
  return(w/sum(w))
}


# perform single backtest
bt <- portfolioBacktest(list("Assuming fixed portfolio"        = tracking_using_plain_returns_wfixed,
                             "Assuming time-varying portfolio" = tracking_using_plain_returns_wt),
                        dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
                                                              "index"  = SP500_2015to2020$index)),
                        benchmarks = c("index"),
                        lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
Backtesting 2 portfolios over 1 datasets (periodicity = daily data)...
Backtesting benchmarks...
Code
# make sure there was no error in the backtest
listPortfoliosWithFailures(bt)
num_pf <- length(bt)
bt <- bt[c(num_pf, 1:(num_pf-1))]  # move SP500 to the first position
bt <- lapply(bt, function(pf) {pf$dataset1$X_lin <- NULL; return(pf)})  # reduce object size

We can then plot the tracking error and cardinality over time:

Code
# # plot wealth
# B0 <- as.numeric(SP500_2015to2020$index[T_trn, ])
# price_all <- B0 * list2xts(lapply(bt, function(bt_single) bt_single$dataset1$wealth), names = names(bt))
# ggplot(fortify(price_all, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
#   geom_line(linewidth = 1) +
#   scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
#   theme(legend.title = element_text()) + labs(color = "Portfolio") +
#   labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")


#
# plot smoothed TE
#
ret_all <- list2xts(lapply(bt, function(bt_single) bt_single$dataset1$return), names = names(bt))
TE_inst <- (ret_all[, -1] - as.vector(ret_all[, 1]))^2
TE_smooth <- xts(roll_meanr(as.matrix(TE_inst), n = 252, fill = NA), index(TE_inst))
TE_smooth <- na.omit(TE_smooth)

ggplot(fortify(TE_smooth, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
  geom_line(linewidth = 1) +
  theme(legend.title = element_blank()) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
  labs(title = "Tracking error over time", x= NULL, y = "TE")

Code
#
# plot cardinality
#
cardinality <- list2xts(lapply(bt[-1], function(bt_single) xts(rowSums(bt_single$dataset1$w_bop > 1e-5), index(bt_single$dataset1$w_bop))), names = names(bt[-1]))
cardinality <- cardinality[index(TE_smooth), ]
ggplot(fortify(cardinality, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
  geom_line(linewidth = 1) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
  theme(legend.title = element_blank()) +
  labs(title = "Cardinality over time", x= NULL, y = "cardinality")

Code
t(rbind("TE"          = colMeans(TE_inst),
        "cardinality" = colMeans(cardinality)))
                                          TE cardinality
Assuming fixed portfolio        4.626860e-06    20.92993
Assuming time-varying portfolio 3.712547e-06    21.76642

Plain versus cumulative returns

Minimizing errors in period-by-period returns does not guarantee better long-term cumulative return tracking, as return errors can accumulate unevenly over time. To control long-term cumulative return deviations, the error measure should use long-term or cumulative returns. However, for short-term hedging, period-by-period return tracking is essential, as outperformance is not the objective.

The price error can be measured by the tracking error in terms of the cumulative returns as \[ \textm{TE}^\textm{cum} = \frac{1}{T}\left\|\bm{r}^\textm{b,cum} - \bm{X}^\textm{cum}\w\right\|^2_2, \] where \(\bm{X}^\textm{cum}\) (similarly \(\bm{r}^\textm{b,cum}\)) denotes the cumulative returns, whose rows can be obtained as \[ \bm{r}^\textm{cum}_t \approx \sum_{t'=1}^{t}\bm{r}_{t'}. \]

We now perform a backtest to assess the tracking error over time using plain returns and cumulative returns:

Code
library(pob)
library(sparseIndexTracking)
library(RcppRoll)  # fast rolling means
data(SP500_2015to2020)
T <- nrow(SP500_2015to2020$stocks)  # ["::2020-02", ] exclused the COVID19 crisis
N <- ncol(SP500_2015to2020$stocks)
T_trn <- 2*252


tracking_using_plain_returns_wfixed <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  sparse_tracking_via_MM(X, r, lambda = 5e-7, max_iter = 10)
}

tracking_using_cumreturns_wfixed <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  sparse_tracking_via_MM(X = colCumSum(X), 
                         r = colCumSum(r), 
                         lambda = 20e-7, max_iter = 10)
}

tracking_using_cumreturns_wt <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  alpha <- compute_alpha(X, r)
  w0 <- sparse_tracking_via_MM(X = colCumSum(alpha$X_tilde), 
                               r = colCumSum(r), lambda = 15e-7, max_iter = 10)
  w <- w0 * alpha$last_alpha_lagged
  return(w/sum(w))
}
  
# tracking_using_cumprodreturns_wfixed <- function(dataset, ...) {
#   X <- dataset$prices/lag(dataset$prices)[-1] - 1
#   r <- dataset$index/lag(dataset$index)[-1] - 1
#   # sparse_tracking_via_MM(X = colCumProd(1 + X),
#   #                        r = colCumProd(1 + r),
#   #                        lambda = 20e-7, max_iter = 10)
#   sparse_tracking_via_MM(X = exp(colCumSum(X)),
#                          r = exp(colCumSum(r)),
#                          lambda = 20e-7, max_iter = 10)
# }



# perform single backtest
bt <- portfolioBacktest(list("Tracking using plain returns"        = tracking_using_plain_returns_wfixed,
                             "Tracking using cumulative returns"   = tracking_using_cumreturns_wfixed),
                        dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
                                                              "index"  = SP500_2015to2020$index)),
                        benchmarks = c("index"),
                        lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
Backtesting 2 portfolios over 1 datasets (periodicity = daily data)...
Backtesting benchmarks...
Code
# make sure there was no error in the backtest
listPortfoliosWithFailures(bt)
num_pf <- length(bt)
bt <- bt[c(num_pf, 1:(num_pf-1))]  # move SP500 to the first position
bt <- lapply(bt, function(pf) {pf$dataset1$X_lin <- NULL; return(pf)})  # reduce object size

We can then plot the price tracked over time:

Code
#
# plot wealth
#
B0 <- as.numeric(SP500_2015to2020$index[T_trn, ])
price_all <- B0 * list2xts(lapply(bt, function(bt_single) bt_single$dataset1$wealth), names = names(bt))
    
ggplot(fortify(price_all, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
  geom_line(linewidth = 1) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
  theme(legend.title = element_text()) + labs(color = "Portfolio") +
  labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")

Convergence of the iterative reweighted \(\ell_1\)-norm method

The convergence of the iterative reweighted \(\ell_1\)-norm method (based on the MM framework) for sparse index tracking is very fast:

Code
library(pob)
data(SP500_2015to2020)

obj_value <- sparse_tracking_via_MM(X = SP500_2015to2020$stocks/lag(SP500_2015to2020$stocks)[-1] - 1,
                                    r = SP500_2015to2020$index/lag(SP500_2015to2020$index)[-1] - 1,
                                    lambda = 5e-7, max_iter = 10, verbose = TRUE)$obj_value


# plot
last_iter <- length(obj_value) - 1
data.frame("k"               = 0:last_iter,
           "objective value" = obj_value, 
           check.names = FALSE) |>
  ggplot(aes(x = k, y = `objective value`)) +
  geom_line(linewidth = 1.2, color = "blue") +
  scale_x_continuous(breaks = 0:last_iter) +
  labs(title = "Convergence over iterations", x= "k", y = "")

Comparison of algorithms

We now compare the tracking of the S&P 500 index based on different tracking methods, namely:

  • the naive two-step approach with proportional weights to the index definition;
  • the two-step approach with refitted weights;
  • the iterative reweighted \(\ell_1\)-norm method; and
  • the MIP formulation (although the computational complexity is too high).

We now perform a backtest to assess the tracking error over time of the S&P 500 index with \(K=20\) active assets for different algorithms. The tracking portfolios are computed on a rolling window basis with a lookback period of two years and recomputed every six months.

Code
library(CVXR)
library(pob)
library(sparseIndexTracking)
library(RcppRoll)  # fast rolling means
data(SP500_2015to2020)
T <- nrow(SP500_2015to2020$stocks)
N <- ncol(SP500_2015to2020$stocks)
T_trn <- 2*252


full_tracking <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  N <- ncol(X)
  # dense portfolio
  b <- Variable(N)
  prob <- Problem(Minimize(sum((as.matrix(r) - as.matrix(X) %*% b)^2)), 
                  constraints = list(b >= 0, sum(b)==1))
  result <- solve(prob)
  b <- as.vector(result$getValue(b))  
  return(b)
}

two_stage_naive_tracking <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  N <- ncol(X)
  # first, get the dense portfolio as the reference
  b <- Variable(N)
  prob <- Problem(Minimize(sum((as.matrix(r) - as.matrix(X) %*% b)^2)), 
                  constraints = list(b >= 0, sum(b)==1))
  result <- solve(prob)
  b <- as.vector(result$getValue(b))
  # then, keep the K strongest
  idx_Klargest <- sort(b, decreasing = TRUE, index.return = TRUE)$ix[1:K]
  w <- rep(0, N)
  w[idx_Klargest] <- b[idx_Klargest]
  w <- w/sum(w)
  return(w)
}

two_stage_refitted_tracking <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  N <- ncol(X)
  # first, get the dense portfolio as the reference
  b <- Variable(N)
  prob <- Problem(Minimize(sum((as.matrix(r) - as.matrix(X) %*% b)^2)), 
                  constraints = list(b >= 0, sum(b)==1))
  result <- solve(prob)
  b <- as.vector(result$getValue(b))    
  # then, keep the K strongest but reoptimizing weights
  idx_Klargest <- sort(b, decreasing = TRUE, index.return = TRUE)$ix[1:K]
  wK <- Variable(K)
  prob <- Problem(Minimize(sum((as.matrix(r) - as.matrix(X[, idx_Klargest]) %*% wK)^2)), 
                  constraints = list(wK >= 0, sum(wK)==1))
  result <- solve(prob)
  w <- rep(0, N)
  w[idx_Klargest] <- as.vector(result$getValue(wK))
  return(w)
}


iterative_reweighted_l1_norm_tracking <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  sparse_tracking_via_MM(X, r, lambda = lmd, max_iter = 10)
}


# perform single backtest
K <- 20
lmd <- 5e-7
bt <- portfolioBacktest(list("Full tracking"                             = full_tracking,
                             "Two-step design with proportional weights" = two_stage_naive_tracking,
                             "Two-step design with refitted weights"     = two_stage_refitted_tracking,
                             "Iterative reweighted $l_1$-norm method"    = iterative_reweighted_l1_norm_tracking),
                        dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
                                                              "index"  = SP500_2015to2020$index)),
                        benchmarks = c("index"),
                        lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
Backtesting 4 portfolios over 1 datasets (periodicity = daily data)...
Backtesting benchmarks...
Code
# make sure there was no error in the backtest
listPortfoliosWithFailures(bt)
num_pf <- length(bt)
bt <- bt[c(num_pf, 1:(num_pf-1))]  # move SP500 to the first position
bt <- lapply(bt, function(pf) {pf$dataset1$X_lin <- NULL; return(pf)})  # reduce object size

We can then plot the price, tracking error, and cardinality over time:

Code
#
# plot wealth
#
B0 <- as.numeric(SP500_2015to2020$index[T_trn, ])
price_all <- B0 * list2xts(lapply(bt, function(bt_single) bt_single$dataset1$wealth), names = names(bt))
ggplot(fortify(price_all, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
  geom_line(linewidth = 1) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
  theme(legend.title = element_text()) + labs(color = "Portfolio") +
  labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")

Code
#
# plot smoothed TE
#
ret_all <- list2xts(lapply(bt, function(bt_single) bt_single$dataset1$return), names = names(bt))
TE_inst <- (ret_all[, -1] - as.vector(ret_all[, 1]))^2
TE_smooth <- xts(roll_meanr(as.matrix(TE_inst), n = 252, fill = NA), index(TE_inst))
TE_smooth <- na.omit(TE_smooth)
TE_smooth <- TE_smooth[, -1]   # remove the full tracking
ggplot(fortify(TE_smooth, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
  geom_line(linewidth = 1) +
  theme(legend.title = element_blank()) + #labs(color = "Tracking methods") +  
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
  labs(title = "Tracking error over time", x= NULL, y = "TE")

Code
#
# plot cardinality
#
cardinality <- list2xts(lapply(bt[-1], function(bt_single) xts(rowSums(bt_single$dataset1$w_bop > 1e-5), index(bt_single$dataset1$w_bop))), names = names(bt[-1]))
cardinality <- cardinality[index(TE_smooth), ]
cardinality <- cardinality[, -1]   # remove the full tracking
ggplot(fortify(cardinality, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
  geom_line(linewidth = 1) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
  theme(legend.title = element_blank()) +
  labs(title = "Cardinality over time", x= NULL, y = "cardinality")

Code
t(rbind("TE"          = colMeans(TE_inst),
        "cardinality" = colMeans(cardinality)))
                                                    TE cardinality
Full tracking                             6.620929e-07    20.00000
Two-step design with proportional weights 7.717688e-06    19.63066
Two-step design with refitted weights     5.731222e-06    20.92993
Iterative reweighted $l_1$-norm method    4.626860e-06    20.00000

Comparison of formulations

We consider again the comparison between assuming a fixed portfolio and a time-varying one in the definition of tracking error. More specifically, we now study the tradeoff of tracking error versus cardinality. Indeed, the assumption of the time-varying portfolio is more accurate.

Code
library(pob)
library(sparseIndexTracking)
library(RcppRoll)  # fast rolling means
data(SP500_2015to2020)
T_trn <- 2*252


tracking_using_plain_returns_wfixed <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  sparse_tracking_via_MM(X, r, lambda = lmd, max_iter = 10)
}

tracking_using_plain_returns_wt <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  alpha <- compute_alpha(X, r)

  # design portfolio
  w0 <- sparse_tracking_via_MM(alpha$X_tilde, r, lambda = lmd, max_iter = 10)
  w <- w0 * alpha$last_alpha_lagged
  return(w/sum(w))
}


#
# Sweeping loop
#
lmd_sweep <- 1e-5 * c(1, 2, 5, 10, 20, 50, 100, 200, 1000)/300
df <- data.frame()
for (lmd in lmd_sweep) {
  bt <- portfolioBacktest(list("Assuming fixed portfolio"        = tracking_using_plain_returns_wfixed,
                               "Assuming time-varying portfolio" = tracking_using_plain_returns_wt),
                          dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
                                                                "index"  = SP500_2015to2020$index)),
                          benchmarks = c("index"),
                          lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
  listPortfoliosWithFailures(bt)
  num_pf <- length(bt)
  bt <- bt[c(num_pf, 1:(num_pf-1))]  # move index to the first position

  # store TE and cardinality
  ret_all <- list2xts(lapply(bt, function(bt_single) bt_single$dataset1$return), names = names(bt))
  TE_all <- colMeans((ret_all[, -1] - as.vector(ret_all[, 1]))^2)
  cardinality_all <- colMeans(list2xts(lapply(bt[-1], function(bt_single)
    xts(rowSums(bt_single$dataset1$w_bop > 1e-5), index(bt_single$dataset1$w_bop))), names = names(bt[-1])))

  df <- rbind(df, data.frame("method"  = names(TE_all),
                             "TE"      = TE_all,
                             "K"       = cardinality_all))
}
rownames(df) <- NULL


# plot
df$method <- factor(df$method, levels = unique(df$method))
ggplot(df, aes(x = K, y = TE, color = method, linetype = method)) +
  geom_line(linewidth = 1.2) +
  theme(legend.title = element_blank()) +
  labs(title = "Tracking error versus sparsity level", x= "K (active assets)", y = NULL)

Enhanced index tracking

We now compare the following tracking error (TE) measures to track the S&P 500 index:

  • basic TE: \[ \textm{TE}(\w) = \frac{1}{T}\left\|\bm{r}^\textm{b} - \bm{X}\w\right\|^2_2; \]

  • robust Huberized TE: \[ \textm{Hub-TE}(\w) = \frac{1}{T}\sum_{t=1}^T \phi^{\textm{hub}}(r_t^\textm{b} - \bm{X}_{t,:}\w), \] where \[\phi^{\textm{hub}}(x)=\begin{cases} x^{2}, & \quad|x|\leq M\\ M\left(2|x| - M\right), & \quad|x|>M; \end{cases}\]

  • \(\ell_1\)-norm TE: \[ \textm{TE}_1(\w) = \frac{1}{T}\left\|\bm{r}^\textm{b} - \bm{X}\w\right\|_1; \]

  • downside risk (DR) error measure: \[ \textm{DR}(\w) = \frac{1}{T}\left\|\left(\bm{r}^\textm{b} - \bm{X}\w\right)^+\right\|^2_2, \] where \((\cdot)^+\triangleq\textm{max}(0,\cdot)\).

We will use the R package spIndexTrack to obtain the designs for the backtest:

Code
library(pob)
library(sparseIndexTracking)
data(SP500_2015to2020)
T <- nrow(SP500_2015to2020$stocks)
N <- ncol(SP500_2015to2020$stocks)
T_trn <- 2*252


TE_index_tracking <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  
  #w <- spIndexTrack(X, r, lambda = 6e-7, u = 0.5)
  w <- spIndexTrack(colCumSum(X), colCumSum(r), lambda = 150*6e-7, u = 0.5) #100*6e-7 - 
  return(w)
}

DR_index_tracking <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  
  #w <- spIndexTrack(X, r, lambda = 1.8e-7, u = 0.5, measure = "dr")
  w <- spIndexTrack(colCumSum(X), colCumSum(r), lambda = 190*1.8e-7, u = 0.5, measure = "dr") #200*1.8e-7 - 250*1.8e-7
  return(w)
}

TE1_index_tracking <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  
  #w_TE1 <- spIndexTrack(X, r, lambda = 6e-7, u = 0.5)  # initial point
  w_TE1 <- spIndexTrack(colCumSum(X), colCumSum(r), lambda = 100*6e-7, u = 0.5)
  for (k in 1:5) {
    alpha <- as.vector(sqrt(1/(2*abs(r - X %*% w_TE1))))
    #w_TE1 <- spIndexTrack(alpha*X, alpha*r, lambda = 6.5e-4, u = 0.5) # 6 -- 7
    w_TE1 <- spIndexTrack(colCumSum(alpha*X), colCumSum(alpha*r), lambda = 150*6.5e-4, u = 0.5) #100*6.5e-4 - 
  }
  return(w_TE1)
}

HubTE_index_tracking <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  
  #w <- spIndexTrack(X, r, lambda = 5e-8, u = 0.5, measure = "hete", hub = 1e-4)
  w <- spIndexTrack(colCumSum(X), colCumSum(r), lambda = 50*5e-8, u = 0.5, measure = "hete", hub = 1e-4) # 50*5e-8 - 70*5e-8
  return(w)
}


# perform single backtest
bt <- portfolioBacktest(list("TE based design"     = TE_index_tracking,
                             "Hub-TE based design" = HubTE_index_tracking,
                             "TE_1 based design" = TE1_index_tracking,
                             "DR based design"     = DR_index_tracking),
                        dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
                                                              "index"  = SP500_2015to2020$index)),
                        benchmarks = c("index"),
                        lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
Backtesting 4 portfolios over 1 datasets (periodicity = daily data)...
Backtesting benchmarks...
Code
listPortfoliosWithFailures(bt)
bt <- bt[c(length(bt), 1:(length(bt)-1))]  # move SP500 to the first position
bt <- lapply(bt, function(pf) {pf$dataset1$X_lin <- NULL; return(pf)})  # reduce object size


#
# plot wealth
#
B0 <- as.numeric(SP500_2015to2020$index[T_trn, ])
price_all <- B0 * list2xts(lapply(bt, function(bt_single) bt_single$dataset1$wealth), names = names(bt))
ggplot(fortify(price_all, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
  geom_line(linewidth = 1) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
  theme(legend.title = element_text()) + labs(color = "Portfolio") +
  labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")

Automatic sparsity control

The sparse index tracking formulation includes a regularization term \(\lambda \|\w\|_0\) in the objective or as a constraint \(\|\w\|_0 \le k\), where the hyper-parameter \(\lambda\) or \(k\) controls the sparsity level. Adjusting this hyper-parameter allows achieving different points on the error versus sparsity trade-off curve.

In practice, the goal is to choose a proper operating point on this trade-off curve without computing the entire curve. The false discovery rate (FDR), which is the probability of wrongly including a variable, is a key quantity for this decision. Controlling the FDR is the ideal way to decide whether to include a variable.

The T-Rex method, based on “dummy” variables, provides a practical approach for FDR control in sparse index tracking. Instead of tuning the sparsity hyper-parameter, T-Rex automatically selects the assets to include by controlling the FDR, avoiding the selection of irrelevant assets that are highly correlated with already selected ones. The T-Rex method is efficientlyu implemented in the R package TRexSelector (and soon to come in Python) (Machkour et al., 2022).

We now backest the tracking portfolios obtained from the sparse penalized regression formulation with the FDR-controlling T-Rex method (Machkour et al., 2024). The tracking portfolios are computed on a rolling window basis with a lookback period of two years and recomputed every six months.

Code
library(xts)
library(pob)
library(sparseIndexTracking)
library(RcppRoll)  # fast rolling means
library(quadprog)
data(SP500_2015to2020)
T <- nrow(SP500_2015to2020$stocks)
N <- ncol(SP500_2015to2020$stocks)
T_trn <- 2*252


#
# Define different tracking methods
#
iterative_reweighted_l1_norm_tracking_lmd1e_8 <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  spIndexTrack(X, r, lambda = 1e-8, u = 0.5)
  #spIndexTrack(colCumSum(X), colCumSum(r), lambda = 150*1e-8, u = 0.5)
}

iterative_reweighted_l1_norm_tracking_lmd1e_7 <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  spIndexTrack(X, r, lambda = 1e-7, u = 0.5)
  #spIndexTrack(colCumSum(X), colCumSum(r), lambda = 150*1e-7, u = 0.5)
}

iterative_reweighted_l1_norm_tracking_lmd1e_6 <- function(dataset, ...) {
  X <- dataset$prices/lag(dataset$prices)[-1] - 1
  r <- dataset$index/lag(dataset$index)[-1] - 1
  spIndexTrack(X, r, lambda = 1e-6, u = 0.5)
  #spIndexTrack(colCumSum(X), colCumSum(r), lambda = 150*1e-6, u = 0.5)
}

TRex_tracking <- function(dataset, ...) {
  X <- as.matrix(dataset$prices/lag(dataset$prices)[-1] - 1)
  r <- as.vector(dataset$index/lag(dataset$index)[-1] - 1)
  N <- ncol(X)
  X_cum <- apply(X, 2, function(x) cumsum(x))
  r_cum <- cumsum(r)

  # Step 1: T-Rex
  trex_selected <- TRexSelector::trex(X = tail(X, 120), y = tail(r, 120),
  #trex_selected <- TRexSelector::trex(X = X_cum, y = r_cum,                                      
                                      tFDR = 0.3, method = "trex+DA+NN",
                                      verbose = FALSE)$selected_var
  idx_trex <- which(trex_selected == 1)
  K <- length(idx_trex)
  # Step 2: Refit selected assets
  wK <- solve.QP(Dmat = 2 * t(X[, idx_trex]) %*% X[, idx_trex],
                 dvec = 2 * t(X[, idx_trex]) %*% r,
                 Amat = cbind(rep(1, K), diag(K)), bvec = c(1, rep(0, K)), meq = 1)$solution
  w <- rep(0, N)
  w[idx_trex] <- wK
  return(w)
}
Code
# perform single backtest
bt <- portfolioBacktest(
  list("Sparse penalized regression (lambda = 1e-8)" = iterative_reweighted_l1_norm_tracking_lmd1e_8,
       "Sparse penalized regression (lambda = 1e-7)" = iterative_reweighted_l1_norm_tracking_lmd1e_7,
       "Sparse penalized regression (lambda = 1e-6)" = iterative_reweighted_l1_norm_tracking_lmd1e_6,
       "T-Rex"         = TRex_tracking),
  dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
                                        "index"  = SP500_2015to2020$index)),
  benchmarks = c("index"),
  lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
listPortfoliosWithFailures(bt)
bt <- bt[c(length(bt), 1:(length(bt)-1))]  # move SP500 to the first position
bt <- lapply(bt, function(pf) {pf$dataset1$X_lin <- NULL; return(pf)})  # reduce object size

We can then plot the price, tracking error, and cardinality over time. As can be seen, the traditional sparse regression formulation is very sensitive to the choice of the parameter \(\lambda\), producing very different results in terms of tracking error and cardinality. On the other hand, the FDR-controlling T-Rex method automatically chooses the appropriate sparsity level without having to tune any parameter. The computational cost of T-Rex is slightly higher than that of the traditional method for a fixed \(\lambda\) but lower than the traditional method for a whole range of values for \(\lambda\).

Code
#
# plot wealth
#
B0 <- as.numeric(SP500_2015to2020$index[T_trn, ])
price_all <- B0 * list2xts(lapply(bt, function(bt_single) bt_single$dataset1$wealth), names = names(bt))
ggplot(fortify(price_all, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
  geom_line(linewidth = 1) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
  theme(legend.title = element_text()) + labs(color = "Portfolio") +
  labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")

Code
#
# plot smoothed cumulative TE
#
cumTE_inst <- ((price_all[, -1] - as.vector(price_all[, 1]))/B0)^2
cumTE_smooth <- xts(roll_meanr(as.matrix(cumTE_inst), n = 21, fill = NA), index(cumTE_inst))
cumTE_smooth <- na.omit(cumTE_smooth)
ggplot(fortify(cumTE_smooth, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
  geom_line(linewidth = 1) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
  theme(legend.title = element_blank()) +
  labs(title = "Tracking error (cum-returns) over time", x= NULL, y = "TE (cum)")

Code
#
# plot smoothed period-by-period TE
#
ret_all <- list2xts(lapply(bt, function(bt_single) bt_single$dataset1$return), names = names(bt))
TE_inst <- (ret_all[, -1] - as.vector(ret_all[, 1]))^2
TE_smooth <- xts(roll_meanr(as.matrix(TE_inst), n = 252, fill = NA), index(TE_inst))
TE_smooth <- na.omit(TE_smooth)
ggplot(fortify(TE_smooth, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
  geom_line(linewidth = 1) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
  theme(legend.title = element_blank()) +
  labs(title = "Tracking error (plain-returns) over time", x= NULL, y = "TE")

Code
#
# plot cardinality
#
cardinality <- list2xts(lapply(bt[-1], function(bt_single) xts(rowSums(bt_single$dataset1$w_bop > 1e-5), index(bt_single$dataset1$w_bop))), names = names(bt[-1]))
cardinality <- cardinality[index(TE_smooth), ]
ggplot(fortify(cardinality, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
  geom_line(linewidth = 1) +
  scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
  theme(legend.title = element_blank()) +
  labs(title = "Cardinality over time", x= NULL, y = "cardinality")

Code
t(rbind("TE"          = colMeans(TE_inst),
        "cardinality" = colMeans(cardinality)))
                                                      TE cardinality
Sparse penalized regression (lambda = 1e-8) 9.195354e-07   114.95474
Sparse penalized regression (lambda = 1e-7) 1.712832e-06    47.84088
Sparse penalized regression (lambda = 1e-6) 6.203776e-06    15.02774
T-Rex                                       3.638299e-06    52.80146

References

Machkour, J., Palomar, D. P., and Muma, M. (2024). FDR-controlled portfolio optimization for sparse financial index tracking. Available at arXiv.
Machkour, J., Tien, S., Palomar, D. P., and Muma, M. (2022). TRexSelector: T-rex selector: High-dimensional variable selection & FDR control.