# basic finance
library(xts) # to manipulate time series of stock data
library(portfolioBacktest) # to conduct backtests
library(pob) # book package with financial data
library(quantmod) # to download market data from the Internet
# plotting
library(ggplot2) # for nice plots
library(RcppRoll) # fast rolling means (roll_meanr)
# optimization
library(quadprog)
library(CVXR)
library(sparseIndexTracking) # to design index tracking portfolios
library(TRexSelector) # T-Rex method for sparse regression with FDR control
Portfolio Optimization
Chapter 13: Index Tracking Portfolios
R code
R code examples for Chapter 13 of the book:
Daniel P. Palomar (2024). Portfolio Optimization: Theory and Application. Cambridge University Press.
Loading packages
First load the packages used in the examples:
What is index tracking?
Tracking of the S&P 500 index by the SPDR S&P 500 ETF:
# get data from Yahoo!Finance
library(quantmod)
<- Ad(getSymbols("^GSPC", from = "2007-01-01", auto.assign = FALSE))
SP500_prices <- Ad(getSymbols("SPY", from = "2007-01-01", auto.assign = FALSE))
SPY_prices <- cbind(SP500_prices, SPY_prices)
SP500_SPY_prices colnames(SP500_SPY_prices) <- c("S&P 500", "SPY")
ggplot(fortify(SP500_SPY_prices, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
geom_line(linewidth = 1.0) +
scale_y_log10() +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
theme(legend.title = element_blank()) +
labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")
Sparse index tracking
Sparse index tracking is a type of sparse regression problem formulated as \[ \begin{array}{ll} \underset{\w}{\textm{minimize}} & \frac{1}{T}\left\|\bm{r}^\textm{b} - \bm{X}\w\right\|^2_2 + \lambda \|\w\|_0\\ \textm{subject to} & \w \in \mathcal{W}, \end{array} \] where the matrix \(\bm{X}\) contains the returns of the constituent assets, the vector \(\bm{r}^\textm{b}\) contains the returns of the index to replicate, the parameter \(\lambda\) controls the level of sparsity in the portfolio, and \(\mathcal{W}\) is the constraint set which could be \(\{\w \mid \bm{1}^\T\w=1, \w\ge\bm{0} \}\).
We define some auxiliary functions:
Code
<- function(X) {
colCumProd if (!is.xts(X))
stop("Argument has to be an xts object.")
xts(apply(X, 2, cumprod), index(X))
}
<- function(X) {
colCumSum if (!is.xts(X))
stop("Argument has to be an xts object.")
xts(apply(X, 2, cumsum), index(X))
}
<- function(list, names = NULL) {
list2xts <- length(list)
n <- NULL
res for (i in seq(n)) res <- cbind(res, list[[i]])
<- xts(res, index(list[[1]]))
res if (is.null(names))
colnames(res) <- sapply(list, colnames)
else
colnames(res) <- names
return(res)
}
To design sparse tracking portfolios, we can simply use the R package spIndexTrack
. For completeness, here we have the simple iterative code based on the MM algorithm:
Code
library(quadprog)
# Recall that quadprog::solve.QP() solves:
#
# minimize -d'*x + 0.5*x'*D*x
# subject to A'*x =/>= b
#
<- function(X, r, lambda, w0 = NULL, max_iter = 5, thres = 1e-9, p = 1e-3, u = 1, verbose = FALSE) {
sparse_tracking_via_MM <- nrow(X)
T <- ncol(X)
N <- as.matrix(X)
X <- as.matrix(r)
r
# MM loop
if (is.null(w0))
<- rep(1/N, N)
w0 <- w0
w if (verbose)
<- (1/T)*norm(r - X %*% w, "2")^2 + lambda * sum(log(1 + abs(w)/p)/log(1 + u/p))
obj_value <- 1
k while (k <= max_iter) {
<- 1/(log(1 + u/p)*(p + abs(w)))
d <- w
w_prev # # Using CVXR:
# w <- Variable(N)
# prob <- Problem(Minimize((1/T)*sum((r - X %*% w)^2) + lambda * t(d) %*% w),
# constraints = list(w >= 0, w <= 1, sum(w)==1))
# result <- solve(prob)
# w <- as.vector(result$getValue(w))
# Using quadprog:
<- 2* (1/T) * t(X) %*% X
D <- D + 1e-8*mean(diag(D))*diag(N)
D <- (2/T)* as.vector(t(X) %*% r) - lambda*d
dd <- rbind(rep(-1, N), diag(N)) # sum(w) = 1 (-sum(w) >= -1) and w >= 0
tA <- c(-1, rep(0, N))
b <- tryCatch(solve.QP(Dmat = D, dvec = dd, Amat = t(tA), bvec = b, meq = 1)$solution,
w warning = function(w) {if (verbose) message(w); return(NULL)},
error = function(e) {if (verbose) message(e); return(NULL)})
if (verbose)
<- c(obj_value, (1/T)*norm(r - X %*% w, "2")^2 + lambda * sum(log(1 + abs(w)/p)/log(1 + u/p)))
obj_value
if(norm(w - w_prev, "2")/norm(w_prev, "2") < 1e-3) break
<- k + 1
k
}# thresholding
< thres] <- 0
w[w <- w / sum(w)
w if (!verbose)
return(w)
else
return(list(w = w, obj_value = obj_value))
}
Fixed versus time-varying portfolio
The tracking error (TE) of a fixed portfolio \(\w_t=\w\) is \[ \textm{TE} = \frac{1}{T}\left\|\bm{r}^\textm{b} - \bm{X}\w\right\|^2_2, \] where \(\bm{X} \in \R^{T\times N}\) is the matrix containing the \(T\) return vectors \(\bm{r}_t\) of the \(N\) constituent assets along the rows and \(\bm{r}^\textm{b} \in \R^{T}\) is the vector containing the \(T\) returns of the index.
Now, it the portfolio is not rebalanced, it will naturally evolve over time as \[ \w_t = \frac{\w_{t-1} \odot \left(\bm{1} + \bm{r}_t\right)}{\w_{t-1}^\T \left(\bm{1} + \bm{r}_t\right)} \approx \w_{0} \odot \bm{\alpha}_t, \] where \(\w_{0}\) is the initial portfolio and \[ \bm{\alpha}_t = \prod_{t'=1}^{t}\frac{\bm{1} + \bm{r}_{t'}}{1 + r_{t'}^\textm{b}} \] denotes weights based on the cumulative returns.The resulting tracking error can be conveniently written as \[ \textm{TE}^\textm{time-varying} = \frac{1}{T}\left\|\bm{r}^\textm{b} - \tilde{\bm{X}}\w\right\|^2_2, \] where now \(\tilde{\bm{X}}\) contains the weighted returns \(\tilde{\bm{r}}_t = \bm{r}_t \odot \bm{\alpha}_{t-1}\) along the rows and \(\w\) denotes the initial portfolio \(\w_0\).
The code to compute \(\bm{\alpha}_{t}\) is straightforward:
Code
<- function(X, r) {
compute_alpha <- colCumProd(1 + X)
X_cum <- colCumProd(1 + r)
r_cum <- X_cum / as.vector(r_cum)
alpha <- lag(alpha)
alpha_lagged 1, ] <- 1
alpha_lagged[return(list(alpha = alpha,
alpha_lagged = alpha_lagged,
last_alpha_lagged = as.vector(last(alpha_lagged)),
X_tilde = X * alpha_lagged))
}
We now perform a backtest to assess the tracking error over time with \(K=20\) active assets under the approximation (or assumption) of a fixed portfolio \(\w\) and with the more accurate approximation of a time-varying portfolio, which shows improved results.
Code
library(pob)
library(sparseIndexTracking)
library(RcppRoll) # fast rolling means (roll_meanr)
data(SP500_2015to2020)
<- nrow(SP500_2015to2020$stocks) # ["::2020-02", ] excluded the COVID19 crisis
T <- ncol(SP500_2015to2020$stocks)
N <- 2*252
T_trn
<- function(dataset, ...) {
tracking_using_plain_returns_wfixed <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r #spIndexTrack(X, r, lambda = 7e-7)
sparse_tracking_via_MM(X, r, lambda = 5e-7, max_iter = 10)
}
<- function(dataset, ...) {
tracking_using_plain_returns_wt <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r <- compute_alpha(X, r)
alpha
# design portfolio
#w0 <- spIndexTrack(X_cumweighted, r, lambda = 8e-7, p_neg_exp = 5, max_iter = 100) # either 8e-7 or 8.5e-7
<- sparse_tracking_via_MM(alpha$X_tilde, r, lambda = 5e-7, max_iter = 10)
w0 <- w0 * alpha$last_alpha_lagged
w return(w/sum(w))
}
# perform single backtest
<- portfolioBacktest(list("Assuming fixed portfolio" = tracking_using_plain_returns_wfixed,
bt "Assuming time-varying portfolio" = tracking_using_plain_returns_wt),
dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
"index" = SP500_2015to2020$index)),
benchmarks = c("index"),
lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
Backtesting 2 portfolios over 1 datasets (periodicity = daily data)...
Backtesting benchmarks...
Code
# make sure there was no error in the backtest
listPortfoliosWithFailures(bt)
<- length(bt)
num_pf <- bt[c(num_pf, 1:(num_pf-1))] # move SP500 to the first position
bt <- lapply(bt, function(pf) {pf$dataset1$X_lin <- NULL; return(pf)}) # reduce object size bt
We can then plot the tracking error and cardinality over time:
Code
# # plot wealth
# B0 <- as.numeric(SP500_2015to2020$index[T_trn, ])
# price_all <- B0 * list2xts(lapply(bt, function(bt_single) bt_single$dataset1$wealth), names = names(bt))
# ggplot(fortify(price_all, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
# geom_line(linewidth = 1) +
# scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
# theme(legend.title = element_text()) + labs(color = "Portfolio") +
# labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")
#
# plot smoothed TE
#
<- list2xts(lapply(bt, function(bt_single) bt_single$dataset1$return), names = names(bt))
ret_all <- (ret_all[, -1] - as.vector(ret_all[, 1]))^2
TE_inst <- xts(roll_meanr(as.matrix(TE_inst), n = 252, fill = NA), index(TE_inst))
TE_smooth <- na.omit(TE_smooth)
TE_smooth
ggplot(fortify(TE_smooth, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
geom_line(linewidth = 1) +
theme(legend.title = element_blank()) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
labs(title = "Tracking error over time", x= NULL, y = "TE")
Code
#
# plot cardinality
#
<- list2xts(lapply(bt[-1], function(bt_single) xts(rowSums(bt_single$dataset1$w_bop > 1e-5), index(bt_single$dataset1$w_bop))), names = names(bt[-1]))
cardinality <- cardinality[index(TE_smooth), ]
cardinality ggplot(fortify(cardinality, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
geom_line(linewidth = 1) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_blank()) +
labs(title = "Cardinality over time", x= NULL, y = "cardinality")
Code
t(rbind("TE" = colMeans(TE_inst),
"cardinality" = colMeans(cardinality)))
TE cardinality
Assuming fixed portfolio 4.626860e-06 20.92993
Assuming time-varying portfolio 3.712547e-06 21.76642
Plain versus cumulative returns
Minimizing errors in period-by-period returns does not guarantee better long-term cumulative return tracking, as return errors can accumulate unevenly over time. To control long-term cumulative return deviations, the error measure should use long-term or cumulative returns. However, for short-term hedging, period-by-period return tracking is essential, as outperformance is not the objective.
The price error can be measured by the tracking error in terms of the cumulative returns as \[ \textm{TE}^\textm{cum} = \frac{1}{T}\left\|\bm{r}^\textm{b,cum} - \bm{X}^\textm{cum}\w\right\|^2_2, \] where \(\bm{X}^\textm{cum}\) (similarly \(\bm{r}^\textm{b,cum}\)) denotes the cumulative returns, whose rows can be obtained as \[ \bm{r}^\textm{cum}_t \approx \sum_{t'=1}^{t}\bm{r}_{t'}. \]
We now perform a backtest to assess the tracking error over time using plain returns and cumulative returns:
Code
library(pob)
library(sparseIndexTracking)
library(RcppRoll) # fast rolling means
data(SP500_2015to2020)
<- nrow(SP500_2015to2020$stocks) # ["::2020-02", ] exclused the COVID19 crisis
T <- ncol(SP500_2015to2020$stocks)
N <- 2*252
T_trn
<- function(dataset, ...) {
tracking_using_plain_returns_wfixed <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r sparse_tracking_via_MM(X, r, lambda = 5e-7, max_iter = 10)
}
<- function(dataset, ...) {
tracking_using_cumreturns_wfixed <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r sparse_tracking_via_MM(X = colCumSum(X),
r = colCumSum(r),
lambda = 20e-7, max_iter = 10)
}
<- function(dataset, ...) {
tracking_using_cumreturns_wt <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r <- compute_alpha(X, r)
alpha <- sparse_tracking_via_MM(X = colCumSum(alpha$X_tilde),
w0 r = colCumSum(r), lambda = 15e-7, max_iter = 10)
<- w0 * alpha$last_alpha_lagged
w return(w/sum(w))
}
# tracking_using_cumprodreturns_wfixed <- function(dataset, ...) {
# X <- dataset$prices/lag(dataset$prices)[-1] - 1
# r <- dataset$index/lag(dataset$index)[-1] - 1
# # sparse_tracking_via_MM(X = colCumProd(1 + X),
# # r = colCumProd(1 + r),
# # lambda = 20e-7, max_iter = 10)
# sparse_tracking_via_MM(X = exp(colCumSum(X)),
# r = exp(colCumSum(r)),
# lambda = 20e-7, max_iter = 10)
# }
# perform single backtest
<- portfolioBacktest(list("Tracking using plain returns" = tracking_using_plain_returns_wfixed,
bt "Tracking using cumulative returns" = tracking_using_cumreturns_wfixed),
dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
"index" = SP500_2015to2020$index)),
benchmarks = c("index"),
lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
Backtesting 2 portfolios over 1 datasets (periodicity = daily data)...
Backtesting benchmarks...
Code
# make sure there was no error in the backtest
listPortfoliosWithFailures(bt)
<- length(bt)
num_pf <- bt[c(num_pf, 1:(num_pf-1))] # move SP500 to the first position
bt <- lapply(bt, function(pf) {pf$dataset1$X_lin <- NULL; return(pf)}) # reduce object size bt
We can then plot the price tracked over time:
Code
#
# plot wealth
#
<- as.numeric(SP500_2015to2020$index[T_trn, ])
B0 <- B0 * list2xts(lapply(bt, function(bt_single) bt_single$dataset1$wealth), names = names(bt))
price_all
ggplot(fortify(price_all, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
geom_line(linewidth = 1) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_text()) + labs(color = "Portfolio") +
labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")
Convergence of the iterative reweighted \(\ell_1\)-norm method
The convergence of the iterative reweighted \(\ell_1\)-norm method (based on the MM framework) for sparse index tracking is very fast:
Code
library(pob)
data(SP500_2015to2020)
<- sparse_tracking_via_MM(X = SP500_2015to2020$stocks/lag(SP500_2015to2020$stocks)[-1] - 1,
obj_value r = SP500_2015to2020$index/lag(SP500_2015to2020$index)[-1] - 1,
lambda = 5e-7, max_iter = 10, verbose = TRUE)$obj_value
# plot
<- length(obj_value) - 1
last_iter data.frame("k" = 0:last_iter,
"objective value" = obj_value,
check.names = FALSE) |>
ggplot(aes(x = k, y = `objective value`)) +
geom_line(linewidth = 1.2, color = "blue") +
scale_x_continuous(breaks = 0:last_iter) +
labs(title = "Convergence over iterations", x= "k", y = "")
Comparison of algorithms
We now compare the tracking of the S&P 500 index based on different tracking methods, namely:
- the naive two-step approach with proportional weights to the index definition;
- the two-step approach with refitted weights;
- the iterative reweighted \(\ell_1\)-norm method; and
- the MIP formulation (although the computational complexity is too high).
We now perform a backtest to assess the tracking error over time of the S&P 500 index with \(K=20\) active assets for different algorithms. The tracking portfolios are computed on a rolling window basis with a lookback period of two years and recomputed every six months.
Code
library(CVXR)
library(pob)
library(sparseIndexTracking)
library(RcppRoll) # fast rolling means
data(SP500_2015to2020)
<- nrow(SP500_2015to2020$stocks)
T <- ncol(SP500_2015to2020$stocks)
N <- 2*252
T_trn
<- function(dataset, ...) {
full_tracking <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r <- ncol(X)
N # dense portfolio
<- Variable(N)
b <- Problem(Minimize(sum((as.matrix(r) - as.matrix(X) %*% b)^2)),
prob constraints = list(b >= 0, sum(b)==1))
<- solve(prob)
result <- as.vector(result$getValue(b))
b return(b)
}
<- function(dataset, ...) {
two_stage_naive_tracking <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r <- ncol(X)
N # first, get the dense portfolio as the reference
<- Variable(N)
b <- Problem(Minimize(sum((as.matrix(r) - as.matrix(X) %*% b)^2)),
prob constraints = list(b >= 0, sum(b)==1))
<- solve(prob)
result <- as.vector(result$getValue(b))
b # then, keep the K strongest
<- sort(b, decreasing = TRUE, index.return = TRUE)$ix[1:K]
idx_Klargest <- rep(0, N)
w <- b[idx_Klargest]
w[idx_Klargest] <- w/sum(w)
w return(w)
}
<- function(dataset, ...) {
two_stage_refitted_tracking <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r <- ncol(X)
N # first, get the dense portfolio as the reference
<- Variable(N)
b <- Problem(Minimize(sum((as.matrix(r) - as.matrix(X) %*% b)^2)),
prob constraints = list(b >= 0, sum(b)==1))
<- solve(prob)
result <- as.vector(result$getValue(b))
b # then, keep the K strongest but reoptimizing weights
<- sort(b, decreasing = TRUE, index.return = TRUE)$ix[1:K]
idx_Klargest <- Variable(K)
wK <- Problem(Minimize(sum((as.matrix(r) - as.matrix(X[, idx_Klargest]) %*% wK)^2)),
prob constraints = list(wK >= 0, sum(wK)==1))
<- solve(prob)
result <- rep(0, N)
w <- as.vector(result$getValue(wK))
w[idx_Klargest] return(w)
}
<- function(dataset, ...) {
iterative_reweighted_l1_norm_tracking <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r sparse_tracking_via_MM(X, r, lambda = lmd, max_iter = 10)
}
# perform single backtest
<- 20
K <- 5e-7
lmd <- portfolioBacktest(list("Full tracking" = full_tracking,
bt "Two-step design with proportional weights" = two_stage_naive_tracking,
"Two-step design with refitted weights" = two_stage_refitted_tracking,
"Iterative reweighted $l_1$-norm method" = iterative_reweighted_l1_norm_tracking),
dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
"index" = SP500_2015to2020$index)),
benchmarks = c("index"),
lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
Backtesting 4 portfolios over 1 datasets (periodicity = daily data)...
Backtesting benchmarks...
Code
# make sure there was no error in the backtest
listPortfoliosWithFailures(bt)
<- length(bt)
num_pf <- bt[c(num_pf, 1:(num_pf-1))] # move SP500 to the first position
bt <- lapply(bt, function(pf) {pf$dataset1$X_lin <- NULL; return(pf)}) # reduce object size bt
We can then plot the price, tracking error, and cardinality over time:
Code
#
# plot wealth
#
<- as.numeric(SP500_2015to2020$index[T_trn, ])
B0 <- B0 * list2xts(lapply(bt, function(bt_single) bt_single$dataset1$wealth), names = names(bt))
price_all ggplot(fortify(price_all, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
geom_line(linewidth = 1) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_text()) + labs(color = "Portfolio") +
labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")
Code
#
# plot smoothed TE
#
<- list2xts(lapply(bt, function(bt_single) bt_single$dataset1$return), names = names(bt))
ret_all <- (ret_all[, -1] - as.vector(ret_all[, 1]))^2
TE_inst <- xts(roll_meanr(as.matrix(TE_inst), n = 252, fill = NA), index(TE_inst))
TE_smooth <- na.omit(TE_smooth)
TE_smooth <- TE_smooth[, -1] # remove the full tracking
TE_smooth ggplot(fortify(TE_smooth, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
geom_line(linewidth = 1) +
theme(legend.title = element_blank()) + #labs(color = "Tracking methods") +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
labs(title = "Tracking error over time", x= NULL, y = "TE")
Code
#
# plot cardinality
#
<- list2xts(lapply(bt[-1], function(bt_single) xts(rowSums(bt_single$dataset1$w_bop > 1e-5), index(bt_single$dataset1$w_bop))), names = names(bt[-1]))
cardinality <- cardinality[index(TE_smooth), ]
cardinality <- cardinality[, -1] # remove the full tracking
cardinality ggplot(fortify(cardinality, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
geom_line(linewidth = 1) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_blank()) +
labs(title = "Cardinality over time", x= NULL, y = "cardinality")
Code
t(rbind("TE" = colMeans(TE_inst),
"cardinality" = colMeans(cardinality)))
TE cardinality
Full tracking 6.620929e-07 20.00000
Two-step design with proportional weights 7.717688e-06 19.63066
Two-step design with refitted weights 5.731222e-06 20.92993
Iterative reweighted $l_1$-norm method 4.626860e-06 20.00000
Comparison of formulations
We consider again the comparison between assuming a fixed portfolio and a time-varying one in the definition of tracking error. More specifically, we now study the tradeoff of tracking error versus cardinality. Indeed, the assumption of the time-varying portfolio is more accurate.
Code
library(pob)
library(sparseIndexTracking)
library(RcppRoll) # fast rolling means
data(SP500_2015to2020)
<- 2*252
T_trn
<- function(dataset, ...) {
tracking_using_plain_returns_wfixed <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r sparse_tracking_via_MM(X, r, lambda = lmd, max_iter = 10)
}
<- function(dataset, ...) {
tracking_using_plain_returns_wt <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r <- compute_alpha(X, r)
alpha
# design portfolio
<- sparse_tracking_via_MM(alpha$X_tilde, r, lambda = lmd, max_iter = 10)
w0 <- w0 * alpha$last_alpha_lagged
w return(w/sum(w))
}
#
# Sweeping loop
#
<- 1e-5 * c(1, 2, 5, 10, 20, 50, 100, 200, 1000)/300
lmd_sweep <- data.frame()
df for (lmd in lmd_sweep) {
<- portfolioBacktest(list("Assuming fixed portfolio" = tracking_using_plain_returns_wfixed,
bt "Assuming time-varying portfolio" = tracking_using_plain_returns_wt),
dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
"index" = SP500_2015to2020$index)),
benchmarks = c("index"),
lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
listPortfoliosWithFailures(bt)
<- length(bt)
num_pf <- bt[c(num_pf, 1:(num_pf-1))] # move index to the first position
bt
# store TE and cardinality
<- list2xts(lapply(bt, function(bt_single) bt_single$dataset1$return), names = names(bt))
ret_all <- colMeans((ret_all[, -1] - as.vector(ret_all[, 1]))^2)
TE_all <- colMeans(list2xts(lapply(bt[-1], function(bt_single)
cardinality_all xts(rowSums(bt_single$dataset1$w_bop > 1e-5), index(bt_single$dataset1$w_bop))), names = names(bt[-1])))
<- rbind(df, data.frame("method" = names(TE_all),
df "TE" = TE_all,
"K" = cardinality_all))
}rownames(df) <- NULL
# plot
$method <- factor(df$method, levels = unique(df$method))
dfggplot(df, aes(x = K, y = TE, color = method, linetype = method)) +
geom_line(linewidth = 1.2) +
theme(legend.title = element_blank()) +
labs(title = "Tracking error versus sparsity level", x= "K (active assets)", y = NULL)
Enhanced index tracking
We now compare the following tracking error (TE) measures to track the S&P 500 index:
basic TE: \[ \textm{TE}(\w) = \frac{1}{T}\left\|\bm{r}^\textm{b} - \bm{X}\w\right\|^2_2; \]
robust Huberized TE: \[ \textm{Hub-TE}(\w) = \frac{1}{T}\sum_{t=1}^T \phi^{\textm{hub}}(r_t^\textm{b} - \bm{X}_{t,:}\w), \] where \[\phi^{\textm{hub}}(x)=\begin{cases} x^{2}, & \quad|x|\leq M\\ M\left(2|x| - M\right), & \quad|x|>M; \end{cases}\]
\(\ell_1\)-norm TE: \[ \textm{TE}_1(\w) = \frac{1}{T}\left\|\bm{r}^\textm{b} - \bm{X}\w\right\|_1; \]
downside risk (DR) error measure: \[ \textm{DR}(\w) = \frac{1}{T}\left\|\left(\bm{r}^\textm{b} - \bm{X}\w\right)^+\right\|^2_2, \] where \((\cdot)^+\triangleq\textm{max}(0,\cdot)\).
We will use the R package spIndexTrack
to obtain the designs for the backtest:
Code
library(pob)
library(sparseIndexTracking)
data(SP500_2015to2020)
<- nrow(SP500_2015to2020$stocks)
T <- ncol(SP500_2015to2020$stocks)
N <- 2*252
T_trn
<- function(dataset, ...) {
TE_index_tracking <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r
#w <- spIndexTrack(X, r, lambda = 6e-7, u = 0.5)
<- spIndexTrack(colCumSum(X), colCumSum(r), lambda = 150*6e-7, u = 0.5) #100*6e-7 -
w return(w)
}
<- function(dataset, ...) {
DR_index_tracking <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r
#w <- spIndexTrack(X, r, lambda = 1.8e-7, u = 0.5, measure = "dr")
<- spIndexTrack(colCumSum(X), colCumSum(r), lambda = 190*1.8e-7, u = 0.5, measure = "dr") #200*1.8e-7 - 250*1.8e-7
w return(w)
}
<- function(dataset, ...) {
TE1_index_tracking <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r
#w_TE1 <- spIndexTrack(X, r, lambda = 6e-7, u = 0.5) # initial point
<- spIndexTrack(colCumSum(X), colCumSum(r), lambda = 100*6e-7, u = 0.5)
w_TE1 for (k in 1:5) {
<- as.vector(sqrt(1/(2*abs(r - X %*% w_TE1))))
alpha #w_TE1 <- spIndexTrack(alpha*X, alpha*r, lambda = 6.5e-4, u = 0.5) # 6 -- 7
<- spIndexTrack(colCumSum(alpha*X), colCumSum(alpha*r), lambda = 150*6.5e-4, u = 0.5) #100*6.5e-4 -
w_TE1
}return(w_TE1)
}
<- function(dataset, ...) {
HubTE_index_tracking <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r
#w <- spIndexTrack(X, r, lambda = 5e-8, u = 0.5, measure = "hete", hub = 1e-4)
<- spIndexTrack(colCumSum(X), colCumSum(r), lambda = 50*5e-8, u = 0.5, measure = "hete", hub = 1e-4) # 50*5e-8 - 70*5e-8
w return(w)
}
# perform single backtest
<- portfolioBacktest(list("TE based design" = TE_index_tracking,
bt "Hub-TE based design" = HubTE_index_tracking,
"TE_1 based design" = TE1_index_tracking,
"DR based design" = DR_index_tracking),
dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
"index" = SP500_2015to2020$index)),
benchmarks = c("index"),
lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
Backtesting 4 portfolios over 1 datasets (periodicity = daily data)...
Backtesting benchmarks...
Code
listPortfoliosWithFailures(bt)
<- bt[c(length(bt), 1:(length(bt)-1))] # move SP500 to the first position
bt <- lapply(bt, function(pf) {pf$dataset1$X_lin <- NULL; return(pf)}) # reduce object size
bt
#
# plot wealth
#
<- as.numeric(SP500_2015to2020$index[T_trn, ])
B0 <- B0 * list2xts(lapply(bt, function(bt_single) bt_single$dataset1$wealth), names = names(bt))
price_all ggplot(fortify(price_all, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
geom_line(linewidth = 1) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_text()) + labs(color = "Portfolio") +
labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")
Automatic sparsity control
The sparse index tracking formulation includes a regularization term \(\lambda \|\w\|_0\) in the objective or as a constraint \(\|\w\|_0 \le k\), where the hyper-parameter \(\lambda\) or \(k\) controls the sparsity level. Adjusting this hyper-parameter allows achieving different points on the error versus sparsity trade-off curve.
In practice, the goal is to choose a proper operating point on this trade-off curve without computing the entire curve. The false discovery rate (FDR), which is the probability of wrongly including a variable, is a key quantity for this decision. Controlling the FDR is the ideal way to decide whether to include a variable.
The T-Rex method, based on “dummy” variables, provides a practical approach for FDR control in sparse index tracking. Instead of tuning the sparsity hyper-parameter, T-Rex automatically selects the assets to include by controlling the FDR, avoiding the selection of irrelevant assets that are highly correlated with already selected ones. The T-Rex method is efficientlyu implemented in the R package TRexSelector
(and soon to come in Python) (Machkour et al., 2022).
We now backest the tracking portfolios obtained from the sparse penalized regression formulation with the FDR-controlling T-Rex method (Machkour et al., 2024). The tracking portfolios are computed on a rolling window basis with a lookback period of two years and recomputed every six months.
Code
library(xts)
library(pob)
library(sparseIndexTracking)
library(RcppRoll) # fast rolling means
library(quadprog)
data(SP500_2015to2020)
<- nrow(SP500_2015to2020$stocks)
T <- ncol(SP500_2015to2020$stocks)
N <- 2*252
T_trn
#
# Define different tracking methods
#
<- function(dataset, ...) {
iterative_reweighted_l1_norm_tracking_lmd1e_8 <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r spIndexTrack(X, r, lambda = 1e-8, u = 0.5)
#spIndexTrack(colCumSum(X), colCumSum(r), lambda = 150*1e-8, u = 0.5)
}
<- function(dataset, ...) {
iterative_reweighted_l1_norm_tracking_lmd1e_7 <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r spIndexTrack(X, r, lambda = 1e-7, u = 0.5)
#spIndexTrack(colCumSum(X), colCumSum(r), lambda = 150*1e-7, u = 0.5)
}
<- function(dataset, ...) {
iterative_reweighted_l1_norm_tracking_lmd1e_6 <- dataset$prices/lag(dataset$prices)[-1] - 1
X <- dataset$index/lag(dataset$index)[-1] - 1
r spIndexTrack(X, r, lambda = 1e-6, u = 0.5)
#spIndexTrack(colCumSum(X), colCumSum(r), lambda = 150*1e-6, u = 0.5)
}
<- function(dataset, ...) {
TRex_tracking <- as.matrix(dataset$prices/lag(dataset$prices)[-1] - 1)
X <- as.vector(dataset$index/lag(dataset$index)[-1] - 1)
r <- ncol(X)
N <- apply(X, 2, function(x) cumsum(x))
X_cum <- cumsum(r)
r_cum
# Step 1: T-Rex
<- TRexSelector::trex(X = tail(X, 120), y = tail(r, 120),
trex_selected #trex_selected <- TRexSelector::trex(X = X_cum, y = r_cum,
tFDR = 0.3, method = "trex+DA+NN",
verbose = FALSE)$selected_var
<- which(trex_selected == 1)
idx_trex <- length(idx_trex)
K # Step 2: Refit selected assets
<- solve.QP(Dmat = 2 * t(X[, idx_trex]) %*% X[, idx_trex],
wK dvec = 2 * t(X[, idx_trex]) %*% r,
Amat = cbind(rep(1, K), diag(K)), bvec = c(1, rep(0, K)), meq = 1)$solution
<- rep(0, N)
w <- wK
w[idx_trex] return(w)
}
Code
# perform single backtest
<- portfolioBacktest(
bt list("Sparse penalized regression (lambda = 1e-8)" = iterative_reweighted_l1_norm_tracking_lmd1e_8,
"Sparse penalized regression (lambda = 1e-7)" = iterative_reweighted_l1_norm_tracking_lmd1e_7,
"Sparse penalized regression (lambda = 1e-6)" = iterative_reweighted_l1_norm_tracking_lmd1e_6,
"T-Rex" = TRex_tracking),
dataset_list = list("dataset1" = list("prices" = SP500_2015to2020$stocks,
"index" = SP500_2015to2020$index)),
benchmarks = c("index"),
lookback = T_trn, optimize_every = 6*21, rebalance_every = 6*21)
listPortfoliosWithFailures(bt)
<- bt[c(length(bt), 1:(length(bt)-1))] # move SP500 to the first position
bt <- lapply(bt, function(pf) {pf$dataset1$X_lin <- NULL; return(pf)}) # reduce object size bt
We can then plot the price, tracking error, and cardinality over time. As can be seen, the traditional sparse regression formulation is very sensitive to the choice of the parameter \(\lambda\), producing very different results in terms of tracking error and cardinality. On the other hand, the FDR-controlling T-Rex method automatically chooses the appropriate sparsity level without having to tune any parameter. The computational cost of T-Rex is slightly higher than that of the traditional method for a fixed \(\lambda\) but lower than the traditional method for a whole range of values for \(\lambda\).
Code
#
# plot wealth
#
<- as.numeric(SP500_2015to2020$index[T_trn, ])
B0 <- B0 * list2xts(lapply(bt, function(bt_single) bt_single$dataset1$wealth), names = names(bt))
price_all ggplot(fortify(price_all, melt = TRUE), aes(x = Index, y = Value, col = Series)) +
geom_line(linewidth = 1) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_text()) + labs(color = "Portfolio") +
labs(title = "Tracking of S&P 500 index", x= NULL, y = "dollars")
Code
#
# plot smoothed cumulative TE
#
<- ((price_all[, -1] - as.vector(price_all[, 1]))/B0)^2
cumTE_inst <- xts(roll_meanr(as.matrix(cumTE_inst), n = 21, fill = NA), index(cumTE_inst))
cumTE_smooth <- na.omit(cumTE_smooth)
cumTE_smooth ggplot(fortify(cumTE_smooth, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
geom_line(linewidth = 1) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_blank()) +
labs(title = "Tracking error (cum-returns) over time", x= NULL, y = "TE (cum)")
Code
#
# plot smoothed period-by-period TE
#
<- list2xts(lapply(bt, function(bt_single) bt_single$dataset1$return), names = names(bt))
ret_all <- (ret_all[, -1] - as.vector(ret_all[, 1]))^2
TE_inst <- xts(roll_meanr(as.matrix(TE_inst), n = 252, fill = NA), index(TE_inst))
TE_smooth <- na.omit(TE_smooth)
TE_smooth ggplot(fortify(TE_smooth, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
geom_line(linewidth = 1) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_blank()) +
labs(title = "Tracking error (plain-returns) over time", x= NULL, y = "TE")
Code
#
# plot cardinality
#
<- list2xts(lapply(bt[-1], function(bt_single) xts(rowSums(bt_single$dataset1$w_bop > 1e-5), index(bt_single$dataset1$w_bop))), names = names(bt[-1]))
cardinality <- cardinality[index(TE_smooth), ]
cardinality ggplot(fortify(cardinality, melt = TRUE), aes(x = Index, y = Value, color = Series)) +
geom_line(linewidth = 1) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_blank()) +
labs(title = "Cardinality over time", x= NULL, y = "cardinality")
Code
t(rbind("TE" = colMeans(TE_inst),
"cardinality" = colMeans(cardinality)))
TE cardinality
Sparse penalized regression (lambda = 1e-8) 9.195354e-07 114.95474
Sparse penalized regression (lambda = 1e-7) 1.712832e-06 47.84088
Sparse penalized regression (lambda = 1e-6) 6.203776e-06 15.02774
T-Rex 3.638299e-06 52.80146