Portfolio Optimization
Chapter 14: Robust Portfolios
R code
R code examples for Chapter 14 of the book:
Daniel P. Palomar (2024). Portfolio Optimization: Theory and Application. Cambridge University Press.
Loading packages
First load the packages used in the examples:
Code
# basic finance
library(xts)                  # to manipulate time series of stock data
library(portfolioBacktest)    # to conduct backtests
library(pob)                  # book package with financial data
library(quantmod)             # to download market data from the Internet
library(mvtnorm)              # to generate synthetic data
# plotting
library(ggplot2)              # for nice plots
library(reshape2)             # to reshape data
library(ggridges)             # to plot geom_density_ridges
# optimization
#library(quadprog)
library(CVXR)
Introduction
Markowitz’s mean-variance portfolio optimization balances expected return \(\w^\T\bmu\) against risk, measured by variance \(\w^\T\bSigma\w\): \[ \begin{array}{ll} \underset{\w}{\text{maximize}} & \w^\T\bmu - \frac{\lambda}{2}\w^\T\bSigma\w\\ \text{subject to} & \w \in \mathcal{W}, \end{array} \] where \(\bmu\) and \(\bSigma\) are parameters, \(\lambda\) controls risk aversion, and \(\mathcal{W}\) represents constraints like \(\{\w \mid \bm{1}^\T\w=1, \w\ge\bm{0} \}\).
In practice, \(\bmu\) and \(\bSigma\) are estimated from historical data, leading to estimation errors due to limited data and non-stationarity. This noise, especially in \(\hat{\bmu}\), results in erratic portfolio designs, a problem known as the “Markowitz optimization enigma.” Michaud highlighted that portfolio optimization often maximizes estimation errors, rendering “optimal” portfolios financially meaningless.
The next figure shows the sensitivity of the mean-variance portfolio to estimation errors, demonstrating its impracticality due to erratic behavior.
Code
library(pob)
library(CVXR)
library(mvtnorm)
data(SP500_2015to2020)
stock_prices <- SP500_2015to2020$stocks["2019::", c("AAPL", "AMZN", "AMD", "GM", "GOOGL", "MGM", "MSFT", "QCOM", "TSCO", "UPS")]

design_MVP <- function(mu, Sigma, lambda = 1, ub = Inf) {
  w <- Variable(nrow(Sigma))
  prob <- Problem(Maximize(t(mu) %*% w - (lambda/2)*quad_form(w, Sigma)),
                  constraints = list(w >= 0, sum(w) == 1, w <= ub))
  result <- solve(prob)
  w <- as.vector(result$getValue(w))
  names(w) <- colnames(Sigma)
  return(w)
}

MVP_noisy <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  mu <- colMeans(X)
  Sigma <- cov(X)

  # noisy version
  X_noisy <- rmvnorm(n = nrow(X), mean = mu, sigma = Sigma)
  mu_noisy <- colMeans(X_noisy)
  Sigma_noisy <- cov(X_noisy)

  design_MVP(mu = mu_noisy, Sigma = Sigma_noisy, lambda = 1)
}

# backtests
set.seed(42)
bt <- portfolioBacktest(list("#1" = MVP_noisy,
                             "#2" = MVP_noisy,
                             "#3" = MVP_noisy,
                             "#4" = MVP_noisy,
                             "#5" = MVP_noisy,
                             "#6" = MVP_noisy),
                        list(list("prices" = stock_prices)), price_name = "prices",
                        lookback = 6*21, optimize_every = 10000, rebalance_every = 10000)
Backtesting 6 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt)
data.frame(
"stocks" = names(stock_prices),
"#1" = as.numeric(bt$`#1`$data1$w_optimized[1, ]),
"#2" = as.numeric(bt$`#2`$data1$w_optimized[1, ]),
"#3" = as.numeric(bt$`#3`$data1$w_optimized[1, ]),
"#4" = as.numeric(bt$`#4`$data1$w_optimized[1, ]),
"#5" = as.numeric(bt$`#5`$data1$w_optimized[1, ]),
"#6" = as.numeric(bt$`#6`$data1$w_optimized[1, ]),
check.names = FALSE) |>
melt(id.vars = "stocks") |>
ggplot(aes(x = stocks, y = value, fill = variable)) +
geom_bar(stat = "identity", position = "dodge", color = "black", width = 0.8) +
theme(legend.title = element_text()) + labs(fill = "realization") +
labs(title = "Portfolio for different estimation errors", y = "weight")
Robust worst-case portfolio optimization
To study the uncertainty in \(\bmu\), we consider the global maximum return portfolio (GMRP) for an estimated mean vector \(\hat{\bmu}\): \[ \begin{array}{ll} \underset{\w}{\textm{maximize}} & \w^\T\hat{\bmu}\\ \textm{subject to} & \bm{1}^\T\w=1, \quad \w\ge\bm{0}, \end{array} \] which is known to be terribly sensitive to estimation errors.
We will instead use the worst-case formulation: \[ \begin{array}{ll} \underset{\w}{\textm{maximize}} & \textm{inf}_{\bmu \in \mathcal{U}_{\bmu}} \; \w^\T\bmu\\ \textm{subject to} & \bm{1}^\T\w=1, \quad \w\ge\bm{0}, \end{array} \] where typical choices for the uncertainty region \(\mathcal{U}_{\bmu}\) are the ellipsoidal and box sets considered next.
Ellipsoidal uncertainty set
The ellipsoidal uncertainty set for \(\bmu\) is \[ \mathcal{U}_{\bmu} = \left\{\bmu = \hat{\bmu} + \kappa \bm{S}^{1/2}\bm{u} \mid \|\bm{u}\|_2 \le 1\right\}, \] where \(\bm{S}^{1/2}\) is the symmetric square-root matrix of the shape \(\bm{S}\), e.g., \(\bm{S}=(1/T)\bSigma\), and \(\kappa\) determines the size of the ellipsoid.
The corresponding worst-case value of \(\w^\T\bmu\) is \[ \begin{aligned} \underset{\bmu \in \mathcal{U}_{\bmu}}{\text{inf}} \; \w^\T\bmu &= \w^\T\hat{\bmu} - \kappa\; \|\bm{S}^{1/2}\w\|_2. \end{aligned} \]
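As a quick sanity check (not part of the book's code; all values below are illustrative placeholders), the closed-form worst-case value can be compared against a brute-force minimization over randomly sampled directions \(\bm{u}\) on the unit sphere:
Code
# numerical check of the ellipsoidal worst case (illustrative placeholder values)
set.seed(1)
N      <- 5
mu_hat <- rnorm(N, mean = 1e-3, sd = 2e-3)               # placeholder estimated mean
S      <- crossprod(matrix(rnorm(N*N), N, N)) * 1e-4     # placeholder shape matrix (positive definite)
kappa  <- 0.5
w      <- rep(1/N, N)                                    # equally weighted portfolio

# closed form: w'mu_hat - kappa*||S^{1/2} w||_2 = w'mu_hat - kappa*sqrt(w' S w)
closed_form <- sum(w * mu_hat) - kappa * sqrt(drop(t(w) %*% S %*% w))

# brute force: evaluate w'(mu_hat + kappa * t(R) %*% u) over random unit-norm u, with t(R) %*% R = S
R <- chol(S)
v <- drop(R %*% w)                                       # so that the objective is w'mu_hat + kappa * v'u
U <- matrix(rnorm(N * 1e5), nrow = N)
U <- sweep(U, 2, sqrt(colSums(U^2)), "/")                # normalize each column to unit norm
brute_force <- min(sum(w * mu_hat) + kappa * drop(crossprod(v, U)))

c(closed_form = closed_form, brute_force = brute_force)  # should agree up to sampling error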
Box uncertainty set
The box uncertainty set for \(\bmu\) is \[ \mathcal{U}_{\bmu} = \left\{\bmu \mid -\bm{\delta} \le \bmu - \hat{\bmu} \le \bm{\delta}\right\}, \] where \(\bm{\delta}\) is the half-width of the box in all dimensions.
The corresponding worst-case value of \(\w^\T\bmu\) is \[ \begin{aligned} \underset{\bmu \in \mathcal{U}_{\bmu}}{\text{inf}} \; \w^\T\bmu &= \w^\T\hat{\bmu} - |\w|^\T\bm{\delta}. \end{aligned} \]
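A similar check applies to the box worst case; since the minimum of a linear function over a box is attained at a corner, it suffices to enumerate the \(2^N\) sign patterns (again an illustrative sketch with placeholder values, not the book's code):
Code
# numerical check of the box worst case (illustrative placeholder values)
set.seed(1)
N      <- 5
mu_hat <- rnorm(N, mean = 1e-3, sd = 2e-3)               # placeholder estimated mean
delta  <- abs(rnorm(N, sd = 1e-3))                       # placeholder half-widths of the box
w      <- c(0.4, -0.1, 0.3, 0.2, 0.2)                    # weights (a negative one to exercise |w|)

# closed form: w'mu_hat - |w|' delta
closed_form <- sum(w * mu_hat) - sum(abs(w) * delta)

# brute force over the 2^N corners of the box
signs <- as.matrix(expand.grid(rep(list(c(-1, 1)), N)))  # all sign patterns
brute_force <- sum(w * mu_hat) + min(signs %*% (w * delta))

c(closed_form = closed_form, brute_force = brute_force)  # should coincide exactly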
Numerical experiments
Sensitivity
First, let’s write the code for the robust mean–variance formulations with ellipsoidal and box uncertainty sets for the mean vector:
Code
library(CVXR)
design_MVP_robust_mu_box <- function(mu_hat, delta, Sigma_hat, lambda = 1) {
  w <- Variable(length(mu_hat))
  prob <- Problem(Maximize(t(w) %*% mu_hat - t(abs(w)) %*% delta - (lambda/2)*quad_form(w, Sigma_hat)),
                  constraints = list(w >= 0, sum(w) == 1))
  result <- solve(prob)
  return(as.vector(result$getValue(w)))
}

design_MVP_robust_mu_ellipsoid <- function(mu_hat, S_mu = Sigma_hat, kappa_mu = 0.1,
                                           Sigma_hat, lambda = 1) {
  S12 <- chol(S_mu)   # t(S12) %*% S12 = S_mu
  w <- Variable(length(mu_hat))
  prob <- Problem(Maximize(t(w) %*% mu_hat - kappa_mu*norm2(S12 %*% w)
                           - (lambda/2) * quad_form(w, Sigma_hat)),
                  constraints = list(w >= 0, sum(w) == 1))
  result <- solve(prob)
  return(as.vector(result$getValue(w)))
}
We can now observe the reduced sensitivity of the robust mean–variance portfolio under an ellipsoidal uncertainty set for \(\bm{\mu}\):
Code
library(pob)
library(mvtnorm)
data(SP500_2015to2020)
stock_prices <- SP500_2015to2020$stocks["2019::", c("AAPL", "AMZN", "AMD", "GM", "GOOGL", "MGM", "MSFT", "QCOM", "TSCO", "UPS")]

MVP_noisy_robust_mu_ellipsoid <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  mu <- colMeans(X)
  Sigma <- cov(X)

  # noisy version
  X_noisy <- rmvnorm(n = nrow(X), mean = mu, sigma = Sigma)
  mu_noisy <- colMeans(X_noisy)
  Sigma_noisy <- cov(X_noisy)

  # robust design
  design_MVP_robust_mu_ellipsoid(mu_hat = mu_noisy, kappa_mu = 0.5,
                                 Sigma_hat = Sigma_noisy,
                                 lambda = 1)
}

# backtests
set.seed(42)
bt <- portfolioBacktest(list("#1" = MVP_noisy_robust_mu_ellipsoid,
                             "#2" = MVP_noisy_robust_mu_ellipsoid,
                             "#3" = MVP_noisy_robust_mu_ellipsoid,
                             "#4" = MVP_noisy_robust_mu_ellipsoid,
                             "#5" = MVP_noisy_robust_mu_ellipsoid,
                             "#6" = MVP_noisy_robust_mu_ellipsoid),
                        list(list("prices" = stock_prices)), price_name = "prices",
                        lookback = 6*21, optimize_every = 10000, rebalance_every = 10000)
Backtesting 6 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt)
data.frame(
"stocks" = names(stock_prices),
"#1" = as.numeric(bt$`#1`$data1$w_optimized[1, ]),
"#2" = as.numeric(bt$`#2`$data1$w_optimized[1, ]),
"#3" = as.numeric(bt$`#3`$data1$w_optimized[1, ]),
"#4" = as.numeric(bt$`#4`$data1$w_optimized[1, ]),
"#5" = as.numeric(bt$`#5`$data1$w_optimized[1, ]),
"#6" = as.numeric(bt$`#6`$data1$w_optimized[1, ]),
check.names = FALSE) |>
melt(id.vars = "stocks") |>
ggplot(aes(x = stocks, y = value, fill = variable)) +
geom_bar(stat = "identity", position = "dodge", color = "black", width = 0.8) +
theme(legend.title = element_text()) + labs(fill = "realization") +
labs(title = "Portfolio for different estimation errors", y = "weight")
Naive versus robust portfolios
We now compute the empirical distribution of the achieved mean–variance objective, as well as the Sharpe ratio, calculated over 1,000 Monte Carlo noisy observations. We can see that the robust designs avoid the worst-case realizations (left tail in the distributions) at the expense of not achieving the best-case realizations (right tail); thus, they are more stable and robust as expected.
Code
library(pob)
library(mvtnorm)
library(ggridges)
MVP_noisy <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  X_noisy <- rmvnorm(n = nrow(X), mean = colMeans(X), sigma = cov(X))
  design_MVP(mu = colMeans(X_noisy), Sigma = cov(X_noisy), lambda = 1)
}

MVP_noisy_robust_mu_box <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  X_noisy <- rmvnorm(n = nrow(X), mean = colMeans(X), sigma = cov(X))
  design_MVP_robust_mu_box(mu_hat = colMeans(X_noisy), delta = 0.2*sqrt(diag(cov(X_noisy))),
                           Sigma_hat = cov(X_noisy),
                           lambda = 1)
}

MVP_noisy_robust_mu_ellipsoid <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  X_noisy <- rmvnorm(n = nrow(X), mean = colMeans(X), sigma = cov(X))
  design_MVP_robust_mu_ellipsoid(mu_hat = colMeans(X_noisy), kappa_mu = 0.2,
                                 Sigma_hat = cov(X_noisy),
                                 lambda = 1)
}

# parameters
num_realiz_histogram <- 1000
N <- 50
lookback_months <- 12

set.seed(42)
stock_prices <- SP500_2015to2020$stocks["2019::", sample(471, N)]

# true mu and Sigma in the first training set
X_ <- diff(log(stock_prices[1:(lookback_months*21), ]))[-1]
mu_true <- colMeans(X_)
Sigma_true <- cov(X_)
w_clairvoyant <- design_MVP(mu = mu_true, Sigma = Sigma_true, lambda = 1)

# backtests for naive MVP
set.seed(42)
bt_naive <- portfolioBacktest(rep(list(MVP_noisy), num_realiz_histogram),
                              list(list("prices" = stock_prices)), price_name = "prices",
                              lookback = lookback_months*21, optimize_every = 10000, rebalance_every = 10000,
                              paral_portfolios = 6, show_progress_bar = TRUE,
                              return_returns = FALSE)
Backtesting 1000 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt_naive)
# backtests for robust MVP under box
set.seed(42)
bt_robust_box <- portfolioBacktest(rep(list(MVP_noisy_robust_mu_box), num_realiz_histogram),
                                   list(list("prices" = stock_prices)), price_name = "prices",
                                   lookback = lookback_months*21, optimize_every = 10000, rebalance_every = 10000,
                                   paral_portfolios = 6, show_progress_bar = TRUE,
                                   return_returns = FALSE)
Backtesting 1000 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt_robust_box)
# backtests for robust MVP under ellipsoid
set.seed(42)
bt_robust_ellipsoid <- portfolioBacktest(rep(list(MVP_noisy_robust_mu_ellipsoid), num_realiz_histogram),
                                         list(list("prices" = stock_prices)), price_name = "prices",
                                         lookback = lookback_months*21, optimize_every = 10000, rebalance_every = 10000,
                                         paral_portfolios = 6, show_progress_bar = TRUE,
                                         return_returns = FALSE)
Backtesting 1000 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt_robust_ellipsoid)
# evaluation
fun_mean_variance <- function(w) as.numeric(t(w) %*% mu_true - (1/2) * t(w) %*% Sigma_true %*% w)
fun_Sharpe_ratio <- function(w) as.numeric(t(w) %*% mu_true / sqrt(t(w) %*% Sigma_true %*% w))

mean_variance_clairvoyant <- fun_mean_variance(w_clairvoyant)
mean_variance_naive <- sapply(bt_naive, function(bt_realiz) fun_mean_variance(as.vector(bt_realiz$data1$w_optimized[1, ])))
mean_variance_robust_ellipsoid <- sapply(bt_robust_ellipsoid, function(bt_realiz) fun_mean_variance(as.vector(bt_realiz$data1$w_optimized[1, ])))
mean_variance_robust_box <- sapply(bt_robust_box, function(bt_realiz) fun_mean_variance(as.vector(bt_realiz$data1$w_optimized[1, ])))

SR_clairvoyant <- fun_Sharpe_ratio(w_clairvoyant)
SR_naive <- sapply(bt_naive, function(bt_realiz) fun_Sharpe_ratio(as.vector(bt_realiz$data1$w_optimized[1, ])))
SR_robust_ellipsoid <- sapply(bt_robust_ellipsoid, function(bt_realiz) fun_Sharpe_ratio(as.vector(bt_realiz$data1$w_optimized[1, ])))
SR_robust_box <- sapply(bt_robust_box, function(bt_realiz) fun_Sharpe_ratio(as.vector(bt_realiz$data1$w_optimized[1, ])))

df <- data.frame("method" = "Naive MVP",
                 "realization" = 1:num_realiz_histogram,
                 "mean-variance" = mean_variance_naive,
                 "Sharpe ratio" = SR_naive,
                 check.names = FALSE)
df <- rbind(df, data.frame("method" = "Robust MVP (box uncertainty set)",
                           "realization" = 1:num_realiz_histogram,
                           "mean-variance" = mean_variance_robust_box,
                           "Sharpe ratio" = SR_robust_box,
                           check.names = FALSE))
df <- rbind(df, data.frame("method" = "Robust MVP (ellipsoidal uncertainty set)",
                           "realization" = 1:num_realiz_histogram,
                           "mean-variance" = mean_variance_robust_ellipsoid,
                           "Sharpe ratio" = SR_robust_ellipsoid,
                           check.names = FALSE))
df$method <- factor(df$method, levels = unique(df$method))
Code
ggplot(df, aes(x = `mean-variance`, y = method, fill = method)) +
geom_density_ridges(show.legend = FALSE) +
scale_y_discrete(limits = rev(levels(df$method)), expand = c(0, 0.2, 0, 1.6)) +
theme(axis.text.y = element_text(colour = "black", size = 10)) +
xlim(c(-0.001, NA)) +
labs(title = "Empirical distribution of mean-variance value", y = NULL)
Code
ggplot(df, aes(x = `Sharpe ratio`, y = method, fill = method)) +
geom_density_ridges(show.legend = FALSE) +
scale_y_discrete(limits = rev(levels(df$method)), expand = c(0, 0.2, 0, 1.6)) +
theme(axis.text.y = element_text(colour = "black", size = 10)) +
xlim(c(-0.05, NA)) +
labs(title = "Empirical distribution of Sharpe ratio", y = NULL)
Backtests
Backtest of naive versus robust mean–variance portfolios:
Code
library(pob)
MVP_naive <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  design_MVP(mu = colMeans(X), Sigma = cov(X), lambda = 1)
}

MVP_robust_mu_box <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  Sigma <- cov(X)
  # robust design
  design_MVP_robust_mu_box(mu_hat = colMeans(X), delta = 0.2*sqrt(diag(Sigma)),
                           Sigma_hat = Sigma,
                           lambda = 1)
}

MVP_robust_mu_ellipsoid <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  design_MVP_robust_mu_ellipsoid(mu_hat = colMeans(X), kappa_mu = 0.2,
                                 Sigma_hat = cov(X),
                                 lambda = 1)
}

# single backtest
N <- 50
lookback_months <- 6
set.seed(42)
stock_prices <- SP500_2015to2020$stocks["2017::", sample(471, N)]

bt <- portfolioBacktest(list("Naive MVP" = MVP_naive,
                             "Robust MVP (box uncertainty set)" = MVP_robust_mu_box,
                             "Robust MVP (ellipsoidal uncertainty set)" = MVP_robust_mu_ellipsoid),
                        list(list("prices" = stock_prices)), price_name = "prices",
                        lookback = lookback_months*21, optimize_every = 21, rebalance_every = 1,
                        return_portfolio = FALSE)
Backtesting 3 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt)
bt <- lapply(bt, function(pf) {pf$data1$X_lin <- NULL; return(pf)})  # reduce object size
Code
backtestChartCumReturn(bt) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_blank()) +
ggtitle("Cumulative P&L")
Code
backtestChartDrawdown(bt) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_blank()) +
ggtitle("Drawdown")
To be more exhaustive, we now run multiple backtests of naive versus robust mean–variance portfolios:
Code
library(pob)
# multiple backtests
N <- 50
lookback_months <- 6
set.seed(42)
stock_prices_resampled <- financialDataResample(list("prices" = SP500_2015to2020$stocks),
                                                num_datasets = 200, N_sample = N, T_sample = 12*21)
200 datasets resampled (with N = 50 instruments and length T = 252) from the original data between 2015-01-05 and 2020-09-22.
Code
<- portfolioBacktest(list("Naive MVP" = MVP_naive,
bt "Robust MVP (box uncertainty set)" = MVP_robust_mu_box,
"Robust MVP (ellipsoidal uncertainty set)" = MVP_robust_mu_ellipsoid),
dataset_list = stock_prices_resampled,
lookback = lookback_months*21, optimize_every = 21, rebalance_every = 1,
return_portfolio = FALSE, return_returns = FALSE,
paral_datasets = 6, show_progress_bar = TRUE)
Backtesting 3 portfolios over 200 datasets (periodicity = daily data)...
Backtesting function "Naive MVP " (1/3)
Backtesting function "Robust MVP (box uncertainty set)" (2/3)
Backtesting function "Robust MVP (ellipsoidal uncertainty set)" (3/3)
Code
listPortfoliosWithFailures(bt)
Code
backtestBoxPlot(bt, "Sharpe ratio")
Code
backtestBoxPlot(bt, "max drawdown")
Portfolio resampling
Resampling methods
Estimating a parameter \(\bm{\theta}\) with \(\hat{\bm{\theta}}\) is insufficient without knowing the estimate’s accuracy. Confidence intervals traditionally provide this accuracy but rely heavily on theoretical mathematics. Resampling methods, like cross-validation and the bootstrap, use computer-based techniques to assess statistical accuracy without complex formulas (Efron and Tibshirani, 1993).
The bootstrap, introduced by Efron in 1979, mimics the original sampling process by resampling the observed data with replacement to create multiple bootstrap samples. Each sample leads to different realizations of the estimate, \(\hat{\bm{\theta}}^{*(b)}\), from which accuracy measures (bias, variance, confidence intervals) are empirically derived. This method allows for the assessment of estimator accuracy without needing additional data, with the statistical behavior of \(\hat{\bm{\theta}}^{*(b)}\) faithfully representing that of \(\hat{\bm{\theta}}\) compared to the true parameter \(\bm{\theta}\), especially as the number of resamples \(B\) increases (Efron and Tibshirani, 1993).
To be more precise, the original \(n\) observations \(\bm{x}_1,\dots,\bm{x}_n\) are sampled \(n\) times with replacement (some samples will be selected multiple times while others will not be used). This procedure is repeated \(B\) times to obtain the bootstrap samples, \[ \big(\bm{x}_1,\dots,\bm{x}_n\big) \rightarrow \left(\bm{x}_1^{*(b)},\dots,\bm{x}_n^{*(b)}\right), \quad b=1,\dots,B, \] each of which leads to a different realization of the estimation (bootstrap replicates), \[ \hat{\bm{\theta}}^{*(b)} = f\left(\bm{x}_1^{*(b)},\dots,\bm{x}_n^{*(b)}\right), \quad b=1,\dots,B, \] from which measures of accuracy of the estimator (bias, variance, confidence intervals, etc.) can then be empirically obtained.
The key theoretical result is that the statistical behavior of the random resampled estimates \(\hat{\bm{\theta}}^{*(b)}\) compared to \(\hat{\bm{\theta}}\) (taken as the true parameter) faithfully represent the statistics of the random estimates \(\hat{\bm{\theta}}\) compared to the true (unknown) parameter \(\bm{\theta}\). More exactly, the estimations of accuracy are asymptotically consistent as \(B\rightarrow\infty\) (under some technical conditions) (Efron and Tibshirani, 1993). This is a rather surprising result that allows the empirical assessment of the accuracy of the estimator without having access to the true parameter \(\bm{\theta}\).
We now illustrate the magic of the bootstrap by estimating the accuracy of the sample mean estimator (from \(n=100\) observations). In this case, the empirical distribution of the bias of the estimator is computed via \(B=1,000\) bootstraps, producing an accurate histogram compared to the true distribution. In practice, confidence intervals may suffice to assess the accuracy and fewer bootstraps may be used (even fewer are necessary to compute the standard deviation of the bias).
Code
library(ggplot2)
B <- 1000        # number of bootstraps
n <- 100         # number of observations for the estimation
mu_true <- 0
sd_true <- 1
set.seed(42)

# nominal sample
x_nominal <- rnorm(n = n, mean = mu_true, sd = sd_true)
mu_nominal <- mean(x_nominal)
sd_nominal <- sd(x_nominal)
df <- data.frame()

# nonparametric bootstrap
mu_hat_resampled <- c()
for (i in 1:B) {
  x_resampled <- sample(x_nominal, size = n, replace = TRUE)
  mu_hat_resampled <- c(mu_hat_resampled, mean(x_resampled))
}
df <- rbind(df, data.frame("bias" = mu_hat_resampled - mu_nominal,
                           "distribution" = "nonparametric bootstrap"))
# plot
ggplot(df[df$distribution != "true", ], aes(x = bias, fill = distribution)) +
geom_histogram(aes(y = ..density..), position = "identity", alpha = 0.7, color = "gray31", show.legend = FALSE) +
stat_function(fun = dnorm, args = list(mean = mu_true, sd = sd_true/sqrt(n)), color = "black", linewidth = 1.2) +
annotate("text", x = 0.13, y = 3.58, label = "true distribution") +
geom_segment(aes(x = 0.1, y = 3.45, xend = 0.088, yend = 2.8)) +
xlim(-0.4, 0.4) +
theme(axis.text.y = element_blank()) +
theme(legend.title = element_blank()) +
ggtitle("Empirical distribution of the mean estimation bias")
Portfolio resampling
In portfolio design, an optimization problem is formulated based on \(T\) observations of the assets’ returns \(\bm{x}_1,\dots,\bm{x}_T\), whose solution is supposedly an optimal portfolio \(\w\). As previously discussed, this solution is very sensitive to the inherent noise in the observed data or, equivalently, in the estimated parameters used in the portfolio formulation, such as the mean vector \(\hat{\bmu}\) and covariance matrix \(\hat{\bSigma}\).
Fortunately, we can capitalize on the results from the past half century in statistics; in particular, we can use resampling techniques, such as the bootstrap and bagging, to improve the portfolio design.
Portfolio bagging: The technique of aggregating portfolios was considered in 1998 via a bagging procedure (Michaud and Michaud, 1998, 2007, 2008; Scherer, 2002):
- Resample the original data \((\bm{x}_1,\dots,\bm{x}_T)\) \(B\) times via the bootstrap method and estimate \(B\) different versions of the mean vector and covariance matrix: \(\hat{\bmu}^{*(b)}\) and \(\hat{\bSigma}^{*(b)}\) for \(b=1,\dots,B\);
- Solve the optimal portfolio \(\w^{*(b)}\) for each bootstrap sample; and
- Average the portfolios via bagging: \[ \w^{\textm{bag}} = \frac{1}{B}\sum_{b=1}^B \w^{*(b)}. \]
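Before the book's full implementation in the next section, here is a minimal sketch of these three steps for the mean–variance portfolio; it assumes the design_MVP() function defined earlier in this chapter, and the helper name bag_portfolio is illustrative only:
Code
# minimal sketch of portfolio bagging (illustrative; the full bootstrap_portfolio() follows below)
bag_portfolio <- function(X, B = 20, lambda = 1) {
  w_list <- lapply(1:B, function(b) {
    X_boot <- X[sample(nrow(X), replace = TRUE), ]            # 1) bootstrap resample the T observations
    design_MVP(mu = colMeans(X_boot), Sigma = cov(X_boot),    # 2) solve the MVP on the resampled data
               lambda = lambda)
  })
  Reduce("+", w_list) / B                                     # 3) average (bag) the B portfolios
}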
Numerical experiments
Sensitivity
First, let’s write the code for some resampled portfolios, namely, the bagged portfolio, the subset resampled (SSR) portfolio, and their combination, the SSR bagged portfolio:
Code
bootstrap_portfolio <- function(portfolio_fun, X,
                                num_bootstraps = 20, same_seed = TRUE, SSR_exp = 0.7,
                                type_bootstrap = c("vanilla", "subset resampling", "vanilla bootstrap + subset resampling"),
                                type_aggregation = c("mean", "gmedian"), force_sum_abs_w_1 = FALSE, cardinality = NULL) {
  type_bootstrap <- match.arg(type_bootstrap)
  X <- as.matrix(X)
  if (is.null(colnames(X)))
    colnames(X) <- paste0("asset #", 1:ncol(X))

  # generate bootstrap samples
  X_sample_list <- switch(type_bootstrap,
    "vanilla" = {
      replicate(num_bootstraps, X[sample(nrow(X), replace = TRUE), ], simplify = FALSE)
    },
    "subset resampling" = {
      replicate(num_bootstraps, X[, sample(ncol(X), ceiling(ncol(X)^SSR_exp))], simplify = FALSE)
    },
    "vanilla bootstrap + subset resampling" = {
      replicate(num_bootstraps, X[sample(nrow(X), replace = TRUE), sample(ncol(X), ceiling(ncol(X)^SSR_exp))], simplify = FALSE)
    },
    stop("Bootstrap method unknown")
  )

  # design portfolios
  w_list <- lapply(X_sample_list, portfolio_fun)

  # include zeros in case of subset resampling
  if (grepl("subset resampling", type_bootstrap)) {
    w_list <- lapply(w_list, function(w) {
      if (is.null(names(w)))
        stop("w does not have names in subset resampling")
      w_full <- rep(0, ncol(X))
      names(w_full) <- colnames(X)
      w_full[names(w)] <- w
      return(w_full)
    })
  }

  # aggregate portfolios
  w_matrix <- do.call(rbind, w_list)
  w <- colMeans(w_matrix)
  return(w)
}

design_MVP_bagged <- function(X, B, lambda = 1) {
  bootstrap_portfolio(
    portfolio_fun = function(X_sample) design_MVP(mu = colMeans(X_sample), Sigma = cov(X_sample), lambda = lambda),
    X = X,
    num_bootstraps = B,
    type_bootstrap = "vanilla",
    type_aggregation = "mean",
    cardinality = ncol(X)
  )
}

design_MVP_SSR <- function(X, B, lambda = 1) {
  bootstrap_portfolio(
    portfolio_fun = function(X_sample) design_MVP(mu = colMeans(X_sample), Sigma = cov(X_sample), lambda = lambda),
    X = X,
    num_bootstraps = B,
    type_bootstrap = "subset resampling",
    type_aggregation = "mean",
    cardinality = ncol(X)
  )
}

design_MVP_SSR_bagged <- function(X, B, lambda = 1) {
  bootstrap_portfolio(
    portfolio_fun = function(X_sample) design_MVP(mu = colMeans(X_sample), Sigma = cov(X_sample), lambda = lambda),
    X = X,
    num_bootstraps = B,
    type_bootstrap = "vanilla bootstrap + subset resampling",
    type_aggregation = "mean",
    cardinality = ncol(X)
  )
}
We can now observe the reduced sensitivity of the bagged mean–variance portfolios:
Code
library(pob)
library(mvtnorm)
stock_prices <- SP500_2015to2020$stocks["2019::", c("AAPL", "AMZN", "AMD", "GM", "GOOGL", "MGM", "MSFT", "QCOM", "TSCO", "UPS")]

MVP_bagged_noisy <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  X_noisy <- rmvnorm(n = nrow(X), mean = colMeans(X), sigma = cov(X))
  design_MVP_bagged(X_noisy, B = 20, lambda = 1)
}

# backtests
set.seed(42)
bt <- portfolioBacktest(list("#1" = MVP_bagged_noisy,
                             "#2" = MVP_bagged_noisy,
                             "#3" = MVP_bagged_noisy,
                             "#4" = MVP_bagged_noisy,
                             "#5" = MVP_bagged_noisy,
                             "#6" = MVP_bagged_noisy),
                        list(list("prices" = stock_prices)), price_name = "prices",
                        lookback = 6*21, optimize_every = 10000, rebalance_every = 10000)
Backtesting 6 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt)
# plot
data.frame(
"stocks" = names(stock_prices),
"#1" = as.numeric(bt$`#1`$data1$w_optimized[1, ]),
"#2" = as.numeric(bt$`#2`$data1$w_optimized[1, ]),
"#3" = as.numeric(bt$`#3`$data1$w_optimized[1, ]),
"#4" = as.numeric(bt$`#4`$data1$w_optimized[1, ]),
"#5" = as.numeric(bt$`#5`$data1$w_optimized[1, ]),
"#6" = as.numeric(bt$`#6`$data1$w_optimized[1, ]),
check.names = FALSE) |>
melt(id.vars = "stocks") |>
ggplot(aes(x = stocks, y = value, fill = variable)) +
geom_bar(stat = "identity", position = "dodge", color = "black", width = 0.8) +
theme(legend.title = element_text()) + labs(fill = "realization") +
labs(title = "Portfolio for different estimation errors", y = "weight")
Naive versus resampled portfolios
We now compute the empirical distribution of the achieved mean–variance objective, as well as the Sharpe ratio, calculated over 1,000 Monte Carlo noisy observations. We can observe that the resampled portfolios are more stable and do not suffer from extremely bad realizations (unlike the naive portfolio). However, the naive portfolio can be superior for some realizations.
Code
library(pob)
library(mvtnorm)
library(ggridges)
MVP_noisy <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  X_noisy <- rmvnorm(n = nrow(X), mean = colMeans(X), sigma = cov(X))
  design_MVP(mu = colMeans(X_noisy), Sigma = cov(X_noisy), lambda = 1)
}

MVP_bagged_noisy <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  X_noisy <- rmvnorm(n = nrow(X), mean = colMeans(X), sigma = cov(X))
  design_MVP_bagged(X_noisy, B = 20, lambda = 1)
}

MVP_SSR_noisy <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  X_noisy <- rmvnorm(n = nrow(X), mean = colMeans(X), sigma = cov(X))
  design_MVP_SSR(X_noisy, B = 20, lambda = 1)
}

MVP_SSR_bagged_noisy <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  X_noisy <- rmvnorm(n = nrow(X), mean = colMeans(X), sigma = cov(X))
  design_MVP_SSR_bagged(X_noisy, B = 20, lambda = 1)
}

# parameters
num_realiz_histogram <- 1000
N <- 50
lookback_months <- 12

set.seed(42)
stock_prices <- SP500_2015to2020$stocks["2019::", sample(471, N)]

# true mu and Sigma in the first training set
X_ <- diff(log(stock_prices[1:(lookback_months*21), ]))[-1]
mu_true <- colMeans(X_)
Sigma_true <- cov(X_)
w_clairvoyant <- design_MVP(mu = mu_true, Sigma = Sigma_true, lambda = 1)

# backtests for naive MVP
set.seed(42)
bt_naive <- portfolioBacktest(rep(list(MVP_noisy), num_realiz_histogram),
                              list(list("prices" = stock_prices)), price_name = "prices",
                              lookback = lookback_months*21, optimize_every = 10000, rebalance_every = 10000,
                              paral_portfolios = 6, show_progress_bar = TRUE,
                              return_returns = FALSE)
Backtesting 1000 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt_naive)
# backtests for bagged MVP
set.seed(42)
bt_bagged <- portfolioBacktest(rep(list(MVP_bagged_noisy), num_realiz_histogram),
                               list(list("prices" = stock_prices)), price_name = "prices",
                               lookback = lookback_months*21, optimize_every = 10000, rebalance_every = 10000,
                               paral_portfolios = 6, show_progress_bar = TRUE,
                               return_returns = FALSE)
Backtesting 1000 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt_bagged)
# backtests for SSR MVP
set.seed(42)
bt_SSR <- portfolioBacktest(rep(list(MVP_SSR_noisy), num_realiz_histogram),
                            list(list("prices" = stock_prices)), price_name = "prices",
                            lookback = lookback_months*21, optimize_every = 10000, rebalance_every = 10000,
                            paral_portfolios = 6, show_progress_bar = TRUE,
                            return_returns = FALSE)
Backtesting 1000 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt_SSR)
# backtests for SSR bagged MVP
set.seed(42)
bt_SSR_bagged <- portfolioBacktest(rep(list(MVP_SSR_bagged_noisy), num_realiz_histogram),
                                   list(list("prices" = stock_prices)), price_name = "prices",
                                   lookback = lookback_months*21, optimize_every = 10000, rebalance_every = 10000,
                                   paral_portfolios = 6, show_progress_bar = TRUE,
                                   return_returns = FALSE)
Backtesting 1000 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt_SSR_bagged)
# evaluation
fun_mean_variance <- function(w) as.numeric(t(w) %*% mu_true - (1/2) * t(w) %*% Sigma_true %*% w)
fun_Sharpe_ratio <- function(w) as.numeric(t(w) %*% mu_true / sqrt(t(w) %*% Sigma_true %*% w))

mean_variance_clairvoyant <- fun_mean_variance(w_clairvoyant)
mean_variance_naive <- sapply(bt_naive, function(bt_realiz) fun_mean_variance(as.vector(bt_realiz$data1$w_optimized[1, ])))
mean_variance_bagged <- sapply(bt_bagged, function(bt_realiz) fun_mean_variance(as.vector(bt_realiz$data1$w_optimized[1, ])))
mean_variance_SSR <- sapply(bt_SSR, function(bt_realiz) fun_mean_variance(as.vector(bt_realiz$data1$w_optimized[1, ])))
mean_variance_SSR_bagged <- sapply(bt_SSR_bagged, function(bt_realiz) fun_mean_variance(as.vector(bt_realiz$data1$w_optimized[1, ])))

SR_clairvoyant <- fun_Sharpe_ratio(w_clairvoyant)
SR_naive <- sapply(bt_naive, function(bt_realiz) fun_Sharpe_ratio(as.vector(bt_realiz$data1$w_optimized[1, ])))
SR_bagged <- sapply(bt_bagged, function(bt_realiz) fun_Sharpe_ratio(as.vector(bt_realiz$data1$w_optimized[1, ])))
SR_SSR <- sapply(bt_SSR, function(bt_realiz) fun_Sharpe_ratio(as.vector(bt_realiz$data1$w_optimized[1, ])))
SR_SSR_bagged <- sapply(bt_SSR_bagged, function(bt_realiz) fun_Sharpe_ratio(as.vector(bt_realiz$data1$w_optimized[1, ])))

df <- data.frame("method" = "Naive MVP",
                 "realization" = 1:num_realiz_histogram,
                 "mean-variance" = mean_variance_naive,
                 "Sharpe ratio" = SR_naive,
                 check.names = FALSE)
df <- rbind(df, data.frame("method" = "Bagged MVP",
                           "realization" = 1:num_realiz_histogram,
                           "mean-variance" = mean_variance_bagged,
                           "Sharpe ratio" = SR_bagged,
                           check.names = FALSE))
df <- rbind(df, data.frame("method" = "Subset resampled MVP",
                           "realization" = 1:num_realiz_histogram,
                           "mean-variance" = mean_variance_SSR,
                           "Sharpe ratio" = SR_SSR,
                           check.names = FALSE))
df <- rbind(df, data.frame("method" = "Subset bagged MVP",
                           "realization" = 1:num_realiz_histogram,
                           "mean-variance" = mean_variance_SSR_bagged,
                           "Sharpe ratio" = SR_SSR_bagged,
                           check.names = FALSE))
df$method <- factor(df$method, levels = unique(df$method))
Code
ggplot(df, aes(x = `mean-variance`, y = method, fill = method)) +
geom_density_ridges(show.legend = FALSE) +
scale_y_discrete(limits = rev(levels(df$method)), expand = c(0, 0.2, 0, 1.6)) +
theme(axis.text.y = element_text(colour = "black", size = 10)) +
xlim(c(-0.001, NA)) +
labs(title = "Empirical distribution of mean-variance value", y = NULL)
Code
ggplot(df, aes(x = `Sharpe ratio`, y = method, fill = method)) +
geom_density_ridges(show.legend = FALSE) +
scale_y_discrete(limits = rev(levels(df$method)), expand = c(0, 0.2, 0, 1.6)) +
theme(axis.text.y = element_text(colour = "black", size = 10)) +
xlim(c(-0.05, NA)) +
labs(title = "Empirical distribution of Sharpe ratio", y = NULL)
Backtests
Backtest of naive versus resampled mean–variance portfolios:
Code
library(pob)
MVP_naive <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  design_MVP(mu = colMeans(X), Sigma = cov(X), lambda = 1)
}

MVP_bagged <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  design_MVP_bagged(X, B = 20, lambda = 1)
}

MVP_SSR <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  design_MVP_SSR(X, B = 20, lambda = 1)
}

MVP_SSR_bagged <- function(dataset, ...) {
  X <- diff(log(dataset$prices))[-1]
  design_MVP_SSR_bagged(X, B = 20, lambda = 1)
}

# single backtest
N <- 50
lookback_months <- 6
set.seed(42)
stock_prices <- SP500_2015to2020$stocks["2017::", sample(471, N)]

bt <- portfolioBacktest(list("Naive MVP" = MVP_naive,
                             "Bagged MVP" = MVP_bagged,
                             "Subset resampled MVP" = MVP_SSR,
                             "Subset bagged MVP" = MVP_SSR_bagged),
                        list(list("prices" = stock_prices)), price_name = "prices",
                        lookback = lookback_months*21, optimize_every = 21, rebalance_every = 1,
                        return_portfolio = FALSE)
Backtesting 4 portfolios over 1 datasets (periodicity = daily data)...
Code
listPortfoliosWithFailures(bt)
bt <- lapply(bt, function(pf) {pf$data1$X_lin <- NULL; return(pf)})  # reduce object size
Code
backtestChartCumReturn(bt) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_blank()) +
ggtitle("Cumulative P&L")
Code
backtestChartDrawdown(bt) +
scale_x_date(date_breaks = "6 months", date_labels = "%b %Y", date_minor_breaks = "1 week") +
theme(legend.title = element_blank()) +
ggtitle("Drawdown")
To be more exhaustive, we now run multiple backtests of naive versus resampled mean–variance portfolios:
Code
library(pob)
# multiple backtests
N <- 50
lookback_months <- 6
set.seed(42)
stock_prices_resampled <- financialDataResample(list("prices" = SP500_2015to2020$stocks),
                                                num_datasets = 200, N_sample = N, T_sample = 12*21)
200 datasets resampled (with N = 50 instruments and length T = 252) from the original data between 2015-01-05 and 2020-09-22.
Code
<- portfolioBacktest(list("Naive MVP" = MVP_naive,
bt "Bagged MVP" = MVP_bagged,
"Subset resampled MVP" = MVP_SSR,
"Subset bagged MVP" = MVP_SSR_bagged),
dataset_list = stock_prices_resampled,
lookback = lookback_months*21, optimize_every = 21, rebalance_every = 1,
return_portfolio = FALSE, return_returns = FALSE,
paral_datasets = 6, show_progress_bar = TRUE)
Backtesting 4 portfolios over 200 datasets (periodicity = daily data)...
Backtesting function "Naive MVP " (1/4)
Backtesting function "Bagged MVP " (2/4)
Backtesting function "Subset resampled MVP" (3/4)
Backtesting function "Subset bagged MVP" (4/4)
Code
listPortfoliosWithFailures(bt)
Code
backtestBoxPlot(bt, "Sharpe ratio")
Code
backtestBoxPlot(bt, "max drawdown")