Portfolio Optimization
Chapter 3 - Financial Data: IID Modeling
R code
R code examples for Chapter 3 of the book:
Daniel P. Palomar (2024). Portfolio Optimization: Theory and Application. Cambridge University Press.
Loading packages
The following packages are used in the examples:
# basic finance
library(xts)             # to manipulate time series of stock data
library(pob)             # book package with financial data
                         # install with: devtools::install_github("dppalomar/pob")
# data
library(mvtnorm)         # to generate heavy-tailed data
library(fitHeavyTail)    # to fit heavy-tailed distributions
library(ICSNP)           # to calculate the spatial median
library(covFactorModel)  # factor model estimation
                         # install with: devtools::install_github("dppalomar/covFactorModel")
# plotting
library(ggplot2)         # for nice plots
library(patchwork)       # for combining plots
library(reshape2)        # to reshape data
library(dplyr); conflictRules('dplyr', exclude = 'lag'); options(xts.warn_dplyr_breaks_lag = FALSE)
library(scales)
library(latex2exp)       # for latex symbols with TeX()
IID model
Example of a synthetic Gaussian i.i.d. time series:
library(mvtnorm)
# get realistic mu and variance
sp500_prices <- SP500_2015to2020$index
sp500_returns <- diff(log(sp500_prices))[-1]
mu <- mean(sp500_returns)
var <- var(sp500_returns)

# generate synthetic data
set.seed(42)
T <- 500
x <- rmvnorm(n = T, mean = mu, sigma = var)
data.frame(t = 1:T, x = x) |>
ggplot(aes(x = t, y = x)) +
geom_line(linewidth = 0.8, color = "blue", show.legend = FALSE) +
labs(title = "i.i.d. time series", x = NULL, y = NULL)
Sample estimators
Estimation error of sample estimators versus number of observations (for Gaussian data with \(N=100\)):
library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)

# generate synthetic data
set.seed(42)
T_max <- 2000
X <- rmvnorm(n = 10*T_max, mean = mu_true, sigma = Sigma_true)

# main loop
df <- data.frame()
T_sweep <- seq(from = 50, by = 50, to = T_max)
for(T in T_sweep) {
  for (i in 1:100) {
    X_ <- X[sample(nrow(X), T), ]
    mu <- colMeans(X_)
    Sigma <- cov(X_)
    df <- rbind(df,
                data.frame("T" = T,
                           "method" = "sample mean",
                           "error mu" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                           "error Sigma" = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                           check.names = FALSE))
  }
}

df <- df |>
  group_by(method, T) |>
  summarize("error mu" = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = T, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  labs(title = "Estimation error of sample mean", x = "T", y = "normalized error (%)")
p_error_Sigma <- df |>
  ggplot(aes(x = T, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  labs(title = "Estimation error of sample covariance matrix", x = "T", y = "normalized error (%)")
p_error_mu / p_error_Sigma
Location estimators
Estimation error of location estimators versus number of observations (with \(N=100\)):
library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
nu <- 4
scatter_true <- (nu-2)/nu * Sigma_true  # cov = nu/(nu-2) * scatter

# generate synthetic data
set.seed(42)
T_max <- 1000
X_Gaussian <- rmvnorm(n = 10*T_max, mean = mu_true, sigma = Sigma_true)
X_heavy_tail <- rmvt(n = 10*T_max, delta = mu_true, sigma = scatter_true, df = nu)

# main loop
df <- data.frame()
T_sweep <- seq(from = 50, by = 50, to = T_max)
for(T in T_sweep) {
  for (distribution in c("Gaussian", "heavy-tailed")) {
    for (i in 1:100) {
      if (distribution == "Gaussian")
        X_ <- X_Gaussian[sample(nrow(X_Gaussian), T), ]
      else if (distribution == "heavy-tailed")
        X_ <- X_heavy_tail[sample(nrow(X_heavy_tail), T), ]
      else stop("Unknown distribution.")
      # sample mean
      mu <- colMeans(X_)
      df <- rbind(df, data.frame("T" = T,
                                 "distribution" = distribution,
                                 "method" = "sample mean",
                                 "error mu" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 check.names = FALSE))
      # median
      mu <- apply(X_, 2, median)
      df <- rbind(df, data.frame("T" = T,
                                 "distribution" = distribution,
                                 "method" = "median",
                                 "error mu" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 check.names = FALSE))
      # spatial median
      mu <- ICSNP::spatial.median(X_)
      df <- rbind(df, data.frame("T" = T,
                                 "distribution" = distribution,
                                 "method" = "spatial median",
                                 "error mu" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 check.names = FALSE))
    }
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T, distribution) |>
  summarize("error mu" = 100*mean(`error mu`)) |>
  ungroup()

# plot
p_Gaussian <- df[df$distribution == "Gaussian", ] |>
  ggplot(aes(x = T, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  coord_cartesian(xlim = c(1, T_max)) +
  labs(title = "Error of location estimators under Gaussian data", x = "T", y = "normalized error (%)")
p_heavy_tail <- df[df$distribution == "heavy-tailed", ] |>
  ggplot(aes(x = T, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  coord_cartesian(xlim = c(1, T_max)) +
  labs(title = "Error of location estimators under heavy-tailed data", x = "T", y = "normalized error (%)")
p_Gaussian / p_heavy_tail + plot_layout(guides = "collect")
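The scaling scatter_true = (nu-2)/nu * Sigma_true used above follows from the covariance of a multivariate \(t\) distribution, \(\mathsf{Cov} = \frac{\nu}{\nu-2}\Sigma_{\text{scatter}}\) for \(\nu>2\). A quick empirical check (a sketch for illustration, not part of the book code):
# empirical check: the sample covariance of t-distributed samples should approach Sigma_true
set.seed(42)
X_check <- rmvt(n = 100000, delta = mu_true, sigma = scatter_true, df = nu)
norm(cov(X_check) - Sigma_true, "F") / norm(Sigma_true, "F")  # relative error, shrinks as n grows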
Estimation error of location estimators versus degrees of freedom in a \(t\) distribution (with \(T=200\) and \(N=100\)):
library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
T <- 200

# main loop
set.seed(42)
df <- data.frame()
nu_sweep <- seq(from = 3, by = 2, to = 30)
for(nu in nu_sweep) {
  for (i in 1:200) {
    # generate synthetic data
    X_ <- rmvt(n = T, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)

    # sample mean
    mu <- colMeans(X_)
    df <- rbind(df, data.frame("T" = T,
                               "nu" = nu,
                               "method" = "sample mean",
                               "error mu" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               check.names = FALSE))
    # median
    mu <- apply(X_, 2, median)
    df <- rbind(df, data.frame("T" = T,
                               "nu" = nu,
                               "method" = "median",
                               "error mu" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               check.names = FALSE))
    # spatial median
    mu <- ICSNP::spatial.median(X_)
    df <- rbind(df, data.frame("T" = T,
                               "nu" = nu,
                               "method" = "spatial median",
                               "error mu" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               check.names = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T, nu) |>
  summarize("error mu" = 100*mean(`error mu`)) |>
  ungroup()

# plot
df |>
  ggplot(aes(x = nu, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  labs(title = "Error of location estimators", x = TeX("$\\nu$"), y = "normalized error (%)")
Gaussian ML estimators
Estimation error of Gaussian ML estimators versus number of observations (with \(N=100\)):
library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
nu <- 4
scatter_true <- (nu-2)/nu * Sigma_true  # cov = nu/(nu-2) * scatter

# generate synthetic data
set.seed(42)
T_max <- 1000
X_Gaussian <- rmvnorm(n = 10*T_max, mean = mu_true, sigma = Sigma_true)
X_heavy_tail <- rmvt(n = 10*T_max, delta = mu_true, sigma = scatter_true, df = nu)

# main loop
df <- data.frame()
T_sweep <- seq(from = 50, by = 50, to = T_max)
for(T in T_sweep) {
  for (distribution in c("Gaussian", "heavy-tailed")) {
    for (i in 1:100) {
      if (distribution == "Gaussian")
        X_ <- X_Gaussian[sample(nrow(X_Gaussian), T), ]
      else if (distribution == "heavy-tailed")
        X_ <- X_heavy_tail[sample(nrow(X_heavy_tail), T), ]
      else stop("Unknown distribution.")
      # Gaussian ML estimators (sample mean and scaled SCM)
      mu <- colMeans(X_)
      Sigma <- (T-1)/T * cov(X_)
      df <- rbind(df, data.frame("T" = T,
                                 "distribution" = distribution,
                                 "method" = "Gaussian ML",
                                 "error mu" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 "error Sigma" = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                                 check.names = FALSE))
    }
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T, distribution) |>
  summarize("error mu" = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = T, y = `error mu`, color = distribution)) +
  geom_line(linewidth = 1.2) +
  labs(color = "data distribution") +
  labs(title = "Error of Gaussian ML mean estimator", x = "T", y = "normalized error (%)")
p_error_Sigma <- df |>
  ggplot(aes(x = T, y = `error Sigma`, color = distribution)) +
  geom_line(linewidth = 1.2) +
  labs(color = "data distribution") +
  labs(title = "Error of Gaussian ML covariance estimator", x = "T", y = "normalized error (%)")
p_error_mu / p_error_Sigma + plot_layout(guides = "collect")
Estimation error of Gaussian ML estimators versus degrees of freedom in a \(t\) distribution (with \(T=200\) and \(N=100\)):
library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
T <- 200

# main loop
set.seed(42)
df <- data.frame()
nu_sweep <- seq(from = 3, by = 2, to = 30)
for(nu in nu_sweep) {
  for (i in 1:200) {
    # generate synthetic data
    X <- rmvt(n = T, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)

    # Gaussian ML estimators (sample mean and scaled SCM)
    mu <- colMeans(X)
    Sigma <- (T-1)/T * cov(X)
    df <- rbind(df, data.frame("T" = T,
                               "nu" = nu,
                               "method" = "Gaussian ML",
                               "error mu" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma" = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T, nu) |>
  summarize("error mu" = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = nu, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  coord_cartesian(ylim = c(200, 250)) +
  labs(title = "Estimation error of Gaussian ML mean estimator", x = TeX("$\\nu$"), y = "normalized error (%)")
p_error_Sigma <- df |>
  ggplot(aes(x = nu, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  labs(title = "Estimation error of Gaussian ML covariance estimator", x = TeX("$\\nu$"), y = "normalized error (%)")
p_error_mu / p_error_Sigma
Heavy-tailed ML estimators
Estimation error of different ML estimators versus number of observations (for \(t\)-distributed heavy-tailed data with \(\nu=4\) and \(N=100\)):
library(mvtnorm)
library(fitHeavyTail)
library(ICSNP)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
nu <- 4

# generate synthetic data
set.seed(42)
T_max <- 1000
X <- rmvt(n = 10*T_max, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)

# main loop
df <- data.frame()
T_sweep <- seq(from = 150, by = 50, to = T_max)
for(T in T_sweep) {
  for (i in 1:100) {
    X_ <- X[sample(nrow(X), T), ]

    # Gaussian MLE
    mu <- colMeans(X_)
    Sigma <- (T-1)/T * cov(X_)
    df <- rbind(df, data.frame("T" = T,
                               "method" = "Gaussian MLE",
                               "error mu" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               "MAE mu" = sum(abs(mu - mu_true))/sum(abs(mu_true)),
                               "error Sigma" = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
    # heavy-tailed MLE
    fitted <- fit_mvt(X_, nu = "iterative", nu_iterative_method = "POP")
    df <- rbind(df, data.frame("T" = T,
                               "method" = "heavy-tailed MLE",
                               "error mu" = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "MAE mu" = sum(abs(fitted$mu - mu_true))/sum(abs(mu_true)),
                               "error Sigma" = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
    # Tyler estimator (with spatial median)
    fitted <- fit_Tyler(X_, initial = list(mu = ICSNP::spatial.median(X_)))
    df <- rbind(df, data.frame("T" = T,
                               "method" = "Tyler estimator",
                               "error mu" = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "MAE mu" = sum(abs(fitted$mu - mu_true))/sum(abs(mu_true)),
                               "error Sigma" = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T) |>
  summarize("error mu" = 100*mean(`error mu`),
            "RMSE mu" = 100*sqrt(mean(`error mu`)),
            "MAE mu" = 100*mean(`MAE mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = T, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  scale_color_discrete(name = "estimator", labels = c("Gaussian MLE", "heavy-tailed MLE", "spatial median")) +
  labs(title = "Error of mean estimators", x = "T", y = "normalized error (%)")
p_error_Sigma <- df |>
  ggplot(aes(x = T, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  labs(title = "Error of covariance estimators", x = "T", y = "normalized error (%)")
p_error_mu / p_error_Sigma
Estimation error of different ML estimators versus degrees of freedom in a \(t\) distribution (with \(T=200\) and \(N=100\)):
library(mvtnorm)
library(fitHeavyTail)
library(ICSNP)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
T <- 200

# main loop
set.seed(42)
df <- data.frame()
nu_sweep <- seq(from = 3, by = 2, to = 30)
for(nu in nu_sweep) {
  for (i in 1:200) {
    # generate synthetic data
    X <- rmvt(n = T, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)

    # Gaussian MLE
    mu <- colMeans(X)
    Sigma <- (T-1)/T * cov(X)
    df <- rbind(df, data.frame("T" = T,
                               "nu" = nu,
                               "method" = "Gaussian MLE",
                               "error mu" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma" = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
    # heavy-tailed MLE
    fitted <- fit_mvt(X, nu = "iterative", nu_iterative_method = "POP")
    df <- rbind(df, data.frame("T" = T,
                               "nu" = nu,
                               "method" = "heavy-tailed MLE",
                               "error mu" = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma" = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
    # Tyler estimator (with spatial median)
    fitted <- fit_Tyler(X, initial = list(mu = ICSNP::spatial.median(X)))
    df <- rbind(df, data.frame("T" = T,
                               "nu" = nu,
                               "method" = "Tyler estimator",
                               "error mu" = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma" = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T, nu) |>
  summarize("error mu" = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = nu, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  scale_color_discrete(name = "estimator", labels = c("Gaussian MLE", "heavy-tailed MLE", "spatial median")) +
  labs(title = "Error of mean estimators", x = TeX("$\\nu$"), y = "normalized error (%)")
p_error_Sigma <- df |>
  ggplot(aes(x = nu, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  labs(title = "Error of covariance estimators", x = TeX("$\\nu$"), y = "normalized error (%)")
p_error_mu / p_error_Sigma
Convergence of iterative robust heavy-tailed ML estimators:
library(mvtnorm)
library(fitHeavyTail)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
T <- 200
nu <- 4

# generate synthetic data
set.seed(42)
X <- rmvt(n = T, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)

# heavy-tailed MLE
fitted <- fit_mvt(X, nu = "iterative", nu_iterative_method = "ECM", ftol = 1e-3, return_iterates = TRUE)

# plot
data.frame("k" = 0:fitted$num_iterations,
           "negative log-likelihood" = -fitted$log_likelihood_vs_iterations,
           check.names = FALSE) |>
  ggplot(aes(x = k, y = `negative log-likelihood`)) +
  geom_line(linewidth = 1.2, color = "blue") +
  ggtitle("Convergence of objective function") + xlab("iterations")
Prior information
Shrinkage
First, we define a simple and convenient function to shrink the mean vector:
shrink_mu <- function(X, mu = NULL, cov = NULL, mu_sh_use_cor = TRUE,
                      mu_target = c("grand mean", "volatility weighted", "covmat weighted")) {
  T <- nrow(X)
  N <- ncol(X)

  if (is.null(mu))
    mu <- colMeans(X)
  if (is.null(cov))
    cov <- cov(X)

  # target
  if (!is.numeric(mu_target))
    mu_target <- switch(match.arg(mu_target),
                        "grand mean" = mean(mu),
                        "volatility weighted" = {
                          vol <- sqrt(diag(cov))
                          mean(mu/vol) * vol
                        },
                        "covmat weighted" = {
                          invSigma_ones <- if (mu_sh_use_cor) solve(cov2cor(cov), rep(1, N))
                                           else solve(cov, rep(1, N))
                          invSigma_ones <- pmax(0, invSigma_ones)
                          as.numeric(mu %*% invSigma_ones)/sum(invSigma_ones)  # return achieved by GMVP
                        },
                        stop("Shrinkage estimation method for mu not recognized!"))

  # shrinkage
  rho <- (N + 2) / ((N + 2) + T * as.numeric((mu - mu_target) %*% solve(cov, mu - mu_target)))  # improved James-Stein estimator
  #rho <- (N - 2)/(T*as.numeric((mu - mu_target) %*% solve(cov, mu - mu_target)))  # original James-Stein estimator
  rho <- max(0, min(1, rho))
  mu_shrinked <- (1 - rho) * mu + rho * mu_target
  return(mu_shrinked)
}
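A minimal usage sketch (with hypothetical toy data, for illustration only):
# usage sketch with toy data
set.seed(123)
X_toy <- matrix(rnorm(200*10, mean = 1e-3, sd = 0.02), nrow = 200, ncol = 10)
shrink_mu(X_toy, mu_target = "grand mean")  # shrink the sample mean toward its grand mean
shrink_mu(X_toy, mu_target = 0)             # a numeric target is used directly (shrink toward zero)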
Estimation error of different shrinkage estimators versus number of observations (for Gaussian data with \(N=100\)):
library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
N <- ncol(X)
mu_true <- colMeans(X)
Sigma_true <- cov(X)
nu <- 8

# generate synthetic data
set.seed(42)
T_max <- 550
X <- rmvt(n = 10*T_max, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)

# main loop
df <- data.frame()
T_sweep <- seq(from = 50, by = 50, to = T_max)
for(T in T_sweep) {
  for (i in 1:200) {
    X_ <- X[sample(nrow(X), T), ]

    #############                   #############
    #############   Mean estimation #############
    #############                   #############
    Sigma <- cov(X_)
    Sigma_reg <- Sigma + mean(diag(Sigma)) * diag(N)  # regularize Sigma for stability

    #
    # plain sample mean
    #
    mu <- colMeans(X_)
    df <- rbind(df, data.frame("T" = T,
                               "parameter" = "mu",
                               "method" = "sample mean",
                               "error" = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               check.names = FALSE))
    #
    # sample mean + shrinkage to zero
    #
    mu_JS <- shrink_mu(X_, mu = colMeans(X_), mu_target = 0, cov = Sigma_reg)
    df <- rbind(df, data.frame("T" = T,
                               "parameter" = "mu",
                               "method" = "shrinkage to zero",
                               "error" = norm(mu_JS - mu_true, "2")/norm(mu_true, "2"),
                               check.names = FALSE))
    #
    # sample mean + shrinkage to grand mean
    #
    mu_JS <- shrink_mu(X_, mu = colMeans(X_), mu_target = "grand mean", cov = Sigma_reg)
    df <- rbind(df, data.frame("T" = T,
                               "parameter" = "mu",
                               "method" = "shrinkage to grand mean",
                               "error" = norm(mu_JS - mu_true, "2")/norm(mu_true, "2"),
                               check.names = FALSE))
    #
    # sample mean + shrinkage to volatility-weighted mean
    #
    mu_JS <- shrink_mu(X_, mu = colMeans(X_), mu_target = "covmat weighted", cov = Sigma_reg, mu_sh_use_cor = FALSE)
    df <- rbind(df, data.frame("T" = T,
                               "parameter" = "mu",
                               "method" = "shrinkage to volatility-weighted mean",
                               "error" = norm(mu_JS - mu_true, "2")/norm(mu_true, "2"),
                               check.names = FALSE))

    #############                                #############
    #############  Covariance matrix estimation  #############
    #############                                #############
    #
    # plain SCM
    #
    Sigma <- cov(X_)
    df <- rbind(df, data.frame("T" = T,
                               "parameter" = "Sigma",
                               "method" = "sample covariance matrix",
                               "error" = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
    #
    # SCM + Ledoit-Wolf shrinkage to scaled identity
    #
    S <- cov(X_)
    Sigma_target <- mean(diag(S)) * diag(N)
    Xc <- X_ - matrix(colMeans(X_), T, N, byrow = TRUE)
    rho <- min(1, ((1/T^2) * sum(rowSums(Xc^2)^2) - 1/T * sum(S^2)) / sum((S - Sigma_target)^2))
    Sigma_sh <- (1 - rho)*S + rho*Sigma_target
    df <- rbind(df, data.frame("T" = T,
                               "parameter" = "Sigma",
                               "method" = "shrinkage to scaled identity",
                               "error" = norm(Sigma_sh - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
    #
    # SCM + Ledoit-Wolf shrinkage to diagonal matrix
    #
    S <- cov(X_)
    Sigma_target <- diag(diag(S))
    Xc <- X_ - matrix(colMeans(X_), T, N, byrow = TRUE)
    rho <- min(1, ((1/T^2) * sum(rowSums(Xc^2)^2) - 1/T * sum(S^2)) / sum((S - Sigma_target)^2))
    Sigma_sh <- (1 - rho)*S + rho*Sigma_target
    df <- rbind(df, data.frame("T" = T,
                               "parameter" = "Sigma",
                               "method" = "shrinkage to diagonal matrix",
                               "error" = norm(Sigma_sh - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
    #
    # SCM + Ledoit-Wolf shrinkage to equal-correlation matrix
    #
    S <- cov(X_)
    sigma <- sqrt(diag(S))
    C <- cov2cor(S)
    ave_corr <- mean(C[lower.tri(C)])
    C[lower.tri(C)] <- ave_corr
    C[upper.tri(C)] <- ave_corr
    Sigma_target <- t(C * sigma) * sigma
    Xc <- X_ - matrix(colMeans(X_), T, N, byrow = TRUE)
    rho <- min(1, ((1/T^2) * sum(rowSums(Xc^2)^2) - 1/T * sum(S^2)) / sum((S - Sigma_target)^2))
    Sigma_sh <- (1 - rho)*S + rho*Sigma_target
    df <- rbind(df, data.frame("T" = T,
                               "parameter" = "Sigma",
                               "method" = "shrinkage to equal-correlation matrix",
                               "error" = norm(Sigma_sh - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(parameter, method, T) |>
  summarize("error" = 100*mean(`error`)) |>
  ungroup()

# plot
p_error_mu <- df[df$parameter == "mu", ] |>
  ggplot(aes(x = T, y = `error`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  scale_x_continuous(breaks = seq(from = T_sweep[1], to = last(T_sweep), by = 100)) +
  labs(title = "Error of mean estimators", x = "T", y = "normalized error (%)")
p_error_Sigma <- df[df$parameter == "Sigma", ] |>
  ggplot(aes(x = T, y = `error`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  scale_x_continuous(breaks = seq(from = T_sweep[1], to = last(T_sweep), by = 100)) +
  labs(title = "Error of covariance estimators", x = "T", y = "normalized error (%)")
p_error_mu / p_error_Sigma
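The loop above repeats the same Ledoit-Wolf shrinkage formula for each target; a hypothetical refactoring sketch (not part of the book code) that factors it into a helper:
# hypothetical helper: Ledoit-Wolf shrinkage of the SCM toward a given target
# (same rho formula as in the loop above)
LW_shrinkage <- function(X, Sigma_target) {
  T <- nrow(X)
  S <- cov(X)
  Xc <- X - matrix(colMeans(X), T, ncol(X), byrow = TRUE)
  rho <- min(1, ((1/T^2) * sum(rowSums(Xc^2)^2) - 1/T * sum(S^2)) / sum((S - Sigma_target)^2))
  (1 - rho)*S + rho*Sigma_target
}
# e.g.: Sigma_sh <- LW_shrinkage(X_, diag(diag(cov(X_))))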
Factor models
Estimation error of covariance matrix under factor modeling versus number of observations (with \(N=100\)):
library(mvtnorm)
library(covFactorModel)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
N <- ncol(X)
mu_true <- rep(0, N)  #colMeans(X)

# Forcing low-rank covmat - method #1
Sigma_true <- covFactorModel(X, type = "Stat-PCA", K = 3)

# generate synthetic data
set.seed(42)
T_max <- 500
X <- rmvt(n = 10*T_max, delta = mu_true, sigma = Sigma_true, df = Inf)
X <- as.xts(X, order.by = as.Date("1995-03-15") + 1:nrow(X))

# main loop
df <- data.frame()
T_sweep <- seq(from = 100, by = 50, to = T_max)
eigenvalues_true <- eigen(Sigma_true)$values
for(T in T_sweep) {
  for (i in 1:200) {
    X_ <- X[sample(nrow(X), T), ]

    # SCM
    Sigma <- cov(X_)
    df <- rbind(df, data.frame("T" = T,
                               "method" = "sample covariance matrix",
                               "error eigenvalues" = norm(eigen(Sigma)$values - eigenvalues_true, "2")/norm(eigenvalues_true, "2"),
                               "error Sigma" = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
    # 1-factor model
    Sigma <- covFactorModel(X_, type = "Stat-PCA", K = 1)
    df <- rbind(df, data.frame("T" = T,
                               "method" = "factor model (K=1)",
                               "error eigenvalues" = norm(eigen(Sigma)$values - eigenvalues_true, "2")/norm(eigenvalues_true, "2"),
                               "error Sigma" = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
    # 3-factor model
    Sigma <- covFactorModel(X_, type = "Stat-PCA", K = 3)
    df <- rbind(df, data.frame("T" = T,
                               "method" = "factor model (K=3)",
                               "error eigenvalues" = norm(eigen(Sigma)$values - eigenvalues_true, "2")/norm(eigenvalues_true, "2"),
                               "error Sigma" = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
    # 5-factor model
    Sigma <- covFactorModel(X_, type = "Stat-PCA", K = 5)
    df <- rbind(df, data.frame("T" = T,
                               "method" = "factor model (K=5)",
                               "error eigenvalues" = norm(eigen(Sigma)$values - eigenvalues_true, "2")/norm(eigenvalues_true, "2"),
                               "error Sigma" = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T) |>
  summarize("error Sigma" = 100*mean(`error Sigma`),
            "error eigenvalues" = 100*mean(`error eigenvalues`)) |>
  ungroup()
# plot
df |>
  ggplot(aes(x = T, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  labs(title = "Error of covariance estimators", x = "T", y = "normalized error (%)")