R Code for Portfolio Optimization
Chapter 3 – Financial Data: I.I.D. Modeling

Daniel P. Palomar (2025). Portfolio Optimization: Theory and Application. Cambridge University Press.

Last update: February 17, 2025

Packages

The following packages are used in the examples:

# basic finance
library(xts)                    # to manipulate time series of stock data
library(pob)                    # book package with financial data
                                # install with: devtools::install_github("dppalomar/pob")

# data
library(mvtnorm)                # to generate heavy-tailed data
library(fitHeavyTail)           # to fit heavy-tailed distributions
library(ICSNP)                  # to calculate spatial mean
library(covFactorModel)         # factor model estimation
                                # devtools::install_github("dppalomar/covFactorModel")

# plotting
library(ggplot2)                # for nice plots
library(patchwork)              # for combining plots
library(reshape2)               # to reshape data
library(dplyr); conflictRules('dplyr', exclude = 'lag'); options(xts.warn_dplyr_breaks_lag = FALSE)
library(scales)
library(latex2exp)              # for latex symbols with TeX()
library(ellipse)

I.I.D. model

Example of a synthetic Gaussian i.i.d. time series:

library(mvtnorm)

# get realistic mu and variance
sp500_prices <- SP500_2015to2020$index
sp500_returns <- diff(log(sp500_prices))[-1]
mu <- mean(sp500_returns)
var <- var(sp500_returns)

# generate synthetic data 
set.seed(42)
T <- 500
x <- rmvnorm(n = T, mean = mu, sigma = var)

data.frame(t = 1:T, x = x) |>
  ggplot(aes(x = t, y = x)) +
  geom_line(linewidth = 0.8, color = "blue", show.legend = FALSE) +
  labs(title = "i.i.d. time series", x = NULL, y = NULL)

Sample estimators

Let’s warm up using the classical estimators for the mean and covariance matrix, namely the sample mean and the sample covariance matrix: \[ \begin{aligned} \hat{\bmu} &= \frac{1}{T}\sum_{t=1}^{T}\bm{x}_{t}\\ \hat{\bSigma} &= \frac{1}{T-1}\sum_{t=1}^{T}(\bm{x}_t - \hat{\bmu})(\bm{x}_t - \hat{\bmu})^\T. \end{aligned} \]

Illustration of sample estimators in 2D:

library(mvtnorm)
library(ellipse)

mu_true = c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
T <- 10

#
# Gaussian data
#
set.seed(42)
X <- rmvnorm(n = 200, mean = mu_true, sigma = Sigma_true)
X_ <- X[1:T, ]
mu_sm <- colMeans(X_)
X_centered <- X_ - matrix(mu_sm, T, 2, byrow = TRUE)
Sigma_scm <- 1/T * t(X_centered) %*% X_centered

# scatter plot
ggplot(data.frame(x = X[, 1], y = X[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  geom_point(data = data.frame(x=0, y=0), aes(x, y), color = "blue", size = 4) +
  geom_point(data = data.frame(x=mu_sm[1], y=mu_sm[2]), aes(x, y), color = "red", size = 4) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "blue", "2" = "red"),
                     labels = c("true", "sample estimator")) +
  labs(title = "Gaussian data", x = NULL, y = NULL)

Estimation error of sample estimators versus number of observations (for Gaussian data with \(N=100\)):

library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)

# generate synthetic data
set.seed(42)
T_max <- 2000
X <- rmvnorm(n = 10*T_max, mean = mu_true, sigma = Sigma_true)

# main loop
df <- data.frame()
T_sweep <- seq(from = 50, by = 50, to = T_max)
for(T in T_sweep) {
  for (i in 1:100) {
    X_ <- X[sample(nrow(X), T), ]
    mu <- colMeans(X_)
    Sigma <- cov(X_)
    df <- rbind(df, 
                data.frame("T"            = T,
                           "method"       = "sample mean",
                           "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                           "error Sigma"  = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                           check.names    = FALSE))
  }
}
 
df <- df |>
  group_by(method, T) |>
  summarize("error mu"    = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = T, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  labs(title = "Estimation error of sample mean", x = "T", y = "normalized error (%)")

p_error_Sigma <- df |>
  ggplot(aes(x = T, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  labs(title = "Estimation error of sample covariance matrix", x = "T", y = "normalized error (%)")

p_error_mu / p_error_Sigma

Location estimators

Illustration of different location estimators in 2D:

library(mvtnorm)
library(ellipse)
library(ICSNP)

mu_true = c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
nu <- 4
scatter_true <- (nu-2)/nu * Sigma_true  # cov=nu/(nu-2)*scatter
T <- 10  #20

# Generate synthetic heavy-tailed data
set.seed(42)
X <- rmvt(n = 800, delta = mu_true, sigma = scatter_true, df = nu)

# estimators
mu_mean <- colMeans(X[1:T, ])
mu_median <- apply(X[1:T, ], 2, median)
mu_spatial_median <- ICSNP::spatial.median(X[1:T, ])
df <- data.frame("location" = c("true", "sample mean", "median", "spatial median"),
                 x = c(mu_true[1], mu_mean[1], mu_median[1], mu_spatial_median[1]),
                 y = c(mu_true[2], mu_mean[2], mu_median[2], mu_spatial_median[2]))
df$location <- factor(df$location, levels = unique(df$location))

# scatter plot
colnames(Sigma_true) <- rownames(Sigma_true) <- NULL
ggplot(data.frame(x = X[, 1], y = X[, 2]), aes(x, y)) +
  geom_point(alpha = 0.7, size = 1) +
  geom_point(data = df, aes(x, y, fill = location, shape = location), size = 6) + 
  scale_shape_manual(values = c(21, 22, 23, 25)) +
  coord_cartesian(xlim = c(-0.08, 0.08), ylim = c(-0.08, 0.08)) +
  labs(title = "Scatter plot of data points with different location estimators", x = NULL, y = NULL)

Estimation error of location estimators versus number of observations (with \(N=100\)):

library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
nu <- 4
scatter_true <- (nu-2)/nu * Sigma_true  # cov=nu/(nu-2)*scatter

# generate synthetic data
set.seed(42)
T_max <- 1000
X_Gaussian   <- rmvnorm(n = 10*T_max, mean  = mu_true, sigma = Sigma_true)
X_heavy_tail <- rmvt(n = 10*T_max,    delta = mu_true, sigma = scatter_true, df = nu)

# main loop
df <- data.frame()
T_sweep <- seq(from = 50, by = 50, to = T_max)
for(T in T_sweep) {
  for (distribution in c("Gaussian", "heavy-tailed")) {
    for (i in 1:100) {
      if (distribution == "Gaussian")
        X_ <- X_Gaussian[sample(nrow(X_Gaussian), T), ]
      else if (distribution == "heavy-tailed")
        X_ <- X_heavy_tail[sample(nrow(X_heavy_tail), T), ]
      else stop("Unknown distribution.")
    
      # sample mean
      mu <- colMeans(X_)
      df <- rbind(df, data.frame("T"            = T,
                                 "distribution" = distribution,
                                 "method"       = "sample mean",
                                 "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 check.names    = FALSE))
      # median
      mu <- apply(X_, 2, median)
      df <- rbind(df, data.frame("T"            = T,
                                 "distribution" = distribution,
                                 "method"       = "median",
                                 "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 check.names    = FALSE))
      # spatial median
      mu <- ICSNP::spatial.median(X_)
      df <- rbind(df, data.frame("T"            = T,
                                 "distribution" = distribution,
                                 "method"       = "spatial median",
                                 "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 check.names    = FALSE))
    }
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T, distribution) |>
  summarize("error mu" = 100*mean(`error mu`)) |>
  ungroup()

# plot0
p_Gaussian <- df[df$distribution == "Gaussian", ] |>
  ggplot(aes(x = T, y = `error mu`, color = method,)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  coord_cartesian(xlim = c(1, T_max)) +
  labs(title = "Error of location estimators under Gaussian data", x = "T", y = "normalized error (%)")

p_heavy_tail <- df[df$distribution == "heavy-tailed", ] |>
  ggplot(aes(x = T, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  coord_cartesian(xlim = c(1, T_max)) +
  labs(title = "Error of location estimators under heavy-tailed data", x = "T", y = "normalized error (%)")

p_Gaussian / p_heavy_tail + plot_layout(guides = "collect")

Estimation error of location estimators versus degrees of freedom in a \(t\) distribution (with \(T=200\) and \(N=100\)):

library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
T <- 200

# main loop
set.seed(42)
df <- data.frame()
nu_sweep <- seq(from = 3, by = 2, to = 30)
for(nu in nu_sweep) {
  for (i in 1:200) {
    # generate synthetic data
    X_ <- rmvt(n = T, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)
    
    # sample mean
    mu    <- colMeans(X_)
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "sample mean",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               check.names    = FALSE))
    # median
    mu <- apply(X_, 2, median)
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "median",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               check.names    = FALSE))
    # spatial median
    mu <- ICSNP::spatial.median(X_)
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "spatial median",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               check.names    = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T, nu) |>
  summarize("error mu"    = 100*mean(`error mu`)) |>
  ungroup()

# plot
df |>
  ggplot(aes(x = nu, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  labs(title = "Error of location estimators", x = TeX("$\\nu$"), y = "normalized error (%)")

Gaussian ML estimators

Estimation error of Gaussian ML estimators versus number of observations (with \(N=100\)):

library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
nu <- 4
scatter_true <- (nu-2)/nu * Sigma_true  # cov=nu/(nu-2)*scatter

# generate synthetic data
set.seed(42)
T_max <- 1000
X_Gaussian   <- rmvnorm(n = 10*T_max, mean  = mu_true, sigma = Sigma_true)
X_heavy_tail <- rmvt(n = 10*T_max,    delta = mu_true, sigma = scatter_true, df = nu)

# main loop
df <- data.frame()
T_sweep <- seq(from = 50, by = 50, to = T_max)
for(T in T_sweep) {
  for (distribution in c("Gaussian", "heavy-tailed")) {
    for (i in 1:100) {
      if (distribution == "Gaussian")
        X_ <- X_Gaussian[sample(nrow(X_Gaussian), T), ]
      else if (distribution == "heavy-tailed")
        X_ <- X_heavy_tail[sample(nrow(X_heavy_tail), T), ]
      else stop("Unknown distribution.")
    
      # sample mean
      mu    <- colMeans(X_)
      Sigma <- (T-1)/T * cov(X_)
      df <- rbind(df, data.frame("T"            = T,
                                 "distribution" = distribution,
                                 "method"       = "Gaussian ML",
                                 "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 "error Sigma"  = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                                 check.names    = FALSE))
    }
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T, distribution) |>
  summarize("error mu"    = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = T, y = `error mu`, color = distribution)) +
  geom_line(linewidth = 1.2) +
  labs(color = "data distribution") +
  labs(title = "Error of Gaussian ML mean estimator", x = "T", y = "normalized error (%)")

p_error_Sigma <- df |>
  ggplot(aes(x = T, y = `error Sigma`, color = distribution)) +
  geom_line(linewidth = 1.2) +
  labs(color = "data distribution") +
  labs(title = "Error of Gaussian ML covariance estimator", x = "T", y = "normalized error (%)")

p_error_mu / p_error_Sigma + plot_layout(guides = "collect")

Estimation error of Gaussian ML estimators versus degrees of freedom in a \(t\) distribution (with \(T=200\) and \(N=100\)):

library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
T <- 200

# main loop
set.seed(42)
df <- data.frame()
nu_sweep <- seq(from = 3, by = 2, to = 30)
for(nu in nu_sweep) {
  for (i in 1:200) {
    # generate synthetic data
    X <- rmvt(n = T, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)
    
    # sample mean
    mu    <- colMeans(X)
    Sigma <- (T-1)/T * cov(X)
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "Gaussian ML",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma"  = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T, nu) |>
  summarize("error mu"    = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = nu, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  coord_cartesian(ylim = c(200, 250)) +
  labs(title = "Estimation error of Gaussian ML mean estimator", x = TeX("$\\nu$"), y = "normalized error (%)")

p_error_Sigma <- df |>
  ggplot(aes(x = nu, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  labs(title = "Estimation error of Gaussian ML covariance estimator", x = TeX("$\\nu$"), y = "normalized error (%)")

p_error_mu / p_error_Sigma

The failure of Gaussian ML estimators

Effect of heavy tails and outliers in the Gaussian ML covariance matrix estimator:

Gaussian data:

mu_true = c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
T <- 10

#
# Gaussian data
#
set.seed(42)
X_gaussian <- rmvnorm(n = 200, mean = mu_true, sigma = Sigma_true)
X_ <- X_gaussian[1:T, ]
Sigma_scm <- 1/T * t(X_) %*% X_

# scatter plot
ggplot(data.frame(x = X[, 1], y = X[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "blue", "2" = "red"),
                     labels = c("true", "Gaussian ML estimator")) +
  labs(title = "Gaussian data", x = NULL, y = NULL)

Heavy-tailed data:

mu_true = c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
nu <- 4
T <- 10


#
# Heavy-tailed data
#
set.seed(42)
X_heavytailed <- rmvt(n = 200, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)
X_ <- X_heavytailed[1:T, ]
Sigma_scm <- 1/T * t(X_) %*% X_


# scatter plot
ggplot(data.frame(x = X[, 1], y = X[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "blue", "2" = "red"),
                     labels = c("true", "Gaussian ML estimator")) +
  labs(title = "Heavy-tailed data", x = NULL, y = NULL)

Gaussian data with outliers:

mu_true = c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
T <- 10

#
# Gaussian data with one outlier
#
set.seed(42)
X_gaussian <- rmvnorm(n = 200, mean = mu_true, sigma = Sigma_true)
X_gaussian_with_outlier <- rbind(c(-0.15,  0.05), X_gaussian)
X_ <- X_gaussian_with_outlier[1:T, ]
Sigma_scm <- 1/T * t(X_) %*% X_

# scatter plot
ggplot(data.frame(x = X[, 1], y = X[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "blue", "2" = "red"),
                     labels = c("true", "Gaussian ML estimator")) +
  labs(title = "Gaussian data with one outlier", x = NULL, y = NULL)

Heavy-tailed ML estimators

Recall that the ML estimators can be iteratively computed as the weighted sample mean and sample covariance matrix: \[ \begin{aligned} \bmu^{k+1} &= \frac{\frac{1}{T}\sum_{t=1}^T w_t(\bmu^k,\bSigma^k) \times \bm{x}_t}{\frac{1}{T}\sum_{t=1}^T w_t(\bmu^k,\bSigma^k)}\\ \bSigma^{k+1} &= \frac{1}{T}\sum_{t=1}^T w_t(\bmu^{k+1},\bSigma^k) \times (\bm{x}_t - \bmu^{k+1})(\bm{x}_t - \bmu^{k+1})^\T \end{aligned} \] where the weights \(w_t(\bmu,\bSigma)\) are defined as \[w_t(\bmu,\bSigma) = \frac{\nu+N}{\nu + (\bm{x}_t - \bmu)^\T\bSigma^{-1}(\bm{x}_t - \bmu)}.\]

Effect of heavy tails and outliers in heavy-tailed ML covariance matrix estimator:

Gaussian data:

mu_true = c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
T <- 10

#
# Gaussian data
#
set.seed(42)
X_gaussian <- rmvnorm(n = 200, mean = mu_true, sigma = Sigma_true)
X_ <- X_gaussian[1:T, ]
Sigma_scm <- 1/T * t(X_) %*% X_
Sigma_heavytail <- fit_mvt(X_, optimize_mu = FALSE, nu = 4, scale_covmat = TRUE)$cov
colnames(Sigma_heavytail) <- rownames(Sigma_heavytail) <- NULL

# scatter plot
ggplot(data.frame(x = X[, 1], y = X[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_heavytail)), aes(x, y, color = "3"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "darkblue", "2" = "darkred", "3" = "darkgreen"),
                     labels = c("true", "Gaussian ML estimator", "Heavy-tailed ML estimator")) +
  labs(title = "Gaussian data", x = NULL, y = NULL)

Heavy-tailed data:

mu_true = c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
nu <- 4
T <- 10

#
# Heavy-tailed data
#
set.seed(42)
X_heavytailed <- rmvt(n = 200, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)
X_ <- X_heavytailed[1:T, ]
Sigma_scm <- 1/T * t(X_) %*% X_
Sigma_heavytail <- fit_mvt(X_, optimize_mu = FALSE, nu = 4, scale_covmat = TRUE)$cov
colnames(Sigma_heavytail) <- rownames(Sigma_heavytail) <- NULL

# scatter plot
ggplot(data.frame(x = X[, 1], y = X[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_heavytail)), aes(x, y, color = "3"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "darkblue", "2" = "darkred", "3" = "darkgreen"),
                     labels = c("true", "Gaussian ML estimator", "Heavy-tailed ML estimator")) +
  labs(title = "Heavy-tailed data", x = NULL, y = NULL)

Gaussian data with outliers:

mu_true = c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
T <- 10

#
# Gaussian data with one outlier
#
set.seed(42)
X_gaussian <- rmvnorm(n = 200, mean = mu_true, sigma = Sigma_true)
X_gaussian_with_outlier <- rbind(c(-0.15,  0.05), X_gaussian)
X_ <- X_gaussian_with_outlier[1:T, ]
Sigma_scm <- 1/T * t(X_) %*% X_
Sigma_heavytail <- fit_mvt(X_, optimize_mu = FALSE, nu = 4, scale_covmat = TRUE)$cov
colnames(Sigma_heavytail) <- rownames(Sigma_heavytail) <- NULL

# scatter plot
ggplot(data.frame(x = X[, 1], y = X[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_heavytail)), aes(x, y, color = "3"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "darkblue", "2" = "darkred", "3" = "darkgreen"),
                     labels = c("true", "Gaussian ML estimator", "Heavy-tailed ML estimator")) +
  labs(title = "Gaussian data with one outlier", x = NULL, y = NULL)

Estimation error of different ML estimators versus number of observations (for \(t\)-distributed heavy-tailed data with \(\nu=4\) and \(N=100\)):

library(mvtnorm)
library(fitHeavyTail)
library(ICSNP)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
nu <- 4

# generate synthetic data
set.seed(42)
T_max <- 1000
X <- rmvt(n = 10*T_max, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)

# main loop
df <- data.frame()
T_sweep <- seq(from = 150, by = 50, to = T_max)
for(T in T_sweep) {
  for (i in 1:100) {
    X_ <- X[sample(nrow(X), T), ]
    
    # Gaussian MLE
    mu    <- colMeans(X_)
    Sigma <- (T-1)/T * cov(X_)
    df <- rbind(df, data.frame("T"            = T,
                               "method"       = "Gaussian MLE",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               "MAE mu"       = sum(abs(mu - mu_true))/sum(abs(mu_true)),
                               "error Sigma"  = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
    
    # heavy-tailed MLE
    fitted <- fit_mvt(X_, nu = "iterative", nu_iterative_method = "POP")
    df <- rbind(df, data.frame("T"            = T,
                               "method"       = "heavy-tailed MLE",
                               "error mu"     = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "MAE mu"       = sum(abs(fitted$mu - mu_true))/sum(abs(mu_true)),
                               "error Sigma"  = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))    
    
    # Tyler (with spatial median)
    fitted <- fit_Tyler(X_, initial = list(mu = ICSNP::spatial.median(X_)))
    df <- rbind(df, data.frame("T"            = T,
                               "method"       = "Tyler estimator",
                               "error mu"     = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "MAE mu"       = sum(abs(fitted$mu - mu_true))/sum(abs(mu_true)),
                               "error Sigma"  = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T) |>
  summarize("error mu"    = 100*mean(`error mu`),
            "RMSE mu"     = 100*sqrt(mean(`error mu`)),
            "MAE mu"      = 100*mean(`MAE mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = T, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  scale_color_discrete(name = "estimator", labels = c("Gaussian MLE", "heavy-tailed MLE", "spatial median")) +
  labs(title = "Error of mean estimators", x = "T", y = "normalized error (%)")

p_error_Sigma <- df |>
  ggplot(aes(x = T, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  labs(title = "Error of covariance estimators", x = "T", y = "normalized error (%)")

p_error_mu / p_error_Sigma

Estimation error of different ML estimators versus degrees of freedom in a \(t\) distribution (with \(T=200\) and \(N=100\)):

library(mvtnorm)
library(fitHeavyTail)
library(ICSNP)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
T <- 200

# main loop
set.seed(42)
df <- data.frame()
nu_sweep <- seq(from = 3, by = 2, to = 30)
for(nu in nu_sweep) {
  for (i in 1:200) {
    # generate synthetic data
    X <- rmvt(n = T, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)
    
    # Gaussian MLE
    mu    <- colMeans(X)
    Sigma <- (T-1)/T * cov(X)
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "Gaussian MLE",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma"  = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
    
    # heavy-tailed MLE
    fitted <- fit_mvt(X, nu = "iterative", nu_iterative_method = "POP")
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "heavy-tailed MLE",
                               "error mu"     = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma"  = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))

    # Tyler (with spatial median)
    fitted <- fit_Tyler(X, initial = list(mu = ICSNP::spatial.median(X)))
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "Tyler estimator",
                               "error mu"     = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma"  = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T, nu) |>
  summarize("error mu"    = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = nu, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  scale_color_discrete(name = "estimator", labels = c("Gaussian MLE", "heavy-tailed MLE", "spatial median")) +
  labs(title = "Error of mean estimators", x = TeX("$\\nu$"), y = "normalized error (%)")

p_error_Sigma <- df |>
  ggplot(aes(x = nu, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  labs(title = "Error of covariance estimators", x = TeX("$\\nu$"), y = "normalized error (%)")

p_error_mu / p_error_Sigma

Convergence of iterative robust heavy-tailed ML estimators:

library(mvtnorm)
library(fitHeavyTail)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
T <- 200
nu <- 4

# generate synthetic data
set.seed(42)
X <- rmvt(n = T, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)

# heavy-tailed MLE
fitted<- fit_mvt(X, nu = "iterative", nu_iterative_method = "ECM", ftol = 1e-3, return_iterates = TRUE)

# plot
data.frame("k"                       = 0:fitted$num_iterations,
           "negative log-likelihood" = -fitted$log_likelihood_vs_iterations,
           check.names = FALSE) |>
  ggplot(aes(x = k, y = `negative log-likelihood`)) +
  geom_line(linewidth = 1.2, color = "blue") +
  ggtitle("Converge of objective function") + xlab("iterations")

Prior information

Shrinkage

First, we define a simple and convenient function to shrink the mean vector:

shrink_mu <- function(X, mu = NULL, cov = NULL, mu_sh_use_cor = TRUE, 
                      mu_target = c("grand mean", "volatility weighted", "covmat weighted")) {
  T <- nrow(X)
  N <- ncol(X)

  if (is.null(mu))
    mu <- colMeans(X)
  if (is.null(cov))
    cov <- cov(X)
  
  # target
  if (!is.numeric(mu_target))
    mu_target <- switch(match.arg(mu_target),
                        "grand mean" = mean(mu),
                        "volatility weighted" = {
                          vol <- sqrt(diag(cov))
                          mean(mu/vol) * vol
                          },
                        "covmat weighted" = {
                          invSigma_ones <- if (mu_sh_use_cor) solve(cov2cor(cov), rep(1, N))
                                           else solve(cov, rep(1, N))
                          invSigma_ones <- pmax(0, invSigma_ones)
                          as.numeric(mu %*% invSigma_ones)/sum(invSigma_ones)  # return achieved by GMVP
                          },
                        stop("Shrinkage estimation method for mu not recognized!"))
  
  # shrinkage
  rho <- (N + 2) / ((N + 2) + T * as.numeric((mu - mu_target) %*% solve(cov, mu - mu_target)))  # improved James-Stein estimator
  #rho <- (N - 2)/(T*as.numeric((mu - mu_target) %*% solve(cov, mu - mu_target)))  # original James-Stein estimator
  rho <- max(0, min(1, rho))
  mu_shrinked <- (1 - rho) * mu + rho * mu_target
  return(mu_shrinked)
}

Estimation error of different shrinkage estimators versus number of observations (for Gaussian data with \(N=100\)):

library(mvtnorm)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
N <- ncol(X)
mu_true <- colMeans(X)
Sigma_true <- cov(X)
nu <- 8

# generate synthetic data
set.seed(42)
T_max <- 550
X <- rmvt(n = 10*T_max, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)

# main loop
df <- data.frame()
T_sweep <- seq(from = 50, by = 50, to = T_max)
for(T in T_sweep) {
  for (i in 1:200) {
    X_ <- X[sample(nrow(X), T), ]

    #############                    #############
    #############  Mean estimation   #############
    #############                    #############
    Sigma <- cov(X_)
    Sigma_reg <- Sigma + mean(diag(Sigma)) * diag(N)    # regularize Sigma for stability
    
    #
    # plain sample mean
    #
    mu    <- colMeans(X_)
    df <- rbind(df, data.frame("T"            = T,
                               "parameter"    = "mu",
                               "method"       = "sample mean",
                               "error"        = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               check.names    = FALSE))
    #
    # sample mean + shrinkage to zero
    #
    mu_JS <- shrink_mu(X_, mu = colMeans(X_), mu_target = 0, cov = Sigma_reg)
    df <- rbind(df, data.frame("T"            = T,
                               "parameter"    = "mu",
                               "method"       = "shrinkage to zero",
                               "error"        = norm(mu_JS - mu_true, "2")/norm(mu_true, "2"),
                               check.names    = FALSE))
    #   
    # sample mean + shrinkage to grand mean
    #
    mu_JS <- shrink_mu(X_, mu = colMeans(X_), mu_target = "grand mean", cov = Sigma_reg)
    df <- rbind(df, data.frame("T"            = T,
                               "parameter"    = "mu",
                               "method"       = "shrinkage to grand mean",
                               "error"        = norm(mu_JS - mu_true, "2")/norm(mu_true, "2"),
                               check.names    = FALSE))
    #
    # sample mean + shrinkage to volatility weighted mean
    #
    mu_JS <- shrink_mu(X_, mu = colMeans(X_), mu_target = "covmat weighted", cov = Sigma_reg, mu_sh_use_cor = FALSE)
    df <- rbind(df, data.frame("T"            = T,
                               "parameter"    = "mu",
                               "method"       = "shrinkage to volatility-weighted mean",
                               "error"        = norm(mu_JS - mu_true, "2")/norm(mu_true, "2"),
                               check.names    = FALSE))

    
    #############                                 #############
    #############  Covariance matrix estimation   #############
    #############                                 #############
    
    #
    # plain SCM
    #
    Sigma <- cov(X_)
    df <- rbind(df, data.frame("T"            = T,
                               "parameter"    = "Sigma",
                               "method"       = "sample covariance matrix",
                               "error"        = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))

    #
    # SCM + Ledoit-Wolf shrinked to scaled identity
    #
    S <- cov(X_)
    Sigma_target <- mean(diag(S)) * diag(N)
    Xc <- X_ - matrix(colMeans(X_), T, N, byrow = TRUE)
    rho <- min(1, ((1/T^2) * sum(rowSums(Xc^2)^2) - 1/T * sum(S^2)) / sum((S - Sigma_target)^2))
    Sigma_sh <- (1 - rho)*S + rho*Sigma_target
    df <- rbind(df, data.frame("T"            = T,
                               "parameter"    = "Sigma",
                               "method"       = "shrinkage to scaled identity",
                               "error"        = norm(Sigma_sh - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE)) 
    
    #
    # SCM + Ledoit-Wolf shrinked to diagonal matrix
    #
    S <- cov(X_)
    Sigma_target <- diag(diag(S))
    Xc <- X_ - matrix(colMeans(X_), T, N, byrow = TRUE)
    rho <- min(1, ((1/T^2) * sum(rowSums(Xc^2)^2) - 1/T * sum(S^2)) / sum((S - Sigma_target)^2))
    Sigma_sh <- (1 - rho)*S + rho*Sigma_target
    df <- rbind(df, data.frame("T"            = T,
                               "parameter"    = "Sigma",
                               "method"       = "shrinkage to diagonal matrix",
                               "error"        = norm(Sigma_sh - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE)) 
    
    
    #
    # SCM + Ledoit-Wolf shrinked to equal-correlation matrix
    #    
    S <- cov(X_)
    sigma <- sqrt(diag(S))
    C <- cov2cor(S)
    ave_corr <- mean(C[lower.tri(C)])
    C[lower.tri(C)] <- ave_corr
    C[upper.tri(C)] <- ave_corr
    Sigma_target <- t(C * sigma) * sigma
    Xc <- X_ - matrix(colMeans(X_), T, N, byrow = TRUE)
    rho <- min(1, ((1/T^2) * sum(rowSums(Xc^2)^2) - 1/T * sum(S^2)) / sum((S - Sigma_target)^2))
    Sigma_sh <- (1 - rho)*S + rho*Sigma_target
    df <- rbind(df, data.frame("T"            = T,
                               "parameter"    = "Sigma",
                               "method"       = "shrinkage to equal-correlation matrix",
                               "error"        = norm(Sigma_sh - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE)) 
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(parameter, method, T) |>
  summarize("error" = 100*mean(`error`)) |>
  ungroup()

# plot
p_error_mu <- df[df$parameter == "mu", ] |>
  ggplot(aes(x = T, y = `error`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  scale_x_continuous(breaks = seq(from = T_sweep[1], to = last(T_sweep), by = 100)) +
  labs(title = "Error of mean estimators", x= "T", y = "normalized error (%)")
    
p_error_Sigma <- df[df$parameter == "Sigma", ] |>
  ggplot(aes(x = T, y = `error`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  scale_x_continuous(breaks = seq(from = T_sweep[1], to = last(T_sweep), by = 100)) +
  labs(title = "Error of covariance estimators", x= "T", y = "normalized error (%)")

p_error_mu / p_error_Sigma

Factor models

Estimation error of covariance matrix under factor modeling versus number of observations (with \(N=100\)):

library(mvtnorm)
library(covFactorModel)

# get realistic mu and Sigma
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
N <- ncol(X)
mu_true <- rep(0, N)  #colMeans(X)

# Forcing low-rank covmat - method #1
Sigma_true <- covFactorModel(X, type = "Stat-PCA", K = 3)

# generate synthetic data
set.seed(42)
T_max <- 500
X <- rmvt(n = 10*T_max, delta = mu_true, sigma = Sigma_true, df = Inf)
X <- as.xts(X, order.by = as.Date("1995-03-15") + 1:nrow(X))

# main loop
df <- data.frame()
T_sweep <- seq(from = 100, by = 50, to = T_max)
eigenvalues_true <- eigen(Sigma_true)$values
for(T in T_sweep) {
  for (i in 1:200) {
    X_ <- X[sample(nrow(X), T), ]

    # SCM
    Sigma <- cov(X_)
    df <- rbind(df, data.frame("T"                 = T,
                               "method"            = "sample covariance matrix",
                               "error eigenvalues" = norm(eigen(Sigma)$values - eigenvalues_true, "2")/norm(eigenvalues_true, "2"),
                               "error Sigma"       = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))

    # 1-factor model
    Sigma <- covFactorModel(X_, type = "Stat-PCA", K = 1)
    df <- rbind(df, data.frame("T"                 = T,
                               "method"            = "factor model (K=1)",
                               "error eigenvalues" = norm(eigen(Sigma)$values - eigenvalues_true, "2")/norm(eigenvalues_true, "2"),
                               "error Sigma"       = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
    
    # 3-factor model
    Sigma <- covFactorModel(X_, type = "Stat-PCA", K = 3)
    df <- rbind(df, data.frame("T"                 = T,
                               "method"            = "factor model (K=3)",
                               "error eigenvalues" = norm(eigen(Sigma)$values - eigenvalues_true, "2")/norm(eigenvalues_true, "2"),
                               "error Sigma"       = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))

    # 5-factor model
    Sigma <- covFactorModel(X_, type = "Stat-PCA", K = 5)
    df <- rbind(df, data.frame("T"                 = T,
                               "method"            = "factor model (K=5)",
                               "error eigenvalues" = norm(eigen(Sigma)$values - eigenvalues_true, "2")/norm(eigenvalues_true, "2"),
                               "error Sigma"       = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

df <- df |>
  group_by(method, T) |>
  summarize("error Sigma"       = 100*mean(`error Sigma`),
            "error eigenvalues" = 100*mean(`error eigenvalues`)) |>
  ungroup()

`summarise()` has grouped output by 'method'. You can override using the
`.groups` argument.

# plot
df |>
  ggplot(aes(x = T, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  labs(title = "Error of covariance estimators", x= "T", y = "normalized error (%)")