R Code for Portfolio Optimization
Chapter 3 – Financial Data: I.I.D. Modeling

Daniel P. Palomar (2025). Portfolio Optimization: Theory and Application. Cambridge University Press.

Last update: February 17, 2025


Packages

The following packages are used in the examples:

# basic finance
library(xts)                    # to manipulate time series of stock data
library(pob)                    # book package with financial data
                                # install with: devtools::install_github("dppalomar/pob")

# data
library(mvtnorm)                # to generate heavy-tailed data
library(fitHeavyTail)           # to fit heavy-tailed distributions
library(ICSNP)                  # to calculate spatial mean
library(covFactorModel)         # factor model estimation
                                # devtools::install_github("dppalomar/covFactorModel")

# plotting
library(ggplot2)                # for nice plots
library(patchwork)              # for combining plots
library(reshape2)               # to reshape data
library(dplyr); conflictRules('dplyr', exclude = 'lag'); options(xts.warn_dplyr_breaks_lag = FALSE)
library(scales)
library(latex2exp)              # for latex symbols with TeX()
library(ellipse)

I.I.D. model

Example of a synthetic Gaussian i.i.d. time series:

library(mvtnorm)

# Use the S&P 500 index log-returns to obtain a realistic mean and variance
# (avoid shadowing base::var with a variable called "var")
sp500_prices  <- SP500_2015to2020$index
sp500_returns <- diff(log(sp500_prices))[-1]
mu_realistic  <- mean(sp500_returns)
var_realistic <- var(sp500_returns)

# Simulate T Gaussian i.i.d. observations with those moments
set.seed(42)
T <- 500
x <- rmvnorm(n = T, mean = mu_realistic, sigma = var_realistic)

# Plot the synthetic time series
data.frame(t = seq_len(T), x = x) |>
  ggplot(aes(x = t, y = x)) +
  geom_line(linewidth = 0.8, color = "blue", show.legend = FALSE) +
  labs(title = "i.i.d. time series", x = NULL, y = NULL)

Sample estimators

Let’s warm up using the classical estimators for the mean and covariance matrix, namely the sample mean and the sample covariance matrix: \[ \begin{aligned} \hat{\bmu} &= \frac{1}{T}\sum_{t=1}^{T}\bm{x}_{t}\\ \hat{\bSigma} &= \frac{1}{T-1}\sum_{t=1}^{T}(\bm{x}_t - \hat{\bmu})(\bm{x}_t - \hat{\bmu})^\T. \end{aligned} \]

Illustration of sample estimators in 2D:

library(mvtnorm)
library(ellipse)

# True moments of the 2D illustrative example
mu_true <- c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
T <- 10

#
# Gaussian data
#
set.seed(42)
X <- rmvnorm(n = 200, mean = mu_true, sigma = Sigma_true)
X_ <- X[1:T, ]                                   # only the first T samples are used for estimation
mu_sm <- colMeans(X_)                            # sample mean
X_centered <- X_ - matrix(mu_sm, T, 2, byrow = TRUE)
# Sample covariance matrix with the unbiased 1/(T-1) normalization, matching
# the definition of the sample estimator given in the text
# (the 1/T version is the Gaussian ML estimator, illustrated later).
Sigma_scm <- 1/(T-1) * t(X_centered) %*% X_centered

# scatter plot of the full cloud with true and estimated quantities overlaid
ggplot(data.frame(x = X[, 1], y = X[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  geom_point(data = data.frame(x=0, y=0), aes(x, y), color = "blue", size = 4) +
  geom_point(data = data.frame(x=mu_sm[1], y=mu_sm[2]), aes(x, y), color = "red", size = 4) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "blue", "2" = "red"),
                     labels = c("true", "sample estimator")) +
  labs(title = "Gaussian data", x = NULL, y = NULL)

Estimation error of sample estimators versus number of observations (for Gaussian data with \(N=100\)):

library(mvtnorm)

# get realistic mu and Sigma from S&P 500 daily log-returns (N = 100 stocks)
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)

# generate synthetic data (a large Gaussian pool; each run resamples T rows)
set.seed(42)
T_max <- 2000
X <- rmvnorm(n = 10*T_max, mean = mu_true, sigma = Sigma_true)

# main loop: for each T, 100 Monte Carlo runs of the sample estimators
df <- data.frame()
T_sweep <- seq(from = 50, by = 50, to = T_max)
for(T in T_sweep) {
  for (i in 1:100) {
    X_ <- X[sample(nrow(X), T), ]  # resample T observations from the pool
    mu <- colMeans(X_)             # sample mean
    Sigma <- cov(X_)               # sample covariance matrix (1/(T-1) normalization)
    df <- rbind(df, 
                data.frame("T"            = T,
                           "method"       = "sample mean",
                           "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                           "error Sigma"  = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                           check.names    = FALSE))
  }
}
 
# average the normalized errors (in %) over the Monte Carlo runs
df <- df |>
  group_by(method, T) |>
  summarize("error mu"    = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = T, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  labs(title = "Estimation error of sample mean", x = "T", y = "normalized error (%)")

p_error_Sigma <- df |>
  ggplot(aes(x = T, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  labs(title = "Estimation error of sample covariance matrix", x = "T", y = "normalized error (%)")

p_error_mu / p_error_Sigma

Location estimators

Illustration of different location estimators in 2D:

library(mvtnorm)
library(ellipse)
library(ICSNP)

# True moments of the 2D example
mu_true <- c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
nu <- 4
scatter_true <- (nu-2)/nu * Sigma_true  # since cov = nu/(nu-2) * scatter
T <- 10

# Simulate heavy-tailed (Student t) observations
set.seed(42)
X <- rmvt(n = 800, delta = mu_true, sigma = scatter_true, df = nu)
X_ <- X[1:T, ]  # subset actually used by the estimators

# Three location estimators computed on the first T samples
mu_mean           <- colMeans(X_)
mu_median         <- apply(X_, 2, median)
mu_spatial_median <- ICSNP::spatial.median(X_)

# Collect the true location and the three estimates for plotting
df <- data.frame(
  "location" = c("true", "sample mean", "median", "spatial median"),
  x = c(mu_true[1], mu_mean[1], mu_median[1], mu_spatial_median[1]),
  y = c(mu_true[2], mu_mean[2], mu_median[2], mu_spatial_median[2]))
df$location <- factor(df$location, levels = unique(df$location))

# scatter plot of the full cloud with the location estimators overlaid
colnames(Sigma_true) <- rownames(Sigma_true) <- NULL
ggplot(data.frame(x = X[, 1], y = X[, 2]), aes(x, y)) +
  geom_point(alpha = 0.7, size = 1) +
  geom_point(data = df, aes(x, y, fill = location, shape = location), size = 6) +
  scale_shape_manual(values = c(21, 22, 23, 25)) +
  coord_cartesian(xlim = c(-0.08, 0.08), ylim = c(-0.08, 0.08)) +
  labs(title = "Scatter plot of data points with different location estimators", x = NULL, y = NULL)

Estimation error of location estimators versus number of observations (with \(N=100\)):

library(mvtnorm)

# get realistic mu and Sigma from S&P 500 daily log-returns (N = 100 stocks)
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
nu <- 4
scatter_true <- (nu-2)/nu * Sigma_true  # cov=nu/(nu-2)*scatter

# generate synthetic data pools (Gaussian and Student t with the same covariance)
set.seed(42)
T_max <- 1000
X_Gaussian   <- rmvnorm(n = 10*T_max, mean  = mu_true, sigma = Sigma_true)
X_heavy_tail <- rmvt(n = 10*T_max,    delta = mu_true, sigma = scatter_true, df = nu)

# main loop: for each T and each distribution, 100 Monte Carlo runs of three estimators
df <- data.frame()
T_sweep <- seq(from = 50, by = 50, to = T_max)
for(T in T_sweep) {
  for (distribution in c("Gaussian", "heavy-tailed")) {
    for (i in 1:100) {
      if (distribution == "Gaussian")
        X_ <- X_Gaussian[sample(nrow(X_Gaussian), T), ]
      else if (distribution == "heavy-tailed")
        X_ <- X_heavy_tail[sample(nrow(X_heavy_tail), T), ]
      else stop("Unknown distribution.")
    
      # sample mean
      mu <- colMeans(X_)
      df <- rbind(df, data.frame("T"            = T,
                                 "distribution" = distribution,
                                 "method"       = "sample mean",
                                 "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 check.names    = FALSE))
      # median (applied coordinate-wise to each column)
      mu <- apply(X_, 2, median)
      df <- rbind(df, data.frame("T"            = T,
                                 "distribution" = distribution,
                                 "method"       = "median",
                                 "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 check.names    = FALSE))
      # spatial median
      mu <- ICSNP::spatial.median(X_)
      df <- rbind(df, data.frame("T"            = T,
                                 "distribution" = distribution,
                                 "method"       = "spatial median",
                                 "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 check.names    = FALSE))
    }
  }
}
# fix the legend ordering of the estimators
df$method <- factor(df$method, levels = unique(df$method))

# average the normalized errors (in %) over the Monte Carlo runs
df <- df |>
  group_by(method, T, distribution) |>
  summarize("error mu" = 100*mean(`error mu`)) |>
  ungroup()

# plot (fixed: removed stray trailing comma inside aes() and "plot0" typo)
p_Gaussian <- df[df$distribution == "Gaussian", ] |>
  ggplot(aes(x = T, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  coord_cartesian(xlim = c(1, T_max)) +
  labs(title = "Error of location estimators under Gaussian data", x = "T", y = "normalized error (%)")

p_heavy_tail <- df[df$distribution == "heavy-tailed", ] |>
  ggplot(aes(x = T, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  coord_cartesian(xlim = c(1, T_max)) +
  labs(title = "Error of location estimators under heavy-tailed data", x = "T", y = "normalized error (%)")

# stack the two panels and share a single legend
p_Gaussian / p_heavy_tail + plot_layout(guides = "collect")

Estimation error of location estimators versus degrees of freedom in a \(t\) distribution (with \(T=200\) and \(N=100\)):

library(mvtnorm)

# get realistic mu and Sigma from S&P 500 daily log-returns (N = 100 stocks)
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
T <- 200

# main loop: sweep the t-distribution degrees of freedom nu (200 runs each)
set.seed(42)
df <- data.frame()
nu_sweep <- seq(from = 3, by = 2, to = 30)
for(nu in nu_sweep) {
  for (i in 1:200) {
    # generate synthetic data (scatter scaled so the covariance equals Sigma_true)
    X_ <- rmvt(n = T, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)
    
    # sample mean
    mu    <- colMeans(X_)
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "sample mean",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               check.names    = FALSE))
    # median (applied coordinate-wise to each column)
    mu <- apply(X_, 2, median)
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "median",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               check.names    = FALSE))
    # spatial median
    mu <- ICSNP::spatial.median(X_)
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "spatial median",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               check.names    = FALSE))
  }
}
# fix the legend ordering of the estimators
df$method <- factor(df$method, levels = unique(df$method))

# average the normalized errors (in %) over the Monte Carlo runs
df <- df |>
  group_by(method, T, nu) |>
  summarize("error mu"    = 100*mean(`error mu`)) |>
  ungroup()

# plot
df |>
  ggplot(aes(x = nu, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  labs(title = "Error of location estimators", x = TeX("$\\nu$"), y = "normalized error (%)")

Gaussian ML estimators

Estimation error of Gaussian ML estimators versus number of observations (with \(N=100\)):

library(mvtnorm)

# get realistic mu and Sigma from S&P 500 daily log-returns (N = 100 stocks)
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
nu <- 4
scatter_true <- (nu-2)/nu * Sigma_true  # cov=nu/(nu-2)*scatter

# generate synthetic data pools (Gaussian and Student t with the same covariance)
set.seed(42)
T_max <- 1000
X_Gaussian   <- rmvnorm(n = 10*T_max, mean  = mu_true, sigma = Sigma_true)
X_heavy_tail <- rmvt(n = 10*T_max,    delta = mu_true, sigma = scatter_true, df = nu)

# main loop: for each T and distribution, 100 Monte Carlo runs of the Gaussian ML estimators
df <- data.frame()
T_sweep <- seq(from = 50, by = 50, to = T_max)
for(T in T_sweep) {
  for (distribution in c("Gaussian", "heavy-tailed")) {
    for (i in 1:100) {
      if (distribution == "Gaussian")
        X_ <- X_Gaussian[sample(nrow(X_Gaussian), T), ]
      else if (distribution == "heavy-tailed")
        X_ <- X_heavy_tail[sample(nrow(X_heavy_tail), T), ]
      else stop("Unknown distribution.")
    
      # Gaussian ML estimators: sample mean and 1/T-normalized covariance
      mu    <- colMeans(X_)
      Sigma <- (T-1)/T * cov(X_)
      df <- rbind(df, data.frame("T"            = T,
                                 "distribution" = distribution,
                                 "method"       = "Gaussian ML",
                                 "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                                 "error Sigma"  = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                                 check.names    = FALSE))
    }
  }
}
df$method <- factor(df$method, levels = unique(df$method))

# average the normalized errors (in %) over the Monte Carlo runs
df <- df |>
  group_by(method, T, distribution) |>
  summarize("error mu"    = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = T, y = `error mu`, color = distribution)) +
  geom_line(linewidth = 1.2) +
  labs(color = "data distribution") +
  labs(title = "Error of Gaussian ML mean estimator", x = "T", y = "normalized error (%)")

p_error_Sigma <- df |>
  ggplot(aes(x = T, y = `error Sigma`, color = distribution)) +
  geom_line(linewidth = 1.2) +
  labs(color = "data distribution") +
  labs(title = "Error of Gaussian ML covariance estimator", x = "T", y = "normalized error (%)")

p_error_mu / p_error_Sigma

Estimation error of Gaussian ML estimators versus degrees of freedom in a \(t\) distribution (with \(T=200\) and \(N=100\)):

library(mvtnorm)

# get realistic mu and Sigma from S&P 500 daily log-returns (N = 100 stocks)
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
T <- 200

# main loop: sweep the t-distribution degrees of freedom nu (200 runs each)
set.seed(42)
df <- data.frame()
nu_sweep <- seq(from = 3, by = 2, to = 30)
for(nu in nu_sweep) {
  for (i in 1:200) {
    # generate synthetic data (fixed: use X_ so the returns matrix X is not
    # clobbered, consistent with the analogous sweep chunks in this file;
    # scatter scaled so the covariance equals Sigma_true for every nu)
    X_ <- rmvt(n = T, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)
    
    # Gaussian ML estimators: sample mean and 1/T-normalized covariance
    mu    <- colMeans(X_)
    Sigma <- (T-1)/T * cov(X_)
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "Gaussian ML",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma"  = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

# average the normalized errors (in %) over the Monte Carlo runs
df <- df |>
  group_by(method, T, nu) |>
  summarize("error mu"    = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = nu, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  coord_cartesian(ylim = c(200, 250)) +
  labs(title = "Estimation error of Gaussian ML mean estimator", x = TeX("$\\nu$"), y = "normalized error (%)")

p_error_Sigma <- df |>
  ggplot(aes(x = nu, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2, show.legend = FALSE) +
  labs(title = "Estimation error of Gaussian ML covariance estimator", x = TeX("$\\nu$"), y = "normalized error (%)")

p_error_mu / p_error_Sigma

The failure of Gaussian ML estimators

Effect of heavy tails and outliers in the Gaussian ML covariance matrix estimator:

  • Gaussian data:
mu_true <- c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
T <- 10

#
# Gaussian data
#
set.seed(42)
X_gaussian <- rmvnorm(n = 200, mean = mu_true, sigma = Sigma_true)
X_ <- X_gaussian[1:T, ]
# Gaussian ML covariance estimator (mean known to be zero, 1/T normalization)
Sigma_scm <- 1/T * t(X_) %*% X_

# scatter plot
# (fixed: plot the freshly generated X_gaussian; the original referenced a
# stale variable X left over from earlier chunks)
ggplot(data.frame(x = X_gaussian[, 1], y = X_gaussian[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "blue", "2" = "red"),
                     labels = c("true", "Gaussian ML estimator")) +
  labs(title = "Gaussian data", x = NULL, y = NULL)

  • Heavy-tailed data:
mu_true <- c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
nu <- 4
T <- 10


#
# Heavy-tailed data
#
set.seed(42)
X_heavytailed <- rmvt(n = 200, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)
X_ <- X_heavytailed[1:T, ]
# Gaussian ML covariance estimator (mean known to be zero, 1/T normalization)
Sigma_scm <- 1/T * t(X_) %*% X_


# scatter plot
# (fixed: plot the freshly generated X_heavytailed; the original referenced a
# stale variable X left over from earlier chunks)
ggplot(data.frame(x = X_heavytailed[, 1], y = X_heavytailed[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "blue", "2" = "red"),
                     labels = c("true", "Gaussian ML estimator")) +
  labs(title = "Heavy-tailed data", x = NULL, y = NULL)

  • Gaussian data with outliers:
mu_true <- c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
T <- 10

#
# Gaussian data with one outlier
#
set.seed(42)
X_gaussian <- rmvnorm(n = 200, mean = mu_true, sigma = Sigma_true)
X_gaussian_with_outlier <- rbind(c(-0.15,  0.05), X_gaussian)  # prepend a single outlier
X_ <- X_gaussian_with_outlier[1:T, ]
# Gaussian ML covariance estimator (mean known to be zero, 1/T normalization)
Sigma_scm <- 1/T * t(X_) %*% X_

# scatter plot
# (fixed: plot the freshly generated X_gaussian_with_outlier; the original
# referenced a stale variable X left over from earlier chunks)
ggplot(data.frame(x = X_gaussian_with_outlier[, 1], y = X_gaussian_with_outlier[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "blue", "2" = "red"),
                     labels = c("true", "Gaussian ML estimator")) +
  labs(title = "Gaussian data with one outlier", x = NULL, y = NULL)

Heavy-tailed ML estimators

Recall that the ML estimators can be iteratively computed as the weighted sample mean and sample covariance matrix: \[ \begin{aligned} \bmu^{k+1} &= \frac{\frac{1}{T}\sum_{t=1}^T w_t(\bmu^k,\bSigma^k) \times \bm{x}_t}{\frac{1}{T}\sum_{t=1}^T w_t(\bmu^k,\bSigma^k)}\\ \bSigma^{k+1} &= \frac{1}{T}\sum_{t=1}^T w_t(\bmu^{k+1},\bSigma^k) \times (\bm{x}_t - \bmu^{k+1})(\bm{x}_t - \bmu^{k+1})^\T \end{aligned} \] where the weights \(w_t(\bmu,\bSigma)\) are defined as \[w_t(\bmu,\bSigma) = \frac{\nu+N}{\nu + (\bm{x}_t - \bmu)^\T\bSigma^{-1}(\bm{x}_t - \bmu)}.\]

Effect of heavy tails and outliers in heavy-tailed ML covariance matrix estimator:

  • Gaussian data:
mu_true <- c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
T <- 10

#
# Gaussian data
#
set.seed(42)
X_gaussian <- rmvnorm(n = 200, mean = mu_true, sigma = Sigma_true)
X_ <- X_gaussian[1:T, ]
# Gaussian ML covariance estimator (mean known to be zero, 1/T normalization)
Sigma_scm <- 1/T * t(X_) %*% X_
# Heavy-tailed (Student t, nu = 4) ML covariance estimator
Sigma_heavytail <- fit_mvt(X_, optimize_mu = FALSE, nu = 4, scale_covmat = TRUE)$cov
colnames(Sigma_heavytail) <- rownames(Sigma_heavytail) <- NULL

# scatter plot
# (fixed: plot the freshly generated X_gaussian; the original referenced a
# stale variable X left over from earlier chunks)
ggplot(data.frame(x = X_gaussian[, 1], y = X_gaussian[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_heavytail)), aes(x, y, color = "3"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "darkblue", "2" = "darkred", "3" = "darkgreen"),
                     labels = c("true", "Gaussian ML estimator", "Heavy-tailed ML estimator")) +
  labs(title = "Gaussian data", x = NULL, y = NULL)

  • Heavy-tailed data:
mu_true <- c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
nu <- 4
T <- 10

#
# Heavy-tailed data
#
set.seed(42)
X_heavytailed <- rmvt(n = 200, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)
X_ <- X_heavytailed[1:T, ]
# Gaussian ML covariance estimator (mean known to be zero, 1/T normalization)
Sigma_scm <- 1/T * t(X_) %*% X_
# Heavy-tailed (Student t, nu = 4) ML covariance estimator
Sigma_heavytail <- fit_mvt(X_, optimize_mu = FALSE, nu = 4, scale_covmat = TRUE)$cov
colnames(Sigma_heavytail) <- rownames(Sigma_heavytail) <- NULL

# scatter plot
# (fixed: plot the freshly generated X_heavytailed; the original referenced a
# stale variable X left over from earlier chunks)
ggplot(data.frame(x = X_heavytailed[, 1], y = X_heavytailed[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_heavytail)), aes(x, y, color = "3"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "darkblue", "2" = "darkred", "3" = "darkgreen"),
                     labels = c("true", "Gaussian ML estimator", "Heavy-tailed ML estimator")) +
  labs(title = "Heavy-tailed data", x = NULL, y = NULL)

  • Gaussian data with outliers:
mu_true <- c(0, 0)
Sigma_true <- cbind(c(0.0012, 0.0011), c(0.0011, 0.0014))
T <- 10

#
# Gaussian data with one outlier
#
set.seed(42)
X_gaussian <- rmvnorm(n = 200, mean = mu_true, sigma = Sigma_true)
X_gaussian_with_outlier <- rbind(c(-0.15,  0.05), X_gaussian)  # prepend a single outlier
X_ <- X_gaussian_with_outlier[1:T, ]
# Gaussian ML covariance estimator (mean known to be zero, 1/T normalization)
Sigma_scm <- 1/T * t(X_) %*% X_
# Heavy-tailed (Student t, nu = 4) ML covariance estimator
Sigma_heavytail <- fit_mvt(X_, optimize_mu = FALSE, nu = 4, scale_covmat = TRUE)$cov
colnames(Sigma_heavytail) <- rownames(Sigma_heavytail) <- NULL

# scatter plot
# (fixed: plot the freshly generated X_gaussian_with_outlier; the original
# referenced a stale variable X left over from earlier chunks)
ggplot(data.frame(x = X_gaussian_with_outlier[, 1], y = X_gaussian_with_outlier[, 2]), aes(x, y)) +
  geom_point(alpha = 1, size = 0.8, color = "black") +
  geom_path(data = data.frame(ellipse(Sigma_true)), aes(x, y, color = "1"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_scm)), aes(x, y, color = "2"), linewidth = 1.2) +
  geom_path(data = data.frame(ellipse(Sigma_heavytail)), aes(x, y, color = "3"), linewidth = 1.2) +
  coord_cartesian(xlim = c(-0.2, 0.2), ylim = c(-0.2, 0.2)) +
  scale_color_manual(name = "",
                     values = c("1" = "darkblue", "2" = "darkred", "3" = "darkgreen"),
                     labels = c("true", "Gaussian ML estimator", "Heavy-tailed ML estimator")) +
  labs(title = "Gaussian data with one outlier", x = NULL, y = NULL)

Estimation error of different ML estimators versus number of observations (for \(t\)-distributed heavy-tailed data with \(\nu=4\) and \(N=100\)):

library(mvtnorm)
library(fitHeavyTail)
library(ICSNP)

# get realistic mu and Sigma from S&P 500 daily log-returns (N = 100 stocks)
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
nu <- 4

# generate synthetic data pool (Student t; scatter scaled so cov = Sigma_true)
set.seed(42)
T_max <- 1000
X <- rmvt(n = 10*T_max, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)

# main loop: for each T, 100 Monte Carlo runs of the three estimators
df <- data.frame()
T_sweep <- seq(from = 150, by = 50, to = T_max)
for(T in T_sweep) {
  for (i in 1:100) {
    X_ <- X[sample(nrow(X), T), ]  # resample T observations from the pool
    
    # Gaussian MLE: sample mean and 1/T-normalized sample covariance
    mu    <- colMeans(X_)
    Sigma <- (T-1)/T * cov(X_)
    df <- rbind(df, data.frame("T"            = T,
                               "method"       = "Gaussian MLE",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               "MAE mu"       = sum(abs(mu - mu_true))/sum(abs(mu_true)),
                               "error Sigma"  = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
    
    # heavy-tailed MLE (Student t fit with nu estimated iteratively via POP)
    fitted <- fit_mvt(X_, nu = "iterative", nu_iterative_method = "POP")
    df <- rbind(df, data.frame("T"            = T,
                               "method"       = "heavy-tailed MLE",
                               "error mu"     = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "MAE mu"       = sum(abs(fitted$mu - mu_true))/sum(abs(mu_true)),
                               "error Sigma"  = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))    
    
    # Tyler estimator (initialized with the spatial median as location)
    fitted <- fit_Tyler(X_, initial = list(mu = ICSNP::spatial.median(X_)))
    df <- rbind(df, data.frame("T"            = T,
                               "method"       = "Tyler estimator",
                               "error mu"     = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "MAE mu"       = sum(abs(fitted$mu - mu_true))/sum(abs(mu_true)),
                               "error Sigma"  = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
  }
}
# fix the legend ordering of the estimators
df$method <- factor(df$method, levels = unique(df$method))

# Aggregate over the Monte Carlo runs (in %).
# Fixed two defects in "RMSE mu": (a) RMSE is the square root of the mean of
# SQUARED errors; (b) dplyr::summarize evaluates sequentially, so redefining
# `error mu` first would make the later reference see the aggregate instead of
# the per-run errors — hence "RMSE mu" is computed before `error mu` is overwritten.
df <- df |>
  group_by(method, T) |>
  summarize("RMSE mu"     = 100*sqrt(mean(`error mu`^2)),
            "error mu"    = 100*mean(`error mu`),
            "MAE mu"      = 100*mean(`MAE mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# Plot the mean and covariance estimation errors versus T
p_error_mu <- ggplot(df, aes(x = T, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  scale_color_discrete(name = "estimator",
                       labels = c("Gaussian MLE", "heavy-tailed MLE", "spatial median")) +
  labs(title = "Error of mean estimators", x = "T", y = "normalized error (%)")

p_error_Sigma <- ggplot(df, aes(x = T, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator",
       title = "Error of covariance estimators", x = "T", y = "normalized error (%)")

# stack the two panels vertically
p_error_mu / p_error_Sigma

Estimation error of different ML estimators versus degrees of freedom in a \(t\) distribution (with \(T=200\) and \(N=100\)):

library(mvtnorm)
library(fitHeavyTail)
library(ICSNP)

# get realistic mu and Sigma from S&P 500 daily log-returns (N = 100 stocks)
X <- diff(log(SP500_2015to2020$stocks))[-1, 1:100]
mu_true <- colMeans(X)
Sigma_true <- cov(X)
T <- 200

# main loop: sweep the t-distribution degrees of freedom nu (200 runs each)
set.seed(42)
df <- data.frame()
nu_sweep <- seq(from = 3, by = 2, to = 30)
for(nu in nu_sweep) {
  for (i in 1:200) {
    # generate synthetic data (fixed: use X_ so the returns matrix X is not
    # clobbered, consistent with the analogous sweep chunks in this file;
    # scatter scaled so the covariance equals Sigma_true for every nu)
    X_ <- rmvt(n = T, delta = mu_true, sigma = (nu-2)/nu * Sigma_true, df = nu)
    
    # Gaussian MLE: sample mean and 1/T-normalized sample covariance
    mu    <- colMeans(X_)
    Sigma <- (T-1)/T * cov(X_)
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "Gaussian MLE",
                               "error mu"     = norm(mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma"  = norm(Sigma - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
    
    # heavy-tailed MLE (Student t fit with nu estimated iteratively via POP)
    fitted <- fit_mvt(X_, nu = "iterative", nu_iterative_method = "POP")
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "heavy-tailed MLE",
                               "error mu"     = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma"  = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))

    # Tyler estimator (initialized with the spatial median as location)
    fitted <- fit_Tyler(X_, initial = list(mu = ICSNP::spatial.median(X_)))
    df <- rbind(df, data.frame("T"            = T,
                               "nu"           = nu,
                               "method"       = "Tyler estimator",
                               "error mu"     = norm(fitted$mu - mu_true, "2")/norm(mu_true, "2"),
                               "error Sigma"  = norm(fitted$cov - Sigma_true, "F")/norm(Sigma_true, "F"),
                               check.names    = FALSE))
  }
}
df$method <- factor(df$method, levels = unique(df$method))

# average the normalized errors (in %) over the Monte Carlo runs
df <- df |>
  group_by(method, T, nu) |>
  summarize("error mu"    = 100*mean(`error mu`),
            "error Sigma" = 100*mean(`error Sigma`)) |>
  ungroup()

# plot
p_error_mu <- df |>
  ggplot(aes(x = nu, y = `error mu`, color = method)) +
  geom_line(linewidth = 1.2) +
  scale_color_discrete(name = "estimator", labels = c("Gaussian MLE", "heavy-tailed MLE", "spatial median")) +
  labs(title = "Error of mean estimators", x = TeX("$\\nu$"), y = "normalized error (%)")

p_error_Sigma <- df |>
  ggplot(aes(x = nu, y = `error Sigma`, color = method)) +
  geom_line(linewidth = 1.2) +
  labs(color = "estimator") +
  labs(title = "Error of covariance estimators", x = TeX("$\\nu$"), y = "normalized error (%)")

p_error_mu / p_error_Sigma