Python Code for Portfolio Optimization Chapter 3 – Financial Data: I.I.D. Modeling
Daniel P. Palomar (2025). Portfolio Optimization: Theory and Application. Cambridge University Press.
Last update: February 18, 2025
Packages¶
The following packages are used in the examples:
# Core numerical computing
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal, multivariate_t
# Financial data handling
import yfinance as yf
# Book data (pip install "git+https://github.com/dppalomar/pob.git#subdirectory=python")
from pob_python import SP500_stocks_2015to2020, SP500_index_2015to2020
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
from tqdm import tqdm # For progress tracking
IID model¶
Example of a synthetic Gaussian i.i.d. time series:
# Use S&P 500 index from the book data
SP500_index_2015to2020.head()
| Date       | SP500_INDEX |
|------------|-------------|
| 2015-01-05 | 2020.58 |
| 2015-01-06 | 2002.61 |
| 2015-01-07 | 2025.90 |
| 2015-01-08 | 2062.14 |
| 2015-01-09 | 2044.81 |
sp500_prices = SP500_index_2015to2020.values.squeeze()
# Compute log returns (np.diff already drops the first observation)
sp500_returns = np.diff(np.log(sp500_prices))
# Calculate sample mean and sample variance
mu = np.mean(sp500_returns)
var = np.var(sp500_returns, ddof=1)  # unbiased sample variance (divides by T-1)
# Generate synthetic data
np.random.seed(42)
T = 500
x = np.random.normal(mu, np.sqrt(var), T)
# Create a DataFrame with a time index and the synthetic data
df = pd.DataFrame({'t': np.arange(1, T + 1), 'x': x})
# Plot the time series
fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(df['t'], df['x'], linewidth=0.8, color='blue')
plt.title('i.i.d. time series')
plt.xlabel('')
plt.ylabel('')
plt.show()
Sample estimators¶
Let's warm up using the classical estimators for the mean and covariance matrix, namely the sample mean and the sample covariance matrix: $$ \begin{aligned} \hat{\boldsymbol{\mu}} & = \frac{1}{T}\sum_{t=1}^T\mathbf{x}_t\\ \hat{\boldsymbol{\Sigma}} & = \frac{1}{T-1}\sum_{t=1}^T(\mathbf{x}_t-\hat{\boldsymbol{\mu}})(\mathbf{x}_t-\hat{\boldsymbol{\mu}})^T. \end{aligned} $$
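As a quick sanity check (an added sketch, not from the book code), the explicit sums above coincide with NumPy's built-in estimators:

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))        # T=100 observations of N=3 variables
T = X.shape[0]
mu_hat = X.sum(axis=0) / T           # (1/T) * sum_t x_t
Xc = X - mu_hat
Sigma_hat = (Xc.T @ Xc) / (T - 1)    # (1/(T-1)) * sum_t (x_t - mu_hat)(x_t - mu_hat)^T
assert np.allclose(mu_hat, np.mean(X, axis=0))
assert np.allclose(Sigma_hat, np.cov(X, rowvar=False))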
Illustration of sample estimators in two dimensions:
# Generate Gaussian synthetic return data
np.random.seed(42)
N = 2
T = 10
mu_true = np.zeros(N)
Sigma_true = np.array([[0.0012, 0.0011],
[0.0011, 0.0014]])
X = np.random.multivariate_normal(mean=mu_true, cov=Sigma_true, size=200)
#X = multivariate_normal.rvs(mean=mu_true, cov=Sigma_true, size=200)
X_ = X[:T, :]
# Estimators
mu_sm = np.mean(X_, axis=0)   # classical mean estimator (sample mean)
Sigma_scm = np.cov(X_.T)      # classical covariance estimator (sample covariance matrix; np.cov centers internally)
# Print errors
print(np.linalg.norm(mu_sm - mu_true))
print(np.linalg.norm(Sigma_scm - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro'))
0.009323178928090176
0.17331755138107485
from matplotlib.patches import Ellipse
def confidence_ellipse(cov, mean, ax, n_std=2, **kwargs):
"""Create covariance ellipse with proper scaling"""
vals, vecs = np.linalg.eigh(cov)
theta = np.degrees(np.arctan2(*vecs[:,0][::-1]))
width, height = 2 * n_std * np.sqrt(vals)
return Ellipse(xy=mean, width=width, height=height, angle=theta, fill=False, **kwargs)
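# Note (added): for 2D Gaussian data, an n_std ellipse covers chi2.cdf(n_std**2, df=2)
# of the probability mass; n_std=2 gives 1 - exp(-2) ≈ 86.5%, so it is not a 95%
# region (that would require n_std ≈ 2.45).
from scipy.stats import chi2
print(chi2.cdf(2**2, df=2))  # ≈ 0.8647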
# Scatter plot with covariance ellipses
fig, ax = plt.subplots(figsize=(10, 8))
# Plot data points
ax.scatter(X[:, 0], X[:, 1], color='black', s=2)
# Add 2-standard-deviation ellipses (about 86% coverage in 2D)
ax.add_patch(confidence_ellipse(Sigma_scm, mu_sm, ax, n_std=2,
edgecolor='red', linewidth=2, label='sample estimator'))
ax.add_patch(confidence_ellipse(Sigma_true, mu_true, ax, n_std=2,
edgecolor='blue', linewidth=2, label='true'))
# Plot mean estimates with big colored dots
ax.scatter(mu_sm[0], mu_sm[1], color='red', s=100)
ax.scatter(mu_true[0], mu_true[1], color='blue', s=100)
# Configure plot
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.legend()
plt.tight_layout()
plt.show()
Estimation error of sample estimators versus number of observations (for Gaussian data with $N=100$):
# Configuration
np.random.seed(42)
N = 100
T_max = 2000
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
# # Alternatively, create synthetic true parameters
# mu_true = np.random.randn(N) * 0.01
# Sigma_true = np.diag(np.abs(np.random.randn(N))) + 0.1*np.eye(N)
# Generate synthetic Gaussian data
X_gaussian = multivariate_normal.rvs(mean=mu_true, cov=Sigma_true, size=10*T_max)
# Benchmarking framework
T_sweep = np.linspace(50, T_max, 50, dtype=int)
error_mu_vs_T = []
error_Sigma_vs_T = []
for T in T_sweep:
    X_ = X_gaussian[:T]
# Sample estimators
mu_sm = np.mean(X_, axis=0)
Sigma_scm = np.cov(X_.T)
# Compute error
error_mu_vs_T.append(np.linalg.norm(mu_sm - mu_true) / np.linalg.norm(mu_true))
error_Sigma_vs_T.append(np.linalg.norm(Sigma_scm - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro'))
error_mu_vs_T = np.asarray(error_mu_vs_T)
error_Sigma_vs_T = np.asarray(error_Sigma_vs_T)
# Plots
fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(T_sweep, 100*error_mu_vs_T)
plt.title("Estimation error of sample mean")
plt.xlabel("T")
plt.ylabel("normalized error (%)")
plt.show()
fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(T_sweep, 100*error_Sigma_vs_T)
plt.title("Estimation error of sample covariance matrix")
plt.xlabel("T")
plt.ylabel("normalized error (%)")
plt.show()
Location estimators¶
Illustration of different location estimators in 2D:
# Configuration
np.random.seed(42)
N = 2
T = 10
mu_true = np.zeros(N)
Sigma_true = np.array([[0.0012, 0.0011],
[0.0011, 0.0014]])
nu = 4
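# For a t distribution with nu > 2, Cov = nu/(nu-2) * scatter, hence scatter = (nu-2)/nu * Cov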
scatter_true = (nu - 2)/nu * Sigma_true
# Generate synthetic data
X = multivariate_t.rvs(
loc=mu_true,
shape=scatter_true,
df=nu,
size=800
)
def spatial_median(X, max_iter=100, tol=1e-5):
"""Weiszfeld algorithm for spatial median"""
med = np.median(X, axis=0)
for _ in range(max_iter):
dist = np.linalg.norm(X - med, axis=1)
np.clip(dist, 1e-12, None, out=dist) # Avoid division by zero
weights = 1 / dist
new_med = X.T @ weights / weights.sum()
if np.linalg.norm(new_med - med) < tol:
break
med = new_med
return med
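# Quick sanity check (added sketch): the spatial median barely moves when an
# outlier is appended, whereas the sample mean is dragged toward it.
_Z = np.vstack([np.zeros((20, 2)), [[10.0, 10.0]]])
print(np.mean(_Z, axis=0))    # pulled toward the outlier
print(spatial_median(_Z))     # stays near the origin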
# Calculate estimators using first T samples
X_est = X[:T]
estimators = {
"true": mu_true,
"sample mean": np.mean(X_est, axis=0),
"median": np.median(X_est, axis=0),
"spatial median": spatial_median(X_est) # Assume this function exists
}
# Create comparison dataframe
df = pd.DataFrame([
{"location": key, "x": val[0], "y": val[1]}
for key, val in estimators.items()
])
df["location"] = pd.Categorical(
df["location"],
categories=["true", "sample mean", "median", "spatial median"]
)
# Visualization
fig, ax = plt.subplots(figsize=(10, 8))
# Plot data points
ax.scatter(X[:, 0], X[:, 1], alpha=0.7, s=10, label='Data points')
# Plot estimators with custom markers
marker_map = {
"true": "X", # Big X for ground truth
"sample mean": "o", # Circle for sample mean
"median": "s", # Square for median
"spatial median": "D" # Diamond for spatial median
}
for location in df.location.cat.categories:
subset = df[df.location == location]
ax.scatter(subset.x, subset.y,
s=150,
edgecolor='w',
linewidth=1.5,
marker=marker_map[location],
label=location)
# Formatting
ax.set_xlim(-0.08, 0.08)
ax.set_ylim(-0.08, 0.08)
ax.set_title("Scatter plot of data points with different location estimators", pad=20)
ax.set_xlabel("")
ax.set_ylabel("")
ax.grid(True, alpha=0.3)
ax.legend(loc='upper right', frameon=True)
plt.tight_layout()
plt.show()
Estimation error of location estimators versus number of observations (with $N=100$):
# Configuration
np.random.seed(42)
N = 100
T_max = 2000
n_trials = 100
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
# Generate synthetic Gaussian data
X_gaussian = multivariate_normal.rvs(mean=mu_true, cov=Sigma_true, size=10*T_max)
# Generate synthetic heavy-tailed distribution data
nu = 4
scatter_true = ((nu-2)/nu) * Sigma_true
X_heavy = multivariate_t.rvs(loc=mu_true, shape=scatter_true, df=nu, size=10*T_max)
# Benchmarking framework
T_sweep = np.linspace(50, T_max, 50, dtype=int)
results = []
for T in T_sweep:
for dist_name, data in [('Gaussian', X_gaussian),
('heavy-tailed', X_heavy)]:
for _ in range(n_trials):
# Random subsample
idx = np.random.choice(len(data), size=T, replace=False)
X = data[idx]
# Calculate estimators
mu_sm = np.mean(X, axis=0)
mu_med = np.median(X, axis=0)
mu_spmed = spatial_median(X)
# Calculate relative errors
error = lambda x: 100*np.linalg.norm(x - mu_true)/np.linalg.norm(mu_true)
results.append({
'T': T,
'distribution': dist_name,
'method': 'sample mean',
'error': error(mu_sm)
})
results.append({
'T': T,
'distribution': dist_name,
'method': 'median',
'error': error(mu_med)
})
results.append({
'T': T,
'distribution': dist_name,
'method': 'spatial median',
'error': error(mu_spmed)
})
# Create DataFrame and aggregate results
df = pd.DataFrame(results)
df_agg = df.groupby(['method', 'T', 'distribution']).mean().reset_index()
# Visualization
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)
for method in df_agg['method'].unique():
subset = df_agg[(df_agg['distribution'] == 'Gaussian') &
(df_agg['method'] == method)]
ax1.plot(subset['T'], subset['error'], label=method)
subset = df_agg[(df_agg['distribution'] == 'heavy-tailed') &
(df_agg['method'] == method)]
ax2.plot(subset['T'], subset['error'], label=method)
ax1.set_title('Error of location estimators under Gaussian data')
ax2.set_title('Error of location estimators under heavy-tailed data')
ax2.set_xlabel('Sample Size (T)')
for ax in (ax1, ax2):
ax.set_ylabel('Relative Error (%)')
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()
Estimation error of location estimators versus degrees of freedom in a $t$ distribution (with $T=200$ and $N=100$):
# Configuration
np.random.seed(42)
N = 100
T = 200
n_trials = 200
nu_values = np.arange(3, 31, 2)
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
# Benchmarking framework
results = []
for nu in tqdm(nu_values, desc="Processing ν values"):
scatter_matrix = ((nu - 2)/nu) * Sigma_true
for _ in range(n_trials):
# Generate multivariate t-distributed data
X = multivariate_t.rvs(loc=mu_true, shape=scatter_matrix, df=nu, size=T)
# Calculate estimators
methods = {
'sample mean': np.mean(X, axis=0),
'median': np.median(X, axis=0),
'spatial median': spatial_median(X)
}
# Calculate errors
for method, estimate in methods.items():
error = 100 * np.linalg.norm(estimate - mu_true) / np.linalg.norm(mu_true)
results.append({
'ν': nu,
'method': method,
'error': error
})
# Create and process DataFrame
df = pd.DataFrame(results)
df_agg = df.groupby(['method', 'ν'], as_index=False).agg(
mean_error=('error', 'mean'),
std_error=('error', 'std')
)
# Visualization
fig, ax = plt.subplots(1, 1, figsize=(12, 7))
colors = {'sample mean': '#1f77b4', 'median': '#ff7f0e', 'spatial median': '#2ca02c'}
for method in df_agg['method'].unique():
subset = df_agg[df_agg['method'] == method]
ax.plot(subset['ν'], subset['mean_error'], label=method, color=colors[method], lw=2)
ax.set_title('Error of location estimators under heavy-tailed data')
ax.set_xlabel('ν')
ax.set_ylabel('Relative Error (%)')
plt.legend()
plt.tight_layout()
plt.show()
Gaussian ML estimators¶
Estimation error of Gaussian ML estimators versus number of observations (with $N=100$):
# Configuration
np.random.seed(42)
N = 100
T_max = 1000
n_trials = 100
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = X_market.mean(axis=0)
Sigma_true = np.cov(X_market, rowvar=False)
nu = 4
scatter_true = (nu-2)/nu * Sigma_true
# Generate synthetic datasets
X_gaussian = multivariate_normal.rvs(
mean=mu_true, cov=Sigma_true, size=10*T_max
)
X_heavy = multivariate_t.rvs(
loc=mu_true, shape=scatter_true, df=nu, size=10*T_max
)
# Benchmarking framework
T_sweep = np.linspace(50, T_max, num=20, dtype=int)
results = []
for T in T_sweep:
for dist_name, data in [('Gaussian', X_gaussian),
('heavy-tailed', X_heavy)]:
for _ in range(n_trials):
# Subsample data
idx = np.random.choice(len(data), size=T, replace=False)
X = data[idx]
# Calculate estimators
mu_hat = X.mean(axis=0)
Sigma_hat = (T-1)/T * np.cov(X, rowvar=False)
# Calculate relative errors
rel_error_mu = (np.linalg.norm(mu_hat - mu_true, 2) / np.linalg.norm(mu_true, 2))
rel_error_Sigma = (np.linalg.norm(Sigma_hat - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro'))
results.append({
'T': T,
'distribution': dist_name,
'error_mu': 100 * rel_error_mu,
'error_Sigma': 100 * rel_error_Sigma
})
# Create DataFrame and aggregate
df = pd.DataFrame(results)
df_agg = df.groupby(['distribution', 'T']).mean().reset_index()
# Visualization
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
for dist in ['Gaussian', 'heavy-tailed']:
subset = df_agg[df_agg['distribution'] == dist]
ax1.plot(subset['T'], subset['error_mu'], label=dist)
ax2.plot(subset['T'], subset['error_Sigma'], label=dist)
ax1.set_title('Error of Gaussian ML mean estimator')
ax2.set_title('Error of Gaussian ML covariance estimator')
ax2.set_xlabel('Sample Size (T)')
for ax in (ax1, ax2):
ax.set_ylabel('Normalized Error (%)')
ax.legend(title='Data distribution')
ax.grid(True)
ax.label_outer()
plt.tight_layout()
plt.show()
Estimation error of Gaussian ML estimators versus degrees of freedom in a $t$ distribution (with $T=200$ and $N=100$):
# Configuration
np.random.seed(42)
N = 100
T = 200
n_trials = 200
nu_values = np.arange(3, 31, 2)
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market, rowvar=False)
# Benchmarking framework
results = []
for nu in tqdm(nu_values, desc="Processing ν values"):
scatter_matrix = ((nu - 2)/nu) * Sigma_true
for _ in range(n_trials):
# Generate multivariate t-distributed data
X = multivariate_t.rvs(
loc=mu_true,
shape=scatter_matrix,
df=nu,
size=T,
)
# Calculate estimators
mu_est = np.mean(X, axis=0)
        Sigma_est = (T-1)/T * np.cov(X, rowvar=False)  # Gaussian ML covariance (1/T normalization)
# Calculate normalized errors
mu_error = np.linalg.norm(mu_est - mu_true) / np.linalg.norm(mu_true)
Sigma_error = np.linalg.norm(Sigma_est - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro')
results.append({
'ν': nu,
'method': 'Gaussian ML',
'error mu': 100 * mu_error,
'error Sigma': 100 * Sigma_error
})
# Create and process DataFrame
df = pd.DataFrame(results)
df_agg = df.groupby(['method', 'ν'], as_index=False).agg({
'error mu': 'mean',
'error Sigma': 'mean'
})
# Visualization
plt.figure(figsize=(12, 10))
# Plot for mean estimation error
plt.subplot(2, 1, 1)
plt.plot(df_agg['ν'], df_agg['error mu'], color='#1f77b4', lw=2)
plt.title('Estimation error of Gaussian ML mean estimator')
plt.xlabel('ν')
plt.ylabel('Normalized Error (%)')
plt.ylim(200, 250)
# Plot for covariance estimation error
plt.subplot(2, 1, 2)
plt.plot(df_agg['ν'], df_agg['error Sigma'],
color='#2ca02c', lw=2)
plt.title('Estimation error of Gaussian ML covariance estimator')
plt.xlabel('ν')
plt.ylabel('Normalized Error (%)')
plt.tight_layout()
plt.show()
The failure of Gaussian ML estimators¶
def create_2covariance_plot(X, Sigma_true, Sigma_scm, title):
"""Helper function to create consistent plots"""
fig, ax = plt.subplots(figsize=(10, 8))
# Scatter plot
ax.scatter(X[:,0], X[:,1], s=8, color='black', alpha=0.8)
# Add ellipses
for cov, color, label in zip([Sigma_true, Sigma_scm],
['blue', 'red'],
['True', 'Gaussian ML']):
ellipse = confidence_ellipse(cov, [0,0], ax=ax,
edgecolor=color,
linewidth=1.5)
ax.add_patch(ellipse)
# Plot configuration
ax.set_xlim(-0.2, 0.2)
ax.set_ylim(-0.2, 0.2)
ax.set_title(title, pad=20)
ax.legend(handles=[plt.Line2D([0], [0], color=c, lw=2) for c in ['blue', 'red']],
labels=['True covariance', 'Gaussian ML estimator'])
plt.tight_layout()
return fig
# Parameters
mu_true = np.array([0.0, 0.0])
Sigma_true = np.array([[0.0012, 0.0011],
[0.0011, 0.0014]])
T = 10
np.random.seed(42)
- Gaussian data:
X_gaussian = multivariate_normal.rvs(
mean=mu_true, cov=Sigma_true, size=200
)
X_subset = X_gaussian[:T]
Sigma_scm = (X_subset.T @ X_subset) / T  # Gaussian ML estimator with known zero mean
create_2covariance_plot(X_gaussian, Sigma_true, Sigma_scm, "Gaussian Data")
plt.show()
- Heavy-tailed data:
nu = 4
scatter_matrix = (nu-2)/nu * Sigma_true # For multivariate_t shape param
X_heavy = multivariate_t.rvs(
loc=mu_true,
shape=scatter_matrix,
df=nu,
size=200
)
X_subset = X_heavy[:T]
Sigma_scm = (X_subset.T @ X_subset) / T
create_2covariance_plot(X_heavy, Sigma_true, Sigma_scm, "Heavy-tailed Data (ν=4)")
plt.show()
- Gaussian data with outliers:
X_gaussian = multivariate_normal.rvs(
mean=mu_true, cov=Sigma_true, size=200
)
# Add outlier at (-0.15, 0.05)
X_outlier = np.vstack([[-0.15, 0.05], X_gaussian])
X_subset = X_outlier[:T]
Sigma_scm = (X_subset.T @ X_subset) / T
create_2covariance_plot(X_outlier, Sigma_true, Sigma_scm, "Gaussian Data with Outlier")
plt.show()
Heavy-tailed ML estimators¶
Recall that the ML estimators can be iteratively computed as the weighted sample mean and sample covariance matrix: $$ \begin{aligned} \boldsymbol{\mu}^{k+1} &= \frac{\frac{1}{T}\sum_{t=1}^T w_t(\boldsymbol{\mu}^k,\boldsymbol{\Sigma}^k) \times \mathbf{x}_t}{\frac{1}{T}\sum_{t=1}^T w_t(\boldsymbol{\mu}^k,\boldsymbol{\Sigma}^k)}\\ \boldsymbol{\Sigma}^{k+1} &= \frac{1}{T}\sum_{t=1}^T w_t(\boldsymbol{\mu}^{k+1},\boldsymbol{\Sigma}^k) \times (\mathbf{x}_t - \boldsymbol{\mu}^{k+1})(\mathbf{x}_t - \boldsymbol{\mu}^{k+1})^{T} \end{aligned} $$ where the weights $w_t(\boldsymbol{\mu},\boldsymbol{\Sigma})$ are defined as $$w_t(\boldsymbol{\mu},\boldsymbol{\Sigma}) = \frac{\nu+N}{\nu + (\mathbf{x}_t - \boldsymbol{\mu})^{T}\boldsymbol{\Sigma}^{-1}(\mathbf{x}_t - \boldsymbol{\mu})}.$$
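To make the updates concrete, here is a bare-bones sketch of this fixed-point iteration for a fixed $\nu$ (an illustrative implementation, not the book's; the function fit_mvt() defined next also estimates $\nu$ and adds an acceleration step):

def t_mle_fixed_nu(X, nu, n_iter=50):
    """Minimal fixed-point iteration for the t ML estimators with known nu (sketch)."""
    T, N = X.shape
    mu = np.mean(X, axis=0)              # initialize with the sample estimators
    Sigma = np.cov(X, rowvar=False)
    for _ in range(n_iter):
        Xc = X - mu
        r2 = np.sum(Xc @ np.linalg.inv(Sigma) * Xc, axis=1)  # squared Mahalanobis distances
        w = (nu + N) / (nu + r2)                             # weights w_t(mu, Sigma)
        mu = (w[:, None] * X).sum(axis=0) / w.sum()          # weighted sample mean
        Xc = X - mu
        Sigma = (Xc * w[:, None]).T @ Xc / T                 # weighted sample covariance
    return mu, Sigma  # Sigma is the scatter matrix; covariance = nu/(nu-2) * Sigma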
For convenience, we define the function fit_mvt() to estimate the mean and covariance matrix of a heavy-tailed multivariate t distribution:
def nu_POP_estimator(r2, nu, N):
"""
Estimate the degrees of freedom of a heavy-tailed t distribution based on the POP estimator.
Based on:
Frédéric Pascal, Esa Ollila, and Daniel P. Palomar, "Improved estimation of the degree of freedom parameter of
multivariate t-distribution," in Proc. European Signal Processing Conference (EUSIPCO), Dublin, Ireland, Aug. 23-27, 2021.
<https://doi.org/10.23919/EUSIPCO54536.2021.9616162>
"""
T = len(r2)
u = (N + nu) / (nu + r2 * T / (T - 1))
r2i = r2 / (1 - r2 * u / T)
theta = (1 - N / T) * np.sum(r2i) / T / N
nu = 2 * theta / (theta - 1)
nu = min(100, max(2.5, nu))
return nu
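# Usage sketch (added): a single POP update of nu, with squared Mahalanobis
# distances computed from the sample estimates; fit_mvt() below iterates this
# jointly with mu and Sigma.
_Xt = multivariate_t.rvs(loc=np.zeros(5), shape=np.eye(5), df=4, size=500, random_state=42)
_Xc = _Xt - np.mean(_Xt, axis=0)
_r2 = np.sum(_Xc @ np.linalg.inv(np.cov(_Xt, rowvar=False)) * _Xc, axis=1)
print(nu_POP_estimator(r2=_r2, nu=4, N=5))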
def fit_mvt(X, verbose=False, max_iter=100, ptol=1e-3):
"""
Fit of a multivariate t distribution to the data X.
References:
Chuanhai Liu and Donald B. Rubin, "ML estimation of the t-distribution using EM and its extensions, ECM and ECME,"
Statistica Sinica (5), pp. 19-39, 1995.
Chuanhai Liu, Donald B. Rubin, and Ying Nian Wu, "Parameter Expansion to Accelerate EM: The PX-EM Algorithm,"
Biometrika, Vol. 85, No. 4, pp. 755-770, Dec., 1998
Rui Zhou, Junyan Liu, Sandeep Kumar, and Daniel P. Palomar, "Robust factor analysis parameter estimation,"
Lecture Notes in Computer Science (LNCS), 2019. <https://arxiv.org/abs/1909.12530>
Esa Ollila, Daniel P. Palomar, and Frédéric Pascal, "Shrinking the Eigenvalues of M-estimators of Covariance Matrix,"
IEEE Trans. on Signal Processing, vol. 69, pp. 256-269, Jan. 2021. <https://doi.org/10.1109/TSP.2020.3043952>
Frédéric Pascal, Esa Ollila, and Daniel P. Palomar, "Improved estimation of the degree of freedom parameter of
multivariate t-distribution," in Proc. European Signal Processing Conference (EUSIPCO), Dublin, Ireland, Aug. 23-27, 2021.
<https://doi.org/10.23919/EUSIPCO54536.2021.9616162>
"""
X = np.asarray(X)
T, N = np.shape(X)
# initial points
nu = 4
mu = np.mean(X, axis=0)
cov = np.cov(X, rowvar=False)
Sigma = (nu - 2)/nu * cov
def fnu(nu):
return nu/(nu - 2)
#
# loop
#
Sigma_diff_record = np.empty(max_iter)
for k in range(max_iter):
nu_prev = np.copy(nu)
mu_prev = np.copy(mu)
Sigma_prev = np.copy(Sigma)
# intermediate variables
Xc = X - mu
#r2 = np.sum(Xc * np.dot(Xc, np.linalg.inv(Sigma)), axis=1) # np.diag( Xc @ np.linalg.inv(Sigma) @ t(Xc) )
r2 = np.sum(Xc * np.linalg.solve(Sigma + 1e-6 * np.eye(N), Xc.T).T, axis=1)
# update nu, mu, Sigma
nu = nu_POP_estimator(r2=r2, nu=nu, N=N)
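        # E-step weights: E[tau_t | x_t] = (N + nu)/(nu + r2_t), i.e., the w_t(mu, Sigma) above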
E_tau = (N + nu) / (nu + r2)
ave_E_tau = np.mean(E_tau)
ave_E_tau_X = np.mean(X * E_tau[:, None], axis=0)
mu = ave_E_tau_X / ave_E_tau
Xc = X - mu
ave_E_tau_XX = (1 / T) * np.dot((Xc * np.sqrt(E_tau)[:, None]).T, Xc * np.sqrt(E_tau)[:, None])
#ave_E_tau_XX_ = (1 / T) * Xc.T @ np.diag(E_tau) @ Xc
Sigma = ave_E_tau_XX
Sigma = Sigma / ave_E_tau # PX_EM_acceleration
# stopping criterion
Sigma_diff_record[k] = np.linalg.norm(Sigma - Sigma_prev) / np.linalg.norm(Sigma_prev)
if (np.all(np.abs(mu - mu_prev) <= .5 * ptol * (np.abs(mu_prev) + np.abs(mu))) and
np.abs(fnu(nu) - fnu(nu_prev)) <= .5 * ptol * (np.abs(fnu(nu_prev)) + np.abs(fnu(nu))) and
np.all(np.abs(Sigma - Sigma_prev) <= .5 * ptol * (np.abs(Sigma_prev) + np.abs(Sigma)))):
break
    Sigma_diff_record = Sigma_diff_record[:k + 1]
Sigma_cov = nu / (nu - 2) * Sigma
if verbose:
fig, ax = plt.subplots(figsize=(16, 6))
        ax.plot(np.arange(1, k + 2), Sigma_diff_record)
plt.xlabel("Iterations")
plt.ylabel("Sigma differences")
plt.show()
return mu, Sigma_cov
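As a quick numerical sanity check (an added sketch, not from the book code), on heavy-tailed synthetic data the robust fit should typically achieve a smaller covariance error than the sample covariance matrix:

# Sanity check on synthetic t-distributed data (nu=4)
np.random.seed(42)
nu_chk, N_chk, T_chk = 4, 20, 500
Sigma_chk = np.eye(N_chk)
X_chk = multivariate_t.rvs(loc=np.zeros(N_chk), shape=(nu_chk - 2)/nu_chk * Sigma_chk,
                           df=nu_chk, size=T_chk)
_, Sigma_robust = fit_mvt(X_chk)
err = lambda S: np.linalg.norm(S - Sigma_chk, 'fro') / np.linalg.norm(Sigma_chk, 'fro')
print("SCM error:         ", err(np.cov(X_chk, rowvar=False)))
print("heavy-tailed error:", err(Sigma_robust))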
Now we can assess the effect of heavy tails and outliers on the heavy-tailed ML covariance matrix estimator:
# Parameters
mu_true = np.array([0.0, 0.0])
Sigma_true = np.array([[0.0012, 0.0011],
[0.0011, 0.0014]])
T = 10
np.random.seed(42)
def create_3covariance_plot(X, Sigma_true, Sigma_scm, Sigma_heavytail, title):
"""Helper function to create consistent plots"""
fig, ax = plt.subplots(figsize=(10, 8))
# Scatter plot
ax.scatter(X[:,0], X[:,1], s=8, color='black', alpha=0.8)
# Add ellipses
for cov, color, label in zip([Sigma_true, Sigma_scm, Sigma_heavytail],
['blue', 'red', 'green'],
['True', 'Gaussian ML', 'Heavy-tailed ML']):
ellipse = confidence_ellipse(cov, [0,0], ax=ax,
edgecolor=color,
linewidth=1.5)
ax.add_patch(ellipse)
# Plot configuration
ax.set_xlim(-0.2, 0.2)
ax.set_ylim(-0.2, 0.2)
ax.set_title(title, pad=20)
ax.legend(handles=[plt.Line2D([0], [0], color=c, lw=2)
for c in ['blue', 'red', 'green']],
labels=['True covariance', 'Gaussian ML estimator', 'Heavy-tailed ML estimator'])
plt.tight_layout()
return fig
- Gaussian data:
# Data generation
X_gaussian = multivariate_normal.rvs(
mean=mu_true, cov=Sigma_true, size=200
)
X_subset = X_gaussian[:T]
# Estimators
Sigma_scm = (X_subset.T @ X_subset) / T
mu_heavytail, Sigma_heavytail = fit_mvt(X_subset)
# Plot
create_3covariance_plot(X_gaussian, Sigma_true, Sigma_scm, Sigma_heavytail, "Gaussian Data")
plt.show()
- Heavy-tailed data:
# Data generation
nu = 4
scatter_matrix = (nu-2)/nu * Sigma_true # For multivariate_t shape param
X_heavy = multivariate_t.rvs(
loc=mu_true,
shape=scatter_matrix,
df=nu,
size=200
)
X_subset = X_heavy[:T]
# Estimators
Sigma_scm = (X_subset.T @ X_subset) / T
mu_heavytail, Sigma_heavytail = fit_mvt(X_subset)
# Plot
create_3covariance_plot(X_heavy, Sigma_true, Sigma_scm, Sigma_heavytail, "Heavy-tailed Data (ν=4)")
plt.show()
- Gaussian data with outliers:
# Data generation
X_gaussian = multivariate_normal.rvs(
mean=mu_true, cov=Sigma_true, size=200
)
# Add outlier at (-0.15, 0.05)
X_outlier = np.vstack([[-0.15, 0.05], X_gaussian])
X_subset = X_outlier[:T]
# Estimators
Sigma_scm = (X_subset.T @ X_subset) / T
mu_heavytail, Sigma_heavytail = fit_mvt(X_subset)
# Plot
create_3covariance_plot(X_outlier, Sigma_true, Sigma_scm, Sigma_heavytail, "Gaussian Data with Outlier")
plt.show()
Estimation error of different ML estimators versus number of observations (for $t$-distributed heavy-tailed data with $\nu=4$ and $N=100$):
# Configuration
np.random.seed(42)
N = 100
T_max = 1000
n_trials = 100
nu = 4
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
# Generate synthetic heavy-tailed data
scatter_true = ((nu-2)/nu) * Sigma_true
X_heavytailed = multivariate_t.rvs(
loc=mu_true,
shape=scatter_true,
df=nu,
size=10*T_max
)
# Benchmarking framework
T_sweep = np.arange(150, T_max+1, 50)
results = []
for T in T_sweep:
for _ in range(n_trials):
# Random subsample
        idx = np.random.choice(len(X_heavytailed), size=T, replace=False)
        X_sub = X_heavytailed[idx]
# Gaussian MLE
mu_gauss = np.mean(X_sub, axis=0)
Sigma_gauss = (T-1)/T * np.cov(X_sub.T)
        # Heavy-tailed MLE (fit_mvt defined above)
mu_heavy, Sigma_heavy = fit_mvt(X_sub)
# Calculate errors
errors = {
'Gaussian MLE': {
'mu': np.linalg.norm(mu_gauss - mu_true)/np.linalg.norm(mu_true),
'Sigma': np.linalg.norm(Sigma_gauss - Sigma_true, 'fro')/np.linalg.norm(Sigma_true, 'fro')
},
'heavy-tailed MLE': {
'mu': np.linalg.norm(mu_heavy - mu_true)/np.linalg.norm(mu_true),
'Sigma': np.linalg.norm(Sigma_heavy - Sigma_true, 'fro')/np.linalg.norm(Sigma_true, 'fro')
}
}
for method in errors:
results.append({
'T': T,
'method': method,
'error mu': errors[method]['mu'],
'error Sigma': errors[method]['Sigma']
})
# Process results
df = pd.DataFrame(results)
df_agg = df.groupby(['method', 'T'], as_index=False).agg({
'error mu': lambda x: 100*np.mean(x),
'error Sigma': lambda x: 100*np.mean(x)
})
# Visualization
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)
for method in df_agg['method'].unique():
subset = df_agg[df_agg['method'] == method]
ax1.plot(subset['T'], subset['error mu'], label=method)
ax2.plot(subset['T'], subset['error Sigma'], label=method)
ax1.set_title('Error of Mean Estimators')
ax2.set_title('Error of Covariance Estimators')
ax2.set_xlabel('Sample Size (T)')
ax1.set_ylabel('Normalized Error (%)')
ax2.set_ylabel('Normalized Error (%)')
ax1.legend()
ax1.grid(True)
ax2.grid(True)
plt.tight_layout()
plt.show()
Estimation error of different ML estimators versus degrees of freedom in a $t$ distribution (with $T=200$ and $N=100$):
# Configuration
np.random.seed(42)
N = 100
T = 200
n_trials = 200
nu_values = np.arange(3, 31, 2)
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
# Benchmarking framework
results = []
for nu in tqdm(nu_values, desc="Processing ν values"):
scatter_matrix = ((nu - 2)/nu) * Sigma_true
for _ in range(n_trials):
# Generate multivariate t-distributed data
X = multivariate_t.rvs(
loc=mu_true,
shape=scatter_matrix,
df=nu,
size=T,
)
# Gaussian MLE
mu_gauss = np.mean(X, axis=0)
Sigma_gauss = (T-1)/T * np.cov(X, rowvar=False)
mu_error_gauss = np.linalg.norm(mu_gauss - mu_true) / np.linalg.norm(mu_true)
Sigma_error_gauss = np.linalg.norm(Sigma_gauss - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro')
results.append({
'ν': nu,
'method': 'Gaussian MLE',
'error mu': 100 * mu_error_gauss,
'error Sigma': 100 * Sigma_error_gauss
})
        # Heavy-tailed MLE (fit_mvt defined above)
        mu_heavy, Sigma_heavy = fit_mvt(X)
mu_error_heavy = np.linalg.norm(mu_heavy - mu_true) / np.linalg.norm(mu_true)
Sigma_error_heavy = np.linalg.norm(Sigma_heavy - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro')
results.append({
'ν': nu,
'method': 'Heavy-tailed MLE',
'error mu': 100 * mu_error_heavy,
'error Sigma': 100 * Sigma_error_heavy
})
# Create and process DataFrame
df = pd.DataFrame(results)
df_agg = df.groupby(['method', 'ν'], as_index=False).agg({
'error mu': 'mean',
'error Sigma': 'mean'
})
# Visualization
plt.figure(figsize=(12, 10))
# Plot for mean estimation error
plt.subplot(2, 1, 1)
for method in df_agg['method'].unique():
subset = df_agg[df_agg['method'] == method]
plt.plot(subset['ν'], subset['error mu'], label=method)
plt.title('Estimation error of mean estimators')
plt.xlabel('ν')
plt.ylabel('Normalized Error (%)')
plt.legend()
plt.grid(True)
# Plot for covariance estimation error
plt.subplot(2, 1, 2)
for method in df_agg['method'].unique():
subset = df_agg[df_agg['method'] == method]
plt.plot(subset['ν'], subset['error Sigma'], label=method)
plt.title('Estimation error of covariance estimators')
plt.xlabel('ν')
plt.ylabel('Normalized Error (%)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Convergence of iterative robust heavy-tailed ML estimators:
# Configuration
np.random.seed(42)
N = 100
T = 200
n_trials = 200
nu_values = np.arange(3, 31, 2)
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
nu = 4
scatter_matrix = (nu-2)/nu * Sigma_true
# Generate multivariate t-distributed data
X = multivariate_t.rvs(
loc=mu_true,
shape=scatter_matrix,
df=nu,
size=T,
)
# heavy-tailed MLE
mu_heavy, Sigma_heavy = fit_mvt(X, verbose=True)
Prior information¶
Shrinkage¶
Factor models¶