Python Code for Portfolio Optimization Chapter 3 – Financial Data: I.I.D. Modeling
Daniel P. Palomar (2025). Portfolio Optimization: Theory and Application. Cambridge University Press.
Last update: February 18, 2025
Packages¶
The following packages are used in the examples:
# Core numerical computing
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal, multivariate_t
# Financial data handling
import yfinance as yf
# Book data (pip install "git+https://github.com/dppalomar/pob.git#subdirectory=python")
from pob_python import SP500_stocks_2015to2020, SP500_index_2015to2020
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
from tqdm import tqdm # For progress tracking
IID model¶
Example of a synthetic Gaussian i.i.d. time series:
# Use S&P 500 index from the book data
SP500_index_2015to2020.head()
| Date       | SP500_INDEX |
|------------|-------------|
| 2015-01-05 | 2020.58 |
| 2015-01-06 | 2002.61 |
| 2015-01-07 | 2025.90 |
| 2015-01-08 | 2062.14 |
| 2015-01-09 | 2044.81 |
sp500_prices = SP500_index_2015to2020.values.squeeze()
# Compute log returns (np.diff already drops the first observation)
sp500_returns = np.diff(np.log(sp500_prices))
# Calculate sample mean and sample variance
mu = np.mean(sp500_returns)
var = np.var(sp500_returns, ddof=1)  # unbiased sample variance (divides by T-1)
# Generate synthetic data
np.random.seed(42)
T = 500
x = np.random.normal(mu, np.sqrt(var), T)
# Create a DataFrame with a time index and the synthetic data
df = pd.DataFrame({'t': np.arange(1, T + 1), 'x': x})
# Plot the time series
fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(df['t'], df['x'], linewidth=0.8, color='blue')
plt.title('i.i.d. time series')
plt.xlabel('')
plt.ylabel('')
plt.show()
Sample estimators¶
Let's warm up using the classical estimators for the mean and covariance matrix, namely the sample mean and the sample covariance matrix: $$ \begin{aligned} \hat{\boldsymbol{\mu}} & = \frac{1}{T}\sum_{t=1}^T\mathbf{x}_t\\ \hat{\boldsymbol{\Sigma}} & = \frac{1}{T-1}\sum_{t=1}^T(\mathbf{x}_t-\hat{\boldsymbol{\mu}})(\mathbf{x}_t-\hat{\boldsymbol{\mu}})^T. \end{aligned} $$
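As a quick sanity check (an added sketch, not from the book code), the explicit sums above coincide with NumPy's built-in estimators:

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))        # T=100 observations of N=3 variables
T = X.shape[0]
mu_hat = X.sum(axis=0) / T           # (1/T) * sum_t x_t
Xc = X - mu_hat
Sigma_hat = (Xc.T @ Xc) / (T - 1)    # (1/(T-1)) * sum_t (x_t - mu_hat)(x_t - mu_hat)^T
assert np.allclose(mu_hat, np.mean(X, axis=0))
assert np.allclose(Sigma_hat, np.cov(X, rowvar=False))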
Illustration of sample estimators in two dimensions:
# Generate Gaussian synthetic return data
np.random.seed(42)
N = 2
T = 10
mu_true = np.zeros(N)
Sigma_true = np.array([[0.0012, 0.0011],
[0.0011, 0.0014]])
X = np.random.multivariate_normal(mean=mu_true, cov=Sigma_true, size=200)
#X = multivariate_normal.rvs(mean=mu_true, cov=Sigma_true, size=200)
X_ = X[:T, :]
# Estimators
mu_sm = np.mean(X_, axis=0)   # classical mean estimator (sample mean)
Sigma_scm = np.cov(X_.T)      # classical covariance estimator (sample covariance matrix; np.cov centers internally)
# Print errors
print(np.linalg.norm(mu_sm - mu_true))
print(np.linalg.norm(Sigma_scm - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro'))
0.009323178928090176
0.17331755138107485
from matplotlib.patches import Ellipse
def confidence_ellipse(cov, mean, ax, n_std=2, **kwargs):
"""Create covariance ellipse with proper scaling"""
vals, vecs = np.linalg.eigh(cov)
theta = np.degrees(np.arctan2(*vecs[:,0][::-1]))
width, height = 2 * n_std * np.sqrt(vals)
return Ellipse(xy=mean, width=width, height=height, angle=theta, fill=False, **kwargs)
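# Note (added): for 2D Gaussian data, an n_std ellipse covers chi2.cdf(n_std**2, df=2)
# of the probability mass; n_std=2 gives 1 - exp(-2) ≈ 86.5%, so it is not a 95%
# region (that would require n_std ≈ 2.45).
from scipy.stats import chi2
print(chi2.cdf(2**2, df=2))  # ≈ 0.8647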
# Scatter plot with covariance ellipses
fig, ax = plt.subplots(figsize=(10, 8))
# Plot data points
ax.scatter(X[:, 0], X[:, 1], color='black', s=2)
# Add 2-standard-deviation ellipses (about 86% coverage in 2D)
ax.add_patch(confidence_ellipse(Sigma_scm, mu_sm, ax, n_std=2,
edgecolor='red', linewidth=2, label='sample estimator'))
ax.add_patch(confidence_ellipse(Sigma_true, mu_true, ax, n_std=2,
edgecolor='blue', linewidth=2, label='true'))
# Plot mean estimates with big colored dots
ax.scatter(mu_sm[0], mu_sm[1], color='red', s=100)
ax.scatter(mu_true[0], mu_true[1], color='blue', s=100)
# Configure plot
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.legend()
plt.tight_layout()
plt.show()
Estimation error of sample estimators versus number of observations (for Gaussian data with $N=100$):
# Configuration
np.random.seed(42)
N = 100
T_max = 2000
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
# # Alternatively, create synthetic true parameters
# mu_true = np.random.randn(N) * 0.01
# Sigma_true = np.diag(np.abs(np.random.randn(N))) + 0.1*np.eye(N)
# Generate synthetic Gaussian data
X_gaussian = multivariate_normal.rvs(mean=mu_true, cov=Sigma_true, size=10*T_max)
# Benchmarking framework
T_sweep = np.linspace(50, T_max, 50, dtype=int)
error_mu_vs_T = []
error_Sigma_vs_T = []
for T in T_sweep:
    X_ = X_gaussian[:T]
# Sample estimators
mu_sm = np.mean(X_, axis=0)
Sigma_scm = np.cov(X_.T)
# Compute error
error_mu_vs_T.append(np.linalg.norm(mu_sm - mu_true) / np.linalg.norm(mu_true))
error_Sigma_vs_T.append(np.linalg.norm(Sigma_scm - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro'))
error_mu_vs_T = np.asarray(error_mu_vs_T)
error_Sigma_vs_T = np.asarray(error_Sigma_vs_T)
# Plots
fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(T_sweep, 100*error_mu_vs_T)
plt.title("Estimation error of sample mean")
plt.xlabel("T")
plt.ylabel("normalized error (%)")
plt.show()
fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(T_sweep, 100*error_Sigma_vs_T)
plt.title("Estimation error of sample covariance matrix")
plt.xlabel("T")
plt.ylabel("normalized error (%)")
plt.show()
Location estimators¶
Illustration of different location estimators in 2D:
# Configuration
np.random.seed(42)
N = 2
T = 10
mu_true = np.zeros(N)
Sigma_true = np.array([[0.0012, 0.0011],
[0.0011, 0.0014]])
nu = 4
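# For a t distribution with nu > 2, Cov = nu/(nu-2) * scatter, hence scatter = (nu-2)/nu * Cov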
scatter_true = (nu - 2)/nu * Sigma_true
# Generate synthetic data
X = multivariate_t.rvs(
loc=mu_true,
shape=scatter_true,
df=nu,
size=800
)
def spatial_median(X, max_iter=100, tol=1e-5):
"""Weiszfeld algorithm for spatial median"""
med = np.median(X, axis=0)
for _ in range(max_iter):
dist = np.linalg.norm(X - med, axis=1)
np.clip(dist, 1e-12, None, out=dist) # Avoid division by zero
weights = 1 / dist
new_med = X.T @ weights / weights.sum()
if np.linalg.norm(new_med - med) < tol:
break
med = new_med
return med
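# Quick sanity check (added sketch): the spatial median barely moves when an
# outlier is appended, whereas the sample mean is dragged toward it.
_Z = np.vstack([np.zeros((20, 2)), [[10.0, 10.0]]])
print(np.mean(_Z, axis=0))    # pulled toward the outlier
print(spatial_median(_Z))     # stays near the origin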
# Calculate estimators using first T samples
X_est = X[:T]
estimators = {
"true": mu_true,
"sample mean": np.mean(X_est, axis=0),
"median": np.median(X_est, axis=0),
"spatial median": spatial_median(X_est) # Assume this function exists
}
# Create comparison dataframe
df = pd.DataFrame([
{"location": key, "x": val[0], "y": val[1]}
for key, val in estimators.items()
])
df["location"] = pd.Categorical(
df["location"],
categories=["true", "sample mean", "median", "spatial median"]
)
# Visualization
fig, ax = plt.subplots(figsize=(10, 8))
# Plot data points
ax.scatter(X[:, 0], X[:, 1], alpha=0.7, s=10, label='Data points')
# Plot estimators with custom markers
marker_map = {
"true": "X", # Big X for ground truth
"sample mean": "o", # Circle for sample mean
"median": "s", # Square for median
"spatial median": "D" # Diamond for spatial median
}
for location in df.location.cat.categories:
subset = df[df.location == location]
ax.scatter(subset.x, subset.y,
s=150,
edgecolor='w',
linewidth=1.5,
marker=marker_map[location],
label=location)
# Formatting
ax.set_xlim(-0.08, 0.08)
ax.set_ylim(-0.08, 0.08)
ax.set_title("Scatter plot of data points with different location estimators", pad=20)
ax.set_xlabel("")
ax.set_ylabel("")
ax.grid(True, alpha=0.3)
ax.legend(loc='upper right', frameon=True)
plt.tight_layout()
plt.show()
Estimation error of location estimators versus number of observations (with $N=100$):
# Configuration
np.random.seed(42)
N = 100
T_max = 2000
n_trials = 100
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
# Generate synthetic Gaussian data
X_gaussian = multivariate_normal.rvs(mean=mu_true, cov=Sigma_true, size=10*T_max)
# Generate synthetic heavy-tailed distribution data
nu = 4
scatter_true = ((nu-2)/nu) * Sigma_true
X_heavy = multivariate_t.rvs(loc=mu_true, shape=scatter_true, df=nu, size=10*T_max)
# Benchmarking framework
T_sweep = np.linspace(50, T_max, 50, dtype=int)
results = []
for T in T_sweep:
for dist_name, data in [('Gaussian', X_gaussian),
('heavy-tailed', X_heavy)]:
for _ in range(n_trials):
# Random subsample
idx = np.random.choice(len(data), size=T, replace=False)
X = data[idx]
# Calculate estimators
mu_sm = np.mean(X, axis=0)
mu_med = np.median(X, axis=0)
mu_spmed = spatial_median(X)
# Calculate relative errors
error = lambda x: 100*np.linalg.norm(x - mu_true)/np.linalg.norm(mu_true)
results.append({
'T': T,
'distribution': dist_name,
'method': 'sample mean',
'error': error(mu_sm)
})
results.append({
'T': T,
'distribution': dist_name,
'method': 'median',
'error': error(mu_med)
})
results.append({
'T': T,
'distribution': dist_name,
'method': 'spatial median',
'error': error(mu_spmed)
})
# Create DataFrame and aggregate results
df = pd.DataFrame(results)
df_agg = df.groupby(['method', 'T', 'distribution']).mean().reset_index()
# Visualization
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)
for method in df_agg['method'].unique():
subset = df_agg[(df_agg['distribution'] == 'Gaussian') &
(df_agg['method'] == method)]
ax1.plot(subset['T'], subset['error'], label=method)
subset = df_agg[(df_agg['distribution'] == 'heavy-tailed') &
(df_agg['method'] == method)]
ax2.plot(subset['T'], subset['error'], label=method)
ax1.set_title('Error of location estimators under Gaussian data')
ax2.set_title('Error of location estimators under heavy-tailed data')
ax2.set_xlabel('Sample Size (T)')
for ax in (ax1, ax2):
ax.set_ylabel('Relative Error (%)')
ax.legend()
ax.grid(True)
plt.tight_layout()
plt.show()
Estimation error of location estimators versus degrees of freedom in a $t$ distribution (with $T=200$ and $N=100$):
# Configuration
np.random.seed(42)
N = 100
T = 200
n_trials = 200
nu_values = np.arange(3, 31, 2)
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
# Benchmarking framework
results = []
for nu in tqdm(nu_values, desc="Processing ν values"):
scatter_matrix = ((nu - 2)/nu) * Sigma_true
for _ in range(n_trials):
# Generate multivariate t-distributed data
X = multivariate_t.rvs(loc=mu_true, shape=scatter_matrix, df=nu, size=T)
# Calculate estimators
methods = {
'sample mean': np.mean(X, axis=0),
'median': np.median(X, axis=0),
'spatial median': spatial_median(X)
}
# Calculate errors
for method, estimate in methods.items():
error = 100 * np.linalg.norm(estimate - mu_true) / np.linalg.norm(mu_true)
results.append({
'ν': nu,
'method': method,
'error': error
})
# Create and process DataFrame
df = pd.DataFrame(results)
df_agg = df.groupby(['method', 'ν'], as_index=False).agg(
mean_error=('error', 'mean'),
std_error=('error', 'std')
)
# Visualization
fig, ax = plt.subplots(1, 1, figsize=(12, 7))
colors = {'sample mean': '#1f77b4', 'median': '#ff7f0e', 'spatial median': '#2ca02c'}
for method in df_agg['method'].unique():
subset = df_agg[df_agg['method'] == method]
ax.plot(subset['ν'], subset['mean_error'], label=method, color=colors[method], lw=2)
ax.set_title('Error of location estimators under heavy-tailed data')
ax.set_xlabel('ν')
ax.set_ylabel('Relative Error (%)')
plt.legend()
plt.tight_layout()
plt.show()
Gaussian ML estimators¶
Estimation error of Gaussian ML estimators versus number of observations (with $N=100$):
# Configuration
np.random.seed(42)
N = 100
T_max = 1000
n_trials = 100
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = X_market.mean(axis=0)
Sigma_true = np.cov(X_market, rowvar=False)
nu = 4
scatter_true = (nu-2)/nu * Sigma_true
# Generate synthetic datasets
X_gaussian = multivariate_normal.rvs(
mean=mu_true, cov=Sigma_true, size=10*T_max
)
X_heavy = multivariate_t.rvs(
loc=mu_true, shape=scatter_true, df=nu, size=10*T_max
)
# Benchmarking framework
T_sweep = np.linspace(50, T_max, num=20, dtype=int)
results = []
for T in T_sweep:
for dist_name, data in [('Gaussian', X_gaussian),
('heavy-tailed', X_heavy)]:
for _ in range(n_trials):
# Subsample data
idx = np.random.choice(len(data), size=T, replace=False)
X = data[idx]
# Calculate estimators
mu_hat = X.mean(axis=0)
Sigma_hat = (T-1)/T * np.cov(X, rowvar=False)
# Calculate relative errors
rel_error_mu = (np.linalg.norm(mu_hat - mu_true, 2) / np.linalg.norm(mu_true, 2))
rel_error_Sigma = (np.linalg.norm(Sigma_hat - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro'))
results.append({
'T': T,
'distribution': dist_name,
'error_mu': 100 * rel_error_mu,
'error_Sigma': 100 * rel_error_Sigma
})
# Create DataFrame and aggregate
df = pd.DataFrame(results)
df_agg = df.groupby(['distribution', 'T']).mean().reset_index()
# Visualization
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)
for dist in ['Gaussian', 'heavy-tailed']:
subset = df_agg[df_agg['distribution'] == dist]
ax1.plot(subset['T'], subset['error_mu'], label=dist)
ax2.plot(subset['T'], subset['error_Sigma'], label=dist)
ax1.set_title('Error of Gaussian ML mean estimator')
ax2.set_title('Error of Gaussian ML covariance estimator')
ax2.set_xlabel('Sample Size (T)')
for ax in (ax1, ax2):
ax.set_ylabel('Normalized Error (%)')
ax.legend(title='Data distribution')
ax.grid(True)
ax.label_outer()
plt.tight_layout()
plt.show()
Estimation error of Gaussian ML estimators versus degrees of freedom in a $t$ distribution (with $T=200$ and $N=100$):
# Configuration
np.random.seed(42)
N = 100
T = 200
n_trials = 200
nu_values = np.arange(3, 31, 2)
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market, rowvar=False)
# Benchmarking framework
results = []
for nu in tqdm(nu_values, desc="Processing ν values"):
scatter_matrix = ((nu - 2)/nu) * Sigma_true
for _ in range(n_trials):
# Generate multivariate t-distributed data
X = multivariate_t.rvs(
loc=mu_true,
shape=scatter_matrix,
df=nu,
size=T,
)
# Calculate estimators
mu_est = np.mean(X, axis=0)
        Sigma_est = (T-1)/T * np.cov(X, rowvar=False)  # Gaussian ML covariance (1/T normalization)
# Calculate normalized errors
mu_error = np.linalg.norm(mu_est - mu_true) / np.linalg.norm(mu_true)
Sigma_error = np.linalg.norm(Sigma_est - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro')
results.append({
'ν': nu,
'method': 'Gaussian ML',
'error mu': 100 * mu_error,
'error Sigma': 100 * Sigma_error
})
# Create and process DataFrame
df = pd.DataFrame(results)
df_agg = df.groupby(['method', 'ν'], as_index=False).agg({
'error mu': 'mean',
'error Sigma': 'mean'
})
# Visualization
plt.figure(figsize=(12, 10))
# Plot for mean estimation error
plt.subplot(2, 1, 1)
plt.plot(df_agg['ν'], df_agg['error mu'], color='#1f77b4', lw=2)
plt.title('Estimation error of Gaussian ML mean estimator')
plt.xlabel('ν')
plt.ylabel('Normalized Error (%)')
plt.ylim(200, 250)
# Plot for covariance estimation error
plt.subplot(2, 1, 2)
plt.plot(df_agg['ν'], df_agg['error Sigma'],
color='#2ca02c', lw=2)
plt.title('Estimation error of Gaussian ML covariance estimator')
plt.xlabel('ν')
plt.ylabel('Normalized Error (%)')
plt.tight_layout()
plt.show()
The failure of Gaussian ML estimators¶
def create_2covariance_plot(X, Sigma_true, Sigma_scm, title):
"""Helper function to create consistent plots"""
fig, ax = plt.subplots(figsize=(10, 8))
# Scatter plot
ax.scatter(X[:,0], X[:,1], s=8, color='black', alpha=0.8)
# Add ellipses
for cov, color, label in zip([Sigma_true, Sigma_scm],
['blue', 'red'],
['True', 'Gaussian ML']):
ellipse = confidence_ellipse(cov, [0,0], ax=ax,
edgecolor=color,
linewidth=1.5)
ax.add_patch(ellipse)
# Plot configuration
ax.set_xlim(-0.2, 0.2)
ax.set_ylim(-0.2, 0.2)
ax.set_title(title, pad=20)
ax.legend(handles=[plt.Line2D([0], [0], color=c, lw=2) for c in ['blue', 'red']],
labels=['True covariance', 'Gaussian ML estimator'])
plt.tight_layout()
return fig
# Parameters
mu_true = np.array([0.0, 0.0])
Sigma_true = np.array([[0.0012, 0.0011],
[0.0011, 0.0014]])
T = 10
np.random.seed(42)
- Gaussian data:
X_gaussian = multivariate_normal.rvs(
mean=mu_true, cov=Sigma_true, size=200
)
X_subset = X_gaussian[:T]
Sigma_scm = (X_subset.T @ X_subset) / T  # Gaussian ML estimator with known zero mean
create_2covariance_plot(X_gaussian, Sigma_true, Sigma_scm, "Gaussian Data")
plt.show()
- Heavy-tailed data:
nu = 4
scatter_matrix = (nu-2)/nu * Sigma_true # For multivariate_t shape param
X_heavy = multivariate_t.rvs(
loc=mu_true,
shape=scatter_matrix,
df=nu,
size=200
)
X_subset = X_heavy[:T]
Sigma_scm = (X_subset.T @ X_subset) / T
create_2covariance_plot(X_heavy, Sigma_true, Sigma_scm, "Heavy-tailed Data (ν=4)")
plt.show()
- Gaussian data with outliers:
X_gaussian = multivariate_normal.rvs(
mean=mu_true, cov=Sigma_true, size=200
)
# Add outlier at (-0.15, 0.05)
X_outlier = np.vstack([[-0.15, 0.05], X_gaussian])
X_subset = X_outlier[:T]
Sigma_scm = (X_subset.T @ X_subset) / T
create_2covariance_plot(X_outlier, Sigma_true, Sigma_scm, "Gaussian Data with Outlier")
plt.show()
Heavy-tailed ML estimators¶
Recall that the ML estimators can be iteratively computed as the weighted sample mean and sample covariance matrix: $$ \begin{aligned} \boldsymbol{\mu}^{k+1} &= \frac{\frac{1}{T}\sum_{t=1}^T w_t(\boldsymbol{\mu}^k,\boldsymbol{\Sigma}^k) \times \mathbf{x}_t}{\frac{1}{T}\sum_{t=1}^T w_t(\boldsymbol{\mu}^k,\boldsymbol{\Sigma}^k)}\\ \boldsymbol{\Sigma}^{k+1} &= \frac{1}{T}\sum_{t=1}^T w_t(\boldsymbol{\mu}^{k+1},\boldsymbol{\Sigma}^k) \times (\mathbf{x}_t - \boldsymbol{\mu}^{k+1})(\mathbf{x}_t - \boldsymbol{\mu}^{k+1})^{T} \end{aligned} $$ where the weights $w_t(\boldsymbol{\mu},\boldsymbol{\Sigma})$ are defined as $$w_t(\boldsymbol{\mu},\boldsymbol{\Sigma}) = \frac{\nu+N}{\nu + (\mathbf{x}_t - \boldsymbol{\mu})^{T}\boldsymbol{\Sigma}^{-1}(\mathbf{x}_t - \boldsymbol{\mu})}.$$
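To make the updates concrete, here is a bare-bones sketch of this fixed-point iteration for a fixed $\nu$ (an illustrative implementation, not the book's; the function fit_mvt() defined next also estimates $\nu$ and adds an acceleration step):

def t_mle_fixed_nu(X, nu, n_iter=50):
    """Minimal fixed-point iteration for the t ML estimators with known nu (sketch)."""
    T, N = X.shape
    mu = np.mean(X, axis=0)              # initialize with the sample estimators
    Sigma = np.cov(X, rowvar=False)
    for _ in range(n_iter):
        Xc = X - mu
        r2 = np.sum(Xc @ np.linalg.inv(Sigma) * Xc, axis=1)  # squared Mahalanobis distances
        w = (nu + N) / (nu + r2)                             # weights w_t(mu, Sigma)
        mu = (w[:, None] * X).sum(axis=0) / w.sum()          # weighted sample mean
        Xc = X - mu
        Sigma = (Xc * w[:, None]).T @ Xc / T                 # weighted sample covariance
    return mu, Sigma  # Sigma is the scatter matrix; covariance = nu/(nu-2) * Sigma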
For convenience, we define the function fit_mvt() to estimate the mean and covariance matrix of a heavy-tailed multivariate t distribution:
def nu_POP_estimator(r2, nu, N):
"""
Estimate the degrees of freedom of a heavy-tailed t distribution based on the POP estimator.
Based on:
Frédéric Pascal, Esa Ollila, and Daniel P. Palomar, "Improved estimation of the degree of freedom parameter of
multivariate t-distribution," in Proc. European Signal Processing Conference (EUSIPCO), Dublin, Ireland, Aug. 23-27, 2021.
<https://doi.org/10.23919/EUSIPCO54536.2021.9616162>
"""
T = len(r2)
u = (N + nu) / (nu + r2 * T / (T - 1))
r2i = r2 / (1 - r2 * u / T)
theta = (1 - N / T) * np.sum(r2i) / T / N
nu = 2 * theta / (theta - 1)
nu = min(100, max(2.5, nu))
return nu
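# Usage sketch (added): a single POP update of nu, with squared Mahalanobis
# distances computed from the sample estimates; fit_mvt() below iterates this
# jointly with mu and Sigma.
_Xt = multivariate_t.rvs(loc=np.zeros(5), shape=np.eye(5), df=4, size=500, random_state=42)
_Xc = _Xt - np.mean(_Xt, axis=0)
_r2 = np.sum(_Xc @ np.linalg.inv(np.cov(_Xt, rowvar=False)) * _Xc, axis=1)
print(nu_POP_estimator(r2=_r2, nu=4, N=5))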
def fit_mvt(X, verbose=False, max_iter=100, ptol=1e-3):
"""
Fit of a multivariate t distribution to the data X.
References:
Chuanhai Liu and Donald B. Rubin, "ML estimation of the t-distribution using EM and its extensions, ECM and ECME,"
Statistica Sinica (5), pp. 19-39, 1995.
Chuanhai Liu, Donald B. Rubin, and Ying Nian Wu, "Parameter Expansion to Accelerate EM: The PX-EM Algorithm,"
Biometrika, Vol. 85, No. 4, pp. 755-770, Dec., 1998
Rui Zhou, Junyan Liu, Sandeep Kumar, and Daniel P. Palomar, "Robust factor analysis parameter estimation,"
Lecture Notes in Computer Science (LNCS), 2019. <https://arxiv.org/abs/1909.12530>
Esa Ollila, Daniel P. Palomar, and Frédéric Pascal, "Shrinking the Eigenvalues of M-estimators of Covariance Matrix,"
IEEE Trans. on Signal Processing, vol. 69, pp. 256-269, Jan. 2021. <https://doi.org/10.1109/TSP.2020.3043952>
Frédéric Pascal, Esa Ollila, and Daniel P. Palomar, "Improved estimation of the degree of freedom parameter of
multivariate t-distribution," in Proc. European Signal Processing Conference (EUSIPCO), Dublin, Ireland, Aug. 23-27, 2021.
<https://doi.org/10.23919/EUSIPCO54536.2021.9616162>
"""
X = np.asarray(X)
T, N = np.shape(X)
# initial points
nu = 4
mu = np.mean(X, axis=0)
cov = np.cov(X, rowvar=False)
Sigma = (nu - 2)/nu * cov
def fnu(nu):
return nu/(nu - 2)
#
# loop
#
Sigma_diff_record = np.empty(max_iter)
for k in range(max_iter):
nu_prev = np.copy(nu)
mu_prev = np.copy(mu)
Sigma_prev = np.copy(Sigma)
# intermediate variables
Xc = X - mu
#r2 = np.sum(Xc * np.dot(Xc, np.linalg.inv(Sigma)), axis=1) # np.diag( Xc @ np.linalg.inv(Sigma) @ t(Xc) )
r2 = np.sum(Xc * np.linalg.solve(Sigma + 1e-6 * np.eye(N), Xc.T).T, axis=1)
# update nu, mu, Sigma
nu = nu_POP_estimator(r2=r2, nu=nu, N=N)
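        # E-step weights: E[tau_t | x_t] = (N + nu)/(nu + r2_t), i.e., the w_t(mu, Sigma) above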
E_tau = (N + nu) / (nu + r2)
ave_E_tau = np.mean(E_tau)
ave_E_tau_X = np.mean(X * E_tau[:, None], axis=0)
mu = ave_E_tau_X / ave_E_tau
Xc = X - mu
ave_E_tau_XX = (1 / T) * np.dot((Xc * np.sqrt(E_tau)[:, None]).T, Xc * np.sqrt(E_tau)[:, None])
#ave_E_tau_XX_ = (1 / T) * Xc.T @ np.diag(E_tau) @ Xc
Sigma = ave_E_tau_XX
Sigma = Sigma / ave_E_tau # PX_EM_acceleration
# stopping criterion
Sigma_diff_record[k] = np.linalg.norm(Sigma - Sigma_prev) / np.linalg.norm(Sigma_prev)
if (np.all(np.abs(mu - mu_prev) <= .5 * ptol * (np.abs(mu_prev) + np.abs(mu))) and
np.abs(fnu(nu) - fnu(nu_prev)) <= .5 * ptol * (np.abs(fnu(nu_prev)) + np.abs(fnu(nu))) and
np.all(np.abs(Sigma - Sigma_prev) <= .5 * ptol * (np.abs(Sigma_prev) + np.abs(Sigma)))):
break
    Sigma_diff_record = Sigma_diff_record[:k + 1]
Sigma_cov = nu / (nu - 2) * Sigma
if verbose:
fig, ax = plt.subplots(figsize=(16, 6))
        ax.plot(np.arange(1, k + 2), Sigma_diff_record)
plt.xlabel("Iterations")
plt.ylabel("Sigma differences")
plt.show()
return mu, Sigma_cov
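As a quick numerical sanity check (an added sketch, not from the book code), on heavy-tailed synthetic data the robust fit should typically achieve a smaller covariance error than the sample covariance matrix:

# Sanity check on synthetic t-distributed data (nu=4)
np.random.seed(42)
nu_chk, N_chk, T_chk = 4, 20, 500
Sigma_chk = np.eye(N_chk)
X_chk = multivariate_t.rvs(loc=np.zeros(N_chk), shape=(nu_chk - 2)/nu_chk * Sigma_chk,
                           df=nu_chk, size=T_chk)
_, Sigma_robust = fit_mvt(X_chk)
err = lambda S: np.linalg.norm(S - Sigma_chk, 'fro') / np.linalg.norm(Sigma_chk, 'fro')
print("SCM error:         ", err(np.cov(X_chk, rowvar=False)))
print("heavy-tailed error:", err(Sigma_robust))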
Now we can assess the effect of heavy tails and outliers on the heavy-tailed ML covariance matrix estimator:
# Parameters
mu_true = np.array([0.0, 0.0])
Sigma_true = np.array([[0.0012, 0.0011],
[0.0011, 0.0014]])
T = 10
np.random.seed(42)
def create_3covariance_plot(X, Sigma_true, Sigma_scm, Sigma_heavytail, title):
"""Helper function to create consistent plots"""
fig, ax = plt.subplots(figsize=(10, 8))
# Scatter plot
ax.scatter(X[:,0], X[:,1], s=8, color='black', alpha=0.8)
# Add ellipses
for cov, color, label in zip([Sigma_true, Sigma_scm, Sigma_heavytail],
['blue', 'red', 'green'],
['True', 'Gaussian ML', 'Heavy-tailed ML']):
ellipse = confidence_ellipse(cov, [0,0], ax=ax,
edgecolor=color,
linewidth=1.5)
ax.add_patch(ellipse)
# Plot configuration
ax.set_xlim(-0.2, 0.2)
ax.set_ylim(-0.2, 0.2)
ax.set_title(title, pad=20)
ax.legend(handles=[plt.Line2D([0], [0], color=c, lw=2)
for c in ['blue', 'red', 'green']],
labels=['True covariance', 'Gaussian ML estimator', 'Heavy-tailed ML estimator'])
plt.tight_layout()
return fig
- Gaussian data:
# Data generation
X_gaussian = multivariate_normal.rvs(
mean=mu_true, cov=Sigma_true, size=200
)
X_subset = X_gaussian[:T]
# Estimators
Sigma_scm = (X_subset.T @ X_subset) / T
mu_heavytail, Sigma_heavytail = fit_mvt(X_subset)
# Plot
create_3covariance_plot(X_gaussian, Sigma_true, Sigma_scm, Sigma_heavytail, "Gaussian Data")
plt.show()
- Heavy-tailed data:
# Data generation
nu = 4
scatter_matrix = (nu-2)/nu * Sigma_true # For multivariate_t shape param
X_heavy = multivariate_t.rvs(
loc=mu_true,
shape=scatter_matrix,
df=nu,
size=200
)
X_subset = X_heavy[:T]
# Estimators
Sigma_scm = (X_subset.T @ X_subset) / T
mu_heavytail, Sigma_heavytail = fit_mvt(X_subset)
# Plot
create_3covariance_plot(X_heavy, Sigma_true, Sigma_scm, Sigma_heavytail, "Heavy-tailed Data (ν=4)")
plt.show()
- Gaussian data with outliers:
# Data generation
X_gaussian = multivariate_normal.rvs(
mean=mu_true, cov=Sigma_true, size=200
)
# Add outlier at (-0.15, 0.05)
X_outlier = np.vstack([[-0.15, 0.05], X_gaussian])
X_subset = X_outlier[:T]
# Estimators
Sigma_scm = (X_subset.T @ X_subset) / T
mu_heavytail, Sigma_heavytail = fit_mvt(X_subset)
# Plot
create_3covariance_plot(X_outlier, Sigma_true, Sigma_scm, Sigma_heavytail, "Gaussian Data with Outlier")
plt.show()
Estimation error of different ML estimators versus number of observations (for $t$-distributed heavy-tailed data with $\nu=4$ and $N=100$):
# Configuration
np.random.seed(42)
N = 100
T_max = 1000
n_trials = 100
nu = 4
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
# Generate synthetic heavy-tailed data
scatter_true = ((nu-2)/nu) * Sigma_true
X_heavytailed = multivariate_t.rvs(
loc=mu_true,
shape=scatter_true,
df=nu,
size=10*T_max
)
# Benchmarking framework
T_sweep = np.arange(150, T_max+1, 50)
results = []
for T in T_sweep:
for _ in range(n_trials):
# Random subsample
        idx = np.random.choice(len(X_heavytailed), size=T, replace=False)
        X_sub = X_heavytailed[idx]
# Gaussian MLE
mu_gauss = np.mean(X_sub, axis=0)
Sigma_gauss = (T-1)/T * np.cov(X_sub.T)
        # Heavy-tailed MLE (fit_mvt defined above)
mu_heavy, Sigma_heavy = fit_mvt(X_sub)
# Calculate errors
errors = {
'Gaussian MLE': {
'mu': np.linalg.norm(mu_gauss - mu_true)/np.linalg.norm(mu_true),
'Sigma': np.linalg.norm(Sigma_gauss - Sigma_true, 'fro')/np.linalg.norm(Sigma_true, 'fro')
},
'heavy-tailed MLE': {
'mu': np.linalg.norm(mu_heavy - mu_true)/np.linalg.norm(mu_true),
'Sigma': np.linalg.norm(Sigma_heavy - Sigma_true, 'fro')/np.linalg.norm(Sigma_true, 'fro')
}
}
for method in errors:
results.append({
'T': T,
'method': method,
'error mu': errors[method]['mu'],
'error Sigma': errors[method]['Sigma']
})
# Process results
df = pd.DataFrame(results)
df_agg = df.groupby(['method', 'T'], as_index=False).agg({
'error mu': lambda x: 100*np.mean(x),
'error Sigma': lambda x: 100*np.mean(x)
})
# Visualization
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)
for method in df_agg['method'].unique():
subset = df_agg[df_agg['method'] == method]
ax1.plot(subset['T'], subset['error mu'], label=method)
ax2.plot(subset['T'], subset['error Sigma'], label=method)
ax1.set_title('Error of Mean Estimators')
ax2.set_title('Error of Covariance Estimators')
ax2.set_xlabel('Sample Size (T)')
ax1.set_ylabel('Normalized Error (%)')
ax2.set_ylabel('Normalized Error (%)')
ax1.legend()
ax1.grid(True)
ax2.grid(True)
plt.tight_layout()
plt.show()
Estimation error of different ML estimators versus degrees of freedom in a $t$ distribution (with $T=200$ and $N=100$):
# Configuration
np.random.seed(42)
N = 100
T = 200
n_trials = 200
nu_values = np.arange(3, 31, 2)
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
# Benchmarking framework
results = []
for nu in tqdm(nu_values, desc="Processing ν values"):
scatter_matrix = ((nu - 2)/nu) * Sigma_true
for _ in range(n_trials):
# Generate multivariate t-distributed data
X = multivariate_t.rvs(
loc=mu_true,
shape=scatter_matrix,
df=nu,
size=T,
)
# Gaussian MLE
mu_gauss = np.mean(X, axis=0)
Sigma_gauss = (T-1)/T * np.cov(X, rowvar=False)
mu_error_gauss = np.linalg.norm(mu_gauss - mu_true) / np.linalg.norm(mu_true)
Sigma_error_gauss = np.linalg.norm(Sigma_gauss - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro')
results.append({
'ν': nu,
'method': 'Gaussian MLE',
'error mu': 100 * mu_error_gauss,
'error Sigma': 100 * Sigma_error_gauss
})
        # Heavy-tailed MLE (fit_mvt defined above)
        mu_heavy, Sigma_heavy = fit_mvt(X)
mu_error_heavy = np.linalg.norm(mu_heavy - mu_true) / np.linalg.norm(mu_true)
Sigma_error_heavy = np.linalg.norm(Sigma_heavy - Sigma_true, 'fro') / np.linalg.norm(Sigma_true, 'fro')
results.append({
'ν': nu,
'method': 'Heavy-tailed MLE',
'error mu': 100 * mu_error_heavy,
'error Sigma': 100 * Sigma_error_heavy
})
# Create and process DataFrame
df = pd.DataFrame(results)
df_agg = df.groupby(['method', 'ν'], as_index=False).agg({
'error mu': 'mean',
'error Sigma': 'mean'
})
# Visualization
plt.figure(figsize=(12, 10))
# Plot for mean estimation error
plt.subplot(2, 1, 1)
for method in df_agg['method'].unique():
subset = df_agg[df_agg['method'] == method]
plt.plot(subset['ν'], subset['error mu'], label=method)
plt.title('Estimation error of mean estimators')
plt.xlabel('ν')
plt.ylabel('Normalized Error (%)')
plt.legend()
plt.grid(True)
# Plot for covariance estimation error
plt.subplot(2, 1, 2)
for method in df_agg['method'].unique():
subset = df_agg[df_agg['method'] == method]
plt.plot(subset['ν'], subset['error Sigma'], label=method)
plt.title('Estimation error of covariance estimators')
plt.xlabel('ν')
plt.ylabel('Normalized Error (%)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
Convergence of iterative robust heavy-tailed ML estimators:
# Configuration
np.random.seed(42)
N = 100
T = 200
n_trials = 200
nu_values = np.arange(3, 31, 2)
# Get realistic mu and Sigma
X_market = np.log(SP500_stocks_2015to2020).diff().dropna().to_numpy()[:, :N]
mu_true = np.mean(X_market, axis=0)
Sigma_true = np.cov(X_market.T)
nu = 4
scatter_matrix = (nu-2)/nu * Sigma_true
# Generate multivariate t-distributed data
X = multivariate_t.rvs(
loc=mu_true,
shape=scatter_matrix,
df=nu,
size=T,
)
# heavy-tailed MLE
mu_heavy, Sigma_heavy = fit_mvt(X, verbose=True)
Prior information¶
Shrinkage¶
Factor models¶