R Code for Portfolio Optimization
Chapter 4 – Financial Data: Graphs

Daniel P. Palomar (2025). Portfolio Optimization: Theory and Application. Cambridge University Press.

Last update: April 14, 2025


Loading packages

The following packages are used in the examples:

# graphs
library(clusterSim)             # graph datasets
library(igraph)                 # for graph related functions
library(dbscan)                 # for kNN computation
library(glasso)                 # GLASSO implementation
library(spectralGraphTopology)  # implementation of many graph learning functions
library(CVXR)                   # convex optimization (used below in W_kalofolias_cvx)

For convenience, we define the following wrapper function (based on the package igraph) to plot graphs:

library(igraph)

# define function to plot nice graphs
plot_graph <- function(W, L = NULL, main = "", layout = NULL, clusters, edges = TRUE, labels = FALSE) {
  # if the Laplacian is passed instead, then use it to get W
  if (!is.null(L)) {
    W <- abs(L)
    diag(W) <- 0
  }
  # default layout
  if(is.null(layout)) layout <- layout_with_fr
  
  # build graph
  graph <- graph_from_adjacency_matrix(W, mode = "undirected", weighted = TRUE)
  
  # colorify nodes and edges
  V(graph)$cluster <- clusters
  colors <- c("#706FD3", "#FF5252", "#33D9B2")
  V(graph)$color <- colors[clusters]  #V(graph)$color <- clusters
  if (edges)
    E(graph)$color <- apply(as.data.frame(as_edgelist(graph)), 1,
                        function(x) {
                          x <- as.numeric(x)
                          ifelse(V(graph)$cluster[x[1]] == V(graph)$cluster[x[2]],
                                 colors[V(graph)$cluster[x[1]]], "grey")
                        })
  else
    E(graph)$color <- NA
    
  E(graph)$weight <- E(graph)$weight / max(E(graph)$weight)
  E(graph)$width <- 2*E(graph)$weight

  plot(graph, layout = layout,
       edge.width = E(graph)$width, 
       vertex.size = 3,
       vertex.label = NA, #vertex.label.cex = 0.8, vertex.label.dist = 0.7, vertex.label.color = "black",
       edge.curved = 0.25, 
       main = main)
}
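
As a quick illustration of how plot_graph is meant to be called, here is a minimal usage sketch with a made-up 3-node adjacency matrix and cluster labels (illustrative only, not one of the book examples):

# toy 3-node weighted graph with two clusters (illustrative only)
W_toy <- rbind(c(0, 1, 0),
               c(1, 0, 2),
               c(0, 2, 0))
plot_graph(W = W_toy, clusters = c(1, 1, 2), main = "Toy 3-node graph")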

Learning graphs from similarity measures

First load some toy data:

library(clusterSim)
set.seed(42)
data2moons <- shapes.two.moon(100)
data3circles <- shapes.circles3(100)

Let’s define the functions to learn the following graphs (in the form of the adjacency matrix \bm{W}) based on similarity measures:

  • thresholded distance graph
  • Gaussian graph
  • thresholded Gaussian graph
  • kNN graph
  • feature correlation graph
  • self-tuned Gaussian graph

library(dbscan)

W_thresholded <- function(X, threshold = NULL) {
  # compute Euclidean distance matrix
  euclid_dist <- as.matrix(dist(X, method = "euclidean", diag = FALSE, upper = FALSE))
  # choose threshold automatically if needed
  if (is.null(threshold)) {
    max_dist <- max(euclid_dist)
    min_dist <- min(euclid_dist[euclid_dist != 0])
    threshold <- min_dist + 0.15*(max_dist-min_dist)
  }
  # generate W
  W <- ifelse(euclid_dist <= threshold, 1, 0)  
  diag(W) <- 0
  return(W)
}

W_Gaussian <- function(X, sigma2 = 0.1) {
  # compute Euclidean distance matrix
  euclid_dist <- as.matrix(dist(X, method = "euclidean", diag = FALSE, upper = FALSE))
  # choose sigma2 automatically if needed
  if (is.null(sigma2))
    sigma2 <- mean(euclid_dist^2)/10
  # generate W
  W <- exp(-(euclid_dist^2)/(2*sigma2))
  diag(W) <- 0
  return(W)
}

W_thresholded_Gaussian <- function(X, sigma2 = 0.1, threshold = NULL) {
  # compute Euclidean distance matrix
  euclid_dist <- as.matrix(dist(X, method = "euclidean", diag = FALSE, upper = FALSE))
  # choose sigma2 automatically if needed
  if (is.null(sigma2))
    sigma2 <- mean(euclid_dist^2)/10
  # choose threshold automatically if needed
  if (is.null(threshold)) {
    max_dist <- max(euclid_dist)
    min_dist <- min(euclid_dist[euclid_dist != 0])
    threshold <- min_dist + 0.15*(max_dist-min_dist)
  }  
  # generate W
  W <- exp(-(euclid_dist^2)/(2*sigma2))
  W[euclid_dist > threshold] <- 0  # threshold
  diag(W) <- 0
  return(W)
}

W_knn <- function(X, k = NULL) {
  p <- nrow(X)
  # choose k automatically if needed
  if (is.null(k))
    k <- 1 + floor(log2(p))
  # compute the indices of the k nearest neighbors of each point
  knn_nodes <- dbscan::kNN(X, k, sort = FALSE)$id
  # generate W
  W <- matrix(0, p, p)
  for (i in 1:p)
    W[knn_nodes[i, ], i] <- W[i, knn_nodes[i, ]] <- 1
  colnames(W) <- rownames(W) <- rownames(X)
  return(W)
}

W_feature_correlation <- function(X, threshold = 0.5) {
  W <- X %*% t(X)
  W[abs(W) < threshold*mean(diag(W))] <- 0
  diag(W) <- 0  
  return(W)
}

W_selftuned_Gaussian <- function(X, k = NULL) {
  p <- nrow(X)
  # choose k automatically if needed
  if (is.null(k))
    k <- floor(1 + log2(p))  #0.5*log2(p)
  # compute Euclidean distance matrix and knn distances
  knn_dist <- dbscan::kNN(X, k, sort = TRUE)$dist[, k]
  euclid_dist2 <- as.matrix(dist(X, method = "euclidean", diag = FALSE, upper = FALSE))^2
  #euclid_dist2 <- diag(1/knn_dist) %*% euclid_dist2 %*% diag(1/knn_dist)
  euclid_dist2 <- t((1/knn_dist) * t((1/knn_dist) * euclid_dist2))
  # generate W
  W <- exp(-euclid_dist2/2)
  diag(W) <- 0
  return(W)
}
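
Before plotting, it is useful to sanity-check the learned adjacency matrices: they should be symmetric, nonnegative, and have a zero diagonal. An illustrative check (not in the original text) for the k-NN graph on the two-moon data:

W <- W_knn(data2moons$data)
isSymmetric(W)      # TRUE by construction (W is symmetrized)
all(diag(W) == 0)   # hollow adjacency matrix
range(rowSums(W))   # node degrees: at least k after the symmetrization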

Graphs learned based on similarity measures for the two-moon dataset:

old_par <- par(mfcol = c(2, 3), mar = c(0.6, 0, 0.8, 0))   # b-l-t-r
# thresholded distance graph
plot_graph(W = W_thresholded(data2moons$data), 
           layout = data2moons$data, clusters = data2moons$clusters,
           main = "Thresholded distance 0-1 graph")
# Gaussian graph
plot_graph(W = W_Gaussian(data2moons$data), 
           layout = data2moons$data, clusters = data2moons$clusters,
           main = "Gaussian graph")
# thresholded Gaussian graph
plot_graph(W = W_thresholded_Gaussian(data2moons$data), 
           layout = data2moons$data, clusters = data2moons$clusters,
           main = "Thresholded Gaussian graph")
# k-NN graph
plot_graph(W = W_knn(data2moons$data),
           layout = data2moons$data, clusters = data2moons$clusters,
           main = "k-NN 0-1 graph")
# feature correlation graph
plot_graph(W = W_feature_correlation(data2moons$data),
           layout = data2moons$data, clusters = data2moons$clusters,
           main = "Feature correlation graph")
# self-tuned Gaussian graph
plot_graph(W = W_selftuned_Gaussian(data2moons$data, k = 4),
           layout = data2moons$data, clusters = data2moons$clusters,
           main = "Self-tuned Gaussian graph")
par(old_par)

Graphs learned based on similarity measures for the three-circle dataset:

old_par <- par(mfcol = c(2, 3), mar = c(0.6, 0, 0.8, 0))   # b-l-t-r
# thresholded distance graph
plot_graph(W = W_thresholded(data3circles$data), 
           layout = data3circles$data, clusters = data3circles$clusters,
           main = "Thresholded distance 0-1 graph")
# Gaussian graph
plot_graph(W = W_Gaussian(data3circles$data), 
           layout = data3circles$data, clusters = data3circles$clusters,
           main = "Gaussian graph")
# thresholded Gaussian graph
plot_graph(W = W_thresholded_Gaussian(data3circles$data), 
           layout = data3circles$data, clusters = data3circles$clusters,
           main = "Thresholded Gaussian graph")
# k-NN graph
plot_graph(W = W_knn(data3circles$data),
           layout = data3circles$data, clusters = data3circles$clusters,
           main = "k-NN 0-1 graph")
# feature correlation graph
plot_graph(W = W_feature_correlation(data3circles$data),
           layout = data3circles$data, clusters = data3circles$clusters,
           main = "Feature correlation graph")
# self-tuned Gaussian graph
plot_graph(W = W_selftuned_Gaussian(data3circles$data, k = 4),
           layout = data3circles$data, clusters = data3circles$clusters,
           main = "Self-tuned Gaussian graph")
par(old_par)

Learning graphs from smooth signals

We now learn the following graphs (in the form of the adjacency matrix \bm{W}) from smooth signals:

  • Graph with fixed degrees (Nie et al., 2016): \[ \begin{array}{ll} \underset{\bm{W}}{\textm{minimize}} & \textm{Tr}(\bm{W}\bm{Z}) + \gamma\|\bm{W}\|_\textm{F}^2\\ \textm{subject to} & \textm{diag}(\bm{W})=\bm{0}, \quad \bm{W}=\bm{W}^\T \ge \bm{0}\\ & \bm{W}\bm{1} = \bm{1}. \end{array} \]

  • Graph with regularized degrees (Kalofolias, 2016): \[ \begin{array}{ll} \underset{\bm{W}}{\textm{minimize}} & \textm{Tr}(\bm{W}\bm{Z}) - \alpha\bm{1}^\T\textm{log}(\bm{W}\bm{1}) + \frac{\beta}{2}\|\bm{W}\|_\textm{F}^2\\ \textm{subject to} & \textm{diag}(\bm{W})=\bm{0}, \quad \bm{W}=\bm{W}^\T \ge \bm{0}, \end{array} \]

  • Clustered \(k\)-component graph based on the rank-constrained Laplacian algorithm (Nie et al., 2016), which is implemented in the package spectralGraphTopology: \[ \begin{array}{ll} \underset{\bm{W}}{\textm{minimize}} & \|\bm{W} - \bm{W}_0\|_\textm{F}^2\\ \textm{subject to} & \bm{W}\bm{1} = \bm{1}, \quad \textm{diag}(\bm{W})=\bm{0}, \quad \bm{W}=\bm{W}^\T \ge \bm{0}\\ & \textm{rank}\left(\mathcal{L}(\bm{W})\right) = p - k. \end{array} \]

W_smooth <- function(X, m = NULL) {
  p <- nrow(X)
  # choose m automatically if needed
  if (is.null(m)) 
    m <- ceiling(0.08*p)
  # compute squared Euclidean distance matrix
  Z <- as.matrix(dist(X, method = "euclidean", diag = FALSE, upper = FALSE))^2
  diag(Z) <- 10*max(Z)
  # generate W
  W <- matrix(0, p, p)
  for (i in 1:p) {
    z <- Z[, i]
    z_sorted <- sort(z)
    w_ <- pmax(z_sorted[m+1] - z, 0)
    W[, i] <- w_/sum(w_)
  }
  W <- 1/2*(W + t(W)) # symmetrize
  colnames(W) <- rownames(W) <- rownames(X)
  return(W)
}
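
For clarity, the loop in W_smooth solves, node by node, a per-node version of the fixed-degree problem above (the adaptive-neighbors construction of Nie et al., 2016); the following is a sketch of the reasoning, with \bm{z}_i denoting the \(i\)-th column of \bm{Z}: \[ \begin{array}{ll} \underset{\bm{w}_i}{\textm{minimize}} & \bm{w}_i^\T\bm{z}_i + \gamma_i\|\bm{w}_i\|_2^2\\ \textm{subject to} & \bm{w}_i^\T\bm{1} = 1, \quad \bm{w}_i \ge \bm{0}. \end{array} \] When \(\gamma_i\) is chosen so that exactly \(m\) weights are nonzero, the solution is \(w_{ij} \propto \left(z_{i,(m+1)} - z_{ij}\right)^+\), with \(z_{i,(m+1)}\) the \((m+1)\)-th smallest entry of \bm{z}_i, which is precisely the quantity computed by pmax(z_sorted[m+1] - z, 0) followed by the normalization in the code.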

W_kalofolias_cvx <- function(X, alpha = 1e-2, beta = 1e-4) {
  p <- nrow(X)
  
  # compute squared Euclidean distance matrix
  Z <- as.matrix(dist(X, method = "euclidean", diag = FALSE, upper = FALSE))^2
  
  # optimize
  W <- Variable(p, p, symmetric = TRUE)  # symmetric matrix variable
  objective <- .5 * sum_entries(W * Z) - alpha * sum_entries(log(sum_entries(W, axis = 1))) + .5 * beta * sum_squares(W)
  #objective_value <- .5 * sum(W * Z) - alpha * sum(log(rowSums(W))) + .5 * beta * norm(W, "F")^2
  constraints <- list(W >= 0, 
                      diag(W) == 0,
                      sum_entries(W, axis = 1) >= 1e-3)
  problem <- Problem(Minimize(objective), constraints)
  result <- solve(problem, solver = "SCS")
  if (result$status != "optimal") message("Warning: CVX status: ", result$status)
  W <- as.matrix(result$getValue(W))
  W[W < 1e-7] <- 0
  return(W)
}

W_kalofolias <- function(X, m = NULL, alpha_over_beta = 1e2) {
  p <- nrow(X)
  # choose m automatically if needed
  if (is.null(m)) 
    m <- ceiling(0.08*p)
  # compute squared Euclidean distance matrix
  Z <- as.matrix(dist(X, method = "euclidean", diag = FALSE, upper = FALSE))^2
  diag(Z) <- 10*max(Z)
  # generate W
  W <- matrix(0, p, p)
  for (i in 1:p) {
    z <- Z[, i]
    z_sorted <- sort(z)
    w_ <- alpha_over_beta * pmax(1 - z/z_sorted[m+1], 0)
    W[, i] <- w_/sqrt(sum(w_))
  }
  W <- 1/2*(W + t(W)) # symmetrize
  colnames(W) <- rownames(W) <- rownames(X)
  return(W)
}
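
As a quick numerical check (illustrative, not part of the original text), we can compare the node degrees \bm{W}\bm{1} produced by the two constructions: the fixed-degree graph has column sums exactly equal to one before the final symmetrization (and degrees close to one after), whereas the regularized-degree graph only keeps the degrees away from zero through the log barrier. The CVX-based W_kalofolias_cvx can be used in the same way, at a higher computational cost.

degrees_fixed <- rowSums(W_smooth(data2moons$data, m = 20))
degrees_reg   <- rowSums(W_kalofolias(data2moons$data, m = 20))
summary(degrees_fixed)   # close to 1 by construction
summary(degrees_reg)     # positive, but not fixed to 1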

Graphs learned based on smooth signals for the two-moon dataset:

library(spectralGraphTopology)

old_par <- par(mfcol = c(1, 3), mar = c(0, 0, 0.8, 0))   # b-l-t-r
plot_graph(W = W_smooth(data2moons$data, m = 20), 
           layout = data2moons$data, clusters = data2moons$clusters,
           main = "Graph with fixed degrees")
plot_graph(W = W_kalofolias(X = data2moons$data, m = 20), 
           layout = data2moons$data, clusters = data2moons$clusters,
           main = "Graph with regularized degrees")
plot_graph(W = cluster_k_component_graph(data2moons$data, k = 2, m = 20)$adjacency, 
           layout = data2moons$data, clusters = data2moons$clusters,
           main = "Low-rank graph")
par(old_par)
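
Because the rank constraint \(\textm{rank}\left(\mathcal{L}(\bm{W})\right) = p - k\) forces the Laplacian to have \(k\) zero eigenvalues, the learned low-rank graph should split into \(k\) connected components. A quick check with igraph (illustrative; the small threshold is an arbitrary choice to discard numerically negligible edges):

A <- as.matrix(cluster_k_component_graph(data2moons$data, k = 2, m = 20)$adjacency)
A[A < 1e-5] <- 0          # drop numerically negligible edges
A <- (A + t(A))/2         # enforce exact symmetry
g <- graph_from_adjacency_matrix(A, mode = "undirected", weighted = TRUE)
components(g)$no          # number of connected components (ideally k = 2)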

Graphs learned based on smooth signals for the three-circle dataset:

old_par <- par(mfcol = c(1, 3), mar = c(0, 0, 0.8, 0))   # b-l-t-r
plot_graph(W = W_smooth(data3circles$data, m = 20),
           layout = data3circles$data, clusters = data3circles$clusters,
           main = "Graph with fixed degrees")
plot_graph(W = W_kalofolias(X = data3circles$data, m = 20),
           layout = data3circles$data, clusters = data3circles$clusters,
           main = "Graph with regularized degrees")
plot_graph(W = cluster_k_component_graph(data3circles$data, k = 3, m = 20)$adjacency,
           layout = data3circles$data, clusters = data3circles$clusters,
           main = "Low-rank graph")
par(old_par)