Portfolio Optimization
Appendix B: Optimization Algorithms
R code

R code examples for Appendix B of the book:

Daniel P. Palomar (2024). Portfolio Optimization: Theory and Application. Cambridge University Press.

Loading packages
First load the packages used in the examples:

# optimization
library(CVXR)
# plotting
library(ggplot2)          # for nice plots
library(scales)
# others
library(dplyr)            # data frame manipulation
library(microbenchmark)   # CPU time computation
Algorithms
We will now compare the different algorithms with the \(\ell_2 - \ell_1\)-norm minimization example: \[ \begin{array}{ll} \underset{\bm{x}}{\textm{minimize}} & \frac{1}{2}\|\bm{A}\bm{x} - \bm{b}\|_2^2 + \lambda\|\bm{x}\|_1. \end{array} \] To facilitate comparison, we express the iterates of the various algorithms in a uniform manner, utilizing the soft-thresholding operator \(\mathcal{S}_\lambda(\cdot)\): \[ \mathcal{S}_\lambda(u) = \textm{sign}(u)(|u| - \lambda)^+ \] where \((\cdot)^+=\textm{max}(0,\cdot)\).
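As a quick sanity check, the operator acts elementwise and can be implemented in R as a one-liner (the same implementation used in the code below); for instance, with \(\lambda = 1\):

soft_thresholding <- function(u, lmd) sign(u)*pmax(0, abs(u) - lmd)
soft_thresholding(c(-3, -0.5, 0, 0.5, 3), lmd = 1)
#> [1] -2  0  0  0  2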
BCD (a.k.a. Gauss-Seidel) iterates: \[ \bm{x}^{k+1} = \mathcal{S}_{\frac{\lambda}{\textm{diag}(\bm{A}^\T\bm{A})}}\left(\bm{x}^k - \frac{\bm{A}^\T\left(\bm{A}\bm{x}^{(k,i)} - \bm{b}\right)}{\textm{diag}\left(\bm{A}^\T\bm{A}\right)}\right), \qquad i=1,\dots,n,\quad k=0,1,2,\dots \] where \(\bm{x}^{(k,i)} \triangleq \left(x_1^{k+1},\dots,x_{i-1}^{k+1}, x_{i}^{k},\dots,x_{n}^{k}\right).\)
Parallel version of BCD (a.k.a. Jacobi) iterates: \[ \bm{x}^{k+1} = \mathcal{S}_{\frac{\lambda}{\textm{diag}(\bm{A}^\T\bm{A})}}\left(\bm{x}^k - \frac{\bm{A}^\T\left(\bm{A}\bm{x}^{k} - \bm{b}\right)}{\textm{diag}\left(\bm{A}^\T\bm{A}\right)}\right), \qquad i=1,\dots,n,\quad k=0,1,2,\dots \]
MM iterates: \[ \bm{x}^{k+1} = \mathcal{S}_{\frac{\lambda}{\kappa}} \left( \bm{x}^k - \frac{1}{\kappa}\bm{A}^\T\left(\bm{A}\bm{x}^k - \bm{b}\right) \right), \qquad k=0,1,2,\dots \]
Accelerated MM iterates: \[ \begin{aligned} \bm{r}^k &= R(\bm{x}^k) \triangleq \textm{MM}(\bm{x}^k) - \bm{x}^k\\ \bm{v}^k &= R(\textm{MM}(\bm{x}^k)) - R(\bm{x}^k)\\ \alpha^k &= -\textm{max}\left(1, \|\bm{r}^k\|_2/\|\bm{v}^k\|_2\right)\\ \bm{y}^k &= \bm{x}^k - \alpha^k \bm{r}^k\\ \bm{x}^{k+1} &= \textm{MM}(\bm{y}^k) \end{aligned} \qquad k=0,1,2,\dots \]
SCA iterates: \[ \begin{array}{ll} \begin{aligned} \hat{\bm{x}}^{k+1} &= \mathcal{S}_{\frac{\lambda}{\tau + \textm{diag}(\bm{A}^\T\bm{A})}} \left(\bm{x}^k - \frac{\bm{A}^\T\left(\bm{A}\bm{x}^k - \bm{b}\right)}{\tau + \textm{diag}\left(\bm{A}^\T\bm{A}\right)}\right)\\ \bm{x}^{k+1} &= \gamma^k \hat{\bm{x}}^{k+1} + \left(1 - \gamma^k\right) \bm{x}^{k} \end{aligned} \qquad k=0,1,2,\dots \end{array} \]
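The step size \(\gamma^k\) is a diminishing sequence: the experiments below use \(\tau = 10^{-6}\), initialize \(\gamma^0 = 1\), and shrink the step size after every iteration via \[ \gamma^{k+1} = \gamma^k\left(1 - \varepsilon\,\gamma^k\right), \qquad \varepsilon = 10^{-2}, \] which keeps \(\gamma^k \in (0,1]\) and drives it slowly to zero.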
ADMM iterates: \[ \begin{aligned} \bm{x}^{k+1} &= \left(\bm{A}^\T\bm{A} + \rho\bm{I}\right)^{-1}\left(\bm{A}^\T\bm{b} +\rho\left(\bm{z}^k - \bm{u}^k\right)\right)\\ \bm{z}^{k+1} &= \mathcal{S}_{\lambda/\rho} \left(\bm{x}^{k+1} + \bm{u}^k\right)\\ \bm{u}^{k+1} &= \bm{u}^{k} + \left(\bm{x}^{k+1} - \bm{z}^{k+1}\right) \end{aligned} \qquad k=0,1,2,\dots \]
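Note that the matrix \(\bm{A}^\T\bm{A} + \rho\bm{I}\) in the \(\bm{x}\)-update is fixed across iterations, so in principle its Cholesky factor can be computed once and reused, reducing every \(\bm{x}\)-update to two triangular solves. A minimal sketch in R, reusing the variable names AtA, Atb, rho, z, u, and n from the benchmarking code further below (which, for simplicity, instead calls solve() at every iteration):

# one-time factorization: upper-triangular R_chol with t(R_chol) %*% R_chol = AtA + rho*diag(n)
R_chol <- chol(AtA + rho*diag(n))
# per-iteration x-update: two triangular solves instead of a full solve()
x_new <- backsolve(R_chol, forwardsolve(t(R_chol), Atb + rho*(z - u)))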
Observe that BCD updates each element sequentially, whereas all the other methods update all the elements simultaneously. This translates into an unacceptable increase in the computational cost of BCD, since each outer iteration requires \(n\) sequential inner updates, one per element of \(\bm{x}\).
The Jacobi update is the parallel version of BCD, although in principle it is not guaranteed to converge. Interestingly, the Jacobi update looks strikingly similar to the SCA update, except that SCA includes \(\tau\) and the smoothing step, which are precisely the necessary ingredients to guarantee convergence.
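To make the connection concrete, setting \(\tau = 0\) and \(\gamma^k = 1\) in the SCA iterate recovers the Jacobi iterate exactly: \[ \bm{x}^{k+1} = \hat{\bm{x}}^{k+1}\Big|_{\tau=0} = \mathcal{S}_{\frac{\lambda}{\textm{diag}(\bm{A}^\T\bm{A})}}\left(\bm{x}^k - \frac{\bm{A}^\T\left(\bm{A}\bm{x}^{k} - \bm{b}\right)}{\textm{diag}\left(\bm{A}^\T\bm{A}\right)}\right). \]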
A hidden but critical detail of the MM method is the need to compute the largest eigenvalue of \(\bm{A}^\T\bm{A}\) in order to choose the parameter \(\kappa > \lambda_\textm{max}\left(\bm{A}^\T\bm{A}\right)\), which adds an upfront computational cost. Moreover, this value of \(\kappa\) is a very conservative upper bound used in the update of all the elements of \(\bm{x}\). In SCA, this common conservative value of \(\kappa\) is replaced by the vector \(\textm{diag}\left(\bm{A}^\T\bm{A}\right)\), which is better tailored to each element of \(\bm{x}\) and results in faster convergence.
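In practice, \(\lambda_\textm{max}\left(\bm{A}^\T\bm{A}\right)\) need not be computed exactly: the benchmarking code below estimates it with a few power iterations followed by a Rayleigh quotient. A standalone sketch of the same idea (the helper name is illustrative, not part of the book code):

# estimate lambda_max(t(A) %*% A) via power iterations plus a Rayleigh quotient
estimate_lambda_max <- function(A, n_iter = 20) {
  AtA <- t(A) %*% A
  u <- rnorm(ncol(A))
  for (i in 1:n_iter) u <- AtA %*% u            # power iterations
  as.numeric(t(u) %*% AtA %*% u / sum(u^2))     # Rayleigh quotient
}
# kappa is then chosen slightly above the estimate, e.g., kappa <- 1.1 * estimate_lambda_max(A)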
Numerical comparison
The next figures compare the convergence of these methods in terms of iterations, as well as CPU time, when solving the \(\ell_2 - \ell_1\)-norm minimization problem with \(m = 500\) linear equations and \(n = 100\) variables. Note that each outer iteration of BCD involves a sequential update of every element, which results in an extremely high CPU time, so its curve lies outside the range of the CPU-time plot. Observe also that ADMM converges to a much lower accuracy than the other methods.
Code
library(CVXR)
library(microbenchmark)
library(dplyr)
# generate data
set.seed(42)
num_times <- 10L   # to compute the cpu time
lmd <- 2
m <- 500
n <- 100
A <- matrix(rnorm(m*n), m, n)
x_true <- rnorm(n)
b <- A %*% x_true + 0.1*rnorm(m)

# get optimal value via CVX
x <- Variable(n)
prob <- Problem(Minimize(0.5*cvxr_norm(A %*% x - b, 2)^2 + lmd*cvxr_norm(x, 1)))
res <- solve(prob)
x_cvx <- res$getValue(x)
opt_value <- res$value   #0.5*sum((A %*% x_cvx - b)^2) + lmd*sum(abs(x_cvx))
# solve problem iteratively
x0 <- rnorm(n)
soft_thresholding <- function(u, lmd) sign(u)*pmax(0, abs(u) - lmd)
#
# BCD
#
x <- x0
cpu_time <- microbenchmark({
    a2 <- apply(A, 2, function(x) sum(x^2))
  }, unit = "microseconds", times = num_times)$time |> median()
df <- data.frame("k" = 0,
                 "cpu time k" = cpu_time,
                 "gap" = 0.5*sum((A %*% x - b)^2) + lmd*sum(abs(x)) - opt_value,
                 "method" = "BCD",
                 check.names = FALSE)
for (k in 1:50) {
  cpu_time <- microbenchmark({
      x_new <- x
      for (i in 1:n) {
        b_ <- b - A[, -i] %*% x_new[-i]
        x_new[i] <- soft_thresholding(t(A[, i]) %*% b_, lmd)/a2[i]
      }
    }, unit = "microseconds", times = num_times)$time |> median()
  x <- x_new

  df <- rbind(df, list("k" = k,
                       "cpu time k" = cpu_time,
                       "gap" = 0.5*sum((A %*% x - b)^2) + lmd*sum(abs(x)) - opt_value,
                       "method" = "BCD"))
}
#
# MM
#
x <- x0
cpu_time <- microbenchmark({
    Atb <- t(A) %*% b
    AtA <- t(A) %*% A
    #kappa <- 1.1 * max(eigen(AtA)$values)
    u <- x0; for (i in 1:20) u <- AtA %*% u   # power iteration method
    kappa <- 1.1 * as.numeric(t(u) %*% AtA %*% u / sum(u^2))
  }, unit = "microseconds", times = num_times)$time |> median()
df <- rbind(df, list("k" = 0,
                     "cpu time k" = cpu_time,
                     "gap" = 0.5*sum((A %*% x - b)^2) + lmd*sum(abs(x)) - opt_value,
                     "method" = "MM"))
for (k in 1:50) {
  cpu_time <- microbenchmark({
      x_new <- soft_thresholding(x - (AtA %*% x - Atb)/kappa, lmd/kappa)
    }, unit = "microseconds", times = num_times)$time |> median()
  x <- x_new

  df <- rbind(df, list("k" = k,
                       "cpu time k" = cpu_time,
                       "gap" = 0.5*sum((A %*% x - b)^2) + lmd*sum(abs(x)) - opt_value,
                       "method" = "MM"))
}
#
# Accelerated MM
#
x <- x0
cpu_time <- microbenchmark({
    Atb <- t(A) %*% b
    AtA <- t(A) %*% A
    #kappa <- 1.1 * max(eigen(AtA)$values)
    u <- x0; for (i in 1:20) u <- AtA %*% u   # power iteration method
    kappa <- 1.1 * as.numeric(t(u) %*% AtA %*% u / sum(u^2))
  }, unit = "microseconds", times = num_times)$time |> median()
df <- rbind(df, list("k" = 0,
                     "cpu time k" = cpu_time,
                     "gap" = 0.5*sum((A %*% x - b)^2) + lmd*sum(abs(x)) - opt_value,
                     "method" = "Acc-MM"))
for (k in 1:50) {
  cpu_time <- microbenchmark({
      MM_x <- soft_thresholding(x - (AtA %*% x - Atb)/kappa, lmd/kappa)
      MM_MM_x <- soft_thresholding(MM_x - (AtA %*% MM_x - Atb)/kappa, lmd/kappa)
      r <- MM_x - x
      v <- MM_MM_x - MM_x - r
      alpha <- -max(1, sqrt(sum(r^2))/sqrt(sum(v^2)))
      y <- x - alpha * r
      x_new <- soft_thresholding(y - (AtA %*% y - Atb)/kappa, lmd/kappa)
    }, unit = "microseconds", times = num_times)$time |> median()
  x <- x_new

  df <- rbind(df, list("k" = k,
                       "cpu time k" = cpu_time,
                       "gap" = 0.5*sum((A %*% x - b)^2) + lmd*sum(abs(x)) - opt_value,
                       "method" = "Acc-MM"))
}
#
# SCA
#
tau <- 1e-6
eps <- 0.01
gamma <- 1
x <- x0
cpu_time <- microbenchmark({
    Atb <- t(A) %*% b
    AtA <- t(A) %*% A
    tau_plus_a2 <- tau + diag(AtA)
  }, unit = "microseconds", times = num_times)$time |> median()
df <- rbind(df, list("k" = 0,
                     "cpu time k" = cpu_time,
                     "gap" = 0.5*sum((A %*% x - b)^2) + lmd*sum(abs(x)) - opt_value,
                     "method" = "SCA"))
for (k in 1:50) {
  cpu_time <- microbenchmark({
      x_hat <- soft_thresholding(x - (AtA %*% x - Atb)/tau_plus_a2, lmd/tau_plus_a2)
      x_new <- gamma*x_hat + (1 - gamma)*x
    }, unit = "microseconds", times = num_times)$time |> median()
  x <- x_new
  gamma <- gamma * (1 - eps*gamma)

  df <- rbind(df, list("k" = k,
                       "cpu time k" = cpu_time,
                       "gap" = 0.5*sum((A %*% x - b)^2) + lmd*sum(abs(x)) - opt_value,
                       "method" = "SCA"))
}
#
# ADMM
#
rho <- 1.0
x <- x0
z <- x
u <- rep(0, n)
cpu_time <- microbenchmark({
    Atb <- t(A) %*% b
    AtA <- t(A) %*% A
  }, unit = "microseconds", times = num_times)$time |> median()
df <- rbind(df, list("k" = 0,
                     "cpu time k" = cpu_time,
                     "gap" = 0.5*sum((A %*% x - b)^2) + lmd*sum(abs(x)) - opt_value,
                     "method" = "ADMM"))
for (k in 1:50) {
  cpu_time <- microbenchmark({
      x_new <- solve(AtA + rho*diag(n), Atb + rho*(z - u))
      z_new <- soft_thresholding(x_new + u, lmd/rho)
      u_new <- u + (x_new - z_new)
    }, unit = "microseconds", times = num_times)$time |> median()
  x <- x_new
  z <- z_new
  u <- u_new

  df <- rbind(df, list("k" = k,
                       "cpu time k" = cpu_time,
                       "gap" = 0.5*sum((A %*% x - b)^2) + lmd*sum(abs(x)) - opt_value,
                       "method" = "ADMM"))
}
# polish data frame
df$method <- factor(df$method, levels = unique(df$method))   # to preserve the order of appearance (instead of alphabetic)
df <- df |>
  group_by(method) |>
  mutate("CPU time [ms]" = cumsum(`cpu time k`)/1e6) |>
  ungroup()
Comparison of different iterative methods for the \(\ell_2 - \ell_1\)-norm minimization:
Code
library(ggplot2)
library(scales)
# plot
df |>
  ggplot(aes(x = k, y = gap, color = method)) +
  geom_line(linewidth = 1.2) +
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                labels = trans_format("log10", math_format(10^.x))) +
  labs(color = "Method") +
  ggtitle("Optimality gap versus iterations") +
  theme(axis.title.x = element_text(face = "italic"))
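The companion figure, optimality gap versus cumulative CPU time, can be produced from the same data frame; a sketch (the BCD curve may need to be dropped or the horizontal axis limited, since its CPU time dwarfs that of the other methods):

df |>
  ggplot(aes(x = `CPU time [ms]`, y = gap, color = method)) +
  geom_line(linewidth = 1.2) +
  scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
                labels = trans_format("log10", math_format(10^.x))) +
  labs(color = "Method") +
  ggtitle("Optimality gap versus CPU time")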