# basic finance
library(xts) # to manipulate time series of stock data
library(portfolioBacktest) # to conduct backtests
library(pob) # book package with financial data
library(quantmod) # to download market data from the Internet
library(TTR) # generation of signals
# plotting
library(ggplot2) # for nice plots
library(patchwork) # for combining plots
# cointegration
library(egcm)
library(urca)
# modeling
library(MARSS) # state space modeling and KalmanR Code for Portfolio Optimization
Chapter 15 – Pairs Trading Portfolios
Daniel P. Palomar (2025). Portfolio Optimization: Theory and Application. Cambridge University Press.
Last update: March 14, 2025
Loading packages
First load the packages used in the examples:
Introduction
Stationarity
Stationarity refers to the property that the statistics of a time series remain fixed over time. In that sense, a stationary time series can be considered mean-reverting.
Example of a random walk (nonstationary time series with unit root):
Code
# generate synthetic data
T <- 200
set.seed(421)
y <- rep(0, T)
for (i in 2:T)
y[i] <- y[i-1] + rnorm(1)
data.frame("t" = 1:T, "y" = y) |>
ggplot(aes(x = t, y = y)) +
geom_line(linewidth = 1, color = "blue") +
labs(title = "Random walk", y = NULL)Example of a unit-root stationary AR(1) sequence:
Code
# generate synthetic data
T <- 200
set.seed(421)
y <- rep(0, T)
for (i in 2:T)
y[i] <- 0.2*y[i-1] + rnorm(1)
data.frame("t" = 1:T, "y" = y) |>
ggplot(aes(x = t, y = y)) +
geom_line(linewidth = 1, color = "blue") +
labs(title = "AR(1) sequence", y = NULL)Correlation versus cointegration
Cointegration refers to a property by which two (or more) assets, while not being mean-reverting individually, may be mean-reverting with respect to each other. This commonly happens when the series themselves contain stochastic trends (i.e., they are nonstationary) but nevertheless they move closely together over time in a way that their difference remains stable (i.e., stationary). Thus, the concept of cointegration mimics the existence of a long-run equilibrium to which an economic system converges over time.
Correlation is a basic concept in probability that refers to how correlated two random variables are. We can use this measure for stationary time series but definitely not with nonstationary time series. In fact, when we refer to correlation between two financial assets, we are actually employing this concept to the returns of the assets and not the price values.
Thus, both correlation and cointegration attempt to measure the same concept of co-movement of time series, but they do it in a very different way, namely:
Correlation is high when the two time series co-move (they move simultaneously in the same direction) and zero when they move independently.
Cointegration is high when the two time series move together and remain close to each other, and inexistent when they do not stay together.
Example of cointegrated time series with low correlation:
Code
# generate synthetic data
set.seed(421)
T <- 200
gamma <- 1
w <- rnorm(T, sd = 0.1)
x <- cumsum(w)
y1 <- gamma*x + rnorm(T, sd = 0.2)
y2 <- x + rnorm(T, sd = 0.2)
spread <- y1 - y2
cat("Correlation:", cor(diff(y1), diff(y2)))Correlation: 0.03433501
Code
# plots
df <- rbind(data.frame("t" = 1:T,
"series" = "y1",
"value" = y1),
data.frame("t" = 1:T,
"series" = "y2",
"value" = y2),
data.frame("t" = 1:T,
"series" = "spread",
"value" = y1 - y2))
df$series <- factor(df$series, levels = unique(df$series))
p_time_series <- ggplot(df, aes(x = t, y = value, color = series)) +
geom_line(linewidth = 1) +
theme(legend.title=element_blank()) +
labs(title = "Time series", y = NULL)
p_scatter <- data.frame("log-return 1" = diff(y1), "log-return 2" = diff(y2), check.names= FALSE) |>
ggplot(aes(`log-return 1`, `log-return 2`)) +
geom_point() +
xlim(c(-0.8,0.8)) + ylim(c(-0.8,0.8)) +
labs(title = "Scatter plot of differences", x = NULL, y = NULL)
p_time_series + p_scatter + plot_layout(ncol = 2, widths = c(2, 1))Example of non-cointegrated time series with high correlation:
Code
# generate synthetic data
set.seed(421)
T <- 200
gamma <- 1
w <- rnorm(T, sd = 0.3)
x <- cumsum(w)
y1 <- gamma*x + rnorm(T, sd = 0.05) + 0.01*c(1:T - 1)
y2 <- x + rnorm(T, sd = 0.05)
spread <- y1 - y2
#cor(diff(y1), diff(y2)) # 0.9521495
# plots
df <- rbind(data.frame("t" = 1:T,
"series" = "y1",
"value" = y1),
data.frame("t" = 1:T,
"series" = "y2",
"value" = y2),
data.frame("t" = 1:T,
"series" = "spread",
"value" = y1 - y2))
df$series <- factor(df$series, levels = unique(df$series))
p_time_series <- ggplot(df, aes(x = t, y = value, color = series)) +
geom_line(linewidth = 1) +
theme(legend.title=element_blank()) +
labs(title = "Time series", y = NULL)
p_scatter <- data.frame("log-return 1" = diff(y1), "log-return 2" = diff(y2), check.names= FALSE) |>
ggplot(aes(`log-return 1`, `log-return 2`)) +
geom_point() +
xlim(c(-0.8,0.8)) + ylim(c(-0.8,0.8)) +
labs(title = "Scatter plot of differences", x = NULL, y = NULL)
p_time_series + p_scatter + plot_layout(ncol = 2, widths = c(2, 1))Cointegration analysis
Cointegration tests check whether or not a linear combination of the two time series follows a stationary autoregressive model and will be mean-reverting. A time series with a unit root is nonstationary and behaves like a random walk. On the other hand, in the absence of unit roots, a time series tends to revert to its long-term mean. Thus, cointegration tests are typically implemented via unit-root stationarity tests.
Mathematically, we want to determine whether there exists a value of \(\gamma\) such that the spread \[ z_{t} = y_{1t} - \gamma \, y_{2t} \] is stationary. Note that, in practice, the mean of the spread \(\mu\) (equilibrium value) is not necessarily zero and \(\gamma\) does not have to be one.
Cointegrated synthetic data
Code
library(egcm)
# generate synthetic data
set.seed(421)
T <- 200
gamma <- 1
w <- rnorm(T, sd = 0.1)
x <- cumsum(w)
y1 <- gamma*x + rnorm(T, sd = 0.2)
y2 <- x + rnorm(T, sd = 0.2)
res <- egcm(y1, y2)
summary(res)Y[i] = 0.7970 X[i] + 0.1962 + R[i], R[i] = 0.1237 R[i-1] + eps[i], eps ~ N(0, 0.2648^2)
(0.0449) (0.1393) (0.0708)
R[200] = 0.2403 (t = 0.904)
WARNING: X does not seem to be integrated. Y does not seem to be integrated. X and Y do not appear to be cointegrated.
Unit Root Tests of Residuals
Statistic p-value
Augmented Dickey Fuller (ADF) -4.083 0.00805
Phillips-Perron (PP) -200.373 0.00010
Pantula, Gonzales-Farias and Fuller (PGFF) 0.112 0.00010
Elliott, Rothenberg and Stock DF-GLS (ERSD) -4.327 0.00081
Johansen's Trace Test (JOT) -97.506 0.00010
Schmidt and Phillips Rho (SPR) -365.457 0.00010
Variances
SD(diff(X)) = 0.307816
SD(diff(Y)) = 0.264605
SD(diff(residuals)) = 0.354600
SD(residuals) = 0.265758
SD(innovations) = 0.264754
Half life = 0.331653
R[last] = 0.240287 (t=0.90)
Code
# plot residuals
ggplot(data.frame(t = 1:T, value = res$residuals), aes(x = t, y = value)) +
geom_line(linewidth = 1, color = "blue") +
labs(title = "Residual time series", y = NULL)Non-cointegrated synthetic data
Code
library(egcm)
# generate synthetic data
set.seed(421)
T <- 200
gamma <- 1
w <- rnorm(T, sd = 0.3)
x <- cumsum(w)
y1 <- gamma*x + rnorm(T, sd = 0.05)
y2 <- x + rnorm(T, sd = 0.05)
y1 <- y1 + 0.01*c(1:T - 1)
# cointegration tests
res <- egcm(y1, y2)
summary(res)Y[i] = 0.6795 X[i] + 0.1641 + R[i], R[i] = 0.9093 R[i-1] + eps[i], eps ~ N(0, 0.1152^2)
(0.0105) (0.0710) (0.0337)
R[200] = -0.0775 (t = -0.321)
WARNING: X does not seem to be integrated. X and Y do not appear to be cointegrated.
Unit Root Tests of Residuals
Statistic p-value
Augmented Dickey Fuller (ADF) -2.084 0.45287
Phillips-Perron (PP) -20.144 0.06081
Pantula, Gonzales-Farias and Fuller (PGFF) 0.876 0.06999
Elliott, Rothenberg and Stock DF-GLS (ERSD) -2.371 0.07666
Johansen's Trace Test (JOT) -18.421 0.09961
Schmidt and Phillips Rho (SPR) -16.547 0.26708
Variances
SD(diff(X)) = 0.326260
SD(diff(Y)) = 0.308399
SD(diff(residuals)) = 0.118571
SD(residuals) = 0.241868
SD(innovations) = 0.115192
Half life = 7.293170
R[last] = -0.077548 (t=-0.32)
Code
ggplot(data.frame(t = 1:T, value = res$residuals), aes(x = t, y = value)) +
geom_line(linewidth = 1, color = "blue") +
labs(title = "Residual time series", y = NULL)Market data EWA–EWC
Code
library(quantmod)
library(egcm)
pair_names <- c("EWA", "EWC")
# download data
xts_pair <- cbind(Ad(getSymbols(pair_names[1], from = "2016-01-01", to = "2019-12-31",
auto.assign = FALSE)),
Ad(getSymbols(pair_names[2], from = "2016-01-01", to = "2019-12-31",
auto.assign = FALSE)))
colnames(xts_pair) <- pair_names
# cointegration tests
res <- egcm(xts_pair[, 1], xts_pair[, 2], log = FALSE)
summary(res)EWC[i] = 1.2905 EWA[i] + 2.4869 + R[i], R[i] = 0.9638 R[i-1] + eps[i], eps ~ N(0, 0.1422^2)
(0.0099) (0.1757) (0.0086)
R[2019-12-30] = 0.0226 (t = 0.043)
Unit Root Tests of Residuals
Statistic p-value
Augmented Dickey Fuller (ADF) -4.440 0.00486
Phillips-Perron (PP) -38.361 0.00577
Pantula, Gonzales-Farias and Fuller (PGFF) 0.965 0.00615
Elliott, Rothenberg and Stock DF-GLS (ERSD) -1.001 0.53105
Johansen's Trace Test (JOT) -28.609 0.00688
Schmidt and Phillips Rho (SPR) -11.987 0.38398
Variances
SD(diff(EWA)) = 0.157675
SD(diff(EWC)) = 0.193611
SD(diff(residuals)) = 0.143896
SD(residuals) = 0.522172
SD(innovations) = 0.142209
Half life = 18.785181
R[last] = 0.022620 (t=0.04)
Code
# plot residual
xts(res$residuals, index(xts_pair)) |>
fortify(melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "blue") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "Residual time series", x = NULL, y = NULL)Market data KO–PEP
Code
library(quantmod)
library(egcm)
pair_names <- c("KO", "PEP")
# download data (from = "2013-01-01", to = "2022-12-31")
xts_pair <- cbind(Ad(getSymbols(pair_names[1], from = "2017-01-01", to = "2019-12-31",
auto.assign = FALSE)),
Ad(getSymbols(pair_names[2], from = "2017-01-01", to = "2019-12-31",
auto.assign = FALSE)))
colnames(xts_pair) <- pair_names
# cointegration tests
res <- egcm(xts_pair[, 1], xts_pair[, 2], log = FALSE)
summary(res)PEP[i] = 2.4166 KO[i] + 4.7629 + R[i], R[i] = 0.9901 R[i-1] + eps[i], eps ~ N(0, 0.7189^2)
(0.0337) (1.3552) (0.0069)
R[2019-12-30] = -0.5170 (t = -0.136)
WARNING: KO and PEP do not appear to be cointegrated.
Unit Root Tests of Residuals
Statistic p-value
Augmented Dickey Fuller (ADF) -2.518 0.26936
Phillips-Perron (PP) -14.136 0.18506
Pantula, Gonzales-Farias and Fuller (PGFF) 0.981 0.13946
Elliott, Rothenberg and Stock DF-GLS (ERSD) -2.504 0.04857
Johansen's Trace Test (JOT) -10.826 0.56213
Schmidt and Phillips Rho (SPR) -16.657 0.20348
Variances
SD(diff(KO)) = 0.354417
SD(diff(PEP)) = 0.893168
SD(diff(residuals)) = 0.721490
SD(residuals) = 3.788398
SD(innovations) = 0.718891
Half life = 69.616140
R[last] = -0.516977 (t=-0.14)
Code
# plot residual
xts(res$residuals, index(xts_pair)) |>
fortify(melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "blue") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "Residual time series", x = NULL, y = NULL)Market data SPY–IVV–VOO
Now we consider three assets (SPY, IVV, and VOO are S&P 500 ETFs), which require the use of Johansen’s test.
Code
library(quantmod)
library(egcm)
pair_names <- c("SPY", "IVV", "VOO")
# download data
xts_triplet <- cbind(Ad(getSymbols(pair_names[1], from = "2017-01-01", to = "2019-12-31",
auto.assign = FALSE)),
Ad(getSymbols(pair_names[2], from = "2017-01-01", to = "2019-12-31",
auto.assign = FALSE)),
Ad(getSymbols(pair_names[3], from = "2017-01-01", to = "2019-12-31",
auto.assign = FALSE)))
colnames(xts_triplet) <- pair_names
# cointegration test for the triplet
res_urca <- urca::ca.jo(as.matrix(xts_triplet), type = "trace")
residual1_triplet <- xts(xts_triplet %*% res_urca@V[, 1], index(xts_triplet))
residual2_triplet <- xts(xts_triplet %*% res_urca@V[, 2], index(xts_triplet))
# plot residual
p1 <- fortify(residual1_triplet, melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "blue") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "First residual time series", x = NULL, y = NULL)
p2 <- fortify(residual2_triplet, melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "blue") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "Second residual time series", x = NULL, y = NULL)
p1 / p2Trading the spread
To define a strategy to trade the spread, it suffices to determine a rule for the sizing signal \(s_t\). For this purpose, it is convenient to use a normalized version of the spread, called standard score or \(z\)-score: \[ z^\textm{score}_t = \frac{z_t - \E[z_t]}{\sqrt{\textm{Var}(z_t)}}, \] which has zero mean and unit variance. This \(z\)-score cannot be used in a real implementation since it is not causal and suffers from look-ahead bias (when estimating the mean and variance). A naive approach would be to use some training data to determine the mean and standard deviation, and then apply that to the future out-of-sample data. A more sophisticated way is to make the calculation adaptive by implementing it in a rolling fashion, e.g., via the so-called Bollinger Bands.
Bollinger Bands are a technical trading tool created by John Bollinger in the early 1980s. They arose from the need for adaptive trading bands and the observation that volatility was dynamic. They are computed on a rolling window basis over some lookback window. In particular, the rolling mean and rolling standard deviation are first computed, from which the upper and lower bands are easily obtained (typically the mean plus/minus one or two standard deviations). In the context of the \(z\)-score, the spread can be adaptively normalized with the rolling mean and rolling standard deviation.
We now describe two of the simplest possible strategies for trading a spread, namely, the linear strategy and the thresholded strategy.
The linear strategy is very simple to describe based on the contrarian idea of buying low and selling high. As a first attempt, we could define the sizing signal simply as the negative \(z\)-score, \(s_t = -z^\textm{score}_t\), to gradually scale-in and scale-out or, even better, including a scaling factor as \(s_t = -z^\textm{score}_t / s_0\), where \(s_0\) denotes the threshold at which the signal is fully leveraged. In practice, to limit the leverage to 1, we can project this value to lie on the interval \([-1,1]\): \[ s_t = -\left[\frac{z^\textm{score}_t}{s_0}\right]_{-1}^{+1}, \] where \([\cdot]_a^b\) clips the argument to \(a\) if below that value and to \(b\) if above that value.
The thresholded strategy follows similarly the contrarian nature of buying low and selling high, but rather than linear it takes an all-in or all-out sizing based on thresholds. The simplest implementation is based on comparing the \(z\)-score to a threshold \(s_0\): buy when the \(z\)-score is below \(-s_0\) and short-sell when it is above \(s_0\), while unwinding the position after reverting to the equilibrium value of 0. In terms of the sizing signal: \[ s_t = \begin{cases} \begin{aligned} +1 &\qquad \textm{if } z^\textm{score}_t < - s_0\\ 0 &\qquad \textm{after }z^\textm{score}_t\textm{ reverts to }0\\ -1 &\qquad \textm{if } z^\textm{score}_t > + s_0. \end{aligned} \end{cases} \]
Code implementing the thresholded and linear signals from the z-score:
Code
library(TTR)
generate_thresholded_signal <- function(z_score, s0 = 1, start_signal_at = 1) {
T <- NROW(z_score)
signal <- z_score
signal[] <- 0
# initial point
if (z_score[start_signal_at] < -s0) { # buy
signal[start_signal_at] <- 1
} else if (z_score[start_signal_at] > s0) # short-sell
signal[start_signal_at] <- -1
# loop for other points
for (t in (start_signal_at+1):T) {
if (z_score[t] < -s0) # buy
signal[t] <- 1
else if (z_score[t] > s0) # short-sell
signal[t] <- -1
else { # maintain position or close position
if (signal[t-1] == 1 && z_score[t] < 0) # maintain buy position
signal[t] <- signal[t-1]
else if (signal[t-1] == -1 && z_score[t] > 0) # maintain short-sell position
signal[t] <- signal[t-1]
}
}
return(signal)
}
generate_linear_signal <- function(z_score, s0 = 1) {
signal <- -z_score/s0
# clip
signal[signal > +1] <- +1
signal[signal < -1] <- -1
return(signal)
}
generate_BB_linear_signal <- function(spread, s0 = 1, lookback = 20, start_signal_at = lookback) {
# compute Z-score via Bollinger Bands
bb <- BBands(spread, n = lookback, sd = 1)
sdev <- (bb[, "up"] - bb[, "dn"])/2
z_score <- (spread - bb[, "mavg"])/sdev
# linear signal
signal <- -z_score/s0
signal[1:(start_signal_at-1)] <- 0
signal[is.na(signal)] <- 0
# clip signal
signal[signal > +1] <- +1
signal[signal < -1] <- -1
return(list(signal = signal, z_score = z_score))
}
generate_BB_thresholded_signal <- function(spread, entry_zscore = 1, exit_zscore = 0, lookback = 20,
start_signal_at = lookback) {
# compute Z-score via Bollinger Bands
#z_score <- 2*(BBands(spread, n = lookback, sd = 2)[, "pctB"] - 0.5)
bb <- BBands(spread, n = lookback, sd = 1)
sdev <- (bb[, "up"] - bb[, "dn"])/2
z_score <- (spread - bb[, "mavg"])/sdev
# signal
T <- NROW(spread)
signal <- spread
signal[] <- 0
start_signal_at <- max(start_signal_at, lookback)
for (t in start_signal_at:T) {
if (z_score[t] < -entry_zscore) # buy
signal[t] <- 1
else if (z_score[t] > entry_zscore) # short-sell
signal[t] <- -1
else { # maintain position or close position
if (signal[t-1] == 1 && z_score[t] < -exit_zscore) # maintain buy position
signal[t] <- signal[t-1]
else if (signal[t-1] == -1 && z_score[t] > exit_zscore) # maintain short-sell position
signal[t] <- signal[t-1]
}
}
return(list(signal = signal, z_score = z_score))
}Helper function to compute the cumulative P&L from a spread and a signal:
Code
compute_cumPnL_spread_trading <- function(spread, signal, compounded = FALSE) {
if (!all.equal(index(spread), index(signal)))
stop("Temporal indices not aligned in spread and signal.")
if (is.xts(spread))
index_orig <- index(spread)
spread <- as.vector(spread)
signal <- as.vector(signal)
T <- length(spread)
signal_delayed <- c(0, signal[-T])
spread_ret <- c(0, diff(spread))
portf_ret <- signal_delayed*spread_ret
if (compounded)
portf_cumret <- cumprod(1 + portf_ret) - 1
else
portf_cumret <- cumsum(portf_ret)
if (exists("index_orig"))
portf_cumret <- xts(portf_cumret, index_orig)
return(portf_cumret)
}Illustration of pairs trading via the linear strategy on the spread:
Code
# generate synthetic spread (log-prices) as an AR(1)
T <- 200
rho <- 0.7
z_mu <- 0.02 # from real data KO
z_sd <- 0.10 # from real data KO
phi <- (1 - rho)*z_mu
sd_eps <- sqrt(1 - rho^2)*z_sd
set.seed(42)
z <- rep(NA, T)
z[1] <- phi + rnorm(1, sd = sd_eps)
for (t in 2:T)
z[t] <- phi + rho*z[t - 1] + rnorm(1, sd = sd_eps)
# compute Z-score
z_score <- (z - mean(z))/sd(z)
# generate signal
s0 <- 1
signal <- generate_linear_signal(z_score, s0)
# compute P&L
portf_cumret <- compute_cumPnL_spread_trading(z, signal)
# plots
p_zscore <- ggplot(data.frame(t = 1:T, value = z_score), aes(x = t, y = value)) +
geom_line(linewidth = 1, color = "blue") +
geom_hline(yintercept = -s0, color = "black" , linetype = "longdash") +
geom_hline(yintercept = 0, color = "black" , linetype = "solid") +
geom_hline(yintercept = s0, color = "black" , linetype = "longdash") +
labs(title = "Z-score", y = NULL)
p_signal <- ggplot(data.frame(t = 1:T, value = signal), aes(x = t, y = value)) +
geom_line(linewidth = 1, color = "black") +
labs(title = "Signal", y = NULL)
p_cumret <- ggplot(data.frame(t = 1:T, value = portf_cumret), aes(x = t, y = value)) +
geom_line(linewidth = 1, color = "blue") +
labs(title = "Cumulative return", y = NULL)
p_zscore / p_signal / p_cumretIllustration of pairs trading via the thresholded strategy on the spread:
Code
# compute Z-score
z_score <- (z - mean(z))/sd(z)
# generate signal
s0 <- 1
signal <- generate_thresholded_signal(z_score, s0)
# compute P&L
portf_cumret <- compute_cumPnL_spread_trading(z, signal)
# plots
p_zscore <- ggplot(data.frame(t = 1:T, value = z_score), aes(x = t, y = value)) +
geom_line(linewidth = 1, color = "blue") +
geom_hline(yintercept = -s0, color = "black" , linetype = "longdash") +
geom_hline(yintercept = 0, color = "black" , linetype = "solid") +
geom_hline(yintercept = s0, color = "black" , linetype = "longdash") +
labs(title = "Z-score", y = NULL)
p_signal <- ggplot(data.frame(t = 1:T, value = signal), aes(x = t, y = value)) +
geom_line(linewidth = 1, color = "black") +
labs(title = "Signal", y = NULL)
p_cumret <- ggplot(data.frame(t = 1:T, value = portf_cumret), aes(x = t, y = value)) +
geom_line(linewidth = 1, color = "blue") +
labs(title = "Cumulative return", y = NULL)
p_zscore / p_signal / p_cumretMarket data EWA–EWC
Pairs trading on EWA–EWC with 6-month rolling \(z\)-score and 2-year fixed least squares for the estimation of the hedge ratio \(\gamma\):
Code
library(quantmod)
# download data
y1 <- log(Ad(getSymbols("EWA", from = "2013-01-01", to = "2022-12-31", auto.assign = FALSE)))
y2 <- log(Ad(getSymbols("EWC", from = "2013-01-01", to = "2022-12-31", auto.assign = FALSE)))
T <- nrow(y1)
T_trn <- round(2*252)
# hedge ratio via fixed LS
gamma <- lm(y1[1:T_trn] ~ y2[1:T_trn] + 1)$coefficients[2]
# compute spread with leverage = 1
#w <- c(-gamma, 1)/(1 + gamma) # y2 ~ y1
w <- c(1, -gamma)/(1 + gamma) # y1 ~ y2
spread <- xts(cbind(y1, y2) %*% w, index(y1)) # this spread still contains a nonzero mean
spread <- spread - mean(spread[1:T_trn])
# generate sizing signal via thresholded strategy
entry_zscore = 1
res_BB <- generate_BB_thresholded_signal(spread, entry_zscore, lookback = 6*21, start_signal_at = T_trn+1)
z_score <- res_BB$z_score
signal <- res_BB$signal
# compute P&L
portf_cumret <- compute_cumPnL_spread_trading(spread, signal)
#
# plots
#
p_spread <- fortify(spread, melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "blue") +
geom_vline(xintercept = index(portf_cumret)[T_trn], color = "black" , linetype = 5) +
annotate("text", x = index(spread)[T_trn], y = max(spread), hjust = "outward", label= "training ") +
annotate("text", x = index(spread)[T_trn], y = max(spread), hjust = "inward", label= " out-of-sample") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "Spread", x = NULL, y = NULL)
p_zscore <- fortify(z_score, melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "blue") +
geom_hline(yintercept = -entry_zscore, color = "black" , linetype = 5) +
geom_hline(yintercept = 0, color = "black" , linetype = 1) +
geom_hline(yintercept = entry_zscore, color = "black" , linetype = 5) +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "Z-score", x = NULL, y = NULL)
p_signal <- fortify(signal, melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "black") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "Signal", x = NULL, y = NULL)
p_cumret <- fortify(portf_cumret, melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "blue") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "Cumulative return", x = NULL, y = NULL)
p_spread / p_zscore / p_signal / p_cumretWe can observe that the spread does not show a strong persistent cointegration relationship over the whole period; in practice, the hedge ratio should be adapted over time. The \(z\)-score is able to produce a more constant mean-reverting version due to the rolling window adaptation. In any case, any practical implementation of pairs trading would re-compute the hedge ratio over time, as it is assumed in the rest of the experiments via a rolling least squares.
We will now implement a rolling least squares fitting:
Code
fit_rollingLS <- function(y1, y2, lookback = 2*252, every = 50) {
T <- NROW(y1)
# create update points
t_update <- seq(from = lookback, to = T, by = every)
if (last(t_update) < T)
t_update <- c(t_update, T)
# LS loop
gamma <- mu <- y1
gamma[] <- mu[] <- NA
for (t in t_update) {
res_LS <- lm(y1[(t-lookback+1):t] ~ y2[(t-lookback+1):t] + 1)
gamma[t] <- res_LS$coefficients[2]
mu[t] <- res_LS$coefficients[1]
}
gamma <- na.locf(gamma)
mu <- na.locf(mu)
# for make-up purposes (I will not trade during the first lookback samples anyway)
gamma <- na.locf(gamma, fromLast = TRUE)
mu <- na.locf(mu, fromLast = TRUE)
return(list(gamma = gamma, mu = mu))
}Pairs trading on EWA–EWC with 6-month rolling \(z\)-score and 2-year rolling least squares for the estimation of the hedge ratio \(\gamma\):
Code
# hedge ratio via rolling LS
rollingLS <- fit_rollingLS(y1, y2, lookback = T_trn, every = 1)
# compute spread with leverage=1
rollingLS$w <- cbind(1, -rollingLS$gamma)/as.numeric(1 + rollingLS$gamma) #w <- lag(w)
rollingLS$spread <- xts(rowSums(cbind(y1, y2) * rollingLS$w) - rollingLS$mu/(1 + rollingLS$gamma), index(y1))
# generate sizing signal via thresholded strategy
entry_zscore = 1
rollingLS <- c(rollingLS, generate_BB_thresholded_signal(rollingLS$spread,
entry_zscore, lookback = 6*21, start_signal_at = T_trn+1))
# compute P&L
rollingLS$portf_cumret <- compute_cumPnL_spread_trading(rollingLS$spread, rollingLS$signal)
#
# plots
#
p_spread <- fortify(rollingLS$spread, melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "blue") +
geom_vline(xintercept = index(rollingLS$portf_cumret)[T_trn], color = "black" , linetype = 5) +
annotate("text", x = index(rollingLS$spread)[T_trn], y = max(rollingLS$spread), hjust = "outward", label= "training ") +
annotate("text", x = index(rollingLS$spread)[T_trn], y = max(rollingLS$spread), hjust = "inward", label= " out-of-sample") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "Spread", x = NULL, y = NULL)
p_zscore <- fortify(rollingLS$z_score, melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "blue") +
geom_hline(yintercept = -entry_zscore, color = "black" , linetype = 5) +
geom_hline(yintercept = 0, color = "black" , linetype = 1) +
geom_hline(yintercept = entry_zscore, color = "black" , linetype = 5) +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "Z-score", x = NULL, y = NULL)
p_signal <- fortify(rollingLS$signal, melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "black") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "Signal", x = NULL, y = NULL)
p_cumret <- fortify(rollingLS$portf_cumret, melt = TRUE) |>
ggplot(aes(x = Index, y = Value)) +
geom_line(linewidth = 0.8, color = "blue") +
scale_x_date(date_breaks = "1 year", date_labels = "%Y", date_minor_breaks = "1 month") +
labs(title = "Cumulative return", x = NULL, y = NULL)
p_spread / p_zscore / p_signal / p_cumret