This notebook presents the R code used for the paper “Characteristics-driven
returns in equilibrium”.
Only the strict minimum of material is provided.
The dataset, with anonymized firm identifiers, is
available here.
The R Markdown notebook that can be run directly in RStudio is
accessible here.
Some results towards the end were generated with additional
notebooks, hence they require external output.
The non-baseline notebooks (neural networks, macro-data, sparse models)
are available upo request.
First, we load packages & import the data.
rstudioapi::writeRStudioPreference("data_viewer_max_columns", 200L) # Allows to view up to 200 columns
## NULL
library(tidyverse) # Data wrangling package
library(readxl) # Read excel files
library(data.table) # Fast wrangling!
library(fst) # Fast reading & writing of dataframes
library(dtplyr) # Combining dplyr & data.table
library(lubridate) # Date management
library(ggallin) # Addin to ggplot for negative log scales
library(viridis) # Cool color palette
library(patchwork) # To layout graphs
library(latex2exp) # LaTeX in graphs
library(RcppEigen) # For fast linear models
library(reshape2) # List wrangling
library(Matrix) # High speed matrix manipulation
library(fastDummies) # For fixed effects
data_demand <- read_fst("data/data_demand.fst") # Loading main dataset
#data_demand <- read_fst("data/demand_macro.fst") # Loading main dataset
specs <- read_excel("data/ML_supp.xlsx") # Loading additional info
na_0 <- function(x){ # Function that deals with missing points
x[is.na(x)] <- 0 # Values are in [-0.5,0.5], NAs => 0
return(x)
}
data_demand <- data_demand %>% # The above function is applied to the data
mutate(across(3:ncol(data_demand), na_0))
We create useful variables, especially predictor names used in models
later on.
The baseline estimates are based on the characteristics that are
released monthly (there are 20 of them).
The original dataset has strong variations in the number of well-defined
fields before 1983.
Thus, we truncate the sample before this date.
data_demand <- data_demand %>% filter(Date > "1984-06-01") # Stability filter
data_demand <- data_demand %>% arrange(Date, permno) # Order
dates <- data_demand$Date %>% unique() %>% sort() # Vector of dates
features <- specs %>% # Predictors / firm characteristics
# filter(Frequency == "Monthly") %>%
pull(Acronym)
#features <- c("mvel1", "bm", "mom12m") # Characteristics used in panel models
D_features <- paste0("D_", features) # Change in predictors
varz <- c(features, D_features) # List of all independent variables
#varz <- colnames(data_demand)[4:ncol(data_demand)]
Below, we create the objects in which we will store some
results/outputs.
Also, we fix the sample size used throughout the notebook!
5 years means 60 monthly points. If there are roughly 5,000 firms in one
sample, this makes 300k observations for one estimation.
We recall that there are 20 characteristics + 20 changes thereof, which
makes 40 predictors.
sample <- 24 # Sample size (in months)
frequency <- 12 # How often we estimate => every X months
dates_oos <- dates[sample:length(dates)] # Out-of-sample dates
res_pool <- list() # Initialize results
res_fixed <- list() # Initialize results
R2_pool <- 0
R2_fixed <- 0
cor_series <- array(0, dim = c((length(dates_oos)-1), # Initialize corr matrices, nb dates
length(varz) + 1,
length(varz) + 1))
cov_series <- array(0, dim = c((length(dates_oos)-1), # Initialize cov matrices, nb dates
length(varz) + 1,
length(varz) + 1))
f_e <- c() # Initialize fixed effects
avg_ret_fac <- matrix(0, nrow = length(dates_oos)-1, # Initialize average returns
ncol = 3) # We focus on 3 factors
avg_fe_fac <- matrix(0, nrow = length(dates_oos)-1, # Initialize average fixed effects
ncol = 3 )
We rapidly source the custom functions we use below.
source("func.R")
The loop below perform 4 tasks:
1. perform the pooled estimations and store the corresponding estimates
& metrics (e.g., \(R^2\)).
2. compute the correlations and covariances between the characteristics
and their changes.
3. estimate the model with fixed effects & store the values.
4. compute the average return and average fixed effect of sorted
portfolios on the rolling samples.
The loop is on the dates.
We could make do without a loop via the pmap()
function.
But the complexity of the output is a bit deterring (this would generate
a complex list).
se_fe <- F
j <- 1
dates_used <- c()
avg_ret <- c()
features_sort <- c("mvel1", "bm", "mom12m")
tictoc::tic()
for(i in 1:(length(dates_oos)-2)){
if(i %% 60 == 1){print(dates_oos[i])} # Keeps track of progress
data_train <- data_prep(data_demand, dates[i], dates_oos[i]) # Sample at time t
data_test <- data_prep(data_demand, dates_oos[i+1], dates_oos[i+1]) # Test set: t+1
data_test_0 <- data_train %>% filter(Date == dates_oos[i]) # Test set: t
avg_ret <- bind_rows(avg_ret, # This stores the average returns in the sample
data_train |> group_by(permno) |>
summarise(avg_ret = mean(Return, na.rm = T)) |>
mutate(date = dates_oos[i]))
y <- data_train$Return
x <- data_train %>% select(all_of(varz)) %>% data.matrix()
if(i %% frequency == 1){
if(se_fe == T){
d <- cbind(x, dummy_cols(data_train$permno)[,-1]) %>% data.matrix() %>% Matrix(sparse = T) # For fixed effects
d <- t(d) %*% d
nn <- ncol(x)
NN <- ncol(d)
d_inv <- solve(d)
d_iag <- diag(d_inv)[(nn+1):NN]
}
if(se_fe == F){ d_iag <- NA}
dates_used <- c(dates_used, as.character(dates_oos[i]))
# The code below is for the pooled OLS estimations
permnos <- data_train$permno %>% unique() %>% sort() # Keep model names
pool_temp <- fit_lin(x = x, y = y, data_train$permno, permnos, as.character(dates_oos[i]), d_iag, "pooled")
res_pool[[j]] <- pool_temp %>% mutate(sample = sample) # Storing estimation results
}
test_set <- intersect(data_test$permno, data_test_0$permno) %>% # Common stocks for test
intersect(permnos)
pred_temp <- pred_lin(res = pool_temp, # OOS prediction
x_new = data_test_0 %>%
filter(permno %in% test_set) %>%
select(all_of(varz)),
permno = test_set,
type = "pooled")
target <- data_test %>% filter(permno %in% test_set) %>% pull(Return) # OOS realization
error <- pred_temp - target
R2_pool[i] <- 1 - var(error) / var(target)
## Next we compute the correlations & covariances
## cor_series[i,,] <- cor(data_train %>% select(-permno, -Date), use = "pairwise.complete.obs") # Store the correlation matrix
## cov_series[i,,] <- cov(data_train %>% select(-permno, -Date), use = "pairwise.complete.obs") # Store the covariance matrix
if(i %% frequency == 1){
# The code below pertains to the panel model
fixed_temp <- fit_lin(x = x, y = y, data_train$permno, permnos, as.character(dates_oos[i]), d_iag, "fixed") # Within estimation
res_fixed[[j]] <- fixed_temp %>% mutate(sample = sample)
# Finally, the average returns & average fixed-effects of the sorted portfolios
for(k in 1:length(features_sort)){
avg_ret_fac[i,k] <- sorted_metric(data_train, features_sort[k], "Return") # Store the factor average return
avg_fe_fac[i,k] <- sorted_metric(data = fixed_temp |> filter(field == "fe") |> # Store the factor fixed effects
# bind_cols(f_e = fixef(fit_fe) %>% as.numeric(),
# permno = unique(data_train$permno) %>% sort()) %>%
right_join(data_train |> select(permno, Date, mvel1, mom12m, bm),
by = "permno"),
var = features_sort[k],
metric = "value")
}
j <- j + 1
}
pred_temp <- pred_lin(res = fixed_temp, # OOS prediction
x_new = data_test_0 %>%
filter(permno %in% test_set) %>%
select(all_of(varz)),
permnos = test_set,
type = "fixed")
error <- pred_temp - target # OOS error
R2_fixed[i] <- 1 - var(error) / var(target) # OOS R2
}
## [1] "1986-05-30"
## [1] "1991-05-31"
## [1] "1996-05-31"
## [1] "2001-05-31"
## [1] "2006-05-31"
## [1] "2011-05-31"
## [1] "2016-05-31"
## [1] "2021-05-28"
tictoc::toc()
## 758.912 sec elapsed
save(avg_ret, file = paste0("avg_ret_", sample, ".RData"))
save(avg_ret_fac, file = paste0("avg_ret_fac_", sample, ".RData"))
save(avg_fe_fac, file = paste0("avg_fe_fac_", sample, ".RData"))
Below, we wrangle the output, for estimation outputs (estimates, t-stats, p-values) and \(R^2\) separately.
res_pool <- bind_rows(res_pool)
res_fixed <- bind_rows(res_fixed)
Saving important results below for future use.
model <- "linear"
spec <- "simple"
res_full <-
bind_rows(
res_pool %>% mutate(type = "pooled",
model = model,
spec = spec,
date = as.Date(date)),
res_fixed %>% mutate(type = "fixed",
model = model,
spec = spec,
date = as.Date(date)),
data.frame(value = R2_pool,
factor = NA,
field = "R2_oos",
date = dates_oos[1:(length(dates_oos)-2)],
permno = NA,
sample = sample,
type = "pooled",
model = model,
spec = spec),
data.frame(value = R2_fixed,
factor = NA,
field = "R2_oos",
date = dates_oos[1:(length(dates_oos)-2)],
permno = NA,
sample = sample,
type = "fixed",
model = model,
spec = spec)
)
save(res_full, file = paste0("res_full_linear_",sample,".RData"))
g1 <- res_full %>%
filter(field == "t_stat",
factor %in% c("mom12m", "mvel1", "bm"),
type == "pooled") %>%
ggplot(aes(x = date, y = value, color = factor)) + geom_line() + theme_bw() +
geom_hline(yintercept = 2.5, linetype = 2) + # Upper 1% threshold
geom_hline(yintercept = -2.5, linetype = 2) + # Lower 1% threshold
theme(legend.position = c(0.75, 1.2), # Legend position
text = element_text(size = 12), # Overall text size
plot.title = element_text(face = "bold"), # Bold title
legend.title = element_text(face = "bold", size = 9), # Title font size for the legend
plot.subtitle = element_text(size = 10, face = "italic"), # Italic subtitle
axis.title.x = element_blank(), # No x-axis title (obvious)
axis.title.y = element_blank(), # No y-axis title (=title)
panel.grid.major = element_line(size = 0.3, linetype = 'solid', colour = "#EEEEEE"), # Grid adjustment
panel.grid.minor = element_line(size = 0.1, linetype = 'solid', colour = "#FFFFFF")) + # Grid adjustment
geom_line(alpha = 0.6, size = 0.6) + labs(subtitle = "Pooled estimation t-statistic") + # Lines + subtitle
scale_color_manual(values = c("#D5280C", "#FC9203", "#219B96"),
labels = c("value", "mom", "size"),
name = "Characteristic") +
ggtitle(TeX("Estimates for $\\textbf{c}_t^{(k)}$ (2 year samples)")) +
guides(colour = guide_legend(nrow = 1))
g2 <- res_full %>%
filter(field == "t_stat",
factor %in% c("mom12m", "mvel1", "bm"),
type == "fixed") %>%
ggplot(aes(x = date, y = value, color = factor)) + geom_line() + theme_bw() +
geom_hline(yintercept = 2.5, linetype = 2) + # Upper 1% threshold
geom_hline(yintercept = -2.5, linetype = 2) + # Lower 1% threshold
theme(legend.position = c(0.75, 1.2), # Legend position
text = element_text(size = 12), # Overall text size
plot.title = element_text(face = "bold"), # Bold title
legend.title = element_text(face = "bold", size = 9), # Title font size for the legend
plot.subtitle = element_text(size = 10, face = "italic"), # Italic subtitle
axis.title.x = element_blank(), # No x-axis title (obvious)
axis.title.y = element_blank(), # No y-axis title (=title)
panel.grid.major = element_line(size = 0.3, linetype = 'solid', colour = "#EEEEEE"), # Grid adjustment
panel.grid.minor = element_line(size = 0.1, linetype = 'solid', colour = "#FFFFFF")) + # Grid adjustment
geom_line(alpha = 0.6, size = 0.6) +
labs(subtitle = "Within estimation t-statistic (with fixed effects)") + # Lines + subtitle
scale_color_manual(values = c("#D5280C", "#FC9203", "#219B96"),
labels = c("value", "mom", "size"),
name = "Characteristic") +
ggtitle(TeX("Estimates for $\\textbf{c}_t^{(k)}$ (2 year samples)")) +
guides(colour = guide_legend(nrow = 1))
g1/g2
#ggsave("t_stats_1y.pdf")
g3 <- res_full %>%
filter(field == "t_stat",
factor %in% c("D_mom12m", "D_mvel1", "D_bm"),
type == "pooled") %>%
ggplot(aes(x = date, y = value, color = factor)) + geom_line() + theme_bw() +
geom_hline(yintercept = 2.5, linetype = 2) + # Upper 1% threshold
geom_hline(yintercept = -2.5, linetype = 2) + # Lower 1% threshold
theme(legend.position = c(0.75, 1.2), # Legend position
text = element_text(size = 12), # Overall text size
plot.title = element_text(face = "bold"), # Bold title
legend.title = element_text(face = "bold", size = 9), # Title font size for the legend
plot.subtitle = element_text(size = 10, face = "italic"), # Italic subtitle
axis.title.x = element_blank(), # No x-axis title (obvious)
axis.title.y = element_blank(), # No y-axis title (=title)
panel.grid.major = element_line(size = 0.3, linetype = 'solid', colour = "#EEEEEE"), # Grid adjustment
panel.grid.minor = element_line(size = 0.1, linetype = 'solid', colour = "#FFFFFF")) + # Grid adjustment
geom_line(alpha = 0.6, size = 0.6) + labs(subtitle = "Pooled estimation t-statistic") + # Lines + subtitle
scale_color_manual(values = c("#D5280C", "#FC9203", "#219B96"),
labels = c("value", "mom", "size"),
name = "Characteristic") +
ggtitle(TeX("Estimates for $\\Delta \\textbf{c}_t^{(k)}$ (2 year samples)")) +
guides(colour = guide_legend(nrow = 1))
g4 <- res_full %>%
filter(field == "t_stat",
factor %in% c("D_mom12m", "D_mvel1", "D_bm"),
type == "fixed") %>%
ggplot(aes(x = date, y = value, color = factor)) + geom_line() + theme_bw() +
geom_hline(yintercept = 2.5, linetype = 2) + # Upper 1% threshold
geom_hline(yintercept = -2.5, linetype = 2) + # Lower 1% threshold
theme(legend.position = "none", # Legend position
text = element_text(size = 12), # Overall text size
plot.title = element_text(face = "bold"), # Bold title
legend.title = element_text(face = "bold", size = 9), # Title font size for the legend
plot.subtitle = element_text(size = 10, face = "italic"), # Italic subtitle
axis.title.x = element_blank(), # No x-axis title (obvious)
axis.title.y = element_blank(), # No y-axis title (=title)
panel.grid.major = element_line(size = 0.3, linetype = 'solid', colour = "#EEEEEE"), # Grid adjustment
panel.grid.minor = element_line(size = 0.1, linetype = 'solid', colour = "#FFFFFF")) + # Grid adjustment
geom_line(alpha = 0.6, size = 0.6) +
#labs(subtitle = "Within estimation t-statistic (with fixed effects)") + # Lines + subtitle
scale_color_manual(values = c("#D5280C", "#FC9203", "#219B96"),
labels = c("value", "mom", "size"),
name = "Characteristic") +
ggtitle(TeX("Estimates for $\\Delta \\textbf{c}_t^{(k)}$ (2 year samples)")) +
guides(colour = guide_legend(nrow = 1))
g2/g4
#ggsave("t_stats_2y.pdf", width = 8, height = 5)
load("res_full_linear_12.RData")
res_12 <- res_full
load("res_full_linear_24.RData")
res_24 <- res_full
load("res_full_linear_120.RData")
res_120 <- res_full
load("res_full_linear_60.RData")
res_full <- res_full |> bind_rows(res_24) |> bind_rows(res_12) |> bind_rows(res_120) |>
mutate(sample = paste(sample, "months"))
load("avg_ret_12.RData")
avg_12 <- avg_ret |> mutate(sample = "12 months")
load("avg_ret_120.RData")
avg_120 <- avg_ret |> mutate(sample = "120 months")
load("avg_ret_24.RData")
avg_24 <- avg_ret |> mutate(sample = "24 months")
load("avg_ret_60.RData")
avg_ret <- avg_ret |>
mutate(sample = "60 months") |> bind_rows(avg_12) |> bind_rows(avg_24) |> bind_rows(avg_120)
Dispersion in mean returns
avg_ret |>
mutate(sample = sample |> recode_factor(`12 months` = "12 months",
`24 months` = "24 months",
`60 months` = "60 months",
`120 months` = "120 months",
.ordered = T)) |>
group_by(date, sample) |>
summarise(sd = sd(avg_ret)) |>
ggplot(aes(x = date, y = sd, color = sample)) + geom_line() +
theme_bw() +
scale_color_manual(values = c("#FFC300", "#FF5733", "#C70039", "4ABCDE"), name = "sample \n size")
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
R-squared
res_full |>
filter(field == "R2_is", type == "fixed") |>
mutate(sample = sample |> recode_factor(`12 months` = "12 months",
`24 months` = "24 months",
`60 months` = "60 months",
`120 months` = "120 months",
.ordered = T)) |>
ggplot(aes(x = date, y = value, color = sample)) + geom_line() + geom_point(size = 0.5) +
theme_bw() +
theme(axis.title = element_blank(),
legend.position = "bottom",
plot.title = element_text(vjust = -10, hjust = 0.05, face = "bold"),
legend.title = element_text(face = "bold")) +
scale_color_manual(values = c("#FFC300", "#FF5733", "#C70039", "4ABCDE"), name = "sample \n size") +
ggtitle(TeX("In-sample $R^2$"))
ggsave("R2.pdf", width = 5, height = 5)
Fixed effects
res_full %>%
filter(field == "fe", type == "fixed") %>%
mutate(sample = sample |> recode_factor(`12 months` = "12 months",
`24 months` = "24 months",
`60 months` = "60 months",
`120 months` = "120 months",
.ordered = T)) |>
group_by(date, sample, model) %>%
summarize(avg_fe = mean(value),
fe_90 = quantile(value, 0.9),
fe_10 = quantile(value, 0.1),
fe_med = median(value),
fe_25 = quantile(value, 0.25),
fe_75 = quantile(value, 0.75)) |>
pivot_longer(avg_fe:fe_75, names_to = "series", values_to = "value") |>
filter(series %in% c("fe_10", "fe_90")) |>
ggplot(aes(x = date, y = value, color = sample, shape = as.factor(series))) + geom_line() + geom_point() +
theme_bw() + ylim(c(-0.8,0.8)) +
theme(axis.title = element_blank(),
legend.box = "vertical",
plot.title = element_text(vjust = -8, hjust = 0.05, face = "bold", margin = margin()),
legend.margin = margin(),
legend.position = "bottom",
legend.title = element_text(face = "bold")) +
scale_color_manual(values = c("#FFC300", "#FF5733", "#C70039", "4ABCDE"), name = "sample \n size") +
scale_shape_manual(values = c(1,2), name = "quantile", labels = c("10% (lower)", "90% (upper)")) +
ggtitle(TeX("Extreme deciles of fixed-effects"))
## `summarise()` has grouped output by 'date', 'sample'. You can override using
## the `.groups` argument.
## Warning: Removed 1 rows containing missing values (geom_point).
ggsave("fe.pdf", width = 5, height = 5)
## Warning: Removed 1 rows containing missing values (geom_point).
A decomposition of dispersion of mean returns.
res_full |>
filter(field == "fe", type == "fixed") |>
left_join(avg_ret,
by = c("date", "sample", "permno")) |>
mutate(model = avg_ret - value,
sample = sample |> recode_factor(`12 months` = "12 months",
`24 months` = "24 months",
`60 months` = "60 months",
`120 months` = "120 months",
.ordered = T)) |>
group_by(date, sample) |>
summarise(
var = var(avg_ret),
cov_ret_fe = cov(avg_ret, value, use = "complete.obs"),
cov_ret_model = cov(avg_ret, model, use = "complete.obs")
) |>
pivot_longer(var:cov_ret_model, values_to = "values", names_to = "series") |>
ggplot(aes(x = date, y = values, color = series)) + geom_line() +
facet_grid(sample ~., scales = "free") +
theme_bw() + geom_hline(yintercept = 0, linetype = 2) +
scale_color_manual(values = c("#EE1111", "#11AADD", "#EE9911"),
label = c("cov(avg returns, fixed effects)", "cov(avg returns, model)", "var(avg returns)"),
name = "series") +
theme(axis.title = element_blank(),
strip.text = element_text(face = "bold"),
legend.position = "bottom",
strip.background = element_rect(fill = "#EEEEEE"),
legend.title = element_text(face = "bold"))
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
ggsave("decompo_var.pdf", width = 8, height = 5)
res_full |>
filter(field == "fe", type == "fixed", sample %in% c("12 months", "24 months", "60 months")) |>
group_by(date, sample) |>
summarise(se_fe = sd(value, na.rm = T)) |>
left_join(avg_ret |>
group_by(date, sample) |>
summarise(se_ret = sd(avg_ret, na.rm = T)),
by = c("date", "sample")) |>
pivot_longer(se_fe:se_ret, names_to = "series", values_to = "value") |>
ggplot(aes(x = date, y = value, color = as.factor(sample))) + theme_bw() +
geom_line(aes(linetype = series)) + geom_point(aes(shape = series)) +
scale_color_manual(values = c("#000000", "#1188DD", "#EE9911"), label = c("12 months", "24 months", "60 months"), name = "sample size") +
scale_shape_manual(values = c(1,2), labels = c("fixed effects", "avg. returns"), name = "std. dev. of:") +
scale_linetype_manual(values = c(1,2), labels = c("fixed effects", "avg. returns"), name = "std. dev. of:") +
theme(axis.title = element_blank(),
legend.position = c(0.87,0.74),
legend.title = element_text(face = "bold")) #+ scale_y_log10()
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
This section (and the remainder of the notebook) is based on external data.
load("res_full_linear_24_3vars.RData")
res_3 <- res_full
load("res_full_linear_macro_24.RData")
res_full <- bind_rows(res_full, res_3)
res_full <- res_full |> mutate(sample = paste(sample, "months"))
res_full |>
filter(field == "fe", type == "fixed", date > "1986-04-14") |>
left_join(avg_ret |> mutate(date = as.Date(date)),
by = c("date", "permno")) |>
mutate(model = avg_ret - value,
spec = spec |> recode_factor(simple = "simple (3+3)",
macro = "macro (376+376)",
.ordered = T)) |>
group_by(date, spec) |>
summarise(
var = var(avg_ret),
cov_ret_fe = cov(avg_ret, value, use = "complete.obs"),
cov_ret_model = cov(avg_ret, model, use = "complete.obs")
) |>
pivot_longer(var:cov_ret_model, values_to = "values", names_to = "series") |>
ggplot(aes(x = date, y = values, color = series)) + geom_line() +
facet_grid(spec ~., scales = "free") +
theme_bw() + geom_hline(yintercept = 0, linetype = 2) +
scale_color_manual(values = c("#EE1111", "#11AADD", "#EE9911"),
label = c("cov(avg returns, fixed effects)", "cov(avg returns, model)", "var(avg returns)"),
name = "series") +
theme(axis.title = element_blank(),
strip.text = element_text(face = "bold"),
legend.position = "bottom",
strip.background = element_rect(fill = "#EEEEEE"),
legend.title = element_text(face = "bold"))
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
ggsave("decompo_alt.pdf", width = 8, height = 4)
This is also based on external data.
load("res_nn.RData")
nn <- res_nn |> mutate(model = "baseline")
# load("avg_ret_24.RData")
nn |> group_by(date) |>
mutate(model = avg_ret - f_e) |>
summarise(
var = var(avg_ret),
cov_ret_fe = cov(avg_ret, f_e, use = "complete.obs"),
cov_ret_model = cov(avg_ret, model, use = "complete.obs")
) |>
pivot_longer(var:cov_ret_model, values_to = "values", names_to = "series") |>
ggplot(aes(x = date, y = values, color = series)) + geom_line() + theme_bw() +
scale_color_manual(values = c("#EE1111", "#11AADD", "#EE9911", "#000000"),
label = c("cov(avg retrurns, fixed effects)", "cov(avg returns, mode)", "var(avg returns)"),
name = "series") +
theme(axis.title = element_blank(),
strip.text = element_text(face = "bold"),
legend.position = "right",
strip.background = element_rect(fill = "#EEEEEE"),
legend.title = element_text(face = "bold"))
load("res_nn_2.RData")
nn <- bind_rows(nn, res_nn |> mutate(model = "sophisticated"))
nn |>
mutate(model_val = avg_ret - f_e) |>
group_by(date, model) |>
summarise(
var = var(avg_ret),
cov_ret_fe = cov(avg_ret, f_e, use = "complete.obs"),
cov_ret_model = cov(avg_ret, model_val, use = "complete.obs")
) |>
pivot_longer(var:cov_ret_model, values_to = "values", names_to = "series") |>
ggplot(aes(x = date, y = values, color = series, linetype = model)) + geom_line() + theme_bw() +
scale_color_manual(values = c("#EE1111", "#11AADD", "#EE9911", "#000000"),
label = c("cov(avg retrurns, fixed effects)", "cov(avg returns, model)", "var(avg returns)"),
name = "series") +
theme(axis.title = element_blank(),
strip.text = element_text(face = "bold"),
legend.position = "right",
strip.background = element_rect(fill = "#EEEEEE"),
legend.title = element_text(face = "bold"))
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.
ggsave("decompo_nn.pdf", width = 8, height = 4)
load("res_lasso_0_0003.RData")
load("avg_ret_24.RData")
dates_used <- unique(res_nn$date)
prop_char <- 0
prop_fe <- 0
lasso_res <- c()
for(k in 1:length(res_lasso)){
z_temp <- res_lasso[[k]]
prop_char[k] <- mean(z_temp$beta[1:94] != 0)
prop_fe[k] <- mean(z_temp$beta[95:length(z_temp$beta)] != 0)
lasso_res <- bind_rows(lasso_res,
data.frame(date = dates_used[k],
f_e = z_temp$beta[95:length(z_temp$beta)],
permno = rownames(z_temp$beta)[95:length(z_temp$beta)]))
}
lasso_res <- lasso_res |> mutate(permno = permno |> str_replace_all("permno_", "") |> as.numeric())
prop <- data.frame(date = dates_used,
char = prop_char,
fe = prop_fe) |> pivot_longer(-date, names_to = "type", values_to = "value")
g1 <- lasso_res |>
filter(date > "1986-04-14") |>
left_join(avg_ret, by = c("date", "permno")) |>
group_by(date) |>
mutate(model = avg_ret - f_e) |>
summarise(
var = var(avg_ret),
cov_ret_fe = cov(avg_ret, f_e, use = "complete.obs"),
cov_ret_model = cov(avg_ret, model, use = "complete.obs")
) |>
pivot_longer(var:cov_ret_model, names_to = "series", values_to = "values") |>
ggplot(aes(x = date, y = values, color = series)) + geom_line() + theme_bw() +
scale_color_manual(values = c("#EE1111", "#11AADD", "#EE9911", "#000000"),
label = c("cov(avg retrurns, \n \ \ \ \ \ fixed effects)", "cov(avg returns, model)", "var(avg returns)"),
name = "series") +
ylab("variance / covariance") +
ggtitle(label = "Low penalization",
subtitle = TeX("($\\lambda=$0.0003)")) +
theme(axis.title.x = element_blank(),
strip.text = element_text(face = "bold"),
legend.position = c(0.74,0.72),
plot.title = element_text(face = "bold", margin = margin(0,0,-0.5,0, "cm")),
plot.subtitle = element_text(color = "#333333", hjust = 0.55, vjust = 0.5),
legend.background = element_rect(color = "black", size = 0.2),
axis.title.y = element_text(face = "bold"),
strip.background = element_rect(fill = "#EEEEEE"),
legend.title = element_text(face = "bold"))
g2 <- prop |> ggplot(aes(x = date, y = value, color = type)) + geom_line() +
theme_bw() + ylab("proportion of nonzero coefficients") +
theme(axis.title.x = element_blank(),
legend.position = c(0.83, 0.98),
legend.background = element_rect(color = "black", size = 0.2),
legend.title = element_text(face = "bold"),
axis.title.y = element_text(face = "bold")) +
scale_color_manual(values = c("#000000", "#AAAAAA"),
labels = c("characteristics", "fixed effects"),
name = "coef type")
g1+g2 + plot_layout(widths = c(13, 10))
ggsave("decompo_lasso.pdf", width = 8, height = 3.3)
load("res_lasso_0_001.RData")
load("avg_ret_24.RData")
dates_used <- unique(res_nn$date)
prop_char <- 0
prop_fe <- 0
lasso_res <- c()
for(k in 1:length(res_lasso)){
z_temp <- res_lasso[[k]]
prop_char[k] <- mean(z_temp$beta[1:94] != 0)
prop_fe[k] <- mean(z_temp$beta[95:length(z_temp$beta)] != 0)
lasso_res <- bind_rows(lasso_res,
data.frame(date = dates_used[k],
f_e = z_temp$beta[95:length(z_temp$beta)],
permno = rownames(z_temp$beta)[95:length(z_temp$beta)]))
}
lasso_res <- lasso_res |> mutate(permno = permno |> str_replace_all("permno_", "") |> as.numeric())
prop <- data.frame(date = dates_used,
char = prop_char,
fe = prop_fe) |> pivot_longer(-date, names_to = "type", values_to = "value")
g1 <- lasso_res |>
filter(date > "1986-04-14") |>
left_join(avg_ret, by = c("date", "permno")) |>
group_by(date) |>
mutate(model = avg_ret - f_e) |>
summarise(
var = var(avg_ret),
cov_ret_fe = cov(avg_ret, f_e, use = "complete.obs"),
cov_ret_model = cov(avg_ret, model, use = "complete.obs")
) |>
pivot_longer(var:cov_ret_model, names_to = "series", values_to = "values") |>
ggplot(aes(x = date, y = values, color = series)) + geom_line() + theme_bw() +
scale_color_manual(values = c("#EE1111", "#11AADD", "#EE9911", "#000000"),
label = c("cov(avg retrurns, \n \ \ \ \ \ fixed effects)", "cov(avg returns, model)", "var(avg returns)"),
name = "series") +
ylab("variance / covariance") +
ggtitle(label = "High penalization",
subtitle = TeX("($\\lambda=$0.001)")) +
theme(axis.title.x = element_blank(),
strip.text = element_text(face = "bold"),
plot.title = element_text(face = "bold", margin = margin(0,0,-0.5,0, "cm")),
plot.subtitle = element_text(color = "#333333", hjust = 0.55, vjust = 0.5),
legend.position = c(0.75,0.72),
legend.background = element_rect(color = "black", size = 0.2),
axis.title.y = element_text(face = "bold"),
strip.background = element_rect(fill = "#EEEEEE"),
legend.title = element_text(face = "bold"))
g2 <- prop |> ggplot(aes(x = date, y = value, color = type)) + geom_line() +
theme_bw() + ylab("proportion of nonzero coefficients") +
theme(axis.title.x = element_blank(),
legend.position = c(0.83, 0.98),
legend.background = element_rect(color = "black", size = 0.2),
legend.title = element_text(face = "bold"),
axis.title.y = element_text(face = "bold")) +
scale_color_manual(values = c("#000000", "#AAAAAA"),
labels = c("characteristics", "fixed effects"),
name = "coef type")
g1+g2 + plot_layout(widths = c(13, 10))
ggsave("decompo_lasso_2.pdf", width = 8, height = 3.3)
IN SAMPLE OVERALL
load("R2_nn.RData")
R2 <- R2_nn |> mutate(model = "NN (baseline)")
load("R2_nn_2.RData")
R2 <- bind_rows(R2, R2_nn |> mutate(model = "NN (sophisticated)"))
load("R2_lasso_0_0003.RData")
R2_lasso <- R2_lasso |> mutate(model = "LASSO (low)")
R2 <- bind_rows(R2, R2_lasso)
load("R2_lasso_0_001.RData")
R2_lasso <- R2_lasso |> mutate(model = "LASSO (high)")
R2 <- bind_rows(R2, R2_lasso)
load("res_full_linear_24.RData")
R2 <- bind_rows(R2, res_full |>
filter(field == "R2_is", type == "fixed") |>
mutate(model = "Panel", type = field) |>
select(date, value, type, model))
R2 |>
filter(type == "R2_is") |>
ggplot(aes(x = date, y = value, color = model)) + geom_line() +
theme_bw()
Correl!
load("res_full_linear_24.RData")
OLD STUFF BELOW
res_full %>%
filter(field %in% c("R2_is", "R2_oos"),
date %in% (res_full |> filter(field == "R2_is") |> pull(date))) %>%
ggplot(aes(x = date, y = value, color = field)) + geom_line() +
facet_wrap(vars(type)) + theme_bw() + ylab("R-squared") +
theme(axis.title.x = element_blank(),
legend.position = c(0.67,0.2)) +
scale_color_manual(name = "R-squared type", label = c("In-sample", "Out-of-sample"), values = c("#000000", "#999999"))
ggsave("R2_2y.pdf", width = 8, height = 4)
load("res_full_linear_24.RData")
res_base <- res_full |> mutate(vars = "baseline")
load("res_full_linear_24_macro_lasso.RData")
res_macro <- res_full |> mutate(vars = "macro_lasso")
load("res_full_linear_24_3vars.RData")
res_3 <- res_full |> mutate(vars = "3 variables")
res_base |> bind_rows(res_macro) |> bind_rows(res_3) |>
filter(field %in% c("R2_oos"),
date %in% res_macro$date) %>%
ggplot(aes(x = date, y = value, color = vars)) + geom_line() +
facet_wrap(vars(type)) + theme_bw() + ylab("Out-of-sample R-squared") +
theme(axis.title.x = element_blank(),
legend.position = c(0.67,0.2)) +
scale_color_manual(name = "Batch of predictors",
label = c("3 predictors", "baseline", "macro + lasso"),
values = c("#FF1111", "#888888", "#000000"))
ggsave("R2_nb.pdf", width = 8, height = 4)
R^2
res_full |>
filter(field == "R2_oos", value > -0.9) |>
mutate(sample = sample |> recode_factor(`12 months` = "12 months",
`24 months` = "24 months",
`60 months` = "60 months",
`120 months` = "120 months",
.ordered = T)) |>
ggplot(aes(x = value, fill = as.factor(sample))) + geom_histogram(position = "dodge", bins = 20) +
facet_wrap(vars(type)) + theme_bw() +
theme(legend.position = c(0.15,0.5)) +
scale_fill_manual(name = "Sample size (months)", values = c("#DD051C", "#552D9B", "#4ABCDE", "#999999"))
## Warning in recode.numeric(.x, !!!values, .default = .default, .missing
## = .missing): NAs introduced by coercion
## Warning: Unreplaced values treated as NA as `.x` is not compatible.
## Please specify replacements exhaustively or supply `.default`.
ggsave("R2_sample.pdf", width = 8, height = 4)
f_e_temp <- res_full %>%
filter(field == "fe", type == "fixed") %>%
group_by(date, sample, model) %>%
summarize(avg_fe = mean(value),
fe_90 = quantile(value, 0.9),
fe_10 = quantile(value, 0.1),
fe_med = median(value),
fe_25 = quantile(value, 0.25),
fe_75 = quantile(value, 0.75))
## `summarise()` has grouped output by 'date', 'sample'. You can override using
## the `.groups` argument.
f_e_temp %>%
ggplot(aes(x = date, y = avg_fe)) + geom_line(color = "#999999") +
geom_line(aes(y = fe_90), color = "#2299DD") + geom_line(aes(y = fe_75), linetype = 2) +
geom_line(aes(y = fe_10), color = "#2299DD") + geom_line(aes(y = fe_25), linetype = 2) +
annotate("text", x = as.Date("2022-10-01"), y = f_e_temp$fe_90[nrow(f_e_temp)], label = "90%", color = "#2299DD") +
annotate("text", x = as.Date("2022-10-01"), y = f_e_temp$fe_75[nrow(f_e_temp)], label = "75%") +
annotate("text", x = as.Date("2022-06-21"), y = f_e_temp$fe_med[nrow(f_e_temp)], label = "median", color = "#999999", angle = 90) +
annotate("text", x = as.Date("2022-10-01"), y = f_e_temp$fe_25[nrow(f_e_temp)], label = "25%") +
annotate("text", x = as.Date("2022-10-01"), y = f_e_temp$fe_10[nrow(f_e_temp)], label = "10%", color = "#2299DD") +
theme_light() + ggtitle("Distribution of fixed effects (2Y samples)") +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
text = element_text(size = 15),
plot.title = element_text(face = "bold"),
panel.grid.major = element_line(size = 0.3, linetype = 'solid', colour = "#EEEEEE"), # Grid adjustment
panel.grid.minor = element_line(size = 0.1, linetype = 'solid', colour = "#FFFFFF"),
legend.title = element_text(face = "bold", size = 9))
ggsave("fe_2y.pdf", width = 8, height = 4)
res_full |>
filter(field == "coef",
factor %in% c("mom12m", "mvel1", "bm"),
type == "fixed") %>%
ggplot(aes(x = date, y = value, linetype = as.factor(sample))) + geom_line() + theme_bw() +
facet_grid(factor ~., scales = "free") +
theme(legend.position = c(0.75, 1.09), # Legend position
text = element_text(size = 12), # Overall text size
plot.title = element_text(face = "bold"), # Bold title
legend.title = element_text(face = "bold", size = 9), # Title font size for the legend
plot.subtitle = element_text(size = 10, face = "italic"), # Italic subtitle
axis.title.x = element_blank(), # No x-axis title (obvious)
axis.title.y = element_blank(), # No y-axis title (=title)
panel.grid.major = element_line(size = 0.3, linetype = 'solid', colour = "#EEEEEE"), # Grid adjustment
panel.grid.minor = element_line(size = 0.1, linetype = 'solid', colour = "#FFFFFF")) + # Grid adjustment
geom_line(alpha = 0.6, size = 0.6) + labs(subtitle = "Model with fixed effects") + # Lines + subtitle
scale_linetype_manual(values = c(1,2,3),
labels = c("12 months", "24 months", "60 months"),
name = "Sample size") +
ggtitle(TeX("Coefficient estimates for 3 factors")) +
guides(linetype = guide_legend(nrow = 1))