ADIA Lab Market Prediction
Description (from CrunchDAO):
In finance, predicting asset price returns is a fascinating yet very hard problem. For this reason, alternative prediction problems have emerged in an attempt to circumvent these difficulties and still obtain predictions with tradeable potential. One of the most interesting alternatives is the problem of identifying the relative ordering, by performance, of investment vehicles within a cross-section of a pool or subset of them. This is the cross-section forecast problem. In this setting, we track a pool of investment vehicles, generally obtained through some rule (for example, the S&P 500 tracks the stock performance of the 500 largest companies in the US), at different dates. This pool is known as the universe in financial jargon, and its definition is an object of study in its own right. The goal of this competition is to rank the performance of all assets in the universe from best to worst at each given date. The target to predict is the ranking of the future performance of each asset, remapped to the interval [-1, 1], and the scoring function is Spearman's rank correlation between the predicted and true rankings.
To illustrate an interesting use case of this problem, imagine an investment strategy that goes long on the best-performing element of the universe and short on the worst. In this setting, no matter the direction of the market, it is still possible to obtain positive returns, or at least to minimize losses.
Evaluation Metric: Spearman Rank Correlation
Ranking: 21 out of 4376
Models: Siamese RankNet Neural Network
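Below is a minimal sketch of how the evaluation metric can be reproduced locally: Spearman's rank correlation computed within each date and averaged across dates. The helper name mean_spearman_per_date, the column names (date, id, value), and the per-date averaging are assumptions for illustration; the official CrunchDAO scorer may aggregate differently.

from scipy.stats import spearmanr

def mean_spearman_per_date(y_true_df, y_pred_df):
    # Align predictions with targets on (date, id), then score each date's cross-section
    merged = y_true_df.merge(y_pred_df, on=['date', 'id'], suffixes=('_true', '_pred'))
    per_date = merged.groupby('date').apply(
        lambda g: spearmanr(g['value_true'], g['value_pred'])[0])
    return per_date.mean()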
import gc
import pickle
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, Lambda
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Model, Sequential, load_model
##### FUNCTIONS #####
def pairwise_combinations_with_ids_and_values(X):
    n_samples = X.shape[0]
    generated_pairs = set()  # To store generated pairs and avoid duplicates
    X_pairs = []
    X_pair_ids = []
    for i in range(n_samples):
        date_i = X[i, 0]
        same_date_indices = [j for j in range(i + 1, n_samples) if X[j, 0] == date_i]
        for j in same_date_indices:
            pair_key = tuple(sorted((i, j)))  # Create a unique key for the pair
            if pair_key not in generated_pairs:  # Check if pair is already generated
                X_pair = np.array([X[i, 2:], X[j, 2:]])
                X_pair_id = np.array([X[i, :2], X[j, :2]])
                generated_pairs.add(pair_key)
                X_pairs.append(X_pair)
                X_pair_ids.append(X_pair_id)
    return np.array(X_pairs, dtype='float32'), np.array(X_pair_ids)
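# Note on shapes: pairwise_combinations_with_ids_and_values returns
#   X_pairs    -> (n_pairs, 2, n_features - 2), the feature vectors of both rows in a pair
#   X_pair_ids -> (n_pairs, 2, 2), the (date, id) of both rows
# Only rows sharing the same date are paired, mirroring the training generator below.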
def pairwise_generator5(X, y, batch_size):
    n_samples = X.shape[0]
    # Create a dictionary to store indices for each date
    date_indices = defaultdict(list)
    for i in range(n_samples):
        date_indices[X[i, 0]].append(i)
    while True:
        X_batch = []
        y_batch = []
        for date_i in date_indices.keys():
            same_date_indices = date_indices[date_i]
            if len(same_date_indices) < 2:
                continue
            # Create pairs for all samples of the same date
            for i in range(len(same_date_indices)):
                for j in range(i + 1, len(same_date_indices)):
                    # Ensure the first sample in each pair is the one with the higher target value
                    if y[same_date_indices[i]][2] > y[same_date_indices[j]][2]:
                        X_pair = [X[same_date_indices[i], 2:], X[same_date_indices[j], 2:]]  # Ignore the first two columns (date, id)
                        y_pair = 1
                    else:
                        X_pair = [X[same_date_indices[j], 2:], X[same_date_indices[i], 2:]]  # Ignore the first two columns (date, id)
                        y_pair = 0
                    X_batch.append(X_pair)
                    y_batch.append(y_pair)
                    if len(X_batch) == batch_size:
                        X_batch_array = np.array(X_batch, dtype='float32')
                        yield [X_batch_array[:, 0], X_batch_array[:, 1]], np.array(y_batch, dtype='float32').reshape(-1, 1)
                        X_batch = []
                        y_batch = []  # Reset batches
        if X_batch:  # If there are any remaining samples that didn't make a full batch
            X_batch_array = np.array(X_batch, dtype='float32')
            yield [X_batch_array[:, 0], X_batch_array[:, 1]], np.array(y_batch, dtype='float32').reshape(-1, 1)
def create_siamese_model(input_shape):
    # Define the tensors for the two inputs
    left_input = Input(input_shape)
    right_input = Input(input_shape)
    # Shared encoder network
    model = Sequential()
    model.add(Dense(400, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(20, activation='relu'))
    # Generate the encodings for the two inputs
    encoded_l = model(left_input)
    encoded_r = model(right_input)
    # Add a customized layer to compute the absolute difference between the encodings
    L1_layer = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])
    # Add a dense layer with a sigmoid unit to generate the similarity score
    prediction = Dense(1, activation='sigmoid')(L1_distance)
    # Connect the inputs with the outputs
    siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)
    # Return the model
    return siamese_net
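# The single sigmoid output is used as a pairwise preference score for the two
# inputs (RankNet-style). At inference time these pairwise scores are averaged
# per (date, id) and rescaled to [-1, 1] to produce the final ranking values.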
##### TRAIN #####
def train(X_train: pd.DataFrame, y_train: pd.DataFrame, model_directory_path: str = "../resources") -> None:
    # Hold out the most recent date as a validation split
    max_date = X_train['date'].max()
    min_date = 0
    X_train_orig = X_train[(X_train['date'] < max_date) & (X_train['date'] > min_date)]
    X_test = X_train[X_train['date'] == max_date]
    y_train_orig = y_train[(y_train['date'] < max_date) & (y_train['date'] > min_date)]
    y_test = y_train[y_train['date'] == max_date]
    # Split ids from features
    X_ids = np.asarray(X_train_orig[['date', 'id']])
    X_scale_pca = np.asarray(X_train_orig.drop(columns=['date', 'id']))
    X_test_ids = np.asarray(X_test[['date', 'id']])
    X_test_scale_pca = np.asarray(X_test.drop(columns=['date', 'id']))
    # PCA
    n_components = 40
    pca = PCA(n_components=n_components)
    pca_features = pca.fit_transform(X_scale_pca)
    X_train_concat = np.concatenate((X_ids, pca_features), axis=1)
    y_train = np.asarray(y_train_orig)
    pca_features_test = pca.transform(X_test_scale_pca)
    X_test_concat = np.concatenate((X_test_ids, pca_features_test), axis=1)
    y_test = np.asarray(y_test)
    # Save out the fitted PCA so infer() can reuse it
    with open(Path(model_directory_path) / 'pca.pkl', 'wb') as file:
        pickle.dump(pca, file)
    batch_size = 3500
    train_generator = pairwise_generator5(X_train_concat, y_train, batch_size)
    test_generator = pairwise_generator5(X_test_concat, y_test, batch_size)
    print(X_train_concat.shape)
    # Model Training: resume from a saved model if one exists, otherwise build a new one
    model_pathname = Path(model_directory_path) / "model.keras"
    if model_pathname.is_file():
        model = load_model(model_pathname)
    else:
        model = create_siamese_model((X_train_concat.shape[1] - 2,))
        optimizer = keras.optimizers.Adam(learning_rate=0.001)
        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=[AUC(name='auc')])
    mc = ModelCheckpoint(model_pathname, monitor='val_loss', mode='min', verbose=1, save_best_only=True)
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=8,
        verbose=1,
        mode='auto',
        baseline=None)
    history = model.fit(
        train_generator,
        steps_per_epoch=1000,
        epochs=1000,
        validation_data=test_generator,
        validation_steps=500,
        callbacks=[mc, early_stopping],
        verbose=1
    )
    model.save(model_pathname)
    gc.collect()
    print("Finished All Training")
def infer(X_test: pd.DataFrame, model_directory_path: str = "../resources") -> pd.DataFrame:
    X_test_orig = X_test.copy()
    # Load the PCA fitted in train()
    with open(Path(model_directory_path) / 'pca.pkl', 'rb') as file:
        pca = pickle.load(file)
    # Apply the same dimensionality reduction as in training
    X_ids = np.asarray(X_test_orig[['date', 'id']])
    X_scale_pca = np.asarray(X_test_orig.drop(columns=['date', 'id']))
    pca_features = pca.transform(X_scale_pca)
    X_test_concat = np.concatenate((X_ids, pca_features), axis=1)
    # Load Model
    model_pathname = Path(model_directory_path) / "model.keras"
    model = load_model(model_pathname)
    # Pairwise Transformation: build all within-date pairs
    batch_size = 3500
    X_test_pairs, X_test_ids = pairwise_combinations_with_ids_and_values(X_test_concat)
    print("Predicting for Test Data")
    preds = model.predict([X_test_pairs[:, 0], X_test_pairs[:, 1]], batch_size=batch_size)
    # Average the pairwise scores per (date, id) of the left element of each pair
    preds_df = pd.DataFrame({'id': X_test_ids[:, 0, 1].flatten(), 'date': X_test_ids[:, 0, 0].flatten(), 'value': preds.flatten()})
    preds_df = preds_df.groupby(['date', 'id']).mean().reset_index()
    result_df = pd.merge(X_test_orig, preds_df, on=['id', 'date'], how='left')
    result_df['value'] = result_df['value'].fillna(result_df['value'].mean())
    # Rescale the scores to the competition's [-1, 1] range
    minmax = MinMaxScaler(feature_range=(-1, 1))
    result_df['value'] = minmax.fit_transform(result_df[['value']])
    print("Finished predicting Test Data")
    return result_df[['date', 'id', 'value']]
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
X_test = pd.read_parquet('../data/X_test.parquet')
train(X_train, y_train)
results = infer(X_test)
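# Optional sanity check (assumes the result schema produced by infer() above):
# values should lie in [-1, 1] after MinMax rescaling, one row per (date, id) in X_test.
print(results[['date', 'id', 'value']].describe())
assert results['value'].between(-1, 1).all()
assert len(results) == len(X_test)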