ADIA Lab Market Prediction
Description (from CrunchDAO):
In finance, predicting asset price returns is a fascinating yet very hard problem. For this reason, alternative prediction problems have emerged in an attempt to circumvent these difficulties and still obtain predictions with tradeable potential. One of the most interesting alternatives is the problem of identifying the relative ordering, by performance, of investment vehicles within a cross-section of a pool or subset of them. This is the cross-section forecast problem. In this setting, we track a pool of investment vehicles, generally obtained through some rule (for example, the S&P 500 tracks the stock performance of the 500 largest companies in the US), at different dates. This pool is known as the universe in financial jargon, and its definition is an object of study in its own right. The goal of this competition is to rank the performance of all assets in the universe from best to worst at each given date. The target to predict is the ranking of the future performance of each asset, remapped to the interval [-1, 1], and the scoring function is Spearman's rank correlation between the predicted and true rankings.
To illustrate an interesting use case of this problem, imagine an investment strategy that goes long on the best-performing element of the universe and short on the worst. In this setting, no matter the direction of the market, it is still possible to obtain positive returns, or at least to minimize losses.
Evaluation Metric: Spearman Rank Correlation
Ranking: 21 out of 4376
Models: Siamese RankNet Neural Network
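Below is a minimal sketch of how the evaluation metric can be reproduced locally: Spearman's rank correlation computed within each date and averaged across dates. The helper name mean_spearman_per_date, the column names (date, id, value), and the per-date averaging are assumptions for illustration; the official CrunchDAO scorer may aggregate differently.

from scipy.stats import spearmanr

def mean_spearman_per_date(y_true_df, y_pred_df):
    # Align predictions with targets on (date, id), then score each date's cross-section
    merged = y_true_df.merge(y_pred_df, on=['date', 'id'], suffixes=('_true', '_pred'))
    per_date = merged.groupby('date').apply(
        lambda g: spearmanr(g['value_true'], g['value_pred'])[0])
    return per_date.mean()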
import gc
import pickle
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, Input, Lambda
from tensorflow.keras.metrics import AUC
from tensorflow.keras.models import Model, Sequential, load_model
##### FUNCTIONS #####
def pairwise_combinations_with_ids_and_values(X):
    n_samples = X.shape[0]
    generated_pairs = set()  # To store generated pairs and avoid duplicates
    X_pairs = []
    X_pair_ids = []
    for i in range(n_samples):
        date_i = X[i, 0]
        same_date_indices = [j for j in range(i + 1, n_samples) if X[j, 0] == date_i]
        for j in same_date_indices:
            pair_key = tuple(sorted((i, j)))  # Create a unique key for the pair
            if pair_key not in generated_pairs:  # Check if pair is already generated
                X_pair = np.array([X[i, 2:], X[j, 2:]])
                X_pair_id = np.array([X[i, :2], X[j, :2]])
                generated_pairs.add(pair_key)
                X_pairs.append(X_pair)
                X_pair_ids.append(X_pair_id)
    return np.array(X_pairs, dtype='float32'), np.array(X_pair_ids)
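# Note on shapes: pairwise_combinations_with_ids_and_values returns
#   X_pairs    -> (n_pairs, 2, n_features - 2), the feature vectors of both rows in a pair
#   X_pair_ids -> (n_pairs, 2, 2), the (date, id) of both rows
# Only rows sharing the same date are paired, mirroring the training generator below.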
def pairwise_generator5(X, y, batch_size):
    n_samples = X.shape[0]
    # Create a dictionary to store indices for each date
    date_indices = defaultdict(list)
    for i in range(n_samples):
        date_indices[X[i, 0]].append(i)
    while True:
        X_batch = []
        y_batch = []
        for date_i in date_indices.keys():
            same_date_indices = date_indices[date_i]
            if len(same_date_indices) < 2:
                continue
            # Create pairs for all samples of the same date
            for i in range(len(same_date_indices)):
                for j in range(i + 1, len(same_date_indices)):
                    # Ensure the first sample in each pair is the one with the higher target value
                    if y[same_date_indices[i]][2] > y[same_date_indices[j]][2]:
                        X_pair = [X[same_date_indices[i], 2:], X[same_date_indices[j], 2:]]  # Ignore the first two columns (date, id)
                        y_pair = 1
                    else:
                        X_pair = [X[same_date_indices[j], 2:], X[same_date_indices[i], 2:]]  # Ignore the first two columns (date, id)
                        y_pair = 0
                    X_batch.append(X_pair)
                    y_batch.append(y_pair)
                    if len(X_batch) == batch_size:
                        X_batch_array = np.array(X_batch, dtype='float32')
                        yield [X_batch_array[:, 0], X_batch_array[:, 1]], np.array(y_batch, dtype='float32').reshape(-1, 1)
                        X_batch = []
                        y_batch = []  # Reset batches
        if X_batch:  # If there are any remaining samples that didn't make a full batch
            X_batch_array = np.array(X_batch, dtype='float32')
            yield [X_batch_array[:, 0], X_batch_array[:, 1]], np.array(y_batch, dtype='float32').reshape(-1, 1)
def create_siamese_model(input_shape):
    # Define the tensors for the two inputs
    left_input = Input(input_shape)
    right_input = Input(input_shape)
    # Shared encoder network
    model = Sequential()
    model.add(Dense(400, activation='relu'))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(20, activation='relu'))
    # Generate the encodings for the two inputs
    encoded_l = model(left_input)
    encoded_r = model(right_input)
    # Add a customized layer to compute the absolute difference between the encodings
    L1_layer = Lambda(lambda tensors: K.abs(tensors[0] - tensors[1]))
    L1_distance = L1_layer([encoded_l, encoded_r])
    # Add a dense layer with a sigmoid unit to generate the similarity score
    prediction = Dense(1, activation='sigmoid')(L1_distance)
    # Connect the inputs with the outputs
    siamese_net = Model(inputs=[left_input, right_input], outputs=prediction)
    # Return the model
    return siamese_net
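# The single sigmoid output is used as a pairwise preference score for the two
# inputs (RankNet-style). At inference time these pairwise scores are averaged
# per (date, id) and rescaled to [-1, 1] to produce the final ranking values.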
##### TRAIN #####
def train(X_train: pd.DataFrame, y_train: pd.DataFrame, model_directory_path: str = "../resources") -> None:
    # Hold out the most recent date as a validation split
    max_date = X_train['date'].max()
    min_date = 0
    X_train_orig = X_train[(X_train['date'] < max_date) & (X_train['date'] > min_date)]
    X_test = X_train[X_train['date'] == max_date]
    y_train_orig = y_train[(y_train['date'] < max_date) & (y_train['date'] > min_date)]
    y_test = y_train[y_train['date'] == max_date]
    # Split ids from features
    X_ids = np.asarray(X_train_orig[['date', 'id']])
    X_scale_pca = np.asarray(X_train_orig.drop(columns=['date', 'id']))
    X_test_ids = np.asarray(X_test[['date', 'id']])
    X_test_scale_pca = np.asarray(X_test.drop(columns=['date', 'id']))
    # PCA
    n_components = 40
    pca = PCA(n_components=n_components)
    pca_features = pca.fit_transform(X_scale_pca)
    X_train_concat = np.concatenate((X_ids, pca_features), axis=1)
    y_train = np.asarray(y_train_orig)
    pca_features_test = pca.transform(X_test_scale_pca)
    X_test_concat = np.concatenate((X_test_ids, pca_features_test), axis=1)
    y_test = np.asarray(y_test)
    # Save out the fitted PCA so infer() can reuse it
    with open(Path(model_directory_path) / 'pca.pkl', 'wb') as file:
        pickle.dump(pca, file)
    batch_size = 3500
    train_generator = pairwise_generator5(X_train_concat, y_train, batch_size)
    test_generator = pairwise_generator5(X_test_concat, y_test, batch_size)
    print(X_train_concat.shape)
    # Model Training: resume from a saved model if one exists, otherwise build a new one
    model_pathname = Path(model_directory_path) / "model.keras"
    if model_pathname.is_file():
        model = load_model(model_pathname)
    else:
        model = create_siamese_model((X_train_concat.shape[1] - 2,))
        optimizer = keras.optimizers.Adam(learning_rate=0.001)
        model.compile(optimizer=optimizer,
                      loss='binary_crossentropy',
                      metrics=[AUC(name='auc')])
    mc = ModelCheckpoint(model_pathname, monitor='val_loss', mode='min', verbose=1, save_best_only=True)
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=8,
        verbose=1,
        mode='auto',
        baseline=None)
    history = model.fit(
        train_generator,
        steps_per_epoch=1000,
        epochs=1000,
        validation_data=test_generator,
        validation_steps=500,
        callbacks=[mc, early_stopping],
        verbose=1
    )
    model.save(model_pathname)
    gc.collect()
    print("Finished All Training")
def infer(X_test: pd.DataFrame, model_directory_path: str = "../resources") -> pd.DataFrame:
    X_test_orig = X_test.copy()
    # Load the PCA fitted in train()
    with open(Path(model_directory_path) / 'pca.pkl', 'rb') as file:
        pca = pickle.load(file)
    # Apply the same dimensionality reduction as in training
    X_ids = np.asarray(X_test_orig[['date', 'id']])
    X_scale_pca = np.asarray(X_test_orig.drop(columns=['date', 'id']))
    pca_features = pca.transform(X_scale_pca)
    X_test_concat = np.concatenate((X_ids, pca_features), axis=1)
    # Load Model
    model_pathname = Path(model_directory_path) / "model.keras"
    model = load_model(model_pathname)
    # Pairwise Transformation: build all within-date pairs
    batch_size = 3500
    X_test_pairs, X_test_ids = pairwise_combinations_with_ids_and_values(X_test_concat)
    print("Predicting for Test Data")
    preds = model.predict([X_test_pairs[:, 0], X_test_pairs[:, 1]], batch_size=batch_size)
    # Average the pairwise scores per (date, id) of the left element of each pair
    preds_df = pd.DataFrame({'id': X_test_ids[:, 0, 1].flatten(), 'date': X_test_ids[:, 0, 0].flatten(), 'value': preds.flatten()})
    preds_df = preds_df.groupby(['date', 'id']).mean().reset_index()
    result_df = pd.merge(X_test_orig, preds_df, on=['id', 'date'], how='left')
    result_df['value'] = result_df['value'].fillna(result_df['value'].mean())
    # Rescale the scores to the competition's [-1, 1] range
    minmax = MinMaxScaler(feature_range=(-1, 1))
    result_df['value'] = minmax.fit_transform(result_df[['value']])
    print("Finished predicting Test Data")
    return result_df[['date', 'id', 'value']]
X_train = pd.read_parquet('../data/X_train.parquet')
y_train = pd.read_parquet('../data/y_train.parquet')
X_test = pd.read_parquet('../data/X_test.parquet')
train(X_train, y_train)
results = infer(X_test)
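# Optional sanity check (assumes the result schema produced by infer() above):
# values should lie in [-1, 1] after MinMax rescaling, one row per (date, id) in X_test.
print(results[['date', 'id', 'value']].describe())
assert results['value'].between(-1, 1).all()
assert len(results) == len(X_test)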