Dare in Reality Hackathon 2021: Predict Lap Timings for Qualifying Session

Problem Description (from MachineHack):

In the heat of a Formula E race, teams need fast access to insights that can help drivers make split-second decisions and cross the finish line first. Can your data-science skills help Envision Racing, one of the founding teams in the championship, take home even more trophies?

To do so, you will have to build a machine learning model that predicts the Envision Racing drivers’ lap times for the all-important qualifying sessions that determine what position they start the race in. Winning races involves a combination of both a driver’s skills and data analytics. To help the team you’ll need to consider several factors that affect performance during a session, including weather, track conditions, and a driver’s familiarity with the track.

Genpact, a leading professional services firm focused on digital transformation, is collaborating with Envision Racing, a Formula E racing team, and MachineHack, a digital hackathon platform and brainchild of Analytics India Magazine, to launch 'Dare in Reality'. This two-week hackathon allows data science professionals, machine learning engineers, artificial intelligence practitioners, and other tech enthusiasts to showcase their skills, impress the judges, and stand a chance to win exciting cash prizes.

Genpact (NYSE: G) is a global professional services firm that makes business transformation real, driving digital-led innovation and digitally enabled intelligent operations for our clients.

Evaluation Metric: Root Mean Squared Log Error (RMSLE)
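For reference, RMSLE = sqrt(mean((log(1 + y_pred) - log(1 + y_true))^2)), which is exactly what the root_mean_squared_log_error helpers defined in the notebooks below compute.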

Ranking: 46 out of 346. First-place RMSLE was 0.46946; mine was 0.47991.

Models: Random Forest, Gradient Boost, Neural Network

Data Processing

 
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('../Data/train.csv')
df.columns = [x.title().strip() for x in df.columns]
df = df.dropna(subset=['S1'])

# quick look at the event distribution
df['Event'].value_counts()

# map verbose event names to short codes and build a per-driver trial ID
event_codes = {
    'Free Practice 1': 'FP1',
    'Free Practice 2': 'FP2',
    'Free Practice 3': 'FP3',
    'Qualifying Group 1': 'QG1',
    'Qualifying Group 2': 'QG2',
    'Qualifying Group 3': 'QG3',
    'Qualifying Group 4': 'QG4',
}
for index, row in df.iterrows():
    number = str(row['Number'])
    location_number = row['Location'][-1:]
    event = event_codes[row['Event']]
    df.at[index, 'Trial_ID'] = event + '-' + location_number + '-' + number
# number each run within a trial: a drop in Lap_Number marks the start of a new run
for x in df['Trial_ID'].unique():
    temp = df.loc[df['Trial_ID'] == x]

    lap_number_previous = 1
    trial_identifier = 1

    for index, row in temp.iterrows():
        if row['Lap_Number'] >= lap_number_previous:
            df.at[index, 'Trial_Number'] = trial_identifier
            lap_number_previous += 1
        else:  # lap number dropped: new run
            trial_identifier += 1
            df.at[index, 'Trial_Number'] = trial_identifier
            lap_number_previous = 1
df['Trial_ID_2'] = df['Trial_ID'] + '-' + df['Trial_Number'].astype(int).astype(str)
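Assuming rows are in chronological order and the lap counter only drops at the start of a new run, an approximately equivalent vectorized form would be:

# hypothetical vectorized alternative (not used in the rest of the notebook)
resets = df.groupby('Trial_ID')['Lap_Number'].diff() < 0
df['Trial_Number'] = resets.groupby(df['Trial_ID']).cumsum() + 1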
def TimeConversion(x):
    """Parse a time string ('M:SS.fff', 'SS.fff', or 'SS') into a timedelta; NaN becomes 0."""
    x = str(x)
    if x != 'nan':
        try:
            y = datetime.strptime(x, '%M:%S.%f').time()
        except ValueError:
            try:
                y = datetime.strptime(x, '%S.%f').time()
            except ValueError:
                try:
                    y = datetime.strptime(x, '%S').time()
                except ValueError:
                    y = datetime.strptime('0', '%S').time()
    else:
        y = datetime.strptime('0', '%S').time()
    return timedelta(minutes=y.minute, seconds=y.second, microseconds=y.microsecond)
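A few illustrative calls:

TimeConversion('1:23.456')  # timedelta(minutes=1, seconds=23, microseconds=456000)
TimeConversion('17.803')    # timedelta(seconds=17, microseconds=803000)
TimeConversion('nan')       # timedelta(0)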
time_cols = [
    'S1',
    'S2',
    'S3',
    'Elapsed',
    'Hour',
    'S1_Large',
    'S2_Large',
    'S3_Large',
    'Pit_Time',
]

daytime_cols = ['Hour']

for x in time_cols:
    df[x] = df[x].apply(TimeConversion)
# spread each run's total pit time evenly across its laps
for x in df['Trial_ID_2'].unique():
    temp = df.loc[df['Trial_ID_2'] == x]
    laps = len(temp)
    pit_time = timedelta(0)
    for index, row in temp.iterrows():
        if not pd.isna(row['Pit_Time']):  # original used '~', which is bitwise-not, not negation
            pit_time += row['Pit_Time']

    df.loc[df['Trial_ID_2'] == x, 'Pit_Time'] = pit_time / laps
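Assuming Pit_Time holds a proper timedelta dtype at this point, the loop above collapses to a single groupby transform:

# equivalent vectorized form (sketch)
df['Pit_Time'] = df.groupby('Trial_ID_2')['Pit_Time'].transform('mean')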
df = df.drop(columns=[
    'S1_Large',
    'S2_Large',
    'S3_Large',
    'Trial_Number',
    'Trial_ID',
    'Number',
    'Driver_Number',
    'Crossing_Finish_Line_In_Pit'
])
def ConvertToSeconds(x):
    y = x.total_seconds()
    return y

df['S1'] = df['S1'].apply(ConvertToSeconds)
df['S2'] = df['S2'].apply(ConvertToSeconds)
df['S3'] = df['S3'].apply(ConvertToSeconds)
df['Pit_Time'] = df['Pit_Time'].apply(ConvertToSeconds)

df['Time_Minutes'] = [(x.total_seconds() / 60) for x in df['Hour']]
wdf_train = pd.read_csv('../Data/train_weather.csv')
wdf_test = pd.read_csv('../Data/test_weather.csv')
wdf_test = wdf_test.rename(columns={'EVENTS': 'EVENT'})
wdf_test['RAIN'] = wdf_test['RAIN'].astype(str)

wdf = pd.concat([wdf_train, wdf_test])

wdf['TIME_UTC_STR'] = pd.to_datetime(wdf['TIME_UTC_STR'], dayfirst=True)
wdf['TIME_UTC_MINUTE'] = [x.minute for x in wdf['TIME_UTC_STR']]

num_cols = ['AIR_TEMP', 'TRACK_TEMP', 'HUMIDITY', 'PRESSURE', 'WIND_SPEED', 'RAIN']
wdf['RAIN'] = [x.replace(',', '.') for x in wdf['RAIN']]

for col in num_cols:
    # strip separators before parsing; the chained .replace('.', '') in the original
    # was Series.replace (whole-cell match), not a string replace, and did nothing useful.
    # This quick pass ignores the per-location number formats, which the final
    # DataProcessing function below handles properly.
    wdf[col] = wdf[col].astype(str).str.replace(',', '', regex=False).astype(float)

wdf.dtypes

# attach the weather reading whose minute window contains the lap's timestamp;
# fall back to event-level, then location-level averages when no minute matches
for index, row in df.iterrows():
    location = row['Location']
    event = row['Event']
    time = row['Time_Minutes']
    
    try:
        weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event) & (wdf['TIME_UTC_MINUTE'] <= time) & (time < (wdf['TIME_UTC_MINUTE'] + 1))]
        weather = weather.iloc[0]
        df.at[index, 'Air_Temp'] = weather['AIR_TEMP']
        df.at[index, 'Track_Temp'] = weather['TRACK_TEMP']
        df.at[index, 'Humidity'] = weather['HUMIDITY']
        df.at[index, 'Pressure'] = weather['PRESSURE']
        df.at[index, 'Wind_Speed'] = weather['WIND_SPEED']
        df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION']
        df.at[index, 'Rain'] = weather['RAIN']
    except IndexError:
        weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event)]
        if not weather.empty:
            df.at[index, 'Air_Temp'] = weather['AIR_TEMP'].mean()
            df.at[index, 'Track_Temp'] = weather['TRACK_TEMP'].mean()
            df.at[index, 'Humidity'] = weather['HUMIDITY'].mean()
            df.at[index, 'Pressure'] = weather['PRESSURE'].mean()
            df.at[index, 'Wind_Speed'] = weather['WIND_SPEED'].mean()
            df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION'].mean()
            df.at[index, 'Rain'] = weather['RAIN'].mode()[0]
        else:
            weather = wdf.loc[wdf['LOCATION'] == location]
            df.at[index, 'Air_Temp'] = weather['AIR_TEMP'].mean()
            df.at[index, 'Track_Temp'] = weather['TRACK_TEMP'].mean()
            df.at[index, 'Humidity'] = weather['HUMIDITY'].mean()
            df.at[index, 'Pressure'] = weather['PRESSURE'].mean()
            df.at[index, 'Wind_Speed'] = weather['WIND_SPEED'].mean()
            df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION'].mean()
            df.at[index, 'Rain'] = weather['RAIN'].mode()[0]
    

df['Power'] = df['Power'].fillna(df['Power'].mode()[0])
df['Kph'] = df['Kph'].fillna(df['Kph'].mean())
df = df.drop(columns=['Group', 'Hour', 'Trial_ID_2', 'Time_Minutes'])

Data Processing Function

def DataProcessing(csv_url):
    df = pd.read_csv(csv_url)
    df.columns = [x.title().strip() for x in df.columns]
    df = df.dropna(subset=['S1'])

    # quick look at the event distribution
    df['Event'].value_counts()

    # map verbose event names to short codes and build a per-driver trial ID
    event_codes = {
        'Free Practice 1': 'FP1',
        'Free Practice 2': 'FP2',
        'Free Practice 3': 'FP3',
        'Qualifying Group 1': 'QG1',
        'Qualifying Group 2': 'QG2',
        'Qualifying Group 3': 'QG3',
        'Qualifying Group 4': 'QG4',
    }
    for index, row in df.iterrows():
        number = str(row['Number'])
        location_number = row['Location'][-1:]
        event = event_codes[row['Event']]
        df.at[index, 'Trial_ID'] = event + '-' + location_number + '-' + number

    for x in df['Trial_ID'].unique():
        temp = df.loc[df['Trial_ID'] == x]

        lap_number_previous = 1
        trial_identifier = 1

        for index, row in temp.iterrows():
            if row['Lap_Number'] >= lap_number_previous:
                df.at[index, 'Trial_Number'] = trial_identifier
                lap_number_previous += 1
            elif row['Lap_Number'] < lap_number_previous:
                trial_identifier += 1
                df.at[index, 'Trial_Number'] = trial_identifier
                lap_number_previous = 1

    df['Trial_ID_2'] = df['Trial_ID'] + '-' + df['Trial_Number'].astype(int).astype(str)

    def TimeConversion(x):
        x = str(x)
        if x != 'nan':
            try:
                y = datetime.strptime(x, '%M:%S.%f').time()
            except ValueError:
                try:
                    y = datetime.strptime(x, '%S.%f').time()
                except ValueError:
                    try:
                        y = datetime.strptime(x, '%S').time()
                    except ValueError:
                        y = datetime.strptime('0', '%S').time()
        if x == 'nan':
            y = datetime.strptime('0', '%S').time()
        z = timedelta(minutes=y.minute, seconds=y.second, microseconds=y.microsecond)
        return z

    time_cols = [
        'S1',
        'S2',
        'S3',
        'Elapsed',
        'Hour',
        'S1_Large',
        'S2_Large',
        'S3_Large',
        'Pit_Time',
    ]

    daytime_cols = ['Hour']

    for x in time_cols:
        df[x] = df[x].apply(TimeConversion)

    df = df.drop(columns=[
    'S1_Large',
    'S2_Large',
    'S3_Large',
    'Trial_Number',
    'Trial_ID',
    'Number',
    'Driver_Number',
    'Crossing_Finish_Line_In_Pit'
    ])

    def ConvertToSeconds(x):
        y = x.total_seconds()
        return y

    df['S1'] = df['S1'].apply(ConvertToSeconds)
    df['S2'] = df['S2'].apply(ConvertToSeconds)
    df['S3'] = df['S3'].apply(ConvertToSeconds)
    df['Pit_Time'] = df['Pit_Time'].apply(ConvertToSeconds)
    df['Elapsed'] = df['Elapsed'].apply(ConvertToSeconds)

    df['Time_Minutes'] = [(x.total_seconds() / 60) for x in df['Hour']]

    wdf_train = pd.read_csv('../Data/train_weather.csv')
    wdf_test = pd.read_csv('../Data/test_weather.csv')
    wdf_test = wdf_test.rename(columns={'EVENTS': 'EVENT'})
    wdf_test['RAIN'] = wdf_test['RAIN'].astype(str)

    wdf = pd.concat([wdf_train, wdf_test])

    #split into location due to different number formats
    # Locations 1-4 use a comma as the decimal separator
    train_weather_l1 = wdf[wdf['LOCATION'].isin(['Location 1', 'Location 2', 'Location 3', 'Location 4'])].copy()
    for col in ['AIR_TEMP', 'TRACK_TEMP', 'HUMIDITY', 'PRESSURE', 'WIND_SPEED', 'RAIN']:
        train_weather_l1[col] = pd.to_numeric(train_weather_l1[col].str.replace(',', '.', regex=False))



    # Locations 5-8 mix formats: strip commas, then shift values that are too
    # large by a power of ten back into a plausible range
    train_weather_l2 = wdf[wdf['LOCATION'].isin(['Location 5', 'Location 6', 'Location 7', 'Location 8'])].copy()
    train_weather_l2['AIR_TEMP'] = pd.to_numeric(train_weather_l2['AIR_TEMP'].str.replace(',', '', regex=False), errors='coerce')
    conditions = [
        (train_weather_l2['AIR_TEMP'] > 100) & (train_weather_l2['AIR_TEMP'] < 1000),
        (train_weather_l2['AIR_TEMP'] > 1000) & (train_weather_l2['AIR_TEMP'] < 10000),
        (train_weather_l2['AIR_TEMP'] > 10000) & (train_weather_l2['AIR_TEMP'] < 100000),
        (train_weather_l2['AIR_TEMP'] > 100000)]
    choices = [train_weather_l2['AIR_TEMP'] / 10, train_weather_l2['AIR_TEMP'] / 100,
               train_weather_l2['AIR_TEMP'] / 1000, train_weather_l2['AIR_TEMP'] / 10000]
    # values at or below 100 (including parse failures) fall back to 20
    train_weather_l2['AIR_TEMP'] = np.select(conditions, choices, default=20)

    train_weather_l2['TRACK_TEMP'] = pd.to_numeric(train_weather_l2['TRACK_TEMP'].str.replace(',', '.', regex=False), errors='coerce')

    train_weather_l2['HUMIDITY'] = pd.to_numeric(train_weather_l2['HUMIDITY'].str.replace(',', '.', regex=False), errors='coerce')



    train_weather_l2['PRESSURE'] = pd.to_numeric(train_weather_l2['PRESSURE'].str.replace(',', '', regex=False), errors='coerce')
    conditions = [
        (train_weather_l2['PRESSURE'] > 10000) & (train_weather_l2['PRESSURE'] < 20000),
        (train_weather_l2['PRESSURE'] > 20000) & (train_weather_l2['PRESSURE'] < 200000),
        (train_weather_l2['PRESSURE'] > 200000)]
    choices = [train_weather_l2['PRESSURE'] / 10,
               train_weather_l2['PRESSURE'] / 100,
               train_weather_l2['PRESSURE'] / 1000]
    # values at or below 10000 (including parse failures) fall back to 1000
    train_weather_l2['PRESSURE'] = np.select(conditions, choices, default=1000)


    train_weather_l2['WIND_SPEED'] = pd.to_numeric(train_weather_l2['WIND_SPEED'].str.replace(',', '', regex=False), errors='coerce')
    conditions = [
        (train_weather_l2['WIND_SPEED'] > 10) & (train_weather_l2['WIND_SPEED'] < 100),
        (train_weather_l2['WIND_SPEED'] > 100) & (train_weather_l2['WIND_SPEED'] < 1000),
        (train_weather_l2['WIND_SPEED'] > 1000) & (train_weather_l2['WIND_SPEED'] < 10000),
        (train_weather_l2['WIND_SPEED'] > 10000) & (train_weather_l2['WIND_SPEED'] < 100000),
        (train_weather_l2['WIND_SPEED'] > 100000)]
    choices = [train_weather_l2['WIND_SPEED'] / 10,
               train_weather_l2['WIND_SPEED'] / 100,
               train_weather_l2['WIND_SPEED'] / 1000,
               train_weather_l2['WIND_SPEED'] / 10000,
               train_weather_l2['WIND_SPEED'] / 100000]
    train_weather_l2['WIND_SPEED'] = np.select(conditions, choices, default=1)

    train_weather_l2['RAIN'] = pd.to_numeric(train_weather_l2['RAIN'].str.replace(',', '.', regex=False))

    wdf = pd.concat([train_weather_l1, train_weather_l2])

    wdf['TIME_UTC_STR'] = pd.to_datetime(wdf['TIME_UTC_STR'], dayfirst=True)
    wdf['TIME_UTC_MINUTE'] = [x.minute for x in wdf['TIME_UTC_STR']]

    num_cols = ['AIR_TEMP', 'TRACK_TEMP', 'HUMIDITY', 'PRESSURE', 'WIND_SPEED', 'RAIN']

    # attach the weather reading whose minute window contains the lap's timestamp;
    # fall back to event-level, then location-level averages when no minute matches
    for index, row in df.iterrows():
        location = row['Location']
        event = row['Event']
        time = row['Time_Minutes']

        try:
            weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event) & (wdf['TIME_UTC_MINUTE'] <= time) & (time < (wdf['TIME_UTC_MINUTE'] + 1))]
            weather = weather.iloc[0]
            df.at[index, 'Air_Temp'] = weather['AIR_TEMP']
            df.at[index, 'Track_Temp'] = weather['TRACK_TEMP']
            df.at[index, 'Humidity'] = weather['HUMIDITY']
            df.at[index, 'Pressure'] = weather['PRESSURE']
            df.at[index, 'Wind_Speed'] = weather['WIND_SPEED']
            df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION']
            df.at[index, 'Rain'] = weather['RAIN']
        except IndexError:
            weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event)]
            if not weather.empty:
                df.at[index, 'Air_Temp'] = weather['AIR_TEMP'].mean()
                df.at[index, 'Track_Temp'] = weather['TRACK_TEMP'].mean()
                df.at[index, 'Humidity'] = weather['HUMIDITY'].mean()
                df.at[index, 'Pressure'] = weather['PRESSURE'].mean()
                df.at[index, 'Wind_Speed'] = weather['WIND_SPEED'].mean()
                df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION'].mean()
                df.at[index, 'Rain'] = weather['RAIN'].mode()[0]
            else:
                weather = wdf.loc[wdf['LOCATION'] == location]
                df.at[index, 'Air_Temp'] = weather['AIR_TEMP'].mean()
                df.at[index, 'Track_Temp'] = weather['TRACK_TEMP'].mean()
                df.at[index, 'Humidity'] = weather['HUMIDITY'].mean()
                df.at[index, 'Pressure'] = weather['PRESSURE'].mean()
                df.at[index, 'Wind_Speed'] = weather['WIND_SPEED'].mean()
                df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION'].mean()
                df.at[index, 'Rain'] = weather['RAIN'].mode()[0]


    df['Power'] = df['Power'].fillna(df['Power'].mode()[0])
    df['Kph'] = df['Kph'].fillna(df['Kph'].mean())
    df['Pit_Time'] = df['Pit_Time'].fillna(0)

    if csv_url == '../Data/train.csv':
        df = df[df['Lap_Time'] > 65]  # drop implausibly short laps from the training data only

    df = df.drop(columns=['Group', 'Hour', 'Trial_ID_2', 'Time_Minutes'])
    
    # remove pit time from whichever sector contains the stop
    for index, row in df.iterrows():
        pit_time = row['Pit_Time']
        s1 = row['S1']
        s2 = row['S2']
        s3 = row['S3']

        if pit_time != 0:
            if s1 > pit_time:
                df.at[index, 'S1'] = s1 - pit_time
            elif s2 > pit_time:
                df.at[index, 'S2'] = s2 - pit_time
            elif s3 > pit_time:
                df.at[index, 'S3'] = s3 - pit_time

    return df
df = DataProcessing('../Data/train.csv')
 

Random Forest

 
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, QuantileTransformer

sys.path.append('../Scripts')
from Data_Processing import DataProcessing
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')
df = DataProcessing('../Data/train.csv')
df = df.loc[df['Lap_Time'] != 0]

y = df['Lap_Time']
X = df.drop(columns=['Lap_Time'])

obj_columns = list(X.select_dtypes(include=object).columns)
obj_columns.append('Lap_Number')
obj_columns.append('Lap_Improvement')

num_columns = list(X.select_dtypes(include='number').columns)
num_columns.remove('Lap_Number')
num_columns.remove('Lap_Improvement')

Scalers

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib
# lowercase the variable so it doesn't shadow the ColumnTransformer class;
# this transformer is fitted and saved in the Gradient Boost notebook
column_transformer = joblib.load('../Models/Column_Transformer.pkl')
#PowerTransformer = joblib.load('../Models/Power_Transformer.pkl')

trans_X = column_transformer.transform(X)

y = np.asarray(y)

Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=11, test_size=0.2)

RF Model

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=1600,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=10,
    bootstrap=True)
rf.fit(X_train, y_train)
results = pd.DataFrame()
results['Predicted'] = rf.predict(X_test)
results['Actual']= y_test
results['Difference'] = abs(results['Predicted'] - results['Actual'])
results['Difference'].mean()
from sklearn.metrics import mean_squared_error
mean_squared_error(results['Actual'], results['Predicted'], squared=False)
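The hold-out is scored above with RMSE; the competition metric is RMSLE, which can be checked with a small numpy sketch (equivalent to the keras-backend helper defined further below):

def rmsle(y_true, y_pred):
    # root mean squared log error, the competition metric
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

rmsle(results['Actual'], results['Predicted'])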
joblib.dump(rf, '../Models/RF_Model.h5')

New Features

from sklearn.model_selection import train_test_split
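`indexes` is not defined in this notebook; it presumably holds the column positions kept after the feature-importance analysis in the Output subsection below. A hypothetical reconstruction:

# hypothetical: keep columns whose importance clears a small threshold
importances = rf.feature_importances_        # from the model fitted above
indexes = np.where(importances > 0.001)[0]   # illustrative threshold, not from the source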
trans_X = trans_X[:,indexes]
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=11, test_size=0.3)
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=1600,
    min_samples_split=2,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=10,
    bootstrap=True)
rf.fit(X_train, y_train)
y_predicted = rf.predict(X_test)

root_mean_squared_log_error(y_test, y_predicted)  # helper defined in the tuning cells below
joblib.dump(rf, '../Models/RF_Model.h5')

Output

results = pd.DataFrame()
results['Predicted'] = y_predicted
results['Actual'] = y_test
results
columns = column_transformer.get_feature_names_out()
importances = rf.feature_importances_

features = pd.DataFrame()
features['Column'] = columns
features['Importance'] = importances
features.sort_values(by='Importance', ascending=False).to_csv('../Data/Feature_Importances.csv', index=False)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from keras import backend as K
def root_mean_squared_log_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(K.log(1 + y_pred) - K.log(1 + y_true))))
rf = RandomForestRegressor(random_state=42)
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)
rf_random = RandomizedSearchCV(
    estimator = rf, 
    param_distributions = random_grid, 
    n_iter = 100, 
    cv = 3, 
    verbose=2, 
    random_state=42, 
    n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_
 

Gradient Boost

 
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, QuantileTransformer

sys.path.append('../Scripts')
from Data_Processing import DataProcessing
from pprint import pprint

import warnings
warnings.filterwarnings('ignore')
df = DataProcessing('../Data/train.csv')
df = df.loc[df['Lap_Time'] != 0]

y = df['Lap_Time']
X = df.drop(columns=['Lap_Time', 'Lap_Improvement', 'S1_Improvement', 'S2_Improvement', 'S3_Improvement'])

obj_columns = list(X.select_dtypes(include=object).columns)

num_columns = list(X.select_dtypes(include='number').columns)

Scalers

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import joblib
column_transformer = ColumnTransformer(
    [('num', MinMaxScaler(), num_columns),
     ('obj', OneHotEncoder(), obj_columns)],
    remainder='passthrough')

trans_X = column_transformer.fit_transform(X)

joblib.dump(column_transformer, '../Models/Column_Transformer.pkl')

y = np.asarray(y)

Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=16, test_size=0.2)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_absolute_percentage_error
def root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log(1 + y_pred) - np.log(1 + y_true))))
gb = GradientBoostingRegressor(random_state=42)
# note: this scoring dict is defined but never passed to the search below
scoring = {'MSLE': make_scorer(mean_squared_log_error),
           'MAPE': make_scorer(mean_absolute_percentage_error)}
random_grid = {
    "loss":['squared_error', 'absolute_error', 'huber'],
    "learning_rate": [0.001, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
    "min_samples_split": np.linspace(1, 200, 10, dtype=int),
    "min_samples_leaf": np.linspace(0.1, 0.5, 12),
    "max_depth":[3,5,8,10,12],
    "max_features":["log2","sqrt"],
    "criterion": ["friedman_mse",  "absolute_error"],
    "subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
    "n_estimators":[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    }
gb_random = RandomizedSearchCV(
    estimator = gb, 
    param_distributions = random_grid, 
    n_iter = 100, 
    cv = 5, 
    verbose=2, 
    random_state=42, 
    n_jobs = -1)
gb_random.fit(X_train, y_train)
params = gb_random.best_params_

GB Best Params

gb = GradientBoostingRegressor(
    subsample=1.0,
    n_estimators=80,
    min_samples_split=23,
    min_samples_leaf=0.13636363636363638,
    max_features='sqrt',
    max_depth=8,
    loss='huber',
    learning_rate=0.1,
    criterion='absolute_error')
gb.fit(X_train, y_train)
results = pd.DataFrame()
# the target was never reciprocal-transformed in this notebook, so the
# (1 / x) - 1 inversion in the original was a leftover and is dropped here
results['Predicted'] = gb.predict(X_test)
results['Actual'] = y_test
results['Difference'] = abs(results['Predicted'] - results['Actual'])
from sklearn.metrics import mean_squared_error
mean_squared_error(results['Actual'], results['Predicted'], squared=False)
joblib.dump(gb, '../Models/Gradient_Boost_Model.h5')
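For the competition metric, the same hold-out predictions can be scored with the RMSLE helper defined above:

root_mean_squared_log_error(results['Actual'], results['Predicted'])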
 

Neural Network

 
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, QuantileTransformer
from sklearn.metrics import mean_squared_error

sys.path.append('../Scripts')
from Data_Processing import DataProcessing

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
df = DataProcessing('../Data/train.csv')

y = df['Lap_Time']
X = df.drop(columns=['Lap_Time'])

obj_columns = list(X.select_dtypes(include=object).columns)

obj_columns.append('Lap_Improvement')
obj_columns.append('Lap_Number')
obj_columns.append('S1_Improvement')
obj_columns.append('S2_Improvement')
obj_columns.append('S3_Improvement')


num_columns = list(X.select_dtypes(include='number').columns)
num_columns.remove('Lap_Number')
num_columns.remove('Lap_Improvement')
num_columns.remove('S1_Improvement')
num_columns.remove('S2_Improvement')
num_columns.remove('S3_Improvement')

Scalers

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, OrdinalEncoder
import joblib
column_transformer = ColumnTransformer(
    [('num', MinMaxScaler(), num_columns),
     ('obj', OneHotEncoder(drop='first'), obj_columns)],
    remainder='passthrough')

trans_X = column_transformer.fit_transform(X)

joblib.dump(column_transformer, '../Models/Column_Transformer_NN.pkl')

y = np.asarray(y).astype(float)
scaler = StandardScaler()
y = scaler.fit_transform(y.reshape(-1,1))

joblib.dump(scaler, '../Models/NN_Y_Scaler.pkl')
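Because the target is standardized here, every prediction from the network has to be mapped back to seconds with scaler.inverse_transform before scoring or building a submission, as done below.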

Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=42, test_size=0.1, shuffle=True)

Neural Network

from tensorflow import keras
from tensorflow.keras.losses import MeanSquaredLogarithmicError
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import load_model
from keras import backend as K
from datetime import datetime
from sklearn.metrics import mean_squared_log_error
def root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))
mc = ModelCheckpoint('../Models/NN_model_test.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True)

model = keras.Sequential([
    keras.layers.Dense(100, activation='relu', input_dim=89),
    # NB: LeakyReLU's argument is the negative-slope alpha, not a unit count,
    # so these three layers add activations (with very steep negative slopes)
    # rather than trainable weights
    keras.layers.LeakyReLU(500),
    keras.layers.LeakyReLU(800),
    keras.layers.LeakyReLU(200),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    # note: a relu output cannot produce negative values, although the
    # standardized target can be negative
    keras.layers.Dense(1, activation='relu')
])

opt = keras.optimizers.Adam(learning_rate=0.0001)

model.compile(optimizer=opt,
              loss='mean_squared_logarithmic_error',
              metrics=['mean_squared_logarithmic_error'])

history = model.fit(
    X_train,
    y_train,
    batch_size=100,
    epochs=5000,
    validation_data=(X_test, y_test),
    callbacks=[mc, early_stopping],
    shuffle=True,
    steps_per_epoch=3
)
results = pd.DataFrame()
# map the standardized predictions and targets back to seconds; the reciprocal
# transform from an earlier experiment was overwritten in the original notebook
# and is dropped here
y_pred = scaler.inverse_transform(model.predict(X_test))
y_actual = scaler.inverse_transform(y_test)

results['Predicted'] = y_pred.ravel()
results['Actual'] = y_actual.ravel()

results['Difference'] = abs(results['Predicted'] - results['Actual'])
mean_squared_error(results['Actual'], results['Predicted'], squared=False)
root_mean_squared_log_error(results['Actual'], results['Predicted'])
 

Ensemble Prediction

 
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

sys.path.append('../Scripts')
from Data_Processing import DataProcessing

from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
from datetime import datetime
from sklearn.preprocessing import PowerTransformer

import joblib

import warnings
warnings.filterwarnings('ignore')
# lowercase variables so the loaded transformers don't shadow the class name
column_transformer = joblib.load('../Models/Column_Transformer.pkl')
column_transformer_nn = joblib.load('../Models/Column_Transformer_NN.pkl')
df = DataProcessing('../Data/test.csv')
y = df['Lap_Time']
X = df.drop(columns=['Lap_Time'])

# The transformers were already fitted in the earlier notebooks, so the
# object/numeric column lists rebuilt at this point in the original notebook
# are not needed here.
# NOTE (restored line): the RF/GB feature matrix was never computed in the
# original notebook, although the predictions below depend on it.
trans_X = column_transformer.transform(X)        # features for the RF and GB models
trans_X_nn = column_transformer_nn.transform(X)  # features for the neural network
def root_mean_squared_log_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(K.log(1 + y_pred) - K.log(1 + y_true))))
#Neural Network
nn_model = load_model('../Models/NN_model_test.h5')

#Random Forest
rf_model = joblib.load('../Models/RF_Model.h5')

#Gradient Boost
gb_model = joblib.load('../Models/Gradient_Boost_Model.h5')
nn_y_scaler = joblib.load('../Models/NN_Y_Scaler.pkl')

# map the network output back to seconds; the reciprocal transform from an
# earlier experiment was overwritten in the original and is dropped here
y_predicted_nn = nn_y_scaler.inverse_transform(nn_model.predict(trans_X_nn)).ravel()

# NOTE (restored lines): the RF and GB predictions were referenced but never
# computed in the original notebook; these calls mirror the single-model notebooks
y_predicted_rf = rf_model.predict(trans_X)
y_predicted_gb = gb_model.predict(trans_X)

results = pd.DataFrame()
results['NN'] = y_predicted_nn
results['RF'] = y_predicted_rf
results['GB'] = y_predicted_gb
results['LAP_TIME'] = (results['NN'] + results['RF'] + results['GB']) / 3
submission = results[['LAP_TIME']]
results
today = datetime.today().strftime('%m-%d-%y %H-%M')
# timestamp the submission file (assumed: the stray space in the original
# filename was a lost {today} placeholder, since 'today' was otherwise unused)
submission.to_csv(f'../Submissions/Dare_In_Reality {today}.csv', index=False)