Dare in Reality Hackathon 2021: Predict Lap Timings for Qualifying Session
Problem Description (from Machine Hack):
In the heat of a Formula E race, teams need fast access to insights that can help drivers make split-second decisions and cross the finish line first. Can your data-science skills help Envision Racing, one of the founding teams in the championship, take home even more trophies?
To do so, you will have to build a machine learning model that predicts the Envision Racing drivers’ lap times for the all-important qualifying sessions that determine what position they start the race in. Winning races involves a combination of both a driver’s skills and data analytics. To help the team you’ll need to consider several factors that affect performance during a session, including weather, track conditions, and a driver’s familiarity with the track.
Genpact, a leading professional services firm focused on digital transformation, is collaborating with Envision Racing, a Formula E racing team, and MachineHack, a digital hackathon platform created by Analytics India Magazine, to launch ‘Dare in Reality’. This two-week hackathon allows data science professionals, machine learning engineers, artificial intelligence practitioners, and other tech enthusiasts to showcase their skills, impress the judges, and stand a chance to win exciting cash prizes.
Genpact (NYSE: G) is a global professional services firm that makes business transformation real, driving digital-led innovation and digitally enabled intelligent operations for our clients.
Evaluation Metric: Root Mean Squared Log Error (RMSLE)
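For reference, RMSLE is the square root of the mean squared difference between the logarithms of (1 + predicted) and (1 + actual) lap times. A minimal NumPy version (the same helper reappears in the modeling sections below):

def root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log(1 + y_pred) - np.log(1 + y_true))))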
Ranking: 46th out of 346. The first-place RMSLE was 0.46946; mine was 0.47991.
Models: Random Forest, Gradient Boost, Neural Network
Data Processing
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('../Data/train.csv')
df.columns = [x.title().strip() for x in df.columns]
df = df.dropna(subset=['S1'])
# Build a Trial_ID from the event, the location's trailing digit, and the car number
event_map = {
    'Free Practice 1': 'FP1',
    'Free Practice 2': 'FP2',
    'Free Practice 3': 'FP3',
    'Qualifying Group 1': 'QG1',
    'Qualifying Group 2': 'QG2',
    'Qualifying Group 3': 'QG3',
    'Qualifying Group 4': 'QG4',
}
for index, row in df.iterrows():
    number = str(row['Number'])
    location_number = row['Location'][-1:]
    event = event_map[row['Event']]
    df.at[index, 'Trial_ID'] = event + '-' + location_number + '-' + number
# Number each run within a Trial_ID: a drop in lap number marks the start of a new trial
for x in df['Trial_ID'].unique():
    temp = df.loc[df['Trial_ID'] == x]
    lap_number_previous = 1
    trial_identifier = 1
    for index, row in temp.iterrows():
        if row['Lap_Number'] >= lap_number_previous:
            df.at[index, 'Trial_Number'] = trial_identifier
            lap_number_previous += 1
        else:
            trial_identifier += 1
            df.at[index, 'Trial_Number'] = trial_identifier
            lap_number_previous = 1
df['Trial_ID_2'] = df['Trial_ID'] + '-' + df['Trial_Number'].astype(int).astype(str)
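Each Trial_ID_2 value therefore encodes the event, the location's trailing digit, the car number, and a running trial index; for example, 'QG1-3-25-2' (illustrative values, not taken from the data) would be the second run of car 25 in Qualifying Group 1 at a location ending in 3.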
def TimeConversion(x):
    """Parse a lap/sector time string ('M:SS.fff', 'SS.fff', or 'SS') into a timedelta."""
    x = str(x)
    if x == 'nan':
        y = datetime.strptime('0', '%S').time()
    else:
        try:
            y = datetime.strptime(x, '%M:%S.%f').time()
        except ValueError:
            try:
                y = datetime.strptime(x, '%S.%f').time()
            except ValueError:
                try:
                    y = datetime.strptime(x, '%S').time()
                except ValueError:
                    y = datetime.strptime('0', '%S').time()
    return timedelta(minutes=y.minute, seconds=y.second, microseconds=y.microsecond)
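A quick sanity check of the parser on illustrative inputs:

print(TimeConversion('1:23.456'))  # 0:01:23.456000
print(TimeConversion('45.2'))      # 0:00:45.200000
print(TimeConversion('nan'))       # 0:00:00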
time_cols = [
'S1',
'S2',
'S3',
'Elapsed',
'Hour',
'S1_Large',
'S2_Large',
'S3_Large',
'Pit_Time',
]
for x in time_cols:
    df[x] = df[x].apply(TimeConversion)
# Replace each lap's pit time with the trial's average pit time per lap
for x in df['Trial_ID_2'].unique():
    temp = df.loc[df['Trial_ID_2'] == x]
    laps = len(temp)
    pit_time = timedelta(0)
    for index, row in temp.iterrows():
        if not pd.isna(row['Pit_Time']):  # 'not', rather than '~', for Python booleans
            pit_time += row['Pit_Time']
    df.loc[df['Trial_ID_2'] == x, 'Pit_Time'] = pit_time / laps
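Averaging the trial's total pit time over its laps spreads a single long stop across the stint instead of letting it distort one lap's features.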
df = df.drop(columns=[
'S1_Large',
'S2_Large',
'S3_Large',
'Trial_Number',
'Trial_ID',
'Number',
'Driver_Number',
'Crossing_Finish_Line_In_Pit'
])
def ConvertToSeconds(x):
    return x.total_seconds()
df['S1'] = df['S1'].apply(ConvertToSeconds)
df['S2'] = df['S2'].apply(ConvertToSeconds)
df['S3'] = df['S3'].apply(ConvertToSeconds)
df['Pit_Time'] = df['Pit_Time'].apply(ConvertToSeconds)
df['Time_Minutes'] = [(x.total_seconds() / 60) for x in df['Hour']]
wdf_train = pd.read_csv('../Data/train_weather.csv')
wdf_test = pd.read_csv('../Data/test_weather.csv')
wdf_test = wdf_test.rename(columns={'EVENTS': 'EVENT'})
wdf_test['RAIN'] = wdf_test['RAIN'].astype(str)
wdf = pd.concat([wdf_train, wdf_test])
wdf['TIME_UTC_STR'] = pd.to_datetime(wdf['TIME_UTC_STR'], dayfirst=True)
wdf['TIME_UTC_MINUTE'] = [x.minute for x in wdf['TIME_UTC_STR']]
num_cols = ['AIR_TEMP', 'TRACK_TEMP', 'HUMIDITY', 'PRESSURE', 'WIND_SPEED', 'RAIN']
wdf['RAIN'] = [x.replace(',', '.') for x in wdf['RAIN']]
for col in num_cols:
    # quick numeric coercion for exploration; the DataProcessing function below
    # handles each location's number format properly
    wdf[col] = wdf[col].str.replace(',', '', regex=False).astype(float)
# Attach the weather reading taken in the same minute as the lap; fall back to
# event-level, then location-level, averages when there is no match
for index, row in df.iterrows():
    location = row['Location']
    event = row['Event']
    time = row['Time_Minutes']
    try:
        weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event) &
                          (wdf['TIME_UTC_MINUTE'] <= time) & (time < (wdf['TIME_UTC_MINUTE'] + 1))]
        weather = weather.iloc[0]
        df.at[index, 'Air_Temp'] = weather['AIR_TEMP']
        df.at[index, 'Track_Temp'] = weather['TRACK_TEMP']
        df.at[index, 'Humidity'] = weather['HUMIDITY']
        df.at[index, 'Pressure'] = weather['PRESSURE']
        df.at[index, 'Wind_Speed'] = weather['WIND_SPEED']
        df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION']
        df.at[index, 'Rain'] = weather['RAIN']
    except IndexError:
        weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event)]
        if weather.empty:
            weather = wdf.loc[wdf['LOCATION'] == location]
        df.at[index, 'Air_Temp'] = weather['AIR_TEMP'].mean()
        df.at[index, 'Track_Temp'] = weather['TRACK_TEMP'].mean()
        df.at[index, 'Humidity'] = weather['HUMIDITY'].mean()
        df.at[index, 'Pressure'] = weather['PRESSURE'].mean()
        df.at[index, 'Wind_Speed'] = weather['WIND_SPEED'].mean()
        df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION'].mean()
        df.at[index, 'Rain'] = weather['RAIN'].mode()[0]
df['Power'] = df['Power'].fillna(df['Power'].mode()[0])
df['Kph'] = df['Kph'].fillna(df['Kph'].mean())
df = df.drop(columns=['Group', 'Hour', 'Trial_ID_2', 'Time_Minutes'])
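The exploratory steps above are consolidated into a single reusable DataProcessing function below; the weather cleanup is also expanded there to handle the two different number formats found across locations.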
Function
def DataProcessing(csv_url):
    df = pd.read_csv(csv_url)
    df.columns = [x.title().strip() for x in df.columns]
    df = df.dropna(subset=['S1'])
    # Build a Trial_ID from the event, the location's trailing digit, and the car number
    event_map = {
        'Free Practice 1': 'FP1',
        'Free Practice 2': 'FP2',
        'Free Practice 3': 'FP3',
        'Qualifying Group 1': 'QG1',
        'Qualifying Group 2': 'QG2',
        'Qualifying Group 3': 'QG3',
        'Qualifying Group 4': 'QG4',
    }
    for index, row in df.iterrows():
        number = str(row['Number'])
        location_number = row['Location'][-1:]
        event = event_map[row['Event']]
        df.at[index, 'Trial_ID'] = event + '-' + location_number + '-' + number
    # Number each run within a Trial_ID: a drop in lap number marks a new trial
    for x in df['Trial_ID'].unique():
        temp = df.loc[df['Trial_ID'] == x]
        lap_number_previous = 1
        trial_identifier = 1
        for index, row in temp.iterrows():
            if row['Lap_Number'] >= lap_number_previous:
                df.at[index, 'Trial_Number'] = trial_identifier
                lap_number_previous += 1
            else:
                trial_identifier += 1
                df.at[index, 'Trial_Number'] = trial_identifier
                lap_number_previous = 1
    df['Trial_ID_2'] = df['Trial_ID'] + '-' + df['Trial_Number'].astype(int).astype(str)
    def TimeConversion(x):
        """Parse a time string ('M:SS.fff', 'SS.fff', or 'SS') into a timedelta."""
        x = str(x)
        if x == 'nan':
            y = datetime.strptime('0', '%S').time()
        else:
            try:
                y = datetime.strptime(x, '%M:%S.%f').time()
            except ValueError:
                try:
                    y = datetime.strptime(x, '%S.%f').time()
                except ValueError:
                    try:
                        y = datetime.strptime(x, '%S').time()
                    except ValueError:
                        y = datetime.strptime('0', '%S').time()
        return timedelta(minutes=y.minute, seconds=y.second, microseconds=y.microsecond)
    time_cols = [
        'S1',
        'S2',
        'S3',
        'Elapsed',
        'Hour',
        'S1_Large',
        'S2_Large',
        'S3_Large',
        'Pit_Time',
    ]
    for x in time_cols:
        df[x] = df[x].apply(TimeConversion)
    df = df.drop(columns=[
        'S1_Large',
        'S2_Large',
        'S3_Large',
        'Trial_Number',
        'Trial_ID',
        'Number',
        'Driver_Number',
        'Crossing_Finish_Line_In_Pit'
    ])
    def ConvertToSeconds(x):
        return x.total_seconds()
    df['S1'] = df['S1'].apply(ConvertToSeconds)
    df['S2'] = df['S2'].apply(ConvertToSeconds)
    df['S3'] = df['S3'].apply(ConvertToSeconds)
    df['Pit_Time'] = df['Pit_Time'].apply(ConvertToSeconds)
    df['Elapsed'] = df['Elapsed'].apply(ConvertToSeconds)
    df['Time_Minutes'] = [(x.total_seconds() / 60) for x in df['Hour']]
    wdf_train = pd.read_csv('../Data/train_weather.csv')
    wdf_test = pd.read_csv('../Data/test_weather.csv')
    wdf_test = wdf_test.rename(columns={'EVENTS': 'EVENT'})
    wdf_test['RAIN'] = wdf_test['RAIN'].astype(str)
    wdf = pd.concat([wdf_train, wdf_test])
    # Split by location: the two groups use different number formats
    train_weather_l1 = wdf[wdf['LOCATION'].isin(['Location 1', 'Location 2',
                                                 'Location 3', 'Location 4'])].copy()
    # Locations 1-4 use a comma as the decimal separator
    for col in ['AIR_TEMP', 'TRACK_TEMP', 'HUMIDITY', 'PRESSURE', 'WIND_SPEED', 'RAIN']:
        train_weather_l1[col] = pd.to_numeric(train_weather_l1[col].str.replace(',', '.', regex=False))
    train_weather_l2 = wdf[wdf['LOCATION'].isin(['Location 5', 'Location 6',
                                                 'Location 7', 'Location 8'])].copy()
    # Locations 5-8 lost their decimal separators entirely, so a reading such as
    # 2345 may really mean 23.45; rescale by a power of ten chosen from the
    # value's magnitude (see the illustration after the function)
    train_weather_l2['AIR_TEMP'] = pd.to_numeric(
        train_weather_l2['AIR_TEMP'].str.replace(',', '', regex=False), errors='coerce')
    conditions = [
        (train_weather_l2['AIR_TEMP'] > 100) & (train_weather_l2['AIR_TEMP'] < 1000),
        (train_weather_l2['AIR_TEMP'] > 1000) & (train_weather_l2['AIR_TEMP'] < 10000),
        (train_weather_l2['AIR_TEMP'] > 10000) & (train_weather_l2['AIR_TEMP'] < 100000),
        (train_weather_l2['AIR_TEMP'] > 100000)]
    choices = [train_weather_l2['AIR_TEMP'] / 10, train_weather_l2['AIR_TEMP'] / 100,
               train_weather_l2['AIR_TEMP'] / 1000, train_weather_l2['AIR_TEMP'] / 10000]
    train_weather_l2['AIR_TEMP'] = np.select(conditions, choices, default=20)
    train_weather_l2['TRACK_TEMP'] = pd.to_numeric(
        train_weather_l2['TRACK_TEMP'].str.replace(',', '.', regex=False), errors='coerce')
    train_weather_l2['HUMIDITY'] = pd.to_numeric(
        train_weather_l2['HUMIDITY'].str.replace(',', '.', regex=False), errors='coerce')
    train_weather_l2['PRESSURE'] = pd.to_numeric(
        train_weather_l2['PRESSURE'].str.replace(',', '', regex=False), errors='coerce')
    conditions = [
        (train_weather_l2['PRESSURE'] > 10000) & (train_weather_l2['PRESSURE'] < 20000),
        (train_weather_l2['PRESSURE'] > 20000) & (train_weather_l2['PRESSURE'] < 200000),
        (train_weather_l2['PRESSURE'] > 200000)]
    choices = [train_weather_l2['PRESSURE'] / 10,
               train_weather_l2['PRESSURE'] / 100,
               train_weather_l2['PRESSURE'] / 1000]
    train_weather_l2['PRESSURE'] = np.select(conditions, choices, default=1000)
    train_weather_l2['WIND_SPEED'] = pd.to_numeric(
        train_weather_l2['WIND_SPEED'].str.replace(',', '', regex=False), errors='coerce')
    conditions = [
        (train_weather_l2['WIND_SPEED'] > 10) & (train_weather_l2['WIND_SPEED'] < 100),
        (train_weather_l2['WIND_SPEED'] > 100) & (train_weather_l2['WIND_SPEED'] < 1000),
        (train_weather_l2['WIND_SPEED'] > 1000) & (train_weather_l2['WIND_SPEED'] < 10000),
        (train_weather_l2['WIND_SPEED'] > 10000) & (train_weather_l2['WIND_SPEED'] < 100000),
        (train_weather_l2['WIND_SPEED'] > 100000)]
    choices = [train_weather_l2['WIND_SPEED'] / 10,
               train_weather_l2['WIND_SPEED'] / 100,
               train_weather_l2['WIND_SPEED'] / 1000,
               train_weather_l2['WIND_SPEED'] / 10000,
               train_weather_l2['WIND_SPEED'] / 100000]
    train_weather_l2['WIND_SPEED'] = np.select(conditions, choices, default=1)
    train_weather_l2['RAIN'] = pd.to_numeric(
        train_weather_l2['RAIN'].str.replace(',', '.', regex=False))
    wdf = pd.concat([train_weather_l1, train_weather_l2])
    wdf['TIME_UTC_STR'] = pd.to_datetime(wdf['TIME_UTC_STR'], dayfirst=True)
    wdf['TIME_UTC_MINUTE'] = [x.minute for x in wdf['TIME_UTC_STR']]
    # Match each lap to the weather reading from the same minute; fall back to
    # event-level, then location-level, averages
    for index, row in df.iterrows():
        location = row['Location']
        event = row['Event']
        time = row['Time_Minutes']
        try:
            weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event) &
                              (wdf['TIME_UTC_MINUTE'] <= time) & (time < (wdf['TIME_UTC_MINUTE'] + 1))]
            weather = weather.iloc[0]
            df.at[index, 'Air_Temp'] = weather['AIR_TEMP']
            df.at[index, 'Track_Temp'] = weather['TRACK_TEMP']
            df.at[index, 'Humidity'] = weather['HUMIDITY']
            df.at[index, 'Pressure'] = weather['PRESSURE']
            df.at[index, 'Wind_Speed'] = weather['WIND_SPEED']
            df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION']
            df.at[index, 'Rain'] = weather['RAIN']
        except IndexError:
            weather = wdf.loc[(wdf['LOCATION'] == location) & (wdf['EVENT'] == event)]
            if weather.empty:
                weather = wdf.loc[wdf['LOCATION'] == location]
            df.at[index, 'Air_Temp'] = weather['AIR_TEMP'].mean()
            df.at[index, 'Track_Temp'] = weather['TRACK_TEMP'].mean()
            df.at[index, 'Humidity'] = weather['HUMIDITY'].mean()
            df.at[index, 'Pressure'] = weather['PRESSURE'].mean()
            df.at[index, 'Wind_Speed'] = weather['WIND_SPEED'].mean()
            df.at[index, 'Wind_Direction'] = weather['WIND_DIRECTION'].mean()
            df.at[index, 'Rain'] = weather['RAIN'].mode()[0]
    df['Power'] = df['Power'].fillna(df['Power'].mode()[0])
    df['Kph'] = df['Kph'].fillna(df['Kph'].mean())
    df['Pit_Time'] = df['Pit_Time'].fillna(0)
    if csv_url == '../Data/train.csv':
        df = df[df['Lap_Time'] > 65]  # drop implausibly short training laps
    df = df.drop(columns=['Group', 'Hour', 'Trial_ID_2', 'Time_Minutes'])
    # Subtract the pit time from the first sector long enough to contain it,
    # approximating where the stop actually happened
    for index, row in df.iterrows():
        pit_time = row['Pit_Time']
        s1 = row['S1']
        s2 = row['S2']
        s3 = row['S3']
        if pit_time != 0:
            if s1 > pit_time:
                df.at[index, 'S1'] = s1 - pit_time
            elif s2 > pit_time:
                df.at[index, 'S2'] = s2 - pit_time
            elif s3 > pit_time:
                df.at[index, 'S3'] = s3 - pit_time
    return df
df = DataProcessing('../Data/train.csv')
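One step inside DataProcessing worth unpacking is the magnitude-based rescaling of the Location 5-8 weather columns: those readings arrive with lost decimal separators (2345 may really mean 23.45), so np.select divides each value by a power of ten chosen from its magnitude. A minimal illustration with made-up readings:

vals = np.array([23.4, 234.0, 2340.0, 23400.0])  # hypothetical raw sensor values
conditions = [(vals > 100) & (vals < 1000),
              (vals > 1000) & (vals < 10000),
              (vals > 10000)]
choices = [vals / 10, vals / 100, vals / 1000]
print(np.select(conditions, choices, default=23.4))  # [23.4 23.4 23.4 23.4]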
Random Forest
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, QuantileTransformer
sys.path.append('../Scripts')
from Data_Processing import DataProcessing
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')
df = DataProcessing('../Data/train.csv')
df = df.loc[df['Lap_Time'] != 0]
y = df['Lap_Time']
X = df.drop(columns=['Lap_Time'])
obj_columns = list(X.select_dtypes(include=object).columns)
obj_columns.append('Lap_Number')
obj_columns.append('Lap_Improvement')
num_columns = list(X.select_dtypes(include='number').columns)
num_columns.remove('Lap_Number')
num_columns.remove('Lap_Improvement')
Scalers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import joblib
# lowercase name avoids shadowing the ColumnTransformer class imported above;
# this transformer is fitted and saved in the Gradient Boost section
column_transformer = joblib.load('../Models/Column_Transformer.pkl')
trans_X = column_transformer.transform(X)
y = np.asarray(y)
Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=11, test_size=0.2)
RF Model
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(
n_estimators=1600,
min_samples_split=2,
min_samples_leaf=4,
max_features='sqrt',
max_depth=10,
bootstrap=True)
rf.fit(X_train, y_train)
results = pd.DataFrame()
results['Predicted'] = rf.predict(X_test)
results['Actual'] = y_test
results['Difference'] = abs(results['Predicted'] - results['Actual'])
results['Difference'].mean()
from sklearn.metrics import mean_squared_error
mean_squared_error(results['Actual'], results['Predicted'], squared=False)
joblib.dump(rf, '../Models/RF_Model.h5')
New Features
from sklearn.model_selection import train_test_split
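The `indexes` variable used below is not defined anywhere in the notebook; it holds the column positions of the features kept after inspecting importances. A plausible reconstruction from the forest fitted above (the 0.001 cutoff is my assumption, not the author's):

# Hypothetical reconstruction: keep positions of features whose importance
# clears a small cutoff (threshold is assumed)
indexes = np.where(rf.feature_importances_ > 0.001)[0]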
trans_X = trans_X[:,indexes]
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=11, test_size=0.3)
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(
n_estimators=1600,
min_samples_split=2,
min_samples_leaf=4,
max_features='sqrt',
max_depth=10,
bootstrap=True)
rf.fit(X_train, y_train)
y_predicted = rf.predict(X_test)
root_mean_squared_log_error(y_test, y_predicted)  # helper defined in the Grid Search section below
joblib.dump(rf, '../Models/RF_Model.h5')
Output
results = pd.DataFrame()
results['Predicted'] = y_predicted
results['Actual'] = y_test
results
columns = column_transformer.get_feature_names_out()
importances = rf.feature_importances_  # lengths only line up when rf was fit on the full feature set
features = pd.DataFrame()
features['Column'] = columns
features['Importance'] = importances
features.sort_values(by='Importance', ascending=False).to_csv('../Data/Feature_Importances.csv', index=False)
Grid Search
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from keras import backend as K
def root_mean_squared_log_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(K.log(1 + y_pred) - K.log(1 + y_true))))
rf = RandomForestRegressor(random_state=42)
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
print(random_grid)
rf_random = RandomizedSearchCV(
estimator = rf,
param_distributions = random_grid,
n_iter = 100,
cv = 3,
verbose=2,
random_state=42,
n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)
rf_random.best_params_
Gradient Boost
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, QuantileTransformer
sys.path.append('../Scripts')
from Data_Processing import DataProcessing
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')
df = DataProcessing('../Data/train.csv')
df = df.loc[df['Lap_Time'] != 0]
y = df['Lap_Time']
X = df.drop(columns=['Lap_Time', 'Lap_Improvement', 'S1_Improvement', 'S2_Improvement', 'S3_Improvement'])
obj_columns = list(X.select_dtypes(include=object).columns)
num_columns = list(X.select_dtypes(include='number').columns)
Scalers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
import joblib
column_transformer = ColumnTransformer(
[('num', MinMaxScaler(), num_columns),
('obj', OneHotEncoder(), obj_columns)],
remainder='passthrough')
trans_X = column_transformer.fit_transform(X)
joblib.dump(column_transformer, '../Models/Column_Transformer.pkl')
y = np.asarray(y)
Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=16, test_size=0.2)
Gradient Boost Grid Search
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_absolute_percentage_error
def root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log(1 + y_pred) - np.log(1 + y_true))))
gb = GradientBoostingRegressor(random_state=42)
scoring = {'MSLE': make_scorer(mean_squared_log_error),
'MAPE': make_scorer(mean_absolute_percentage_error)}
random_grid = {
"loss":['squared_error', 'absolute_error', 'huber'],
"learning_rate": [0.001, 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
"min_samples_split": np.linspace(1, 200, 10, dtype=int),
"min_samples_leaf": np.linspace(0.1, 0.5, 12),
"max_depth":[3,5,8,10,12],
"max_features":["log2","sqrt"],
"criterion": ["friedman_mse", "absolute_error"],
"subsample":[0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
"n_estimators":[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
}
gb_random = RandomizedSearchCV(
estimator = gb,
param_distributions = random_grid,
n_iter = 100,
cv = 5,
verbose=2,
random_state=42,
n_jobs = -1)
gb_random.fit(X_train, y_train)
params = gb_random.best_params_
GB Best Params
gb = GradientBoostingRegressor(
subsample=1.0,
n_estimators=80,
min_samples_split=23,
min_samples_leaf=0.13636363636363638,
max_features='sqrt',
max_depth=8,
loss='huber',
learning_rate=0.1,
criterion='absolute_error')
gb.fit(X_train, y_train)
results = pd.DataFrame()
# y was never transformed in this section, so the raw predictions are already in seconds
results['Predicted'] = gb.predict(X_test)
results['Actual'] = y_test
results['Difference'] = abs(results['Predicted'] - results['Actual'])
from sklearn.metrics import mean_squared_error
mean_squared_error(results['Actual'], results['Predicted'], squared=False)
joblib.dump(gb, '../Models/Gradient_Boost_Model.h5')
Neural Network
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, FunctionTransformer, QuantileTransformer
from sklearn.metrics import mean_squared_error
sys.path.append('../Scripts')
from Data_Processing import DataProcessing
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
df = DataProcessing('../Data/train.csv')
y = df['Lap_Time']
X = df.drop(columns=['Lap_Time'])
obj_columns = list(X.select_dtypes(include=object).columns)
obj_columns.append('Lap_Improvement')
obj_columns.append('Lap_Number')
obj_columns.append('S1_Improvement')
obj_columns.append('S2_Improvement')
obj_columns.append('S3_Improvement')
num_columns = list(X.select_dtypes(include='number').columns)
num_columns.remove('Lap_Number')
num_columns.remove('Lap_Improvement')
num_columns.remove('S1_Improvement')
num_columns.remove('S2_Improvement')
num_columns.remove('S3_Improvement')
Scalers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, OrdinalEncoder
import joblib
column_transformer = ColumnTransformer(
[('num', MinMaxScaler(), num_columns),
('obj', OneHotEncoder(drop='first'), obj_columns)],
remainder='passthrough')
trans_X = column_transformer.fit_transform(X)
joblib.dump(column_transformer, '../Models/Column_Transformer_NN.pkl')
y = np.asarray(y).astype(float)
scaler = StandardScaler()
y = scaler.fit_transform(y.reshape(-1,1))
joblib.dump(scaler, '../Models/NN_Y_Scaler.pkl')
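Standardizing the target generally makes the regression easier for the network to fit; every prediction must then be mapped back through scaler.inverse_transform, as done in the evaluation below.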
Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(trans_X, y, random_state=42, test_size=0.1, shuffle=True)
Neural Network
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
from keras.callbacks import EarlyStopping
from datetime import datetime
from tensorflow.keras.losses import MeanSquaredLogarithmicError
from sklearn.metrics import mean_squared_log_error
def root_mean_squared_log_error(y_true, y_pred):
    return np.sqrt(mean_squared_log_error(y_true, y_pred))
mc = ModelCheckpoint('../Models/NN_model_test.h5', monitor='val_loss', mode='min', verbose=1, save_best_only=True)
early_stopping = EarlyStopping(
monitor='val_loss',
patience=10,
verbose=0,
mode="auto",
baseline=None,
restore_best_weights=True)
model = keras.Sequential([
    keras.layers.Dense(100, activation='relu', input_dim=89),
    # The original stacked LeakyReLU(500/800/200) directly; that argument sets the
    # alpha slope rather than a layer width, so the likely intent was Dense layers
    # of those sizes, each followed by a LeakyReLU activation:
    keras.layers.Dense(500),
    keras.layers.LeakyReLU(),
    keras.layers.Dense(800),
    keras.layers.LeakyReLU(),
    keras.layers.Dense(200),
    keras.layers.LeakyReLU(),
    keras.layers.Dense(200, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(1)  # linear output: the standardized target can be negative
])
opt = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=opt,
loss='mean_squared_logarithmic_error',
metrics=['mean_squared_logarithmic_error'])
history = model.fit(
X_train,
y_train,
batch_size=100,
epochs=5000,
validation_data=(X_test, y_test),
callbacks=[mc, early_stopping],
shuffle=True,
steps_per_epoch=3
)
results = pd.DataFrame()
y_pred = scaler.inverse_transform(model.predict(X_test))
y_actual = scaler.inverse_transform(y_test)
results['Predicted'] = y_pred.ravel()
results['Actual'] = y_actual.ravel()
results['Difference'] = abs(results['Predicted'] - results['Actual'])
mean_squared_error(results['Actual'], results['Predicted'], squared=False)
root_mean_squared_log_error(results['Actual'], results['Predicted'])
Ensemble Prediction
import sys
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
sys.path.append('../Scripts')
from Data_Processing import DataProcessing
from tensorflow import keras
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras import backend as K
from datetime import datetime
from sklearn.preprocessing import PowerTransformer
import joblib
import warnings
warnings.filterwarnings('ignore')
column_transformer = joblib.load('../Models/Column_Transformer.pkl')
column_transformer_nn = joblib.load('../Models/Column_Transformer_NN.pkl')
df = DataProcessing('../Data/test.csv')
y = df['Lap_Time']
X = df.drop(columns=['Lap_Time'])
obj_columns = list(X.select_dtypes(include=object).columns)
obj_columns.append('Lap_Number')
obj_columns.append('Lap_Improvement')
num_columns = list(X.select_dtypes(include='number').columns)
num_columns.remove('Lap_Number')
num_columns.remove('Lap_Improvement')
trans_X = column_transformer.transform(X)  # features for the Random Forest and Gradient Boost models
#NN Only
obj_columns = list(X.select_dtypes(include=object).columns)
obj_columns.append('Lap_Improvement')
obj_columns.append('Lap_Number')
obj_columns.append('S1_Improvement')
obj_columns.append('S2_Improvement')
obj_columns.append('S3_Improvement')
num_columns = list(X.select_dtypes(include='number').columns)
num_columns.remove('Lap_Number')
num_columns.remove('Lap_Improvement')
num_columns.remove('S1_Improvement')
num_columns.remove('S2_Improvement')
num_columns.remove('S3_Improvement')
trans_X_nn = column_transformer_nn.transform(X)
def root_mean_squared_log_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(K.log(1 + y_pred) - K.log(1 + y_true))))
#Neural Network
nn_model = load_model('../Models/NN_model_test.h5')
#Random Forest
rf_model = joblib.load('../Models/RF_Model.h5')
#Gradient Boost
gb_model = joblib.load('../Models/Gradient_Boost_Model.h5')
nn_y_scaler = joblib.load('../Models/NN_Y_Scaler.pkl')
y_predicted_nn = nn_y_scaler.inverse_transform(nn_model.predict(trans_X_nn)).ravel()
y_predicted_rf = rf_model.predict(trans_X)
y_predicted_gb = gb_model.predict(trans_X)
results = pd.DataFrame()
results['NN'] = y_predicted_nn
results['RF'] = y_predicted_rf
results['GB'] = y_predicted_gb
results['LAP_TIME'] = (results['NN'] + results['RF'] + results['GB']) / 3
submission = results[['LAP_TIME']]
results
today = datetime.today().strftime('%m-%d-%y %H-%M')
submission.to_csv(f'../Submissions/Dare_In_Reality {today}.csv', index=False)