# load libraries
import yaml
import lightgbm
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
import plotly.express as px
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
# Read the (YAML) model configuration. safe_load is used, so the file cannot
# instantiate arbitrary Python objects.
with open('/home/desktop0/itu/conf/model_config_1.yaml', 'r') as conf:
    model_config = yaml.safe_load(conf)

# Import data: the CSV location is the configured directory prefix
# concatenated with the configured file name.
dataset_path = model_config['model']['loc'] + model_config['model']['file']
dataset = pd.read_csv(dataset_path)

# Subset for faster trial and error (uncomment while experimenting).
#dataset = dataset.iloc[0:1000,:]

# Column names of the predictors and of the target, taken from the config.
predictor = model_config['meta']['predictors']
target = model_config['meta']['target']

# Prepare data. The predictor list is bound to `predictor`; the previous
# `dataset[pred]` referenced a name that does not exist yet at this point.
X = dataset[predictor]
y = dataset[target]
print('X Shape:', X.shape)
print('y Shape:', y.shape)

# Train/test split with a fixed seed so the hold-out set is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=model_config['parameter']['test_size'],
    random_state=42,
)
print('X_train, X_test, y_train, y_test shapes:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("size of training dataset = ", len(X_train))
print("size of test dataset = ", len(X_test))
from sklearn.metrics import make_scorer
# Customized evaluation function for CV.
# Focuses on the samples whose *prediction* is below a threshold (0.8 by
# default); all other samples are ignored.
def custom_eval_metric(y_true, y_pred, threshold=0.8):
    """Return the mean absolute error over samples predicted below ``threshold``.

    Args:
        y_true: Ground-truth targets. May be a DataFrame (its first column is
            used), a Series, or any 1-D array-like.
        y_pred: Predicted values, aligned positionally with ``y_true``.
        threshold: Only samples with ``y_pred < threshold`` contribute to the
            error. Defaults to 0.8.

    Returns:
        The mean absolute error of the selected samples as a float, or NaN
        when no prediction falls below ``threshold``.
    """
    predicted = np.asarray(y_pred).ravel()
    if isinstance(y_true, pd.DataFrame):
        # Single-target frames: use the first column.
        y_true = y_true.iloc[:, 0]
    actual = np.asarray(y_true).ravel()

    selected = predicted < threshold
    if not selected.any():
        # Nothing to score; return NaN explicitly instead of letting np.mean
        # emit a "mean of empty slice" RuntimeWarning.
        return float('nan')

    errors_low = np.abs(actual[selected] - predicted[selected])
    return float(np.mean(errors_low))
# Wrap the custom metric for scikit-learn model selection. The metric is an
# error (lower is better), hence greater_is_better=False.
custom_scorer = make_scorer(custom_eval_metric, greater_is_better=False)

# Cross-validation splitter configured from the YAML file.
# NOTE(review): inner_cv is not passed to the search below, which uses a
# fixed cv = 2 - confirm which of the two is intended.
inner_cv = KFold(n_splits=model_config['parameter']['inner_cv'], shuffle=True)

# Define the hyper-parameter search space (candidate values come from the config).
lgbm_config = model_config['parameter']['lightgbm']
parameters = {
    "boosting_type": lgbm_config['boosting_type'],
    "max_depth": lgbm_config['max_depth'],
    "learning_rate": lgbm_config['learning_rate'],
    "n_estimators": lgbm_config['n_estimators'],
}

# Define model class to use.
model = lightgbm.LGBMRegressor(random_state=42)

# Define the randomized search: 3 sampled candidates, 2-fold CV, scored with
# the custom metric.
search = RandomizedSearchCV(
    model,
    parameters,
    cv=2,
    random_state=42,
    verbose=2,
    n_iter=3,
    scoring=custom_scorer,
)

# Find best parameters.
search.fit(X_train, y_train)

# All results.
print(search.cv_results_)

# Best results.
best_parameter = search.best_params_
print(best_parameter)

# Final model, configured with the best parameters found by the search.
model = lightgbm.LGBMRegressor(
    random_state=42,
    boosting_type=best_parameter['boosting_type'],
    n_estimators=best_parameter['n_estimators'],
    max_depth=best_parameter['max_depth'],
    learning_rate=best_parameter['learning_rate'],
)

# Fit the final model on the full training set.
model.fit(X_train, y_train)

# Predict the hold-out set with the final model fitted above, so the reported
# numbers describe the same estimator that is inspected later on.
pred = model.predict(X_test)

# Mean absolute error as KPI.
errors = abs(pred - y_test.iloc[:, 0].to_numpy())
avg_error = np.mean(errors)
print('MAE:', np.round(avg_error, 3))

# Custom metric on the hold-out set. This evaluation previously ran before
# `search` was defined and fitted, which raised a NameError.
low_pred_error = custom_eval_metric(y_test, pred)
print('Custom metric (hold-out):', np.round(low_pred_error, 3))
# Scatter plot of hold-out ground truth (x) against predictions (y).
y = y_test.iloc[:, 0].to_numpy()
y_pred = pred

# 1 where the absolute error is below 0.15, 0 otherwise; drives the marker
# colour through the two-step colour scale.
hit_flags = (np.abs(y - y_pred) < 0.15).astype('int')
marker_style = dict(
    size=3,
    color=hit_flags,
    colorscale=[[0, '#FAED27'], [1, '#98FB98']],
)

fig = px.scatter(
    x=y,
    y=y_pred,
    labels={'x': 'ground truth', 'y': 'prediction'},
    title='Comparison between predictions and reality',
    template='plotly_dark',
)
fig.update_traces(marker=marker_style)

# Dashed diagonal spanning the range of the ground truth: points on this line
# are perfect predictions.
lo, hi = y.min(), y.max()
fig.add_shape(type="line", line=dict(dash='dash'), x0=lo, y0=lo, x1=hi, y1=hi)
fig.show()
# Collect predictions, ground truth and residuals (prediction minus truth)
# for the hold-out set in one frame.
res_df = pd.DataFrame({
    'prediction': y_pred,
    'ground truth': y,
    'residual': pred - y_test.iloc[:, 0].to_numpy(),
})

# Residuals against the ground truth, with an OLS trend line.
fig = px.scatter(
    res_df,
    x='ground truth',
    y='residual',
    trendline='ols',
    template='plotly_dark',
    title='Comparison between residuals and reality',
)

# Colour flag: 1 where the absolute residual is below 0.15, 0 otherwise.
small_residual = (res_df.residual.abs() < 0.15).astype('int')
fig.update_traces(
    marker=dict(
        size=3,
        color=small_residual,
        colorscale=[[0, '#FAED27'], [1, '#98FB98']],
    )
)
fig.show()
# Residuals against the predictions, with an OLS trend line.
scatter_options = dict(
    x='prediction',
    y='residual',
    trendline='ols',
    template='plotly_dark',
    title='Comparison between residuals and predictions',
)
fig = px.scatter(res_df, **scatter_options)

# Colour flag: 1 where the absolute residual is below 0.15, 0 otherwise.
residual_flags = (res_df.residual.abs() < 0.15).astype('int')
two_tone_scale = [[0, '#FAED27'], [1, '#98FB98']]
fig.update_traces(marker=dict(size=3, color=residual_flags, colorscale=two_tone_scale))
fig.show()
# Density curves (no histogram) of the predictions and of the hold-out
# ground truth, drawn in one figure.
labels = ['predictions', 'reality']
online_pop = [pred, y_test.iloc[:, 0].to_numpy()]
fig = ff.create_distplot(online_pop, labels, show_hist=False)

# Dark colour scheme for the figure layout.
panel_colour = '#545454'
paper_colour = '#2a2a2a'
dark_layout = {
    'title': 'Comparison of distributions of reality and predictions',
    'title_font_color': 'white',
    'legend_bgcolor': panel_colour,
    'font_color': 'white',
    'plot_bgcolor': panel_colour,
    'paper_bgcolor': paper_colour,
    'yaxis': {'gridcolor': paper_colour, 'zerolinecolor': paper_colour},
    'xaxis': {'gridcolor': paper_colour},
}
fig.layout.update(dark_layout)
fig.show()

# Feature importance of the final fitted model.
lightgbm.plot_importance(model)