# load libraries
import yaml
import xgboost as xgb
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
import plotly.express as px
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
import os
#You will need to reset this to your own working directory
os.chdir('/home/desktop3/itu')
path = os.getcwd()
print(path)
# read in (yaml) configs
with open(path + '/conf/model_config.yaml', 'r') as conf:
    model_config = yaml.safe_load(conf)
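# For reference, the code below expects roughly the following config structure
# (an illustrative sketch only -- the real keys and values live in conf/model_config.yaml):
#
# model:
#   loc: <data directory>
#   file: <csv file name>
# meta:
#   predictors: [<predictor column names>]
#   target: [<target column name>]
# parameter:
#   test_size: 0.2
#   threshold: 0.3
#   iterations: 10
#   XGBoost:
#     n_estimators: [100, 200, 500]
#     max_depth: [3, 5, 7]
#     eta: [0.01, 0.1, 0.3]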
# import data
dataset = model_config['model']['loc'] + model_config['model']['file']
dataset = pd.read_csv(dataset)
# subset for faster trial and error
#dataset = dataset.iloc[0:1000,:]
# define predictors and target
predictor = model_config['meta']['predictors']
target = model_config['meta']['target']
dataset.columns
# prepare data
X = dataset[predictor]
y = dataset[target]
print('X Shape:', X.shape)
print('y Shape:', y.shape)
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = model_config['parameter']['test_size'],
random_state = 42)
print('X_train, X_test, y_train, y_test shapes:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("size of training dataset = ", len(X_train))
print("size of test dataset = ", len(X_test))
from sklearn.metrics import make_scorer
# customized evaluation function for CV:
# focuses on predictions below the configured threshold (parameter.threshold in the config)
def custom_eval_metric(y_true, y_pred):
    # mean absolute error restricted to predictions below the threshold
    mask = y_pred < model_config['parameter']['threshold']
    errors_low = abs(y_pred[mask] - np.asarray(y_true[mask]).flatten())
    return np.mean(errors_low)
custom_scorer = make_scorer(custom_eval_metric, greater_is_better = False)
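# Toy sanity check of the metric (illustrative values, assuming threshold = 0.3):
# only predictions below the threshold contribute to the mean error.
#_demo_pred = np.array([0.1, 0.5, 0.2])
#_demo_true = pd.Series([0.2, 0.4, 0.2])
#custom_eval_metric(_demo_true, _demo_pred)   # -> mean(|0.1-0.2|, |0.2-0.2|) = 0.05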
# configure cv
cv_inner = KFold(n_splits = 3, shuffle=True, random_state=123)
# define parameter space
parameters = {"n_estimators": model_config['parameter']['XGBoost']['n_estimators'],
"max_depth": model_config['parameter']['XGBoost']['max_depth'],
"eta": model_config['parameter']['XGBoost']['eta'] }
# define the model
model = xgb.XGBRegressor(random_state = 1234)
# define grid search (exhaustive alternative, kept for reference)
#search = GridSearchCV(model, parameters, scoring = custom_scorer, cv = cv_inner, refit = True, verbose = 2)
# define randomized search over the parameter space
search = RandomizedSearchCV(model, parameters, scoring = custom_scorer, cv = cv_inner, refit = True, verbose = 2, n_iter=model_config['parameter']['iterations'])
# execute search
result = search.fit(X_train, y_train)
# get best performing model
best_model = result.best_estimator_
# report best CV score and parameters (score is negated because greater_is_better = False)
print('est=%.3f, cfg=%s' % (result.best_score_, result.best_params_))
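# Optional: the full search history is exposed via sklearn's standard cv_results_ attribute;
# left commented out here, shown only as a pointer.
#cv_results = pd.DataFrame(result.cv_results_)
#print(cv_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']])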
# choose best parameter from tuning
best_parameter = search.best_params_
print("#####################################################")
print("choose best parameter from tuning: ", best_parameter)
print("#####################################################")
model = xgb.XGBRegressor(random_state = 42, n_estimators = best_parameter['n_estimators'],
max_depth = best_parameter['max_depth'],
eta = best_parameter['eta'])
# refit the model with the best parameters on the full training set
model.fit(X_train, y_train)
# predict holdout
pred = model.predict(X_test)
# mean absolute error as KPI
errors = abs(pred - y_test.iloc[:,0].to_numpy())
avg_error = np.mean(errors)
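# The hand-rolled MAE above should match sklearn's implementation; optional cross-check:
#from sklearn.metrics import mean_absolute_error
#print(mean_absolute_error(y_test.iloc[:,0].to_numpy(), pred))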
# Low-tail error: cases where the *prediction* is below the threshold
mask_pred = pred < model_config['parameter']['threshold']
errors_low = abs(pred[mask_pred] - np.asarray(y_test[mask_pred]).flatten())
# Low-tail error: cases where the *ground truth* is below the threshold
mask_true = np.asarray(y_test).flatten() < model_config['parameter']['threshold']
errors_low_ytest = abs(pred[mask_true] - np.asarray(y_test[mask_true]).flatten())
# average low-tail errors
avg_error_low = np.mean(errors_low)
avg_error_low_ytest = np.mean(errors_low_ytest)
# standard deviation of the prediction-based low-tail error
stan_dev_low = np.std(errors_low)
print('avg error: ', avg_error)
print('Mean lower error: ', avg_error_low)
print('Mean ytest lower error: ', avg_error_low_ytest)
print('Standard Dev of Low Error: ', stan_dev_low)
y = y_test.iloc[:,0].to_numpy()
y_pred = pred
fig = px.scatter(x=y, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'},
title = 'Comparison between predictions and reality',
template = 'plotly_dark')
fig.update_traces(marker=dict(size=3,
color=((abs(y-y_pred) < 0.15).astype('int')),
colorscale=[[0, '#FAED27'],[1, '#98FB98']])
)
fig.add_shape(
type="line", line=dict(dash='dash'),
x0=y.min(), y0=y.min(),
x1=y.max(), y1=y.max()
)
fig.show()
y_pred
importance = pd.DataFrame(
{"Feature": X.columns, "Importance": model.feature_importances_}
).sort_values("Importance")
importance
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, figsize=(14, 6))
# add a title
ax.set_title('Feature Importances', fontdict={'fontsize': 13, 'fontweight': 3})
# horizontal bars of the 12 smallest feature importances (all features when there are 12 or fewer);
# pandas draws on the current axes, so the title above is used
(pd.Series(model.feature_importances_, index=X.columns)
 .nsmallest(12).plot(kind='barh'))
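# Alternative: xgboost ships its own importance plot (ranked by 'weight', i.e. split counts,
# by default, which may order features differently than feature_importances_ above):
#xgb.plot_importance(model, max_num_features=12)
#plt.show()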