Predicting the worlds offline population

First Model Training and Evaluation

(LightGBM on Brazil)

In [4]:
# load libraries
import yaml
import lightgbm
import pandas as pd
import numpy as np
import plotly.figure_factory as ff
import plotly.express as px
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV

Load and prepare data

In [9]:
# read in (yaml) configs
with open('/home/desktop0/itu/conf/model_config_1.yaml', 'r') as conf:
    model_config = yaml.safe_load(conf)

# import data
dataset = model_config['model']['loc'] + model_config['model']['file']
dataset = pd.read_csv(dataset)
# subset for faster trial and error
#dataset = dataset.iloc[0:1000,:]

# define predictors and target
predictor   =  model_config['meta']['predictors']
target = model_config['meta']['target']
In [10]:
# prepare data
X = dataset[pred]
y = dataset[target]
print('X Shape:', X.shape)
print('y Shape:', y.shape)
   
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = model_config['parameter']['test_size'], 
                                                    random_state = 42)

print('X_train, X_test, y_train, y_test shapes:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("size of training dataset = ", len(X_train))
print("size of test dataset = ", len(X_test))
X Shape: (11732, 6)
y Shape: (11732, 1)
X_train, X_test, y_train, y_test shapes: (8212, 6) (3520, 6) (8212, 1) (3520, 1)
size of training dataset =  8212
size of test dataset =  3520

Training

Prepare model tuning

In [73]:
from sklearn.metrics import make_scorer

# customized evaluation function for CV
# Focusing on the predictions that are below .3
def custom_eval_metric(y_true, y_pred):
    errors_low = abs(y_true.iloc[y_pred<0.8,0].to_numpy() - y_pred[y_pred<0.8])
    return np.mean(errors_low)

custom_scorer = make_scorer(custom_eval_metric, greater_is_better = False)
In [64]:
pred = search.predict(X_test)
#pred[pred<0.3]
#pred
custom_eval_metric(y_test, pred)
Out[64]:
0.1550486214470504
In [74]:
# create inner and outer cross-validation sets
inner_cv = KFold(n_splits = model_config['parameter']['inner_cv'], shuffle=True)

# define parameter grid
parameters = {"boosting_type": model_config['parameter']['lightgbm']['boosting_type'],"max_depth": model_config['parameter']['lightgbm']['max_depth'],
              "learning_rate": model_config['parameter']['lightgbm']['learning_rate']],
              "n_estimators": model_config['parameter']['lightgbm']['n_estimators']}

# define model class to use
model = lightgbm.LGBMRegressor(random_state = 42)

# define grid search
search = RandomizedSearchCV(model, 
                            parameters, 
                            cv = 2, 
                            random_state = 42, 
                            verbose = 2, 
                            n_iter = 3,
                            scoring = custom_scorer)

Model tuning

In [75]:
# find best parameters
search.fit(X_train, y_train)
Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END boosting_type=gbdt, learning_rate=0.01, max_depth=5, n_estimators=100; total time= 1.3min
[CV] END boosting_type=gbdt, learning_rate=0.01, max_depth=5, n_estimators=100; total time= 1.2min
[CV] END boosting_type=gbdt, learning_rate=0.05, max_depth=5, n_estimators=100; total time= 1.0min
[CV] END boosting_type=gbdt, learning_rate=0.05, max_depth=5, n_estimators=100; total time= 1.0min
[CV] END boosting_type=gbdt, learning_rate=0.01, max_depth=5, n_estimators=50; total time=  38.7s
[CV] END boosting_type=gbdt, learning_rate=0.01, max_depth=5, n_estimators=50; total time=  34.0s
Out[75]:
RandomizedSearchCV(cv=2, estimator=LGBMRegressor(random_state=42), n_iter=3,
                   param_distributions={'boosting_type': ['gbdt'],
                                        'learning_rate': [0.01, 0.05],
                                        'max_depth': [5, 10],
                                        'n_estimators': [50, 100]},
                   random_state=42,
                   scoring=make_scorer(custom_eval_metric, greater_is_better=False),
                   verbose=2)

Tuning results

In [76]:
# all results
print(search.cv_results_)
# best results
best_parameter = search.best_params_
print(best_parameter)
{'mean_fit_time': array([72.98091173, 60.42100775, 36.33998334]), 'std_fit_time': array([2.89863825, 0.13085091, 2.31868875]), 'mean_score_time': array([0.02046323, 0.02019429, 0.02389824]), 'std_score_time': array([0.00083613, 0.00050139, 0.00194323]), 'param_n_estimators': masked_array(data=[100, 100, 50],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[5, 5, 5],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'param_learning_rate': masked_array(data=[0.01, 0.05, 0.01],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'param_boosting_type': masked_array(data=['gbdt', 'gbdt', 'gbdt'],
             mask=[False, False, False],
       fill_value='?',
            dtype=object), 'params': [{'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.01, 'boosting_type': 'gbdt'}, {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.05, 'boosting_type': 'gbdt'}, {'n_estimators': 50, 'max_depth': 5, 'learning_rate': 0.01, 'boosting_type': 'gbdt'}], 'split0_test_score': array([-0.14731592, -0.13970645, -0.15046979]), 'split1_test_score': array([-0.14903991, -0.1411775 , -0.15272928]), 'mean_test_score': array([-0.14817792, -0.14044197, -0.15159953]), 'std_test_score': array([0.000862  , 0.00073553, 0.00112975]), 'rank_test_score': array([2, 1, 3], dtype=int32)}
{'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.05, 'boosting_type': 'gbdt'}

Fit model with best parameters

In [9]:
# define model class to use
model = lightgbm.LGBMRegressor(random_state = 42, boosting_type = best_parameter['boosting_type'],
                              n_estimators = best_parameter['n_estimators'],
                              max_depth = best_parameter['max_depth'],
                              learning_rate = best_parameter['learning_rate'])

# find best parameters
model.fit(X_train, y_train)
Out[9]:
LGBMRegressor(max_depth=20, n_estimators=500, random_state=42)

Evaluation

In [10]:
# predict holdout
pred = search.predict(X_test)

# mean absolute error as KPI
errors = abs(pred - y_test.iloc[:,0].to_numpy())
avg_error = np.mean(errors)
print('MAE:', np.round(avg_error, 3))
MAE: 0.1
In [10]:
y = y_test.iloc[:,0].to_numpy()
y_pred = pred

fig = px.scatter(x=y, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'}, 
                 title = 'Comparison between predictions and reality',
                 template = 'plotly_dark')
fig.update_traces(marker=dict(size=3, 
                              color=((abs(y-y_pred) < 0.15).astype('int')),
                              colorscale=[[0, '#FAED27'],[1, '#98FB98']])
                             )
fig.add_shape(
    type="line", line=dict(dash='dash'),
    x0=y.min(), y0=y.min(),
    x1=y.max(), y1=y.max()
)
fig.show()
In [11]:
res_df = pd.DataFrame()
res_df['prediction'] = y_pred
res_df['ground truth'] = y
#res_df['train'] = y_train
res_df['residual'] = (pred - y_test.iloc[:,0].to_numpy())
fig = px.scatter(
    res_df, x='ground truth', y='residual',
    #marginal_y='violin',
    trendline='ols', template = 'plotly_dark',
    title = 'Comparison between residuals and reality'
)
fig.update_traces(marker=dict(size=3, 
                              color=((abs(res_df.residual) < 0.15).astype('int')),
                              colorscale=[[0, '#FAED27'],[1, '#98FB98']])
                             )
fig.show()
In [14]:
fig = px.scatter(
    res_df, x='prediction', y='residual',
    #marginal_y='violin',
    trendline='ols', template = 'plotly_dark',
    title = 'Comparison between residuals and predictions'
)
fig.update_traces(marker=dict(size=3, 
                              color=((abs(res_df.residual) < 0.15).astype('int')),
                              colorscale=[[0, '#FAED27'],[1, '#98FB98']])
                             )
fig.show()
In [12]:
online_pop = [pred, y_test.iloc[:,0].to_numpy()]
labels = ['predictions', 'reality']
         
fig = ff.create_distplot(online_pop, labels, show_hist = False)
fig.layout.update({'title':'Comparison of distributions of reality and predictions',
                   'title_font_color':'white',
                   'legend_bgcolor':'#545454',
                   'font_color':'white',
                   'plot_bgcolor':'#545454',
                   'paper_bgcolor':'#2a2a2a',
                   'yaxis':{'gridcolor':'#2a2a2a', 'zerolinecolor':'#2a2a2a'},
                   'xaxis':{'gridcolor':'#2a2a2a'}
                   })
fig.show()
In [13]:
lightgbm.plot_importance(model)
Out[13]:
<AxesSubplot:title={'center':'Feature importance'}, xlabel='Feature importance', ylabel='Features'>