# load libraries
import yaml
import lightgbm
import matplotlib.pyplot as plt
import pandas as pd
import geopandas
import numpy as np
import plotly.figure_factory as ff
import plotly.express as px
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import os
#you will need to reset this to your own working directory
os.chdir('/home/desktop3')
path = os.getcwd()
print(path)
# read in (yaml) configs
with open(path + '/itu/conf/model_config.yaml', 'r') as conf:
model_config = yaml.safe_load(conf)
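# For reference, a minimal sketch of the config structure this script assumes.
# The key names below are taken from how model_config is indexed throughout the
# script; the values shown are illustrative placeholders, not the real settings:
#
# model:
#   loc: /path/to/data/folder/          # illustrative
#   file: training_data.csv             # illustrative
# meta:
#   predictors: [feature_1, feature_2]  # illustrative
#   target: [target]
# parameter:
#   test_size: 0.2
#   inner_cv: 5
#   threshold: 0.3
#   iterations: 50
#   Random_Forest:
#     n_estimators: [100, 200]
#     max_depth: [10, 20]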
# import data
dataset = path + model_config['model']['loc'] + model_config['model']['file']
dataset = pd.read_csv(dataset)
# subset for faster trial and error
#dataset = dataset.iloc[0:1000,:]
# define predictors and target
predictor = model_config['meta']['predictors']
target = model_config['meta']['target']
#All the columns available
dataset.columns
#Let's peek at the data table
dataset.head(n=3)
# prepare data
X = dataset[predictor]
y = dataset[target]
print('X Shape:', X.shape)
print('y Shape:', y.shape)
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = model_config['parameter']['test_size'],
random_state = 42)
print('X_train, X_test, y_train, y_test shapes:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("size of training dataset = ", len(X_train))
print("size of test dataset = ", len(X_test))
import seaborn as sns
sns.set(rc={'figure.figsize':(15,15)})
sns.heatmap(
X.corr(),
cmap="coolwarm",
annot=True,
vmin=-1,
vmax=1,
)
# create inner cross-validation folds for hyperparameter tuning
inner_cv = KFold(n_splits = model_config['parameter']['inner_cv'], shuffle=True)
# define parameter grid
parameters = {"n_estimators": model_config['parameter']['Random_Forest']['n_estimators'],
              "max_depth": model_config['parameter']['Random_Forest']['max_depth']}
# define model class to use
model = RandomForestRegressor(random_state=42)
# Create a custom scoring function that focuses on the low tail of the target
from sklearn.metrics import make_scorer
def custom_eval_metric(y_true, y_pred):
    # mean absolute error computed only where the prediction falls below the configured threshold
    mask = y_pred < model_config['parameter']['threshold']
    errors_low = abs(y_pred[mask] - np.asarray(y_true[mask]).flatten())
    return np.mean(errors_low)
custom_scorer = make_scorer(custom_eval_metric, greater_is_better = False)
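# Optional sanity check of the metric on toy arrays (illustrative values only):
# with a threshold of e.g. 0.3, only pairs whose prediction falls below it
# contribute to the score. Skip this if the configured threshold leaves the mask empty.
_toy_true = np.array([0.1, 0.5, 0.2, 0.9])
_toy_pred = np.array([0.2, 0.6, 0.1, 0.8])
print('custom metric on toy data:', custom_eval_metric(_toy_true, _toy_pred))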
## Either randomized or grid search can be used; grid search is used here,
## but randomized search is preferable for a larger parameter space
## define randomized search (kept for reference)
# search = RandomizedSearchCV(model, parameters, cv = inner_cv, random_state = 42,
#                             verbose = 2, n_iter = model_config['parameter']['iterations'],
#                             scoring = custom_scorer)
## define grid search
search = GridSearchCV(model, parameters, scoring = custom_scorer, cv = inner_cv,
                      refit = True,
                      verbose = 2)
# find best parameters
search.fit(X_train, y_train.values.ravel())
# all results
search.cv_results_
# best results
best_parameter = search.best_params_
print(best_parameter)
print(search.cv_results_)
print(search.best_score_)
# define final model using the best hyperparameters found above
model = RandomForestRegressor(random_state = 42,
n_estimators = best_parameter['n_estimators'],
max_depth = best_parameter['max_depth']
)
# fit the final model on the training data
model.fit(X_train, y_train.values.ravel())
# inspect the raw test-set target values
y_test.values.ravel()
import pickle
#changing directory
os.chdir('/home/desktop3/itu/models/')
path = os.getcwd()
print(path)
# pickle the model and save it to disk
filename = 'RF_model.sav'
pickle.dump(model, open(filename, 'wb'))
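# Optional round-trip check (a quick sketch, not part of the original pipeline):
# reload the pickled model and confirm it predicts identically to the in-memory one
# before relying on the saved file.
with open(filename, 'rb') as f:
    reloaded_model = pickle.load(f)
assert np.allclose(reloaded_model.predict(X_test), model.predict(X_test))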
# predict holdout
y_pred = model.predict(X_test)
# Absolute error on the holdout set
errors = abs(y_pred - y_test.iloc[:,0].to_numpy())
avg_error = np.mean(errors)
# Low-tail error: schools where the prediction is below the threshold
errors_low = abs(y_pred[y_pred<model_config['parameter']['threshold']] - np.asarray(y_test[y_pred<model_config['parameter']['threshold']]).flatten())
# Low-tail error: schools where the ground truth is below the threshold
errors_low_ytest = abs(y_pred[np.asarray(y_test).flatten()<model_config['parameter']['threshold']]
                       - np.asarray(y_test[np.asarray(y_test).flatten()<model_config['parameter']['threshold']]).flatten())
# average low-tail errors
avg_error_low = np.mean(errors_low)
avg_error_low_ytest = np.mean(errors_low_ytest)
# standard deviation of the low-tail error
stan_dev_low = np.std(errors_low)
print('errors: ', errors)
print('avg error: ', avg_error)
# print('Just the lower errors: ', errors_low)
print('Mean lower error: ', avg_error_low)
print('Mean ytest lower error: ', avg_error_low_ytest)
# print('y test error: ', errors_low_ytest)
print('Standard Dev of Low Error: ', stan_dev_low)
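# Cross-check (optional sketch): the overall average error above should match
# sklearn's mean_absolute_error computed on the same arrays.
from sklearn.metrics import mean_absolute_error
print('sklearn MAE: ', mean_absolute_error(y_test.iloc[:,0].to_numpy(), y_pred))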
# check the raw number of schools in the low tail
print('Number of schools predicted to be lower than 30%: ', len(y_pred[y_pred<.3]))
print('Number of schools that are actually below 30%: ', len(y_test[y_test['target']<0.3]))
# add predictions and absolute errors to y_test so it can be merged back into the main dataframe
y_test['Predictions'] = y_pred.tolist()
y_test['Errors'] = abs(y_test['target'] - y_test['Predictions'])
y_test
#Only values where ground truth less than .3
onlygtvalues = y_test.loc[y_test['target']<.3]
# Merge y_test back into main df
df_merge = pd.merge(y_test, dataset, how= "inner", left_index=True, right_index=True)
High_Error_Schools = df_merge.loc[df_merge['Errors']>.3]
High_Error_Schools.shape
High_Pred_Schools = df_merge.loc[df_merge['Predictions']>.7]
High_Pred_Schools
df_merge.shape
#Creating a geodataframe
df_merge['school_location'] = geopandas.GeoSeries.from_wkt(df_merge['school_location'])
gdf = geopandas.GeoDataFrame(df_merge, geometry='school_location')
#Subset by just low Ground truth values
Low_GT = gdf.loc[gdf['target_x']<.3]
Low_pred = gdf.loc[gdf['Predictions']<.3]
# schools with low ground truth whose prediction is above .3 (missed low tail)
Low_GT.loc[Low_GT['Predictions']>.3]
fig,ax =plt.subplots(1, figsize=(14,6))
# add a title and annotation
ax.set_title('Predictions for schools in test set', fontdict={'fontsize': '13', 'fontweight' : '3'})
gdf.plot(column="Predictions", cmap = 'viridis' ,legend=True, ax=ax)
# ctx.add_basemap(ax)
plt.show()
fig,ax =plt.subplots(1, figsize=(14,6))
# add a title and annotation
ax.set_title('Errors in Schools where Prediction is less than 30% connected', fontdict={'fontsize': '13', 'fontweight' : '3'})
# errors for schools where the prediction is below .3
Low_pred.plot(column="Errors", cmap = 'viridis' ,legend=True, ax=ax)
plt.show()
vmin, vmax = 0, .7
fig, ax = plt.subplots(1, figsize=(14,6))
# add a title and annotation
ax.set_title('Errors in Schools where Ground Truth is less than 30% connected', fontdict={'fontsize': '13', 'fontweight' : '3'})
# errors for schools where the ground truth is below .3
Low_GT.plot(column="Errors", cmap = 'viridis', legend=True, vmin=vmin, vmax=vmax, ax=ax)
plt.show()
vmin, vmax = 0, .7
fig, ax = plt.subplots(1, figsize=(14,6))
# add a title and annotation
ax.set_title('Errors in Schools in Brazil test set', fontdict={'fontsize': '13', 'fontweight' : '3'})
gdf.plot(column="Errors", cmap = 'viridis', legend=True, vmin=vmin, vmax=vmax, ax=ax)
plt.show()
# ground truth values as a flat array for plotting (predictions are already in y_pred)
y = y_test.iloc[:,0].to_numpy()
fig = px.scatter(x=y, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'},
title = 'Comparison between predictions and reality',
template = 'plotly_dark')
fig.update_traces(marker=dict(size=3,
color=((abs(y-y_pred) < 0.15).astype('int')),
colorscale=[[0, '#FAED27'],[1, '#98FB98']])
)
fig.add_shape(
type="line", line=dict(dash='dash'),
x0=y.min(), y0=y.min(),
x1=y.max(), y1=y.max()
)
fig.show()
res_df = pd.DataFrame()
res_df['prediction'] = y_pred
res_df['ground truth'] = y
res_df['residual'] = y_pred - y
fig = px.scatter(
res_df, x='ground truth', y='residual',
#marginal_y='violin',
trendline='ols', template = 'plotly_dark',
title = 'Comparison between residuals and reality'
)
fig.update_traces(marker=dict(size=3,
color=((abs(res_df.residual) < 0.15).astype('int')),
colorscale=[[0, '#FAED27'],[1, '#98FB98']])
)
fig.show()
fig = px.scatter(
res_df, x='prediction', y='residual',
#marginal_y='violin',
trendline='ols', template = 'plotly_dark',
title = 'Comparison between residuals and predictions'
)
fig.update_traces(marker=dict(size=3,
color=((abs(res_df.residual) < 0.15).astype('int')),
colorscale=[[0, '#FAED27'],[1, '#98FB98']])
)
fig.show()
online_pop = [y_pred, y_test.iloc[:,0].to_numpy()]
labels = ['predictions', 'reality']
fig = ff.create_distplot(online_pop, labels, show_hist = False)
fig.layout.update({'title':'Comparison of distributions of reality and predictions',
'title_font_color':'white',
'legend_bgcolor':'#545454',
'font_color':'white',
'plot_bgcolor':'#545454',
'paper_bgcolor':'#2a2a2a',
'yaxis':{'gridcolor':'#2a2a2a', 'zerolinecolor':'#2a2a2a'},
'xaxis':{'gridcolor':'#2a2a2a'}
})
fig.show()
# Make the dataframe
importance = pd.DataFrame(
{"Feature": X.columns, "Importance": model.feature_importances_}
).sort_values("Importance")
importance
fig,ax =plt.subplots(1, figsize=(14,6))
# add a title and annotation
ax.set_title('Feature Importances', fontdict={'fontsize': '13', 'fontweight' : '3'})
# horizontal bar chart of the (up to) 12 smallest feature importances, sorted ascending
(pd.Series(model.feature_importances_, index=X.columns)
 .nsmallest(12).plot(kind='barh'))