In [1]:
# load libraries
import yaml
import lightgbm
import matplotlib.pyplot as plt
import pandas as pd
import geopandas
import numpy as np
import os
import pandas as pd
import pickle
os.chdir('/home/desktop3/itu')
path = os.getcwd()
print(path)
/home/desktop3/itu
/opt/anaconda3/lib/python3.8/site-packages/geopandas/_compat.py:106: UserWarning: The Shapely GEOS version (3.8.0-CAPI-1.13.1 ) is incompatible with the GEOS version PyGEOS was compiled with (3.9.1-CAPI-1.14.2). Conversions between both will be slow.
  warnings.warn(

Loading the Thailand Data

In [2]:
# read in (yaml) configs
with open(path + '/conf/model_config.yaml', 'r') as conf:
    model_config = yaml.safe_load(conf)

# import data
dataset = model_config['model']['loc'] + model_config['model']['file']
dataset = pd.read_csv(dataset)

# define predictors and target
predictor   =  ['avg_d_kbps',
       'avg_u_kbps',
       'mean_ghm', 'avg_rad_mean',  'cf_cvg_mean',  'slope_avg_rad',
       'slope_cf_cvg', 'slope_month_avg_rad',
       'slope_monthcf_cvg', 'estimate_mau', 'value_norm']
target = ['A4A_right']

Current issue is that predictors are named differently in Thailand than in Brazil dataset and also, it needs to be the same exact predictors used for both

In [3]:
dataset.columns
Out[3]:
Index(['Unnamed: 0', 'Unnamed: 0.1', 'School', 'source_school_id', 'ENUM',
       'H107', 'lon', 'lat', 'range', 'samples', 'avg_d_kbps', 'avg_u_kbps',
       'avg_d_kbps3', 'avg_u_kbps3', 'avg_rad_mean', 'cf_cvg_mean',
       'slope_avg_rad', 'change_avg_rad', 'slope_cf_cvg', 'change_cf_cvg',
       'slope_monthcf_cvg', 'change_monthcf_cvg', 'slope_month_avg_rad',
       'change_month_avg_rad', 'slope_NDVI', 'change_NDVI', 'mean_ghm',
       'estimate_dau', 'estimate_mau', 'estimate_ready', 'value',
       'value_norm'],
      dtype='object')
In [4]:
dataset
Out[4]:
Unnamed: 0 Unnamed: 0.1 School source_school_id ENUM H107 lon lat range samples ... slope_month_avg_rad change_month_avg_rad slope_NDVI change_NDVI mean_ghm estimate_dau estimate_mau estimate_ready value value_norm
0 0 0 POINT (98.2554367 8.6658159) 255542593 11.0 1.0 98.257309 8.666424 1.0 1.0 ... 0.008125 0.008125 -63.107123 -63.107123 0.556310 4835 5900 True 254.46342 0.804133
1 1 1 POINT (100.5637277 13.7281689) 280389453 11.0 1.0 100.563824 13.728090 1.0 12.0 ... 0.154787 0.154787 -124.015608 -124.015608 0.891303 1040046 1600000 True 13295.45700 0.908735
2 2 2 POINT (99.3090438 6.4918845) 321664554 11.0 1.0 99.309902 6.491847 1.0 4.0 ... 0.003737 0.003737 132.490023 132.490023 0.348923 0 0 False 109.93822 0.802974
3 3 3 POINT (98.99020710000001 18.8422541) 336232077 14.0 1.0 98.990578 18.842300 1.0 1.0 ... 0.029212 0.029212 -52.833499 -52.833499 0.773788 237917 330000 True 1407.96750 0.813386
4 4 4 POINT (102.773318 16.4357572) 373156854 14.0 1.0 102.774895 16.435555 1.0 22.0 ... 0.023249 0.023249 19.408091 19.408091 0.819569 42591 58000 True 548.45380 0.806492
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
4718 4718 4718 POINT (101.2758538 14.2842555) 8879142203 11.0 1.0 101.275375 14.284611 1.0 3.0 ... 0.012734 0.012734 43.746437 43.746437 0.599212 10540 14000 True 145.64415 0.803261
4719 4719 4719 POINT (105.4027044 14.9497545) 8905212408 11.0 1.0 105.407876 14.950656 1.0 7.0 ... 0.001070 0.001070 22.923611 22.923611 0.477034 1486 1700 True 37.91742 0.802397
4720 4720 4720 POINT (100.2320763 16.8519458) 8908633917 14.0 1.0 100.231919 16.851753 1.0 11.0 ... 0.005698 0.005698 27.090730 27.090730 0.799692 89652 120000 True 1028.61450 0.810343
4721 4721 4721 POINT (99.82662910000001 19.904528) 8931699084 14.0 1.0 99.826927 19.904137 1.0 4.0 ... 0.048839 0.048839 -143.544324 -143.544324 0.772304 97046 130000 True 2506.85130 0.822200
4722 4722 4722 POINT (99.8260198 19.9058235) 8931701767 14.0 1.0 99.825790 19.905846 1.0 1.0 ... 0.050350 0.050350 -144.775981 -144.775981 0.771019 97944 130000 True 2071.77270 0.818710

4723 rows × 32 columns

In [5]:
# prepare data
X = dataset[predictor]
# y = dataset[target]
print('X Shape:', X.shape)
# print('y Shape:', y.shape)
X Shape: (4723, 11)

Reloading saved pickel model

In [6]:
import mlflow
logged_model = 'file:///files/mlruns/78/add3053875954bc494facaa833b20746/artifacts/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
import pandas as pd
y_pred = loaded_model.predict(X)
In [7]:
y_pred
Out[7]:
array([0.6241376 , 0.67853665, 0.7255879 , ..., 0.62164384, 0.7443871 ,
       0.74196595], dtype=float32)

Evaluating the Model

Can't do the below because don't have ground truth on the same level

In [8]:
# Absolute error
errors = abs(y_pred - y.iloc[:,0].to_numpy())
avg_error = np.mean(errors)

#Low tail error
errors_low = abs(y_pred[y_pred<0.3] - np.asarray(y[y_pred<0.3]).flatten())

#Low tail error
errors_low_ytest = abs(y_pred[np.asarray(y).flatten()<0.3] - np.asarray(y[np.asarray(y).flatten()<0.3]).flatten())

#avg error
avg_error_low = np.mean(errors_low)


#avg error
avg_error_low_ytest = np.mean(errors_low)

#standard deviation
stan_dev_low= np.std(errors_low)

print('errors: ', errors)
print('avg error: ', avg_error)
# print('Just the lower errors: ', errors_low)
print('Mean lower error: ', avg_error_low)
print('Mean ytest lower error: ', avg_error_low_ytest)
# print('y test error: ', errors_low_ytest)
print('Standard Dev of Low Error: ', stan_dev_low)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-471cdd741ba6> in <module>
      1 # Absolute error
----> 2 errors = abs(y_pred - y.iloc[:,0].to_numpy())
      3 avg_error = np.mean(errors)
      4 
      5 #Low tail error

NameError: name 'y' is not defined
In [ ]:
# Absolute error
errors = abs(y_pred - y.iloc[:,0].to_numpy())
avg_error = np.mean(errors)

#Low tail error
errors_low = abs(y_pred[y_pred<0.5] - np.asarray(y[y_pred<0.5]).flatten())

#Low tail error
errors_low_ytest = abs(pred[np.asarray(y).flatten()<0.5] - np.asarray(y[np.asarray(y).flatten()<0.5]).flatten())

#avg error
avg_error_low = np.mean(errors_low)


#avg error
avg_error_low_ytest = np.mean(errors_low)

#standard deviation
stan_dev_low= np.std(errors_low)

print('errors: ', errors)
print('avg error: ', avg_error)
# print('Just the lower errors: ', errors_low)
print('Mean lower error: ', avg_error_low)
print('Mean ytest lower error: ', avg_error_low_ytest)
# print('y test error: ', errors_low_ytest)
print('Standard Dev of Low Error: ', stan_dev_low)

Mapping out the Predictions

In [9]:
#Adding the predictions as a column to the dataset
dataset['Predictions']= y_pred
In [10]:
#Creating a geodataframe
from shapely import wkt
#Changing dataframe into a geodataframe focused on the school geometry column
dataset['School'] = geopandas.GeoSeries.from_wkt(dataset['School'])
gdf = geopandas.GeoDataFrame(dataset, geometry='School')
In [11]:
gdf.shape
Out[11]:
(4723, 33)
In [12]:
#Set the figure size
fig,ax =plt.subplots(1, figsize=(15,10))

#plotting with the new geodataframe, each school point by prediction value
gdf.plot(column="Predictions", cmap = 'viridis' ,legend=True, ax=ax)

# add a title and annotation
plt.suptitle('Predictions for all Thailand schools', fontsize=18, fontweight=3)
plt.title('4723 schools', fontsize=13)

#show the plot
plt.show()

Schools below 50% because there were no schools below 30%

In [13]:
Low_pred = gdf.loc[gdf['Predictions']<.5]
Low_pred.shape
Out[13]:
(97, 33)
In [14]:
fig,ax =plt.subplots(1, figsize=(15,10))

# add a title and annotation
# ax.set_title('Predictions for Schools below 50% internet connectivity', fontdict={'fontsize': '13', 'fontweight' : '3'})
# ax.set_suptitle('69 schools')

Low_pred.plot(column="Predictions", cmap = 'viridis' ,legend=True, ax=ax)
plt.suptitle('Predictions for Schools below 50% internet connectivity', fontsize=18, fontweight=3)
plt.title('97 schools', fontsize=13)

plt.show()