# load libraries
import yaml
import lightgbm
import matplotlib.pyplot as plt
import pandas as pd
import geopandas
import numpy as np
import plotly.figure_factory as ff
import plotly.express as px
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import os
#you will need to reset this to your own working directory
os.chdir('/home/desktop3')
path = os.getcwd()
print(path)
# read in (yaml) configs
with open(path + '/itu/conf/model_config.yaml', 'r') as conf:
model_config = yaml.safe_load(conf)
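# For reference, a minimal sketch of the config structure this script assumes.
# The key names below are taken from how model_config is indexed throughout the
# script; the values shown are illustrative placeholders, not the real settings:
#
# model:
#   loc: /path/to/data/folder/          # illustrative
#   file: training_data.csv             # illustrative
# meta:
#   predictors: [feature_1, feature_2]  # illustrative
#   target: [target]
# parameter:
#   test_size: 0.2
#   inner_cv: 5
#   threshold: 0.3
#   iterations: 50
#   Random_Forest:
#     n_estimators: [100, 200]
#     max_depth: [10, 20]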
# import data
dataset = path + model_config['model']['loc'] + model_config['model']['file']
dataset = pd.read_csv(dataset)
# subset for faster trial and error
#dataset = dataset.iloc[0:1000,:]
# define predictors and target
predictor = model_config['meta']['predictors']
target = model_config['meta']['target']
#All the columns available
dataset.columns
#Let's peek at the data table
dataset.head(n=3)
# prepare data
X = dataset[predictor]
y = dataset[target]
print('X Shape:', X.shape)
print('y Shape:', y.shape)
# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = model_config['parameter']['test_size'],
random_state = 42)
print('X_train, X_test, y_train, y_test shapes:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print("size of training dataset = ", len(X_train))
print("size of test dataset = ", len(X_test))
import seaborn as sns
sns.set(rc={'figure.figsize':(15,15)})
sns.heatmap(
X.corr(),
cmap="coolwarm",
annot=True,
vmin=-1,
vmax=1,
)
# create inner cross-validation folds for hyperparameter tuning
inner_cv = KFold(n_splits = model_config['parameter']['inner_cv'], shuffle=True)
# define parameter grid
parameters = {"n_estimators": model_config['parameter']['Random_Forest']['n_estimators'],
              "max_depth": model_config['parameter']['Random_Forest']['max_depth']}
# define model class to use
model = RandomForestRegressor(random_state=42)
# Create a custom scoring function that focuses on the low tail of the target
from sklearn.metrics import make_scorer
def custom_eval_metric(y_true, y_pred):
    # mean absolute error computed only where the prediction falls below the configured threshold
    mask = y_pred < model_config['parameter']['threshold']
    errors_low = abs(y_pred[mask] - np.asarray(y_true[mask]).flatten())
    return np.mean(errors_low)
custom_scorer = make_scorer(custom_eval_metric, greater_is_better = False)
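# Optional sanity check of the metric on toy arrays (illustrative values only):
# with a threshold of e.g. 0.3, only pairs whose prediction falls below it
# contribute to the score. Skip this if the configured threshold leaves the mask empty.
_toy_true = np.array([0.1, 0.5, 0.2, 0.9])
_toy_pred = np.array([0.2, 0.6, 0.1, 0.8])
print('custom metric on toy data:', custom_eval_metric(_toy_true, _toy_pred))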
## Either randomized or grid search can be used; grid search is used here,
## but randomized search is preferable for a larger parameter space
## define randomized search (kept for reference)
# search = RandomizedSearchCV(model, parameters, cv = inner_cv, random_state = 42,
#                             verbose = 2, n_iter = model_config['parameter']['iterations'],
#                             scoring = custom_scorer)
## define grid search
search = GridSearchCV(model, parameters, scoring = custom_scorer, cv = inner_cv,
                      refit = True,
                      verbose = 2)
# find best parameters
search.fit(X_train, y_train.values.ravel())
# all results
search.cv_results_
# best results
best_parameter = search.best_params_
print(best_parameter)
print(search.cv_results_)
print(search.best_score_)
# define final model using the best hyperparameters found above
model = RandomForestRegressor(random_state = 42,
n_estimators = best_parameter['n_estimators'],
max_depth = best_parameter['max_depth']
)
# fit the final model on the training data
model.fit(X_train, y_train.values.ravel())
# inspect the raw test-set target values
y_test.values.ravel()
import pickle
#changing directory
os.chdir('/home/desktop3/itu/models/')
path = os.getcwd()
print(path)
# pickle the model and save it to disk
filename = 'RF_model.sav'
pickle.dump(model, open(filename, 'wb'))
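# Optional round-trip check (a quick sketch, not part of the original pipeline):
# reload the pickled model and confirm it predicts identically to the in-memory one
# before relying on the saved file.
with open(filename, 'rb') as f:
    reloaded_model = pickle.load(f)
assert np.allclose(reloaded_model.predict(X_test), model.predict(X_test))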
# predict holdout
y_pred = model.predict(X_test)
# Absolute error on the holdout set
errors = abs(y_pred - y_test.iloc[:,0].to_numpy())
avg_error = np.mean(errors)
# Low-tail error: schools where the prediction is below the threshold
errors_low = abs(y_pred[y_pred<model_config['parameter']['threshold']] - np.asarray(y_test[y_pred<model_config['parameter']['threshold']]).flatten())
# Low-tail error: schools where the ground truth is below the threshold
errors_low_ytest = abs(y_pred[np.asarray(y_test).flatten()<model_config['parameter']['threshold']]
                       - np.asarray(y_test[np.asarray(y_test).flatten()<model_config['parameter']['threshold']]).flatten())
# average low-tail errors
avg_error_low = np.mean(errors_low)
avg_error_low_ytest = np.mean(errors_low_ytest)
# standard deviation of the low-tail error
stan_dev_low = np.std(errors_low)
print('errors: ', errors)
print('avg error: ', avg_error)
# print('Just the lower errors: ', errors_low)
print('Mean lower error: ', avg_error_low)
print('Mean ytest lower error: ', avg_error_low_ytest)
# print('y test error: ', errors_low_ytest)
print('Standard Dev of Low Error: ', stan_dev_low)
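# Cross-check (optional sketch): the overall average error above should match
# sklearn's mean_absolute_error computed on the same arrays.
from sklearn.metrics import mean_absolute_error
print('sklearn MAE: ', mean_absolute_error(y_test.iloc[:,0].to_numpy(), y_pred))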
# check the raw number of schools in the low tail
print('Number of schools predicted to be lower than 30%: ', len(y_pred[y_pred<.3]))
print('Number of schools that are actually below 30%: ', len(y_test[y_test['target']<0.3]))
# add predictions and absolute errors to y_test so it can be merged back into the main dataframe
y_test['Predictions'] = y_pred.tolist()
y_test['Errors'] = abs(y_test['target'] - y_test['Predictions'])
y_test
#Only values where ground truth less than .3
onlygtvalues = y_test.loc[y_test['target']<.3]
# Merge y_test back into main df
df_merge = pd.merge(y_test, dataset, how= "inner", left_index=True, right_index=True)
High_Error_Schools = df_merge.loc[df_merge['Errors']>.3]
High_Error_Schools.shape
High_Pred_Schools = df_merge.loc[df_merge['Predictions']>.7]
High_Pred_Schools
df_merge.shape
#Creating a geodataframe
df_merge['school_location'] = geopandas.GeoSeries.from_wkt(df_merge['school_location'])
gdf = geopandas.GeoDataFrame(df_merge, geometry='school_location')
#Subset by just low Ground truth values
Low_GT = gdf.loc[gdf['target_x']<.3]
Low_pred = gdf.loc[gdf['Predictions']<.3]
# schools with low ground truth whose prediction is above .3 (missed low tail)
Low_GT.loc[Low_GT['Predictions']>.3]
fig,ax =plt.subplots(1, figsize=(14,6))
# add a title and annotation
ax.set_title('Predictions for schools in test set', fontdict={'fontsize': '13', 'fontweight' : '3'})
gdf.plot(column="Predictions", cmap = 'viridis' ,legend=True, ax=ax)
# ctx.add_basemap(ax)
plt.show()
fig,ax =plt.subplots(1, figsize=(14,6))
# add a title and annotation
ax.set_title('Errors in Schools where Prediction is less than 30% connected', fontdict={'fontsize': '13', 'fontweight' : '3'})
# errors for schools where the prediction is below .3
Low_pred.plot(column="Errors", cmap = 'viridis' ,legend=True, ax=ax)
plt.show()
vmin, vmax = 0, .7
fig, ax = plt.subplots(1, figsize=(14,6))
# add a title and annotation
ax.set_title('Errors in Schools where Ground Truth is less than 30% connected', fontdict={'fontsize': '13', 'fontweight' : '3'})
# errors for schools where the ground truth is below .3
Low_GT.plot(column="Errors", cmap = 'viridis', legend=True, vmin=vmin, vmax=vmax, ax=ax)
plt.show()
vmin, vmax = 0, .7
fig, ax = plt.subplots(1, figsize=(14,6))
# add a title and annotation
ax.set_title('Errors in Schools in Brazil test set', fontdict={'fontsize': '13', 'fontweight' : '3'})
gdf.plot(column="Errors", cmap = 'viridis', legend=True, vmin=vmin, vmax=vmax, ax=ax)
plt.show()
# ground truth values as a flat array for plotting (predictions are already in y_pred)
y = y_test.iloc[:,0].to_numpy()
fig = px.scatter(x=y, y=y_pred, labels={'x': 'ground truth', 'y': 'prediction'},
title = 'Comparison between predictions and reality',
template = 'plotly_dark')
fig.update_traces(marker=dict(size=3,
color=((abs(y-y_pred) < 0.15).astype('int')),
colorscale=[[0, '#FAED27'],[1, '#98FB98']])
)
fig.add_shape(
type="line", line=dict(dash='dash'),
x0=y.min(), y0=y.min(),
x1=y.max(), y1=y.max()
)
fig.show()
res_df = pd.DataFrame()
res_df['prediction'] = y_pred
res_df['ground truth'] = y
res_df['residual'] = y_pred - y
fig = px.scatter(
res_df, x='ground truth', y='residual',
#marginal_y='violin',
trendline='ols', template = 'plotly_dark',
title = 'Comparison between residuals and reality'
)
fig.update_traces(marker=dict(size=3,
color=((abs(res_df.residual) < 0.15).astype('int')),
colorscale=[[0, '#FAED27'],[1, '#98FB98']])
)
fig.show()
fig = px.scatter(
res_df, x='prediction', y='residual',
#marginal_y='violin',
trendline='ols', template = 'plotly_dark',
title = 'Comparison between residuals and predictions'
)
fig.update_traces(marker=dict(size=3,
color=((abs(res_df.residual) < 0.15).astype('int')),
colorscale=[[0, '#FAED27'],[1, '#98FB98']])
)
fig.show()
online_pop = [y_pred, y_test.iloc[:,0].to_numpy()]
labels = ['predictions', 'reality']
fig = ff.create_distplot(online_pop, labels, show_hist = False)
fig.layout.update({'title':'Comparison of distributions of reality and predictions',
'title_font_color':'white',
'legend_bgcolor':'#545454',
'font_color':'white',
'plot_bgcolor':'#545454',
'paper_bgcolor':'#2a2a2a',
'yaxis':{'gridcolor':'#2a2a2a', 'zerolinecolor':'#2a2a2a'},
'xaxis':{'gridcolor':'#2a2a2a'}
})
fig.show()
# Make the dataframe
importance = pd.DataFrame(
{"Feature": X.columns, "Importance": model.feature_importances_}
).sort_values("Importance")
importance
fig,ax =plt.subplots(1, figsize=(14,6))
# add a title and annotation
ax.set_title('Feature Importances', fontdict={'fontsize': '13', 'fontweight' : '3'})
# horizontal bar chart of the (up to) 12 smallest feature importances, sorted ascending
(pd.Series(model.feature_importances_, index=X.columns)
 .nsmallest(12).plot(kind='barh'))