{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Predicting the world's offline population\n", "## First Model Training and Evaluation \n", "## (Random Forest on Brazil)" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "# load libraries (standard library first, then third-party)\n", "import os\n", "\n", "import yaml\n", "import lightgbm\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import geopandas\n", "import numpy as np\n", "import plotly.figure_factory as ff\n", "import plotly.express as px\n", "from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV\n", "from sklearn.ensemble import RandomForestRegressor" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load and prepare data" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/desktop3\n" ] } ], "source": [ "# Project root: set the ITU_PROJECT_ROOT environment variable to point at your own\n", "# working directory instead of editing this cell. When the variable is unset the\n", "# original hard-coded directory is used, so existing setups keep working.\n", "project_root = os.environ.get('ITU_PROJECT_ROOT', '/home/desktop3')\n", "os.chdir(project_root)\n", "# `path` is the prefix used below to locate the config file and the dataset\n", "path = os.getcwd()\n", "print(path)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "# read in (yaml) configs\n", "with open(path + '/itu/conf/model_config.yaml', 'r') as conf:\n", "    model_config = yaml.safe_load(conf)\n", "\n", "# import data: the CSV location is `path` followed by the config's\n", "# model.loc and model.file entries, concatenated as-is\n", "dataset_path = path + model_config['model']['loc'] + model_config['model']['file']\n", "dataset = pd.read_csv(dataset_path)\n", "# subset for faster trial and error\n", "#dataset = dataset.iloc[0:1000,:]\n", "\n", "# define predictors and target\n", "predictor = model_config['meta']['predictors']\n", "target = model_config['meta']['target']" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['source_school_id', 'latitude', 'longitude', 'school_location',\n", " 'geometry', 'target', 'mean_ghm', 'mean_avg_rad', 'change_year_avg_rad',\n", " 
'slope_year_avg_rad', 'change_month_avg_rad', 'slope_month_avg_rad',\n", " 'mean_cf_cvg', 'change_year_cf_cvg', 'slope_year_cf_cvg',\n", " 'change_month_cf_cvg', 'slope_month_cf_cvg', 'mean_NDVI',\n", " 'change_year_NDVI', 'slope_year_NDVI', 'change_month_NDVI',\n", " 'slope_month_NDVI', 'estimate_dau', 'estimate_mau', 'estimate_ready',\n", " 'range', 'avg_d_kbps', 'avg_u_kbps', 'population'],\n", " dtype='object')" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#All the columns available\n", "dataset.columns" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | source_school_id | \n", "latitude | \n", "longitude | \n", "school_location | \n", "geometry | \n", "target | \n", "mean_ghm | \n", "mean_avg_rad | \n", "change_year_avg_rad | \n", "slope_year_avg_rad | \n", "... | \n", "slope_year_NDVI | \n", "change_month_NDVI | \n", "slope_month_NDVI | \n", "estimate_dau | \n", "estimate_mau | \n", "estimate_ready | \n", "range | \n", "avg_d_kbps | \n", "avg_u_kbps | \n", "population | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "11000023 | \n", "-8.7585 | \n", "-63.8540 | \n", "POINT (-63.854 -8.7585) | \n", "POLYGON ((-63.844 -8.7585, -63.84404815273328 ... | \n", "1.0 | \n", "0.778150 | \n", "29.486260 | \n", "1.040262 | \n", "0.173377 | \n", "... | \n", "23.184182 | \n", "352.089866 | \n", "2.551376 | \n", "180872 | \n", "270000 | \n", "True | \n", "1.0 | \n", "111541.0 | \n", "41429.0 | \n", "21621.277 | \n", "
1 | \n", "11001364 | \n", "-8.7627 | \n", "-63.8642 | \n", "POINT (-63.8642 -8.762700000000001) | \n", "POLYGON ((-63.8542 -8.762700000000001, -63.854... | \n", "1.0 | \n", "0.813903 | \n", "32.686109 | \n", "-0.064534 | \n", "-0.010756 | \n", "... | \n", "24.580166 | \n", "336.748641 | \n", "2.440208 | \n", "189483 | \n", "280000 | \n", "True | \n", "1.0 | \n", "19693.0 | \n", "16480.0 | \n", "25260.330 | \n", "
2 | \n", "11003138 | \n", "-8.7579 | \n", "-63.8554 | \n", "POINT (-63.8554 -8.757899999999999) | \n", "POLYGON ((-63.84540000000001 -8.75789999999999... | \n", "1.0 | \n", "0.781545 | \n", "29.852872 | \n", "0.962008 | \n", "0.160335 | \n", "... | \n", "23.466914 | \n", "356.627458 | \n", "2.584257 | \n", "180714 | \n", "270000 | \n", "True | \n", "1.0 | \n", "111541.0 | \n", "41429.0 | \n", "32302.738 | \n", "
3 rows × 29 columns
\n", "\n", " | target | \n", "Predictions | \n", "Errors | \n", "
---|---|---|---|
4217 | \n", "0.646287 | \n", "0.757242 | \n", "0.110955 | \n", "
10753 | \n", "0.603167 | \n", "0.757242 | \n", "0.154075 | \n", "
6862 | \n", "0.861206 | \n", "0.769001 | \n", "0.092205 | \n", "
10890 | \n", "1.000000 | \n", "0.814451 | \n", "0.185549 | \n", "
6422 | \n", "0.701705 | \n", "0.757242 | \n", "0.055538 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
11297 | \n", "0.859788 | \n", "0.757242 | \n", "0.102546 | \n", "
7402 | \n", "0.916813 | \n", "0.769001 | \n", "0.147812 | \n", "
5860 | \n", "0.628346 | \n", "0.625350 | \n", "0.002995 | \n", "
7383 | \n", "0.743563 | \n", "0.701304 | \n", "0.042260 | \n", "
5095 | \n", "1.000000 | \n", "0.769001 | \n", "0.230999 | \n", "
3520 rows × 3 columns
\n", "" ], "text/plain": [ " target Predictions Errors\n", "4217 0.646287 0.757242 0.110955\n", "10753 0.603167 0.757242 0.154075\n", "6862 0.861206 0.769001 0.092205\n", "10890 1.000000 0.814451 0.185549\n", "6422 0.701705 0.757242 0.055538\n", "... ... ... ...\n", "11297 0.859788 0.757242 0.102546\n", "7402 0.916813 0.769001 0.147812\n", "5860 0.628346 0.625350 0.002995\n", "7383 0.743563 0.701304 0.042260\n", "5095 1.000000 0.769001 0.230999\n", "\n", "[3520 rows x 3 columns]" ] }, "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Attach the predictions and their absolute errors to y_test so it can be merged back\n", "y_test['Predictions'] = pred.tolist()\n", "y_test['Errors'] = (y_test['target'] - y_test['Predictions']).abs()\n", "y_test" ] }, { "cell_type": "code", "execution_count": 87, "metadata": { "scrolled": false }, "outputs": [], "source": [ "# Keep only the rows whose ground-truth target is below 0.3\n", "onlygtvalues = y_test[y_test['target'] < 0.3]" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Join y_test (target, predictions, errors) onto the main dataset by index label.\n", "# Both frames carry a 'target' column, so the result holds target_x and target_y.\n", "# NOTE(review): an inner join silently drops any y_test row whose index label is\n", "# absent from `dataset` - confirm the row count is unchanged after the merge.\n", "df_merge = y_test.merge(dataset, how='inner', left_index=True, right_index=True)" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "(257, 32)" ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows whose absolute prediction error exceeds 0.3\n", "High_Error_Schools = df_merge[df_merge['Errors'] > 0.3]\n", "High_Error_Schools.shape" ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "\n", " | target_x | \n", "Predictions | \n", "Errors | \n", "source_school_id | \n", "latitude | \n", "longitude | \n", "school_location | \n", "geometry | \n", "target_y | \n", "mean_ghm | \n", "... 
| \n", "slope_year_NDVI | \n", "change_month_NDVI | \n", "slope_month_NDVI | \n", "estimate_dau | \n", "estimate_mau | \n", "estimate_ready | \n", "range | \n", "avg_d_kbps | \n", "avg_u_kbps | \n", "population | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4217 | \n", "0.646287 | \n", "0.757242 | \n", "0.110955 | \n", "33136424 | \n", "-22.8980 | \n", "-43.5612 | \n", "POINT (-43.5612 -22.898) | \n", "POLYGON ((-43.5512 -22.898, -43.55124815273328... | \n", "0.646287 | \n", "0.850619 | \n", "... | \n", "47.371039 | \n", "-18.119513 | \n", "-0.131301 | \n", "286201 | \n", "440000 | \n", "True | \n", "1.0 | \n", "13498.0 | \n", "8925.0 | \n", "16845.479 | \n", "
10753 | \n", "0.603167 | \n", "0.757242 | \n", "0.154075 | \n", "35191905 | \n", "-23.6853 | \n", "-46.7645 | \n", "POINT (-46.7645 -23.6853) | \n", "POLYGON ((-46.7545 -23.6853, -46.7545481527332... | \n", "0.603167 | \n", "0.820720 | \n", "... | \n", "49.638491 | \n", "360.960122 | \n", "2.615653 | \n", "477677 | \n", "740000 | \n", "True | \n", "1.0 | \n", "23018.0 | \n", "10107.0 | \n", "81397.070 | \n", "
6862 | \n", "0.861206 | \n", "0.769001 | \n", "0.092205 | \n", "41151321 | \n", "-25.4652 | \n", "-49.2637 | \n", "POINT (-49.2637 -25.4652) | \n", "POLYGON ((-49.2537 -25.4652, -49.2537481527332... | \n", "0.861206 | \n", "0.927893 | \n", "... | \n", "-3.049995 | \n", "-43.557768 | \n", "-0.315636 | \n", "501140 | \n", "740000 | \n", "True | \n", "1.0 | \n", "43183.0 | \n", "20725.0 | \n", "17793.387 | \n", "
10890 | \n", "1.000000 | \n", "0.814451 | \n", "0.185549 | \n", "35201558 | \n", "-23.6397 | \n", "-46.5570 | \n", "POINT (-46.557 -23.6397) | \n", "POLYGON ((-46.547 -23.6397, -46.54704815273328... | \n", "1.000000 | \n", "0.941395 | \n", "... | \n", "44.330307 | \n", "192.747505 | \n", "1.396721 | \n", "505241 | \n", "750000 | \n", "True | \n", "1.0 | \n", "20768.0 | \n", "7836.0 | \n", "48765.023 | \n", "
6422 | \n", "0.701705 | \n", "0.757242 | \n", "0.055538 | \n", "35087658 | \n", "-23.6506 | \n", "-46.7752 | \n", "POINT (-46.7752 -23.6506) | \n", "POLYGON ((-46.7652 -23.6506, -46.7652481527332... | \n", "0.701705 | \n", "0.905878 | \n", "... | \n", "44.500204 | \n", "199.094558 | \n", "1.442714 | \n", "605050 | \n", "920000 | \n", "True | \n", "1.0 | \n", "19232.0 | \n", "9061.0 | \n", "81984.070 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
2273 | \n", "0.819661 | \n", "0.757242 | \n", "0.062418 | \n", "26179563 | \n", "-8.1382 | \n", "-34.9738 | \n", "POINT (-34.9738 -8.138199999999999) | \n", "POLYGON ((-34.9638 -8.138199999999999, -34.963... | \n", "0.819661 | \n", "0.662177 | \n", "... | \n", "-1.000300 | \n", "-529.923684 | \n", "-3.840027 | \n", "175496 | \n", "280000 | \n", "True | \n", "1.0 | \n", "8060.0 | \n", "4091.0 | \n", "17098.629 | \n", "
11297 | \n", "0.859788 | \n", "0.757242 | \n", "0.102546 | \n", "43293069 | \n", "-30.1587 | \n", "-51.1714 | \n", "POINT (-51.1714 -30.1587) | \n", "POLYGON ((-51.1614 -30.1587, -51.1614481527332... | \n", "0.859788 | \n", "0.654317 | \n", "... | \n", "32.973480 | \n", "184.810163 | \n", "1.339204 | \n", "100934 | \n", "150000 | \n", "True | \n", "1.0 | \n", "28656.0 | \n", "15522.0 | \n", "14002.161 | \n", "
7402 | \n", "0.916813 | \n", "0.769001 | \n", "0.147812 | \n", "50006304 | \n", "-20.4430 | \n", "-54.6207 | \n", "POINT (-54.6207 -20.443) | \n", "POLYGON ((-54.6107 -20.443, -54.61074815273328... | \n", "0.916813 | \n", "0.838306 | \n", "... | \n", "-6.521056 | \n", "42.282435 | \n", "0.306394 | \n", "187619 | \n", "280000 | \n", "True | \n", "1.0 | \n", "9010.0 | \n", "18841.0 | \n", "14832.796 | \n", "
7383 | \n", "0.743563 | \n", "0.701304 | \n", "0.042260 | \n", "50023268 | \n", "-20.5192 | \n", "-54.6528 | \n", "POINT (-54.6528 -20.5192) | \n", "POLYGON ((-54.6428 -20.5192, -54.6428481527332... | \n", "0.743563 | \n", "0.824927 | \n", "... | \n", "-21.618274 | \n", "126.488789 | \n", "0.916585 | \n", "169553 | \n", "250000 | \n", "True | \n", "1.0 | \n", "17964.0 | \n", "21932.0 | \n", "22000.328 | \n", "
5095 | \n", "1.000000 | \n", "0.769001 | \n", "0.230999 | \n", "35052462 | \n", "-23.5477 | \n", "-46.5314 | \n", "POINT (-46.5314 -23.5477) | \n", "POLYGON ((-46.5214 -23.5477, -46.5214481527332... | \n", "1.000000 | \n", "0.937670 | \n", "... | \n", "41.812569 | \n", "113.288040 | \n", "0.820928 | \n", "425181 | \n", "660000 | \n", "True | \n", "1.0 | \n", "34611.0 | \n", "12885.0 | \n", "46067.285 | \n", "
2694 rows × 32 columns
\n", "\n", " | target_x | \n", "Predictions | \n", "Errors | \n", "source_school_id | \n", "latitude | \n", "longitude | \n", "school_location | \n", "geometry | \n", "target_y | \n", "mean_ghm | \n", "... | \n", "slope_year_NDVI | \n", "change_month_NDVI | \n", "slope_month_NDVI | \n", "estimate_dau | \n", "estimate_mau | \n", "estimate_ready | \n", "range | \n", "avg_d_kbps | \n", "avg_u_kbps | \n", "population | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10884 | \n", "0.226581 | \n", "0.757242 | \n", "0.530661 | \n", "35463361 | \n", "-23.6343 | \n", "-46.7861 | \n", "POINT (-46.78610 -23.63430) | \n", "POLYGON ((-46.7761 -23.6343, -46.7761481527332... | \n", "0.226581 | \n", "0.881110 | \n", "... | \n", "46.104083 | \n", "202.052343 | \n", "1.464147 | \n", "529637 | \n", "800000 | \n", "True | \n", "1.0 | \n", "14617.0 | \n", "5047.0 | \n", "70512.72000 | \n", "
3520 | \n", "0.182751 | \n", "0.660610 | \n", "0.477858 | \n", "32004257 | \n", "-18.8462 | \n", "-41.1259 | \n", "POINT (-41.12590 -18.84620) | \n", "POLYGON ((-41.1159 -18.8462, -41.1159481527332... | \n", "0.182751 | \n", "0.376236 | \n", "... | \n", "51.010652 | \n", "614.754626 | \n", "4.454744 | \n", "2607 | \n", "3800 | \n", "True | \n", "1.0 | \n", "29117.0 | \n", "12463.0 | \n", "712.15967 | \n", "
8076 | \n", "0.226581 | \n", "0.757242 | \n", "0.530661 | \n", "35906438 | \n", "-23.6325 | \n", "-46.7838 | \n", "POINT (-46.78380 -23.63250) | \n", "POLYGON ((-46.7738 -23.6325, -46.7738481527332... | \n", "0.226581 | \n", "0.885265 | \n", "... | \n", "46.598101 | \n", "200.930922 | \n", "1.456021 | \n", "534820 | \n", "810000 | \n", "True | \n", "1.0 | \n", "21332.0 | \n", "13140.0 | \n", "70512.72000 | \n", "
2227 | \n", "0.000000 | \n", "0.757242 | \n", "0.757242 | \n", "26109255 | \n", "-8.1487 | \n", "-34.9350 | \n", "POINT (-34.93500 -8.14870) | \n", "POLYGON ((-34.925 -8.1487, -34.92504815273328 ... | \n", "0.000000 | \n", "0.770606 | \n", "... | \n", "7.016483 | \n", "-477.589528 | \n", "-3.460794 | \n", "288657 | \n", "490000 | \n", "True | \n", "1.0 | \n", "19016.0 | \n", "14455.0 | \n", "40151.73400 | \n", "
5216 | \n", "0.283260 | \n", "0.769001 | \n", "0.485741 | \n", "35237206 | \n", "-23.4992 | \n", "-46.4456 | \n", "POINT (-46.44560 -23.49920) | \n", "POLYGON ((-46.4356 -23.4992, -46.4356481527332... | \n", "0.283260 | \n", "0.919346 | \n", "... | \n", "45.165093 | \n", "310.164783 | \n", "2.247571 | \n", "607192 | \n", "940000 | \n", "True | \n", "1.0 | \n", "20874.0 | \n", "9890.0 | \n", "37301.99600 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4954 | \n", "0.000000 | \n", "0.769001 | \n", "0.769001 | \n", "35052224 | \n", "-23.4902 | \n", "-46.6176 | \n", "POINT (-46.61760 -23.49020) | \n", "POLYGON ((-46.60760000000001 -23.4902, -46.607... | \n", "0.000000 | \n", "0.913985 | \n", "... | \n", "49.607272 | \n", "130.863354 | \n", "0.948285 | \n", "371466 | \n", "580000 | \n", "True | \n", "1.0 | \n", "31305.0 | \n", "10573.0 | \n", "45300.36000 | \n", "
5204 | \n", "0.283260 | \n", "0.769001 | \n", "0.485741 | \n", "35002872 | \n", "-23.4999 | \n", "-46.4548 | \n", "POINT (-46.45480 -23.49990) | \n", "POLYGON ((-46.4448 -23.4999, -46.4448481527332... | \n", "0.283260 | \n", "0.923450 | \n", "... | \n", "46.947221 | \n", "285.283845 | \n", "2.067274 | \n", "578935 | \n", "890000 | \n", "True | \n", "1.0 | \n", "49289.0 | \n", "12209.0 | \n", "45736.30000 | \n", "
561 | \n", "0.156091 | \n", "0.354302 | \n", "0.198211 | \n", "15028712 | \n", "-1.0963 | \n", "-48.6672 | \n", "POINT (-48.66720 -1.09630) | \n", "POLYGON ((-48.6572 -1.0963, -48.65724815273328... | \n", "0.156091 | \n", "0.221789 | \n", "... | \n", "-0.005057 | \n", "-769.659676 | \n", "-5.577244 | \n", "2083 | \n", "3600 | \n", "True | \n", "1.0 | \n", "26178.0 | \n", "22267.0 | \n", "89.23215 | \n", "
1162 | \n", "0.288257 | \n", "0.351950 | \n", "0.063693 | \n", "21029644 | \n", "-2.8216 | \n", "-42.8776 | \n", "POINT (-42.87760 -2.82160) | \n", "POLYGON ((-42.8676 -2.8216, -42.86764815273328... | \n", "0.288257 | \n", "0.251747 | \n", "... | \n", "55.154046 | \n", "1469.419944 | \n", "10.647971 | \n", "1156 | \n", "2700 | \n", "True | \n", "1.0 | \n", "459.0 | \n", "847.0 | \n", "260.47064 | \n", "
2724 | \n", "0.121238 | \n", "0.492794 | \n", "0.371555 | \n", "29073073 | \n", "-11.4377 | \n", "-40.4247 | \n", "POINT (-40.42470 -11.43770) | \n", "POLYGON ((-40.4147 -11.4377, -40.4147481527332... | \n", "0.121238 | \n", "0.252865 | \n", "... | \n", "68.665824 | \n", "-2903.342207 | \n", "-21.038712 | \n", "498 | \n", "1000 | \n", "True | \n", "1.0 | \n", "9830.0 | \n", "2704.0 | \n", "1532.29200 | \n", "
119 rows × 32 columns
\n", "\n", " | Feature | \n", "Importance | \n", "
---|---|---|
0 | \n", "avg_d_kbps | \n", "0.000000 | \n", "
1 | \n", "avg_u_kbps | \n", "0.006089 | \n", "
5 | \n", "slope_year_avg_rad | \n", "0.011660 | \n", "
11 | \n", "population | \n", "0.017606 | \n", "
8 | \n", "slope_month_cf_cvg | \n", "0.026004 | \n", "
2 | \n", "mean_ghm | \n", "0.033470 | \n", "
9 | \n", "mean_NDVI | \n", "0.043408 | \n", "
6 | \n", "slope_month_avg_rad | \n", "0.058660 | \n", "
4 | \n", "mean_cf_cvg | \n", "0.115417 | \n", "
10 | \n", "slope_year_NDVI | \n", "0.120491 | \n", "
7 | \n", "estimate_mau | \n", "0.123015 | \n", "
3 | \n", "mean_avg_rad | \n", "0.444180 | \n", "