import numpy as np
import pandas as pd
import yaml
import matplotlib.pyplot as plt
from matplotlib.cm import ScalarMappable
import matplotlib.ticker as mtick
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import seaborn as sns
import warnings
import math
import geopandas as gp
import random
import zipfile
from io import StringIO
from urllib.request import urlopen
from datetime import datetime
from shapely.geometry import Point
from adjustText import adjust_text
import glob
import os
# --- Global notebook settings -------------------------------------------------
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.options.mode.chained_assignment = None
# Show floats with three decimals in DataFrame output.
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# Fixed seed for Python's `random` module.
random.seed(1234)
# NOTE(review): this Mapbox token is committed to the source file. It can be
# overridden through the MAPBOX_ACCESS_TOKEN environment variable; the literal
# is kept only as a fallback so existing runs behave exactly as before.
mapbox_access_token = (
    os.environ.get('MAPBOX_ACCESS_TOKEN')
    or 'pk.eyJ1IjoidXRrdWNhbm96dHVyayIsImEiOiJja3F0Njdka28wNHkyMnZwOGs2OGV3aW81In0.HRXxTxlUy1C6DGfkTmNjOQ'
)
px.set_mapbox_access_token(mapbox_access_token)
from feature_engineering import configs
# Directory that holds the training-set files for the configured country.
ts_dir = f'../../../data/training_sets/{configs.COUNTRY.title()}/'
# File names are sorted lexicographically; the last one is treated as the
# latest version.
available_versions = sorted(os.listdir(ts_dir))
file_name = available_versions[-1]
# Load the selected training set and peek at its first row.
df = pd.read_csv(f'{ts_dir}{file_name}')
df.head(1)
# Data dictionary: one row per training-set column, with `name`, `type` and
# `role` fields that drive the column selections below.
dict_path = '../../../data/meta/training_dict_' + configs.COUNTRY_CODE.lower() + '.xlsx'
dic = pd.read_excel(dict_path, engine='openpyxl')
dic.head()

# Column names grouped by the type declared in the dictionary.
names_by_type = {
    kind: dic.loc[dic['type'] == kind, 'name'].tolist()
    for kind in ('num', 'cat', 'geo')
}
num = names_by_type['num']
cat = names_by_type['cat']
geo = names_by_type['geo']
# First column whose role is 'target'.
target = dic.loc[dic['role'] == 'target', 'name'].iloc[0]
# Binary class label derived from the target: values below 0.3 become
# 'not connected', everything else (including NaN, for which `< .3` is False)
# becomes 'connected'.
df['target_cat'] = np.where(df[target] < .3, 'not connected', 'connected')

# Subsets of the numeric and categorical columns. `df_cat` is taken before the
# categorical cast below, so it keeps the columns' original dtypes.
df_num, df_cat = df[num], df[cat]

# Features singled out for the detailed plots further down.
imp_feature = ['estimate_mau', 'mean_avg_rad', 'mean_ghm', 'mean_cf_cvg']

# Cast every categorical column of `df` to pandas' Categorical dtype.
for col in cat:
    df[col] = pd.Categorical(df[col])

df.describe()
# Print the frequency of each level for every categorical column.
for col in cat:
    print(f'{col}:\n')
    print(df[col].value_counts())
# Heatmap of the pairwise correlations between all numeric variables.
num_corr = df_num.corr()
fig, ax = plt.subplots(figsize=(15, 15))
ax = sns.heatmap(
    num_corr,
    vmin=-1.0,
    vmax=1.0,
    center=0,
    cmap='RdBu_r',
    square=True,
    fmt='.1f',
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot_kws={"fontsize": 10},
)
ax.set_title('Correlation between Variables', size=14)
# Strip tick marks and the frame for a cleaner look.
ax.tick_params(bottom=False, left=False)
for side in ('left', 'right', 'bottom', 'top'):
    ax.spines[side].set_visible(False)
# Pairwise correlations, flattened into a Series indexed by (var_a, var_b).
# Only numeric/boolean columns are correlated: the categorical columns in
# `df` are not numeric, and pandas >= 2.0 raises on them instead of silently
# dropping them as older versions did.
corr_pairs = df[num + cat].select_dtypes(include=['number', 'bool']).corr().unstack()
sorted_pairs = corr_pairs.sort_values(kind="quicksort")
# Keep pairs with |r| > 0.8, excluding values exactly equal to 1
# (self-correlations).
strong_pairs = sorted_pairs[(sorted_pairs != 1) & (sorted_pairs.abs() > 0.8)]
# The second index level holds the variable name; unique() removes duplicates
# while keeping first-seen order.
strong_vars = strong_pairs.index.get_level_values(1).unique().tolist()
df_strong = df[strong_vars]
df_strong_corr = df_strong.corr()

# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
mask = np.triu(np.ones(df_strong_corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(13, 10))
sns.heatmap(df_strong_corr, mask=mask, annot=True, vmax=1.0, vmin=-1.0, fmt='.1f', center=0, cmap='RdBu_r',
            square=True, linewidths=.5, annot_kws={"fontsize": 10})
plt.title('Correlation between Strongly Correlated Pairs', size=14)
# Strip tick marks and the frame for a cleaner look.
ax.tick_params(bottom=False, left=False)
for location in ['left', 'right', 'bottom', 'top']:
    ax.spines[location].set_visible(False)
# Label encode the categorical variables (each level is replaced by its
# integer category code) so they can take part in the correlation matrix.
df_labelencoded = df.copy()
for col in cat:
    df_labelencoded[col] = df_labelencoded[col].cat.codes

# Correlate numeric/boolean columns only: the frame still contains the string
# column 'target_cat', which makes DataFrame.corr() raise on pandas >= 2.0
# (older versions dropped non-numeric columns silently).
corr_mat = (
    df_labelencoded.select_dtypes(include=['number', 'bool'])
    .corr()
    .stack()
    .reset_index(name="correlation")
)

# Rows describing the target's correlation with every other column.
# NOTE(review): `[1:]` drops the first of those rows and `[:-1]` (after the
# ascending sort on |correlation|) drops the largest one, presumably the
# target's correlation with itself. Confirm that the first row is meant to be
# excluded as well.
corr_target = corr_mat[corr_mat['level_0'] == target][1:]
corr_target = corr_target.reindex(corr_target.correlation.abs().sort_values().index)
corr_target = corr_target[:-1]
# Bar chart of each predictor's correlation with the target, with bars
# coloured by the absolute value of the correlation.
sns.set_theme(style="white", context="talk")
# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(20, 10))

data_color = corr_target['correlation'].abs().tolist()
# `plt.get_cmap` is used because `matplotlib.cm.get_cmap` was removed in
# Matplotlib 3.9.
my_cmap = plt.get_cmap('Reds')
# Stand-alone mappable that feeds the colour bar (0..1 = absolute correlation).
sm = ScalarMappable(cmap=my_cmap, norm=plt.Normalize(0, 1))
sm.set_array([])

ax.axhline(0, color="k", clip_on=False)
ax.set_ylim([-1, 1])
ax.set_ylabel("Correlation", size=16)
plt.title('Correlation between Predictors and the Target Variable')

colors = my_cmap(data_color)
rects = ax.bar(corr_target['level_1'], corr_target['correlation'], color=colors)
# Rotate the variable names once the categorical ticks exist.
ax.tick_params(axis='x', rotation=90)

# Write the value next to each bar: just above positive bars, below the others.
for p in ax.patches:
    height = p.get_height()
    label_y = height + .01 if height > 0 else height - .06
    ax.annotate('%.2f' % height, (p.get_x() + .15, label_y), fontsize=10)

ax.tick_params(bottom=False, left=False)
for location in ['right', 'top']:
    ax.spines[location].set_visible(False)

# The mappable is not attached to any Axes, so the target Axes must be passed
# explicitly for the colour bar.
cbar = fig.colorbar(sm, ax=ax)
cbar.set_label('Absolute Correlation Coefficient', rotation=270, labelpad=25)
# Share of observations in each target class, with the value printed above
# every bar.
sns.set_theme(style="white", context="talk")
fig, ax = plt.subplots(figsize=(10, 7))
sns.histplot(df, x='target_cat', stat='density', shrink=.5, fill=True, color='r')
ax.set_ylabel('Density of class label', size=16)
ax.set_xlabel('Class Label', size=16)
ax.set_title('Target Variable Counts', size=18)
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate('%.2f' % bar_height, (bar.get_x() + .2, bar_height + .005), color='k')
# Histograms of every numeric variable.
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)

# Count plots of every categorical variable on a three-column grid.
# `math.ceil` guarantees enough rows for all columns; `round` would
# under-allocate (4 columns -> 1 row) or yield zero rows for a single column.
n_cols = 3
n_rows = max(1, math.ceil(len(cat) / n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 30))
for ax, col in zip(fig.axes, cat):
    sns.countplot(x=col, alpha=0.7, data=df_cat, ax=ax)
    # Rotate the labels after plotting, once the category ticks exist.
    ax.tick_params(axis='x', rotation=45)
# Hide grid cells that have no variable to show.
for ax in fig.axes[len(cat):]:
    ax.set_visible(False)
fig.tight_layout()
# Distribution of `mean_ghm`, one polygon per target class.
plt.figure(figsize=(15, 6))
sns.histplot(
    data=df,
    x='mean_ghm',
    hue='target_cat',
    element='poly',
    stat='probability',
)

# Pairwise relationships between the selected features, coloured by class.
sns.set_theme(style="white", context="talk")
pairplot_columns = [*imp_feature, 'target_cat']
sns.pairplot(
    df[pairplot_columns],
    hue='target_cat',
    markers=["o", "s"],
    palette="Set2",
)
# Min-max scale the selected features to [0, 1]. An explicit copy is taken so
# the scaling never writes into (a view of) `df`.
df_norm = df[['target_cat'] + imp_feature].copy()
for col in imp_feature:
    col_min, col_max = df_norm[col].min(), df_norm[col].max()
    df_norm[col] = (df_norm[col] - col_min) / (col_max - col_min)

# Mean of every scaled feature per class, reshaped to one row per feature and
# one column per class.
df_mean = df_norm.groupby('target_cat').mean().stack().reset_index(name='mean')
df_mean.rename(columns={'level_1': 'imp_var'}, inplace=True)
df_mean = df_mean.pivot(index='imp_var', columns='target_cat', values='mean')
df_mean2 = pd.DataFrame({
    'connected': df_mean.connected,
    'nonconnected': df_mean['not connected'],
    'total': df_mean.connected + df_mean['not connected'],
})
# Order features by the combined mean so the largest bars come first.
df_mean2.sort_values(by='total', ascending=False, inplace=True)
df_mean2.reset_index(inplace=True)

# Grouped bar chart: one pair of bars (not connected / connected) per feature.
fig, ax = plt.subplots(figsize=(20, 10))
width = 0.35
xt = df_mean2.imp_var.tolist()
x = np.arange(len(xt))
sns.set_color_codes("pastel")
p1 = ax.bar(x - width / 2, df_mean2.nonconnected, width, color="#99B1B6")
sns.set_color_codes("muted")
p2 = ax.bar(x + width / 2, df_mean2.connected, width, color="#0F626C")
plt.xticks(x, xt)
plt.ylabel('Average of variable quantile as percentage')
plt.xlabel('Variable Name')
plt.title('Average of variable quantiles between connectivity classes')
plt.legend((p1[0], p2[0]), ('<30% connected', 'Connected'))
# The plotted means are fractions in [0, 1]. PercentFormatter(xmax=1.0) scales
# them to 0-100 %; a plain '%.1f%%' format would label 0.3 as "0.3%".
yticks = mtick.PercentFormatter(xmax=1.0, decimals=1)
ax.yaxis.set_major_formatter(yticks)
df_mean2
# Target against `mean_ghm`, coloured by class, with marginal distributions
# and an OLS trend line per class. The x column comes from the `target`
# variable (as in the other plots of this file) rather than a hard-coded
# "target" column name.
fig = px.scatter(
    df,
    x=target,
    y="mean_ghm",
    color="target_cat",
    marginal_y="violin",
    marginal_x="box",
    trendline="ols",
    template="simple_white",
)
fig.show()
# Min-max scale the selected features to [0, 1] on an explicit copy.
df_sum = df[['target_cat'] + imp_feature].copy()
for col in imp_feature:
    col_min, col_max = df_sum[col].min(), df_sum[col].max()
    df_sum[col] = (df_sum[col] - col_min) / (col_max - col_min)

# Sum the scaled features per class and transpose so that features are rows
# and the classes ('connected' / 'not connected') are columns. Without this
# aggregation `df_sum` has no 'connected' column and the table below cannot
# be built.
df_sum_by_class = df_sum.groupby('target_cat').sum().T
df_sum2 = pd.DataFrame({
    'connected': df_sum_by_class['connected'],
    'nonconnected': df_sum_by_class['not connected'],
    'total': df_sum_by_class['connected'] + df_sum_by_class['not connected'],
})
df_sum2.sort_values(by='total', ascending=False, inplace=True)
df_sum2

# Number of observations labelled 'not connected'.
df.target_cat.value_counts()['not connected']
# Scatter matrix of the selected features, coloured by target class.
fig = px.scatter_matrix(df, dimensions=imp_feature, color="target_cat")
fig.show()

# One bubble chart per feature: feature (log-scaled x axis) against the
# target, with marker size taken from the `population` column.
bubble_options = dict(y=target, size="population", color='target_cat', log_x=True, size_max=13)
for feature in imp_feature:
    fig = px.scatter(df, x=feature, **bubble_options)
    fig.show()

# One notched box plot per feature, split by target class.
box_options = dict(color="target_cat", notched=True)
for feature in imp_feature:
    fig = px.box(df, y=feature, **box_options)
    fig.show()
# One map per selected feature: every observation is drawn at its
# latitude/longitude and coloured by the feature's value.
# The map centre does not depend on the feature, so it is computed once.
map_center = {"lat": df.latitude.mean(), "lon": df.longitude.mean()}
for i in imp_feature:
    print(i + ':\n')
    fig = go.Figure(go.Scattermapbox(
        lat=df.latitude,
        lon=df.longitude,
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=3, color=df[i],
            showscale=True,
            colorscale=px.colors.diverging.RdYlGn,
        ),
        # Hover text is the feature value of each point (replaces a leftover
        # single-element placeholder label).
        text=df[i],
    ))
    fig.update_layout(
        hovermode='closest',
        mapbox=dict(
            accesstoken=mapbox_access_token,
            style='dark',
            center=map_center,
            bearing=0,
            pitch=0,
            zoom=3,
        )
    )
    fig.show()