Group components:
Short explanation video (PT-BR): https://youtu.be/2JaCGJ2HU40
In [192]:
# System libraries
import os
import sys
# Basic libraries for data analysis
import numpy as np
from numpy import random
import pandas as pd
# Libraries needed for the choropleth
## GeoJSON data
import json
## Used to create shapes in folium
from shapely.geometry import Polygon
from shapely.geometry import Point
## The choropleth itself
import folium
## Colormap
from branca.colormap import linear
In [2]:
# path to the dataset
dataset_pop_2017 = os.path.join('data', 'population_2017.csv')
# read the data into a dataframe
data2017 = pd.read_csv(dataset_pop_2017)
# replace spaces in column names with underscores
data2017.columns = [cols.replace(' ', '_') for cols in data2017.columns]
data2017.head()
Out[2]:
In [163]:
# Filter the data down to the nine states of Brazil's Northeast
northeast_ufs = ['MA', 'PI', 'CE', 'RN', 'PB', 'PE', 'AL', 'SE', 'BA']
dataStateNames = data2017[data2017['UF'].isin(northeast_ufs)]
# Used to diff municipalities
#dataStateNames.to_csv('nomesIBGE_CidadesOrdenado.csv')
# Sort the dataset by municipality name
dataStateNames = dataStateNames.sort_values('NOME_DO_MUNICÍPIO')
dataStateNames
Out[163]:
In [164]:
# The GeoJSON files live in geojson/geojs-XX-mun.json, where XX is the IBGE state
# code: 21=MA, 22=PI, 23=CE, 24=RN, 25=PB, 26=PE, 27=AL, 28=SE, 29=BA.
# Load each file with 'latin-1' encoding (the municipality names are accented)
# and merge all features into a single GeoJSON FeatureCollection.
geo_json_data_northeast = {'type': 'FeatureCollection', 'features': []}
for state_code in range(21, 30):
    state_file = os.path.join('geojson', 'geojs-{}-mun.json'.format(state_code))
    with open(state_file, encoding='latin-1') as f:
        state_geojson = json.load(f)
    geo_json_data_northeast['features'].extend(state_geojson['features'])
In [166]:
# Used to diff municipalities
i = 0
for city in geo_json_data_northeast['features']:
    #print(str(i) + ' ' + city['properties']['name'])
    print(city['properties']['name'])
    i = i + 1
We found some mismatches between the municipality names in the IBGE data and the names in the GeoJSON files. The names that did not match are listed and fixed one by one in the cells below.
Other references: https://ww2.ibge.gov.br/home/estatistica/populacao/estimativa2011/tab_Municipios_TCU.pdf and https://biblioteca.ibge.gov.br/visualizacao/dtbs/pernambuco/quixaba.pdf
We did not find any GeoJSON data for the municipality of Nazária - PI, so we decided to remove Nazária from the IBGE data: Nazária was emancipated from Teresina, the capital of Piauí, and in the GeoJSON its territory is still attached to Teresina.
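One way to list the mismatches programmatically is to compare the two sets of names. The following is a minimal sketch (run before the fixes below), reusing the dataStateNames and geo_json_data_northeast variables defined above:
# Names present in one source but not in the other
ibge_names = set(dataStateNames['NOME_DO_MUNICÍPIO'])
geojson_names = set(f['properties']['name'] for f in geo_json_data_northeast['features'])
print('In IBGE but not in the GeoJSON:', sorted(ibge_names - geojson_names))
print('In the GeoJSON but not in IBGE:', sorted(geojson_names - ibge_names))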
In [168]:
#Belém de São Francisco -> Belém do São Francisco
geo_json_data_northeast['features'][1031]['properties']['description'] = 'Belém do São Francisco'
geo_json_data_northeast['features'][1031]['properties']['name'] = 'Belém do São Francisco'
print(geo_json_data_northeast['features'][1031]['properties']['name'])
In [169]:
#Campo de Santana -> Tacima
geo_json_data_northeast['features'][1003]['properties']['description'] = 'Tacima'
geo_json_data_northeast['features'][1003]['properties']['name'] = 'Tacima'
print(geo_json_data_northeast['features'][1003]['properties']['name'])
In [170]:
#Gracho Cardoso -> Graccho Cardoso
geo_json_data_northeast['features'][1324]['properties']['description'] = 'Graccho Cardoso'
geo_json_data_northeast['features'][1324]['properties']['name'] = 'Graccho Cardoso'
print(geo_json_data_northeast['features'][1324]['properties']['name'])
In [171]:
#Iguaraci -> Iguaracy
geo_json_data_northeast['features'][1089]['properties']['description'] = 'Iguaracy'
geo_json_data_northeast['features'][1089]['properties']['name'] = 'Iguaracy'
print(geo_json_data_northeast['features'][1089]['properties']['name'])
In [172]:
# Itapagé -> Itapajé
geo_json_data_northeast['features'][526]['properties']['description'] = 'Itapajé'
geo_json_data_northeast['features'][526]['properties']['name'] = 'Itapajé'
print(geo_json_data_northeast['features'][526]['properties']['name'])
In [173]:
# Santarém -> Joca Claudino
geo_json_data_northeast['features'][964]['properties']['description'] = 'Joca Claudino'
geo_json_data_northeast['features'][964]['properties']['name'] = 'Joca Claudino'
print(geo_json_data_northeast['features'][964]['properties']['name'])
In [174]:
# Lagoa do Itaenga -> Lagoa de Itaenga
geo_json_data_northeast['features'][1111]['properties']['description'] = 'Lagoa de Itaenga'
geo_json_data_northeast['features'][1111]['properties']['name'] = 'Lagoa de Itaenga'
print(geo_json_data_northeast['features'][1111]['properties']['name'])
In [175]:
# Quixabá -> Quixaba
geo_json_data_northeast['features'][1144]['properties']['description'] = 'Quixaba'
geo_json_data_northeast['features'][1144]['properties']['name'] = 'Quixaba'
print(geo_json_data_northeast['features'][1144]['properties']['name'])
In [176]:
# Quixabá -> Quixaba
geo_json_data_northeast['features'][946]['properties']['description'] = 'Quixaba'
geo_json_data_northeast['features'][946]['properties']['name'] = 'Quixaba'
print(geo_json_data_northeast['features'][946]['properties']['name'])
In [177]:
# Presidente Juscelino->Serra Caiada
geo_json_data_northeast['features'][736]['properties']['description'] = 'Serra Caiada'
geo_json_data_northeast['features'][736]['properties']['name'] = 'Serra Caiada'
print(geo_json_data_northeast['features'][736]['properties']['name'])
In [178]:
# Seridó->São Vicente do Seridó
geo_json_data_northeast['features'][990]['properties']['description'] = 'São Vicente do Seridó'
geo_json_data_northeast['features'][990]['properties']['name'] = 'São Vicente do Seridó'
print(geo_json_data_northeast['features'][990]['properties']['name'])
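The eleven fixes above could equally be applied in a single pass from a rename map; a minimal sketch using the same feature indices and names as the cells above:
# Sketch: apply all renames at once from a {feature index: corrected name} map
renames = {
    1031: 'Belém do São Francisco',
    1003: 'Tacima',
    1324: 'Graccho Cardoso',
    1089: 'Iguaracy',
    526: 'Itapajé',
    964: 'Joca Claudino',
    1111: 'Lagoa de Itaenga',
    1144: 'Quixaba',
    946: 'Quixaba',
    736: 'Serra Caiada',
    990: 'São Vicente do Seridó',
}
for idx, new_name in renames.items():
    geo_json_data_northeast['features'][idx]['properties']['name'] = new_name
    geo_json_data_northeast['features'][idx]['properties']['description'] = new_name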
In [181]:
# inspect the Nazária row before removing it
dataStateNames[(dataStateNames['NOME_DO_MUNICÍPIO']=='Nazária')]
Out[181]:
Removing Nazária from the municipalities of IBGE
In [233]:
# Removing Nazária from the municipalities of IBGE
dataStateNames = dataStateNames[dataStateNames['NOME_DO_MUNICÍPIO']!='Nazária']
len(dataStateNames)
Out[233]:
In [191]:
# confirm Nazária has been removed
dataStateNames[dataStateNames['NOME_DO_MUNICÍPIO']=='Nazária']
Out[191]:
In [232]:
cities_ne = []
# list all municipalities in the merged Northeast GeoJSON
for city in geo_json_data_northeast['features']:
    cities_ne.append(city['properties']['description'])
len(cities_ne)
Out[232]:
In [30]:
# dataStateNames already holds the filtered data for all nine Northeast states
# (with Nazária removed), so we use it directly as the Northeast dataframe
dataNordeste = dataStateNames.copy()
print(len(dataNordeste))
# adjust the code and population columns to integer types
dataNordeste['COD._UF'] = dataNordeste['COD._UF'].astype(int)
dataNordeste['COD._MUNIC'] = dataNordeste['COD._MUNIC'].astype(int)
dataNordeste['POPULAÇÃO_ESTIMADA'] = dataNordeste['POPULAÇÃO_ESTIMADA'].astype(int)
dataNordeste.dtypes
Out[30]:
After all these steps to make the population data and the GeoJSON data agree on the municipality names, we can now proceed to create the choropleth itself.
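As a quick sanity check (a minimal sketch reusing the variables above), the symmetric difference between the two sets of names should now be empty:
# After the fixes, IBGE and GeoJSON should agree on every municipality name
geojson_names = {f['properties']['name'] for f in geo_json_data_northeast['features']}
ibge_names = set(dataNordeste['NOME_DO_MUNICÍPIO'])
print(geojson_names ^ ibge_names)  # expected to be an empty set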
In [226]:
dataNordeste.head()
Out[226]:
In [231]:
# Map from municipality name to estimated population
dataNordeste_dictionary = dataNordeste.set_index('NOME_DO_MUNICÍPIO')['POPULAÇÃO_ESTIMADA']
print(len(dataNordeste))
# Alternative key: UF + municipality name, unique even when names repeat across states (e.g. Quixaba)
dataNordeste['id'] = dataNordeste['UF'] + dataNordeste['NOME_DO_MUNICÍPIO']
dataNordeste_dict = dataNordeste.set_index('id')['POPULAÇÃO_ESTIMADA']
print(len(dataNordeste_dictionary))
print(len(dataNordeste_dict))
In [195]:
colorscale = linear.YlGnBu.scale(dataNordeste['POPULAÇÃO_ESTIMADA'].min(), dataNordeste['POPULAÇÃO_ESTIMADA'].max())
colorscale
Out[195]:
In [241]:
# Create the map object, centered on Brazil's Northeast
m8 = folium.Map(
    location=[-10.116657, -42.542580],
    zoom_start=6,
    tiles='cartodbpositron'
)
We could use a threshold scale to differentiate the cities by color. A common practice is to split the data range linearly with NumPy:
np.linspace(MIN, MAX, STEPS, dtype=TYPE).tolist()
The branca library also offers a way to build a stepped scale, but we chose not to split the population range linearly and map the colors onto that. A linear split only highlights the extremes: nearly every municipality falls into the lowest band, and only the megacities stand out. Instead we set the thresholds manually, with cut points at 20k, 100k, 300k, 1M, 1.5M and 2.5M inhabitants. This way the main cities stand out, while the great majority of municipalities, with well under 150k inhabitants, share the same lower colors. The table below compares the two scales, and a sketch of the branca alternative follows it.
| Threshold scale | 1 (min) | 2 | 3 | 4 | 5 | 6 (max) |
|---|---|---|---|---|---|---|
| np.linspace | 1228 | 591779 | 1182331 | 1772882 | 2363434 | 2953986 |
| our division | 20000 | 100000 | 300000 | 1000000 | 1500000 | 2500000 |
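For reference, here is a minimal sketch of the branca alternative mentioned above, assuming branca's LinearColormap.to_step method (which we did not use); by default it splits the range evenly, which is exactly the behaviour we wanted to avoid:
# Sketch only: derive a stepped colormap from the linear one (even split by default).
# Passing index=[20000, 100000, 300000, 1000000, 1500000, 2500000] instead would
# reproduce the manual cut points used below.
stepped_colorscale = colorscale.to_step(n=6)
stepped_colorscale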
In [242]:
m8.add_child(folium.LatLngPopup())
# linear threshold scale (shown only for comparison)
threshold_scale = np.linspace(dataNordeste['POPULAÇÃO_ESTIMADA'].min(),
                              dataNordeste['POPULAÇÃO_ESTIMADA'].max(), 6, dtype=int).tolist()
print(threshold_scale)
#threshold_scale = [dataNordeste['POPULAÇÃO_ESTIMADA'].min(), 250000, 800000, 150000, 200000, dataNordeste['POPULAÇÃO_ESTIMADA'].max()]
# manual threshold scale actually used for the choropleth legend
threshold_scale = [20000, 100000, 300000, 1000000, 1500000, 2500000]
print(threshold_scale)
In [243]:
m8.choropleth(
    geo_data=geo_json_data_northeast,
    data=dataNordeste,
    columns=['NOME_DO_MUNICÍPIO', 'POPULAÇÃO_ESTIMADA'],
    key_on='feature.properties.name',
    fill_color='YlGnBu',
    legend_name='Population estimation (2017)',
    highlight=True,
    threshold_scale=threshold_scale,
    line_color='green',
    line_weight=0.2,
    line_opacity=0.6
)
m8.save('outputFolium.html')