In [1]:
%matplotlib inline
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests as req
import seaborn as sns
from scipy.stats import ttest_ind, ttest_rel

np.set_printoptions(precision=3)
In [2]:
# Portuguese-Wikipedia page listing Brazil's federative units by IDH (HDI).
url = 'https://pt.wikipedia.org/wiki/Lista_de_unidades_federativas_do_Brasil_por_IDH'
# In[3]:
# Fail fast: the timeout keeps the cell from hanging indefinitely, and
# raise_for_status() prevents an HTTP error page from being parsed as data.
response = req.get(url, timeout=30)
response.raise_for_status()
html_text = response.text
# In[4]:
# First "wikitable" on the page; it feeds idhm_df.
# read_html is given a file-like object because passing literal HTML text
# is deprecated in recent pandas releases.
table = pd.read_html(StringIO(html_text), attrs={"class":"wikitable"})[0]
# In[5]:
def idh_format(str):
    """Rescale a raw index value expressed in thousandths to a fraction.

    Parameters
    ----------
    str : any value accepted by ``float()``
        Raw cell value, e.g. ``"824"`` or ``824``. The name shadows the
        builtin ``str``; it is kept so the signature stays unchanged.

    Returns
    -------
    float
        ``float(str) / 1000.0`` (so ``"824"`` becomes ``0.824``).

    Raises
    ------
    ValueError, TypeError
        Propagated from ``float()`` when the value is not numeric.
    """
    return float(str) / 1000.0
# ### Preprocessing the IDH-M data
# In[6]:
"""
0,800 – 1 (Muito alto) - idh_level = 0
0,700 - 0,799 (Alto) - idh_level = 0
0,600 - 0,699 (Médio) - idh_level = 1
0,500 - 0,599 (Baixo) - idh_level = 2
0 - 0,499 (Muito baixo)- idh_level = 3
"""
def idh_level(x):
    """Map an IDH (HDI) value to an ordinal development level.

    Bands (lower bound inclusive):

    * ``x >= 0.7``        -> 0  (high and very high are merged)
    * ``0.6 <= x < 0.7``  -> 1  (medium)
    * ``0.5 <= x < 0.6``  -> 2  (low)
    * ``0   <= x < 0.5``  -> 3  (very low)

    The "very low" band now starts at 0, as the banding table describes;
    previously any value below 0.4 was rejected as invalid.

    Parameters
    ----------
    x : float
        IDH value. Values above 1 are not rejected; they fall in level 0,
        exactly as before.

    Returns
    -------
    int
        Level in ``{0, 1, 2, 3}``.

    Raises
    ------
    ValueError
        If ``x`` is negative or NaN. ``ValueError`` is an ``Exception``
        subclass, so callers catching ``Exception`` keep working.
    """
    if x >= 0.7:
        return 0
    if x >= 0.6:
        return 1
    if x >= 0.5:
        return 2
    if x >= 0:
        return 3
    # Reached only for negative values and NaN (every comparison is False).
    raise ValueError("Invalid!")
"""
Abaixo da mediana de 2000 = level 0
Igual ou acima da mediana de 2000 = level 1
"""
def idh_level2(t):
    """Build a two-level classifier split at the median of a table column.

    The threshold is the median of ``t[4]`` from row position 2 onwards,
    with each cell converted through ``float()``. It is computed once, when
    the classifier is built, instead of on every classified value.

    Parameters
    ----------
    t : table-like
        Object for which ``t[4]`` is a pandas Series (the notebook passes
        DataFrames returned by ``pd.read_html``). The first two rows are
        skipped -- presumably header rows; confirm against the scraped table.

    Returns
    -------
    callable
        ``f(x)`` returning ``1`` when ``x >= median`` and ``0`` otherwise.
        ``x`` must be on the same scale as the raw values in ``t[4]``.
    """
    # Loop-invariant: hoisted out of the per-value closure.
    median_2000 = t[4].iloc[2:].apply(float).median()

    def _level(x):
        return 1 if x >= median_2000 else 0

    return _level
# In[7]:
# Assemble the IDH-M frame from the scraped table: column 2 -> Estado,
# column 3 -> I2010, column 4 -> I2000. The first two rows are skipped and
# the raw values are rescaled from thousandths by idh_format.
idhm_df = pd.DataFrame({
    u'Estado': table[2][2:].tolist(),
    u'I2010': table[3][2:].apply(idh_format).tolist(),
    u'I2000': table[4][2:].apply(idh_format).tolist(),
})
idhm_df["Ratio"] = idhm_df["I2010"] / idhm_df["I2000"]
# Level 1 = at or above the 2000 median, level 0 = below it.
# The threshold is taken from the rescaled I2000 column itself: comparing
# rescaled values against the median of the raw (x1000) table column, as
# idh_level2(table) would, puts every state in level 0.
idhm_df["idh_level_2000"] = (
    idhm_df["I2000"] >= idhm_df["I2000"].median()
).astype("int64")
# In[15]:
# Per-state party shares, one row per state:
#   [state name, PT share, PSDB share, Outros (other parties) share]
# The column-to-party mapping is the one applied when state_parties_df is built.
# NOTE(review): the shares are presumably the fraction of 2000-2010 during
# which each party held the state government (going by the output file
# name) -- confirm.
# Mixing str and float in a single np.array coerces every cell to a string;
# the numeric columns are converted back with np.float64 further down.
st_pa = np.array([
[u"Distrito Federal", 0.0, 0.0, 1.0],
[u"São Paulo", 0.0, 0.925, 0.075],
[u"Santa Catarina", 0.0, 0.0, 1.0],
[u"Rio de Janeiro", 0.4, 0.0, 0.6],
[u"Paraná", 0.0, 0.0, 1.0],
[u"Rio Grande do Sul", 0.2, 0.4, 0.4],
[u"Espírito Santo", 0.0, 0.2, 0.8],
[u"Goiás", 0.0, 0.6, 0.4],
[u"Minas Gerais", 0.0, 0.8, 0.2],
[u"Mato Grosso do Sul", 0.0, 0.6, 0.4],
[u"Mato Grosso", 0.0, 0.2, 0.8],
[u"Amapá", 0.075, 0.0, 0.925],
[u"Roraima", 0.275, 0.4, 0.325], # double check
[u"Tocantins", 0.0, 0.2, 0.8],
[u"Rondônia", 0.0, 0.4, 0.6],
[u"Rio Grande do Norte", 0.0, 0.0, 1.0],
[u"Ceará", 0.6, 0.0, 0.4],
[u"Amazonas", 0.0, 0.0, 1.0],
[u"Pernambuco", 0.0, 0.0, 1.0],
[u"Sergipe", 0.4, 0.2, 0.4],
[u"Acre", 1.0, 0.0, 0.0],
[u"Bahia", 0.4, 0.0, 0.6],
[u"Paraíba", 0.0, 0.55, 0.45],
[u"Piauí", 0.8, 0.0, 0.2],
[u"Pará", 0.4, 0.6, 0.0],
[u"Maranhão", 0.0, 0.0, 1.0],
[u"Alagoas", 0.0, 0.0, 1.0],
])
# State -> macro-region lookup, one [state name, region] row per state,
# listed in the same state order as st_pa.
st_re = np.array([
[u"Distrito Federal", u"Centro-Oeste"],
[u"São Paulo", u"Sudeste"],
[u"Santa Catarina", u"Sul"],
[u"Rio de Janeiro", u"Sudeste"],
[u"Paraná", u"Sul"],
[u"Rio Grande do Sul", u"Sul"],
[u"Espírito Santo", u"Sudeste"],
[u"Goiás", u"Centro-Oeste"],
[u"Minas Gerais", u"Sudeste"],
[u"Mato Grosso do Sul", u"Centro-Oeste"],
[u"Mato Grosso", u"Centro-Oeste"],
[u"Amapá", u"Norte"],
[u"Roraima", u"Norte"], # double check
[u"Tocantins", u"Norte"],
[u"Rondônia", u"Norte"],
[u"Rio Grande do Norte", u"Nordeste"],
[u"Ceará", u"Nordeste"],
[u"Amazonas", u"Norte"],
[u"Pernambuco", u"Nordeste"],
[u"Sergipe", u"Nordeste"],
[u"Acre", u"Norte"],
[u"Bahia", u"Nordeste"],
[u"Paraíba", u"Nordeste"],
[u"Piauí", u"Nordeste"],
[u"Pará", u"Norte"],
[u"Maranhão", u"Nordeste"],
[u"Alagoas", u"Nordeste"],
])
# Typed frames built from the string matrices st_pa / st_re.
# st_pa column 1 holds the PT share and column 2 the PSDB share, so the
# PSDB entry deliberately reads column 2 before PT reads column 1.
state_parties_df = pd.DataFrame({
    "Estado": st_pa[:, 0],
    "PSDB": np.float64(st_pa[:, 2]),
    "PT": np.float64(st_pa[:, 1]),
    "Outros": np.float64(st_pa[:, 3]),
})
state_regions_df = pd.DataFrame({
    "Estado": st_re[:, 0],
    "Regiao": st_re[:, 1],
})
#df = idhm_df.merge(state_parties_df, on="Estado")
#df
In [3]:
# Persist the IDH-M frame. The path is relative to the kernel's working
# directory, and to_csv's default index=True also writes the row index as
# the first (unnamed) CSV column.
idhm_df.to_csv("../data/brazil_states_idh_2000_2010.csv")
In [4]:
# Persist the per-state party shares (row index included as the first column).
state_parties_df.to_csv("../data/brazil_states_parties_2000-2010.csv")
In [5]:
# Persist the state -> region lookup (row index included as the first column).
state_regions_df.to_csv("../data/brazil_states_regions.csv")
In [6]:
# Second "wikitable" on the page; it feeds idhr_df.
# NOTE: this rebinds `table`, which until now held the first table.
# read_html is given a file-like object because passing literal HTML text
# is deprecated in recent pandas releases.
table = pd.read_html(StringIO(html_text), attrs={"class":"wikitable"})[1]
table
Out[6]:
In [7]:
# Spot check: build the median classifier for the current table and
# classify 0.5 (1 if 0.5 >= the median of column 4 from row position 2 on,
# else 0).
f = idh_level2(table)
f(0.5)
Out[7]:
In [8]:
# Assemble the IDH-R frame from the current table: column 2 -> Estado,
# column 3 -> I2010, column 4 -> I2000. The first two rows are skipped and
# the values are taken as-is through float() (no /1000 rescale here).
idhr_df = pd.DataFrame({
    u'Estado': table[2][2:].tolist(),
    u'I2010': table[3][2:].apply(float).tolist(),
    u'I2000': table[4][2:].apply(float).tolist(),
})
idhr_df["Ratio"] = idhr_df["I2010"].div(idhr_df["I2000"])
# 1 = at or above the 2000 median of the table column, 0 = below it.
idhr_df["idh_level_2000"] = idhr_df["I2000"].map(idh_level2(table))
In [9]:
# Display the assembled frame for visual inspection.
idhr_df
Out[9]:
In [10]:
# Third "wikitable" on the page (rebinds `table`). read_html is given a
# file-like object because passing literal HTML text is deprecated in
# recent pandas releases.
table = pd.read_html(StringIO(html_text), attrs={"class":"wikitable"})[2]
# Column 2 -> Estado, column 3 -> I2010, column 4 -> I2000; the first two
# rows are skipped and raw values are rescaled from thousandths by idh_format.
idhl_df = pd.DataFrame({
    u'Estado': table[2][2:].tolist(),
    u'I2010': table[3][2:].apply(idh_format).tolist(),
    u'I2000': table[4][2:].apply(idh_format).tolist(),
})
idhl_df["Ratio"] = idhl_df["I2010"] / idhl_df["I2000"]
# Split at the 2000 *median*, matching the documented rule and the other
# index frames (this cell previously used the mean, recomputed per row).
# The threshold comes from the rescaled column so both sides of the
# comparison share the same scale.
idhl_df["idh_level_2000"] = (
    idhl_df["I2000"] >= idhl_df["I2000"].median()
).astype("int64")
In [11]:
# Display the assembled frame for visual inspection.
idhl_df
Out[11]:
In [12]:
# Fourth "wikitable" on the page (rebinds `table`). read_html is given a
# file-like object because passing literal HTML text is deprecated in
# recent pandas releases.
table = pd.read_html(StringIO(html_text), attrs={"class":"wikitable"})[3]
# Column 2 -> Estado, column 3 -> I2010, column 4 -> I2000; the first two
# rows are skipped and values are taken as-is through float().
idhe_df = pd.DataFrame({
    u'Estado': table[2][2:].tolist(),
    u'I2010': table[3][2:].apply(float).tolist(),
    u'I2000': table[4][2:].apply(float).tolist(),
})
idhe_df["Ratio"] = idhe_df["I2010"] / idhe_df["I2000"]
# 1 = at or above the 2000 median of the table column, 0 = below it.
idhe_df["idh_level_2000"] = idhe_df["I2000"].apply(idh_level2(table))
In [13]:
# Display the assembled frame for visual inspection.
idhe_df
Out[13]:
In [14]:
# Persist idhr_df (row index included as the first, unnamed CSV column).
idhr_df.to_csv("../data/brazil_states_idhr_2000_2010.csv")
In [15]:
# Persist idhl_df (row index included as the first, unnamed CSV column).
idhl_df.to_csv("../data/brazil_states_idhl_2000_2010.csv")
In [16]:
# Persist idhe_df (row index included as the first, unnamed CSV column).
idhe_df.to_csv("../data/brazil_states_idhe_2000_2010.csv")
In [ ]: