In [1]:
%matplotlib inline
import pandas as pd
import requests as req
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, ttest_rel

np.set_printoptions(precision=3)

In [2]:
url = 'http://pt.wikipedia.org/wiki/Lista_de_unidades_federativas_do_Brasil_por_IDH'


# In[3]:

html_text = req.get(url).text


# In[4]:

table = pd.read_html(html_text, attrs={"class":"wikitable"})[0]


# In[5]:

def idh_format(str):
    num = float(str)/1000.0
    return num


# ### Pré-Processando IDH-M Data

# In[6]:

"""
  0,800 – 1 (Muito alto) - idh_level = 0
  0,700 - 0,799 (Alto)   - idh_level = 0
  0,600 - 0,699 (Médio)  - idh_level = 1
  0,500 - 0,599 (Baixo)  - idh_level = 2
  0 - 0,499 (Muito baixo)- idh_level = 3
"""
def idh_level(x):
    if x >= 0.7:
        return 0
    elif 0.6 <= x < 0.7:
        return 1
    elif 0.5 <= x < 0.6:
        return 2
    elif 0.4 <= x < 0.5:
        return 3
    else: raise Exception("Invalid!")
    
"""
  Abaixo da mediana de 2000 = level 0
  Igual ou acima da mediana de 2000 = level 1
"""
def idh_level2(t):
    def __level(x):
        if x >= t[4][2:].apply(lambda x: float(x)).median():
            return 1
        else: return 0   
    return __level

# In[7]:

idhm_df = pd.DataFrame({u'Estado':table[2][2:].tolist(),u'I2010':table[3][2:].apply(idh_format).tolist(),u'I2000':table[4][2:].apply(idh_format).tolist()})
idhm_df["Ratio"] = idhm_df["I2010"]/idhm_df["I2000"]
idhm_df["idh_level_2000"] = idhm_df["I2000"].apply(idh_level2(table))



# In[15]:

st_pa = np.array([
        [u"Distrito Federal", 0.0, 0.0, 1.0],
        [u"São Paulo", 0.0, 0.925, 0.075],
        [u"Santa Catarina", 0.0, 0.0, 1.0],
        [u"Rio de Janeiro", 0.4, 0.0, 0.6],
        [u"Paraná", 0.0, 0.0, 1.0],
        [u"Rio Grande do Sul", 0.2, 0.4, 0.4],
        [u"Espírito Santo", 0.0, 0.2, 0.8],
        [u"Goiás", 0.0, 0.6, 0.4],
        [u"Minas Gerais", 0.0, 0.8, 0.2],
        [u"Mato Grosso do Sul", 0.0, 0.6, 0.4],
        [u"Mato Grosso", 0.0, 0.2, 0.8],
        [u"Amapá", 0.075, 0.0, 0.925],
        [u"Roraima", 0.275, 0.4, 0.325], # double check
        [u"Tocantins", 0.0, 0.2, 0.8], 
        [u"Rondônia", 0.0, 0.4, 0.6],
        [u"Rio Grande do Norte", 0.0, 0.0, 1.0],
        [u"Ceará", 0.6, 0.0, 0.4],
        [u"Amazonas", 0.0, 0.0, 1.0],
        [u"Pernambuco", 0.0, 0.0, 1.0],
        [u"Sergipe", 0.4, 0.2, 0.4],
        [u"Acre", 1.0, 0.0, 0.0],
        [u"Bahia", 0.4, 0.0, 0.6],
        [u"Paraíba", 0.0, 0.55, 0.45],
        [u"Piauí", 0.8, 0.0, 0.2],
        [u"Pará", 0.4, 0.6, 0.0],
        [u"Maranhão", 0.0, 0.0, 1.0],
        [u"Alagoas", 0.0, 0.0, 1.0],
       ])


st_re = np.array([
        [u"Distrito Federal", u"Centro-Oeste"],
        [u"São Paulo", u"Sudeste"],
        [u"Santa Catarina", u"Sul"],
        [u"Rio de Janeiro", u"Sudeste"],
        [u"Paraná", u"Sul"],
        [u"Rio Grande do Sul", u"Sul"],
        [u"Espírito Santo", u"Sudeste"],
        [u"Goiás", u"Centro-Oeste"],
        [u"Minas Gerais", u"Sudeste"],
        [u"Mato Grosso do Sul", u"Centro-Oeste"],
        [u"Mato Grosso", u"Centro-Oeste"],
        [u"Amapá", u"Norte"],
        [u"Roraima", u"Norte"], # double check
        [u"Tocantins", u"Norte"], 
        [u"Rondônia", u"Norte"],
        [u"Rio Grande do Norte", u"Nordeste"],
        [u"Ceará", u"Nordeste"],
        [u"Amazonas", u"Norte"],
        [u"Pernambuco", u"Nordeste"],
        [u"Sergipe", u"Nordeste"],
        [u"Acre", u"Norte"],
        [u"Bahia", u"Nordeste"],
        [u"Paraíba", u"Nordeste"],
        [u"Piauí", u"Nordeste"],
        [u"Pará", u"Norte"],
        [u"Maranhão", u"Nordeste"],
        [u"Alagoas", u"Nordeste"],
       ])


state_parties_df = pd.DataFrame({"Estado":st_pa[:,0],"PSDB":np.float64(st_pa[:,2]),"PT":np.float64(st_pa[:,1]),"Outros":np.float64(st_pa[:,3])})
state_regions_df = pd.DataFrame({"Estado":st_re[:,0],"Regiao":st_re[:,1]})

#df = idhm_df.merge(state_parties_df, on="Estado")
#df

In [3]:
idhm_df.to_csv("../data/brazil_states_idh_2000_2010.csv")

In [4]:
state_parties_df.to_csv("../data/brazil_states_parties_2000-2010.csv")

In [5]:
state_regions_df.to_csv("../data/brazil_states_regions.csv")

In [6]:
table = pd.read_html(html_text, attrs={"class":"wikitable"})[1]
table


Out[6]:
0 1 2 3 4
0 Posição Unidades federativas IDHM-Renda NaN NaN
1 Dados de 2010 Comparados aos de 2000 Em 2010 Em 2000 NaN
2 1 (0) Distrito Federal 0.863 0.805
3 2 (0) São Paulo 0.789 0.756
4 3 (0) Rio de Janeiro 0.782 0.745
5 4 (1) Santa Catarina 0.773 0.717
6 5 (1) Rio Grande do Sul 0.769 0.720
7 6 (0) Paraná 0.757 0.704
8 7 (2) Espírito Santo 0.743 0.687
9 8 (2) Goiás 0.742 0.686
10 9 (1) Mato Grosso do Sul 0.740 0.687
11 10 (3) Mato Grosso 0.732 0.689
12 11 (0) Minas Gerais 0.730 0.680
13 12 (0) Rondônia 0.712 0.654
14 13 (0) Roraima 0.695 0.652
15 14 (0) Amapá 0.694 0.638
16 15 (4) Tocantins 0.690 0.605
17 16 (1) Rio Grande do Norte 0.678 0.608
18 17 (1) Amazonas 0.677 0.608
19 18 (3) Pernambuco 0.673 0.615
20 19 (2) Sergipe 0.672 0.596
21 20 (4) Acre 0.671 0.612
22 21 (1) Bahia 0.663 0.594
23 22 (2) Paraíba 0.656 0.582
24 23 (0) Ceará 0.651 0.588
25 24 (4) Pará 0.646 0.601
26 25 (0) Alagoas 0.641 0.574
27 26 (0) Piauí 0.635 0.556
28 27 (0) Maranhão 0.612 0.531

In [7]:
f = idh_level2(table)
f(0.5)


Out[7]:
0

In [8]:
idhr_df = pd.DataFrame({u'Estado':table[2][2:].tolist(),u'I2010':table[3][2:].apply(lambda x: float(x)).tolist(),u'I2000':table[4][2:].apply(lambda x: float(x)).tolist()})
idhr_df["Ratio"] = idhr_df["I2010"]/idhr_df["I2000"]
idhr_df["idh_level_2000"] = idhr_df["I2000"].apply(idh_level2(table))

In [9]:
idhr_df


Out[9]:
Estado I2000 I2010 Ratio idh_level_2000
0 Distrito Federal 0.805 0.863 1.072050 1
1 São Paulo 0.756 0.789 1.043651 1
2 Rio de Janeiro 0.745 0.782 1.049664 1
3 Santa Catarina 0.717 0.773 1.078103 1
4 Rio Grande do Sul 0.720 0.769 1.068056 1
5 Paraná 0.704 0.757 1.075284 1
6 Espírito Santo 0.687 0.743 1.081514 1
7 Goiás 0.686 0.742 1.081633 1
8 Mato Grosso do Sul 0.687 0.740 1.077147 1
9 Mato Grosso 0.689 0.732 1.062409 1
10 Minas Gerais 0.680 0.730 1.073529 1
11 Rondônia 0.654 0.712 1.088685 1
12 Roraima 0.652 0.695 1.065951 1
13 Amapá 0.638 0.694 1.087774 1
14 Tocantins 0.605 0.690 1.140496 0
15 Rio Grande do Norte 0.608 0.678 1.115132 0
16 Amazonas 0.608 0.677 1.113487 0
17 Pernambuco 0.615 0.673 1.094309 0
18 Sergipe 0.596 0.672 1.127517 0
19 Acre 0.612 0.671 1.096405 0
20 Bahia 0.594 0.663 1.116162 0
21 Paraíba 0.582 0.656 1.127148 0
22 Ceará 0.588 0.651 1.107143 0
23 Pará 0.601 0.646 1.074875 0
24 Alagoas 0.574 0.641 1.116725 0
25 Piauí 0.556 0.635 1.142086 0
26 Maranhão 0.531 0.612 1.152542 0

In [10]:
table = pd.read_html(html_text, attrs={"class":"wikitable"})[2]
idhl_df = pd.DataFrame({u'Estado':table[2][2:].tolist(),u'I2010':table[3][2:].apply(idh_format).tolist(),u'I2000':table[4][2:].apply(idh_format).tolist()})
idhl_df["Ratio"] = idhl_df["I2010"]/idhl_df["I2000"]
idhl_df["idh_level_2000"] = idhl_df["I2000"].apply(lambda x: 1 if x >= idhl_df["I2000"].mean() else 0)

In [11]:
idhl_df


Out[11]:
Estado I2000 I2010 Ratio idh_level_2000
0 Distrito Federal 0.814 0.873 1.072482 1
1 Santa Catarina 0.812 0.860 1.059113 1
2 São Paulo 0.786 0.845 1.075064 1
3 Rio Grande do Sul 0.804 0.840 1.044776 1
4 Minas Gerais 0.759 0.838 1.104084 1
5 Rio de Janeiro 0.740 0.835 1.128378 1
6 Espírito Santo 0.777 0.835 1.074646 1
7 Mato Grosso do Sul 0.752 0.833 1.107713 1
8 Paraná 0.747 0.830 1.111111 1
9 Goiás 0.773 0.827 1.069858 1
10 Mato Grosso 0.740 0.821 1.109459 1
11 Amapá 0.711 0.813 1.143460 0
12 Roraima 0.717 0.809 1.128312 0
13 Amazonas 0.692 0.805 1.163295 0
14 Rondônia 0.688 0.800 1.162791 0
15 Ceará 0.713 0.793 1.112202 0
16 Tocantins 0.688 0.793 1.152616 0
17 Rio Grande do Norte 0.700 0.792 1.131429 0
18 Pernambuco 0.705 0.789 1.119149 0
19 Pará 0.725 0.789 1.088276 1
20 Bahia 0.680 0.783 1.151471 0
21 Paraíba 0.672 0.783 1.165179 0
22 Sergipe 0.678 0.781 1.151917 0
23 Piauí 0.676 0.777 1.149408 0
24 Acre 0.694 0.777 1.119597 0
25 Maranhão 0.649 0.757 1.166410 0
26 Alagoas 0.647 0.755 1.166924 0

In [12]:
table = pd.read_html(html_text, attrs={"class":"wikitable"})[3]
idhe_df = pd.DataFrame({u'Estado':table[2][2:].tolist(),u'I2010':table[3][2:].apply(lambda x: float(x)).tolist(),u'I2000':table[4][2:].apply(lambda x: float(x)).tolist()})
idhe_df["Ratio"] = idhe_df["I2010"]/idhe_df["I2000"]
idhe_df["idh_level_2000"] = idhe_df["I2000"].apply(idh_level2(table))

In [13]:
idhe_df


Out[13]:
Estado I2000 I2010 Ratio idh_level_2000
0 Distrito Federal 0.582 0.742 1.274914 1
1 São Paulo 0.581 0.719 1.237522 1
2 Santa Catarina 0.526 0.697 1.325095 1
3 Rio de Janeiro 0.530 0.675 1.273585 1
4 Paraná 0.522 0.668 1.279693 1
5 Espírito Santo 0.491 0.653 1.329939 1
6 Goiás 0.439 0.646 1.471526 1
7 Rio Grande do Sul 0.505 0.642 1.271287 1
8 Minas Gerais 0.470 0.638 1.357447 1
9 Mato Grosso 0.426 0.635 1.490610 1
10 Mato Grosso do Sul 0.445 0.629 1.413483 1
11 Amapá 0.424 0.629 1.483491 1
12 Roraima 0.457 0.628 1.374179 1
13 Tocantins 0.348 0.624 1.793103 0
14 Ceará 0.377 0.615 1.631300 0
15 Rio Grande do Norte 0.396 0.597 1.507576 1
16 Rondônia 0.345 0.577 1.672464 0
17 Pernambuco 0.372 0.574 1.543011 0
18 Maranhão 0.312 0.562 1.801282 0
19 Amazonas 0.324 0.561 1.731481 0
20 Sergipe 0.343 0.560 1.632653 0
21 Acre 0.325 0.559 1.720000 0
22 Bahia 0.332 0.555 1.671687 0
23 Paraíba 0.331 0.555 1.676737 0
24 Piauí 0.301 0.547 1.817276 0
25 Pará 0.319 0.528 1.655172 0
26 Alagoas 0.282 0.520 1.843972 0

In [14]:
idhr_df.to_csv("../data/brazil_states_idhr_2000_2010.csv")

In [15]:
idhl_df.to_csv("../data/brazil_states_idhl_2000_2010.csv")

In [16]:
idhe_df.to_csv("../data/brazil_states_idhe_2000_2010.csv")

In [ ]: