In [1]:
%matplotlib inline
import pandas as pd
import requests as req
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind, ttest_rel
from scipy.stats import gaussian_kde
from statsmodels.formula.api import ols, mixedlm, gee
from statsmodels.stats.outliers_influence import OLSInfluence
from statsmodels.regression.linear_model import OLSResults
from patsy import dmatrix
np.set_printoptions(precision=3)
In [2]:
# Load the Brazilian-states IDHM table; the first CSV column becomes the index.
# NOTE(review): later cells read the columns I2000, I2010 and Ratio from this
# frame and merge it on "Estado" -- confirm the CSV schema matches.
idhm_df = pd.read_csv("../data/brazil_states_idhe_2000_2010.csv", index_col=0)
idhm_df
Out[2]:
In [3]:
# Summary statistics for the numeric columns of the IDHM table.
idhm_df.describe()
Out[3]:
In [4]:
# Distribution of the 2000 index, the 2010 index and their ratio.
# DataFrame.hist opens its own figure, so the former `plt.figure(14)` only left
# an unused, empty figure behind; it has been dropped.
dist_columns = ["I2000", "I2010", "Ratio"]
idhm_df[dist_columns].hist(bins=10)

# Overlay the three kernel density estimates on one fresh figure.
plt.figure()
for dist_column in dist_columns:
    # NOTE(review): `shade=` is kept for the seaborn release this notebook
    # targets; newer seaborn spells it `fill=`.
    sns.kdeplot(idhm_df[dist_column], shade=True)
In [5]:
# Paired t-test: the same rows are measured in 2000 and in 2010.
ttest_rel(a=idhm_df['I2000'], b=idhm_df['I2010'])
Out[5]:
In [6]:
import scipy
import scikits.bootstrap as bootstrap

# Bootstrap confidence intervals around the mean of each series.
# The statistic and the 95% level are whatever bootstrap.ci uses by default.
CIs00 = bootstrap.ci(data=idhm_df["I2000"])
CIs10 = bootstrap.ci(data=idhm_df["I2010"])
CIsR = bootstrap.ci(data=idhm_df["Ratio"])

# Each interval is unpacked directly into the (low, high) format slots.
print("IDHM 2000 mean 95% confidence interval. Low={0:.3f}\tHigh={1:.3f}".format(*CIs00))
print("IDHM 2010 mean 95% confidence interval. Low={0:.3f}\tHigh={1:.3f}".format(*CIs10))
print("IDHM ratio mean 95% confidence interval. Low={0:.3f}\tHigh={1:.3f}".format(*CIsR))
In [7]:
# Bootstrap confidence intervals around the median of each series.
# np.median replaces scipy.median: the latter was only a deprecated re-export
# of the NumPy function and is not available in newer SciPy releases.
CIs00 = bootstrap.ci(data=idhm_df["I2000"], statfunction=np.median)
CIs10 = bootstrap.ci(data=idhm_df["I2010"], statfunction=np.median)
CIsR = bootstrap.ci(data=idhm_df["Ratio"], statfunction=np.median)
print("IDHM 2000 median 95% confidence interval. Low={0:.3f}\tHigh={1:.3f}".format(*tuple(CIs00)))
print("IDHM 2010 median 95% confidence interval. Low={0:.3f}\tHigh={1:.3f}".format(*tuple(CIs10)))
print("IDHM ratio median 95% confidence interval. Low={0:.3f}\tHigh={1:.3f}".format(*tuple(CIsR)))
A resposta de diversos testes, para um nível de 5% de significância, mostra que há fortes evidências de que sim.
In [8]:
# Load the per-state party table (2000-2010); the first CSV column becomes the index.
# NOTE(review): later cells use the columns PT, PSDB and Outros -- presumably
# these come from this file; confirm against the CSV.
state_parties_df = pd.read_csv("../data/brazil_states_parties_2000-2010.csv", index_col=0)
In [9]:
# Display the party table loaded in the previous cell.
state_parties_df
Out[9]:
In [10]:
# Load the state-to-region lookup; the first CSV column becomes the index.
# NOTE(review): the "Regiao" column used by later cells presumably comes from
# this file -- confirm against the CSV.
state_regions_df = pd.read_csv("../data/brazil_states_regions.csv", index_col=0)
state_regions_df
Out[10]:
In [11]:
# Join the three per-state tables on "Estado" into the single analysis frame.
# merge() uses its default join type for both steps.
df = (
    idhm_df
    .merge(state_parties_df, on="Estado")
    .merge(state_regions_df, on="Estado")
)
df
Out[11]:
In [12]:
# Box plot of Ratio for each idh_level_2000 category.
sns.factorplot(x="idh_level_2000", y="Ratio", data=df, kind="box")
Out[12]:
In [13]:
# Box plot of Ratio for each Regiao category.
sns.factorplot(x="Regiao", y="Ratio", data=df, kind="box")
Out[13]:
In [14]:
# Reset seaborn to its default theme, then draw pairwise scatter plots of the
# merged frame, coloured by idh_level_2000.
sns.set()
sns.pairplot(data=df, hue="idh_level_2000", size=2.5)
Out[14]:
In [15]:
# Coefficient plot for Ratio against all three party columns plus the
# categorical idh_level_2000, without an intercept ("- 1").
sns.coefplot("Ratio ~ PT + PSDB + Outros + C(idh_level_2000) - 1", data=df, palette="Set1");
In [16]:
# Coefficient plot for Ratio against Outros, with a separate indicator term
# for rows where Outros == 0; no intercept.
sns.coefplot("Ratio ~ Outros==0 + Outros - 1", data=df, palette="Set1");
In [17]:
sns.set(style="whitegrid")
# Residuals of a first-order fit of Ratio on Outros, with a lowess smoother.
sns.residplot(x=df.Outros, y=df.Ratio, color="navy", lowess=True, order=1)
Out[17]:
In [18]:
# Coefficient plot for Ratio against PT, with a separate indicator term for
# rows where PT == 0; no intercept.
sns.coefplot("Ratio ~ PT==0 + PT - 1", data=df, palette="Set1");
In [19]:
sns.set(style="whitegrid")
# Residuals of a first-order fit of Ratio on PT, restricted to rows with PT > 0.
pt_rows = df[df.PT > 0]
sns.residplot(x=pt_rows.PT, y=pt_rows.Ratio, color="navy", order=1)
Out[19]:
In [20]:
# Coefficient plot for Ratio against PSDB and PSDB squared, with a separate
# indicator term for rows where PSDB == 0; no intercept.
sns.coefplot("Ratio ~ PSDB==0 + PSDB + np.multiply(PSDB, PSDB) - 1", data=df, palette="Set1");
In [21]:
sns.set(style="whitegrid")
# Residuals of a second-order fit of Ratio on PSDB, restricted to rows with
# PSDB > 0, with a lowess smoother.
psdb_rows = df[df.PSDB > 0]
sns.residplot(x=psdb_rows.PSDB, y=psdb_rows.Ratio, color="navy", lowess=True, order=2)
Out[21]:
In [22]:
# One coefficient plot per model: the full three-party model without an
# intercept, then each pair of party columns with an intercept.
for coef_formula in (
    "Ratio ~ PT + PSDB + Outros + C(idh_level_2000) - 1",
    "Ratio ~ PT + PSDB + C(idh_level_2000)",
    "Ratio ~ PT + Outros + C(idh_level_2000)",
    "Ratio ~ PSDB + Outros + C(idh_level_2000)",
):
    sns.coefplot(coef_formula, data=df, palette="Set1")
In [23]:
# OLS of Ratio on the PT and PSDB columns, with idh_level_2000 and Regiao
# entered as categorical terms.
formula = "Ratio ~ PT + PSDB + C(idh_level_2000) + C(Regiao)"
model = ols(formula=formula, data=df).fit()
model.summary()
Out[23]:
Não foi possível observar diferença significativa entre os partidos.
Comparando 2010 com 2000
In [24]:
# Linear fit of I2010 on I2000 with a 95% interval from 10000 bootstrap resamples.
sns.lmplot(x="I2000", y="I2010", data=df, legend=True, size=10, n_boot=10000, ci=95)
Out[24]:
In [25]:
# Joint residual plot of I2010 against I2000, drawn in the third colour of the
# current seaborn palette.
resid_color = sns.color_palette()[2]
sns.jointplot(x="I2000", y="I2010", data=df, kind='resid', color=resid_color, size=10)
Out[25]:
In [26]:
# Coefficients (intercept included) of I2010 regressed on I2000: first for all
# rows together, then separately for each idh_level_2000 group.
trend_formula = "I2010 ~ I2000"
sns.coefplot(trend_formula, data=df, intercept=True)
sns.coefplot(trend_formula, data=df, intercept=True, groupby="idh_level_2000")
In [27]:
# Linear fits of I2010 on I2000 with 99% intervals, one facet per category:
# first by idh_level_2000, then by Regiao (wrapped to two columns).
facet_options = dict(data=df, legend=True, size=6, n_boot=10000, ci=99)
sns.lmplot(x="I2000", y="I2010", hue="idh_level_2000", col="idh_level_2000", **facet_options)
sns.lmplot(x="I2000", y="I2010", hue="Regiao", col="Regiao", col_wrap=2, **facet_options)
Out[27]:
In [28]:
# OLS of I2010 on I2000 with Regiao as a categorical term; the fitted result is
# kept in `md` and its text summary printed.
md = ols(formula="I2010 ~ I2000 + C(Regiao)", data=df).fit()
print(md.summary())
In [29]:
# Flag rows that the OLS fit `md` marks as outliers, using p-values adjusted
# with the "fdr_bh" method at a single, named threshold.
OUTLIER_ALPHA = 0.1

rrr = md.get_robustcov_results()
rrp = rrr.outlier_test("fdr_bh", OUTLIER_ALPHA)

# Filter once and reuse the result for both the printout and the cell output.
outliers = rrp[rrp["fdr_bh(p)"] <= OUTLIER_ALPHA]
idx = outliers.index

# .loc (label-based lookup) replaces .ix, which was deprecated and later
# removed from pandas; the labels come from the test result's own index.
print("Estados fora da média:\n", df.loc[idx.values])
outliers
Out[29]:
In [30]:
import statsmodels.api as sm

# GEE of Ratio on the PT and PSDB columns, grouping rows by idh_level_2000 with
# an exchangeable working covariance structure.
# The unfitted model gets its own name instead of rebinding `md`: `md` holds the
# OLS result that the outlier-test cell reads, and overwriting it here made that
# cell fail when re-run after this one.
gee_model = gee("Ratio ~ PT + PSDB ", df.idh_level_2000, df, cov_struct=sm.cov_struct.Exchangeable())
mdf = gee_model.fit()
print(mdf.summary())
print(mdf.cov_struct.summary())
In [31]:
# Residuals against fitted values for the GEE fit `mdf`, drawn on the current axes.
ax = plt.gca()
ax.plot(mdf.fittedvalues, mdf.resid, 'o', alpha=0.5)
ax.set_xlabel("Fitted values", size=17)
ax.set_ylabel("Residuals", size=17)
Out[31]:
In [32]:
# Joint KDE of fitted values and residuals for the GEE fit `mdf`.
sns.jointplot(x=mdf.fittedvalues, y=mdf.resid, kind="kde", size=10)
Out[32]:
In [ ]:
In [ ]: