In [1]:
import unicodedata
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import seaborn as sns
%matplotlib inline
In [2]:
# Plot defaults: slightly desaturated "deep" palette, wide (10x4) figures.
sns.set_palette("deep", desat=.6)
sns.set_context(rc={"figure.figsize": (10, 4)})
# Paths to the raw ENEM 2012 microdata CSVs: the socioeconomic
# questionnaire and the test scores.
enem2012 = {'survey':"./dataset/microdados_enem2012/QUESTIONARIO_ENEM_2012.csv",
'scores':"./dataset/microdados_enem2012/DADOS_ENEM_2012.csv"}
Workaround: the dataset is too large to fit in RAM, so we load only the columns we need, on demand.
In [3]:
def load_columns(tabela, colunas):
    """Read only the requested columns (plus the index column) from a CSV.

    The raw dataset is too large for RAM, so the header row is read first
    to translate column names into positional indices for ``usecols``.

    Parameters
    ----------
    tabela : str
        Path to the CSV file.
    colunas : list of str
        Names of the columns to load (column 0, NU_INSCRICAO, is always
        included and used as the index).

    Returns
    -------
    pandas.DataFrame indexed by NU_INSCRICAO, with rows containing any
    NaN ('', '.' and '*' are treated as missing) dropped.
    """
    header = pd.read_csv(tabela, nrows=1).columns.tolist()
    position = {name: idx for idx, name in enumerate(header)}
    wanted = [0] + [position[name] for name in colunas]
    frame = pd.read_csv(tabela, usecols=wanted,
                        index_col='NU_INSCRICAO', na_values=['', '.', '*'])
    return frame.dropna()
def load(df, tabela, colunas):
    """Fetch extra columns from a CSV and inner-join them onto ``df``.

    Rows are matched on the shared NU_INSCRICAO index, so only students
    present in both frames survive the join.
    """
    extra = load_columns(tabela, colunas)
    joined = df.join(extra, how='inner')
    return joined
In [4]:
# Build the small working dataset once and cache it as a pickle so later
# runs skip the expensive CSV processing.
try:
    data = pd.read_pickle("dataset/small.pkl")
except IOError as e:
    # Cache miss: rebuild the dataset from the raw ENEM 2012 CSVs.
    def process_income(row):
        # Map the income-bracket answer (Q03) to the bracket's upper bound
        # in BRL; presumably these follow the 2012 questionnaire codebook
        # (multiples of the R$622 minimum wage) — TODO confirm.
        legenda = {"A":0, "B":622, "C":933, "D":1244, "E":1555, "F":1866, "G":2488, "H":3110, "I":3732,
                   "J":4354, "K":4976, "L":5598, "M":6220, "N":7464, "O":9330, "P":12440, "Q":10000000}
        ## Q should be infinity, but this is enough.
        # Per-capita income: bracket upper bound / household size (Q04),
        # rounded UP to the next multiple of 50.
        max_renda_pc = 1.0*legenda[row['Q03']]/row['Q04']
        normalized = (int(max_renda_pc/50)+1)*50
        return normalized
    # Select low income students
    data = load_columns(enem2012['survey'],['Q03','Q04'])
    data['INCOME'] = data.apply(process_income,axis=1)
    data = data[data['INCOME']<=300][['INCOME']]
    # Select students who didn't miss any test
    # (load_columns drops rows with any NaN, so missing scores fall out
    # during the inner join).
    data = load(data,enem2012['scores'],['IDADE','ST_CONCLUSAO','NU_NT_CN','NU_NT_CH','NU_NT_LC','NU_NT_MT'])
    # Simple average of the four test scores (CN, CH, LC, MT).
    data['SCORE'] = (data['NU_NT_CN']+data['NU_NT_CH']+data['NU_NT_LC']+data['NU_NT_MT'])/4
    data.drop(['NU_NT_CN','NU_NT_CH','NU_NT_LC','NU_NT_MT'], axis=1,inplace=True)
    # Select young students
    # NOTE(review): ST_CONCLUSAO values 2/3 presumably mean still in /
    # finishing high school — confirm against the ENEM codebook.
    data = data[(data['IDADE']>12)&(data['IDADE']<24)&((data['ST_CONCLUSAO']==2)|(data['ST_CONCLUSAO']==3))]
    # Fetch parents info
    data = load(data,enem2012['survey'],['Q01','Q02'])
    def process_parents(row):
        # Highest education level between father (Q01) and mother (Q02),
        # encoded A.. as 1..; "I" appears to mean "don't know" -> NaN.
        # NOTE(review): max() compares letters lexicographically, so an "I"
        # (unknown) answer beats any known level and the whole row becomes
        # NaN even when the other parent's level is known — confirm intended.
        pai = row['Q01']
        mae = row['Q02']
        best = max(pai,mae)
        val = ord(best)-ord('A')+1
        if best=="I":
            val = np.nan
        return val
    data['PARENTS'] = data.apply(process_parents,axis=1)
    data.drop(['Q01','Q02'],axis=1,inplace=True)
    data = data[['IDADE','INCOME','SCORE','PARENTS']]
    # Attach city/state of enrollment, then rename everything to English.
    data = load(data,enem2012['scores'],['NO_MUNICIPIO_INSC','UF_INSC'])
    data.columns = ['AGE','INCOME', 'SCORE', 'PARENTS', 'CITY', 'STATE']
    data.to_pickle("dataset/small.pkl")
In [5]:
# Drop the lowest income bracket (<= R$50 per capita) before plotting.
gdata = data.loc[data['INCOME'] > 50]
gdata.describe()
Out[5]:
In [6]:
# Mean ENEM score for each (income, parents' education) cell.
# pivot() is called with keyword arguments: the positional form was
# removed in pandas 2.0 and keywords work on all earlier versions too.
df = gdata[['PARENTS','INCOME','SCORE']].groupby(by=['PARENTS','INCOME'])
sns.heatmap(df.mean().reset_index().pivot(index='INCOME', columns='PARENTS', values='SCORE'),
            annot=True, fmt=".0f")
plt.show()
In [7]:
# Standard deviation of the score in each (income, parents) cell —
# gives a sense of spread behind the means above.
# pivot() keywords: positional args were removed in pandas 2.0.
df = gdata[['PARENTS','INCOME','SCORE']].groupby(by=['PARENTS','INCOME'])
sns.heatmap(df.std().reset_index().pivot(index='INCOME', columns='PARENTS'),
            annot=True, fmt=".0f")
plt.show()
In [8]:
# Sample size per (income, parents) cell — shows which cells of the
# previous heatmaps are backed by few students.
# pivot() keywords: positional args were removed in pandas 2.0.
df = gdata[['PARENTS','INCOME','SCORE']].groupby(by=['PARENTS','INCOME'])
sns.heatmap(df.count().reset_index().pivot(index='INCOME', columns='PARENTS'),
            annot=True, fmt=".0f")
plt.show()
In [16]:
# Pairwise correlations between the numeric variables. The numeric
# columns are selected explicitly: pandas >= 2.0 no longer silently
# drops the string columns (CITY, STATE) in corr(), and the explicit
# selection works on every pandas version.
sns.heatmap(gdata[['AGE', 'INCOME', 'SCORE', 'PARENTS']].corr(), annot=True, fmt=".3f")
pass
In [9]:
# Linear fit of score vs. income, one panel per parents' education level.
# x/y/data are passed as keywords: seaborn 0.12 removed the positional
# form, and keywords work on all earlier versions too.
sns.lmplot(x='INCOME', y='SCORE', hue='PARENTS', col='PARENTS', col_wrap=4,
           data=gdata.dropna(), x_estimator=np.mean, order=1)
pass
In [10]:
# Linear fit of score vs. parents' education, one panel per income bracket.
# x/y/data as keywords: seaborn 0.12 removed the positional form.
sns.lmplot(x='PARENTS', y='SCORE', hue='INCOME', col='INCOME', col_wrap=5,
           data=gdata.dropna(), x_estimator=np.mean, order=1)
pass
In [11]:
# Income as a function of parents' education (full dataset, not gdata).
# x/y/data as keywords for compatibility with newer seaborn.
# NOTE(review): factorplot was renamed catplot in seaborn 0.9 and later
# removed — switch to sns.catplot when the environment is upgraded.
sns.factorplot(x='PARENTS', y='INCOME', data=data.dropna())
pass
In [17]:
# Age as a function of parents' education.
# x/y/data as keywords for compatibility with newer seaborn.
# NOTE(review): factorplot was renamed catplot in seaborn 0.9 and later
# removed — switch to sns.catplot when the environment is upgraded.
sns.factorplot(x='PARENTS', y='AGE', data=data.dropna())
pass
In [18]:
# Score as a function of age.
# x/y/data as keywords for compatibility with newer seaborn.
# NOTE(review): factorplot was renamed catplot in seaborn 0.9 and later
# removed — switch to sns.catplot when the environment is upgraded.
sns.factorplot(x='AGE', y='SCORE', data=data.dropna())
pass
In [ ]: