ENEM socioeconomics

An exploration of low-income high school students' scores




Author

Victor Villas Bôas Chaves (vvb.chaves@gmail.com)

Analysis

Spare Time Data Blog Post




In [1]:
import unicodedata
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import seaborn as sns

%matplotlib inline

In [2]:
# Desaturated "deep" seaborn palette for every figure in the notebook.
sns.set_palette("deep", desat=.6)
# Default figure size. It is written straight into matplotlib's rcParams because
# sns.set_context(rc=...) only applies keys that belong to seaborn's plotting
# context; recent seaborn releases no longer count "figure.figsize" among them,
# so passing it there is silently ignored.
mpl.rcParams["figure.figsize"] = (10, 4)

# Paths of the two ENEM 2012 microdata CSV files read further down.
enem2012 = {'survey':"./dataset/microdados_enem2012/QUESTIONARIO_ENEM_2012.csv",
            'scores':"./dataset/microdados_enem2012/DADOS_ENEM_2012.csv"}

Workaround: the dataset is too big to fit in RAM, so we load only the columns we need, on demand.


In [3]:
def load_columns(tabela, colunas, index_col='NU_INSCRICAO'):
    """Load a few named columns of a CSV file, indexed by ``index_col``.

    Only the requested columns (plus the index column) are parsed, so the
    remaining columns of the file are never loaded.

    Parameters
    ----------
    tabela : str
        Path of the CSV file.
    colunas : list of str
        Names of the data columns to load.
    index_col : str, optional
        Name of the column that becomes the index of the result.

    Returns
    -------
    pandas.DataFrame
        The requested columns, indexed by ``index_col``. Rows with a missing
        value ('', '.' or '*', besides the pandas defaults) in any requested
        column are dropped.

    Raises
    ------
    KeyError
        If a requested column is not present in the file header.
    """
    # Read just the header plus one row to validate the request before the
    # expensive full read.
    header = pd.read_csv(tabela, nrows=1).columns.tolist()
    missing = [c for c in colunas if c not in header]
    if missing:
        raise KeyError("columns not found in %s: %s" % (tabela, missing))

    # Select by name so the index column is found wherever it sits in the file;
    # selecting position 0 only works when it happens to be the first column.
    wanted = [index_col] + [c for c in colunas if c != index_col]
    data = pd.read_csv(tabela, usecols=wanted, index_col=index_col,
                       na_values=['', '.', '*'])
    return data.dropna()

def load(df,tabela,colunas):
    """Extend ``df`` with columns read from the CSV file ``tabela``.

    The columns named in ``colunas`` are fetched with ``load_columns`` and
    inner-joined to ``df`` on the index, so only rows present on both sides
    are kept.
    """
    extra_columns = load_columns(tabela, colunas)
    joined = df.join(extra_columns, how='inner')
    return joined

In [4]:
# Cache of the reduced dataset. It is written by this notebook itself; do not
# point this at a pickle from an untrusted source (unpickling can run code).
CACHE_PATH = "dataset/small.pkl"

# Value attached to each answer letter of survey question Q03; it is used as
# the maximum income of the bracket (presumably in R$ per month -- TODO confirm
# against the ENEM 2012 data dictionary).
## Q should be infinity, but this is enough.
INCOME_BRACKETS = {"A":0, "B":622, "C":933, "D":1244, "E":1555, "F":1866, "G":2488, "H":3110, "I":3732,
                   "J":4354, "K":4976, "L":5598, "M":6220, "N":7464, "O":9330, "P":12440, "Q":10000000}


def process_income(row):
    """Return the per-capita income bucket of one survey row.

    The bracket value of ``row['Q03']`` is divided by ``row['Q04']``
    (presumably the household size -- TODO confirm) and mapped to the next
    multiple of 50 strictly above it, so 0 -> 50, 49.9 -> 50, 50 -> 100.
    """
    max_renda_pc = 1.0*INCOME_BRACKETS[row['Q03']]/row['Q04']
    return (int(max_renda_pc/50)+1)*50


def process_parents(row):
    """Return the higher of the two parents' answers (Q01, Q02) as a number.

    Letters map to their position in the alphabet (A -> 1, B -> 2, ...).
    The answer "I" maps to NaN; since "I" compares greater than the letters
    A-H, the result is NaN as soon as either parent's answer is "I".
    """
    best = max(row['Q01'], row['Q02'])
    if best == "I":
        return np.nan
    return ord(best)-ord('A')+1


try:
    data = pd.read_pickle(CACHE_PATH)
except IOError:
    # No cache yet: rebuild the reduced dataset from the raw CSV files.

    # Select low income students
    survey = load_columns(enem2012['survey'], ['Q03','Q04'])
    survey['INCOME'] = survey.apply(process_income, axis=1)
    data = survey.loc[survey['INCOME'] <= 300, ['INCOME']]

    # Select students who didn't miss any test: load_columns drops rows with a
    # missing value and the inner join keeps only the students that remain.
    score_columns = ['NU_NT_CN','NU_NT_CH','NU_NT_LC','NU_NT_MT']
    data = load(data, enem2012['scores'], ['IDADE','ST_CONCLUSAO'] + score_columns)
    data['SCORE'] = (data['NU_NT_CN']+data['NU_NT_CH']+data['NU_NT_LC']+data['NU_NT_MT'])/4
    data = data.drop(score_columns, axis=1)

    # Select young students (13 to 23 years old) whose ST_CONCLUSAO code is 2
    # or 3 (the meaning of these codes is not visible here -- see the ENEM
    # data dictionary).
    age_ok = (data['IDADE'] > 12) & (data['IDADE'] < 24)
    conclusion_ok = data['ST_CONCLUSAO'].isin([2, 3])
    data = data[age_ok & conclusion_ok]

    # Fetch parents info
    data = load(data, enem2012['survey'], ['Q01','Q02'])
    data['PARENTS'] = data[['Q01','Q02']].apply(process_parents, axis=1)
    data = data.drop(['Q01','Q02'], axis=1)

    # Fetch the place of registration
    data = data[['IDADE','INCOME','SCORE','PARENTS']]
    data = load(data, enem2012['scores'], ['NO_MUNICIPIO_INSC','UF_INSC'])

    # Rename by name rather than by position, so CITY and STATE cannot be
    # swapped if the CSV stores the two columns in a different order.
    data = data.rename(columns={'IDADE': 'AGE',
                                'NO_MUNICIPIO_INSC': 'CITY',
                                'UF_INSC': 'STATE'})
    data = data[['AGE','INCOME','SCORE','PARENTS','CITY','STATE']]

    data.to_pickle(CACHE_PATH)

In [5]:
# INCOME only takes multiples of 50 starting at 50, so this drops the lowest
# bucket (INCOME == 50) and keeps everything from 100 upwards.
above_lowest_bucket = data['INCOME'] > 50
gdata = data[above_lowest_bucket]
gdata.describe()


Out[5]:
AGE INCOME SCORE PARENTS
count 682883.000000 682883.000000 682883.000000 600387.000000
mean 17.602954 204.083862 462.448679 3.895148
std 1.442642 47.368771 61.139344 1.624826
min 13.000000 100.000000 298.875000 1.000000
25% 17.000000 150.000000 416.350000 3.000000
50% 17.000000 200.000000 455.075000 4.000000
75% 18.000000 250.000000 501.750000 5.000000
max 23.000000 300.000000 786.050000 8.000000

In [6]:
# Mean SCORE for every (INCOME, PARENTS) combination. Rows with a missing
# PARENTS value are left out by groupby.
df = gdata[['PARENTS','INCOME','SCORE']].groupby(by=['PARENTS','INCOME'])

# pivot() is called with keywords: positional arguments are rejected by
# pandas >= 2.0, while the keyword form works on old and new versions alike.
mean_score = df.mean().reset_index().pivot(index='INCOME', columns='PARENTS', values='SCORE')

ax = sns.heatmap(mean_score, annot=True, fmt=".0f")
ax.set_title("Mean SCORE by INCOME and PARENTS")
plt.show()



In [7]:
# Standard deviation of SCORE for every (INCOME, PARENTS) combination.
df = gdata[['PARENTS','INCOME','SCORE']].groupby(by=['PARENTS','INCOME'])

# Keyword arguments keep pivot() working on pandas >= 2.0; values='SCORE'
# yields plain PARENTS column labels (instead of ('SCORE', parents) tuples),
# matching the mean-score heatmap.
std_score = df.std().reset_index().pivot(index='INCOME', columns='PARENTS', values='SCORE')

ax = sns.heatmap(std_score, annot=True, fmt=".0f")
ax.set_title("Standard deviation of SCORE by INCOME and PARENTS")
plt.show()



In [8]:
# Number of students (non-missing SCORE values) in every (INCOME, PARENTS)
# combination.
df = gdata[['PARENTS','INCOME','SCORE']].groupby(by=['PARENTS','INCOME'])

# Keyword arguments keep pivot() working on pandas >= 2.0; values='SCORE'
# yields plain PARENTS column labels (instead of ('SCORE', parents) tuples),
# matching the mean-score heatmap.
group_size = df.count().reset_index().pivot(index='INCOME', columns='PARENTS', values='SCORE')

ax = sns.heatmap(group_size, annot=True, fmt=".0f")
ax.set_title("Number of students by INCOME and PARENTS")
plt.show()



In [16]:
# Pairwise correlation of the numeric columns. They are selected explicitly
# because gdata also holds the text columns CITY and STATE: old pandas dropped
# those silently in corr(), pandas >= 2.0 raises on them instead.
numeric_columns = ['AGE', 'INCOME', 'SCORE', 'PARENTS']
ax = sns.heatmap(gdata[numeric_columns].corr(), annot=True, fmt=".3f")
ax.set_title("Correlation between AGE, INCOME, SCORE and PARENTS")
plt.show()



In [9]:
# Linear fit (order=1) of SCORE against INCOME, one panel per PARENTS value.
# x_estimator=np.mean plots the mean SCORE of each INCOME value instead of
# every individual observation.
# x and y are passed by keyword: current seaborn takes `data` as the first
# positional argument, so the positional form no longer works there.
sns.lmplot(x='INCOME', y='SCORE', hue='PARENTS', col='PARENTS', col_wrap=4,
           data=gdata.dropna(), x_estimator=np.mean, order=1)
plt.show()



In [10]:
# Linear fit (order=1) of SCORE against PARENTS, one panel per INCOME value.
# x_estimator=np.mean plots the mean SCORE of each PARENTS value instead of
# every individual observation.
# x and y are passed by keyword: current seaborn takes `data` as the first
# positional argument, so the positional form no longer works there.
sns.lmplot(x='PARENTS', y='SCORE', hue='INCOME', col='INCOME', col_wrap=5,
           data=gdata.dropna(), x_estimator=np.mean, order=1)
plt.show()



In [11]:
# Point plot of INCOME for each PARENTS value.
# seaborn renamed factorplot to catplot and later removed the old name, so use
# whichever function this installation provides; kind='point' is given
# explicitly because the two functions do not share the same default kind.
# NOTE(review): this uses the full `data` frame (lowest INCOME bucket included),
# not `gdata` -- confirm that is intended.
categorical_plot = getattr(sns, 'catplot', None) or sns.factorplot
categorical_plot(x='PARENTS', y='INCOME', data=data.dropna(), kind='point')
plt.show()



In [17]:
# Point plot of AGE for each PARENTS value.
# seaborn renamed factorplot to catplot and later removed the old name, so use
# whichever function this installation provides; kind='point' is given
# explicitly because the two functions do not share the same default kind.
# NOTE(review): this uses the full `data` frame (lowest INCOME bucket included),
# not `gdata` -- confirm that is intended.
categorical_plot = getattr(sns, 'catplot', None) or sns.factorplot
categorical_plot(x='PARENTS', y='AGE', data=data.dropna(), kind='point')
plt.show()



In [18]:
# Point plot of SCORE for each AGE value.
# seaborn renamed factorplot to catplot and later removed the old name, so use
# whichever function this installation provides; kind='point' is given
# explicitly because the two functions do not share the same default kind.
# NOTE(review): this uses the full `data` frame (lowest INCOME bucket included),
# not `gdata` -- confirm that is intended.
categorical_plot = getattr(sns, 'catplot', None) or sns.factorplot
categorical_plot(x='AGE', y='SCORE', data=data.dropna(), kind='point')
plt.show()



In [ ]: