ENEM socioeconomics

An exploration of low-income high school students' scores

Author

Victor Villas Bôas Chaves (vvb.chaves@gmail.com)

Analysis



In [1]:

    
import unicodedata
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import seaborn as sns

%matplotlib inline



In [2]:

    
# Plot styling: seaborn's "deep" palette, desaturated to 60%.
sns.set_palette("deep", desat=.6)
# Default figure size, set directly on matplotlib's rcParams. Recent seaborn
# releases only keep context keys from the `rc` argument of set_context(), so
# passing "figure.figsize" there is silently dropped.
mpl.rcParams["figure.figsize"] = (10, 4)

# Paths of the ENEM 2012 CSV tables, keyed by the kind of table.
enem2012 = {'survey':"./dataset/microdados_enem2012/QUESTIONARIO_ENEM_2012.csv",
            'scores':"./dataset/microdados_enem2012/DADOS_ENEM_2012.csv"}

Workaround: the dataset is too big to fit in RAM, so we load only the columns we need, on demand.



In [3]:

    
def load_columns(tabela,colunas):
    """Load only the requested columns of a CSV table, indexed by NU_INSCRICAO.

    Parameters
    ----------
    tabela : path of the CSV file to read.
    colunas : iterable of column names to load besides the NU_INSCRICAO column.

    Returns
    -------
    DataFrame indexed by NU_INSCRICAO that holds the requested columns. The
    strings '', '.' and '*' are read as missing values, and every row with a
    missing value in any loaded column is dropped.

    Raises
    ------
    ValueError (from pandas) if a requested column is not present in the file.
    """
    # Select columns by label instead of by position: this avoids a second pass
    # over the file just to translate names into indices, and no longer assumes
    # that NU_INSCRICAO is the first column of the table.
    wanted = ['NU_INSCRICAO'] + [c for c in colunas if c != 'NU_INSCRICAO']
    data = pd.read_csv(tabela, usecols=wanted, index_col='NU_INSCRICAO', na_values=['','.','*'])
    return data.dropna()

def load(df,tabela,colunas):
    """Attach columns read from a CSV table to ``df``, keeping only shared rows.

    ``colunas`` are read from ``tabela`` with ``load_columns`` and inner-joined
    to ``df`` on the index, so rows missing from either side are dropped.
    """
    extra_columns = load_columns(tabela, colunas)
    joined = df.join(extra_columns, how='inner')
    return joined



In [4]:

    
# Value associated with each answer to survey question Q03, built once instead of
# on every row.
# NOTE(review): presumably the upper bound of each family-income bracket -- confirm
# against the ENEM 2012 questionnaire dictionary.
INCOME_LEGEND = {"A":0, "B":622, "C":933, "D":1244, "E":1555, "F":1866, "G":2488, "H":3110, "I":3732,
                 "J":4354, "K":4976, "L":5598, "M":6220, "N":7464, "O":9330, "P":12440, "Q":10000000}
## Q should be infinity, but this is enough.


def process_income(row):
    """Return the income bucket of a survey row.

    The Q03 legend value is divided by Q04 and the result is moved up to the next
    multiple of 50; an exact multiple also moves up (0 -> 50, 100.0 -> 150).
    NOTE(review): Q04 is presumably the household size, making the ratio a
    per-capita income ceiling -- confirm.
    """
    max_renda_pc = 1.0*INCOME_LEGEND[row['Q03']]/row['Q04']
    return (int(max_renda_pc/50)+1)*50


def process_parents(row):
    """Encode the higher of the answers Q01 (pai) and Q02 (mae) as a number.

    'A' maps to 1, 'B' to 2, and so on. When the higher answer is 'I' the result
    is NaN. NOTE(review): 'I' presumably means "unknown" -- confirm.
    """
    best = max(row['Q01'], row['Q02'])
    if best=="I":
        return np.nan
    return ord(best)-ord('A')+1


try:
    # Reuse the reduced dataset written by a previous run of this cell.
    data = pd.read_pickle("dataset/small.pkl")
except IOError:
    # No cache yet: rebuild the reduced dataset from the raw CSV tables.

    # Select low income students
    data = load_columns(enem2012['survey'],['Q03','Q04'])
    data['INCOME'] = data.apply(process_income,axis=1)
    data = data[data['INCOME']<=300][['INCOME']]

    # Select students who didn't miss any test
    # (load_columns drops every row with a missing value in the loaded columns)
    score_columns = ['NU_NT_CN','NU_NT_CH','NU_NT_LC','NU_NT_MT']
    data = load(data,enem2012['scores'],['IDADE','ST_CONCLUSAO']+score_columns)
    data['SCORE'] = (data['NU_NT_CN']+data['NU_NT_CH']+data['NU_NT_LC']+data['NU_NT_MT'])/4
    data = data.drop(score_columns, axis=1)

    # Select young students
    # NOTE(review): the meaning of ST_CONCLUSAO codes 2 and 3 is not visible here -- confirm.
    data = data[(data['IDADE']>12)&(data['IDADE']<24)&((data['ST_CONCLUSAO']==2)|(data['ST_CONCLUSAO']==3))]

    # Fetch parents info
    data = load(data,enem2012['survey'],['Q01','Q02'])
    data['PARENTS'] = data.apply(process_parents,axis=1)
    data = data.drop(['Q01','Q02'],axis=1)

    data = data[['IDADE','INCOME','SCORE','PARENTS']]
    data = load(data,enem2012['scores'],['NO_MUNICIPIO_INSC','UF_INSC'])

    # Rename by label, then fix the column order explicitly: a positional
    # `data.columns = [...]` would swap CITY and STATE if the CSV happened to list
    # UF_INSC before NO_MUNICIPIO_INSC.
    data = data.rename(columns={'IDADE':'AGE', 'NO_MUNICIPIO_INSC':'CITY', 'UF_INSC':'STATE'})
    data = data[['AGE','INCOME', 'SCORE', 'PARENTS', 'CITY', 'STATE']]

    data.to_pickle("dataset/small.pkl")



In [5]:

    
# Keep only the rows whose INCOME bucket is above 50, then summarise them.
gdata = data.loc[data['INCOME'].gt(50)]
gdata.describe()









    Out[5]:






  
    
      
      AGE
      INCOME
      SCORE
      PARENTS
    
  
  
    
      count
       682883.000000
       682883.000000
       682883.000000
       600387.000000
    
    
      mean
           17.602954
          204.083862
          462.448679
            3.895148
    
    
      std
            1.442642
           47.368771
           61.139344
            1.624826
    
    
      min
           13.000000
          100.000000
          298.875000
            1.000000
    
    
      25%
           17.000000
          150.000000
          416.350000
            3.000000
    
    
      50%
           17.000000
          200.000000
          455.075000
            4.000000
    
    
      75%
           18.000000
          250.000000
          501.750000
            5.000000
    
    
      max
           23.000000
          300.000000
          786.050000
            8.000000



In [6]:

    
# Mean SCORE for every (PARENTS, INCOME) combination.
df = gdata[['PARENTS','INCOME','SCORE']].groupby(by=['PARENTS','INCOME'])

# pivot() arguments are passed by keyword: pandas 2.0 made them keyword-only.
mean_score = df.mean().reset_index().pivot(index='INCOME', columns='PARENTS', values='SCORE')
sns.heatmap(mean_score, annot=True, fmt=".0f")
plt.show()



In [7]:

    
# Standard deviation of SCORE for every (PARENTS, INCOME) combination.
df = gdata[['PARENTS','INCOME','SCORE']].groupby(by=['PARENTS','INCOME'])

# pivot() arguments are passed by keyword (keyword-only since pandas 2.0), and
# `values` is named so the heatmap columns are labelled by PARENTS alone instead
# of by ('SCORE', PARENTS) pairs.
score_std = df.std().reset_index().pivot(index='INCOME', columns='PARENTS', values='SCORE')
sns.heatmap(score_std, annot=True, fmt=".0f")
plt.show()



In [8]:

    
# Number of non-missing SCORE values for every (PARENTS, INCOME) combination.
df = gdata[['PARENTS','INCOME','SCORE']].groupby(by=['PARENTS','INCOME'])

# pivot() arguments are passed by keyword (keyword-only since pandas 2.0), and
# `values` is named so the heatmap columns are labelled by PARENTS alone instead
# of by ('SCORE', PARENTS) pairs.
score_count = df.count().reset_index().pivot(index='INCOME', columns='PARENTS', values='SCORE')
sns.heatmap(score_count, annot=True, fmt=".0f")
plt.show()



In [16]:

    
# Pairwise correlation of the numeric columns. They are selected explicitly
# because gdata also carries the text columns CITY and STATE, which corr()
# refuses to process on pandas >= 2.0 (older versions dropped them silently).
numeric_columns = ['AGE', 'INCOME', 'SCORE', 'PARENTS']
sns.heatmap(gdata[numeric_columns].corr(), annot=True, fmt=".3f")
pass



In [9]:

    
# Linear fit of SCORE against INCOME, one panel (and colour) per PARENTS value;
# the points shown are the mean SCORE at each INCOME value.
# x and y are passed by keyword: recent seaborn accepts only `data` positionally.
sns.lmplot(
    x='INCOME', y='SCORE',
    hue='PARENTS', col='PARENTS', col_wrap=4,
    data=gdata.dropna(),
    x_estimator=np.mean, order=1,
)
pass



In [10]:

    
# Linear fit of SCORE against PARENTS, one panel (and colour) per INCOME bucket;
# the points shown are the mean SCORE at each PARENTS value.
# x and y are passed by keyword: recent seaborn accepts only `data` positionally.
sns.lmplot(
    x='PARENTS', y='SCORE',
    hue='INCOME', col='INCOME', col_wrap=5,
    data=gdata.dropna(),
    x_estimator=np.mean, order=1,
)
pass



In [11]:

    
# INCOME against PARENTS over the complete rows of `data`.
# NOTE(review): seaborn 0.9 renamed factorplot to catplot; switch to
# sns.catplot(..., kind="point") when upgrading seaborn.
complete_rows = data.dropna()
sns.factorplot(x='PARENTS', y='INCOME', data=complete_rows)
pass



In [17]:

    
# AGE against PARENTS over the complete rows of `data`.
# NOTE(review): seaborn 0.9 renamed factorplot to catplot; switch to
# sns.catplot(..., kind="point") when upgrading seaborn.
complete_rows = data.dropna()
sns.factorplot(x='PARENTS', y='AGE', data=complete_rows)
pass



In [18]:

    
# SCORE against AGE over the complete rows of `data`.
# NOTE(review): seaborn 0.9 renamed factorplot to catplot; switch to
# sns.catplot(..., kind="point") when upgrading seaborn.
complete_rows = data.dropna()
sns.factorplot(x='AGE', y='SCORE', data=complete_rows)
pass



In [ ]:

	AGE	INCOME	SCORE	PARENTS
count	682883.000000	682883.000000	682883.000000	600387.000000
mean	17.602954	204.083862	462.448679	3.895148
std	1.442642	47.368771	61.139344	1.624826
min	13.000000	100.000000	298.875000	1.000000
25%	17.000000	150.000000	416.350000	3.000000
50%	17.000000	200.000000	455.075000	4.000000
75%	18.000000	250.000000	501.750000	5.000000
max	23.000000	300.000000	786.050000	8.000000