In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
# Read the cleaned data file into `ind` and preview its first five rows.
ind = pd.read_csv(filepath_or_buffer='data/cleanData.csv')
ind.head(n=5)
Out[3]:
In [4]:
# Shape before removing duplicated records.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(ind.shape)

# TODO: check this de-duplication step (flagged for review by the original author).
# Rows that agree on every column listed below are treated as duplicates and
# only the first occurrence is kept.
dedup_columns = [
    'CODUSU',
    'NRO_HOGAR',
    'AGLOMERADO',
    'PONDERA',
    'familyRelation',
    'female',
    'age',
    'schooled',
    'schoolYear',
    'finishedYear',
    'lastYear',
    'activity',
    'educLevel',
    'empCond',
    'unempCond',
    'ITF',
    'IPCF',
    'P47T',
]
# Reassign instead of using inplace=True: same resulting frame, no hidden mutation.
ind = ind.drop_duplicates(subset=dedup_columns)
In [5]:
# Shape after the de-duplication step; the call form of print works on Python 2 and 3.
print(ind.shape)
In [47]:
# Frequency of each code in the 'female' column; print(...) works on Python 2 and 3.
print(ind['female'].value_counts())
In [48]:
# Turn 'female' into a boolean flag.
# Source coding: 1 = male, 2 = female.
# The dtype guard keeps the cell safe to re-run: once the column is boolean,
# comparing it with 2 a second time would set every value to False.
if ind['female'].dtype != bool:
    ind['female'] = ind['female'] == 2
In [ ]:
# One-way frequency table of 'schooled'.
# pd.crosstab requires both an index and a columns argument, so calling it with
# a single Series raises TypeError; value_counts tabulates one variable.
# NOTE(review): if a two-way table was intended, supply the second variable to pd.crosstab.
ind['schooled'].value_counts()
In [6]:
# CH10 (schooled) - Does the person attend, or has the person ever attended,
# an educational establishment (school, college, university)?
# 1 = Yes, attends
# 2 = Does not attend, but attended in the past
# 3 = Never attended
# Codes 0 and 9 are recoded to NaN.
print(ind['schooled'].value_counts())
# Assign the result back to the column instead of replacing in place through
# attribute access, and drop the `axis` argument, which current pandas rejects.
ind['schooled'] = ind['schooled'].replace(to_replace=[0, 9], value=np.nan)
print(ind['schooled'].value_counts())
In [7]:
# CH13 = finishedYear
# Did the person finish that level?
# 1 = Yes
# 2 = No
# 9 = Don't know / no answer
# Codes 0 and 9 are recoded to NaN.
print(ind['finishedYear'].value_counts())
# Assign the result back to the column instead of replacing in place through
# attribute access, and drop the `axis` argument, which current pandas rejects.
ind['finishedYear'] = ind['finishedYear'].replace(to_replace=[0, 9], value=np.nan)
print(ind['finishedYear'].value_counts())
In [8]:
# ESTADO N(1) = activity: activity status
# 0 = Individual interview not carried out
# 1 = Employed
# 2 = Unemployed
# 3 = Inactive
# 4 = Younger than 10 years old
# Code 0 is recoded to NaN.
print(ind['activity'].value_counts())
# Assign the result back to the column instead of replacing in place through
# attribute access, and drop the `axis` argument, which current pandas rejects.
ind['activity'] = ind['activity'].replace(to_replace=0, value=np.nan)
print(ind['activity'].value_counts())
In [39]:
# N(1) CAT_INAC = unempCond: inactivity category
# 1 = Retired / pensioner
# 2 = Rentier
# 3 = Student
# 4 = Homemaker
# 5 = Younger than 6 years old
# 6 = Disabled
# 7 = Other
# Code 0 is recoded to NaN.
print(ind['unempCond'].value_counts())
# Assign the result back to the column instead of replacing in place through
# attribute access, and drop the `axis` argument, which current pandas rejects.
ind['unempCond'] = ind['unempCond'].replace(to_replace=0, value=np.nan)
print(ind['unempCond'].value_counts())
In [54]:
# CAT_OCUP = empCond: occupational category
# (for the employed, and for the unemployed with a previous occupation)
# 1 = Employer
# 2 = Self-employed
# 3 = Worker or employee
# 4 = Unpaid family worker
# 9 = Don't know / no answer
# Code 0 is recoded to NaN.
# NOTE(review): code 9 is left untouched here, while 'schooled' and
# 'finishedYear' recode 9 to NaN - confirm this difference is intended.
print(ind['empCond'].value_counts())
# Assign the result back to the column instead of replacing in place through
# attribute access, and drop the `axis` argument, which current pandas rejects.
ind['empCond'] = ind['empCond'].replace(to_replace=0, value=np.nan)
print(ind['empCond'].value_counts())
In [9]:
# NIVEL_ED N(1) = educLevel: education level
# 1 = Incomplete primary (includes special education)
# 2 = Complete primary
# 3 = Incomplete secondary
# 4 = Complete secondary
# 5 = Incomplete higher / university
# 6 = Complete higher / university
# 7 = No instruction
# 9 = Don't know / no answer
# Code 7 (no instruction) is replaced with 0 so the variable has an increasing order.
print(ind['educLevel'].value_counts())
# Assign the result back to the column instead of replacing in place through
# attribute access, and drop the `axis` argument, which current pandas rejects.
ind['educLevel'] = ind['educLevel'].replace(to_replace=7, value=0)
print(ind['educLevel'].value_counts())
In [98]:
# Mean of lastYear over rows with educLevel == 1, leaving out lastYear values of 98 and above.
primary_with_valid_year = (ind['educLevel'] == 1) & (ind['lastYear'] < 98)
ind.loc[primary_with_valid_year, 'lastYear'].mean()
Out[98]:
In [132]:
# Build 'primary' from educLevel and lastYear:
#   educLevel > 1                  -> 7
#   educLevel == 0                 -> 0 (the column default)
#   educLevel == 1, lastYear <= 7  -> lastYear
#   educLevel == 1, lastYear > 7   -> truncated mean of lastYear over the
#                                     educLevel == 1 rows with lastYear < 98
#                                     (original note: used when they answer 98 or 99)
# Rows with educLevel == 1 and a missing lastYear match neither comparison and keep 0.
# Original author's note: there are 8 cases with special education, and two that
# never finished primary but don't remember their last year. Those are 0.
#
# Every assignment goes through ind.loc[mask, 'primary'] rather than
# ind.primary[mask], so the write always reaches the frame itself and no
# chained-assignment warning is raised.
ind['primary'] = 0

incomplete_primary = ind['educLevel'] == 1
year_reported = incomplete_primary & (ind['lastYear'] <= 7)
year_not_reported = incomplete_primary & (ind['lastYear'] > 7)
mean_primary_year = int(ind.loc[incomplete_primary & (ind['lastYear'] < 98), 'lastYear'].mean())

ind.loc[ind['educLevel'] > 1, 'primary'] = 7
ind.loc[year_not_reported, 'primary'] = mean_primary_year
ind.loc[year_reported, 'primary'] = ind.loc[year_reported, 'lastYear']
In [133]:
# Cross-tabulate 'primary' (rows) against 'educLevel' (columns).
pd.crosstab(index=ind['primary'], columns=ind['educLevel'])
Out[133]:
In [134]:
# Cross-tabulate 'lastYear' (rows) against 'educLevel' (columns).
pd.crosstab(index=ind['lastYear'], columns=ind['educLevel'])
Out[134]:
In [138]:
# Build 'secondary'. The rules run in order and later rules overwrite earlier
# ones. 999 is the starting value and marks rows that no rule has matched.
#
# Every assignment goes through ind.loc[mask, 'secondary'] rather than
# ind.secondary[mask], so the write always reaches the frame itself and no
# chained-assignment warning is raised.
ind['secondary'] = 999

# Don't know their last level.
ind.loc[ind['schoolYear'] == 99, 'secondary'] = 0
# Don't know if they went to school. The recoding cell for 'schooled' turns
# codes 0 and 9 into NaN, so a plain `== 0` test matches nothing once that cell
# has run; missing values are matched as well.
ind.loc[ind['schooled'].isnull() | (ind['schooled'] == 0), 'secondary'] = 0
# Below incomplete secondary.
ind.loc[ind['educLevel'] < 3, 'secondary'] = 0
# Above incomplete secondary.
ind.loc[ind['educLevel'] > 3, 'secondary'] = 5

incomplete_secondary = ind['educLevel'] == 3
# Special cases (another school system with grades 7, 8 and 9).
ind.loc[incomplete_secondary & (ind['lastYear'] == 7), 'secondary'] = 1
ind.loc[incomplete_secondary & (ind['lastYear'] == 8), 'secondary'] = 2
ind.loc[incomplete_secondary & (ind['lastYear'] == 9), 'secondary'] = 3
# Error: finished 9th grade in the EGB system.
ind.loc[(ind['schoolYear'] == 3) & (ind['finishedYear'] == 1), 'secondary'] = 3
# They get their last approved year.
year_reported = incomplete_secondary & (ind['lastYear'] <= 5)
ind.loc[year_reported, 'secondary'] = ind.loc[year_reported, 'lastYear']
# Don't know their last approved year, so they get the (truncated) mean of
# lastYear over the educLevel == 3 rows with lastYear < 98.
year_not_reported = incomplete_secondary & (ind['lastYear'] > 9)
mean_secondary_year = int(ind.loc[incomplete_secondary & (ind['lastYear'] < 98), 'lastYear'].mean())
ind.loc[year_not_reported, 'secondary'] = mean_secondary_year
In [139]:
# Cross-tabulate 'secondary' (rows) against 'educLevel' (columns).
pd.crosstab(index=ind['secondary'], columns=ind['educLevel'])
Out[139]:
In [147]:
# TODO: continue from here later.
# schoolYear counts for the rows where 'secondary' still holds the 999 starting value.
still_unclassified = ind['secondary'] == 999
ind.loc[still_unclassified, 'schoolYear'].value_counts()
Out[147]:
In [128]:
# Column names of `ind` as a plain Python list.
ind.columns.tolist()
Out[128]:
In [129]:
# Compare the current shape with the shape obtained after dropping rows that
# repeat across the listed columns (which include 'primary' and 'secondary').
# drop_duplicates returns a new frame here, so `ind` itself is not modified.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(ind.shape)

columns_to_compare = [
    'CODUSU',
    'NRO_HOGAR',
    'AGLOMERADO',
    'PONDERA',
    'familyRelation',
    'female',
    'age',
    'schooled',
    'schoolYear',
    'finishedYear',
    'lastYear',
    'activity',
    'educLevel',
    'empCond',
    'unempCond',
    'ITF',
    'IPCF',
    'P47T',
    'primary',
    'secondary',
]
print(ind.drop_duplicates(subset=columns_to_compare).shape)
In [ ]:
# Years of schooling by level.
# R function: https://github.com/alephcero/incomeMapBuenosAires/blob/master/src/schoolYears.R
# Look at the income curve by age and the income curve by years of schooling;
# split it in 3 or use x and x2.
# Build the years-of-schooling variable anyway.
# Dummy variables for each age group:
#r1. Esc_1 + r2. Esc_2 + r3. Esc_3 + v_14a24 GBA + v_25a34GBA + m_14a24 GBA + m_25a34GBA + m_35ymás GBA +
# Row replication by a column value:
#http://stackoverflow.com/questions/26777832/replicating-rows-in-a-pandas-data-frame-by-a-column-value/26778637#26778637
# NOTE(review): indNoW is not defined in any visible cell - confirm where it is created.
# Each index label of indNoW is repeated PONDERA times and the rows are selected by those labels.
repeated_labels = np.repeat(indNoW.index.values, indNoW.PONDERA)
ind = indNoW.loc[repeated_labels]
ind.shape