In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
ind = pd.read_csv('data/cleanData.csv')
ind.head()


Out[3]:
CODUSU NRO_HOGAR COMPONENTE AGLOMERADO PONDERA familyRelation female age schooled schoolYear finishedYear lastYear activity educLevel empCond unempCond ITF IPCF P47T
0 302468 1 1 32 1287 1 2 20 1 7 2 1.0 3 5 0 3 4000 2000.0 2000
1 302468 1 2 32 1287 10 2 20 1 6 2 1.0 3 5 0 3 4000 2000.0 2000
2 307861 1 1 32 1674 1 1 42 2 2 1 NaN 1 2 3 0 5800 1450.0 3000
3 307861 1 2 32 1674 2 2 44 2 7 1 NaN 1 6 3 0 5800 1450.0 2800
4 307861 1 3 32 1674 3 1 13 1 4 2 0.0 3 3 0 3 5800 1450.0 0

In [4]:
print ind.shape
#CHEQUEAR
ind.drop_duplicates(subset = ['CODUSU',
 'NRO_HOGAR',
 'AGLOMERADO',
 'PONDERA',
 'familyRelation',
 'female',
 'age',
 'schooled',
 'schoolYear',
 'finishedYear',
 'lastYear',
 'activity',
 'educLevel',
 'empCond',
 'unempCond',
 'ITF',
 'IPCF',
 'P47T'], inplace = True)


(8360, 19)

In [5]:
print ind.shape


(8347, 19)

In [47]:
print ind.female.value_counts()


2    4447
1    3913
Name: female, dtype: int64

In [48]:
# create a boolean variable for females
#1 = male
#2 = female
ind.female = ind.female == 2

In [ ]:
pd.crosstab(ind.schooled)

In [6]:
#CH10 - ¿Asiste o asistió a algún establecimiento
#educativo (colegio, escuela, universidad)? 
#1 = Si, asiste
#2 = No asiste, pero asistió
#3 = Nunca asistió 
print ind.schooled.value_counts()
ind.schooled.replace(to_replace=[0,9], value=[np.nan,np.nan] , inplace=True, axis=None)  
print ind.schooled.value_counts()


2    5506
1    2376
0     248
3     210
9       7
Name: schooled, dtype: int64
2.0    5506
1.0    2376
3.0     210
Name: schooled, dtype: int64

In [7]:
#CH13 = finishedYear
#¿Finalizó ese nivel?
#1 = Sí
#2 = No
#9 = Ns./Nr. 
print ind.finishedYear.value_counts()
ind.finishedYear.replace(to_replace=[0,9], value=[np.nan,np.nan] , inplace=True, axis=None)  
print ind.finishedYear.value_counts()


2    4043
1    3817
0     465
9      22
Name: finishedYear, dtype: int64
2.0    4043
1.0    3817
Name: finishedYear, dtype: int64

In [8]:
#ESTADO N(1) CONDICIÓN DE ACTIVIDAD
#0 = Entrevista individual no realizada
#1 = Ocupado
#2 = Desocupado
#3 = Inactivo
#4 = Menor de 10 años
print ind.activity.value_counts()
ind.activity.replace(to_replace=0, value=np.nan , inplace=True, axis=None)  
print ind.activity.value_counts()


1    3759
3    3072
4    1175
2     337
0       4
Name: activity, dtype: int64
1.0    3759
3.0    3072
4.0    1175
2.0     337
Name: activity, dtype: int64

In [39]:
#N(1)CAT_INAC = unempCond
#CATEGORÍA DE INACTIVIDAD
#1 = Jubilado / Pensionado
#2 = Rentista
#3 = Estudiante
#4 = Ama de casa
#5 = Menor de 6 años
#6 = Discapacitado
#7 = Otros

print ind.unempCond.value_counts()
ind.unempCond.replace(to_replace=0, value=np.nan , inplace=True, axis=None)  
print ind.unempCond.value_counts()


3.0    1667
1.0    1044
5.0     706
4.0     682
7.0     106
6.0      30
2.0      21
Name: unempCond, dtype: int64
3.0    1667
1.0    1044
5.0     706
4.0     682
7.0     106
6.0      30
2.0      21
Name: unempCond, dtype: int64

In [54]:
#CAT OCUP
#CATEGORÍA OCUPACIONAL (Para ocupados y
#desocupados con ocupación anterior)
#1 = Patrón
#2 = Cuenta propia
#3 = Obrero o empleado
#4 = Trabajador familiar sin remuneración
#9 = Ns./Nr.

print ind.empCond.value_counts()
ind.empCond.replace(to_replace=0, value=np.nan , inplace=True, axis=None)  
print ind.empCond.value_counts()


0    4333
3    3116
2     700
1     176
4      35
Name: empCond, dtype: int64
3.0    3116
2.0     700
1.0     176
4.0      35
Name: empCond, dtype: int64

In [9]:
#NIVEL_ED N(1) NIVEL EDUCATIVO
#1 = Primaria Incompleta (incluye educación especial)
#2 = Primaria Completa
#3 = Secundaria Incompleta
#4 = Secundaria Completa
#5 = Superior Universitaria Incompleta
#6 = Superior Universitaria Completa
#7 = Sin instrucción
#9 = Ns./ Nr.

#we replace 7 (no instruction, with 0 so the variable has an increasing order)
print ind.educLevel.value_counts()
ind.educLevel.replace(to_replace = 7, value = 0, inplace = True, axis = None)
print ind.educLevel.value_counts()


3    1633
4    1415
2    1381
1    1321
6    1000
5     830
7     767
Name: educLevel, dtype: int64
3    1633
4    1415
2    1381
1    1321
6    1000
5     830
0     767
Name: educLevel, dtype: int64

In [98]:
ind.lastYear[(ind.educLevel == 1) & (ind.lastYear < 98)].mean()


Out[98]:
3.110223642172524

In [132]:
ind['primary'] = 0
#ind.Esc1[ind.educLevel == 0] = 0
ind.primary[ind.educLevel > 1] = 7
ind.primary[(ind.educLevel == 1) & (ind.lastYear > 7)] = int(ind.lastYear[(ind.educLevel == 1) & (ind.lastYear < 98)].mean())
ind.primary[(ind.educLevel == 1) & (ind.lastYear <= 7)] = ind.lastYear[(ind.educLevel == 1) & (ind.lastYear <= 7)]


# if educLevel > 1, esc1 = 7
# if educLevel = 0, esc1 = 0
# if educLevel ==1 and
#Esc1 = lastYear, si dicen 98 o 99, mean 
#there are 8 cases with spetial educacion, and two that never finished primary but don't remember their last year. Those are 0


/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [133]:
pd.crosstab(ind.primary,ind.educLevel)


Out[133]:
educLevel 0 1 2 3 4 5 6
primary
0 767 150 0 0 0 0 0
1 0 117 0 0 0 0 0
2 0 232 0 0 0 0 0
3 0 276 0 0 0 0 0
4 0 213 0 0 0 0 0
5 0 196 0 0 0 0 0
6 0 137 0 0 0 0 0
7 0 0 1381 1633 1415 830 1000

In [134]:
pd.crosstab(ind.lastYear,ind.educLevel)


Out[134]:
educLevel 0 1 2 3 5 6
lastYear
0.0 286 140 0 195 180 8
1.0 5 117 0 300 148 4
2.0 2 232 0 410 213 1
3.0 2 214 0 304 135 2
4.0 1 213 0 143 95 0
5.0 0 196 0 13 37 0
6.0 0 137 0 0 4 0
7.0 0 0 10 65 0 0
8.0 0 0 0 116 1 0
98.0 0 33 0 0 0 0
99.0 3 29 1 31 17 0

In [138]:
ind['secondary'] = 999

#dont know their last level
ind.secondary[ind.schoolYear == 99] = 0

#dont know if they went to school
ind.secondary[ind.schooled == 0] = 0

#bellow uncomplete secondary
ind.secondary[ind.educLevel < 3] = 0

#above uncomplete secondary
ind.secondary[ind.educLevel > 3] = 5

#special cases (another school system with 7,8,9 grades)

ind.secondary[(ind.educLevel == 3) & (ind.lastYear == 7)] = 1
ind.secondary[(ind.educLevel == 3) & (ind.lastYear == 8)] = 2
ind.secondary[(ind.educLevel == 3) & (ind.lastYear == 9)] = 3

#error, finished 9 grade EGB system
ind.secondary[(ind.schoolYear == 3) & (ind.finishedYear == 1)] = 3

#they get their last aproved year
ind.secondary[(ind.educLevel == 3) & (ind.lastYear <= 5)] = ind.lastYear[(ind.educLevel == 3) & (ind.lastYear <= 5)]

#dont know their last aproved year, so they get the mean
ind.secondary[(ind.educLevel == 3) & (ind.lastYear > 9)] = int(ind.lastYear[(ind.educLevel == 3) & (ind.lastYear < 98)].mean())


/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:22: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [139]:
pd.crosstab(ind.secondary,ind.educLevel)


Out[139]:
educLevel 0 1 2 3 4 5 6
secondary
0 767 1321 1381 198 0 0 0
1 0 0 0 365 0 0 0
2 0 0 0 557 0 0 0
3 0 0 0 354 0 0 0
4 0 0 0 143 0 0 0
5 0 0 0 13 1415 830 1000
999 0 0 0 3 0 0 0

In [147]:
#SEGUIR DESPUES ACA
ind.schoolYear[ind.secondary == 999].value_counts()


Out[147]:
4    2
2    1
Name: schoolYear, dtype: int64

In [128]:
list(ind.columns)


Out[128]:
['CODUSU',
 'NRO_HOGAR',
 'COMPONENTE',
 'AGLOMERADO',
 'PONDERA',
 'familyRelation',
 'female',
 'age',
 'schooled',
 'schoolYear',
 'finishedYear',
 'lastYear',
 'activity',
 'educLevel',
 'empCond',
 'unempCond',
 'ITF',
 'IPCF',
 'P47T',
 'primary',
 'secondary']

In [129]:
print ind.shape
print ind.drop_duplicates(subset = ['CODUSU',
 'NRO_HOGAR',
 'AGLOMERADO',
 'PONDERA',
 'familyRelation',
 'female',
 'age',
 'schooled',
 'schoolYear',
 'finishedYear',
 'lastYear',
 'activity',
 'educLevel',
 'empCond',
 'unempCond',
 'ITF',
 'IPCF',
 'P47T',
 'primary',
 'secondary']
                         ).shape


(8360, 21)
(8347, 21)

In [ ]:
# anos de escolaridad por nivel
# funcion en R https://github.com/alephcero/incomeMapBuenosAires/blob/master/src/schoolYears.R


# ver la curva de ingresos segun edad y la curva de ingresos segun anos de escolaridad, partirla en 3 o usar x y x2
#construir igual anos de escolaridad


# variables dummy para cada grupo de edad 

#r1. Esc_1 + r2. Esc_2 + r3. Esc_3 +  v_14a24 GBA + v_25a34GBA + m_14a24 GBA + m_25a34GBA  +  m_35ymás GBA +  



#http://stackoverflow.com/questions/26777832/replicating-rows-in-a-pandas-data-frame-by-a-column-value/26778637#26778637
ind = indNoW.loc[np.repeat(indNoW.index.values,indNoW.PONDERA)]
ind.shape