notebook.community

Edit and run



In [2]:

    
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline



In [3]:

    
ind = pd.read_csv('data/cleanData.csv')
ind.head()









    Out[3]:






  
    
      
      CODUSU
      NRO_HOGAR
      COMPONENTE
      AGLOMERADO
      PONDERA
      familyRelation
      female
      age
      schooled
      schoolYear
      finishedYear
      lastYear
      activity
      educLevel
      empCond
      unempCond
      ITF
      IPCF
      P47T
    
  
  
    
      0
      302468
      1
      1
      32
      1287
      1
      2
      20
      1
      7
      2
      1.0
      3
      5
      0
      3
      4000
      2000.0
      2000
    
    
      1
      302468
      1
      2
      32
      1287
      10
      2
      20
      1
      6
      2
      1.0
      3
      5
      0
      3
      4000
      2000.0
      2000
    
    
      2
      307861
      1
      1
      32
      1674
      1
      1
      42
      2
      2
      1
      NaN
      1
      2
      3
      0
      5800
      1450.0
      3000
    
    
      3
      307861
      1
      2
      32
      1674
      2
      2
      44
      2
      7
      1
      NaN
      1
      6
      3
      0
      5800
      1450.0
      2800
    
    
      4
      307861
      1
      3
      32
      1674
      3
      1
      13
      1
      4
      2
      0.0
      3
      3
      0
      3
      5800
      1450.0
      0



In [4]:

    
print ind.shape
#CHEQUEAR
ind.drop_duplicates(subset = ['CODUSU',
 'NRO_HOGAR',
 'AGLOMERADO',
 'PONDERA',
 'familyRelation',
 'female',
 'age',
 'schooled',
 'schoolYear',
 'finishedYear',
 'lastYear',
 'activity',
 'educLevel',
 'empCond',
 'unempCond',
 'ITF',
 'IPCF',
 'P47T'], inplace = True)



In [5]:

    
print ind.shape



In [47]:

    
print ind.female.value_counts()









    



2    4447
1    3913
Name: female, dtype: int64



In [48]:

    
# create a boolean variable for females
#1 = male
#2 = female
ind.female = ind.female == 2



In [ ]:

    
pd.crosstab(ind.schooled)



In [6]:

    
#CH10 - ¿Asiste o asistió a algún establecimiento
#educativo (colegio, escuela, universidad)? 
#1 = Si, asiste
#2 = No asiste, pero asistió
#3 = Nunca asistió 
print ind.schooled.value_counts()
ind.schooled.replace(to_replace=[0,9], value=[np.nan,np.nan] , inplace=True, axis=None)  
print ind.schooled.value_counts()









    



2    5506
1    2376
0     248
3     210
9       7
Name: schooled, dtype: int64
2.0    5506
1.0    2376
3.0     210
Name: schooled, dtype: int64



In [7]:

    
#CH13 = finishedYear
#¿Finalizó ese nivel?
#1 = Sí
#2 = No
#9 = Ns./Nr. 
print ind.finishedYear.value_counts()
ind.finishedYear.replace(to_replace=[0,9], value=[np.nan,np.nan] , inplace=True, axis=None)  
print ind.finishedYear.value_counts()









    



2    4043
1    3817
0     465
9      22
Name: finishedYear, dtype: int64
2.0    4043
1.0    3817
Name: finishedYear, dtype: int64



In [8]:

    
#ESTADO N(1) CONDICIÓN DE ACTIVIDAD
#0 = Entrevista individual no realizada
#1 = Ocupado
#2 = Desocupado
#3 = Inactivo
#4 = Menor de 10 años
print ind.activity.value_counts()
ind.activity.replace(to_replace=0, value=np.nan , inplace=True, axis=None)  
print ind.activity.value_counts()









    



1    3759
3    3072
4    1175
2     337
0       4
Name: activity, dtype: int64
1.0    3759
3.0    3072
4.0    1175
2.0     337
Name: activity, dtype: int64



In [39]:

    
#N(1)CAT_INAC = unempCond
#CATEGORÍA DE INACTIVIDAD
#1 = Jubilado / Pensionado
#2 = Rentista
#3 = Estudiante
#4 = Ama de casa
#5 = Menor de 6 años
#6 = Discapacitado
#7 = Otros

print ind.unempCond.value_counts()
ind.unempCond.replace(to_replace=0, value=np.nan , inplace=True, axis=None)  
print ind.unempCond.value_counts()









    



3.0    1667
1.0    1044
5.0     706
4.0     682
7.0     106
6.0      30
2.0      21
Name: unempCond, dtype: int64
3.0    1667
1.0    1044
5.0     706
4.0     682
7.0     106
6.0      30
2.0      21
Name: unempCond, dtype: int64



In [54]:

    
#CAT OCUP
#CATEGORÍA OCUPACIONAL (Para ocupados y
#desocupados con ocupación anterior)
#1 = Patrón
#2 = Cuenta propia
#3 = Obrero o empleado
#4 = Trabajador familiar sin remuneración
#9 = Ns./Nr.

print ind.empCond.value_counts()
ind.empCond.replace(to_replace=0, value=np.nan , inplace=True, axis=None)  
print ind.empCond.value_counts()









    



0    4333
3    3116
2     700
1     176
4      35
Name: empCond, dtype: int64
3.0    3116
2.0     700
1.0     176
4.0      35
Name: empCond, dtype: int64



In [9]:

    
#NIVEL_ED N(1) NIVEL EDUCATIVO
#1 = Primaria Incompleta (incluye educación especial)
#2 = Primaria Completa
#3 = Secundaria Incompleta
#4 = Secundaria Completa
#5 = Superior Universitaria Incompleta
#6 = Superior Universitaria Completa
#7 = Sin instrucción
#9 = Ns./ Nr.

#we replace 7 (no instruction, with 0 so the variable has an increasing order)
print ind.educLevel.value_counts()
ind.educLevel.replace(to_replace = 7, value = 0, inplace = True, axis = None)
print ind.educLevel.value_counts()









    



3    1633
4    1415
2    1381
1    1321
6    1000
5     830
7     767
Name: educLevel, dtype: int64
3    1633
4    1415
2    1381
1    1321
6    1000
5     830
0     767
Name: educLevel, dtype: int64



In [98]:

    
ind.lastYear[(ind.educLevel == 1) & (ind.lastYear < 98)].mean()









    Out[98]:





3.110223642172524



In [132]:

    
ind['primary'] = 0
#ind.Esc1[ind.educLevel == 0] = 0
ind.primary[ind.educLevel > 1] = 7
ind.primary[(ind.educLevel == 1) & (ind.lastYear > 7)] = int(ind.lastYear[(ind.educLevel == 1) & (ind.lastYear < 98)].mean())
ind.primary[(ind.educLevel == 1) & (ind.lastYear <= 7)] = ind.lastYear[(ind.educLevel == 1) & (ind.lastYear <= 7)]


# if educLevel > 1, esc1 = 7
# if educLevel = 0, esc1 = 0
# if educLevel ==1 and
#Esc1 = lastYear, si dicen 98 o 99, mean 
#there are 8 cases with spetial educacion, and two that never finished primary but don't remember their last year. Those are 0









    



/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [133]:

    
pd.crosstab(ind.primary,ind.educLevel)



In [134]:

    
pd.crosstab(ind.lastYear,ind.educLevel)



In [138]:

    
ind['secondary'] = 999

#dont know their last level
ind.secondary[ind.schoolYear == 99] = 0

#dont know if they went to school
ind.secondary[ind.schooled == 0] = 0

#bellow uncomplete secondary
ind.secondary[ind.educLevel < 3] = 0

#above uncomplete secondary
ind.secondary[ind.educLevel > 3] = 5

#special cases (another school system with 7,8,9 grades)

ind.secondary[(ind.educLevel == 3) & (ind.lastYear == 7)] = 1
ind.secondary[(ind.educLevel == 3) & (ind.lastYear == 8)] = 2
ind.secondary[(ind.educLevel == 3) & (ind.lastYear == 9)] = 3

#error, finished 9 grade EGB system
ind.secondary[(ind.schoolYear == 3) & (ind.finishedYear == 1)] = 3

#they get their last aproved year
ind.secondary[(ind.educLevel == 3) & (ind.lastYear <= 5)] = ind.lastYear[(ind.educLevel == 3) & (ind.lastYear <= 5)]

#dont know their last aproved year, so they get the mean
ind.secondary[(ind.educLevel == 3) & (ind.lastYear > 9)] = int(ind.lastYear[(ind.educLevel == 3) & (ind.lastYear < 98)].mean())









    



/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:17: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:22: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [139]:

    
pd.crosstab(ind.secondary,ind.educLevel)



In [147]:

    
#SEGUIR DESPUES ACA
ind.schoolYear[ind.secondary == 999].value_counts()









    Out[147]:





4    2
2    1
Name: schoolYear, dtype: int64



In [128]:

    
list(ind.columns)









    Out[128]:





['CODUSU',
 'NRO_HOGAR',
 'COMPONENTE',
 'AGLOMERADO',
 'PONDERA',
 'familyRelation',
 'female',
 'age',
 'schooled',
 'schoolYear',
 'finishedYear',
 'lastYear',
 'activity',
 'educLevel',
 'empCond',
 'unempCond',
 'ITF',
 'IPCF',
 'P47T',
 'primary',
 'secondary']



In [129]:

    
print ind.shape
print ind.drop_duplicates(subset = ['CODUSU',
 'NRO_HOGAR',
 'AGLOMERADO',
 'PONDERA',
 'familyRelation',
 'female',
 'age',
 'schooled',
 'schoolYear',
 'finishedYear',
 'lastYear',
 'activity',
 'educLevel',
 'empCond',
 'unempCond',
 'ITF',
 'IPCF',
 'P47T',
 'primary',
 'secondary']
                         ).shape









    



(8360, 21)
(8347, 21)



In [ ]:

    
# anos de escolaridad por nivel
# funcion en R https://github.com/alephcero/incomeMapBuenosAires/blob/master/src/schoolYears.R


# ver la curva de ingresos segun edad y la curva de ingresos segun anos de escolaridad, partirla en 3 o usar x y x2
#construir igual anos de escolaridad


# variables dummy para cada grupo de edad 

#r1. Esc_1 + r2. Esc_2 + r3. Esc_3 +  v_14a24 GBA + v_25a34GBA + m_14a24 GBA + m_25a34GBA  +  m_35ymás GBA +  



#http://stackoverflow.com/questions/26777832/replicating-rows-in-a-pandas-data-frame-by-a-column-value/26778637#26778637
ind = indNoW.loc[np.repeat(indNoW.index.values,indNoW.PONDERA)]
ind.shape

	CODUSU	NRO_HOGAR	COMPONENTE	AGLOMERADO	PONDERA	familyRelation	female	age	schooled	schoolYear	finishedYear	lastYear	activity	educLevel	empCond	unempCond	ITF	IPCF	P47T
0	302468	1	1	32	1287	1	2	20	1	7	2	1.0	3	5	0	3	4000	2000.0	2000
1	302468	1	2	32	1287	10	2	20	1	6	2	1.0	3	5	0	3	4000	2000.0	2000
2	307861	1	1	32	1674	1	1	42	2	2	1	NaN	1	2	3	0	5800	1450.0	3000
3	307861	1	2	32	1674	2	2	44	2	7	1	NaN	1	6	3	0	5800	1450.0	2800
4	307861	1	3	32	1674	3	1	13	1	4	2	0.0	3	3	0	3	5800	1450.0	0

educLevel	0	1	2	3	4	5	6
primary
0	767	150	0	0	0	0	0
1	0	117	0	0	0	0	0
2	0	232	0	0	0	0	0
3	0	276	0	0	0	0	0
4	0	213	0	0	0	0	0
5	0	196	0	0	0	0	0
6	0	137	0	0	0	0	0
7	0	0	1381	1633	1415	830	1000

educLevel	0	1	2	3	5	6
lastYear
0.0	286	140	0	195	180	8
1.0	5	117	0	300	148	4
2.0	2	232	0	410	213	1
3.0	2	214	0	304	135	2
4.0	1	213	0	143	95	0
5.0	0	196	0	13	37	0
6.0	0	137	0	0	4	0
7.0	0	0	10	65	0	0
8.0	0	0	0	116	1	0
98.0	0	33	0	0	0	0
99.0	3	29	1	31	17	0

educLevel	0	1	2	3	4	5	6
secondary
0	767	1321	1381	198	0	0	0
1	0	0	0	365	0	0	0
2	0	0	0	557	0	0	0
3	0	0	0	354	0	0	0
4	0	0	0	143	0	0	0
5	0	0	0	13	1415	830	1000
999	0	0	0	3	0	0	0