In [2]:
import os
import sys
import urllib.request
import zipfile

import numpy as np
import pandas as pd
import simpledbf

In [3]:
# Download the zip archive at `url` into data/ and extract it to data/ephDBF.
# The data/ folder is created when missing, so no manual setup is required
# (the previous shell-based version renamed the archive to a *file* called
# "data" when the folder did not exist).
url = 'http://www.indec.gob.ar/ftp/cuadros/menusuperior/eph/t310_dbf.zip'
zipPath = os.path.join('data', 't310_dbf.zip')
extractDir = os.path.join('data', 'ephDBF')

os.makedirs('data', exist_ok=True)

# Skip the download when the archive is already present so that
# Restart & Run All stays cheap and does not hit the network again.
if not os.path.exists(zipPath):
    # Download to a temporary name first: an interrupted transfer must not
    # leave a truncated file that later runs would mistake for the archive.
    partialPath = zipPath + '.part'
    urllib.request.urlretrieve(url, partialPath)
    os.replace(partialPath, zipPath)

# extractall creates extractDir if needed and overwrites existing members
# without prompting, so the cell can be re-run safely.
with zipfile.ZipFile(zipPath) as archive:
    archive.extractall(extractDir)


Out[3]:
0

In [17]:
# Parse the extracted individual-level DBF table (decoded as latin1) and
# materialise it as a pandas DataFrame; the last line previews the first rows.
dbf = simpledbf.Dbf5(
    'data/ephDBF/Individual_t310.dbf',
    codec='latin1',
)
indRaw = dbf.to_dataframe()
indRaw.head()


Out[17]:
CODUSU NRO_HOGAR COMPONENTE H15 ANO4 TRIMESTRE REGION MAS_500 AGLOMERADO PONDERA ... DECCFR IDECCFR RDECCFR GDECCFR PDECCFR ADECCFR PJ1_1 PJ2_1 PJ3_1 IDIMPP
0 301358 1 1 1 2010 3 43 S 2 647 ... 02 02 02 02 NaN 01 0 0 0 00010
1 304669 1 1 1 2010 3 43 S 2 417 ... 04 04 03 03 NaN 03 0 0 0 10000
2 304669 1 2 1 2010 3 43 S 2 417 ... 04 04 03 03 NaN 03 0 0 0 10000
3 304669 1 3 1 2010 3 43 S 2 417 ... 04 04 03 03 NaN 03 0 0 0 00000
4 302366 1 1 1 2010 3 43 S 2 715 ... 10 10 10 10 NaN 10 0 0 0 00000

5 rows × 176 columns


In [18]:
# Keep only the rows whose REGION code equals 1 and the 19 columns used by
# the rest of the notebook.
# NOTE(review): the meaning of REGION == 1 is not documented here — confirm
# against the survey codebook.
keepCols = [
    'CODUSU', 'NRO_HOGAR', 'COMPONENTE', 'AGLOMERADO', 'PONDERA',
    'CH03', 'CH04', 'CH06', 'CH10', 'CH12', 'CH13', 'CH14',
    'ESTADO', 'NIVEL_ED', 'CAT_OCUP', 'CAT_INAC',
    'ITF', 'IPCF', 'P47T',
]
inRegion = indRaw['REGION'] == 1
indNoW = indRaw.loc[inRegion, keepCols]
indNoW.head()


Out[18]:
CODUSU NRO_HOGAR COMPONENTE AGLOMERADO PONDERA CH03 CH04 CH06 CH10 CH12 CH13 CH14 ESTADO NIVEL_ED CAT_OCUP CAT_INAC ITF IPCF P47T
44316 302468 1 1 32 1287 1 2 20 1 7 2 01 3 5 0 3 4000 2000.0 2000
44317 302468 1 2 32 1287 10 2 20 1 6 2 01 3 5 0 3 4000 2000.0 2000
44318 307861 1 1 32 1674 1 1 42 2 2 1 NaN 1 2 3 0 5800 1450.0 3000
44319 307861 1 2 32 1674 2 2 44 2 7 1 NaN 1 6 3 0 5800 1450.0 2800
44320 307861 1 3 32 1674 3 1 13 1 4 2 00 3 3 0 3 5800 1450.0 0

In [19]:
# Give the coded survey variables descriptive names and renumber the rows
# from 0. An explicit old -> new mapping is used instead of assigning a
# positional list to `.columns`, so a change in the column selection order
# can never silently attach a label to the wrong data. Columns not listed
# here (CODUSU, NRO_HOGAR, COMPONENTE, AGLOMERADO, PONDERA, ITF, IPCF, P47T)
# keep their original names.
# NOTE(review): 'female' holds the raw CH04 code (values 1 and 2 appear in
# the preview), not a boolean — confirm the coding against the codebook.
columnNames = {
    'CH03': 'familyRelation',
    'CH04': 'female',
    'CH06': 'age',
    'CH10': 'schooled',
    'CH12': 'schoolYear',
    'CH13': 'finishedYear',
    'CH14': 'lastYear',
    'ESTADO': 'activity',
    'NIVEL_ED': 'educLevel',
    'CAT_OCUP': 'empCond',
    'CAT_INAC': 'unempCond',
}

# rename() ignores keys that are no longer present, so re-running this cell
# is a no-op; reset_index(drop=True) discards the row labels inherited from
# indRaw and replaces them with 0..n-1.
indNoW = indNoW.rename(columns=columnNames).reset_index(drop=True)

indNoW.head()


Out[19]:
CODUSU NRO_HOGAR COMPONENTE AGLOMERADO PONDERA familyRelation female age schooled schoolYear finishedYear lastYear activity educLevel empCond unempCond ITF IPCF P47T
0 302468 1 1 32 1287 1 2 20 1 7 2 01 3 5 0 3 4000 2000.0 2000
1 302468 1 2 32 1287 10 2 20 1 6 2 01 3 5 0 3 4000 2000.0 2000
2 307861 1 1 32 1674 1 1 42 2 2 1 NaN 1 2 3 0 5800 1450.0 3000
3 307861 1 2 32 1674 2 2 44 2 7 1 NaN 1 6 3 0 5800 1450.0 2800
4 307861 1 3 32 1674 3 1 13 1 4 2 00 3 3 0 3 5800 1450.0 0

In [20]:
# Persist the cleaned table for later steps; the row index is not written.
indNoW.to_csv(path_or_buf='data/cleanData.csv', index=False)

In [ ]: