In [1]:
import pandas as pd
import numpy as np
import os
import sys
import simpledbf

In [2]:
trimestre = 't310'
dbf = simpledbf.Dbf5('data/Individual_' + trimestre + '.dbf',codec='latin1')
indRaw = dbf.to_dataframe()

In [3]:
indRaw.head()


Out[3]:
CODUSU NRO_HOGAR COMPONENTE H15 ANO4 TRIMESTRE REGION MAS_500 AGLOMERADO PONDERA ... DECCFR IDECCFR RDECCFR GDECCFR PDECCFR ADECCFR PJ1_1 PJ2_1 PJ3_1 IDIMPP
0 301358 1 1 1 2010 3 43 S 2 647 ... 02 02 02 02 NaN 01 0 0 0 00010
1 304669 1 1 1 2010 3 43 S 2 417 ... 04 04 03 03 NaN 03 0 0 0 10000
2 304669 1 2 1 2010 3 43 S 2 417 ... 04 04 03 03 NaN 03 0 0 0 10000
3 304669 1 3 1 2010 3 43 S 2 417 ... 04 04 03 03 NaN 03 0 0 0 00000
4 302366 1 1 1 2010 3 43 S 2 715 ... 10 10 10 10 NaN 10 0 0 0 00000

5 rows × 176 columns


In [4]:
mask = indRaw.NRO_HOGAR > 1
hog2 = indRaw.loc[mask,:]
print hog2.shape
hog2.head()


(2165, 176)
Out[4]:
CODUSU NRO_HOGAR COMPONENTE H15 ANO4 TRIMESTRE REGION MAS_500 AGLOMERADO PONDERA ... DECCFR IDECCFR RDECCFR GDECCFR PDECCFR ADECCFR PJ1_1 PJ2_1 PJ3_1 IDIMPP
24 131949 2 1 1 2010 3 43 S 2 989 ... 09 09 09 09 NaN 08 0 0 0 10000
53 268997 3 1 1 2010 3 43 S 2 640 ... 07 07 06 06 NaN 06 0 0 0 00000
79 306113 2 1 1 2010 3 43 S 2 590 ... 08 08 08 08 NaN 06 0 0 0 00030
80 306113 2 2 1 2010 3 43 S 2 590 ... 08 08 08 08 NaN 06 0 0 0 00010
84 301045 2 1 1 2010 3 43 S 2 480 ... 09 10 10 09 NaN 09 0 0 0 00000

5 rows × 176 columns


In [8]:
hog2.PP04B1.value_counts()


Out[8]:
0    1249
2     784
1     132
Name: PP04B1, dtype: int64

In [11]:
indRaw.CH03.value_counts()


Out[11]:
3     24497
1     18045
2     10630
5      3345
9       890
4       834
8       712
6       430
10      347
7       205
Name: CH03, dtype: int64

In [9]:
hog2.CH03.value_counts()


Out[9]:
3     829
1     788
2     405
8      43
9      32
10     21
5      20
6      11
4      11
7       5
Name: CH03, dtype: int64

In [27]:
hog2.query('ESTADO == 1').PP04B1.value_counts()


Out[27]:
2    784
1    132
0      1
Name: PP04B1, dtype: int64

In [13]:
hog = pd.read_csv('data/cleanDataHouseholdt310.csv')

In [14]:
hog.head()


Out[14]:
CODUSU NRO_HOGAR REGION PONDERA HomeType HomeTypeesp RoomsNumber FloorMaterial FloorMaterialesp RoofMaterial ... HouseMembers Memberless10 Membermore10 TotalHouseHoldIncome DomesticService1 DomesticService2 DomesticService3 DomesticService4 DomesticService5 DomesticService6
0 302468 1 1 1287 2 NaN 2 1 NaN 9 ... 2 0 2 4000.0 1 0 2 0 0 0
1 307861 1 1 1674 2 NaN 2 1 NaN 1 ... 4 1 3 5800.0 1 2 98 0 0 0
2 308762 1 1 1522 2 NaN 4 1 NaN 9 ... 1 0 1 3200.0 1 0 98 0 0 0
3 308278 1 1 1320 2 NaN 3 1 NaN 9 ... 2 0 2 10000.0 2 0 96 0 0 0
4 311937 1 1 1281 2 NaN 4 1 NaN 1 ... 4 0 4 11000.0 2 4 96 0 0 0

5 rows × 44 columns


In [35]:
hog2.head()


Out[35]:
CODUSU NRO_HOGAR COMPONENTE H15 ANO4 TRIMESTRE REGION MAS_500 AGLOMERADO PONDERA ... DECCFR IDECCFR RDECCFR GDECCFR PDECCFR ADECCFR PJ1_1 PJ2_1 PJ3_1 IDIMPP
24 131949 2 1 1 2010 3 43 S 2 989 ... 09 09 09 09 NaN 08 0 0 0 10000
53 268997 3 1 1 2010 3 43 S 2 640 ... 07 07 06 06 NaN 06 0 0 0 00000
79 306113 2 1 1 2010 3 43 S 2 590 ... 08 08 08 08 NaN 06 0 0 0 00030
80 306113 2 2 1 2010 3 43 S 2 590 ... 08 08 08 08 NaN 06 0 0 0 00010
84 301045 2 1 1 2010 3 43 S 2 480 ... 09 10 10 09 NaN 09 0 0 0 00000

5 rows × 176 columns


In [57]:
cantPersonas = hog2.CODUSU.groupby(by = hog2.CODUSU).count()

In [58]:
cantPersonas = pd.DataFrame(cantPersonas)
cantPersonas.columns = ['cantPers']
cantPersonas['CODUSU'] = cantPersonas.index
cantPersonas.columns


Out[58]:
Index([u'cantPers', u'CODUSU'], dtype='object')

In [48]:
hog2.head()


Out[48]:
CODUSU NRO_HOGAR COMPONENTE H15 ANO4 TRIMESTRE REGION MAS_500 AGLOMERADO PONDERA ... DECCFR IDECCFR RDECCFR GDECCFR PDECCFR ADECCFR PJ1_1 PJ2_1 PJ3_1 IDIMPP
24 131949 2 1 1 2010 3 43 S 2 989 ... 09 09 09 09 NaN 08 0 0 0 10000
53 268997 3 1 1 2010 3 43 S 2 640 ... 07 07 06 06 NaN 06 0 0 0 00000
79 306113 2 1 1 2010 3 43 S 2 590 ... 08 08 08 08 NaN 06 0 0 0 00030
80 306113 2 2 1 2010 3 43 S 2 590 ... 08 08 08 08 NaN 06 0 0 0 00010
84 301045 2 1 1 2010 3 43 S 2 480 ... 09 10 10 09 NaN 09 0 0 0 00000

5 rows × 176 columns


In [59]:
mergeData = pd.merge(left = hog2, right=cantPersonas, how = 'left', on = 'CODUSU')

In [61]:
pd.crosstab(mergeData.NRO_HOGAR,mergeData.cantPers)


Out[61]:
cantPers 1 2 3 4 5 6 7 8 9 10
NRO_HOGAR
2 184 305 451 488 245 154 107 34 27 16
3 15 14 41 12 10 20 12 6 9 4
4 1 1 3 4 0 0 0 0 0 0
5 2 0 0 0 0 0 0 0 0 0

In [62]:
pd.crosstab(mergeData.NRO_HOGAR,mergeData.CH03)


Out[62]:
CH03 1 2 3 4 5 6 7 8 9 10
NRO_HOGAR
2 719 380 774 10 19 11 5 42 30 21
3 61 24 53 1 1 0 0 1 2 0
4 6 1 2 0 0 0 0 0 0 0
5 2 0 0 0 0 0 0 0 0 0

In [ ]: