In [1]:
import pandas as pd
import pysal as ps
import numpy as np
from jenks import jenks
import pandas as pd
import geopandas as gpd
#taken from https://github.com/perrygeo/jenks

'''
Arguments
---------
dbfile  : DBF file - Input to be imported
upper   : Condition - If true, make column heads upper case
'''
def dbf2DF(dbfile, upper=True): #Reads in DBF files and returns Pandas DF
    db = ps.open(dbfile) #Pysal to open DBF
    d = {col: db.by_col(col) for col in db.header} #Convert dbf to dictionary
    pandasDF = pd.DataFrame(db[:]) #Convert to Pandas DF
    pandasDF.columns=db.header
    pandasDF.dropna(inplace=True)
    return pandasDF

def getPercent(dataset,string):
    '''
    Arguments
    ---------
    dataset  : pandas datafra- Input to produce the % column on
    string   : String or list of strings - variable or variables to convert to %
    '''
    #remove rows with zeros in total column
    mask = (dataset.iloc[:,dataset.shape[1]-1] == 0)
    dataset = dataset.loc[~mask,:]
    variable = np.zeros(dataset.shape[0])
    
    #if me need to aggregate more columns in the same category:
    if len(string) > 1:
        variable = dataset.loc[:,string].sum(axis=1)
        
    else:
        variable = dataset[string].iloc[:,0]
    
    newVariableName = string[0].split('_')[0]
    
    dataset[newVariableName + '_p'] = variable/dataset.iloc[:,dataset.shape[1]-1] * 100
    dataset[newVariableName + '_p'] = dataset[newVariableName + '_p'].map(lambda x: round(x,1))
    dataset = dataset.loc[:,['REDCODE',newVariableName + '_p']]
    dataset.dropna(inplace=True)
    try:
        dataset[newVariableName + '_q'] = pd.qcut(dataset[newVariableName + '_p'], 10,labels=False)
        dataset[newVariableName + '_q'] = dataset[newVariableName + '_q'].map(lambda x: int(x))
    except ValueError:
        print 'Can not be cut in 10 quantiles. All values are set to 0'
        dataset[newVariableName + '_q'] = 0
        
    bins = jenks(list(dataset[newVariableName + '_p']), 10)
    dataset[newVariableName + '_j'] = pd.cut(dataset[newVariableName + '_p'],bins,labels = range(10))
    dataset[newVariableName + '_j'].fillna(0,inplace=True) #NaNs are because 0 percent gets out of jenks. This belong to the first jenks group

    return dataset

Cargo carto


In [2]:
#radios de caba y pba sacados de cartografia oficial de indec
mapa = gpd.GeoDataFrame.from_file('data/shapes/agba/agba.shp')
print mapa.shape
mapa.head()


(13446, 14)
Out[2]:
AREA DEPARTAMEN DEPTO DEPTO_NAME FRAC LINK PERIMETER PROV RADIO RADPAIS_ RADPAIS_ID REDCODE TIPO geometry
0 16948780.0 06252 252 ESCOBAR 01 062520101 26902.490 06 01 26690 26690 062520101 M POLYGON ((4158793.5 6185928, 4158824.25 618588...
1 12351870.0 06252 252 ESCOBAR 01 062520102 30469.020 06 02 26706 26707 062520102 M POLYGON ((4159777 6185123.5, 4159847 6185160, ...
2 11063170.0 06252 252 ESCOBAR 02 062520208 25350.040 06 08 26721 26723 062520208 U POLYGON ((4161392.5 6184536.5, 4161399.75 6184...
3 796720.3 06252 252 ESCOBAR 02 062520201 6843.383 06 01 26743 26743 062520201 U POLYGON ((4164165.5 6183091, 4164223.5 6183021...
4 2996825.0 06252 252 ESCOBAR 02 062520202 8393.760 06 02 26749 26749 062520202 U POLYGON ((4166774.25 6182343.5, 4166960.5 6182...

Cargo data

Educacion


In [3]:
educ = dbf2DF('data/redatam/MNI.DBF')
educ.head()


Out[3]:
REDCODE MNI_1 MNI_2 MNI_3 MNI_4 MNI_5 MNI_6 MNI_7 MNI_8 MNI_9 MNI_T
0 020010101 1 9 6 17 11 4 1 3 3 55
1 020010201 0 49 22 31 37 3 4 4 8 158
2 020010202 1 36 19 28 32 1 6 2 0 125
3 020010203 0 29 12 23 25 2 4 1 5 101
4 020010204 1 39 25 34 47 3 2 0 7 158

In [4]:
#8 es universitario completo
educ = getPercent(dataset = educ,string = ['MNI_8'])
educ.head()


/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:45: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[4]:
REDCODE MNI_p MNI_q MNI_j
0 020010101 5.5 5 1
1 020010201 2.5 3 0
2 020010202 1.6 2 0
3 020010203 1.0 2 0
4 020010204 0.0 0 0

In [5]:
print educ.REDCODE.isin(mapa.REDCODE).sum()*1./educ.shape[0]
print mapa.REDCODE.isin(educ.REDCODE).sum()*1./mapa.shape[0]
mapa = mapa.merge(educ,on='REDCODE')


0.998588095415
0.999553770638

Celular


In [6]:
cell = dbf2DF('data/redatam/CELULAR.DBF')
cell.head()


Out[6]:
REDCODE H2819C_1 H2819C_2 H2819C_T
0 020010101 47 10 57
1 020010201 121 40 161
2 020010202 115 13 128
3 020010203 85 19 104
4 020010204 140 23 163

In [7]:
cell = getPercent(dataset = cell,string = ['H2819C_1'])
cell.head()


/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:45: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[7]:
REDCODE H2819C_p H2819C_q H2819C_j
0 020010101 82.5 1 4
1 020010201 75.2 0 2
2 020010202 89.8 8 7
3 020010203 81.7 1 3
4 020010204 85.9 4 5

In [8]:
print cell.REDCODE.isin(mapa.REDCODE).sum()*1./cell.shape[0]
print mapa.REDCODE.isin(cell.REDCODE).sum()*1./mapa.shape[0]
mapa = mapa.merge(cell,on='REDCODE')


0.998513895081
1.0

Computadora


In [9]:
comp = dbf2DF('data/redatam/COMPUTER.DBF')
comp.head()


Out[9]:
REDCODE H2819B_1 H2819B_2 H2819B_T
0 020010101 25 32 57
1 020010201 42 119 161
2 020010202 24 104 128
3 020010203 43 61 104
4 020010204 67 96 163

In [10]:
comp = getPercent(dataset = comp,string = ['H2819B_1'])
comp.head()


/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:45: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[10]:
REDCODE H2819B_p H2819B_q H2819B_j
0 020010101 43.9 3 3
1 020010201 26.1 0 1
2 020010202 18.8 0 0
3 020010203 41.3 2 3
4 020010204 41.1 2 3

In [11]:
print comp.REDCODE.isin(mapa.REDCODE).sum()*1./comp.shape[0]
print mapa.REDCODE.isin(comp.REDCODE).sum()*1./mapa.shape[0]
mapa = mapa.merge(comp,on='REDCODE')


0.998513895081
1.0

Viviendas vacias


In [12]:
empty = dbf2DF('data/redatam/EMPTY.DBF')
empty.head()


Out[12]:
REDCODE V02_1 V02_2 V02_3 V02_4 V02_5 V02_6 V02_7 V02_T
0 020010101 35 10 4 0 1 1 0 51
1 020010201 134 5 0 0 0 0 9 148
2 020010202 114 6 0 0 0 0 0 120
3 020010203 98 5 0 0 0 0 1 104
4 020010204 110 3 0 0 0 0 6 119

In [13]:
empty = getPercent(dataset = empty,string = ['V02_3','V02_4','V02_5','V02_6','V02_7'])
empty.head()


/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:45: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[13]:
REDCODE V02_p V02_q V02_j
0 020010101 11.8 7 3
1 020010201 6.1 4 1
2 020010202 0.0 0 0
3 020010203 1.0 0 0
4 020010204 5.0 3 1

In [14]:
print empty.REDCODE.isin(mapa.REDCODE).sum()*1./empty.shape[0]
print mapa.REDCODE.isin(empty.REDCODE).sum()*1./mapa.shape[0]
mapa = mapa.merge(empty,on='REDCODE')


0.998217203982
1.0

Inmigracion


In [15]:
inmi = dbf2DF('data/redatam/INMIGRATION.DBF')
inmi.head()


Out[15]:
REDCODE P05_1 P05_2 P05_T
0 020010101 61 30 91
1 020010201 123 38 161
2 020010202 93 39 132
3 020010203 85 19 104
4 020010204 139 25 164

In [16]:
inmi = getPercent(dataset = inmi,string = ['P05_1'])
inmi.head()


/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:45: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[16]:
REDCODE P05_p P05_q P05_j
0 020010101 67.0 0 3
1 020010201 76.4 0 4
2 020010202 70.5 0 4
3 020010203 81.7 0 5
4 020010204 84.8 1 6

In [17]:
print inmi.REDCODE.isin(mapa.REDCODE).sum()*1./inmi.shape[0]
print mapa.REDCODE.isin(inmi.REDCODE).sum()*1./mapa.shape[0]
mapa = mapa.merge(inmi,on='REDCODE')


0.998513895081
1.0

Legalidad

1 Propietario de la vivienda y del terreno
2 Propietario sólo de la vivienda
3 Inquilino 4 Ocupante por préstamo 5 Ocupante por trabajo
6 Otra situación
T Total


In [18]:
legalOrig = dbf2DF('data/redatam/LEGAL.DBF')
legalOrig.head()


Out[18]:
REDCODE PROP_1 PROP_2 PROP_3 PROP_4 PROP_5 PROP_6 PROP_T
0 020010101 16 6 22 2 6 5 57
1 020010201 103 26 20 7 1 4 161
2 020010202 72 35 14 4 1 2 128
3 020010203 63 21 11 6 0 3 104
4 020010204 85 36 5 8 0 29 163

In [19]:
legal = getPercent(dataset = legalOrig,string = ['PROP_1','PROP_3'])
legal.head()


/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:45: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[19]:
REDCODE PROP_p PROP_q PROP_j
0 020010101 66.7 0 3
1 020010201 76.4 1 4
2 020010202 67.2 0 3
3 020010203 71.2 0 3
4 020010204 55.2 0 2

In [20]:
legal.columns = ['REDCODE','TENREG_p','TENREG_q','TENREG_j']

In [21]:
print legal.REDCODE.isin(mapa.REDCODE).sum()*1./legal.shape[0]
print mapa.REDCODE.isin(legal.REDCODE).sum()*1./mapa.shape[0]
mapa = mapa.merge(legal,on='REDCODE')


0.998513895081
1.0

Propietarios


In [22]:
owner = getPercent(dataset = legalOrig,string = ['PROP_1'])
owner.head()


/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:45: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[22]:
REDCODE PROP_p PROP_q PROP_j
0 020010101 28.1 0 1
1 020010201 64.0 2 5
2 020010202 56.3 1 3
3 020010203 60.6 2 4
4 020010204 52.1 1 3

In [23]:
print owner.REDCODE.isin(mapa.REDCODE).sum()*1./owner.shape[0]
print mapa.REDCODE.isin(owner.REDCODE).sum()*1./mapa.shape[0]
mapa = mapa.merge(owner,on='REDCODE')


0.998513895081
1.0

Inquilinos


In [24]:
rent = getPercent(dataset = legalOrig,string = ['PROP_3'])
rent.head()


/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:44: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:45: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[24]:
REDCODE PROP_p PROP_q PROP_j
0 020010101 38.6 9 7
1 020010201 12.4 3 2
2 020010202 10.9 3 2
3 020010203 10.6 3 2
4 020010204 3.1 0 0

In [25]:
rent.columns = ['REDCODE','INQ_p','INQ_q','INQ_j']

In [26]:
print rent.REDCODE.isin(mapa.REDCODE).sum()*1./rent.shape[0]
print mapa.REDCODE.isin(rent.REDCODE).sum()*1./mapa.shape[0]
mapa = mapa.merge(rent,on='REDCODE')


0.998513895081
1.0

Guardo archivo final


In [27]:
mapa.head()


Out[27]:
AREA DEPARTAMEN DEPTO DEPTO_NAME FRAC LINK PERIMETER PROV RADIO RADPAIS_ ... P05_j TENREG_p TENREG_q TENREG_j PROP_p PROP_q PROP_j INQ_p INQ_q INQ_j
0 16948780.0 06252 252 ESCOBAR 01 062520101 26902.490 06 01 26690 ... 6 67.5 0 3 62.5 2 4 5.0 0 0
1 12351870.0 06252 252 ESCOBAR 01 062520102 30469.020 06 02 26706 ... 8 82.4 2 5 74.5 6 7 7.8 1 1
2 11063170.0 06252 252 ESCOBAR 02 062520208 25350.040 06 08 26721 ... 7 75.4 0 4 68.1 4 5 7.2 1 1
3 796720.3 06252 252 ESCOBAR 02 062520201 6843.383 06 01 26743 ... 9 91.3 7 8 82.7 9 8 8.7 2 1
4 2996825.0 06252 252 ESCOBAR 02 062520202 8393.760 06 02 26749 ... 8 94.3 9 9 80.1 8 8 14.2 4 2

5 rows × 38 columns


In [30]:
mapa.to_file('data/shapes/agbaData')

In [ ]: