In [2]:

    
import getEPH
import functionsForModels
import make_dummy
import schoolYears
import categorize
import functionsForModels
import createVariables

import pandas as pd
#http://statsmodels.sourceforge.net/devel/examples/generated/example_wls.html
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
np.random.seed(1024)
%matplotlib inline









    



/home/pipe/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [ ]:

    
getEPH.getEPHdbf('t310')



In [5]:

    
hog = pd.read_csv('data/cleanDataHouseholdt310.csv')



In [4]:

    
hog.columns









    Out[4]:





Index([u'CODUSU', u'NRO_HOGAR', u'REGION', u'PONDERA', u'HomeType',
       u'HomeTypeesp', u'RoomsNumber', u'FloorMaterial', u'FloorMaterialesp',
       u'RoofMaterial', u'RoofCoat', u'Water', u'WaterType', u'WaterTypeesp',
       u'Toilet', u'ToiletLocation', u'ToiletType', u'Sewer', u'DumpSites',
       u'Flooding', u'EmergencyLoc', u'UsableTotalRooms', u'SleepingRooms',
       u'OfficeRooms', u'OnlyWork', u'Kitchen', u'Sink', u'Garage',
       u'Ownership', u'Ownershipesp', u'CookingCombustible',
       u'CookingCombustibleesp', u'BathroomUse', u'Working', u'HouseMembers',
       u'Memberless10', u'Membermore10', u'TotalHouseHoldIncome',
       u'DomesticService1', u'DomesticService2', u'DomesticService3',
       u'DomesticService4', u'DomesticService5', u'DomesticService6',
       u'TotalFamilyIncome', u'TotalFamilyIncomeDec',
       u'TotalFamilyIncomeDecReg', u'PerCapInc', u'PerCapIncDec',
       u'PerCapIncDecReg'],
      dtype='object')



In [6]:

    
print hog.shape
hog.head()









    



(2703, 50)






    Out[6]:






  
    
      
      CODUSU
      NRO_HOGAR
      REGION
      PONDERA
      HomeType
      HomeTypeesp
      RoomsNumber
      FloorMaterial
      FloorMaterialesp
      RoofMaterial
      ...
      DomesticService3
      DomesticService4
      DomesticService5
      DomesticService6
      TotalFamilyIncome
      TotalFamilyIncomeDec
      TotalFamilyIncomeDecReg
      PerCapInc
      PerCapIncDec
      PerCapIncDecReg
    
  
  
    
      0
      302468
      1
      1
      1287
      2
      NaN
      2
      1
      NaN
      9
      ...
      2
      0
      0
      0
      4000.0
      6
      6
      2000.0
      8
      8
    
    
      1
      307861
      1
      1
      1674
      2
      NaN
      2
      1
      NaN
      1
      ...
      98
      0
      0
      0
      5800.0
      8
      8
      1450.0
      6
      6
    
    
      2
      308762
      1
      1
      1522
      2
      NaN
      4
      1
      NaN
      9
      ...
      98
      0
      0
      0
      3200.0
      5
      5
      3200.0
      9
      9
    
    
      3
      308278
      1
      1
      1320
      2
      NaN
      3
      1
      NaN
      9
      ...
      96
      0
      0
      0
      10000.0
      10
      10
      5000.0
      10
      10
    
    
      4
      311937
      1
      1
      1281
      2
      NaN
      4
      1
      NaN
      1
      ...
      96
      0
      0
      0
      11000.0
      10
      10
      2750.0
      9
      9
    
  

5 rows × 50 columns

HomeType IV 1, NO CREO QUE SIRVA Tipo de vivienda ( por observación)

= Casa
= Departamento
= Pieza de inquilinato
= Pieza en hotel / pensión
= Local no construido para habitación
= Otros

FloorMaterial SIRVE SACAR TAL COMO ESTA SACAR 9 SE PUEDE HACER PCA CON LAS DE ABAJO IV3 N(1) Los pisos interiores son principalmente de...

= Mosaico / baldosa / madera / cerámica / alfombra
= Cemento / ladrillo fijo
= Ladrillo suelto / tierra
= Otro

RoofMaterial SIRVE SACAR TAL COMO ESTA SACAR 9 IV4 N(2) La cubierta exterior del techo es de....

= Membrana / cubierta asfáltica
= Baldosa / losa sin cubierta
= Pizarra / teja
= Chapa de metal sin cubierta
= Chapa de fibrocemento / plástico
= Chapa de cartón
= Caña / tabla / paja con barro / paja sola
= N/S. Depto en propiedad horizontal

RoofCoat IV5 N(1) ¿El techo tiene cielorraso / revestimiento interior? 1 = Sí 2 = No

Water IV6 N(1) Tiene agua...

Por cañería dentro de la vivienda
Fuera de la vivienda pero dentro del terreno
Fuera del terreno

WaterType IV7 N(1) El agua es de...

= Red pública (agua corriente)
= Perforación con bomba a motor
= Perforación con bomba manual
= Otra fuente

'Toilet', IV8 N(1) ¿Tiene baño / letrina?

= Sí
= No

'ToiletLocation'

'ToiletType',

'IV11', = 'Sewer', 'IV12_1', = 'DumpSites', 'IV12_2', = 'Flooding', 'IV12_3', = 'EmergencyLoc', 'II1', = 'UsableTotalRooms', 'II2', = 'SleepingRooms', #esta es para hacinamiento 'II3', = 'OfficeRooms', 'II3_1', = 'OnlyWork', 'II4_1', = 'Kitchen', 'II4_2', = 'Sink', 'II4_3', = 'Garage', 'II7', = 'Ownership', #JUNTAR 1 Y 3 EN SITUACION CONTACTUAL ESTABLE 'II7_ESP', = 'Ownershipesp', 'II8', = 'CookingCombustible', 'II8_ESP', = 'CookingCombustibleesp' 'II9', = 'BathroomUse', 'V1', = 'Working',



In [7]:

    
def remove9(df,variables):
    for var in variables:
        df[var].replace(to_replace=[9], value=[np.nan] , inplace=True, axis=None)

def remove0(df,variables):
    for var in variables:
        df[var].replace(to_replace=[0], value=[np.nan] , inplace=True, axis=None)

def remove99(df,variables):
    for var in variables:
        df[var].replace(to_replace=[99], value=[np.nan] , inplace=True, axis=None)



In [8]:

    
hog2 = hog.copy()

Limpieza de valores que estan mal 99, 9 y 0 cuando no corresponde



In [9]:

    
remove9(df = hog2, variables = ['FloorMaterial','RoofMaterial','RoofCoat','Water','WaterType','Toilet','ToiletLocation',
                               'ToiletType','Sewer','DumpSites','Flooding','EmergencyLoc','CookingCombustible',
                               'BathroomUse'])
remove0(df = hog2, variables = ['FloorMaterial','RoofMaterial','RoofCoat','Water','WaterType','Toilet','ToiletLocation',
                               'ToiletType','Sewer','DumpSites','Flooding','EmergencyLoc','Ownership','CookingCombustible',
                               'BathroomUse'])
remove99(df = hog2, variables = ['Ownership'])



In [10]:

    
variables = ['CookingCombustible']
for var in variables:
    print hog2[var].value_counts()
    plt.scatter(hog2[var], hog2.TotalHouseHoldIncome)
    plt.show()









    



1.0    2109
2.0     552
4.0      11
3.0       3
Name: CookingCombustible, dtype: int64

Recategorizacion de variables para que tengan mas sentido



In [11]:

    
hog2['CookingRec'] = np.nan



In [12]:

    
hog2['CookingRec'][hog2.CookingCombustible == 4] = 1
hog2['CookingRec'][hog2.CookingCombustible == 3] = 1
hog2['CookingRec'][hog2.CookingCombustible == 2] = 2
hog2['CookingRec'][hog2.CookingCombustible == 1] = 3









    



/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [13]:

    
variables = ['CookingRec']
for var in variables:
    print hog2[var].value_counts()
    plt.scatter(hog2[var], hog2.TotalHouseHoldIncome)
    plt.show()









    



3.0    2109
2.0     552
1.0      14
Name: CookingRec, dtype: int64



In [14]:

    
#hogFinal = hog2.copy().loc[:,['FloorMaterial','RoofMaterial','RoofCoat','Water','WaterType','Toilet','ToiletLocation',
#                              'ToiletType','Sewer']]

#solamente las que tienen 1. Por cañería dentro de la vivienda
hog2['WaterRec'] = (hog2.Water == 1).astype(int)


hog2['OwnershipRec'] = ((hog2.Ownership == 1) | (hog2.Ownership == 3)).astype(int)
hog2['Hacinamiento'] = hog2.HouseMembers * 1.0 / hog2.SleepingRooms
hog2['id'] =  (hog2.CODUSU.astype(str) + hog2.NRO_HOGAR.astype(str))

#saco los ingresos del hogar
hog2['TotalHouseHoldIncome'].replace(to_replace=[0], value=[1] , inplace=True, axis=None)
hog2['lnHouseIncome'] = np.log(hog2['TotalHouseHoldIncome'])



In [15]:

    
variables = ['Hacinamiento']
for var in variables:
    print hog2[var].value_counts()
    plt.scatter(hog2[var], hog2.TotalHouseHoldIncome)
    plt.show()









    



2.000000     771
1.000000     741
1.500000     385
3.000000     158
1.333333     155
2.500000     148
1.666667      74
4.000000      68
0.500000      30
inf           28
5.000000      28
2.333333      19
1.250000      15
3.500000      14
6.000000      11
2.666667      11
0.666667       7
4.500000       5
7.000000       5
1.750000       3
3.333333       3
2.750000       3
0.333333       2
2.400000       2
9.000000       2
8.000000       2
2.250000       2
3.666667       2
0.250000       1
5.500000       1
1.200000       1
0.600000       1
1.833333       1
1.600000       1
4.666667       1
10.000000      1
0.750000       1
Name: Hacinamiento, dtype: int64



In [16]:

    
sinCuartosParaDormir = (hog2.SleepingRooms == 0)
sum(sinCuartosParaDormir)









    Out[16]:





28



In [17]:

    
hog2.loc[sinCuartosParaDormir,:].head()









    Out[17]:






  
    
      
      CODUSU
      NRO_HOGAR
      REGION
      PONDERA
      HomeType
      HomeTypeesp
      RoomsNumber
      FloorMaterial
      FloorMaterialesp
      RoofMaterial
      ...
      TotalFamilyIncomeDecReg
      PerCapInc
      PerCapIncDec
      PerCapIncDecReg
      CookingRec
      WaterRec
      OwnershipRec
      Hacinamiento
      id
      lnHouseIncome
    
  
  
    
      10
      259897
      2
      1
      1259
      2
      NaN
      5
      1.0
      NaN
      NaN
      ...
      0
      0.0
      0
      0
      NaN
      1
      0
      inf
      2598972
      0.000000
    
    
      11
      285841
      2
      1
      1311
      2
      NaN
      99
      1.0
      NaN
      NaN
      ...
      2
      1800.0
      7
      7
      NaN
      1
      0
      inf
      2858412
      7.495542
    
    
      20
      311871
      2
      1
      1374
      2
      NaN
      3
      1.0
      NaN
      NaN
      ...
      2
      1500.0
      7
      6
      NaN
      1
      0
      inf
      3118712
      7.313220
    
    
      28
      146462
      2
      1
      1457
      1
      NaN
      5
      1.0
      NaN
      2.0
      ...
      4
      2460.0
      9
      8
      NaN
      1
      0
      inf
      1464622
      7.807917
    
    
      34
      301199
      2
      1
      2168
      2
      NaN
      5
      1.0
      NaN
      NaN
      ...
      2
      1600.0
      7
      7
      NaN
      1
      0
      inf
      3011992
      7.377759
    
  

5 rows × 56 columns



In [18]:

    
hog2.head()









    Out[18]:






  
    
      
      CODUSU
      NRO_HOGAR
      REGION
      PONDERA
      HomeType
      HomeTypeesp
      RoomsNumber
      FloorMaterial
      FloorMaterialesp
      RoofMaterial
      ...
      TotalFamilyIncomeDecReg
      PerCapInc
      PerCapIncDec
      PerCapIncDecReg
      CookingRec
      WaterRec
      OwnershipRec
      Hacinamiento
      id
      lnHouseIncome
    
  
  
    
      0
      302468
      1
      1
      1287
      2
      NaN
      2
      1.0
      NaN
      NaN
      ...
      6
      2000.0
      8
      8
      3.0
      1
      1
      2.000000
      3024681
      8.294050
    
    
      1
      307861
      1
      1
      1674
      2
      NaN
      2
      1.0
      NaN
      1.0
      ...
      8
      1450.0
      6
      6
      3.0
      1
      0
      2.000000
      3078611
      8.665613
    
    
      2
      308762
      1
      1
      1522
      2
      NaN
      4
      1.0
      NaN
      NaN
      ...
      5
      3200.0
      9
      9
      3.0
      1
      1
      1.000000
      3087621
      8.070906
    
    
      3
      308278
      1
      1
      1320
      2
      NaN
      3
      1.0
      NaN
      NaN
      ...
      10
      5000.0
      10
      10
      1.0
      1
      1
      2.000000
      3082781
      9.210340
    
    
      4
      311937
      1
      1
      1281
      2
      NaN
      4
      1.0
      NaN
      1.0
      ...
      10
      2750.0
      9
      9
      3.0
      1
      1
      1.333333
      3119371
      9.305651
    
  

5 rows × 56 columns



In [19]:

    
hog2.columns









    Out[19]:





Index([u'CODUSU', u'NRO_HOGAR', u'REGION', u'PONDERA', u'HomeType',
       u'HomeTypeesp', u'RoomsNumber', u'FloorMaterial', u'FloorMaterialesp',
       u'RoofMaterial', u'RoofCoat', u'Water', u'WaterType', u'WaterTypeesp',
       u'Toilet', u'ToiletLocation', u'ToiletType', u'Sewer', u'DumpSites',
       u'Flooding', u'EmergencyLoc', u'UsableTotalRooms', u'SleepingRooms',
       u'OfficeRooms', u'OnlyWork', u'Kitchen', u'Sink', u'Garage',
       u'Ownership', u'Ownershipesp', u'CookingCombustible',
       u'CookingCombustibleesp', u'BathroomUse', u'Working', u'HouseMembers',
       u'Memberless10', u'Membermore10', u'TotalHouseHoldIncome',
       u'DomesticService1', u'DomesticService2', u'DomesticService3',
       u'DomesticService4', u'DomesticService5', u'DomesticService6',
       u'TotalFamilyIncome', u'TotalFamilyIncomeDec',
       u'TotalFamilyIncomeDecReg', u'PerCapInc', u'PerCapIncDec',
       u'PerCapIncDecReg', u'CookingRec', u'WaterRec', u'OwnershipRec',
       u'Hacinamiento', u'id', u'lnHouseIncome'],
      dtype='object')



In [20]:

    
sns.kdeplot(hog2.lnHouseIncome)









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f26a65f1b50>



In [21]:

    
hogReducida = hog2.copy().drop(['CODUSU','NRO_HOGAR','REGION','HomeTypeesp','FloorMaterialesp',
                              'WaterTypeesp','Ownershipesp','CookingCombustibleesp','DomesticService1', 
                               'DomesticService2', 'DomesticService3','DomesticService4', 'DomesticService5',
                               'DomesticService6'],axis = 1)



In [22]:

    
hogReducida.head()









    Out[22]:






  
    
      
      PONDERA
      HomeType
      RoomsNumber
      FloorMaterial
      RoofMaterial
      RoofCoat
      Water
      WaterType
      Toilet
      ToiletLocation
      ...
      TotalFamilyIncomeDecReg
      PerCapInc
      PerCapIncDec
      PerCapIncDecReg
      CookingRec
      WaterRec
      OwnershipRec
      Hacinamiento
      id
      lnHouseIncome
    
  
  
    
      0
      1287
      2
      2
      1.0
      NaN
      1.0
      1.0
      1
      1
      1.0
      ...
      6
      2000.0
      8
      8
      3.0
      1
      1
      2.000000
      3024681
      8.294050
    
    
      1
      1674
      2
      2
      1.0
      1.0
      1.0
      1.0
      1
      1
      1.0
      ...
      8
      1450.0
      6
      6
      3.0
      1
      0
      2.000000
      3078611
      8.665613
    
    
      2
      1522
      2
      4
      1.0
      NaN
      1.0
      1.0
      1
      1
      1.0
      ...
      5
      3200.0
      9
      9
      3.0
      1
      1
      1.000000
      3087621
      8.070906
    
    
      3
      1320
      2
      3
      1.0
      NaN
      1.0
      1.0
      1
      1
      1.0
      ...
      10
      5000.0
      10
      10
      1.0
      1
      1
      2.000000
      3082781
      9.210340
    
    
      4
      1281
      2
      4
      1.0
      1.0
      1.0
      1.0
      1
      1
      1.0
      ...
      10
      2750.0
      9
      9
      3.0
      1
      1
      1.333333
      3119371
      9.305651
    
  

5 rows × 42 columns

Indivitual Data



In [23]:

    
#para leer en otro archivo

ind = pd.read_csv('data/pivotInd.csv')

ind['id'] = ind['id'].astype(str)



In [24]:

    
ind.head()









    Out[24]:






  
    
      
      schoolAndJob
      noJob
      job
      id
      AGLO1
      AGLO2
      headAge
      spouseAge
      headAge2
      spouseAge2
      ...
      headDECCFR
      spouseDECCFR
      headDECIFR
      spouseDECIFR
      headMaritalStatus
      spouseMaritalStatus
      headReading
      spouseReading
      headPlaceOfBirth
      spouseplaceOfBirth
    
  
  
    
      0
      40.0
      1
      3
      1250971
      33.0
      NaN
      57.0
      NaN
      3249.0
      NaN
      ...
      8.0
      NaN
      9.0
      NaN
      4.0
      NaN
      1.0
      NaN
      1.0
      NaN
    
    
      1
      12.0
      1
      1
      1253451
      32.0
      NaN
      66.0
      NaN
      4356.0
      NaN
      ...
      8.0
      NaN
      6.0
      NaN
      3.0
      NaN
      1.0
      NaN
      1.0
      NaN
    
    
      2
      0.0
      1
      0
      1254461
      32.0
      NaN
      79.0
      NaN
      6241.0
      NaN
      ...
      7.0
      NaN
      2.0
      NaN
      4.0
      NaN
      1.0
      NaN
      1.0
      NaN
    
    
      3
      14.0
      2
      2
      1256691
      33.0
      NaN
      61.0
      NaN
      3721.0
      NaN
      ...
      8.0
      NaN
      9.0
      NaN
      4.0
      NaN
      1.0
      NaN
      1.0
      NaN
    
    
      4
      24.0
      1
      2
      1256892
      32.0
      32.0
      31.0
      23.0
      961.0
      529.0
      ...
      5.0
      5.0
      5.0
      5.0
      1.0
      1.0
      1.0
      1.0
      4.0
      4.0
    
  

5 rows × 44 columns



In [27]:

    
ind.drop(['AGLO2'],axis =1,inplace=True)



In [28]:

    
#sumo los valores predecidos del modelo
ind['sumPredicted'] = ind.headPredictedLnIncome + ind.spousePredictedLnIncome

Merge de datos



In [29]:

    
#sacar los hogares sin cuartos para dormir
hogReducida = hogReducida.copy().loc[~sinCuartosParaDormir,:]



In [30]:

    
#chequeo
print 'filas hog:',hogReducida.shape[0]
print 'filas ind:',ind.shape[0]
print 'cantidad de ind en hog:', sum(ind['id'].sort_values().isin(hogReducida['id'].sort_values()))









    



filas hog: 2675
filas ind: 2703
cantidad de ind en hog: 2675



In [31]:

    
dataUnida = pd.merge(left=hogReducida, right=ind, on='id',how='left')



In [32]:

    
dataUnida.to_csv('data/dataFinalParaModelo.csv',index=False)



In [33]:

    
print dataUnida.shape
dataUnida.columns









    



(2675, 85)






    Out[33]:





Index([u'PONDERA', u'HomeType', u'RoomsNumber', u'FloorMaterial',
       u'RoofMaterial', u'RoofCoat', u'Water', u'WaterType', u'Toilet',
       u'ToiletLocation', u'ToiletType', u'Sewer', u'DumpSites', u'Flooding',
       u'EmergencyLoc', u'UsableTotalRooms', u'SleepingRooms', u'OfficeRooms',
       u'OnlyWork', u'Kitchen', u'Sink', u'Garage', u'Ownership',
       u'CookingCombustible', u'BathroomUse', u'Working', u'HouseMembers',
       u'Memberless10', u'Membermore10', u'TotalHouseHoldIncome',
       u'TotalFamilyIncome', u'TotalFamilyIncomeDec',
       u'TotalFamilyIncomeDecReg', u'PerCapInc', u'PerCapIncDec',
       u'PerCapIncDecReg', u'CookingRec', u'WaterRec', u'OwnershipRec',
       u'Hacinamiento', u'id', u'lnHouseIncome', u'schoolAndJob', u'noJob',
       u'job', u'AGLO1', u'headAge', u'spouseAge', u'headAge2', u'spouseAge2',
       u'headFemale', u'spouseFemale', u'headEduc', u'spouseEduc',
       u'headEduc2', u'spouseEduc2', u'headPrimary', u'spousePrimary',
       u'headSecondary', u'spouseSecondary', u'headUniversity',
       u'spouseUniversity', u'headP21', u'spouseP21', u'headP47T',
       u'spouseP47T', u'headLnIncome', u'spouseLnIncome', u'headLnIncomeT',
       u'spouseLnIncomeT', u'headPredictedLnIncome',
       u'spousePredictedLnIncome', u'headJob', u'spouseJob', u'headDECCFR',
       u'spouseDECCFR', u'headDECIFR', u'spouseDECIFR', u'headMaritalStatus',
       u'spouseMaritalStatus', u'headReading', u'spouseReading',
       u'headPlaceOfBirth', u'spouseplaceOfBirth', u'sumPredicted'],
      dtype='object')

Filtrar Outliers



In [34]:

    
dataUnida.head()









    Out[34]:






  
    
      
      PONDERA
      HomeType
      RoomsNumber
      FloorMaterial
      RoofMaterial
      RoofCoat
      Water
      WaterType
      Toilet
      ToiletLocation
      ...
      spouseDECCFR
      headDECIFR
      spouseDECIFR
      headMaritalStatus
      spouseMaritalStatus
      headReading
      spouseReading
      headPlaceOfBirth
      spouseplaceOfBirth
      sumPredicted
    
  
  
    
      0
      1287
      2
      2
      1.0
      NaN
      1.0
      1.0
      1
      1
      1.0
      ...
      NaN
      6.0
      NaN
      5.0
      NaN
      1.0
      NaN
      1.0
      NaN
      NaN
    
    
      1
      1674
      2
      2
      1.0
      1.0
      1.0
      1.0
      1
      1
      1.0
      ...
      6.0
      8.0
      8.0
      2.0
      2.0
      1.0
      1.0
      2.0
      2.0
      15.469188
    
    
      2
      1522
      2
      4
      1.0
      NaN
      1.0
      1.0
      1
      1
      1.0
      ...
      NaN
      5.0
      NaN
      4.0
      NaN
      1.0
      NaN
      3.0
      NaN
      NaN
    
    
      3
      1320
      2
      3
      1.0
      NaN
      1.0
      1.0
      1
      1
      1.0
      ...
      10.0
      10.0
      10.0
      2.0
      2.0
      1.0
      1.0
      1.0
      1.0
      16.235857
    
    
      4
      1281
      2
      4
      1.0
      1.0
      1.0
      1.0
      1
      1
      1.0
      ...
      9.0
      10.0
      10.0
      2.0
      2.0
      1.0
      1.0
      1.0
      3.0
      8.336136
    
  

5 rows × 85 columns



In [35]:

    
plt.boxplot([dataUnida.lnHouseIncome])









    Out[35]:





{'boxes': [<matplotlib.lines.Line2D at 0x7f26a64f03d0>],
 'caps': [<matplotlib.lines.Line2D at 0x7f26a63ff050>,
  <matplotlib.lines.Line2D at 0x7f26a63f1710>],
 'fliers': [<matplotlib.lines.Line2D at 0x7f26a6419610>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0x7f26a6423f50>],
 'whiskers': [<matplotlib.lines.Line2D at 0x7f26a63ffe90>,
  <matplotlib.lines.Line2D at 0x7f26a63ffe10>]}



In [ ]:

Probando Modelos



In [ ]:

    
def runModel(dataset, income , variables):
    
    '''
    This function takes a data set, runs a model according to specifications,
    and returns the model, printing the summary
    '''
    selectedVariables = ['PONDERA',income] + variables
    dataToRun = dataset.loc[:,selectedVariables]
    y = dataToRun.copy().iloc[:,1].values
    X = sm.add_constant(dataToRun.copy().iloc[:,2:].values)
    w = dataToRun.copy().iloc[:,0].values
    lm = sm.WLS(y, X, weights=1. / w).fit()
    print lm.summary()
    for i in range(1,len(variables)+1):
        print 'x%d: %s' % (i,variables[i-1])
        
        
        
    y = dataUnida[ingreso].values)

    X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
    X = sm.add_constant(X)

    w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)
    lm = sm.WLS(y, X, weights=1. / w).fit()
    print lm.summary()
    for i in range(1,len(variables)+1):
        print 'x%d: %s' % (i,variables[i-1])
    
    #testing within sample
    R_IS=[]
    R_OS=[]
    nCross=1000
    
    for i in range(nCross):
        X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.33)
        lm = sm.WLS(y_train, X_train, weights=1. / w_train, hasconst=False).fit()        
        R_IS.append(1-((np.asarray(lm.predict(exog = X_train))-y_train)**2).sum()/((y_train-np.mean(y_train))**2).sum())                                                                     
        R_OS.append(1-((np.asarray(lm.predict(exog = X_test))-y_test)**2).sum()/((y_test-np.mean(y_test))**2).sum())
    print("IS R-squared for {} times is {}".format(nCross,np.mean(R_IS)))
    print("OS R-squared for {} times is {}".format(nCross,np.mean(R_OS)))

    return lm



In [ ]:

    
ingreso = 'lnHouseIncome'
variablesParaModelo = [u'RoomsNumber', u'FloorMaterial', u'RoofMaterial',
       u'RoofCoat', u'Water', u'WaterType', u'Toilet', u'ToiletLocation',
       u'ToiletType', u'Sewer', u'DumpSites', u'Flooding', u'EmergencyLoc',
       u'UsableTotalRooms', u'SleepingRooms', u'OfficeRooms', u'OnlyWork',
       u'Kitchen', u'Sink', u'Garage', u'Ownership', u'CookingCombustible',
       u'BathroomUse', u'Working', u'HouseMembers', u'Memberless10',
       u'Membermore10', u'WaterRec', u'OwnershipRec',
       u'Hacinamiento', u'headAge', u'spouseAge',
       u'headAge2', u'spouseAge2', u'headFemale', u'spouseFemale', u'headEduc',
       u'spouseEduc', u'headEduc2', u'spouseEduc2', u'headPrimary',
       u'spousePrimary', u'headSecondary', u'spouseSecondary',
       u'headUniversity', u'spouseUniversity']



y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)

X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)

w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)
lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variables)+1):
    print 'x%d: %s' % (i,variables[i-1])



In [ ]:



In [ ]:

PCA



In [ ]:

    
from sklearn.decomposition import PCA

variablesParaModelo = ['FloorMaterial', 'RoofMaterial',
       'RoofCoat', 'Water', 'WaterType', 'Toilet', 'ToiletLocation',
       'ToiletType', 'Sewer','CookingCombustible',
       'BathroomUse','OwnershipRec']

X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)

n = 12
pca = PCA(n)
Xproj = pca.fit_transform(X)


eigenvalues = pca.explained_variance_
fig = plt.figure(figsize=(16,4))
ax = fig.add_subplot(1,3,1)

ax.bar(np.arange(n), eigenvalues);
ax.set_xlabel("Dimensionality")
ax.set_ylabel("Variance")


ratio = pca.explained_variance_ratio_
ax2 = fig.add_subplot(1,3,2)
ax2.bar(np.arange(n), ratio);
ax2.set_xlabel("Dimensionality")
ax2.set_ylabel("%of variance")

sumRatio = np.cumsum(pca.explained_variance_ratio_)
ax3 = fig.add_subplot(1,3,3)
ax3.bar(np.arange(n), sumRatio);
ax3.set_xlabel("Dimensionality")
ax3.set_ylabel("%Cumulative sum of variance")



In [ ]:

    
plt.scatter(Xproj[:, 0], Xproj[:, 1])



In [ ]:

    
dataUnida['HousePCA1'] = Xproj[:, 0]
dataUnida['HousePCA2'] = Xproj[:, 1]



In [ ]:

MODELO 2 - PCA para la vivienda y variables individuales del jefe y conyuge separdas



In [ ]:

    
ingreso = 'lnHouseIncome'
variablesParaModelo = ['HousePCA1', 'HousePCA2','Hacinamiento',
    'headAge', 'spouseAge',
       'headAge2', 'spouseAge2', 'headFemale', 'spouseFemale', 'headEduc',
       'spouseEduc', 'headEduc2', 'spouseEduc2'
                      ]



y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)

X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)

w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)

lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variablesParaModelo)+1):
    print 'x%d: %s' % (i,variablesParaModelo[i-1])

MODELO 3 - PCA para la vivienda y variables individuales del jefe y conyuge predecidas



In [ ]:

    
ingreso = 'lnHouseIncome'
variablesParaModelo = ['HousePCA1', 'HousePCA2','Hacinamiento','headPredictedLnIncome','spousePredictedLnIncome']



y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)

X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)

w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)

lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variablesParaModelo)+1):
    print 'x%d: %s' % (i,variablesParaModelo[i-1])

Modelo 4 - PCA para la vivienda y variables individuales del jefe y conyuge sumadas



In [ ]:

    
ingreso = 'lnHouseIncome'
variablesParaModelo = ['HousePCA1', 'HousePCA2','sumPredicted']


y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)

X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)

w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)

lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variablesParaModelo)+1):
    print 'x%d: %s' % (i,variablesParaModelo[i-1])

Modelo 5 - solo con jefe



In [ ]:

    
ingreso = 'lnHouseIncome'
variablesParaModelo = ['HousePCA1', 'HousePCA2','headAge','headEduc']


y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)

X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)

w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)

lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variablesParaModelo)+1):
    print 'x%d: %s' % (i,variablesParaModelo[i-1])

pROBANDO COSAS



In [ ]:

    
ingreso = 'lnHouseIncome'
variablesParaModelo = ['HousePCA1', 'HousePCA2','Hacinamiento',
    'headAge', 'spouseAge',
       'headAge2', 'spouseAge2', 'headFemale', 'spouseFemale', 'headEduc',
       'spouseEduc', 'headEduc2', 'spouseEduc2'
                      ]



y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)

X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)

w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)

lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variablesParaModelo)+1):
    print 'x%d: %s' % (i,variablesParaModelo[i-1])

	CODUSU	NRO_HOGAR	REGION	PONDERA	HomeType	HomeTypeesp	RoomsNumber	FloorMaterial	FloorMaterialesp	RoofMaterial	...	DomesticService3	TotalFamilyIncome	TotalFamilyIncomeDec	TotalFamilyIncomeDecReg	PerCapInc	PerCapIncDec	PerCapIncDecReg
0	302468	1	1	1287	2	NaN	2	1	NaN	9	...	2	4000.0	6	6	2000.0	8	8
1	307861	1	1	1674	2	NaN	2	1	NaN	1	...	98	5800.0	8	8	1450.0	6	6
2	308762	1	1	1522	2	NaN	4	1	NaN	9	...	98	3200.0	5	5	3200.0	9	9
3	308278	1	1	1320	2	NaN	3	1	NaN	9	...	96	10000.0	10	10	5000.0	10	10
4	311937	1	1	1281	2	NaN	4	1	NaN	1	...	96	11000.0	10	10	2750.0	9	9

	CODUSU	NRO_HOGAR	REGION	PONDERA	HomeType	HomeTypeesp	RoomsNumber	FloorMaterial	FloorMaterialesp	RoofMaterial	...	TotalFamilyIncomeDecReg	PerCapInc	PerCapIncDec	PerCapIncDecReg	CookingRec	WaterRec	Hacinamiento	id	lnHouseIncome
10	259897	2	1	1259	2	NaN	5	1.0	NaN	NaN	...	0	0.0	0	0	NaN	1	inf	2598972	0.000000
11	285841	2	1	1311	2	NaN	99	1.0	NaN	NaN	...	2	1800.0	7	7	NaN	1	inf	2858412	7.495542
20	311871	2	1	1374	2	NaN	3	1.0	NaN	NaN	...	2	1500.0	7	6	NaN	1	inf	3118712	7.313220
28	146462	2	1	1457	1	NaN	5	1.0	NaN	2.0	...	4	2460.0	9	8	NaN	1	inf	1464622	7.807917
34	301199	2	1	2168	2	NaN	5	1.0	NaN	NaN	...	2	1600.0	7	7	NaN	1	inf	3011992	7.377759

	PONDERA	HomeType	RoomsNumber	FloorMaterial	RoofMaterial	RoofCoat	Water	WaterType	Toilet	ToiletLocation	...	TotalFamilyIncomeDecReg	PerCapInc	PerCapIncDec	PerCapIncDecReg	CookingRec	WaterRec	OwnershipRec	Hacinamiento	id	lnHouseIncome
0	1287	2	2	1.0	NaN	1.0	1.0	1	1	1.0	...	6	2000.0	8	8	3.0	1	1	2.000000	3024681	8.294050
1	1674	2	2	1.0	1.0	1.0	1.0	1	1	1.0	...	8	1450.0	6	6	3.0	1	0	2.000000	3078611	8.665613
2	1522	2	4	1.0	NaN	1.0	1.0	1	1	1.0	...	5	3200.0	9	9	3.0	1	1	1.000000	3087621	8.070906
3	1320	2	3	1.0	NaN	1.0	1.0	1	1	1.0	...	10	5000.0	10	10	1.0	1	1	2.000000	3082781	9.210340
4	1281	2	4	1.0	1.0	1.0	1.0	1	1	1.0	...	10	2750.0	9	9	3.0	1	1	1.333333	3119371	9.305651

	schoolAndJob	noJob	job	id	AGLO1	AGLO2	headAge	spouseAge	headAge2	spouseAge2	...	headDECCFR	spouseDECCFR	headDECIFR	spouseDECIFR	headMaritalStatus	spouseMaritalStatus	headReading	spouseReading	headPlaceOfBirth	spouseplaceOfBirth
0	40.0	1	3	1250971	33.0	NaN	57.0	NaN	3249.0	NaN	...	8.0	NaN	9.0	NaN	4.0	NaN	1.0	NaN	1.0	NaN
1	12.0	1	1	1253451	32.0	NaN	66.0	NaN	4356.0	NaN	...	8.0	NaN	6.0	NaN	3.0	NaN	1.0	NaN	1.0	NaN
2	0.0	1	0	1254461	32.0	NaN	79.0	NaN	6241.0	NaN	...	7.0	NaN	2.0	NaN	4.0	NaN	1.0	NaN	1.0	NaN
3	14.0	2	2	1256691	33.0	NaN	61.0	NaN	3721.0	NaN	...	8.0	NaN	9.0	NaN	4.0	NaN	1.0	NaN	1.0	NaN
4	24.0	1	2	1256892	32.0	32.0	31.0	23.0	961.0	529.0	...	5.0	5.0	5.0	5.0	1.0	1.0	1.0	1.0	4.0	4.0

	PONDERA	HomeType	RoomsNumber	FloorMaterial	RoofMaterial	RoofCoat	Water	WaterType	Toilet	ToiletLocation	...	spouseDECCFR	headDECIFR	spouseDECIFR	headMaritalStatus	spouseMaritalStatus	headReading	spouseReading	headPlaceOfBirth	spouseplaceOfBirth	sumPredicted
0	1287	2	2	1.0	NaN	1.0	1.0	1	1	1.0	...	NaN	6.0	NaN	5.0	NaN	1.0	NaN	1.0	NaN	NaN
1	1674	2	2	1.0	1.0	1.0	1.0	1	1	1.0	...	6.0	8.0	8.0	2.0	2.0	1.0	1.0	2.0	2.0	15.469188
2	1522	2	4	1.0	NaN	1.0	1.0	1	1	1.0	...	NaN	5.0	NaN	4.0	NaN	1.0	NaN	3.0	NaN	NaN
3	1320	2	3	1.0	NaN	1.0	1.0	1	1	1.0	...	10.0	10.0	10.0	2.0	2.0	1.0	1.0	1.0	1.0	16.235857
4	1281	2	4	1.0	1.0	1.0	1.0	1	1	1.0	...	9.0	10.0	10.0	2.0	2.0	1.0	1.0	1.0	3.0	8.336136