In [1]:
import getEPH
import functionsForModels
import make_dummy
import schoolYears
import categorize
import functionsForModels
import createVariables

import pandas as pd
#http://statsmodels.sourceforge.net/devel/examples/generated/example_wls.html
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
np.random.seed(1024)
%matplotlib inline


/home/pipe/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
getEPH.getEPHdbf('t310')


('Downloading', 't310')
file in place, creating CSV file
csv file cleanDataHousehold t310 .csv successfully created in folder data/
csv file cleanData t310 .csv successfully created in folder data/

In [2]:
# Load the individual-level cleaned dataset produced by getEPH above.
data1 = pd.read_csv('data/cleanDatat310.csv')
data1.columns


Out[2]:
Index([u'CODUSU', u'NRO_HOGAR', u'COMPONENTE', u'AGLOMERADO', u'PONDERA',
       u'familyRelation', u'female', u'age', u'schoolLevel', u'finishedYear',
       u'lastYear', u'activity', u'empCond', u'unempCond', u'ITF', u'IPCF',
       u'P47T', u'P21', u'DECCFR', u'DECIFR', u'maritalStatus', u'reading',
       u'placeOfBirth'],
      dtype='object')

In [3]:
data2 = categorize.categorize(data1)

In [4]:
# Add years-of-schooling variables, then derived model variables
# (primary/secondary/university, education, age2, id, lnIncome, ... — see below).
data3 = schoolYears.schoolYears(data2)
data4 = createVariables.createVariables(data3)

In [5]:
data4.columns


Out[5]:
Index([u'CODUSU', u'NRO_HOGAR', u'COMPONENTE', u'AGLOMERADO', u'PONDERA',
       u'familyRelation', u'female', u'age', u'schoolLevel', u'finishedYear',
       u'lastYear', u'activity', u'empCond', u'unempCond', u'ITF', u'IPCF',
       u'P47T', u'P21', u'DECCFR', u'DECIFR', u'maritalStatus', u'reading',
       u'placeOfBirth', u'primary', u'secondary', u'university', u'education',
       u'education2', u'age2', u'id', u'lnIncome', u'lnIncomeT'],
      dtype='object')

Modelo para predecir el ingreso individual


In [6]:
# Modeling subsample: employed individuals (activity == 1) reporting
# labor income above 1.
jobsAndIncome = (data4['activity'] == 1) & (data4['P21'] > 1)
# Household head (1) or spouse (2) — used further below for the pivot.
headAndSpouse = data4['familyRelation'].isin([1, 2])
dataParaModelo = data4.loc[jobsAndIncome].copy()

In [7]:
# Regressors for the individual income model (Mincer-style specification).
variablesOfInterest = ['age', 'age2', 'female', 'education', 'education2']

In [8]:
model = functionsForModels.runModel(dataset = dataParaModelo, income = 'lnIncome', variables = variablesOfInterest)


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.287
Model:                            WLS   Adj. R-squared:                  0.286
Method:                 Least Squares   F-statistic:                     289.9
Date:                Wed, 07 Dec 2016   Prob (F-statistic):          2.70e-261
Time:                        13:55:52   Log-Likelihood:                -4091.5
No. Observations:                3608   AIC:                             8195.
Df Residuals:                    3602   BIC:                             8232.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.5938      0.138     40.537      0.000         5.323     5.864
x1             0.0659      0.005     12.328      0.000         0.055     0.076
x2            -0.0007   6.16e-05    -11.488      0.000        -0.001    -0.001
x3            -0.5528      0.025    -22.012      0.000        -0.602    -0.504
x4             0.0277      0.016      1.779      0.075        -0.003     0.058
x5             0.0032      0.001      4.525      0.000         0.002     0.005
==============================================================================
Omnibus:                      641.219   Durbin-Watson:                   1.858
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1458.019
Skew:                          -1.012   Prob(JB):                         0.00
Kurtosis:                       5.367   Cond. No.                     2.45e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.45e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
x1: age
x2: age2
x3: female
x4: education
x5: education2
IS R-squared for 1000 times is 0.298124864616
OS R-squared for 1000 times is 0.293081804822

In [9]:
X = sm.add_constant(dataParaModelo.copy().loc[:,variablesOfInterest].values)

In [10]:
dataParaModelo['predictedLnIncome'] = model.predict(X)

In [11]:
#dataParaModelo.loc[:,['id','predictedLnIncome']]

Merge con la base individual


In [12]:
paraMerge = dataParaModelo.loc[:,['CODUSU', 'NRO_HOGAR', 'COMPONENTE','predictedLnIncome']]
print paraMerge.shape


(3608, 4)

In [13]:
paraMerge.head()


Out[13]:
CODUSU NRO_HOGAR COMPONENTE predictedLnIncome
2 307861 1 1 7.478150
3 307861 1 2 7.991038
7 308278 1 1 8.485189
8 308278 1 2 7.750667
9 311937 1 1 8.336136

In [14]:
data4.shape


Out[14]:
(8360, 32)

In [15]:
data = pd.merge(left = data4 , right = paraMerge, on = ['CODUSU', 'NRO_HOGAR', 'COMPONENTE'], how = 'left')

In [16]:
data.shape


Out[16]:
(8360, 33)

In [17]:
data.predictedLnIncome[data.predictedLnIncome.isnull()] = 0


/home/pipe/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

Crear nuevas variables individuales


In [18]:
# Employment indicator, its complement, and the years-of-schooling
# "put to work" interaction (education counted only for the employed).
data['job'] = data['activity'].eq(1).astype(int)
data['noJob'] = data['activity'].ne(1).astype(int)
data['schoolAndJob'] = data['job'] * data['education']

In [19]:
# Three new columns added (job, noJob, schoolAndJob).
print data.shape
data.head()


(8360, 36)
Out[19]:
CODUSU NRO_HOGAR COMPONENTE AGLOMERADO PONDERA familyRelation female age schoolLevel finishedYear ... education education2 age2 id lnIncome lnIncomeT predictedLnIncome job noJob schoolAndJob
0 302468 1 1 32 1287 1 1 20 7.0 2 ... 13.0 169.0 400 3024681 0.000000 7.600902 0.000000 0 1 0.0
1 302468 1 2 32 1287 10 1 20 6.0 2 ... 13.0 169.0 400 3024681 0.000000 7.600902 0.000000 0 1 0.0
2 307861 1 1 32 1674 1 0 42 2.0 1 ... 7.0 49.0 1764 3078611 8.006368 8.006368 7.478150 1 0 7.0
3 307861 1 2 32 1674 2 1 44 7.0 1 ... 17.0 289.0 1936 3078611 7.937375 7.937375 7.991038 1 0 17.0
4 307861 1 3 32 1674 3 0 13 4.0 2 ... 7.0 49.0 169 3078611 0.000000 0.000000 0.000000 0 1 0.0

5 rows × 36 columns


In [20]:
# Per-household (id) aggregates:
# - number of employed members
# - number of non-employed members
# - total years of schooling among the employed
grouped = data.groupby('id')
cantidadActivos = grouped['job'].sum()
cantidadInactivos = grouped['noJob'].sum()
schoolAndJob = grouped['schoolAndJob'].sum()

In [21]:
dfJobsAndEduc = pd.merge(left = schoolAndJob.to_frame() ,
                         right = cantidadInactivos.to_frame(),
                         left_index = True,
                         right_index = True)

dfJobsAndEduc = pd.merge(left = dfJobsAndEduc ,
                         right = cantidadActivos.to_frame(),
                         left_index = True,
                         right_index = True)


print dfJobsAndEduc.shape
dfJobsAndEduc.head()


(2703, 3)
Out[21]:
schoolAndJob noJob job
id
1250971 40.0 1 3
1253451 12.0 1 1
1254461 0.0 1 0
1256691 14.0 2 2
1256892 24.0 1 2

Crear el dataset pivoteado para hacer merge con la base de hogares


In [24]:
# Head/spouse rows only, keeping the columns needed for the household pivot.
pivotColumns = ['id',
                'AGLOMERADO',
                'familyRelation',
                'age',
                'age2',
                'female',
                'education',
                'education2',
                'primary',
                'secondary',
                'university',
                'P21',
                'P47T',
                'lnIncome',
                u'lnIncomeT',
                'predictedLnIncome',
                'job',
                'DECCFR',
                'DECIFR',
                'maritalStatus',
                'reading',
                'placeOfBirth']
cleanData = data.loc[headAndSpouse, pivotColumns].copy()

In [25]:
cleanData.head()


Out[25]:
id AGLOMERADO familyRelation age age2 female education education2 primary secondary ... P47T lnIncome lnIncomeT predictedLnIncome job DECCFR DECIFR maritalStatus reading placeOfBirth
0 3024681 32 1 20 400 1 13.0 169.0 7.0 5.0 ... 2000 0.000000 7.600902 0.000000 0 8 6 5 1 1
2 3078611 32 1 42 1764 0 7.0 49.0 7.0 0.0 ... 3000 8.006368 8.006368 7.478150 1 6 8 2 1 2
3 3078611 32 2 44 1936 1 17.0 289.0 7.0 5.0 ... 2800 7.937375 7.937375 7.991038 1 6 8 2 1 2
6 3087621 32 1 68 4624 1 17.0 289.0 7.0 5.0 ... 3200 0.000000 8.070906 0.000000 0 9 5 4 1 3
7 3082781 32 1 38 1444 0 17.0 289.0 7.0 5.0 ... 6000 8.517193 8.699515 8.485189 1 10 10 2 1 1

5 rows × 22 columns


In [26]:
# Reshape to one row per household: columns become (variable, familyRelation)
# pairs, i.e. head (1) and spouse (2) values side by side. Spouse columns are
# NaN for single-headed households.
pivot = cleanData.pivot(index='id', columns='familyRelation')
pivot.head()


Out[26]:
AGLOMERADO age age2 female education ... DECCFR DECIFR maritalStatus reading placeOfBirth
familyRelation 1 2 1 2 1 2 1 2 1 2 ... 1 2 1 2 1 2 1 2 1 2
id
1250971 33.0 NaN 57.0 NaN 3249.0 NaN 1.0 NaN 14.0 NaN ... 8.0 NaN 9.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
1253451 32.0 NaN 66.0 NaN 4356.0 NaN 1.0 NaN 7.0 NaN ... 8.0 NaN 6.0 NaN 3.0 NaN 1.0 NaN 1.0 NaN
1254461 32.0 NaN 79.0 NaN 6241.0 NaN 1.0 NaN 12.0 NaN ... 7.0 NaN 2.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
1256691 33.0 NaN 61.0 NaN 3721.0 NaN 0.0 NaN 7.0 NaN ... 8.0 NaN 9.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
1256892 32.0 32.0 31.0 23.0 961.0 529.0 0.0 1.0 12.0 12.0 ... 5.0 5.0 5.0 5.0 1.0 1.0 1.0 1.0 4.0 4.0

5 rows × 40 columns


In [27]:
# 20 variables x 2 family relations = 40 MultiIndex columns.
print pivot.shape
pivot.columns


(2703, 40)
Out[27]:
MultiIndex(levels=[[u'AGLOMERADO', u'age', u'age2', u'female', u'education', u'education2', u'primary', u'secondary', u'university', u'P21', u'P47T', u'lnIncome', u'lnIncomeT', u'predictedLnIncome', u'job', u'DECCFR', u'DECIFR', u'maritalStatus', u'reading', u'placeOfBirth'], [1, 2]],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]],
           names=[None, u'familyRelation'])

In [28]:
pivot.to_csv('data/pivotInd.csv')

In [29]:
# To be read from another file: re-read the pivoted CSV with flat, readable
# column names (head*/spouse* pairs matching the MultiIndex order above).
# NOTE(review): skiprows=3 must stay in sync with the header rows that
# pivot.to_csv writes (two MultiIndex header rows plus the index-name row)
# and the `names` list must match the pivot's column order — verify if the
# pivoted column set ever changes.
dataN = pd.read_csv('data/pivotInd.csv', names = ['id','AGLO1','AGLO2','headAge','spouseAge','headAge2','spouseAge2',
                                                  'headFemale','spouseFemale','headEduc','spouseEduc',
                                                  'headEduc2','spouseEduc2','headPrimary','spousePrimary',
                                                  'headSecondary','spouseSecondary','headUniversity','spouseUniversity',
                                                  'headP21','spouseP21','headP47T','spouseP47T',
                                                  'headLnIncome','spouseLnIncome','headLnIncomeT','spouseLnIncomeT',
                                                  'headPredictedLnIncome','spousePredictedLnIncome','headJob','spouseJob',
                                                  'headDECCFR','spouseDECCFR','headDECIFR','spouseDECIFR',
                                                  'headMaritalStatus','spouseMaritalStatus',
                                                  'headReading','spouseReading','headPlaceOfBirth','spouseplaceOfBirth',
                                                 ],skiprows = 3)
dataN.head()


Out[29]:
id AGLO1 AGLO2 headAge spouseAge headAge2 spouseAge2 headFemale spouseFemale headEduc ... headDECCFR spouseDECCFR headDECIFR spouseDECIFR headMaritalStatus spouseMaritalStatus headReading spouseReading headPlaceOfBirth spouseplaceOfBirth
0 1250971 33.0 NaN 57.0 NaN 3249.0 NaN 1.0 NaN 14.0 ... 8.0 NaN 9.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
1 1253451 32.0 NaN 66.0 NaN 4356.0 NaN 1.0 NaN 7.0 ... 8.0 NaN 6.0 NaN 3.0 NaN 1.0 NaN 1.0 NaN
2 1254461 32.0 NaN 79.0 NaN 6241.0 NaN 1.0 NaN 12.0 ... 7.0 NaN 2.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
3 1256691 33.0 NaN 61.0 NaN 3721.0 NaN 0.0 NaN 7.0 ... 8.0 NaN 9.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
4 1256892 32.0 32.0 31.0 23.0 961.0 529.0 0.0 1.0 12.0 ... 5.0 5.0 5.0 5.0 1.0 1.0 1.0 1.0 4.0 4.0

5 rows × 41 columns


In [30]:
# Expose the household id (currently the index) as an integer column
# so it can be used as an explicit merge key.
dfJobsAndEduc['id'] = dfJobsAndEduc.index.astype(int)

In [31]:
dataN.head()


Out[31]:
id AGLO1 AGLO2 headAge spouseAge headAge2 spouseAge2 headFemale spouseFemale headEduc ... headDECCFR spouseDECCFR headDECIFR spouseDECIFR headMaritalStatus spouseMaritalStatus headReading spouseReading headPlaceOfBirth spouseplaceOfBirth
0 1250971 33.0 NaN 57.0 NaN 3249.0 NaN 1.0 NaN 14.0 ... 8.0 NaN 9.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
1 1253451 32.0 NaN 66.0 NaN 4356.0 NaN 1.0 NaN 7.0 ... 8.0 NaN 6.0 NaN 3.0 NaN 1.0 NaN 1.0 NaN
2 1254461 32.0 NaN 79.0 NaN 6241.0 NaN 1.0 NaN 12.0 ... 7.0 NaN 2.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
3 1256691 33.0 NaN 61.0 NaN 3721.0 NaN 0.0 NaN 7.0 ... 8.0 NaN 9.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
4 1256892 32.0 32.0 31.0 23.0 961.0 529.0 0.0 1.0 12.0 ... 5.0 5.0 5.0 5.0 1.0 1.0 1.0 1.0 4.0 4.0

5 rows × 41 columns


In [32]:
print type(dataN['id'][0]),type(dfJobsAndEduc['id'][0])


<type 'numpy.int64'> <type 'numpy.int64'>

In [33]:
dataFinalCSV = pd.merge(left = dfJobsAndEduc ,
                         right = dataN,
                         left_on = 'id',
                         right_on = 'id')
print dataFinalCSV.shape
dataFinalCSV.head()


(2703, 44)
Out[33]:
schoolAndJob noJob job id AGLO1 AGLO2 headAge spouseAge headAge2 spouseAge2 ... headDECCFR spouseDECCFR headDECIFR spouseDECIFR headMaritalStatus spouseMaritalStatus headReading spouseReading headPlaceOfBirth spouseplaceOfBirth
0 40.0 1 3 1250971 33.0 NaN 57.0 NaN 3249.0 NaN ... 8.0 NaN 9.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
1 12.0 1 1 1253451 32.0 NaN 66.0 NaN 4356.0 NaN ... 8.0 NaN 6.0 NaN 3.0 NaN 1.0 NaN 1.0 NaN
2 0.0 1 0 1254461 32.0 NaN 79.0 NaN 6241.0 NaN ... 7.0 NaN 2.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
3 14.0 2 2 1256691 33.0 NaN 61.0 NaN 3721.0 NaN ... 8.0 NaN 9.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
4 24.0 1 2 1256892 32.0 32.0 31.0 23.0 961.0 529.0 ... 5.0 5.0 5.0 5.0 1.0 1.0 1.0 1.0 4.0 4.0

5 rows × 44 columns


In [34]:
dataFinalCSV.to_csv('data/pivotInd.csv',index=False)

In [35]:
# Round-trip check: re-read the file just written and eyeball the head.
finalCheck = pd.read_csv('data/pivotInd.csv')
finalCheck.head()


Out[35]:
schoolAndJob noJob job id AGLO1 AGLO2 headAge spouseAge headAge2 spouseAge2 ... headDECCFR spouseDECCFR headDECIFR spouseDECIFR headMaritalStatus spouseMaritalStatus headReading spouseReading headPlaceOfBirth spouseplaceOfBirth
0 40.0 1 3 1250971 33.0 NaN 57.0 NaN 3249.0 NaN ... 8.0 NaN 9.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
1 12.0 1 1 1253451 32.0 NaN 66.0 NaN 4356.0 NaN ... 8.0 NaN 6.0 NaN 3.0 NaN 1.0 NaN 1.0 NaN
2 0.0 1 0 1254461 32.0 NaN 79.0 NaN 6241.0 NaN ... 7.0 NaN 2.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
3 14.0 2 2 1256691 33.0 NaN 61.0 NaN 3721.0 NaN ... 8.0 NaN 9.0 NaN 4.0 NaN 1.0 NaN 1.0 NaN
4 24.0 1 2 1256892 32.0 32.0 31.0 23.0 961.0 529.0 ... 5.0 5.0 5.0 5.0 1.0 1.0 1.0 1.0 4.0 4.0

5 rows × 44 columns


In [ ]:
# (Fixed: the original first line computed a NaN-filled copy of pivot.values
# and discarded the result — a no-op. Kept here as a comment in case the
# intent was to zero-fill the pivot; if so, assign the result instead.)
# np.where(np.isnan(pivot.values), 0, pivot.values)

# Drop households with any missing head/spouse value.
dataN = dataN.dropna(axis = 0)
dataN.head()

In [41]:
dataN.shape


Out[41]:
(2703, 39)

In [ ]: