In [1]:
import getEPH
import functionsForModels
import make_dummy
import schoolYears
import categorize
import functionsForModels
import createVariables
import pandas as pd
#http://statsmodels.sourceforge.net/devel/examples/generated/example_wls.html
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
# Seed NumPy's global random number generator.
np.random.seed(1024)
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
In [2]:
# Run the project's EPH helper for the 't310' dataset.
# NOTE(review): presumably this produces data/cleanDatat310.csv, which the next cell reads -- confirm in getEPH.
getEPH.getEPHdbf('t310')
In [2]:
# Load the cleaned 't310' extract from CSV and display its column labels.
data1 = pd.read_csv('data/cleanDatat310.csv')
data1.columns
Out[2]:
In [3]:
# Pass the raw frame through the project's categorize step.
# NOTE(review): what categorize() changes is not visible here -- see the categorize module.
data2 = categorize.categorize(data1)
In [4]:
# Apply the remaining project preprocessing helpers in sequence.
# NOTE(review): presumably these add the education and derived columns used below
# (education, age2, lnIncome, ...) -- confirm in schoolYears / createVariables.
data3 = schoolYears.schoolYears(data2)
data4 = createVariables.createVariables(data3)
In [5]:
# Display the columns available after the preprocessing steps.
data4.columns
Out[5]:
In [6]:
# TODO (translated from the original note): consider how we handle incomes in decile 0.
# Rows with activity == 1 and P21 greater than 1: the sample used to fit the model.
jobsAndIncome = (data4.activity == 1) & (data4.P21 > 1)
# Rows whose familyRelation is 1 or 2 (mask indexed like data4).
headAndSpouse = (data4.familyRelation == 1) | (data4.familyRelation == 2)
# Filter first, then copy: only the selected rows are duplicated, and the result
# is an independent frame that can safely receive new columns later on.
dataParaModelo = data4.loc[jobsAndIncome, :].copy()
In [7]:
# Column names passed to the model as explanatory variables.
variablesOfInterest = ['age', 'age2', 'female', 'education', 'education2']
In [8]:
# Run the project's model helper with 'lnIncome' as the income variable and the
# columns listed in variablesOfInterest as regressors.
model = functionsForModels.runModel(
    dataset=dataParaModelo,
    income='lnIncome',
    variables=variablesOfInterest,
)
In [9]:
# Matrix used for prediction: the regressor columns as a NumPy array, passed
# through sm.add_constant. The frame is only read here, so it is sliced
# directly instead of deep-copying the whole DataFrame first.
X = sm.add_constant(dataParaModelo.loc[:, variablesOfInterest].values)
In [10]:
# Store the model's predictions for X as a new column on the modelling sample.
dataParaModelo['predictedLnIncome'] = model.predict(X)
In [11]:
#dataParaModelo.loc[:,['id','predictedLnIncome']]
In [12]:
# Keep only the three key columns plus the prediction, ready to be merged
# back onto the full sample; then report the resulting dimensions.
paraMerge = dataParaModelo.loc[
    :, ['CODUSU', 'NRO_HOGAR', 'COMPONENTE', 'predictedLnIncome']
]
print(paraMerge.shape)
In [13]:
# Preview the first rows of the keys-plus-prediction frame.
paraMerge.head()
Out[13]:
In [14]:
# Dimensions of the full sample before the merge.
data4.shape
Out[14]:
In [15]:
# Left-join the predictions onto every row of data4 using the three key columns;
# rows of data4 without a match keep a missing predictedLnIncome.
data = data4.merge(
    paraMerge,
    how='left',
    on=['CODUSU', 'NRO_HOGAR', 'COMPONENTE'],
)
In [16]:
# Dimensions after the merge.
data.shape
Out[16]:
In [17]:
# Rows without a prediction get 0. A single .loc assignment is used instead of
# the chained form data.col[mask] = value, which can end up writing to a
# temporary copy (pandas' SettingWithCopy situation).
data.loc[data['predictedLnIncome'].isnull(), 'predictedLnIncome'] = 0
In [18]:
# Create the job variables: job is 1 where activity equals 1, noJob is 1 otherwise.
data['job'] = data['activity'].eq(1).astype(int)
data['noJob'] = data['activity'].ne(1).astype(int)
# education multiplied by the job flag, i.e. zero for rows where job is 0.
data['schoolAndJob'] = data['job'] * data['education']
In [19]:
# Report the dimensions and preview the frame with the new job columns.
print(data.shape)
data.head()
Out[19]:
In [20]:
# Number of active members in the household (sum of job per id).
cantidadActivos = data.groupby('id')['job'].sum()
# Number of inactive members in the household (sum of noJob per id).
cantidadInactivos = data.groupby('id')['noJob'].sum()
# Years of schooling put to work in the household (sum of schoolAndJob per id).
schoolAndJob = data.groupby('id')['schoolAndJob'].sum()
In [21]:
# Combine the three per-id totals into one frame by joining on their shared
# index; the columns end up in the order schoolAndJob, noJob, job.
dfJobsAndEduc = (
    schoolAndJob.to_frame()
    .merge(cantidadInactivos.to_frame(), left_index=True, right_index=True)
    .merge(cantidadActivos.to_frame(), left_index=True, right_index=True)
)
print(dfJobsAndEduc.shape)
dfJobsAndEduc.head()
Out[21]:
In [24]:
# Keep only the rows with familyRelation 1 or 2 and the columns used downstream.
# The row filter is computed from `data` itself: the earlier headAndSpouse mask
# was built on data4, while `data` is the output of pd.merge and carries its own
# row labels, so reusing that mask only works if both frames line up exactly.
# The copy is taken after filtering so only the selected rows are duplicated.
cleanData = data.loc[
    data['familyRelation'].isin([1, 2]),
    ['id',
     'AGLOMERADO',
     'familyRelation',
     'age',
     'age2',
     'female',
     'education',
     'education2',
     'primary',
     'secondary',
     'university',
     'P21',
     'P47T',
     'lnIncome',
     u'lnIncomeT',
     'predictedLnIncome',
     'job',
     'DECCFR',
     'DECIFR',
     'maritalStatus',
     'reading',
     'placeOfBirth',
     ]
].copy()
In [25]:
# Preview the filtered frame before reshaping.
cleanData.head()
Out[25]:
In [26]:
# Reshape to one row per id: every remaining column is split by familyRelation
# value, so the result has two-level column labels (variable, familyRelation).
# pandas' pivot raises if an (id, familyRelation) pair occurs more than once.
pivot = cleanData.pivot(index='id', columns='familyRelation')
pivot.head()
Out[26]:
In [27]:
# Report the dimensions of the pivoted table and display its column labels.
print(pivot.shape)
pivot.columns
Out[27]:
In [28]:
# Write the pivoted table to CSV.
# NOTE(review): a later cell writes the merged result to this same path, overwriting this file.
pivot.to_csv('data/pivotInd.csv')
In [29]:
# To be read from another file: reload the pivoted CSV with flat column names.
# The first three rows of the file are skipped and the names below are used as
# the column labels instead (one head/spouse pair per pivoted variable).
dataN = pd.read_csv(
    'data/pivotInd.csv',
    skiprows=3,
    names=[
        'id',
        'AGLO1', 'AGLO2',
        'headAge', 'spouseAge',
        'headAge2', 'spouseAge2',
        'headFemale', 'spouseFemale',
        'headEduc', 'spouseEduc',
        'headEduc2', 'spouseEduc2',
        'headPrimary', 'spousePrimary',
        'headSecondary', 'spouseSecondary',
        'headUniversity', 'spouseUniversity',
        'headP21', 'spouseP21',
        'headP47T', 'spouseP47T',
        'headLnIncome', 'spouseLnIncome',
        'headLnIncomeT', 'spouseLnIncomeT',
        'headPredictedLnIncome', 'spousePredictedLnIncome',
        'headJob', 'spouseJob',
        'headDECCFR', 'spouseDECCFR',
        'headDECIFR', 'spouseDECIFR',
        'headMaritalStatus', 'spouseMaritalStatus',
        'headReading', 'spouseReading',
        'headPlaceOfBirth', 'spouseplaceOfBirth',
    ],
)
dataN.head()
Out[29]:
In [30]:
# Expose the group key as an ordinary integer column so it can be used as a merge key.
dfJobsAndEduc['id'] = dfJobsAndEduc.index
dfJobsAndEduc['id'] = dfJobsAndEduc['id'].astype(int)
# The index (built by grouping on data['id']) is itself named 'id'. Clear that
# name so 'id' refers only to the column; pandas versions that check for it
# treat a label that is both an index level and a column as ambiguous when
# merging on it. Index.rename returns a new Index, so no other object is modified.
dfJobsAndEduc.index = dfJobsAndEduc.index.rename(None)
In [31]:
# Preview the reloaded, flat-named table again before merging.
dataN.head()
Out[31]:
In [32]:
# Show the element types of the two 'id' columns, which are about to be used as merge keys.
print('%s %s' % (type(dataN['id'][0]), type(dfJobsAndEduc['id'][0])))
In [33]:
# Inner-join the per-id totals with the head/spouse table on the shared 'id' column,
# then report the dimensions and preview the result.
dataFinalCSV = dfJobsAndEduc.merge(dataN, on='id')
print(dataFinalCSV.shape)
dataFinalCSV.head()
Out[33]:
In [34]:
# Write the merged table without the index column.
# NOTE(review): this overwrites the pivot CSV written earlier at the same path, so the
# earlier read_csv cell (skiprows=3 with explicit names) no longer matches the file
# on disk after this runs.
dataFinalCSV.to_csv('data/pivotInd.csv',index=False)
In [35]:
# Read the file just written back in and preview it as a sanity check.
caca = pd.read_csv('data/pivotInd.csv')
caca.head()
Out[35]:
In [ ]:
# Drop every row of dataN that contains a missing value and preview the result.
# Missing values are removed here, not zero-filled: an np.where(...) expression
# over pivot.values used to sit in this cell, but its result was never assigned
# or displayed, so it had no effect and is omitted.
dataN = dataN.dropna(axis = 0)
dataN.head()
In [41]:
# Dimensions of dataN after dropping rows with missing values.
dataN.shape
Out[41]:
In [ ]: