In [46]:
# helper functions
import getEPH
import categorize
import createVariables
import schoolYears
import make_dummy
import functionsForModels
# libraries
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
np.random.seed(1024)
%matplotlib inline
In [47]:
getEPH.getEPHdbf('t310')
In [48]:
data1 = pd.read_csv('data/cleanDatat310.csv')
In [49]:
data2 = categorize.categorize(data1)
data3 = schoolYears.schoolYears(data2)
data4 = createVariables.createVariables(data3)
In [50]:
jobsAndIncome = (data4.activity == 1) & (data4.P21 > 1) # we only consider people who are working and have income
headAndSpouse = (data4.familyRelation == 1) | (data4.familyRelation == 2) # household heads and their spouses
dataParaModelo = data4.copy().loc[jobsAndIncome,:]
In [51]:
variablesOfInterest = ['age',
'age2',
'female',
'education',
'education2']
In [52]:
model = functionsForModels.runModel(dataset = dataParaModelo, income = 'lnIncome', variables = variablesOfInterest)
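functionsForModels.runModel is a project-specific helper whose source is not shown in this notebook. As a rough, hypothetical sketch only (an assumption about what such a wrapper might do, not the actual implementation), it presumably fits an OLS of log income on the selected regressors plus a constant and reports the fit:
# hypothetical sketch of an OLS wrapper like runModel (assumed, not the project's code)
def runModelSketch(dataset, income, variables):
    y = dataset[income].values                      # dependent variable: log income
    X = sm.add_constant(dataset[variables].values)  # regressors plus an intercept
    fitted = sm.OLS(y, X).fit()                     # ordinary least squares fit
    print fitted.summary()                          # coefficients, standard errors, R-squared
    return fitted
The constant added here mirrors the sm.add_constant call in the next cell, which rebuilds the same design matrix to generate predictions.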
In [53]:
X = sm.add_constant(dataParaModelo.copy().loc[:,variablesOfInterest].values)
In [54]:
dataParaModelo['predictedLnIncome'] = model.predict(X) # in-sample prediction of log income for the working subsample
We create a merged dataset that combines the subset of individuals with a job and income with the full set of individuals. People outside that subset (children, the unemployed, and those reporting no income) are assigned a predicted log income of 0, so the resulting dataset matches every individual to their household. A quick sanity check of the result follows the merge cells below.
In [55]:
paraMerge = dataParaModelo.loc[:,['CODUSU', 'NRO_HOGAR', 'COMPONENTE','predictedLnIncome']]
paraMerge.head()
Out[55]:
In [56]:
data = pd.merge(left = data4 , right = paraMerge, on = ['CODUSU', 'NRO_HOGAR', 'COMPONENTE'], how = 'left')
In [57]:
data['predictedLnIncome'] = data['predictedLnIncome'].fillna(0) # individuals outside the working subsample get a predicted log income of 0
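A quick sanity check (a sketch using the columns defined above; the exact counts depend on the survey wave) that every individual kept a row after the merge and that only people outside the working subsample received the imputed 0:
# rows in the merged individual dataset vs. rows that carry a model prediction
print 'total individuals:', data.shape[0]
print 'with a predicted log income:', (data.predictedLnIncome != 0).sum()
print 'imputed zero (no job or no income):', (data.predictedLnIncome == 0).sum()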
In [58]:
# Variables related to Occupation
data['job'] = (data.activity==1).astype(int)
data['noJob'] = (data.activity!=1).astype(int)
data['schoolAndJob'] = data.job * data.education
In [59]:
cantidadActivos = data.job.groupby(by=data['id']).sum() # Number of people working in the household
cantidadInactivos = data.noJob.groupby(by=data['id']).sum() # Number of people not working in the household
schoolAndJob = data.schoolAndJob.groupby(by=data['id']).sum() # Sum of total schooling years in the household
In [60]:
dfJobsAndEduc = pd.merge(left = schoolAndJob.to_frame() ,
right = cantidadInactivos.to_frame(),
left_index = True,
right_index = True)
dfJobsAndEduc = pd.merge(left = dfJobsAndEduc ,
right = cantidadActivos.to_frame(),
left_index = True,
right_index = True)
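Equivalently, the two pairwise merges above can be collapsed into a single groupby/agg over the household id; a minimal alternative sketch (dfJobsAndEduc_alt is a hypothetical name) producing the same three columns:
# one-step alternative: sum the three per-person indicators by household id
dfJobsAndEduc_alt = data.groupby('id')[['schoolAndJob', 'noJob', 'job']].sum()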
In [61]:
cleanData = data.copy().loc[(headAndSpouse),
['id',
'AGLOMERADO',
'familyRelation',
'age',
'age2',
'female',
'education',
'education2',
'primary',
'secondary',
'university',
'P21',
'P47T',
'lnIncome',
u'lnIncomeT',
'predictedLnIncome',
'job',
'DECCFR',
'DECIFR',
'maritalStatus',
'reading',
'placeOfBirth',
]]
In [62]:
cleanData.head()
Out[62]:
In [63]:
pivot = cleanData.pivot(index='id', columns='familyRelation')
pivot.head()
Out[63]:
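The pivot reshapes the data from long form (one row per person) to wide form (one row per household, with separate columns for each family relation). A toy illustration, assuming the coding 1 = head and 2 = spouse used above:
# toy example: two people in household 'H1' become a single wide row
toy = pd.DataFrame({'id': ['H1', 'H1'],
                    'familyRelation': [1, 2],  # 1 = head, 2 = spouse (assumed coding)
                    'age': [40, 38]})
print toy.pivot(index='id', columns='familyRelation')
# the columns become a MultiIndex: ('age', 1) for the head and ('age', 2) for the spouse,
# which is why the CSV is re-read below with explicit column names and skiprows=3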
In [64]:
pivot.to_csv('data/pivotInd.csv')
In [65]:
dataN = pd.read_csv('data/pivotInd.csv', names = ['id','AGLO1','AGLO2','headAge','spouseAge','headAge2','spouseAge2',
'headFemale','spouseFemale','headEduc','spouseEduc',
'headEduc2','spouseEduc2','headPrimary','spousePrimary',
'headSecondary','spouseSecondary','headUniversity','spouseUniversity',
'headP21','spouseP21','headP47T','spouseP47T',
'headLnIncome','spouseLnIncome','headLnIncomeT','spouseLnIncomeT',
'headPredictedLnIncome','spousePredictedLnIncome','headJob','spouseJob',
'headDECCFR','spouseDECCFR','headDECIFR','spouseDECIFR',
'headMaritalStatus','spouseMaritalStatus',
'headReading','spouseReading','headPlaceOfBirth','spouseplaceOfBirth',
],skiprows = 3)
In [66]:
dfJobsAndEduc['id'] = dfJobsAndEduc.index
dfJobsAndEduc['id'] = dfJobsAndEduc['id'].astype(int)
dataFinalCSV = pd.merge(left = dfJobsAndEduc ,
right = dataN,
left_on = 'id',
right_on = 'id')
In [67]:
dataFinalCSV.to_csv('data/pivotInd.csv',index=False)
dataFinalCSV.head()
Out[67]:
In [68]:
getEPH.getEPHdbf('t310')
hog = pd.read_csv('data/cleanDataHouseholdt310.csv')
print hog.shape
hog.head()
Out[68]:
In [69]:
def remove9(df, variables):
    # recode the missing-data code 9 as NaN
    for var in variables:
        df[var] = df[var].replace(9, np.nan)

def remove0(df, variables):
    # recode the missing-data code 0 as NaN
    for var in variables:
        df[var] = df[var].replace(0, np.nan)

def remove99(df, variables):
    # recode the missing-data code 99 as NaN
    for var in variables:
        df[var] = df[var].replace(99, np.nan)
In [70]:
hog2 = hog.copy()
remove9(df = hog2, variables = ['FloorMaterial','RoofMaterial','RoofCoat','Water','WaterType','Toilet','ToiletLocation',
'ToiletType','Sewer','DumpSites','Flooding','EmergencyLoc','CookingCombustible',
'BathroomUse'])
remove0(df = hog2, variables = ['FloorMaterial','RoofMaterial','RoofCoat','Water','WaterType','Toilet','ToiletLocation',
'ToiletType','Sewer','DumpSites','Flooding','EmergencyLoc','Ownership','CookingCombustible',
'BathroomUse'])
remove99(df = hog2, variables = ['Ownership'])
In [71]:
variables = ['CookingCombustible']
for var in variables:
    print hog2[var].value_counts()
    plt.scatter(hog2[var], hog2.TotalHouseHoldIncome)
    plt.show()
In [72]:
hog2['CookingRec'] = np.nan
# collapse CookingCombustible into three ordered categories (codes 3 and 4 are grouped together)
hog2.loc[hog2.CookingCombustible == 4, 'CookingRec'] = 1
hog2.loc[hog2.CookingCombustible == 3, 'CookingRec'] = 1
hog2.loc[hog2.CookingCombustible == 2, 'CookingRec'] = 2
hog2.loc[hog2.CookingCombustible == 1, 'CookingRec'] = 3
In [73]:
variables = ['CookingRec']
for var in variables:
    print hog2[var].value_counts()
    plt.scatter(hog2[var], hog2.TotalHouseHoldIncome)
    plt.show()
In [74]:
hog2['WaterRec'] = (hog2.Water == 1).astype(int)
hog2['OwnershipRec'] = ((hog2.Ownership == 1) | (hog2.Ownership == 3)).astype(int)
hog2['Hacinamiento'] = hog2.HouseMembers * 1.0 / hog2.SleepingRooms  # members per sleeping room (overcrowding)
hog2['id'] = (hog2.CODUSU.astype(str) + hog2.NRO_HOGAR.astype(str))
hog2['TotalHouseHoldIncome'] = hog2['TotalHouseHoldIncome'].replace(0, 1)  # so the log below yields 0 instead of -inf
hog2['lnHouseIncome'] = np.log(hog2['TotalHouseHoldIncome'])
sinCuartosParaDormir = (hog2.SleepingRooms == 0)  # households reporting no sleeping rooms (dropped later)
In [75]:
hogReducida = hog2.copy().drop(['CODUSU','NRO_HOGAR','REGION','HomeTypeesp','FloorMaterialesp',
'WaterTypeesp','Ownershipesp','CookingCombustibleesp','DomesticService1',
'DomesticService2', 'DomesticService3','DomesticService4', 'DomesticService5',
'DomesticService6'],axis = 1)
In [76]:
# Load data previously exported
ind = pd.read_csv('data/pivotInd.csv')
ind['id'] = ind['id'].astype(str)
ind.drop(['AGLO2'],axis =1,inplace=True)
ind['sumPredicted'] = ind.headPredictedLnIncome + ind.spousePredictedLnIncome
In [77]:
hogReducida = hogReducida.copy().loc[~sinCuartosParaDormir, :]  # drop households with no sleeping rooms (Hacinamiento is undefined there)
In [78]:
# check before merge
print 'household rows (hog):', hogReducida.shape[0]
print 'individual-level rows (ind):', ind.shape[0]
print 'ind ids found in hog:', ind['id'].isin(hogReducida['id']).sum()
In [79]:
dataUnida = pd.merge(left=hogReducida, right=ind, on='id',how='left')
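After the left join, a quick check (a sketch; headAge is used only as a proxy for "no individual-level match") of how many households did not find a matching wide individual record:
# households in the merged table and how many lack individual-level information
print 'households total:', dataUnida.shape[0]
print 'households without an individual-level match:', dataUnida['headAge'].isnull().sum()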
In [80]:
dataUnida.to_csv('data/dataFinalParaModelo.csv',index=False)
In [ ]: