In [2]:
import getEPH
import functionsForModels
import make_dummy
import schoolYears
import categorize
import functionsForModels
import createVariables
import pandas as pd
#http://statsmodels.sourceforge.net/devel/examples/generated/example_wls.html
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
np.random.seed(1024)
%matplotlib inline
In [ ]:
getEPH.getEPHdbf('t310')
In [5]:
hog = pd.read_csv('data/cleanDataHouseholdt310.csv')
In [4]:
hog.columns
Out[4]:
In [6]:
print hog.shape
hog.head()
Out[6]:
HomeType IV 1, NO CREO QUE SIRVA Tipo de vivienda ( por observación)
FloorMaterial SIRVE SACAR TAL COMO ESTA SACAR 9 SE PUEDE HACER PCA CON LAS DE ABAJO IV3 N(1) Los pisos interiores son principalmente de...
RoofMaterial SIRVE SACAR TAL COMO ESTA SACAR 9 IV4 N(2) La cubierta exterior del techo es de....
RoofCoat IV5 N(1) ¿El techo tiene cielorraso / revestimiento interior? 1 = Sí 2 = No
Water IV6 N(1) Tiene agua...
WaterType IV7 N(1) El agua es de...
'Toilet', IV8 N(1) ¿Tiene baño / letrina?
'ToiletLocation'
'ToiletType',
'IV11', = 'Sewer', 'IV12_1', = 'DumpSites', 'IV12_2', = 'Flooding', 'IV12_3', = 'EmergencyLoc', 'II1', = 'UsableTotalRooms', 'II2', = 'SleepingRooms', #esta es para hacinamiento 'II3', = 'OfficeRooms', 'II3_1', = 'OnlyWork', 'II4_1', = 'Kitchen', 'II4_2', = 'Sink', 'II4_3', = 'Garage', 'II7', = 'Ownership', #JUNTAR 1 Y 3 EN SITUACION CONTACTUAL ESTABLE 'II7_ESP', = 'Ownershipesp', 'II8', = 'CookingCombustible', 'II8_ESP', = 'CookingCombustibleesp' 'II9', = 'BathroomUse', 'V1', = 'Working',
In [7]:
def remove9(df,variables):
for var in variables:
df[var].replace(to_replace=[9], value=[np.nan] , inplace=True, axis=None)
def remove0(df,variables):
for var in variables:
df[var].replace(to_replace=[0], value=[np.nan] , inplace=True, axis=None)
def remove99(df,variables):
for var in variables:
df[var].replace(to_replace=[99], value=[np.nan] , inplace=True, axis=None)
In [8]:
hog2 = hog.copy()
Limpieza de valores que estan mal 99, 9 y 0 cuando no corresponde
In [9]:
remove9(df = hog2, variables = ['FloorMaterial','RoofMaterial','RoofCoat','Water','WaterType','Toilet','ToiletLocation',
'ToiletType','Sewer','DumpSites','Flooding','EmergencyLoc','CookingCombustible',
'BathroomUse'])
remove0(df = hog2, variables = ['FloorMaterial','RoofMaterial','RoofCoat','Water','WaterType','Toilet','ToiletLocation',
'ToiletType','Sewer','DumpSites','Flooding','EmergencyLoc','Ownership','CookingCombustible',
'BathroomUse'])
remove99(df = hog2, variables = ['Ownership'])
In [10]:
variables = ['CookingCombustible']
for var in variables:
print hog2[var].value_counts()
plt.scatter(hog2[var], hog2.TotalHouseHoldIncome)
plt.show()
Recategorizacion de variables para que tengan mas sentido
In [11]:
hog2['CookingRec'] = np.nan
In [12]:
hog2['CookingRec'][hog2.CookingCombustible == 4] = 1
hog2['CookingRec'][hog2.CookingCombustible == 3] = 1
hog2['CookingRec'][hog2.CookingCombustible == 2] = 2
hog2['CookingRec'][hog2.CookingCombustible == 1] = 3
In [13]:
variables = ['CookingRec']
for var in variables:
print hog2[var].value_counts()
plt.scatter(hog2[var], hog2.TotalHouseHoldIncome)
plt.show()
In [14]:
#hogFinal = hog2.copy().loc[:,['FloorMaterial','RoofMaterial','RoofCoat','Water','WaterType','Toilet','ToiletLocation',
# 'ToiletType','Sewer']]
#solamente las que tienen 1. Por cañería dentro de la vivienda
hog2['WaterRec'] = (hog2.Water == 1).astype(int)
hog2['OwnershipRec'] = ((hog2.Ownership == 1) | (hog2.Ownership == 3)).astype(int)
hog2['Hacinamiento'] = hog2.HouseMembers * 1.0 / hog2.SleepingRooms
hog2['id'] = (hog2.CODUSU.astype(str) + hog2.NRO_HOGAR.astype(str))
#saco los ingresos del hogar
hog2['TotalHouseHoldIncome'].replace(to_replace=[0], value=[1] , inplace=True, axis=None)
hog2['lnHouseIncome'] = np.log(hog2['TotalHouseHoldIncome'])
In [15]:
variables = ['Hacinamiento']
for var in variables:
print hog2[var].value_counts()
plt.scatter(hog2[var], hog2.TotalHouseHoldIncome)
plt.show()
In [16]:
sinCuartosParaDormir = (hog2.SleepingRooms == 0)
sum(sinCuartosParaDormir)
Out[16]:
In [17]:
hog2.loc[sinCuartosParaDormir,:].head()
Out[17]:
In [18]:
hog2.head()
Out[18]:
In [19]:
hog2.columns
Out[19]:
In [20]:
sns.kdeplot(hog2.lnHouseIncome)
Out[20]:
In [21]:
hogReducida = hog2.copy().drop(['CODUSU','NRO_HOGAR','REGION','HomeTypeesp','FloorMaterialesp',
'WaterTypeesp','Ownershipesp','CookingCombustibleesp','DomesticService1',
'DomesticService2', 'DomesticService3','DomesticService4', 'DomesticService5',
'DomesticService6'],axis = 1)
In [22]:
hogReducida.head()
Out[22]:
In [23]:
#para leer en otro archivo
ind = pd.read_csv('data/pivotInd.csv')
ind['id'] = ind['id'].astype(str)
In [24]:
ind.head()
Out[24]:
In [27]:
ind.drop(['AGLO2'],axis =1,inplace=True)
In [28]:
#sumo los valores predecidos del modelo
ind['sumPredicted'] = ind.headPredictedLnIncome + ind.spousePredictedLnIncome
In [29]:
#sacar los hogares sin cuartos para dormir
hogReducida = hogReducida.copy().loc[~sinCuartosParaDormir,:]
In [30]:
#chequeo
print 'filas hog:',hogReducida.shape[0]
print 'filas ind:',ind.shape[0]
print 'cantidad de ind en hog:', sum(ind['id'].sort_values().isin(hogReducida['id'].sort_values()))
In [31]:
dataUnida = pd.merge(left=hogReducida, right=ind, on='id',how='left')
In [32]:
dataUnida.to_csv('data/dataFinalParaModelo.csv',index=False)
In [33]:
print dataUnida.shape
dataUnida.columns
Out[33]:
In [34]:
dataUnida.head()
Out[34]:
In [35]:
plt.boxplot([dataUnida.lnHouseIncome])
Out[35]:
In [ ]:
In [ ]:
def runModel(dataset, income , variables):
'''
This function takes a data set, runs a model according to specifications,
and returns the model, printing the summary
'''
selectedVariables = ['PONDERA',income] + variables
dataToRun = dataset.loc[:,selectedVariables]
y = dataToRun.copy().iloc[:,1].values
X = sm.add_constant(dataToRun.copy().iloc[:,2:].values)
w = dataToRun.copy().iloc[:,0].values
lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variables)+1):
print 'x%d: %s' % (i,variables[i-1])
y = dataUnida[ingreso].values)
X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)
w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)
lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variables)+1):
print 'x%d: %s' % (i,variables[i-1])
#testing within sample
R_IS=[]
R_OS=[]
nCross=1000
for i in range(nCross):
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, test_size=0.33)
lm = sm.WLS(y_train, X_train, weights=1. / w_train, hasconst=False).fit()
R_IS.append(1-((np.asarray(lm.predict(exog = X_train))-y_train)**2).sum()/((y_train-np.mean(y_train))**2).sum())
R_OS.append(1-((np.asarray(lm.predict(exog = X_test))-y_test)**2).sum()/((y_test-np.mean(y_test))**2).sum())
print("IS R-squared for {} times is {}".format(nCross,np.mean(R_IS)))
print("OS R-squared for {} times is {}".format(nCross,np.mean(R_OS)))
return lm
In [ ]:
ingreso = 'lnHouseIncome'
variablesParaModelo = [u'RoomsNumber', u'FloorMaterial', u'RoofMaterial',
u'RoofCoat', u'Water', u'WaterType', u'Toilet', u'ToiletLocation',
u'ToiletType', u'Sewer', u'DumpSites', u'Flooding', u'EmergencyLoc',
u'UsableTotalRooms', u'SleepingRooms', u'OfficeRooms', u'OnlyWork',
u'Kitchen', u'Sink', u'Garage', u'Ownership', u'CookingCombustible',
u'BathroomUse', u'Working', u'HouseMembers', u'Memberless10',
u'Membermore10', u'WaterRec', u'OwnershipRec',
u'Hacinamiento', u'headAge', u'spouseAge',
u'headAge2', u'spouseAge2', u'headFemale', u'spouseFemale', u'headEduc',
u'spouseEduc', u'headEduc2', u'spouseEduc2', u'headPrimary',
u'spousePrimary', u'headSecondary', u'spouseSecondary',
u'headUniversity', u'spouseUniversity']
y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)
X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)
w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)
lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variables)+1):
print 'x%d: %s' % (i,variables[i-1])
In [ ]:
In [ ]:
In [ ]:
from sklearn.decomposition import PCA
variablesParaModelo = ['FloorMaterial', 'RoofMaterial',
'RoofCoat', 'Water', 'WaterType', 'Toilet', 'ToiletLocation',
'ToiletType', 'Sewer','CookingCombustible',
'BathroomUse','OwnershipRec']
X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
n = 12
pca = PCA(n)
Xproj = pca.fit_transform(X)
eigenvalues = pca.explained_variance_
fig = plt.figure(figsize=(16,4))
ax = fig.add_subplot(1,3,1)
ax.bar(np.arange(n), eigenvalues);
ax.set_xlabel("Dimensionality")
ax.set_ylabel("Variance")
ratio = pca.explained_variance_ratio_
ax2 = fig.add_subplot(1,3,2)
ax2.bar(np.arange(n), ratio);
ax2.set_xlabel("Dimensionality")
ax2.set_ylabel("%of variance")
sumRatio = np.cumsum(pca.explained_variance_ratio_)
ax3 = fig.add_subplot(1,3,3)
ax3.bar(np.arange(n), sumRatio);
ax3.set_xlabel("Dimensionality")
ax3.set_ylabel("%Cumulative sum of variance")
In [ ]:
plt.scatter(Xproj[:, 0], Xproj[:, 1])
In [ ]:
dataUnida['HousePCA1'] = Xproj[:, 0]
dataUnida['HousePCA2'] = Xproj[:, 1]
In [ ]:
In [ ]:
ingreso = 'lnHouseIncome'
variablesParaModelo = ['HousePCA1', 'HousePCA2','Hacinamiento',
'headAge', 'spouseAge',
'headAge2', 'spouseAge2', 'headFemale', 'spouseFemale', 'headEduc',
'spouseEduc', 'headEduc2', 'spouseEduc2'
]
y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)
X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)
w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)
lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variablesParaModelo)+1):
print 'x%d: %s' % (i,variablesParaModelo[i-1])
In [ ]:
ingreso = 'lnHouseIncome'
variablesParaModelo = ['HousePCA1', 'HousePCA2','Hacinamiento','headPredictedLnIncome','spousePredictedLnIncome']
y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)
X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)
w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)
lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variablesParaModelo)+1):
print 'x%d: %s' % (i,variablesParaModelo[i-1])
In [ ]:
ingreso = 'lnHouseIncome'
variablesParaModelo = ['HousePCA1', 'HousePCA2','sumPredicted']
y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)
X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)
w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)
lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variablesParaModelo)+1):
print 'x%d: %s' % (i,variablesParaModelo[i-1])
In [ ]:
ingreso = 'lnHouseIncome'
variablesParaModelo = ['HousePCA1', 'HousePCA2','headAge','headEduc']
y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)
X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)
w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)
lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variablesParaModelo)+1):
print 'x%d: %s' % (i,variablesParaModelo[i-1])
In [ ]:
ingreso = 'lnHouseIncome'
variablesParaModelo = ['HousePCA1', 'HousePCA2','Hacinamiento',
'headAge', 'spouseAge',
'headAge2', 'spouseAge2', 'headFemale', 'spouseFemale', 'headEduc',
'spouseEduc', 'headEduc2', 'spouseEduc2'
]
y = np.where(np.isnan(dataUnida[ingreso].values),0,dataUnida[ingreso].values)
X = np.where(np.isnan(dataUnida.loc[:,variablesParaModelo].values),0,dataUnida.loc[:,variablesParaModelo].values)
X = sm.add_constant(X)
w = np.where(np.isnan(dataUnida.PONDERA.values),0,dataUnida.PONDERA.values)
lm = sm.WLS(y, X, weights=1. / w).fit()
print lm.summary()
for i in range(1,len(variablesParaModelo)+1):
print 'x%d: %s' % (i,variablesParaModelo[i-1])