In [29]:
import pandas as pd
import numpy as np
import os
import sys
import simpledbf
%matplotlib inline
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import linear_model
In [30]:
def runModel(dataset, income, varForModel):
    '''
    This function takes a data set, fits a weighted least-squares model
    according to the specification, prints the summary, and reports the
    average in-sample and out-of-sample R-squared over repeated splits.
    '''
    y = dataset[income].values
    X = dataset.loc[:, varForModel].values
    X = sm.add_constant(X)
    w = dataset.PONDERA
    lm = sm.WLS(y, X, weights=1. / w, missing='drop', hasconst=True).fit()
    print(lm.summary())
    for i in range(1, len(varForModel) + 1):
        print('x%d: %s' % (i, varForModel[i - 1]))
    # testing within and out of sample
    R_IS = []
    R_OS = []
    n = 500
    for i in range(n):
        # use a different random_state each iteration (a fixed seed would
        # repeat the same split n times) and split the weights together
        # with X and y so sample_weight stays aligned with the training rows
        X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
            X, y, w, test_size=0.3, random_state=i)
        lm = linear_model.LinearRegression(fit_intercept=True)
        lm.fit(X_train, y_train, sample_weight=1. / w_train)
        y_hat_IS = lm.predict(X_train)
        err_IS = y_hat_IS - y_train
        R2_IS = 1 - (np.var(err_IS) / np.var(y_train))
        y_hat_OS = lm.predict(X_test)
        err_OS = y_hat_OS - y_test
        R2_OS = 1 - (np.var(err_OS) / np.var(y_test))
        R_IS.append(R2_IS)
        R_OS.append(R2_OS)
    print("IS R-squared averaged over {} splits is {}".format(n, np.mean(R_IS)))
    print("OS R-squared averaged over {} splits is {}".format(n, np.mean(R_OS)))
In [31]:
#data = pd.read_csv('/resources/data/Data Projects/ADS Project/dataFinalParaModelo.csv')
data = pd.read_csv('data/dataFinalParaModelo.csv')
data.head()
Out[31]:
In [32]:
data.columns
Out[32]:
In [33]:
import seaborn as sns
sns.set(context="paper", font="monospace")
# define the candidate predictors here so this cell runs standalone;
# the model cells below redefine varForModel as the specification evolves
varForModel = ['headAge', 'headEduc', 'headJob', 'spouseJob']
corrmat = data.loc[:, varForModel].corr()
f, ax = plt.subplots(figsize=(18, 16))
sns.heatmap(corrmat, vmax=.8, square=True)
f.tight_layout()
In [34]:
df1 = data.loc[:,'HomeType':'spouseUniversity']
df2 = data.loc[:,'headJob':'spouseJob']
df3 = data.loc[:,'headMaritalStatus':'spouseplaceOfBirth']
df_lm = pd.concat([df1,df2,df3], axis=1)
In [35]:
data['hasSpouse'] = data.spouseJob.notna().astype(int)
data['spouseJob'] = data.spouseJob.fillna(0)
In [36]:
# replace zero incomes with 1 so the log transform below is defined
data['TotalFamilyIncome'] = data['TotalFamilyIncome'].replace(0, 1)
In [37]:
data = data[data.TotalFamilyIncomeDecReg != 0]
In [38]:
data.columns
Out[38]:
In [39]:
data['income_log'] = np.log(data.TotalFamilyIncome)
In [40]:
(data['TotalFamilyIncome']==0).sum()
Out[40]:
In [41]:
income1 = 'income_log'
income2 = 'TotalFamilyIncome'
income = 'TotalFamilyIncomeDecReg'
varForModel = [
'headAge', 'headEduc','headJob', 'spouseJob', 'hasSpouse'
]
runModel(data, income1, varForModel)
In [42]:
for i in varForModel:
    print(i, data[i].isnull().sum())
In [43]:
data.RoofCoat.value_counts()
Out[43]:
In [44]:
# fill missing housing variables with fixed category codes
data['FloorMaterial'] = data.FloorMaterial.fillna(5)
data['sumPredicted'] = data.sumPredicted.fillna(0)
data['Sewer'] = data.Sewer.fillna(5)
data['ToiletType'] = data.ToiletType.fillna(4)
data['Water'] = data.Water.fillna(4)
data['RoofCoat'] = data.RoofCoat.fillna(2)
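The same imputation can be written table-driven, which keeps the fill codes in one place and makes them easier to audit; a minimal equivalent sketch using the codes from the cell above:

fill_codes = {'FloorMaterial': 5, 'sumPredicted': 0, 'Sewer': 5,
              'ToiletType': 4, 'Water': 4, 'RoofCoat': 2}
data = data.fillna(value=fill_codes)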
In [45]:
data.Working.value_counts()
Out[45]:
In [46]:
corrs = data.corr()['TotalFamilyIncomeDecReg']
corrs[corrs > 0.15].sort_values()
Out[46]:
In [47]:
income1 = 'income_log'
income2 = 'TotalFamilyIncome'
# income = 'TotalFamilyIncomeDecReg'
income = 'income_log'
varForModel = [
'SleepingRooms','HouseMembers','WaterRec',
'CookingCombustible',
# 'SleepingRooms', 'UsableTotalRooms', 'RoomsNumber', 'HouseMembers', 'WaterRec',  # positive
# 'CookingCombustible', 'FloorMaterial', 'Sewer', 'ToiletType', 'Sink', 'RoofCoat', 'Water',  # negative
# 'headReading', 'OwnershipRec', 'WaterType',  # suspicious
'headAge', 'headEduc', 'headJob', 'spouseJob'  # baseline
]
runModel(data, income, varForModel)
In [48]:
data['TotalFamilyIncome'] = data['TotalFamilyIncome'].replace(0, 1)
In [49]:
data['income_logPer'] = np.log(data.PerCapInc)
In [112]:
corrs = data.corr()['TotalFamilyIncome']
corrs[corrs > 0.10].sort_values()
Out[112]:
In [51]:
# flag heavily overcrowded households (crowding index Hacinamiento above 3)
data['haciBool'] = (data.Hacinamiento > 3).astype(int)
In [130]:
incomePer1 = 'PerCapInc'
incomePer2 = 'income_logPer'
incomePer3 = 'PerCapIncDecReg'
income1 = 'TotalFamilyIncome'
income2 = 'income_log'
income3 = 'TotalFamilyIncomeDecReg'
varForModel = [
'headEduc',
#'CookingRec',
#'schoolAndJob',
#'WaterRec',
#'OwnershipRec',
'job',
#'noJob',
#'SleepingRooms',
'haciBool',
#'Hacinamiento'
]
runModel(data, income1, varForModel)
# if using per-capita income, comment out SleepingRooms and haciBool
#runModel(data, incomePer2, varForModel)
In [225]:
varForModel = [
'headEduc','job',
'SleepingRooms',
'schoolAndJob',
#'Hacinamiento'
]
dataCaba = data.loc[data.AGLO1 == 32, :]  # AGLO1 == 32: City of Buenos Aires (CABA)
runModel(dataCaba, 'TotalFamilyIncome', varForModel)
# if using per-capita income, comment out SleepingRooms and haciBool
#runModel(data, incomePer2, varForModel)
In [ ]:
In [173]:
corrCaba = dataCaba.corr()['TotalFamilyIncome']
corrCaba[corrCaba > -0.2].sort_values()
Out[173]:
In [56]:
data.AGLO1.value_counts().sum()
Out[56]:
In [57]:
data.shape
Out[57]:
In [ ]:
y = data[income].values
X = data.loc[:, varForModel]
# no constant is added here: the formula-based OLS and the sklearn
# estimators below fit the intercept themselves
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=200)
In [ ]:
Number_variables = range(len(X_train.columns))
# # select best lambda for Ridge
# lambdas = np.exp(np.linspace(-5, 13, 200))
# lambda_r_optimal = Regularization_fit_lambda(1, X_train, y_train, lambdas, p=0.4, Graph=True)
# # select lambdas for Lasso
# lambdas = np.exp(np.linspace(-5, 6.5, 200))
# lambda_l_optimal = Regularization_fit_lambda(2, X_train, y_train, lambdas, p=0.4, Graph=True)
OLS_R_2_OS_F = []
OLS_R_2_IS_F = []
OLS_R_2_Ridge_OS_F = []
OLS_R_2_Ridge_IS_F = []
OLS_R_2_Lasso_OS_F = []
OLS_R_2_Lasso_IS_F = []
Ridge = linear_model.Ridge(fit_intercept=True, alpha=1)
Lasso = linear_model.Lasso(fit_intercept=True, alpha=1)
# y_train comes back from train_test_split as an array; wrap it in a named
# Series so it can be concatenated with X_train and referenced in the formula
y_train_s = pd.Series(y_train, index=X_train.index, name='Y')
for j in Number_variables:
    # OLS on the first j+1 variables
    lm = sm.OLS.from_formula('Y ~ ' + ' + '.join(X_train.columns[:j + 1]),
                             data=pd.concat([X_train.iloc[:, :j + 1], y_train_s], axis=1)).fit()
    error = lm.predict(X_test.iloc[:, :j + 1]) - y_test
    R_2_OS_OLS = 1 - error.var() / y_test.var()
    R_2_IS_OLS = lm.rsquared
    OLS_R_2_IS_F.append(R_2_IS_OLS)
    OLS_R_2_OS_F.append(max(R_2_OS_OLS, 0))
    # Ridge
    Ridge.fit(X_train.iloc[:, :j + 1], y_train)
    # in sample
    err_IS = Ridge.predict(X_train.iloc[:, :j + 1]) - y_train
    R_2_IS_Ridge = 1 - np.var(err_IS) / np.var(y_train)
    OLS_R_2_Ridge_IS_F.append(R_2_IS_Ridge)
    # out of sample
    err_OS = Ridge.predict(X_test.iloc[:, :j + 1]) - y_test
    R_2_OS_Ridge = 1 - np.var(err_OS) / np.var(y_test)
    OLS_R_2_Ridge_OS_F.append(max(R_2_OS_Ridge, 0))
    # Lasso
    Lasso.fit(X_train.iloc[:, :j + 1], y_train)
    # in sample
    p_IS = Lasso.predict(X_train.iloc[:, :j + 1])
    err_IS = p_IS - y_train
    R_2_IS_Lasso = 1 - np.var(err_IS) / np.var(y_train)
    OLS_R_2_Lasso_IS_F.append(R_2_IS_Lasso)
    # out of sample
    p_OS = Lasso.predict(X_test.iloc[:, :j + 1])
    err_OS = p_OS - y_test
    R_2_OS_Lasso = 1 - np.var(err_OS) / np.var(y_test)
    OLS_R_2_Lasso_OS_F.append(max(R_2_OS_Lasso, 0))
plt.rcParams['figure.figsize'] = [14, 10]
plt.title('In- and out-of-sample R-squared as variables are added')
plt.plot(Number_variables, OLS_R_2_IS_F, 'g', label='OLS_IS')
plt.plot(Number_variables, OLS_R_2_Lasso_IS_F, 'y', label='Lasso_IS')
plt.plot(Number_variables, OLS_R_2_Ridge_IS_F, 'k', label='Ridge_IS')
plt.plot(Number_variables, OLS_R_2_OS_F, 'b', label='OLS_OS')
plt.plot(Number_variables, OLS_R_2_Lasso_OS_F, 'c', label='Lasso_OS')
plt.plot(Number_variables, OLS_R_2_Ridge_OS_F, 'r', label='Ridge_OS')
plt.legend(loc='lower right')
plt.xlabel('Number of independent variables')
plt.ylabel('R-squared')
plt.show()
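The commented-out Regularization_fit_lambda helper above is not defined in this notebook; as a substitute, scikit-learn's cross-validated estimators can pick the penalty from the same log-spaced grids. A minimal sketch, assuming the X_train and y_train from the cell above:

from sklearn.linear_model import RidgeCV, LassoCV

alphas_ridge = np.exp(np.linspace(-5, 13, 200))   # same grid as the Ridge comment above
alphas_lasso = np.exp(np.linspace(-5, 6.5, 200))  # same grid as the Lasso comment above
ridge_cv = RidgeCV(alphas=alphas_ridge).fit(X_train, y_train)
lasso_cv = LassoCV(alphas=alphas_lasso, cv=5).fit(X_train, y_train)
print('best Ridge alpha:', ridge_cv.alpha_)
print('best Lasso alpha:', lasso_cv.alpha_)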
In [ ]: