In [1]:
import pandas as pd
import getEPH
import statsmodels.formula.api as slm
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from scipy import stats
from pandas.stats.api import ols
from sklearn import linear_model

In [40]:
#getEPH.getEPHdbf('t310')

In [164]:
data = pd.read_csv('data/cleanDataHouseholdt310.csv')
data.head()


Out[164]:
CODUSU NRO_HOGAR REGION PONDERA HomeType HomeTypeesp RoomsNumber FloorMaterial FloorMaterialesp RoofMaterial ... HouseMembers Memberless10 Membermore10 TotalHouseHoldIncome DomesticService1 DomesticService2 DomesticService3 DomesticService4 DomesticService5 DomesticService6
0 302468 1 1 1287 2 NaN 2 1 NaN 9 ... 2 0 2 4000.0 1 0 2 0 0 0
1 307861 1 1 1674 2 NaN 2 1 NaN 1 ... 4 1 3 5800.0 1 2 98 0 0 0
2 308762 1 1 1522 2 NaN 4 1 NaN 9 ... 1 0 1 3200.0 1 0 98 0 0 0
3 308278 1 1 1320 2 NaN 3 1 NaN 9 ... 2 0 2 10000.0 2 0 96 0 0 0
4 311937 1 1 1281 2 NaN 4 1 NaN 1 ... 4 0 4 11000.0 2 4 96 0 0 0

5 rows × 44 columns


In [165]:
data = data.query('REGION == 1')
data = data.dropna(axis = 1)
data["id"] = data.CODUSU.map(str) + data.NRO_HOGAR.map(str)
data = data.drop(['CODUSU','NRO_HOGAR', 'DomesticService1','DomesticService2',
                  'DomesticService3','DomesticService4','DomesticService5','DomesticService6'],axis = 1)
def remove9(df,variables):
    for var in variables:
        df[var].replace(to_replace=[9], value=[np.nan] , inplace=True, axis=None)

def remove0(df,variables):
    for var in variables:
        df[var].replace(to_replace=[0], value=[np.nan] , inplace=True, axis=None)

def remove99(df,variables):
    for var in variables:
        df[var].replace(to_replace=[99], value=[np.nan] , inplace=True, axis=None)

        
remove9(df = data, variables = ['FloorMaterial','RoofMaterial','RoofCoat','Water','WaterType','Toilet','ToiletLocation',
                               'ToiletType','Sewer','DumpSites','Flooding','EmergencyLoc','CookingCombustible',
                               'BathroomUse'])
remove0(df = data, variables = ['FloorMaterial','RoofMaterial','RoofCoat','Water','WaterType','Toilet','ToiletLocation',
                               'ToiletType','Sewer','DumpSites','Flooding','EmergencyLoc','Ownership','CookingCombustible',
                               'BathroomUse', 'TotalHouseHoldIncome'])
remove99(df = data, variables = ['Ownership', 'RoomsNumber'])

data.head()


Out[165]:
REGION PONDERA HomeType RoomsNumber FloorMaterial RoofMaterial RoofCoat Water WaterType Toilet ... Garage Ownership CookingCombustible BathroomUse Working HouseMembers Memberless10 Membermore10 TotalHouseHoldIncome id
0 1 1287 2 2.0 1.0 NaN 1.0 1.0 1 1 ... 2 3.0 1.0 1.0 2 2 0 2 4000.0 3024681
1 1 1674 2 2.0 1.0 1.0 1.0 1.0 1 1 ... 2 5.0 1.0 1.0 1 4 1 3 5800.0 3078611
2 1 1522 2 4.0 1.0 NaN 1.0 1.0 1 1 ... 2 1.0 1.0 1.0 2 1 0 1 3200.0 3087621
3 1 1320 2 3.0 1.0 NaN 1.0 1.0 1 1 ... 2 1.0 4.0 1.0 1 2 0 2 10000.0 3082781
4 1 1281 2 4.0 1.0 1.0 1.0 1.0 1 1 ... 2 1.0 1.0 1.0 1 4 0 4 11000.0 3119371

5 rows × 32 columns


In [166]:
cols = data.columns.tolist()
cols = cols[-1:] + cols[:-1]

In [167]:
#data['weights'] = ( 1.0 / data.PONDERA )
df = data[cols]
df.TotalHouseHoldIncome = np.log(df.TotalHouseHoldIncome)
df.head()


Out[167]:
id REGION PONDERA HomeType RoomsNumber FloorMaterial RoofMaterial RoofCoat Water WaterType ... Sink Garage Ownership CookingCombustible BathroomUse Working HouseMembers Memberless10 Membermore10 TotalHouseHoldIncome
0 3024681 1 1287 2 2.0 1.0 NaN 1.0 1.0 1 ... 2 2 3.0 1.0 1.0 2 2 0 2 8.294050
1 3078611 1 1674 2 2.0 1.0 1.0 1.0 1.0 1 ... 2 2 5.0 1.0 1.0 1 4 1 3 8.665613
2 3087621 1 1522 2 4.0 1.0 NaN 1.0 1.0 1 ... 1 2 1.0 1.0 1.0 2 1 0 1 8.070906
3 3082781 1 1320 2 3.0 1.0 NaN 1.0 1.0 1 ... 2 2 1.0 4.0 1.0 1 2 0 2 9.210340
4 3119371 1 1281 2 4.0 1.0 1.0 1.0 1.0 1 ... 2 2 1.0 1.0 1.0 1 4 0 4 9.305651

5 rows × 32 columns


In [168]:
df = df.drop(['REGION', 'PONDERA'], axis = 1)
df.dropna(axis = 1)
df.head()


Out[168]:
id HomeType RoomsNumber FloorMaterial RoofMaterial RoofCoat Water WaterType Toilet ToiletLocation ... Sink Garage Ownership CookingCombustible BathroomUse Working HouseMembers Memberless10 Membermore10 TotalHouseHoldIncome
0 3024681 2 2.0 1.0 NaN 1.0 1.0 1 1 1.0 ... 2 2 3.0 1.0 1.0 2 2 0 2 8.294050
1 3078611 2 2.0 1.0 1.0 1.0 1.0 1 1 1.0 ... 2 2 5.0 1.0 1.0 1 4 1 3 8.665613
2 3087621 2 4.0 1.0 NaN 1.0 1.0 1 1 1.0 ... 1 2 1.0 1.0 1.0 2 1 0 1 8.070906
3 3082781 2 3.0 1.0 NaN 1.0 1.0 1 1 1.0 ... 2 2 1.0 4.0 1.0 1 2 0 2 9.210340
4 3119371 2 4.0 1.0 1.0 1.0 1.0 1 1 1.0 ... 2 2 1.0 1.0 1.0 1 4 0 4 9.305651

5 rows × 30 columns


In [188]:
df1 = df.iloc[:,1:]

In [189]:
lm = slm.ols(formula = 'TotalHouseHoldIncome ~ ' + ' + '.join(df1.columns[:-1]), data = df1, missing = 'drop').fit()
lm.summary()


Out[189]:
OLS Regression Results
Dep. Variable: TotalHouseHoldIncome R-squared: 0.359
Model: OLS Adj. R-squared: 0.352
Method: Least Squares F-statistic: 47.04
Date: Wed, 30 Nov 2016 Prob (F-statistic): 4.14e-189
Time: 12:58:18 Log-Likelihood: -2138.0
No. Observations: 2209 AIC: 4330.
Df Residuals: 2182 BIC: 4484.
Df Model: 26
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 4.4606 0.167 26.698 0.000 4.133 4.788
HomeType 0.0638 0.030 2.125 0.034 0.005 0.123
RoomsNumber 0.0237 0.068 0.348 0.728 -0.110 0.157
FloorMaterial -0.1786 0.046 -3.914 0.000 -0.268 -0.089
RoofMaterial -0.0158 0.012 -1.274 0.203 -0.040 0.009
RoofCoat -0.0470 0.044 -1.080 0.280 -0.132 0.038
Water 0.0042 0.068 0.061 0.951 -0.130 0.138
WaterType -0.0427 0.037 -1.161 0.246 -0.115 0.029
Toilet 4.4606 0.167 26.698 0.000 4.133 4.788
ToiletLocation -0.0893 0.076 -1.183 0.237 -0.237 0.059
ToiletType -0.1175 0.058 -2.028 0.043 -0.231 -0.004
Sewer -0.0694 0.025 -2.784 0.005 -0.118 -0.021
DumpSites 0.0206 0.050 0.409 0.682 -0.078 0.120
Flooding 0.0983 0.039 2.493 0.013 0.021 0.176
EmergencyLoc 0.1317 0.132 1.000 0.317 -0.127 0.390
UsableTotalRooms 0.0783 0.069 1.133 0.257 -0.057 0.214
SleepingRooms -0.0226 0.028 -0.807 0.420 -0.078 0.032
OfficeRooms -0.0211 0.049 -0.433 0.665 -0.116 0.074
OnlyWork 0.0598 0.029 2.075 0.038 0.003 0.116
Kitchen 0.1602 0.054 2.943 0.003 0.053 0.267
Sink -0.1097 0.030 -3.664 0.000 -0.168 -0.051
Garage -0.0465 0.032 -1.455 0.146 -0.109 0.016
Ownership -0.0179 0.009 -1.989 0.047 -0.036 -0.000
CookingCombustible -0.2108 0.038 -5.526 0.000 -0.286 -0.136
BathroomUse -0.0325 0.050 -0.654 0.513 -0.130 0.065
Working -0.7175 0.040 -17.751 0.000 -0.797 -0.638
HouseMembers 0.0408 0.007 5.471 0.000 0.026 0.055
Memberless10 -0.0468 0.013 -3.620 0.000 -0.072 -0.021
Membermore10 0.0876 0.011 8.101 0.000 0.066 0.109
Omnibus: 163.549 Durbin-Watson: 1.942
Prob(Omnibus): 0.000 Jarque-Bera (JB): 282.213
Skew: -0.542 Prob(JB): 5.23e-62
Kurtosis: 4.375 Cond. No. 1.02e+16

In [ ]:


In [ ]:


In [ ]:


In [190]:
PV = []
feats = df1.columns[:-1]
for feat in feats:
    if lm.pvalues[str(feat)] < 0.05:
        PV.append(feat)
df2 = pd.concat([df1['TotalHouseHoldIncome'],df1[PV]],axis = 1)
df2.head()


Out[190]:
TotalHouseHoldIncome HomeType FloorMaterial Toilet ToiletType Sewer Flooding OnlyWork Kitchen Sink Ownership CookingCombustible Working HouseMembers Memberless10 Membermore10
0 8.294050 2 1.0 1 1.0 1.0 2.0 0 1 2 3.0 1.0 2 2 0 2
1 8.665613 2 1.0 1 1.0 1.0 2.0 0 1 2 5.0 1.0 1 4 1 3
2 8.070906 2 1.0 1 1.0 1.0 1.0 0 1 1 1.0 1.0 2 1 0 1
3 9.210340 2 1.0 1 1.0 1.0 2.0 0 1 2 1.0 4.0 1 2 0 2
4 9.305651 2 1.0 1 1.0 1.0 1.0 0 1 2 1.0 1.0 1 4 0 4

In [191]:
lm2 = slm.ols(formula = 'TotalHouseHoldIncome ~ ' + ' + '.join(df2.columns[1:]), data = df2, missing = 'drop').fit()
lm2.summary()


Out[191]:
OLS Regression Results
Dep. Variable: TotalHouseHoldIncome R-squared: 0.334
Model: OLS Adj. R-squared: 0.331
Method: Least Squares F-statistic: 101.4
Date: Wed, 30 Nov 2016 Prob (F-statistic): 1.54e-220
Time: 12:58:35 Log-Likelihood: -2626.3
No. Observations: 2639 AIC: 5281.
Df Residuals: 2625 BIC: 5363.
Df Model: 13
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
Intercept 4.7087 0.065 72.555 0.000 4.581 4.836
HomeType 0.0801 0.026 3.079 0.002 0.029 0.131
FloorMaterial -0.2564 0.043 -5.976 0.000 -0.341 -0.172
Toilet 4.7087 0.065 72.555 0.000 4.581 4.836
ToiletType -0.1714 0.053 -3.258 0.001 -0.274 -0.068
Sewer -0.1158 0.023 -4.970 0.000 -0.161 -0.070
Flooding 0.1293 0.038 3.415 0.001 0.055 0.204
OnlyWork 0.0746 0.027 2.811 0.005 0.023 0.127
Kitchen 0.0830 0.051 1.615 0.107 -0.018 0.184
Sink -0.1506 0.027 -5.602 0.000 -0.203 -0.098
Ownership -0.0345 0.008 -4.113 0.000 -0.051 -0.018
CookingCombustible -0.2248 0.035 -6.478 0.000 -0.293 -0.157
Working -0.7210 0.036 -19.892 0.000 -0.792 -0.650
HouseMembers 0.0428 0.006 6.809 0.000 0.030 0.055
Memberless10 -0.0590 0.013 -4.697 0.000 -0.084 -0.034
Membermore10 0.1018 0.010 10.607 0.000 0.083 0.121
Omnibus: 139.418 Durbin-Watson: 1.907
Prob(Omnibus): 0.000 Jarque-Bera (JB): 206.505
Skew: -0.462 Prob(JB): 1.44e-45
Kurtosis: 4.012 Cond. No. 5.83e+16

In [ ]:


In [ ]:


In [ ]: