In [1]:
import pandas as pd
import getEPH
import statsmodels.formula.api as slm
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from scipy import stats
from pandas.stats.api import ols
from sklearn import linear_model
In [40]:
#getEPH.getEPHdbf('t310')
In [164]:
# Load the cleaned household CSV (name taken from the file path) and preview
# its first five rows.
data = pd.read_csv(filepath_or_buffer='data/cleanDataHouseholdt310.csv')
data.head(n=5)
Out[164]:
In [165]:
# Keep only the REGION 1 rows, then discard every column that contains at
# least one missing value.
data = data.query('REGION == 1').dropna(axis=1)
# Household key: CODUSU and NRO_HOGAR rendered as strings and concatenated.
# Once the key exists, the two source columns and the six DomesticService
# columns are removed.
data = data.assign(id=data.CODUSU.map(str) + data.NRO_HOGAR.map(str)).drop(
    ['CODUSU', 'NRO_HOGAR',
     'DomesticService1', 'DomesticService2', 'DomesticService3',
     'DomesticService4', 'DomesticService5', 'DomesticService6'],
    axis=1
)
def _replace_code_with_nan(df, variables, code):
    """Blank out one sentinel value in selected columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    variables : iterable of str
        Column names to process. A name that is not a column of ``df``
        raises ``KeyError``.
    code : scalar
        Value to replace with NaN.

    Returns
    -------
    None
    """
    for var in variables:
        # Assign the cleaned column back explicitly. The previous form,
        # ``df[var].replace(..., inplace=True, axis=None)``, mutated a
        # temporary Series (not guaranteed to write through to ``df``) and
        # passed an ``axis`` keyword that current pandas releases reject.
        df[var] = df[var].replace(code, np.nan)


def remove9(df, variables):
    """Replace the value 9 with NaN in each listed column of ``df`` (in place)."""
    _replace_code_with_nan(df, variables, 9)


def remove0(df, variables):
    """Replace the value 0 with NaN in each listed column of ``df`` (in place)."""
    _replace_code_with_nan(df, variables, 0)


def remove99(df, variables):
    """Replace the value 99 with NaN in each listed column of ``df`` (in place)."""
    _replace_code_with_nan(df, variables, 99)
# Blank out the placeholder codes column by column: each cleaner is paired
# with the list of columns it is applied to, and the pairs run in order.
for blank_out, columns in (
    (remove9, ['FloorMaterial', 'RoofMaterial', 'RoofCoat', 'Water', 'WaterType',
               'Toilet', 'ToiletLocation', 'ToiletType', 'Sewer', 'DumpSites',
               'Flooding', 'EmergencyLoc', 'CookingCombustible', 'BathroomUse']),
    (remove0, ['FloorMaterial', 'RoofMaterial', 'RoofCoat', 'Water', 'WaterType',
               'Toilet', 'ToiletLocation', 'ToiletType', 'Sewer', 'DumpSites',
               'Flooding', 'EmergencyLoc', 'Ownership', 'CookingCombustible',
               'BathroomUse', 'TotalHouseHoldIncome']),
    (remove99, ['Ownership', 'RoomsNumber']),
):
    blank_out(df=data, variables=columns)
data.head()
Out[165]:
In [166]:
# Column order with the last column moved to the front (the last column is
# the "id" key appended during cleaning).
cols = data.columns.tolist()
if cols:
    cols.insert(0, cols.pop())
In [167]:
#data['weights'] = ( 1.0 / data.PONDERA )
# Reorder the columns according to `cols`. The explicit copy makes `df` an
# independent frame, so the assignment below writes to `df` itself rather
# than to a slice of `data` (the pattern behind SettingWithCopyWarning).
df = data[cols].copy()
# Put household income on a log scale. Item assignment is used instead of
# attribute assignment (`df.TotalHouseHoldIncome = ...`), which only updates
# a column when that column already exists.
df['TotalHouseHoldIncome'] = np.log(df['TotalHouseHoldIncome'])
df.head()
Out[167]:
In [168]:
# REGION is constant after the REGION == 1 filter and PONDERA is not wanted
# as a regressor, so both are removed before modelling.
df = df.drop(['REGION', 'PONDERA'], axis=1)
# Columns containing NaN are kept: the OLS fits later in the notebook pass
# missing='drop'. (A bare `df.dropna(axis=1)` call that stood here discarded
# its result and therefore never removed anything; it was dead code.)
df.head()
Out[168]:
In [188]:
# Modelling frame: every column of df except the first one, which holds the
# household "id" key.
df1 = df.iloc[:, 1:df.shape[1]]
In [189]:
# Regress log household income on every other column of df1. The predictors
# are selected by name, so the formula does not depend on the response being
# the last column (a positional `columns[:-1]` slice would otherwise put the
# response on both sides of the formula if the column order changed).
predictors = df1.columns.drop('TotalHouseHoldIncome')
lm = slm.ols(
    formula='TotalHouseHoldIncome ~ ' + ' + '.join(predictors),
    data=df1,
    missing='drop'  # observations with missing values are left out of the fit
).fit()
lm.summary()
Out[189]:
In [ ]:
In [ ]:
In [ ]:
In [190]:
# Screen the predictors of the first model: keep those whose coefficient
# p-value in `lm` is below 0.05. Candidates are picked by name (every column
# except the response) instead of assuming the response is the last column.
feats = df1.columns.drop('TotalHouseHoldIncome')
PV = [feat for feat in feats if lm.pvalues[str(feat)] < 0.05]
# Reduced frame: the response first, followed by the retained predictors.
df2 = pd.concat([df1['TotalHouseHoldIncome'], df1[PV]], axis=1)
df2.head()
Out[190]:
In [191]:
# Refit using only the predictors that passed the p-value screen. Column 0
# of df2 is the response, so the right-hand side is built from the rest.
retained_terms = ' + '.join(df2.columns[1:])
lm2 = slm.ols(
    formula='TotalHouseHoldIncome ~ ' + retained_terms,
    data=df2,
    missing='drop'
).fit()
lm2.summary()
Out[191]:
In [ ]:
In [ ]:
In [ ]: