Modelos finales

Libraries


In [1]:
import pandas as pd
import numpy as np
import os
import sys
import simpledbf
%pylab inline
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import linear_model


Populating the interactive namespace from numpy and matplotlib

Functions


In [2]:
def runModel(dataset, income, varForModel):
    
    '''
    This function takes a data set, runs a model according to specifications,
    and returns the model, printing the summary
    '''
    y = dataset[income].values
    X = dataset.loc[:,varForModel].values
    X = sm.add_constant(X)

    w = dataset.PONDERA
    
    lm = sm.WLS(y, X, weights=1. / w, missing = 'drop', hasconst=True).fit()
    print lm.summary()
    for i in range(1,len(varForModel)+1):
        print 'x%d: %s' % (i,varForModel[i-1])
    #testing within sample
    R_IS=[]
    R_OS=[]
    #R_prime = []
    n=500
    
    for i in range(n):  
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 200)
        X_train = sm.add_constant(X_train)
        X_test = sm.add_constant(X_test)
        
        lm = linear_model.LinearRegression(fit_intercept=True)
        lm.fit(X_train, y_train, sample_weight = 1. / w[:len(X_train)])
        y_hat_IS = lm.predict(X_train)
        err_IS = y_hat_IS - y_train
        R2_IS = 1 - (np.var(err_IS) / np.var(y_train))
        
        y_hat_OS = lm.predict(X_test)
        err_OS = y_hat_OS - y_test
        R2_OS = 1 - (np.var(err_OS) / np.var(y_test))
        
        R_IS.append(R2_IS)
        R_OS.append(R2_OS)
        
    print("IS R-squared for {} times is {}".format(n,np.mean(R_IS)))
    print("OS R-squared for {} times is {}".format(n,np.mean(R_OS)))

Get Data


In [3]:
#data = pd.read_csv('/resources/data/Data Projects/ADS Project/dataFinalParaModelo.csv')
data = pd.read_csv('data/dataFinalParaModelo.csv')
data.head()


Out[3]:
PONDERA HomeType RoomsNumber FloorMaterial RoofMaterial RoofCoat Water WaterType Toilet ToiletLocation ... spouseDECCFR headDECIFR spouseDECIFR headMaritalStatus spouseMaritalStatus headReading spouseReading headPlaceOfBirth spouseplaceOfBirth sumPredicted
0 1287 2 2 1.0 NaN 1.0 1.0 1 1 1.0 ... NaN 6.0 NaN 5.0 NaN 1.0 NaN 1.0 NaN NaN
1 1674 2 2 1.0 1.0 1.0 1.0 1 1 1.0 ... 6.0 8.0 8.0 2.0 2.0 1.0 1.0 2.0 2.0 15.469188
2 1522 2 4 1.0 NaN 1.0 1.0 1 1 1.0 ... NaN 5.0 NaN 4.0 NaN 1.0 NaN 3.0 NaN NaN
3 1320 2 3 1.0 NaN 1.0 1.0 1 1 1.0 ... 10.0 10.0 10.0 2.0 2.0 1.0 1.0 1.0 1.0 16.235857
4 1281 2 4 1.0 1.0 1.0 1.0 1 1 1.0 ... 9.0 10.0 10.0 2.0 2.0 1.0 1.0 1.0 3.0 8.336136

5 rows × 85 columns

Modifiacion en variables


In [5]:
data['hasSpouse'] = np.where(np.isnan(data.spouseJob.values),0,1)
data['spouseJob'] = np.where(np.isnan(data.spouseJob.values),0,data.spouseJob.values)

In [6]:
data['TotalFamilyIncome'].replace(to_replace=[0], value=[1] , inplace=True, axis=None)

In [7]:
data = data[data.TotalFamilyIncomeDecReg != 0]

In [8]:
data['income_log'] = np.log(data.TotalFamilyIncome)

In [9]:
(data['TotalFamilyIncome']==0).sum()


Out[9]:
0

In [10]:
data['FloorMaterial'] = np.where(np.isnan(data.FloorMaterial.values),5,data.FloorMaterial.values)
data['sumPredicted'] = np.where(np.isnan(data.sumPredicted.values),0,data.sumPredicted.values)
data['Sewer'] = np.where(np.isnan(data.Sewer.values),5,data.Sewer.values)
data['ToiletType'] = np.where(np.isnan(data.ToiletType.values),4,data.ToiletType.values)
data['Water'] = np.where(np.isnan(data.Water.values),4,data.Water.values)
data['RoofCoat'] = np.where(np.isnan(data.RoofCoat.values),2,data.RoofCoat.values)

In [15]:
data['TotalFamilyIncome'].replace(to_replace=[0], value=[1] , inplace=True, axis=None)

In [16]:
data['income_logPer'] = np.log(data.PerCapInc)

In [17]:
data.corr()['TotalFamilyIncome'][data.corr()['TotalFamilyIncome'] > 0.10].sort_values()


Out[17]:
Flooding                   0.108231
spousePrimary              0.122330
OwnershipRec               0.126776
WaterRec                   0.132485
headPrimary                0.147454
HouseMembers               0.181875
hasSpouse                  0.203178
CookingRec                 0.217917
Membermore10               0.246261
headJob                    0.246625
spouseLnIncomeT            0.280361
headPredictedLnIncome      0.286899
spouseJob                  0.304031
SleepingRooms              0.307961
headSecondary              0.312949
spousePredictedLnIncome    0.314176
RoomsNumber                0.316654
spouseSecondary            0.319981
UsableTotalRooms           0.320164
headLnIncome               0.339666
headLnIncomeT              0.349351
sumPredicted               0.360143
spouseLnIncome             0.363158
headEduc                   0.378242
headUniversity             0.386140
headEduc2                  0.403212
spouseEduc                 0.417591
spouseEduc2                0.454361
spouseUniversity           0.462501
job                        0.470682
headDECCFR                 0.607559
PerCapIncDec               0.607559
PerCapIncDecReg            0.615880
schoolAndJob               0.616160
spouseP21                  0.645548
income_logPer              0.651651
headP21                    0.667581
spouseP47T                 0.669564
PerCapInc                  0.692921
spouseDECCFR               0.702022
headP47T                   0.755165
spouseDECIFR               0.766633
TotalFamilyIncomeDec       0.790339
headDECIFR                 0.790339
TotalFamilyIncomeDecReg    0.804034
income_log                 0.846533
lnHouseIncome              0.846533
TotalFamilyIncome          1.000000
TotalHouseHoldIncome       1.000000
Name: TotalFamilyIncome, dtype: float64

In [18]:
data['haciBool'] = (data.Hacinamiento > 3).astype(int)

In [21]:
#data solo para la ciudad de buenos aires
dataCaba = data.loc[data.AGLO1 == 32,:]

Para CABA

Tome dos lineas de accion.

Por un lado tomar la parte individual, de las personas que viven en la casa. Ya sea el nivel educativo del jefe y la cantidad de gente que trabaja o por otro lado usar la suma de años de escolaridad de todos los que trabajan en la casa. Por el otro, las variables que tenian que ver con el hogar, la calidad del hogar. La que mejor funcionaba ahi esa hacinamiento (cantidad de personas por cuarto) o directamente la cantidad de cuartos.

Aca van los modelos que corri en retadatam

Modelo 1 a (educHead)


In [22]:
varForModel = [
    'headEduc',
    #'job', 
    #'SleepingRooms',
    #'schoolAndJob',
    #'Hacinamiento'
]

runModel(dataCaba, 'TotalFamilyIncome', varForModel)


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.098
Model:                            WLS   Adj. R-squared:                  0.097
Method:                 Least Squares   F-statistic:                     85.39
Date:                Sat, 10 Dec 2016   Prob (F-statistic):           2.26e-19
Time:                        13:19:40   Log-Likelihood:                -7720.4
No. Observations:                 785   AIC:                         1.544e+04
Df Residuals:                     783   BIC:                         1.545e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const        826.4193    570.197      1.449      0.148      -292.877  1945.716
x1           395.6744     42.819      9.241      0.000       311.621   479.728
==============================================================================
Omnibus:                      572.340   Durbin-Watson:                   1.917
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            15209.225
Skew:                           2.960   Prob(JB):                         0.00
Kurtosis:                      23.735   Cond. No.                         48.2
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
x1: headEduc
IS R-squared for 500 times is 0.117245206392
OS R-squared for 500 times is 0.0286390690395

Modelo 1 b (educHeadYjobs)


In [24]:
varForModel = [
    'headEduc',
    'job', 
    #'SleepingRooms',
    #'schoolAndJob',
    #'Hacinamiento'
]

runModel(dataCaba, 'TotalFamilyIncome', varForModel)


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.356
Model:                            WLS   Adj. R-squared:                  0.354
Method:                 Least Squares   F-statistic:                     216.0
Date:                Sat, 10 Dec 2016   Prob (F-statistic):           2.10e-75
Time:                        13:20:10   Log-Likelihood:                -7588.4
No. Observations:                 785   AIC:                         1.518e+04
Df Residuals:                     782   BIC:                         1.520e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const      -1315.3112    497.245     -2.645      0.008     -2291.404  -339.219
x1           292.2212     36.685      7.966      0.000       220.208   364.234
x2          2572.8911    145.528     17.680      0.000      2287.220  2858.562
==============================================================================
Omnibus:                      667.358   Durbin-Watson:                   1.910
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            35935.200
Skew:                           3.462   Prob(JB):                         0.00
Kurtosis:                      35.415   Cond. No.                         50.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
x1: headEduc
x2: job
IS R-squared for 500 times is 0.352787822032
OS R-squared for 500 times is 0.308512376509

Modelo 1 c (educHeadYjobsYrooms)

Este para mi es el que mejor da. Sobre estima sistematicamente, la R2 da muy alta, pero es el que mejor rendimiento tiene porque estan todos ajustados a una recta de mejor modo que en los otros. Creo que esto se podría ajustar. Si no usamos R2 sino el correlation coefficient, es el mas alto 96%. Ademas es el mejor mapa me parece


In [25]:
varForModel = [
    'headEduc',
    'job', 
    'SleepingRooms',
    #'schoolAndJob',
    #'Hacinamiento'
]

runModel(dataCaba, 'TotalFamilyIncome', varForModel)


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.367
Model:                            WLS   Adj. R-squared:                  0.365
Method:                 Least Squares   F-statistic:                     151.1
Date:                Sat, 10 Dec 2016   Prob (F-statistic):           3.36e-77
Time:                        13:20:28   Log-Likelihood:                -7581.4
No. Observations:                 785   AIC:                         1.517e+04
Df Residuals:                     781   BIC:                         1.519e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const      -2352.9962    565.552     -4.161      0.000     -3463.179 -1242.814
x1           308.8816     36.654      8.427      0.000       236.931   380.833
x2          2292.6737    162.544     14.105      0.000      1973.599  2611.748
x3           748.2556    199.643      3.748      0.000       356.355  1140.156
==============================================================================
Omnibus:                      654.904   Durbin-Watson:                   1.933
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            33738.140
Skew:                           3.373   Prob(JB):                         0.00
Kurtosis:                      34.400   Cond. No.                         58.6
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
x1: headEduc
x2: job
x3: SleepingRooms
IS R-squared for 500 times is 0.36982755681
OS R-squared for 500 times is 0.28262153194

Modelo 2 a (jobsAndSchool)


In [26]:
varForModel = [
    #'headEduc',
    #'job', 
    #'SleepingRooms',
    'schoolAndJob',
    #'Hacinamiento'
]

runModel(dataCaba, 'TotalFamilyIncome', varForModel)


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.401
Model:                            WLS   Adj. R-squared:                  0.400
Method:                 Least Squares   F-statistic:                     524.0
Date:                Sat, 10 Dec 2016   Prob (F-statistic):           3.48e-89
Time:                        13:20:51   Log-Likelihood:                -7559.9
No. Observations:                 785   AIC:                         1.512e+04
Df Residuals:                     783   BIC:                         1.513e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const       1890.8406    217.127      8.708      0.000      1464.621  2317.060
x1           220.2679      9.623     22.891      0.000       201.379   239.157
==============================================================================
Omnibus:                      676.756   Durbin-Watson:                   1.891
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            40163.175
Skew:                           3.504   Prob(JB):                         0.00
Kurtosis:                      37.334   Cond. No.                         38.0
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
x1: schoolAndJob
IS R-squared for 500 times is 0.396870456935
OS R-squared for 500 times is 0.362450943384

Modelo 2 b (jobsAndSchoolYrooms)


In [27]:
varForModel = [
    #'headEduc',
    #'job', 
    #'SleepingRooms',
    'schoolAndJob',
    'Hacinamiento'
]

runModel(dataCaba, 'TotalFamilyIncome', varForModel)


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.406
Model:                            WLS   Adj. R-squared:                  0.404
Method:                 Least Squares   F-statistic:                     266.8
Date:                Sat, 10 Dec 2016   Prob (F-statistic):           4.52e-89
Time:                        13:21:08   Log-Likelihood:                -7556.8
No. Observations:                 785   AIC:                         1.512e+04
Df Residuals:                     782   BIC:                         1.513e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const       2585.9725    352.863      7.329      0.000      1893.301  3278.644
x1           226.6069      9.922     22.839      0.000       207.130   246.083
x2          -510.0726    204.513     -2.494      0.013      -911.533  -108.612
==============================================================================
Omnibus:                      672.980   Durbin-Watson:                   1.910
Prob(Omnibus):                  0.000   Jarque-Bera (JB):            39775.134
Skew:                           3.473   Prob(JB):                         0.00
Kurtosis:                      37.173   Cond. No.                         68.9
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
x1: schoolAndJob
x2: Hacinamiento
IS R-squared for 500 times is 0.40180764679
OS R-squared for 500 times is 0.36444241289

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:

Modelo con todas las variables (Ignorar)


In [ ]:
income1 = 'income_log'
income2 = 'TotalFamilyIncome'
income = 'TotalFamilyIncomeDecReg'

income = 'income_log'

varForModel = [
    'SleepingRooms','HouseMembers','WaterRec',
    'CookingCombustible',
    
#     'SleepingRooms', 'UsableTotalRooms', 'RoomsNumber', 'HouseMembers', 'WaterRec', # positivas
#     'CookingCombustible', 'FloorMaterial', 'Sewer', 'ToiletType','Sink', 'RoofCoat', 'Water', #negativas
#     'headReading', 'OwnershipRec','WaterType', #sospechoso
    'headAge', 'headEduc','headJob', 'spouseJob' #base
                      ]

runModel(data, income, varForModel)