In [8]:
import getEPH
import categorize
import schoolYears
import make_dummy
import functionsForModels
import pandas as pd
#http://statsmodels.sourceforge.net/devel/examples/generated/example_wls.html
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
# Seed NumPy's global RNG so that any random draws made later in the notebook are repeatable.
# NOTE(review): presumably the repeated IS/OS R-squared evaluation printed by runModel relies on it — confirm.
np.random.seed(1024)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline


/home/pipe/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

NOTAS

  • CALCULAR EN BASE AL MODELO DE CURVAS LAS DERIVADAS Y DONDE HACE EL PICO EN EDAD

DOWNLOAD DATA


In [2]:
# Fetch the EPH dataset 't310'; per the cell output this downloads the file and
# writes a cleaned CSV into data/.
# The helper is called through its module because only `import getEPH` is in scope:
# the bare name `getEPHdbf` raises NameError on a fresh kernel.
# NOTE(review): assumes getEPHdbf is defined in the getEPH module — confirm.
getEPH.getEPHdbf('t310')


('Downloading', 't310')
file in place, creating CSV file
csv file cleanData t310 .csv successfully created in folder data/

In [42]:
# Load the cleaned 't310' CSV from data/ (presumably the file written by the download cell — confirm the name).
data1 = pd.read_csv('data/cleanDatat310.csv')

In [43]:
# Preprocessing pipeline: each project helper takes the previous stage's result and
# returns the next one (data1 -> data2 -> data3 -> data).
# NOTE(review): judging by the module names, these steps recode categories, derive
# schooling years and add dummy columns — confirm against the helper modules.
data2 = categorize.categorize(data1)
data3 = schoolYears.schoolYears(data2)
data = make_dummy.make_dummy(data3)

In [44]:
# Build the modelling frame used by every plot and regression below.
# The preview in the next cell shows it carries the PONDERA column, income in levels
# (P47T, P21), log incomes, education/age terms with their squares, and group dummies.
dataModel = functionsForModels.prepareDataForModel(data)

In [45]:
# Preview the first rows of the modelling frame to sanity-check its columns.
dataModel.head()


Out[45]:
PONDERA P47T P21 primary secondary university male_14to24 male_25to34 female_14to24 female_25to34 female_35more female age education education2 age2 lnIncome lnIncomeT
2 1674 3000 3000 7.0 0.0 0.0 0 0 0 0 0 0 42 7.0 49.0 1764 8.006368 8.006368
3 1674 2800 2800 7.0 5.0 5.0 0 0 0 0 1 1 44 17.0 289.0 1936 7.937375 7.937375
7 1320 6000 5000 7.0 5.0 5.0 0 0 0 0 0 0 38 17.0 289.0 1444 8.517193 8.699515
8 1320 4000 4000 7.0 5.0 5.0 0 0 0 1 0 1 28 17.0 289.0 784 8.294050 8.294050
9 1281 3800 3800 7.0 5.0 5.0 0 0 0 0 0 0 63 17.0 289.0 3969 8.242756 8.242756

NEW VARIABLES FOR MODEL

Gráficos exploratorios


In [11]:
# Exploratory scatter plots on a 2x2 grid:
#   top row    -> total income (P47T), red markers
#   bottom row -> labour income (P21), blue markers
#   left col   -> against education, right col -> against age
# Every panel gets both axis labels so each one can be read on its own.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

ax1.plot(dataModel.education, dataModel.P47T, 'ro')
ax1.set_xlabel('Educacion')
ax1.set_ylabel('Ingreso total')

ax2.plot(dataModel.age, dataModel.P47T, 'ro')
ax2.set_xlabel('Edad')
ax2.set_ylabel('Ingreso total')

ax3.plot(dataModel.education, dataModel.P21, 'bo')
ax3.set_xlabel('Educacion')
ax3.set_ylabel('Ingreso Laboral')

ax4.plot(dataModel.age, dataModel.P21, 'bo')
ax4.set_xlabel('Edad')
# The trailing semicolon keeps the cell from echoing the matplotlib Text repr.
ax4.set_ylabel('Ingreso Laboral');


Out[11]:
<matplotlib.text.Text at 0x7f2c69be8650>

In [12]:
# Kernel density estimates of the four income variables on a 2x2 grid:
# total income in red on the top row (level, then log) and labour income in the
# default colour on the bottom row (level, then log).
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

sns.kdeplot(dataModel['P47T'], color='red', ax=ax1)
sns.kdeplot(dataModel['lnIncomeT'], color='red', ax=ax2)
sns.kdeplot(dataModel['P21'], ax=ax3)
sns.kdeplot(dataModel['lnIncome'], ax=ax4)


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2c680f3390>

In [ ]:
# Mean and standard deviation of log labour income (lnIncome).
# A single-argument print(...) with str.format runs under both Python 2 and Python 3,
# whereas the print statement is a SyntaxError on Python 3.
print('mean: {} std: {}'.format(dataModel.lnIncome.mean(), dataModel.lnIncome.std()))

In [ ]:
# Mean and standard deviation of labour income in levels (P21).
# A single-argument print(...) with str.format runs under both Python 2 and Python 3,
# whereas the print statement is a SyntaxError on Python 3.
print('mean: {} std: {}'.format(dataModel.P21.mean(), dataModel.P21.std()))

In [ ]:
# Box plot of labour income (P21): no notch, outliers drawn as green diamonds.
plt.boxplot(dataModel.P21.tolist(), notch=False, sym='gD')

PLOTS FOR LnINCOME ~ EDUC AND AGE


In [13]:
# Joint plots of log labour income against education and against age.
# Each grid gets a quadratic (order=2) regression fit in the centre panel and
# distribution plots on the margins; the drawing calls are chained because both return the grid.
g = sns.JointGrid(x="education", y="lnIncome", data=dataModel)
g.plot_joint(sns.regplot, order=2).plot_marginals(sns.distplot)

g2 = sns.JointGrid(x="age", y="lnIncome", data=dataModel)
g2.plot_joint(sns.regplot, order=2).plot_marginals(sns.distplot)


Out[13]:
<seaborn.axisgrid.JointGrid at 0x7f2c63c81fd0>

Modelos

Tomo el de mejor performance para evaluar en el test set. Básicamente son dos posibilidades: INDEC o ALTERNATIVO (en el que habíamos propuesto no cortar las edades y los años de escolaridad, sino usar las variables directamente junto con su cuadrado). Cada uno lo pruebo con ingresos laborales (con y sin constante) y con el log del ingreso laboral.

1 CEPAL con ingresos laborales


In [14]:
# Model 1: CEPAL specification with labour income in levels (P21) as the dependent variable.
# runModel is reached through its module, as in every other model cell: only
# `import functionsForModels` is in scope, so the bare name fails on a fresh kernel.
# No `variables` argument is passed, so runModel's default regressor set is used.
# NOTE(review): the stored summary for this cell reports 3568 observations while the
# other model cells report 3608, so it appears to come from an earlier run — re-execute.
dataModel1 = functionsForModels.runModel(dataModel, income='P21')


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.257
Model:                            WLS   Adj. R-squared:                  0.256
Method:                 Least Squares   F-statistic:                     154.1
Date:                Fri, 18 Nov 2016   Prob (F-statistic):          2.16e-223
Time:                        14:58:03   Log-Likelihood:                -31770.
No. Observations:                3568   AIC:                         6.356e+04
Df Residuals:                    3559   BIC:                         6.361e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const       1651.4434    222.230      7.431      0.000      1215.733  2087.153
x1            91.1777     33.520      2.720      0.007        25.457   156.898
x2           162.5962     16.393      9.918      0.000       130.455   194.737
x3           340.9637     18.044     18.896      0.000       305.586   376.341
x4         -1161.1065    118.953     -9.761      0.000     -1394.329  -927.884
x5          -579.6448     91.277     -6.350      0.000      -758.605  -400.685
x6         -1869.1208    138.199    -13.525      0.000     -2140.079 -1598.163
x7         -1561.7263    102.296    -15.267      0.000     -1762.291 -1361.162
x8         -1124.0447     76.523    -14.689      0.000     -1274.078  -974.011
==============================================================================
Omnibus:                     2916.321   Durbin-Watson:                   1.758
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           168978.870
Skew:                           3.472   Prob(JB):                         0.00
Kurtosis:                      35.991   Cond. No.                         61.1
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
x1: primary
x2: secondary
x3: university
x4: male_14to24
x5: male_25to34
x6: female_14to24
x7: female_25to34
x8: female_35more
IS R-squared for 1000 times is 0.265103679995
OS R-squared for 1000 times is 0.260652864625

2 - CEPAL con Log ingresos laborales


In [52]:
# Model 2: CEPAL specification with log labour income (lnIncome) as the
# dependent variable; the eight regressors are listed explicitly.
dataModel2 = functionsForModels.runModel(
    dataModel,
    income='lnIncome',
    variables=[
        'primary', 'secondary', 'university',
        'male_14to24', 'male_25to34',
        'female_14to24', 'female_25to34', 'female_35more',
    ]
)


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.274
Model:                            WLS   Adj. R-squared:                  0.272
Method:                 Least Squares   F-statistic:                     169.6
Date:                Thu, 24 Nov 2016   Prob (F-statistic):          1.71e-243
Time:                        15:29:32   Log-Likelihood:                -4124.3
No. Observations:                3608   AIC:                             8267.
Df Residuals:                    3599   BIC:                             8322.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          6.8690      0.093     73.616      0.000         6.686     7.052
x1             0.0787      0.014      5.585      0.000         0.051     0.106
x2             0.0838      0.007     12.030      0.000         0.070     0.098
x3             0.1221      0.008     15.897      0.000         0.107     0.137
x4            -0.4212      0.050     -8.364      0.000        -0.520    -0.322
x5            -0.1547      0.039     -4.004      0.000        -0.230    -0.079
x6            -0.9296      0.058    -15.926      0.000        -1.044    -0.815
x7            -0.6264      0.043    -14.521      0.000        -0.711    -0.542
x8            -0.5843      0.032    -17.988      0.000        -0.648    -0.521
==============================================================================
Omnibus:                      667.857   Durbin-Watson:                   1.855
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1520.071
Skew:                          -1.050   Prob(JB):                         0.00
Kurtosis:                       5.387   Cond. No.                         60.5
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
x1: primary
x2: secondary
x3: university
x4: male_14to24
x5: male_25to34
x6: female_14to24
x7: female_25to34
x8: female_35more
IS R-squared for 1000 times is 0.28519548832
OS R-squared for 1000 times is 0.277470835953

3 - CEPAL con ingresos totales


In [51]:
# Model 3: CEPAL specification with total income in levels (P47T) as the dependent variable.
# No `variables` argument is passed, so runModel's default regressor set is used.
dataModel3 = functionsForModels.runModel(dataModel, income = 'P47T')


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.253
Model:                            WLS   Adj. R-squared:                  0.251
Method:                 Least Squares   F-statistic:                     152.0
Date:                Thu, 24 Nov 2016   Prob (F-statistic):          4.78e-221
Time:                        15:29:29   Log-Likelihood:                -32860.
No. Observations:                3608   AIC:                         6.574e+04
Df Residuals:                    3599   BIC:                         6.579e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const       2004.2388    268.448      7.466      0.000      1477.914  2530.564
x1            92.0956     40.538      2.272      0.023        12.616   171.575
x2           175.7131     20.050      8.764      0.000       136.403   215.023
x3           454.4130     22.092     20.569      0.000       411.099   497.727
x4         -1310.4315    144.881     -9.045      0.000     -1594.488 -1026.375
x5          -712.6864    111.153     -6.412      0.000      -930.616  -494.756
x6         -2196.2699    167.917    -13.080      0.000     -2525.492 -1867.048
x7         -1824.5013    124.105    -14.701      0.000     -2067.825 -1581.177
x8         -1051.1493     93.448    -11.249      0.000     -1234.365  -867.934
==============================================================================
Omnibus:                     2829.452   Durbin-Watson:                   1.801
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           117722.951
Skew:                           3.371   Prob(JB):                         0.00
Kurtosis:                      30.159   Cond. No.                         60.5
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
x1: primary
x2: secondary
x3: university
x4: male_14to24
x5: male_25to34
x6: female_14to24
x7: female_25to34
x8: female_35more
IS R-squared for 1000 times is 0.256494942837
OS R-squared for 1000 times is 0.252974296836

4 - CEPAL con Log ingresos totales


In [50]:
# Model 4: CEPAL specification with log total income (lnIncomeT) as the dependent variable.
# No `variables` argument is passed, so runModel's default regressor set is used.
dataModel4 = functionsForModels.runModel(dataModel, income = 'lnIncomeT')


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.269
Model:                            WLS   Adj. R-squared:                  0.267
Method:                 Least Squares   F-statistic:                     165.6
Date:                Thu, 24 Nov 2016   Prob (F-statistic):          2.03e-238
Time:                        15:29:24   Log-Likelihood:                -3972.8
No. Observations:                3608   AIC:                             7964.
Df Residuals:                    3599   BIC:                             8019.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          7.2029      0.089     80.505      0.000         7.028     7.378
x1             0.0545      0.014      4.030      0.000         0.028     0.081
x2             0.0749      0.007     11.203      0.000         0.062     0.088
x3             0.1303      0.007     17.701      0.000         0.116     0.145
x4            -0.4616      0.048     -9.559      0.000        -0.556    -0.367
x5            -0.1635      0.037     -4.412      0.000        -0.236    -0.091
x6            -0.9301      0.056    -16.618      0.000        -1.040    -0.820
x7            -0.5881      0.041    -14.219      0.000        -0.669    -0.507
x8            -0.4353      0.031    -13.975      0.000        -0.496    -0.374
==============================================================================
Omnibus:                      712.707   Durbin-Watson:                   1.849
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             2039.735
Skew:                          -1.031   Prob(JB):                         0.00
Kurtosis:                       6.052   Cond. No.                         60.5
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
x1: primary
x2: secondary
x3: university
x4: male_14to24
x5: male_25to34
x6: female_14to24
x7: female_25to34
x8: female_35more
IS R-squared for 1000 times is 0.274975121532
OS R-squared for 1000 times is 0.269271040171

5 - ALTERNATIVO con Log ingresos totales


In [49]:
# Model 5: alternative specification on log total income (lnIncomeT), using
# education and age with their squared terms plus the female dummy.
dataModel5 = functionsForModels.runModel(
    dataModel,
    income='lnIncomeT',
    variables=['education', 'education2', 'age', 'age2', 'female']
)


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.275
Model:                            WLS   Adj. R-squared:                  0.274
Method:                 Least Squares   F-statistic:                     272.8
Date:                Thu, 24 Nov 2016   Prob (F-statistic):          5.89e-248
Time:                        15:29:01   Log-Likelihood:                -3959.0
No. Observations:                3608   AIC:                             7930.
Df Residuals:                    3602   BIC:                             7967.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          6.0980      0.133     45.843      0.000         5.837     6.359
x1            -0.0033      0.015     -0.221      0.825        -0.033     0.026
x2             0.0046      0.001      6.704      0.000         0.003     0.006
x3             0.0499      0.005      9.679      0.000         0.040     0.060
x4            -0.0005   5.94e-05     -7.858      0.000        -0.001    -0.000
x5            -0.4433      0.024    -18.310      0.000        -0.491    -0.396
==============================================================================
Omnibus:                      704.928   Durbin-Watson:                   1.853
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1966.525
Skew:                          -1.029   Prob(JB):                         0.00
Kurtosis:                       5.974   Cond. No.                     2.45e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.45e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
x1: education
x2: education2
x3: age
x4: age2
x5: female
IS R-squared for 1000 times is 0.280277989206
OS R-squared for 1000 times is 0.276299750978

6 - ALTERNATIVO con Log ingresos laborales


In [48]:
# Model 6: alternative specification on log labour income (lnIncome), using
# education and age with their squared terms plus the female dummy.
dataModel6 = functionsForModels.runModel(
    dataModel,
    income='lnIncome',
    variables=['education', 'education2', 'age', 'age2', 'female']
)


                            WLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.287
Model:                            WLS   Adj. R-squared:                  0.286
Method:                 Least Squares   F-statistic:                     289.9
Date:                Thu, 24 Nov 2016   Prob (F-statistic):          2.70e-261
Time:                        15:28:46   Log-Likelihood:                -4091.5
No. Observations:                3608   AIC:                             8195.
Df Residuals:                    3602   BIC:                             8232.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          5.5938      0.138     40.537      0.000         5.323     5.864
x1             0.0277      0.016      1.779      0.075        -0.003     0.058
x2             0.0032      0.001      4.525      0.000         0.002     0.005
x3             0.0659      0.005     12.328      0.000         0.055     0.076
x4            -0.0007   6.16e-05    -11.488      0.000        -0.001    -0.001
x5            -0.5528      0.025    -22.012      0.000        -0.602    -0.504
==============================================================================
Omnibus:                      641.219   Durbin-Watson:                   1.858
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1458.019
Skew:                          -1.012   Prob(JB):                         0.00
Kurtosis:                       5.367   Cond. No.                     2.45e+04
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.45e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
x1: education
x2: education2
x3: age
x4: age2
x5: female
IS R-squared for 1000 times is 0.297340744785
OS R-squared for 1000 times is 0.294845959345

In [ ]: