In [8]:
import getEPH
import categorize
import schoolYears
import make_dummy
import functionsForModels
import pandas as pd
#http://statsmodels.sourceforge.net/devel/examples/generated/example_wls.html
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
# Seed NumPy's global random generator so any random draws are repeatable.
np.random.seed(1024)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
NOTAS
In [2]:
# Get the raw data for the 't310' dataset.
# Only the module `getEPH` is imported, so the helper must be reached through
# it; a bare `getEPHdbf(...)` raises NameError on a fresh kernel.
# NOTE(review): assumes getEPHdbf is defined in the getEPH module — confirm.
getEPH.getEPHdbf('t310')
In [42]:
# Load the cleaned 't310' dataset from CSV
# (presumably the file written by the download step above — confirm).
data1 = pd.read_csv('data/cleanDatat310.csv')
In [43]:
# Preprocessing pipeline; each helper takes the previous frame and returns a
# new one. What each step adds is defined in its own module (presumably
# categorical recodes, years of schooling, and dummy variables — confirm).
data2 = categorize.categorize(data1)
data3 = schoolYears.schoolYears(data2)
data = make_dummy.make_dummy(data3)
In [44]:
# Build the modelling dataset used by every plot and regression below.
dataModel = functionsForModels.prepareDataForModel(data)
In [45]:
# Preview the first rows of the modelling dataset.
dataModel.head()
Out[45]:
In [11]:
# 2x2 grid of scatter plots: total income (P47T, red) on the top row and
# labour income (P21, blue) on the bottom row, each against education and age.
fig = plt.figure(figsize=(16, 12))
ax1, ax2, ax3, ax4 = (fig.add_subplot(2, 2, pos) for pos in (1, 2, 3, 4))

# Top row: total income.
ax1.plot(dataModel['education'], dataModel['P47T'], 'ro')
ax1.set_ylabel('Ingreso total')
ax1.set_xlabel('Educacion')
ax2.plot(dataModel['age'], dataModel['P47T'], 'ro')
ax2.set_xlabel('Edad')

# Bottom row: labour income.
ax3.plot(dataModel['education'], dataModel['P21'], 'bo')
ax3.set_ylabel('Ingreso Laboral')
ax3.set_xlabel('Educacion')
ax4.plot(dataModel['age'], dataModel['P21'], 'bo')
# Kept as the last expression: its return value is what the cell displays.
ax4.set_xlabel('Edad')
Out[11]:
In [12]:
# 2x2 grid of kernel density estimates: the raw income columns on the left
# (P47T, P21) and the lnIncomeT / lnIncome columns on the right
# (presumably their log transforms — confirm in prepareDataForModel).
fig = plt.figure(figsize=(16, 12))
ax1, ax2, ax3, ax4 = (fig.add_subplot(2, 2, pos) for pos in (1, 2, 3, 4))

# Top row in red.
sns.kdeplot(dataModel['P47T'], ax=ax1, color='red')
sns.kdeplot(dataModel['lnIncomeT'], ax=ax2, color='red')

# Bottom row in the default colour.
sns.kdeplot(dataModel['P21'], ax=ax3)
sns.kdeplot(dataModel['lnIncome'], ax=ax4)
Out[12]:
In [ ]:
# Mean and standard deviation of lnIncome.
# Written as a single pre-formatted string so the line is valid, and prints the
# same text, on both Python 2 and Python 3 (the bare `print x, y` statement is
# a SyntaxError on Python 3).
print('mean: {} std: {}'.format(dataModel.lnIncome.mean(), dataModel.lnIncome.std()))
In [ ]:
# Mean and standard deviation of P21.
# Written as a single pre-formatted string so the line is valid, and prints the
# same text, on both Python 2 and Python 3 (the bare `print x, y` statement is
# a SyntaxError on Python 3).
print('mean: {} std: {}'.format(dataModel.P21.mean(), dataModel.P21.std()))
In [ ]:
# Box plot of P21 without notches; outliers are drawn as green diamonds.
plt.boxplot(list(dataModel.P21), notch=False, sym='gD')
In [13]:
# Joint plots of lnIncome against education and against age: a second-order
# (order=2) regression fit in the joint panel, distplot on the margins.

# lnIncome vs. education.
g = sns.JointGrid(data=dataModel, x="education", y="lnIncome")
g.plot_joint(sns.regplot, order=2)
g.plot_marginals(sns.distplot)

# lnIncome vs. age.
g2 = sns.JointGrid(data=dataModel, x="age", y="lnIncome")
g2.plot_joint(sns.regplot, order=2)
g2.plot_marginals(sns.distplot)
Out[13]:
Tomo el de mejor performance para evaluar en el test set. Básicamente son dos posibilidades: INDEC o ALTERNATIVO (en el que habíamos propuesto no cortar las edades y los años de escolaridad, sino usar las variables directamente junto con su cuadrado). Cada uno lo pruebo con ingresos laborales (con y sin constante) y con el log del ingreso laboral.
In [14]:
# Model 1: P21 as the income variable, with no explicit variable list
# (runModel's own default is used).
# runModel lives in the functionsForModels module and is never imported by
# name, so it must be called through the module like the other model cells.
dataModel1 = functionsForModels.runModel(dataModel, income='P21')
In [52]:
# Model 2: lnIncome on education-level dummies and sex-by-age-group dummies.
dataModel2 = functionsForModels.runModel(
    dataModel,
    income='lnIncome',
    variables=[
        'primary', 'secondary', 'university',
        'male_14to24', 'male_25to34',
        'female_14to24', 'female_25to34', 'female_35more',
    ],
)
In [51]:
# Model 3: P47T as the income variable, with no explicit variable list.
dataModel3 = functionsForModels.runModel(dataModel, income='P47T')
In [50]:
# Model 4: lnIncomeT as the income variable, with no explicit variable list.
dataModel4 = functionsForModels.runModel(dataModel, income='lnIncomeT')
In [49]:
# Model 5: lnIncomeT on education, age, their *2 counterparts (presumably the
# squared terms — confirm) and the female indicator.
dataModel5 = functionsForModels.runModel(
    dataModel,
    income='lnIncomeT',
    variables=['education', 'education2', 'age', 'age2', 'female'],
)
In [48]:
# Model 6: lnIncome on education, age, their *2 counterparts (presumably the
# squared terms — confirm) and the female indicator.
dataModel6 = functionsForModels.runModel(
    dataModel,
    income='lnIncome',
    variables=['education', 'education2', 'age', 'age2', 'female'],
)
In [ ]: