Sources:
In [24]:
# helper functions
import getEPH
import categorize
import schoolYears
import make_dummy
import functionsForModels
# libraries
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from statsmodels.iolib.table import (SimpleTable, default_txt_fmt)
# Seed NumPy's global random generator so any NumPy-based randomness is repeatable.
np.random.seed(1024)
# IPython magic: draw matplotlib figures inside the notebook output cells.
%matplotlib inline
In [25]:
# Get the data through the project's getEPH.getEPHdbf helper.
# NOTE(review): 't310' presumably identifies the survey wave, and the helper
# presumably produces data/cleanDatat310.csv (read in the next cell) — confirm.
getEPH.getEPHdbf('t310')
In [26]:
# Load the cleaned 't310' CSV into a pandas DataFrame (path is relative to the notebook).
data1 = pd.read_csv('data/cleanDatat310.csv')
In [27]:
# Pass the raw frame through the three project helpers in sequence; each step
# consumes the previous step's result, and the final result is kept as `data`.
# NOTE(review): judging by the names, these add categorical recodes, years of
# schooling and dummy variables respectively — confirm against the helper modules.
data2 = categorize.categorize(data1)
data3 = schoolYears.schoolYears(data2)
data = make_dummy.make_dummy(data3)
In [28]:
# Build the frame used by every plot and model below.
# NOTE(review): the exact filtering/derived columns are defined in
# functionsForModels.prepareDataForModel — not visible from this notebook.
dataModel = functionsForModels.prepareDataForModel(data)
In [29]:
# Preview the first rows of dataModel as a quick sanity check.
dataModel.head()
Out[29]:
In [30]:
# 2x2 grid of scatter plots of income against education and age.
# Top row (red markers): P47T, labelled 'Ingreso total'.
# Bottom row (blue markers): P21, labelled 'Ingreso Laboral'.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

ax1.plot(dataModel.education, dataModel.P47T, 'ro')
ax1.set_ylabel('Ingreso total')
ax1.set_xlabel('Educacion')

# The y axes are not shared between panels, so the right-hand column gets its
# own y label too; otherwise those panels cannot be read on their own.
ax2.plot(dataModel.age, dataModel.P47T, 'ro')
ax2.set_ylabel('Ingreso total')
ax2.set_xlabel('Edad')

ax3.plot(dataModel.education, dataModel.P21, 'bo')
ax3.set_ylabel('Ingreso Laboral')
ax3.set_xlabel('Educacion')

ax4.plot(dataModel.age, dataModel.P21, 'bo')
ax4.set_ylabel('Ingreso Laboral')
ax4.set_xlabel('Edad')
Out[30]:
In [31]:
# 2x2 grid of kernel density estimates.
# Left column: P47T and P21; right column: lnIncomeT and lnIncome
# (presumably the log transforms of the left-hand columns — confirm).
# The top row is drawn in red; the bottom row uses seaborn's default colour.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax1, ax2), (ax3, ax4) = axes

sns.kdeplot(dataModel.P47T, color='red', ax=ax1)
sns.kdeplot(dataModel.lnIncomeT, color='red', ax=ax2)
sns.kdeplot(dataModel.P21, ax=ax3)
sns.kdeplot(dataModel.lnIncome, ax=ax4)
Out[31]:
In [32]:
# Mean and standard deviation of lnIncome.
# A single formatted string is printed through the function-call form so the
# cell runs on both Python 2 and Python 3 kernels and emits the same
# "mean: <value> std: <value>" line (the bare print statement is a
# SyntaxError on Python 3).
print('mean: {} std: {}'.format(dataModel.lnIncome.mean(), dataModel.lnIncome.std()))
In [33]:
# Mean and standard deviation of P21 (the column plotted as 'Ingreso Laboral').
# A single formatted string is printed through the function-call form so the
# cell runs on both Python 2 and Python 3 kernels and emits the same
# "mean: <value> std: <value>" line (the bare print statement is a
# SyntaxError on Python 3).
print('mean: {} std: {}'.format(dataModel.P21.mean(), dataModel.P21.std()))
In [34]:
# Box plot of P21 without notches; outliers are drawn as green diamonds ('gD').
plt.boxplot(list(dataModel.P21), notch=False, sym='gD')
Out[34]:
In [35]:
# Joint plots of lnIncome against education and against age.
# Each grid shows a regression fit of polynomial order 2 in the centre and
# the marginal distributions on the sides.
g = sns.JointGrid(x="education", y="lnIncome", data=dataModel)
g.plot_joint(sns.regplot, order=2).plot_marginals(sns.distplot)

g2 = sns.JointGrid(x="age", y="lnIncome", data=dataModel)
g2.plot_joint(sns.regplot, order=2).plot_marginals(sns.distplot)
Out[35]:
ECLAC (the Economic Commission for Latin America and the Caribbean) estimates income with a regression model based on the following variables (education, gender and age):
In [36]:
# Model 1: run the project's runModel helper with P21 as the income column,
# using the helper's default explanatory variables.
# NOTE(review): the default variable set and the return value are defined in
# functionsForModels.runModel — confirm there.
dataModel1 = functionsForModels.runModel(dataModel, income = 'P21')
In [37]:
# Model 2: lnIncome as the income column with an explicit list of
# education-level and gender-by-age-group columns.
# NOTE(review): 'male_35more' is not in the list — presumably it is the
# omitted baseline category; confirm.
dataModel2 = functionsForModels.runModel(dataModel, income = 'lnIncome', variables= [
'primary','secondary','university',
'male_14to24','male_25to34',
'female_14to24', 'female_25to34', 'female_35more'])
In [38]:
# Model 3: P47T as the income column, using runModel's default explanatory variables.
dataModel3 = functionsForModels.runModel(dataModel, income = 'P47T')
In [39]:
# Model 4: lnIncomeT as the income column, using runModel's default explanatory variables.
dataModel4 = functionsForModels.runModel(dataModel, income = 'lnIncomeT')
In [40]:
# Model 5: lnIncomeT as the income column with education, age, their
# '...2' companions and a 'female' column as explanatory variables.
# NOTE(review): 'education2' and 'age2' are presumably squared terms — confirm.
dataModel5 = functionsForModels.runModel(dataModel, income = 'lnIncomeT', variables=['education','education2',
'age','age2','female'])
In [41]:
# Model 6: same explanatory variables as model 5, but with lnIncome as the income column.
# NOTE(review): 'education2' and 'age2' are presumably squared terms — confirm.
dataModel6 = functionsForModels.runModel(dataModel, income = 'lnIncome', variables=['education','education2',
'age','age2','female'])
In [ ]: