In [2]:
import pandas as pd
import os
import numpy as np
import simpledbf
import matplotlib.pyplot as plt
import seaborn
%matplotlib inline
In [3]:
# Open the DBF file with simpledbf and convert its records to a pandas DataFrame.
dbf = simpledbf.Dbf5('data/BaseEAH2010/EAH10_BU_IND_VERSION2.dbf')
data10 = dbf.to_dataframe()
In [4]:
# Count the rows whose ITFB equals 9999999, the value that is filtered out afterwards.
data10['ITFB'].eq(9999999).sum()
Out[4]:
In [5]:
# Drop rows whose ITFB equals 9999999 (presumably a "no answer" code -- confirm
# against the survey codebook) and keep only the four columns used below.
# .copy() makes the result an independent frame, so the column assignments
# done in later cells do not trigger SettingWithCopyWarning.
data10 = data10.loc[data10.ITFB != 9999999, ['ID', 'COMUNA', 'FEXP', 'ITFB']].copy()
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(data10.shape)
data10.head()
Out[5]:
In [6]:
# Remove rows that are identical in every column. Rebinding the name instead
# of using inplace=True keeps the cell free of in-place mutation of a frame
# that was itself obtained by slicing.
data10 = data10.drop_duplicates()
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(data10.shape)
In [7]:
# Replace zero ITFB values with 1 so that the logarithm computed afterwards is
# defined (log(1) == 0). The count is printed before and after as a check.
print((data10.ITFB == 0).sum())
# Assign the result back to the column: an in-place replace on the attribute
# accessor is a chained mutation that may not reach the frame, and the `axis`
# keyword is not accepted by recent pandas versions.
data10['ITFB'] = data10['ITFB'].replace(0, 1)
print((data10.ITFB == 0).sum())
In [8]:
# Store the natural logarithm of ITFB as a new column.
data10['lnIncome'] = np.log(data10['ITFB'])
In [9]:
# Show the types of the first index label and of the first FEXP value.
# .iloc[0] is positional; plain FEXP[0] looks up the *label* 0, which raises
# KeyError when row 0 was filtered out or removed as a duplicate.
# The %-formatting prints the same text on Python 2 and Python 3.
print('%s %s' % (type(data10.index.values[0]), type(data10.FEXP.iloc[0])))
In [10]:
# Cast FEXP to int; it is used as a per-row repeat count when the sample is expanded.
data10['FEXP'] = data10['FEXP'].astype(int)
In [11]:
# Build the expanded sample: every index label is repeated FEXP times and the
# rows are selected by those labels.
# NOTE(review): assumes the index labels of data10 are unique -- confirm.
data10exp = data10.loc[data10.index.repeat(data10.FEXP.values)]
In [12]:
# Mean lnIncome per COMUNA in the expanded sample, sorted ascending.
data10exp.groupby('COMUNA')['lnIncome'].mean().sort_values()
Out[12]:
In [13]:
# Mean ITFB per COMUNA on the unexpanded rows.
data10.groupby('COMUNA')['ITFB'].mean()
Out[13]:
In [14]:
# Mean ITFB per COMUNA on the expanded (FEXP-repeated) rows.
data10exp.groupby('COMUNA')['ITFB'].mean()
Out[14]:
In [16]:
def readRedatamCSV(asciiFile):
    """Parse a Redatam text export into an area/measure table.

    The file is read line by line and every line is split on whitespace:

    * a line whose first token is ``AREA`` contributes the text before the
      first comma of its third token as an area identifier;
    * a line whose first token is ``Total`` contributes the third
      comma-separated field of its third token as a measure;
    * reading stops at the first line whose first token contains ``RESUMEN``;
    * blank lines and any other lines are ignored.

    Parameters
    ----------
    asciiFile : str
        Path of the file to parse.

    Returns
    -------
    pandas.DataFrame or None
        Frame with the string columns ``area`` and ``measure``. When the
        number of areas and measures found differ, the file name is printed
        and ``None`` is returned.
    """
    areas = []
    measures = []
    # The context manager closes the handle even if parsing raises.
    with open(asciiFile, 'r') as f:
        for line in f:
            columns = line.strip().split()
            if not columns:
                continue
            if 'RESUMEN' in columns[0]:
                break
            elif columns[0] == 'AREA':
                areas.append(columns[2].split(',')[0])
            elif columns[0] == 'Total':
                measures.append(columns[2].split(',')[2])
    try:
        return pd.DataFrame({'area': areas, 'measure': measures})
    except ValueError:
        # Raised by pandas when the two lists have different lengths; report
        # which file was malformed instead of hiding every possible error.
        print(asciiFile)
        return None
def R2(dataset, real, predicted):
    """Scatter-plot predictions against observations and return R squared.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table holding both columns.
    real : str
        Name of the column with the observed values.
    predicted : str
        Name of the column with the predicted values.

    Returns
    -------
    float
        Coefficient of determination, ``1 - SS_res / SS_tot``, where
        ``SS_res`` is the sum of squared prediction errors and ``SS_tot`` the
        sum of squared deviations of the observed values from their mean.
        The result is not finite when the observed values are all equal.

    Side effects
    ------------
    Draws a scatter plot (predicted on x, observed on y) on the current
    matplotlib axes.
    """
    observed = dataset[real]
    estimated = dataset[predicted]
    plt.scatter(estimated, observed)
    ss_res = ((observed - estimated) ** 2).sum()
    ss_tot = ((observed - observed.mean()) ** 2).sum()
    # The bare ratio ss_res / ss_tot is the fraction of variance left
    # unexplained; R squared is its complement. float() keeps the division
    # a true division for integer columns under Python 2.
    return 1 - ss_res / float(ss_tot)
In [17]:
# Parse the Redatam export stored under headEduc/ into an area/measure table.
archivo = 'data/indecOnline/headEduc/comunas.csv'
ingresoXComuna = readRedatamCSV(archivo)
In [18]:
# Rename the columns by position; assumes the frame's two columns are ordered
# (area, measure), so that measure becomes educHead.
ingresoXComuna.columns = ['area','educHead']
In [19]:
# Hard-coded income values (15 entries) that are attached to ingresoXComuna.
# NOTE(review): these look hand-copied from an earlier cell's output
# (presumably a per-COMUNA mean of ITFB) -- confirm the source and the order.
incomeEncuesta = [3806.762807,
6933.011914,
4301.447851,
3869.557112,
5070.870747,
5603.396225,
4575.618845,
2800.941981,
4667.634219,
4103.096635,
5515.926401,
4720.418917,
6103.743836,
6204.832037,
4843.879949]
In [20]:
# Assigning a plain list matches values to rows by position, so incomeEncuesta
# must be in the same row order as ingresoXComuna -- verify.
ingresoXComuna['ingresoEncuesta'] = incomeEncuesta
In [21]:
# Display the table with the educHead estimates and the survey income.
ingresoXComuna
Out[21]:
In [23]:
# Parse the Redatam export stored under headEducYjobs/.
# NOTE(review): this file is named comuna.csv while the other exports are
# named comunas.csv -- confirm it matches the file on disk.
ruta = 'data/indecOnline/headEducYjobs/'
archivo = os.path.join(ruta, 'comuna.csv')
ingresoModelo2 = readRedatamCSV(archivo)
In [24]:
# Join the second model's table onto the running table using the area column
# as key (merge defaults to an inner join).
ingresoXComuna = ingresoXComuna.merge(right=ingresoModelo2,on='area')
In [25]:
# Rename all columns by position; the column brought in by the preceding merge
# (last position) becomes educHeadYjobs.
ingresoXComuna.columns = ['area','educHead','ingresoEncuesta','educHeadYjobs']
In [26]:
# Display the table after adding the educHeadYjobs column.
ingresoXComuna
Out[26]:
In [27]:
# The parser yields the measures as strings; convert educHead to float.
ingresoXComuna['educHead'] = ingresoXComuna['educHead'].astype(float)
In [28]:
# The parser yields the measures as strings; convert educHeadYjobs to float.
ingresoXComuna['educHeadYjobs'] = ingresoXComuna['educHeadYjobs'].astype(float)
In [29]:
# Parse the Redatam export for the third model.
# NOTE(review): the directory is spelled 'headEducuJobsYrooms' -- confirm it
# matches the folder name on disk.
ruta = 'data/indecOnline/headEducuJobsYrooms/'
archivo = os.path.join(ruta, 'comunas.csv')
ingresoModelo3 = readRedatamCSV(archivo)
In [30]:
# Join the third model's estimates by area, naming the incoming measure
# column educHeadYjobsYrooms, then convert it from string to float.
ingresoXComuna = ingresoXComuna.merge(
    right=ingresoModelo3.rename(columns={'measure': 'educHeadYjobsYrooms'}),
    on='area',
)
ingresoXComuna['educHeadYjobsYrooms'] = ingresoXComuna['educHeadYjobsYrooms'].astype(float)
In [32]:
# Parse the Redatam export stored under jobSchool/.
ruta = 'data/indecOnline/jobSchool/'
archivo = os.path.join(ruta, 'comunas.csv')
ingresoModelo4 = readRedatamCSV(archivo)
In [33]:
# Join the fourth model's estimates by area, naming the incoming measure
# column jobsAndSchool, then convert it from string to float.
ingresoXComuna = ingresoXComuna.merge(
    right=ingresoModelo4.rename(columns={'measure': 'jobsAndSchool'}),
    on='area',
)
ingresoXComuna['jobsAndSchool'] = ingresoXComuna['jobsAndSchool'].astype(float)
In [34]:
# Display the table after adding the jobsAndSchool column.
ingresoXComuna
Out[34]:
In [35]:
# Parse the Redatam export stored under jobSchoolYrooms/.
ruta = 'data/indecOnline/jobSchoolYrooms/'
archivo = os.path.join(ruta, 'comunas.csv')
ingresoModelo5 = readRedatamCSV(archivo)
In [36]:
# Join the fifth model's estimates by area, naming the incoming measure
# column jobsAndSchoolYrooms, then convert it from string to float.
ingresoXComuna = ingresoXComuna.merge(
    right=ingresoModelo5.rename(columns={'measure': 'jobsAndSchoolYrooms'}),
    on='area',
)
ingresoXComuna['jobsAndSchoolYrooms'] = ingresoXComuna['jobsAndSchoolYrooms'].astype(float)
In [37]:
# Display the table with all five model columns next to the survey income.
ingresoXComuna
Out[37]:
In [38]:
# Compare the jobsAndSchoolYrooms estimates with the survey income.
R2(ingresoXComuna, 'ingresoEncuesta', 'jobsAndSchoolYrooms')
Out[38]:
In [40]:
# Compare the jobsAndSchool estimates with the survey income.
R2(ingresoXComuna, 'ingresoEncuesta', 'jobsAndSchool')
Out[40]:
In [41]:
# Compare the educHeadYjobsYrooms estimates with the survey income.
R2(ingresoXComuna, 'ingresoEncuesta', 'educHeadYjobsYrooms')
Out[41]:
In [42]:
# Compare the educHeadYjobs estimates with the survey income.
R2(ingresoXComuna, 'ingresoEncuesta', 'educHeadYjobs')
Out[42]:
In [43]:
# Compare the educHead estimates with the survey income.
R2(ingresoXComuna, 'ingresoEncuesta', 'educHead')
Out[43]:
In [44]:
# Pairwise correlations between the survey income and the model estimates.
# The string-valued area column is excluded explicitly: older pandas dropped
# non-numeric columns silently, while pandas >= 2.0 raises on them.
ingresoXComuna.select_dtypes(include=[np.number]).corr()
Out[44]:
In [ ]: