In [1]:
import pandas as pd
import numpy as np
import savReaderWriter as spss
In [3]:
raw_data = spss.SavReader('/home/aastroza/Dropbox/lectura/data/BaseMaestraECL2014.sav', returnHeader = True) # This is fast
raw_data_list = list(raw_data) # this is slow
data = pd.DataFrame(raw_data_list) # this is slow
data = data.rename(columns=data.loc[0]).iloc[1:] # setting columnheaders, this is slow too.
In [4]:
data.columns
Out[4]:
In [5]:
ecl = data[(data['Region'] == 13) &(data['Comuna_CODIGO'] < 13402)]
ecl['Comuna_CODIGO']
Out[5]:
In [6]:
name_map = dict(zip([13101, 13102, 13103
,13104,13105,13106,13107,13108,13109,13110,13111,13112,13113,13114,13115,13116,13117,13118,13119,13120,13121,13122,13123
,13124,13125,13126,13127,13128,13129,13130,13131,13132,13201,13202,13203,13301,13302,13303,13401],
map(lambda x: x.upper(), [u'Santiago',
u'Cerrillos',
u'Cerro Navia',
u'Conchali',
u'El Bosque',
u'Estacion Central',
u'Huechuraba',
u'Independencia',
u'La Cisterna',
u'La Florida',
u'La Granja',
u'La Pintana',
u'La Reina',
u'Las Condes',
u'Lo Barnechea',
u'Lo Espejo',
u'Lo Prado',
u'Macul',
u'Maipu',
u'nunoa',
u'Pedro Aguirre Cerda',
u'Penalolen',
u'Providencia',
u'Pudahuel',
u'Quilicura',
u'Quinta Normal',
u'Recoleta',
u'Renca',
u'San Joaquin',
u'San Miguel',
u'San Ramon',
u'Vitacura',
u'Puente Alto',
u'Pirque',
u'San Jose de Maipo',
u'Colina',
u'Lampa',
u'Tiltil',
u'San Bernardo'])))
name_map
Out[6]:
In [15]:
variables = ('comuna', 'FEXP', 'sexo', 'edad', 'A9B', 'a.J6', 'J1D', 'O2', 'a.A6', 'b.M26B')
In [9]:
from collections import defaultdict
from itertools import repeat, chain
values = defaultdict(list)
for id_comuna, df_comuna in ecl.groupby('Comuna_CODIGO'):
values['comuna'].append(name_map[id_comuna])
for k, v in df_comuna.ix[:,variables].iteritems():
if k == 'comuna':
continue
if k == 'edad':
v = df_comuna['FEXP'].sum()
k = 'poblacion'
if k == 'FEXP':
continue
if k == 'sexo':
continue
if k == 'A9B':#¿Cuando usted lee lo hace por gusto/ocio?
voluntad = [0, 0, 0, 0]
for i, df_voluntad in df_comuna.groupby('A9B'):
if i == 1:
voluntad[0] = df_voluntad['FEXP'].sum()
if i == 2:
voluntad[1] = df_voluntad['FEXP'].sum()
if i == 88:
voluntad[2] = df_voluntad['FEXP'].sum()
if i == 99:
voluntad[3] = df_voluntad['FEXP'].sum()
v = voluntad[0]/(voluntad[0] + voluntad[1] + voluntad[2] + voluntad[3])
k = 'voluntad'
if k == 'a.J6':#¿Es socio de alguna biblioteca?
socio = [0, 0, 0, 0]
for i, df_socio in df_comuna.groupby('a.J6'):
if i == 1:
socio[0] = df_socio['FEXP'].sum()
if i == 2:
socio[1] = df_socio['FEXP'].sum()
if i == 88:
socio[2] = df_socio['FEXP'].sum()
if i == 99:
socio[3] = df_socio['FEXP'].sum()
v = socio[0]/(socio[0] + socio[1] + socio[2] + socio[3])
k = 'socio'
if k == 'J1D':#¿Lee en el transporte publico?
transporte = [0, 0, 0, 0]
for i, df_transporte in df_comuna.groupby('J1D'):
if i == 1:
transporte[0] = df_transporte['FEXP'].sum()
if i == 2:
transporte[1] = df_transporte['FEXP'].sum()
if i == 88:
transporte[2] = df_transporte['FEXP'].sum()
if i == 99:
transporte[3] = df_transporte['FEXP'].sum()
v = transporte[0]/(transporte[0] + transporte[1] + transporte[2] + transporte[3])
k = 'transporte'
if k == 'a.A6':#¿Cuantos libros hay en su hogar?
libros = [0, 0, 0, 0, 0, 0, 0, 0]
for i, df_libros in df_comuna.groupby('a.A6'):
if i == 1:
libros[0] = df_libros['FEXP'].sum()*3
if i == 2:
libros[1] = df_libros['FEXP'].sum()*8
if i == 3:
libros[2] = df_libros['FEXP'].sum()*18
if i == 4:
libros[3] = df_libros['FEXP'].sum()*38
if i == 5:
libros[0] = df_libros['FEXP'].sum()*75
if i == 6:
libros[1] = df_libros['FEXP'].sum()*150
if i == 7:
libros[2] = df_libros['FEXP'].sum()*350
if i == 8:
libros[3] = df_libros['FEXP'].sum()*500
v = sum(libros)
k = 'nlibrosA'
if k == 'b.M26B':#¿Cuantos libros hay en su hogar?
libros = [0, 0, 0, 0, 0, 0, 0, 0]
for i, df_libros in df_comuna.groupby('b.M26B'):
if i == 1:
libros[0] = df_libros['FEXP'].sum()*3
if i == 2:
libros[1] = df_libros['FEXP'].sum()*8
if i == 3:
libros[2] = df_libros['FEXP'].sum()*18
if i == 4:
libros[3] = df_libros['FEXP'].sum()*38
if i == 5:
libros[0] = df_libros['FEXP'].sum()*75
if i == 6:
libros[1] = df_libros['FEXP'].sum()*150
if i == 7:
libros[2] = df_libros['FEXP'].sum()*350
if i == 8:
libros[3] = df_libros['FEXP'].sum()*500
v = sum(libros)
k = 'nlibrosB'
if k == 'O2':#¿Ingreso total liquido del hogar?
daut_resp = df_comuna[df_comuna['O2'].isin([1,2,3,4,5,6,7,8,9,10])]['O2'].values
daut_freq = df_comuna[df_comuna['O2'].isin([1,2,3,4,5,6,7,8,9,10])]['FEXP'].values
daut_zip = zip(daut_resp, daut_freq.astype(int))
daut_array = np.array(list(chain(*[repeat(p, w) for p, w in daut_zip])), dtype=np.float64)
v = np.median(daut_array)
k = 'ingreso'
values[k].append(v)
In [20]:
df = pd.DataFrame(data=values)
In [21]:
df['librospp'] = (df['nlibrosA'] + df['nlibrosB'])/ df['poblacion']
In [22]:
df
Out[22]:
In [27]:
import matplotlib.pyplot as plt
%matplotlib inline
df2 = df.sort_index(by=['librospp', 'comuna'], ascending=[True, True])
ax = df2['librospp'].plot(kind='bar', title ="Libros por Persona",figsize=(18,5), fontsize=12)
#ax.set_xlabel("Hour",fontsize=12)
#ax.set_ylabel("V",fontsize=12)
ax.set_xticklabels(list(df2['comuna']))
plt.show()
In [14]:
ax = df.plot(kind='scatter', x='librospp', y='ingreso',
title ="Decil de Ingreso vs Libros por Persona",figsize=(12,8), fontsize=12)
for i in [0,1,2,4,10,12,13,14,19,22,25,28,30,31,32,33,34]:
#for i in range(0,35):
ax.text(df['librospp'][i]+1, df['ingreso'][i]+0.1, df['comuna'][i])
plt.show()
In [32]:
variables = ('FEXP', 'O2', 'a.A6', 'b.M26B', 'A8')
In [36]:
values = defaultdict(list)
for id_ingreso, df_ingreso in data.groupby('O2'):
values['ingreso'].append(id_ingreso)
for k, v in df_ingreso.ix[:,variables].iteritems():
if k == 'O2':
v = df_ingreso['FEXP'].sum()
k = 'poblacion'
if k == 'FEXP':
continue
if k == 'a.A6':#¿Cuantos libros hay en su hogar?
libros = [0, 0, 0, 0, 0, 0, 0, 0]
for i, df_libros in df_ingreso.groupby('a.A6'):
if i == 1:
libros[0] = df_libros['FEXP'].sum()*3
if i == 2:
libros[1] = df_libros['FEXP'].sum()*8
if i == 3:
libros[2] = df_libros['FEXP'].sum()*18
if i == 4:
libros[3] = df_libros['FEXP'].sum()*38
if i == 5:
libros[0] = df_libros['FEXP'].sum()*75
if i == 6:
libros[1] = df_libros['FEXP'].sum()*150
if i == 7:
libros[2] = df_libros['FEXP'].sum()*350
if i == 8:
libros[3] = df_libros['FEXP'].sum()*500
v = sum(libros)
k = 'nlibrosA'
if k == 'b.M26B':#¿Cuantos libros hay en su hogar?
libros = [0, 0, 0, 0, 0, 0, 0, 0]
for i, df_libros in df_ingreso.groupby('b.M26B'):
if i == 1:
libros[0] = df_libros['FEXP'].sum()*3
if i == 2:
libros[1] = df_libros['FEXP'].sum()*8
if i == 3:
libros[2] = df_libros['FEXP'].sum()*18
if i == 4:
libros[3] = df_libros['FEXP'].sum()*38
if i == 5:
libros[0] = df_libros['FEXP'].sum()*75
if i == 6:
libros[1] = df_libros['FEXP'].sum()*150
if i == 7:
libros[2] = df_libros['FEXP'].sum()*350
if i == 8:
libros[3] = df_libros['FEXP'].sum()*500
v = sum(libros)
k = 'nlibrosB'
if k == 'A8':#¿Que tipo de lector se considera usted?
daut_resp = df_ingreso[df_ingreso['A8'].isin([1,2,3,4,5])]['A8'].values
daut_freq = df_ingreso[df_ingreso['A8'].isin([1,2,3,4,5])]['FEXP'].values
daut_zip = zip(daut_resp, daut_freq.astype(int))
daut_array = np.array(list(chain(*[repeat(p, w) for p, w in daut_zip])), dtype=np.float64)
v = np.mean(daut_array)
k = 'lector'
values[k].append(v)
In [37]:
dfn = pd.DataFrame(data=values)
dfn['librospp'] = (dfn['nlibrosA'] + dfn['nlibrosB'])/ dfn['poblacion']
In [38]:
dfn
Out[38]:
In [ ]: