In [1]:
# Select the interactive "nbagg" matplotlib backend for the figures in this notebook.
%matplotlib nbagg
# Later cells call bar(), figure(), title(), vlines(), np.percentile(), ... without
# importing them; presumably they rely on the names %pylab loads into the namespace.
%pylab
In [2]:
import pandas as pd
from urllib import urlopen
# Download the indicators CSV and keep a local copy in the working directory.
# The download happens first, so a failed request no longer leaves behind a
# truncated 'indicadores.csv'.
response = urlopen('http://blog.jazzido.com/indicadoresEducativosAR/indicadores.csv')
try:
    payload = response.read()
finally:
    # Release the HTTP connection whether or not the read succeeded.
    response.close()

# Binary mode: the payload is written exactly as received (text mode would
# translate line endings on Windows).
with open('indicadores.csv', 'wb') as f:
    f.write(payload)

# Each row is addressed by the value of its 'id' column from here on.
df = pd.read_csv('indicadores.csv').set_index('id')
In [3]:
def get_series(df, nivel, variable, first_year=2003, last_year=2009):
    """
    Build one yearly time series per place for a given level and variable.

    Every row of ``df`` is treated as one place, identified by its row label.
    For each year in the inclusive range ``first_year``..``last_year`` the
    value is read from the column named ``'<variable><year>_<nivel>'``
    (for example ``'promocion2003_egb_1'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one row per place and one column per
        variable/year/level combination.
    nivel : str
        Level suffix used in the column names (e.g. ``'egb_1'``).
    variable : str
        Variable prefix used in the column names (e.g. ``'promocion'``).
    first_year, last_year : int, optional
        Inclusive bounds of the year range.  The defaults reproduce the
        2003-2009 window that used to be hard-coded.

    Returns
    -------
    dict
        Maps each row label of ``df`` to a ``pandas.Series`` whose index is
        the list of years and whose values are the matching cells of the row.

    Raises
    ------
    KeyError
        When a row is read and one of the expected columns is missing.
    """
    years = list(range(first_year, last_year + 1))
    # Column names depend only on the arguments, so they are built once rather
    # than once per row; explicit keywords replace the former format(**locals()).
    columns = [
        '{variable}{year}_{nivel}'.format(variable=variable, year=year, nivel=nivel)
        for year in years
    ]

    res = {}
    for place, row in df.iterrows():
        res[place] = pd.Series([row[column] for column in columns], index=years)
    return res
In [4]:
from collections import defaultdict
def get_stats(relativas):
    """
    Compute, per place and per level, how much 'promocion' changed from 2003 to 2009.

    Reads the module-level ``df`` and delegates to ``get_series`` to obtain the
    yearly 'promocion' series of every place for each of the levels
    ``egb_1`` .. ``egb_9``.

    Parameters
    ----------
    relativas : bool
        If true, the change is divided by the 2003 value, giving a fraction
        (not a percentage).  Otherwise the plain difference is returned.

    Returns
    -------
    collections.defaultdict
        ``diffs[place][nivel]`` holds the change for that place and level.
    """
    niveles = ['egb_{}'.format(grade) for grade in range(1, 10)]
    first_year, last_year = 2003, 2009

    diffs = defaultdict(dict)
    for nivel in niveles:
        # .items() gives the same pairs as .iteritems() without tying the
        # function to Python 2.
        for place, series in get_series(df, nivel, 'promocion').items():
            base = series.loc[first_year]
            diff = series.loc[last_year] - base
            if relativas:
                # float() forces true division, so integer-valued columns are
                # not floor-divided under Python 2.
                diff = diff / float(base)
            diffs[place][nivel] = diff
    return diffs
In [6]:
# Plot the results for every place, ordered from "best" to "worst".
def plot_diff(place, place_data, relativas):
    """
    Draw one bar per level showing the 2003-2009 change in 'promocion'.

    The drawing calls (``bar``, ``title``, ``ylabel``, ``xticks``, ``grid``)
    are used as bare names; they are expected to be the pyplot functions made
    available by ``%pylab``, which draw on the current figure.

    Parameters
    ----------
    place : object
        Not used inside the function; kept so existing callers keep working.
    place_data : dict
        Maps level names of the form ``'<prefix>_<number>'`` to the change
        value of that level.
    relativas : bool
        If true, values are treated as fractions and shown as percentages
        (multiplied by 100); otherwise they are plotted unchanged.
    """
    # Order the levels by their numeric suffix.
    ordered = sorted(place_data.items(), key=lambda item: int(item[0].split('_')[1]))

    scale = 100 if relativas else 1
    for position, (nivel, diff) in enumerate(ordered):
        bar(position, diff * scale, alpha=0.5)

    ylabel_text = 'Porcentaje de cambio' if relativas else 'Cambio absoluto'
    title('{} entre 2003 y 2009 en promocion'.format(ylabel_text))
    ylabel(ylabel_text)

    # Built with a comprehension instead of zip(*...)[0], which raised on empty
    # input and does not work where zip() returns an iterator.
    labels = [nivel for nivel, _diff in ordered]
    xticks(range(len(ordered)), labels, rotation=45)
    grid()
# Work with relative changes and draw one figure per place.
relativas = True
rel_diffs = get_stats(relativas=relativas)

# Negating each place's summed change makes the ascending sort put the
# largest totals first.
ranking = sorted(rel_diffs.items(), key=lambda entry: -sum(entry[1].values()))
for place, place_data in ranking:
    figure()
    suptitle(place)
    plot_diff(place, place_data, relativas=relativas)
In [16]:
import seaborn as sns
# Pool the per-level changes of every place into one flat list.
all_values = [
    value
    for place_data in rel_diffs.values()
    for value in place_data.values()
]

figure()
sns.distplot(all_values, hist=True, bins=20)

# 5th, 50th and 95th percentiles of the pooled values, drawn as dashed
# vertical lines from y=0 to y=12.
percentiles = np.percentile(all_values, [5, 50, 95])
vlines(percentiles, 0, 12, linestyles='--', alpha=0.5)
Out[16]:
In [31]:
# Report the places whose worst level falls at or below the 5th percentile and
# those whose best level reaches the 95th percentile of all pooled changes.
# Single-argument print(...) calls behave the same with the Python 2 print
# statement and the Python 3 print function.
print("Provincias destacadas")
print("")

# Only the outer percentiles are thresholds.  They are indexed directly rather
# than unpacked into `_`, which would shadow IPython's last-output variable.
lbound, ubound = percentiles[0], percentiles[2]

to_print = []
for place, place_data in rel_diffs.items():
    worst_level, worst_score = min(place_data.items(), key=lambda x: x[1])
    best_level, best_score = max(place_data.items(), key=lambda x: x[1])
    if worst_score <= lbound:
        to_print.append({
            'text': '{:<30s}Tiene problemas en {}'.format(place, worst_level),
            'order': 0,
        })
    if best_score >= ubound:
        to_print.append({
            'text': '{:<30s}Anda bien en {}'.format(place, best_level),
            'order': 1,
        })

# list.sort is stable: problem entries (order 0) come first, and entries keep
# their discovery order within each group.
to_print.sort(key=lambda x: x['order'])
for doc in to_print:
    print(doc['text'])