In [1]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [ ]:
# TODO: if time permits, clean up these LOCS (presumably the location fields — confirm).
In [2]:
# Load the dataset and keep only rows whose pub_year is strictly greater than
# 1499 and strictly less than 1600. Rows with a missing pub_year fail both
# comparisons and are dropped as well.
records = (
    pd.read_csv('../data/fucking_final_dataset.csv')
    .loc[lambda frame: (frame.pub_year > 1499) & (frame.pub_year < 1600)]
)
In [5]:
# Number of rows that survived the pub_year filter.
records.shape[0]
Out[5]:
In [6]:
# Peek at the first row to see which columns are available.
records.iloc[:1]
Out[6]:
In [7]:
# Default size (width, height in inches) for every figure created below.
plt.rc('figure', figsize=(12.0, 6.0))
In [8]:
# Bar chart of the ten canonical_country values with the most records
# (counted as non-null control_number entries per country).
#
# `.ix` was removed in pandas 1.0. On a non-integer index `.ix[:10]` sliced by
# position, so `.head(10)` keeps the same "first ten rows" behaviour.
# NOTE(review): assumes canonical_country holds strings, not integers — confirm.
(
    records.groupby('canonical_country')['control_number']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .plot(kind="bar")
)
Out[8]:
In [9]:
# Bar chart of the 25 slugs with the most records (non-null control_number
# entries per slug).
#
# `.ix` was removed in pandas 1.0. Slugs are string labels, so `.ix[:25]`
# sliced by position; `.head(25)` is the equivalent supported call.
(
    records.groupby('slug')['control_number']
    .count()
    .sort_values(ascending=False)
    .head(25)
    .plot(kind="bar")
)
Out[9]:
In [10]:
# Line plot of the number of records (non-null control_number) per pub_year.
# groupby already returns its keys in sorted order, so no explicit sort is
# needed; selecting the column first avoids counting every other column.
records.groupby('pub_year')['control_number'].count().plot()
Out[10]:
In [29]:
# records[records.pub_year < 1900].sort_values('pub_year').groupby('pub_year').count()['control_number'].plot()
In [30]:
# records[records.pub_year > 1900].sort_values('pub_year').groupby('pub_year').count()['control_number'].plot()
In [11]:
# Index of the ten slugs with the most records (non-null control_number).
#
# `.ix` was removed in pandas 1.0. Slugs are string labels, so `.ix[:10]`
# sliced by position; `.head(10)` is the equivalent supported call.
top_slugs = (
    records.groupby('slug')['control_number']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .index
)
In [12]:
# Restrict the records to those whose slug is one of the top slugs, then
# display the slugs themselves for reference.
top_producers = records.loc[records['slug'].isin(top_slugs)]
top_slugs
Out[12]:
In [13]:
# Records (non-null control_number) per (slug, pub_year) pair among the top
# producers. The result is a Series with a two-level (slug, pub_year) index.
# groupby orders the keys itself, so the explicit pre-sort was redundant, and
# selecting the column first avoids counting every other column.
group_top_producers = (
    top_producers.groupby(['slug', 'pub_year'])['control_number'].count()
)
In [22]:
# One column per selected slug, indexed by pub_year, holding that slug's record
# count for the year. Years where a slug has no records (but another selected
# slug does) are filled with 0.
#
# `.ix` was removed in pandas 1.0; `.loc[slug]` selects the same first-level
# label from the (slug, pub_year) index. A slug absent from
# group_top_producers still raises KeyError, as before.
top_producer_df = pd.DataFrame({
    slug: group_top_producers.loc[slug]
    for slug in (
        'sevilla,spain',
        'madrid,spain',
        'salamanca,spain',
        'barcelona,spain',
        'valladolid,spain',
        'alcala de henares,spain',
        # 'toledo,spain',
        # 'zaragoza,spain',
        # 'valencia,spain',
        # 'antwerp,netherlands',
    )
}).fillna(0)
In [23]:
# Line chart of yearly record counts, one line per selected slug.
top_producer_df.plot(kind='line')
Out[23]:
In [24]:
# Display the full per-year, per-slug count table built above.
top_producer_df
Out[24]:
In [25]:
# Total records (non-null control_number) per pub_year across all slugs.
# This is the denominator for the per-slug shares computed later.
# groupby orders pub_year itself, so no explicit pre-sort is needed.
counted_by_year = records.groupby('pub_year')['control_number'].count()
In [29]:
# Each selected slug's share of all records published in a given year
# (a fraction between 0 and 1, despite the "percent" in the name).
#
# The previous version divided by `counted_by_year.values`, a bare array, so
# the division was positional: it only worked when top_producer_df happened to
# contain exactly the same years, in the same order, as counted_by_year.
# Dividing by the Series reindexed to top_producer_df's pub_year index matches
# each row with the total for its own year.
top_producer_df_percent = top_producer_df[[
    'sevilla,spain',
    'madrid,spain',
    'salamanca,spain',
    'barcelona,spain',
    # 'valladolid,spain',
    'alcala de henares,spain',
    # 'toledo,spain',
    # 'zaragoza,spain',
    # 'valencia,spain',
    # 'antwerp,netherlands',
]].div(counted_by_year.reindex(top_producer_df.index), axis=0)
In [30]:
# Overlaid (non-stacked) area chart of each slug's yearly share of records.
top_producer_df_percent.plot(kind='area', stacked=False)
Out[30]:
In [ ]: