In [1]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the dataset and keep only rows whose publication year lies strictly
# between 1699 and 1800, i.e. the years 1700-1799.
records = (
    pd.read_csv('../data/fucking_final_dataset.csv')
    .loc[lambda frame: (frame['pub_year'] > 1699) & (frame['pub_year'] < 1800)]
)

In [3]:
# Number of rows remaining after the 1700-1799 filter.
records.shape[0]


Out[3]:
100042

In [4]:
# Peek at the first row to see which columns are available.
records.iloc[:1]


Out[4]:
Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1 Unnamed: 0.1.1 control_number title uniform_title author publisher pub_location pub_year translation prev_language title_auth_slug canonical_title canonical_author slug canonical_city canonical_country full_text_slug
102 102 117 117 117 11003743 historia de las guerras de los judios y de la ... NaN josephus, flavius en la oficina de don benito cano madrid 1791 NaN NaN historia de las guerras de los judios y de la ... historia de las guerras de los judios y de la ... josephus, flavius madrid,spain madrid spain historia de las guerras de los judios y de la ...

In [5]:
# Default size (width, height in inches) for every figure drawn below.
plt.rcParams.update({'figure.figsize': (12.0, 6.0)})

In [6]:
# Bar chart of the ten countries with the most records.
# `.ix` was removed in pandas 1.0; on this string-labelled index it acted as a
# positional slice, so `.head(10)` selects the same first ten entries.
# Selecting `control_number` before counting avoids counting every column.
(
    records.groupby('canonical_country')['control_number']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .plot(kind="bar", title="Records per country (top 10)")
)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcd7a798b70>

In [7]:
# Bar chart of the 25 place-of-publication slugs (e.g. 'madrid,spain') with
# the most records.
# `.ix` was removed in pandas 1.0; on this string-labelled index it acted as a
# positional slice, so `.head(25)` selects the same first 25 entries.
(
    records.groupby('slug')['control_number']
    .count()
    .sort_values(ascending=False)
    .head(25)
    .plot(kind="bar", title="Records per place of publication (top 25)")
)


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcd7e7fab00>

Global Publication By Year


In [9]:
# Line plot of the number of records per publication year.
# groupby already returns its keys in sorted order, so sorting the whole frame
# first is unnecessary; selecting `control_number` before counting avoids
# counting every other column only to discard the result.
records.groupby('pub_year')['control_number'].count().plot()


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fcde46a5390>

In [10]:
# records[records.pub_year < 1900].sort_values('pub_year').groupby('pub_year').count()['control_number'].plot()

In [11]:
# records[records.pub_year > 1900].sort_values('pub_year').groupby('pub_year').count()['control_number'].plot()

In [12]:
# The eleven place-of-publication slugs with the most records.
# `.ix[:11]` (removed in pandas 1.0) sliced positionally on this string index,
# so `.head(11)` keeps the same eleven entries.
top_slugs = (
    records.groupby('slug')['control_number']
    .count()
    .sort_values(ascending=False)
    .head(11)
    .index
)
top_slugs


Out[12]:
Index(['madrid,spain', 'mexico,mexico', 'barcelona,spain', 'lisbon,portugal',
       's.l.', 'valencia,spain', 'sevilla,spain', 'lima,peru',
       'salamanca,spain', 'zaragoza,spain', 'puebla,mexico'],
      dtype='object', name='slug')

In [13]:
# Restrict the records to those published in one of the top slugs.
is_top_slug = records['slug'].isin(top_slugs)
top_producers = records[is_top_slug]

In [14]:
# Records per (slug, pub_year) for the top producers, as a Series with a
# two-level index.
# groupby sorts its keys itself, so the previous full-frame sort was redundant;
# selecting `control_number` first avoids counting every column.
group_top_producers = (
    top_producers.groupby(['slug', 'pub_year'])['control_number'].count()
)

In [20]:
# Yearly record counts for the cities to chart, one column per slug and one
# row per publication year. Years in which a city has no records become 0.
# Add or remove slugs from this list (any value in `top_slugs`) to change
# which cities are plotted.
plotted_slugs = ['mexico,mexico', 'lima,peru', 'puebla,mexico']

# `.ix` was removed in pandas 1.0; `.loc[slug]` is the label-based lookup on
# the first index level that it performed here.
top_producer_df = pd.DataFrame(
    {slug: group_top_producers.loc[slug] for slug in plotted_slugs}
).fillna(0)

In [21]:
# Line chart of yearly record counts, one line per column of top_producer_df.
top_producer_df.plot(kind='line')


Out[21]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fce728dc0f0>

In [17]:
# Total number of records per publication year across all places.
# groupby sorts its keys itself, so the previous full-frame sort was redundant;
# selecting `control_number` first avoids counting every column.
counted_by_year = records.groupby('pub_year')['control_number'].count()

In [18]:
# Share of each year's total records that came from each city.
# Values are fractions in [0, 1], not percentages, despite the variable name.
#
# The counts are taken straight from `group_top_producers` rather than from
# `top_producer_df`, which only holds the subset of cities chosen for the
# line chart and would raise KeyError for the other slugs on a fresh run.
share_slugs = [
    'madrid,spain',
    'mexico,mexico',
    'barcelona,spain',
    'lisbon,portugal',
    'valencia,spain',
    'sevilla,spain',
    'lima,peru',
    'salamanca,spain',
    'zaragoza,spain',
    'puebla,mexico',
]

# One column per city, one row per year that has any record at all; years in
# which a city published nothing count as 0.
yearly_city_counts = (
    pd.DataFrame({slug: group_top_producers.loc[slug] for slug in share_slugs})
    .reindex(counted_by_year.index)
    .fillna(0)
)

# Divide row-wise by the yearly totals, aligned on pub_year. Dividing by the
# bare `.values` array would pair rows by position and either mis-align years
# or fail when the two indexes differ in length.
top_producer_df_percent = yearly_city_counts.div(counted_by_year, axis=0)

In [19]:
# Overlaid (non-stacked) area chart of each city's yearly share of records.
top_producer_df_percent.plot(kind='area', stacked=False)


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fce3d6ebba8>

In [ ]: