In [1]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [ ]:
# TODO: if time permits, clean up these LOCS (presumably the publication-location values; confirm).

In [2]:
# Load the dataset and keep only rows whose pub_year falls in the 1500s
# (strictly greater than 1499 and strictly less than 1600).
records = pd.read_csv('../data/fucking_final_dataset.csv')
records = records[(records.pub_year > 1499) & (records.pub_year < 1600)]

In [5]:
# Number of rows left after the pub_year filter.
records.shape[0]


Out[5]:
15127

In [6]:
# Peek at the first row to see which columns are available.
records.iloc[:1]


Out[6]:
Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1 Unnamed: 0.1.1 control_number title uniform_title author publisher pub_location pub_year translation prev_language title_auth_slug canonical_title canonical_author slug canonical_city canonical_country full_text_slug
571 571 670 670 670 14321112 libro de proprietatibus rerum en romancehystor... NaN bartholomaeus gaspar de avila, a costa de joan thomas fabio toledo 1529 NaN NaN libro de proprietatibus rerum en romancehystor... libro de proprietatibus rerum en romancehystor... bartholomaeus toledo,spain toledo spain libro de proprietatibus rerum en romancehystor...

In [7]:
# Default size (width, height in inches) for every figure drawn below.
plt.rcParams.update({'figure.figsize': (12.0, 6.0)})

In [8]:
# Bar chart of the ten countries with the most records.
# A record is counted when its `control_number` is non-null.
records_per_country = (
    records.groupby('canonical_country')['control_number']
    .count()
    .sort_values(ascending=False)
)
# `.ix` was removed in pandas 1.0. On this string-labelled index the integer
# slice was positional, which is exactly what `.iloc[:10]` does.
records_per_country.iloc[:10].plot(kind="bar")


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd0872fd9e8>

In [9]:
# Bar chart of the 25 location slugs with the most records.
# A record is counted when its `control_number` is non-null.
records_per_slug = (
    records.groupby('slug')['control_number']
    .count()
    .sort_values(ascending=False)
)
# `.ix` was removed in pandas 1.0. On this string-labelled index the integer
# slice was positional, which is exactly what `.iloc[:25]` does.
records_per_slug.iloc[:25].plot(kind="bar")


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd0af2e3ba8>

Global Publication By Year


In [10]:
# Line plot of the number of records (non-null `control_number`) per year.
# No pre-sort is needed: groupby already returns its keys in sorted order, and
# selecting the column first avoids counting every other column.
records.groupby('pub_year')['control_number'].count().plot()


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd0d7119fd0>

In [29]:
# records[records.pub_year < 1900].sort_values('pub_year').groupby('pub_year').count()['control_number'].plot()

In [30]:
# records[records.pub_year > 1900].sort_values('pub_year').groupby('pub_year').count()['control_number'].plot()

In [11]:
# Index of the ten location slugs with the most records (non-null
# `control_number`), most frequent first.
# `.ix` was removed in pandas 1.0; `.iloc[:10]` is the positional slice the
# original performed on this string-labelled index.
top_slugs = (
    records.groupby('slug')['control_number']
    .count()
    .sort_values(ascending=False)
    .iloc[:10]
    .index
)

In [12]:
# Keep only the records whose slug is one of the top slugs, then show that
# list of slugs as the cell output.
top_producers = records.loc[records['slug'].isin(top_slugs)]
top_slugs


Out[12]:
Index(['sevilla,spain', 'madrid,spain', 'salamanca,spain', 'barcelona,spain',
       'alcala de henares,spain', 'valladolid,spain', 'toledo,spain',
       'zaragoza,spain', 'valencia,spain', 's.l.'],
      dtype='object', name='slug')

In [13]:
# Records (non-null `control_number`) per (slug, pub_year) pair, as a Series
# with a two-level index. No pre-sort is needed: groupby already returns its
# keys in sorted order, and selecting the column first avoids counting every
# other column.
group_top_producers = top_producers.groupby(['slug', 'pub_year'])['control_number'].count()

In [22]:
# Yearly record counts for the cities being compared: one column per slug,
# one row per pub_year.
plotted_slugs = [
    'sevilla,spain',
    'madrid,spain',
    'salamanca,spain',
    'barcelona,spain',
    'valladolid,spain',
    'alcala de henares,spain',
#     'toledo,spain',
#     'zaragoza,spain',
#     'valencia,spain',
#     'antwerp,netherlands',
]
# `.ix` was removed in pandas 1.0. `.loc[slug]` selects the outer level of the
# (slug, pub_year) index and leaves a Series keyed by pub_year.
top_producer_df = (
    pd.DataFrame({slug: group_top_producers.loc[slug] for slug in plotted_slugs})
    .fillna(0)            # years in which a city has no records become 0
    .sort_index(axis=1)   # alphabetical columns on every pandas version
)

In [23]:
# One line per city: records published per year.
top_producer_df.plot.line()


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd1319964a8>

In [24]:
# Show the per-year counts table (pandas elides the middle rows in the output).
top_producer_df


Out[24]:
alcala de henares,spain barcelona,spain madrid,spain salamanca,spain sevilla,spain valladolid,spain
pub_year
1500 0 0 0 10 13 3
1501 0 2 0 1 4 1
1502 2 0 0 2 13 2
1503 3 3 0 6 8 0
1504 0 2 1 0 2 0
1505 0 2 0 2 0 0
1506 0 0 0 3 3 1
1507 0 0 0 1 1 0
1508 0 0 0 0 4 1
1509 0 1 0 9 0 2
1510 0 6 0 3 3 4
1511 2 0 0 1 7 4
1512 2 0 0 4 11 1
1513 2 2 1 0 3 1
1514 0 0 1 1 4 0
1515 0 1 0 0 5 4
1516 1 0 0 2 4 4
1517 2 1 0 1 4 0
1518 0 6 0 0 16 1
1519 3 0 0 4 12 3
1520 1 5 0 1 16 0
1521 0 0 0 2 5 0
1522 0 0 0 0 6 0
1523 1 1 0 0 10 0
1524 3 0 0 6 9 0
1525 1 1 0 5 10 0
1526 1 0 0 3 18 2
1527 2 0 0 0 23 9
1528 2 6 0 2 16 0
1529 8 0 0 0 6 4
... ... ... ... ... ... ...
1570 11 0 8 36 24 17
1571 9 10 14 22 31 0
1572 10 14 14 19 16 21
1573 14 8 18 23 7 0
1574 5 6 9 27 19 0
1575 8 7 15 20 12 1
1576 6 9 10 33 14 1
1577 14 3 17 11 18 0
1578 7 6 33 20 3 3
1579 3 0 6 20 6 0
1580 14 10 18 17 49 2
1581 18 10 10 22 9 1
1582 10 9 10 25 38 0
1583 1 7 30 18 15 9
1584 17 6 43 12 20 4
1585 5 11 38 17 15 5
1586 16 27 30 18 15 7
1587 30 20 36 38 27 9
1588 15 35 43 34 25 12
1589 17 22 57 37 3 24
1590 7 7 72 11 13 6
1591 10 31 81 10 5 2
1592 20 41 51 13 4 12
1593 0 11 42 1 5 6
1594 5 38 74 11 17 9
1595 11 9 87 52 18 8
1596 9 20 77 26 14 5
1597 7 42 65 21 1 5
1598 6 49 120 10 15 9
1599 18 36 104 15 46 7

100 rows × 6 columns


In [25]:
# Total records (non-null `control_number`) per pub_year across all locations.
# No pre-sort is needed: groupby already returns its keys in sorted order, and
# selecting the column first avoids counting every other column.
counted_by_year = records.groupby('pub_year')['control_number'].count()

In [29]:
# Each city's share of all records published in a given year. The values are
# fractions (city count / yearly total), not percentages, despite the name.
share_slugs = [
    'sevilla,spain',
    'madrid,spain',
    'salamanca,spain',
    'barcelona,spain',
#     'valladolid,spain',
    'alcala de henares,spain',
#     'toledo,spain',
#     'zaragoza,spain',
#     'valencia,spain',
#     'antwerp,netherlands',
]
# Divide by totals aligned on pub_year. Dividing by `counted_by_year.values`
# ignored the year labels and only worked when both objects happened to have
# exactly the same rows in the same order.
yearly_totals = counted_by_year.reindex(top_producer_df.index)
top_producer_df_percent = (
    top_producer_df[share_slugs]
    .div(yearly_totals, axis=0)
    .sort_index(axis=1)   # alphabetical columns on every pandas version
)

In [30]:
# Overlaid (not stacked) area chart of each city's yearly share.
top_producer_df_percent.plot(kind='area', stacked=False)


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fd0830b3780>

In [ ]: