In [1]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the full bibliographic dataset; every later cell reads from `records`.
dataset_path = '../data/fucking_final_dataset.csv'
records = pd.read_csv(dataset_path)

In [3]:
# Peek at the first row to check which columns were loaded.
records.iloc[:1]


Out[3]:
Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1 Unnamed: 0.1.1 control_number title uniform_title author publisher pub_location pub_year translation prev_language title_auth_slug canonical_title canonical_author slug canonical_city canonical_country full_text_slug
0 0 0 0 0 1000686 chao NaN sauvajon, marc-gilbert escelicer madrid 1972 NaN fre chao|sauvajon, marc-gilbert chao sauvajon, marc-gilbert madrid,spain madrid spain chao,sauvajon, marc-gilbert,madrid,1972,escelicer

In [4]:
# Default size (inches) for every figure drawn below.
plt.rc('figure', figsize=(12.0, 8.0))

In [5]:
# Figure 1: the ten countries with the most catalogued records.
# `.ix` was removed in pandas 1.0; `.head(10)` is the positional equivalent of
# the old `.ix[:10]` on this string-labelled index. Only `control_number` is
# counted instead of counting every column and then discarding all but one.
country_counts = records.groupby('canonical_country')['control_number'].count()
country_counts.sort_values(ascending=False).head(10).plot(kind="bar")
plt.savefig('images/fig1_overall_places.png')



In [9]:
# Figure 2: the 25 city slugs ("city,country") with the most catalogued records.
# `.ix` was removed in pandas 1.0; `.head(25)` is the positional equivalent of
# the old `.ix[:25]` on this string-labelled index.
city_counts = records.groupby('slug')['control_number'].count()
city_counts.sort_values(ascending=False).head(25).plot(kind="bar")
plt.savefig('images/fig2_overall_cities.png')


Global Publication By Year


In [6]:
# Figure 3: number of records per publication year, whole dataset.
# groupby() already returns its keys in sorted order, so pre-sorting the full
# frame is unnecessary; only the one needed column is counted.
records.groupby('pub_year')['control_number'].count().plot()
plt.savefig('images/fig3_pub_by_year.png')



In [8]:
# Records per publication year, restricted to years before 1900.
# groupby() orders its keys itself, so no explicit sort of the frame is needed.
records[records.pub_year < 1900].groupby('pub_year')['control_number'].count().plot()


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f093b5b4400>

In [9]:
# Records per publication year from 1900 onwards.
# `>=` rather than `>`: the companion view filters on `< 1900`, so a strict
# `> 1900` here silently dropped the year 1900 from both halves.
# groupby() orders its keys itself, so no explicit sort of the frame is needed.
records[records.pub_year >= 1900].groupby('pub_year')['control_number'].count().plot()


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f093b598240>

In [12]:
# Index of the ten city slugs with the most records, largest first.
# `.ix` was removed in pandas 1.0; `.head(10)` is the positional equivalent of
# the old `.ix[:10]` on this string-labelled index.
top_slugs = (
    records.groupby('slug')['control_number']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .index
)

In [13]:
# Keep only the rows whose city slug is one of the top producers.
is_top_slug = records['slug'].isin(top_slugs)
top_producers = records.loc[is_top_slug]

In [14]:
# Record counts per (slug, pub_year) for the top-producing cities.
# groupby() returns its keys sorted, so the frame does not need sorting first,
# and only `control_number` is counted rather than every column.
group_top_producers = top_producers.groupby(['slug', 'pub_year'])['control_number'].count()

In [16]:
# One column per top-producing city, one row per publication year; years in
# which a city has no records are filled with 0.
# Pivoting the `slug` index level into columns replaces ten hand-written
# `.ix[...]` lookups: `.ix` was removed in pandas 1.0, and the hard-coded city
# list could silently drift from whatever slugs `group_top_producers` holds.
top_producer_df = group_top_producers.unstack('slug').fillna(0)
# unstack() labels the column axis "slug"; clear it so the frame looks like the
# dict-built original (unnamed columns, index named `pub_year`).
top_producer_df.columns.name = None

In [17]:
# Figure 4: yearly record counts for each top-producing city, one line per city.
producers_ax = top_producer_df.plot()
producers_ax.get_figure().savefig('images/fig4_top_producers_by_year.png')



In [15]:
# Yearly record counts per top-producing city, restricted to years before 1900.
top_pre_1900_producers = top_producers[top_producers.pub_year < 1900]
# groupby() returns its keys sorted, so no pre-sort is needed; count one column only.
group_top_pre_1900_producers = top_pre_1900_producers.groupby(['slug', 'pub_year'])['control_number'].count()
# Pivot the `slug` level into columns instead of ten hard-coded `.ix[...]`
# lookups: `.ix` was removed in pandas 1.0, and a hard-coded lookup raises
# KeyError for any city with no pre-1900 records. Missing (city, year) pairs
# stay NaN, as in the dict-built original.
top_pre_1900_producer_df = group_top_pre_1900_producers.unstack('slug')
# Drop the "slug" label unstack() puts on the column axis.
top_pre_1900_producer_df.columns.name = None

In [16]:
# Line chart of the pre-1900 counts, one line per city.
top_pre_1900_producer_df.plot.line()


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f093b5a4208>

In [17]:
# Yearly record counts per top-producing city, from 1900 onwards.
# `>=` rather than `>`: the pre-1900 frame filters on `< 1900`, so a strict
# `> 1900` here left the year 1900 out of both halves.
top_post_1900_producers = top_producers[top_producers.pub_year >= 1900]
# groupby() returns its keys sorted, so no pre-sort is needed; count one column only.
group_top_post_1900_producers = top_post_1900_producers.groupby(['slug', 'pub_year'])['control_number'].count()
# Pivot the `slug` level into columns instead of ten hard-coded `.ix[...]`
# lookups (`.ix` was removed in pandas 1.0). Missing (city, year) pairs stay
# NaN, as in the dict-built original.
top_post_1900_producer_df = group_top_post_1900_producers.unstack('slug')
# Drop the "slug" label unstack() puts on the column axis.
top_post_1900_producer_df.columns.name = None

In [18]:
# Line chart of the post-1900 counts, one line per city.
top_post_1900_producer_df.plot.line()


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f093b3df320>

In [21]:
# Total records per publication year across the whole dataset.
# groupby() returns its keys sorted, so the frame is not sorted first, and only
# `control_number` is counted rather than every column.
counted_by_year = records.groupby('pub_year')['control_number'].count()
# Number of distinct publication years that have at least one counted record.
len(counted_by_year)


Out[21]:
544

In [27]:
# Row count of the top-producer table (one row per publication year).
top_producer_df.shape[0]


Out[27]:
533

In [29]:
# Show the per-year, per-city count table (pandas truncates the middle rows).
top_producer_df


Out[29]:
barcelona,spain bogota,colombia buenos aires,argentina lisbon,portugal madrid,spain mexico,mexico rio de janeiro,brazil santiago,chile sao paulo,brazil valencia,spain
pub_year
1451 0 0 0 0 1 0 0 0 0 0
1452 0 0 0 0 0 0 0 0 1 0
1463 0 0 0 0 0 1 0 0 0 0
1480 1 0 0 0 0 0 0 0 0 0
1481 4 0 0 0 0 0 0 0 0 0
1482 2 0 0 0 0 0 0 0 0 1
1483 0 0 0 0 0 0 0 0 0 1
1484 4 0 0 0 0 0 0 0 0 4
1488 1 0 0 0 0 0 0 0 0 0
1490 0 0 0 0 1 0 0 0 0 2
1491 0 0 1 0 3 0 0 0 0 2
1492 2 0 0 1 1 0 0 0 0 0
1493 3 0 0 0 0 0 0 0 0 3
1494 9 0 0 0 0 0 0 0 0 0
1495 7 0 0 1 0 0 0 0 0 2
1496 0 0 0 0 0 0 0 0 0 2
1497 0 0 0 0 0 0 0 0 0 1
1498 2 0 0 0 0 0 0 0 0 2
1499 1 0 0 0 1 0 0 0 0 1
1500 0 0 0 0 0 0 0 0 0 5
1501 2 0 0 1 0 0 0 0 0 0
1502 0 0 0 0 0 0 0 0 0 1
1503 3 0 0 0 0 0 0 0 0 0
1504 2 0 0 1 1 0 0 0 0 0
1505 2 0 0 0 0 0 0 0 0 0
1506 0 0 0 1 0 0 0 0 0 1
1509 1 0 0 0 0 0 0 0 0 0
1510 7 0 0 0 0 0 0 0 0 7
1511 0 0 0 0 0 0 0 0 0 7
1512 0 0 0 0 0 0 0 0 0 1
... ... ... ... ... ... ... ... ... ... ...
1986 18658 3622 5327 2095 20293 13298 3420 3569 5342 1347
1987 17962 3159 5125 2387 18783 13501 3338 3759 5081 1713
1988 17281 3515 5053 2394 21151 14185 2945 3898 5210 1667
1989 16648 3624 4351 2391 20079 12806 2921 3877 4920 1531
1990 18135 4029 3925 2102 21042 13909 2512 3956 4344 1996
1991 18639 3616 4494 2021 22593 14425 2689 3563 5307 1942
1992 21430 3838 5031 2096 24693 15444 2585 3923 5646 2222
1993 21381 4224 5537 2020 23476 15794 2649 4031 5284 2646
1994 22586 4380 6122 2255 25456 18041 3121 4137 5760 2907
1995 24167 4402 5455 2115 28448 15587 3076 3974 5944 3542
1996 25883 4453 5435 2247 31999 16137 3225 3905 6842 3589
1997 29567 4846 6504 2468 31731 18442 4139 4140 9309 4066
1998 31692 5606 6769 2753 35293 18310 4274 3965 10007 4633
1999 32805 5263 7308 2861 35281 19040 4368 4014 10288 4647
2000 34262 5907 6602 2814 35562 22752 4689 4048 10951 4741
2001 33912 6086 6394 2955 35916 19481 4906 4033 11398 4505
2002 34125 6694 5822 2788 36008 19271 5420 4230 12667 4409
2003 33227 6716 7309 2572 36198 19694 5174 4147 11887 4422
2004 32430 6780 7375 2418 34425 19205 5163 3726 11435 3956
2005 33486 6409 7883 2227 35161 19436 4385 4121 10102 4257
2006 32243 6551 6968 2130 32585 19604 3640 3263 10120 4491
2007 33182 6031 6276 2251 33876 17100 3051 3845 8256 4340
2008 34114 7972 5479 2402 32408 15405 3047 3517 7120 4699
2009 29796 8143 4994 2191 24208 13361 3135 3036 5896 3285
2010 26933 5076 3834 2267 19658 10257 1995 2660 3289 3001
2011 23977 2443 3120 1976 17812 6893 1180 2256 1757 2466
2012 17522 1365 1886 1796 14739 4877 671 1624 1379 2185
2013 13990 557 851 1475 11002 2456 363 536 517 1844
2014 11288 175 303 1130 7770 1046 224 105 222 1253
2015 1040 0 3 130 752 19 8 2 1 145

533 rows × 10 columns


In [30]:
# Disabled: each city's share of all records published in a given year.
# NOTE(review): `.divide(counted_by_year.values)` divides by a bare array, which
# requires both sides to have the same length. The lengths printed earlier in
# this notebook differ (544 years overall vs 533 rows in `top_producer_df`),
# which is presumably why this cell is commented out - TODO confirm.
# Dividing by the `counted_by_year` Series itself, aligned on `pub_year`
# (e.g. `top_producer_df.div(counted_by_year, axis=0)`), may be what was intended.
# top_producer_df_percent = pd.DataFrame({
#     'madrid,spain': top_producer_df['madrid,spain'].divide(counted_by_year.values),
#     'barcelona,spain': top_producer_df['barcelona,spain'].divide(counted_by_year.values),
#     'mexico,mexico': top_producer_df['mexico,mexico'].divide(counted_by_year.values),
#     'buenos aires,argentina': top_producer_df['buenos aires,argentina'].divide(counted_by_year.values),
#     'sao paulo,brazil': top_producer_df['sao paulo,brazil'].divide(counted_by_year.values),
#     'santiago,chile': top_producer_df['santiago,chile'].divide(counted_by_year.values),
#     'rio de janeiro,brazil': top_producer_df['rio de janeiro,brazil'].divide(counted_by_year.values),
#     'bogota,colombia': top_producer_df['bogota,colombia'].divide(counted_by_year.values),
#     'lisbon,portugal': top_producer_df['lisbon,portugal'].divide(counted_by_year.values),
#     'valencia,spain': top_producer_df['valencia,spain'].divide(counted_by_year.values),
# })

In [ ]:
# Disabled: overlaid (non-stacked) area chart of `top_producer_df_percent`,
# which is only defined by the commented-out cell above.
# top_producer_df_percent.plot.area(stacked=False)