notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [2]:

    
# Load the bibliographic record set into a DataFrame; the later cells build
# all of their counts and plots from `records`.
records = pd.read_csv(filepath_or_buffer='../data/fucking_final_dataset.csv')



In [3]:

    
# Show only the first row, enough to see which columns the CSV provides.
records.iloc[:1]









    Out[3]:






  
    
      
      Unnamed: 0
      Unnamed: 0.1
      Unnamed: 0.1
      Unnamed: 0.1.1
      control_number
      title
      uniform_title
      author
      publisher
      pub_location
      pub_year
      translation
      prev_language
      title_auth_slug
      canonical_title
      canonical_author
      slug
      canonical_city
      canonical_country
      full_text_slug
    
  
  
    
      0
      0
      0
      0
      0
      1000686
      chao
      NaN
      sauvajon, marc-gilbert
      escelicer
      madrid
      1972
      NaN
      fre
      chao|sauvajon, marc-gilbert
      chao
      sauvajon, marc-gilbert
      madrid,spain
      madrid
      spain
      chao,sauvajon, marc-gilbert,madrid,1972,escelicer



In [4]:

    
# Default size (width, height) for figures created after this cell.
plt.rcParams.update({'figure.figsize': (12.0, 8.0)})



In [5]:

    
# Bar chart of the ten countries with the most records (non-null
# `control_number` values), saved as figure 1.
#
# `.ix` has been removed from pandas; on this label-indexed Series the old
# `.ix[:10]` sliced by position, so `.head(10)` keeps the same ten entries.
# Selecting `control_number` before `.count()` avoids counting every other
# column only to throw the result away.
(
    records.groupby('canonical_country')['control_number']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .plot(kind="bar")
)
plt.savefig('images/fig1_overall_places.png')



In [9]:

    
# Bar chart of the 25 slugs ("city,country" keys) with the most records
# (non-null `control_number` values), saved as figure 2.
#
# `.ix` has been removed from pandas; on this label-indexed Series the old
# `.ix[:25]` sliced by position, so `.head(25)` keeps the same 25 entries.
# Selecting `control_number` before `.count()` avoids counting every other
# column only to throw the result away.
(
    records.groupby('slug')['control_number']
    .count()
    .sort_values(ascending=False)
    .head(25)
    .plot(kind="bar")
)
plt.savefig('images/fig2_overall_cities.png')

Global Publication By Year



In [6]:

    
# Line plot of the number of records per publication year over the whole
# dataset, saved as figure 3.
#
# groupby() returns its group keys in sorted order by default, so the former
# full-frame sort_values('pub_year') was redundant work; counting only
# `control_number` likewise skips the per-column counts that were discarded.
(
    records.groupby('pub_year')['control_number']
    .count()
    .plot()
)
plt.savefig('images/fig3_pub_by_year.png')



In [8]:

    
# Records per publication year, restricted to years before 1900.
# groupby() already yields the years in sorted order, so no explicit sort of
# the frame is needed, and only `control_number` has to be counted.
(
    records[records.pub_year < 1900]
    .groupby('pub_year')['control_number']
    .count()
    .plot()
)









    Out[8]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f093b5b4400>



In [9]:

    
# Records per publication year, restricted to years after 1900.
# groupby() already yields the years in sorted order, so no explicit sort of
# the frame is needed, and only `control_number` has to be counted.
#
# NOTE(review): the companion plot filters `pub_year < 1900` and this one
# `pub_year > 1900`, so records dated exactly 1900 appear in neither.
# Left unchanged here - confirm whether that exclusion is intentional.
(
    records[records.pub_year > 1900]
    .groupby('pub_year')['control_number']
    .count()
    .plot()
)









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f093b598240>



In [12]:

    
# Index holding the ten slugs ("city,country" keys) with the most records.
# `.ix` has been removed from pandas; its `[:10]` slice was positional on this
# label-indexed Series, so `.head(10)` selects the same ten entries.
top_slugs = (
    records.groupby('slug')['control_number']
    .count()
    .sort_values(ascending=False)
    .head(10)
    .index
)



In [13]:

    
# Keep only the rows whose slug is one of the top slugs selected above.
top_producers = records.loc[records['slug'].isin(top_slugs)]



In [14]:

    
# Record count per (slug, publication year) for the top producers; the result
# is a Series with a two-level (slug, pub_year) index.
# groupby() sorts its keys by default, so the former sort_values('pub_year')
# on the whole frame was redundant, as was counting every column.
group_top_producers = (
    top_producers.groupby(['slug', 'pub_year'])['control_number'].count()
)



In [16]:

    
# Wide table of yearly record counts: one row per pub_year, one column per
# slug present in `group_top_producers`, with 0 where a slug has no records
# for that year.
#
# This replaces ten hand-written `.ix[...]` lookups (`.ix` has been removed
# from pandas) on a hard-coded slug list: unstack() pivots the `slug` index
# level into columns directly, so the table follows whichever slugs were
# grouped instead of a list that has to be kept in sync by hand.
top_producer_df = group_top_producers.unstack(level='slug').fillna(0)
# unstack() labels the columns axis 'slug'; clear it so the frame matches the
# previous dict-built one, which had no columns label.
top_producer_df.columns.name = None



In [17]:

    
# One line per top-producing slug across all years, saved as figure 4.
top_producer_df.plot(kind='line')
plt.savefig(fname='images/fig4_top_producers_by_year.png')



In [15]:

    
# Yearly record counts for the top producers, restricted to years before 1900.
top_pre_1900_producers = top_producers[top_producers.pub_year < 1900]

# Count per (slug, pub_year); groupby() sorts its keys by default, so the
# former sort_values('pub_year') on the frame was redundant.
group_top_pre_1900_producers = (
    top_pre_1900_producers.groupby(['slug', 'pub_year'])['control_number'].count()
)

# Pivot the `slug` index level into columns (rows: pub_year). This replaces
# ten `.ix[...]` lookups on a hard-coded slug list; `.ix` has been removed
# from pandas. Missing (slug, year) combinations stay NaN, as before. A slug
# with no rows in this period simply gets no column, where the old per-slug
# lookup would have raised KeyError.
top_pre_1900_producer_df = group_top_pre_1900_producers.unstack(level='slug')
# unstack() labels the columns axis 'slug'; clear it so the frame matches the
# previous dict-built one, which had no columns label.
top_pre_1900_producer_df.columns.name = None



In [16]:

    
# One line per top-producing slug for the pre-1900 counts.
top_pre_1900_producer_df.plot(kind='line')









    Out[16]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f093b5a4208>



In [17]:

    
# Yearly record counts for the top producers, restricted to years after 1900.
# NOTE(review): the pre-1900 frame uses `pub_year < 1900` and this one
# `pub_year > 1900`, so records dated exactly 1900 land in neither.
# Left unchanged here - confirm whether that exclusion is intentional.
top_post_1900_producers = top_producers[top_producers.pub_year > 1900]

# Count per (slug, pub_year); groupby() sorts its keys by default, so the
# former sort_values('pub_year') on the frame was redundant.
group_top_post_1900_producers = (
    top_post_1900_producers.groupby(['slug', 'pub_year'])['control_number'].count()
)

# Pivot the `slug` index level into columns (rows: pub_year). This replaces
# ten `.ix[...]` lookups on a hard-coded slug list; `.ix` has been removed
# from pandas. Missing (slug, year) combinations stay NaN, as before. A slug
# with no rows in this period simply gets no column, where the old per-slug
# lookup would have raised KeyError.
top_post_1900_producer_df = group_top_post_1900_producers.unstack(level='slug')
# unstack() labels the columns axis 'slug'; clear it so the frame matches the
# previous dict-built one, which had no columns label.
top_post_1900_producer_df.columns.name = None



In [18]:

    
# One line per top-producing slug for the post-1900 counts.
top_post_1900_producer_df.plot(kind='line')









    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f093b3df320>



In [21]:

    
# Total number of records per publication year across the whole dataset.
# groupby() sorts its keys by default, so the former sort_values('pub_year')
# was redundant; counting only `control_number` skips the unused columns.
counted_by_year = records.groupby('pub_year')['control_number'].count()

# Number of distinct publication years that have at least one group.
len(counted_by_year)









    Out[21]:





544



In [27]:

    
# Row count of the top-producer table (one row per publication year).
top_producer_df.shape[0]









    Out[27]:





533



In [29]:

    
# Display the yearly counts table: rows are publication years, columns are the
# top-producing slugs.
top_producer_df









    Out[29]:






  
    
      
      barcelona,spain
      bogota,colombia
      buenos aires,argentina
      lisbon,portugal
      madrid,spain
      mexico,mexico
      rio de janeiro,brazil
      santiago,chile
      sao paulo,brazil
      valencia,spain
    
    
      pub_year
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1451
      0
      0
      0
      0
      1
      0
      0
      0
      0
      0
    
    
      1452
      0
      0
      0
      0
      0
      0
      0
      0
      1
      0
    
    
      1463
      0
      0
      0
      0
      0
      1
      0
      0
      0
      0
    
    
      1480
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1481
      4
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1482
      2
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
    
      1483
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
    
      1484
      4
      0
      0
      0
      0
      0
      0
      0
      0
      4
    
    
      1488
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1490
      0
      0
      0
      0
      1
      0
      0
      0
      0
      2
    
    
      1491
      0
      0
      1
      0
      3
      0
      0
      0
      0
      2
    
    
      1492
      2
      0
      0
      1
      1
      0
      0
      0
      0
      0
    
    
      1493
      3
      0
      0
      0
      0
      0
      0
      0
      0
      3
    
    
      1494
      9
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1495
      7
      0
      0
      1
      0
      0
      0
      0
      0
      2
    
    
      1496
      0
      0
      0
      0
      0
      0
      0
      0
      0
      2
    
    
      1497
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
    
      1498
      2
      0
      0
      0
      0
      0
      0
      0
      0
      2
    
    
      1499
      1
      0
      0
      0
      1
      0
      0
      0
      0
      1
    
    
      1500
      0
      0
      0
      0
      0
      0
      0
      0
      0
      5
    
    
      1501
      2
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
    
      1502
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
    
      1503
      3
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1504
      2
      0
      0
      1
      1
      0
      0
      0
      0
      0
    
    
      1505
      2
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1506
      0
      0
      0
      1
      0
      0
      0
      0
      0
      1
    
    
      1509
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1510
      7
      0
      0
      0
      0
      0
      0
      0
      0
      7
    
    
      1511
      0
      0
      0
      0
      0
      0
      0
      0
      0
      7
    
    
      1512
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      1986
      18658
      3622
      5327
      2095
      20293
      13298
      3420
      3569
      5342
      1347
    
    
      1987
      17962
      3159
      5125
      2387
      18783
      13501
      3338
      3759
      5081
      1713
    
    
      1988
      17281
      3515
      5053
      2394
      21151
      14185
      2945
      3898
      5210
      1667
    
    
      1989
      16648
      3624
      4351
      2391
      20079
      12806
      2921
      3877
      4920
      1531
    
    
      1990
      18135
      4029
      3925
      2102
      21042
      13909
      2512
      3956
      4344
      1996
    
    
      1991
      18639
      3616
      4494
      2021
      22593
      14425
      2689
      3563
      5307
      1942
    
    
      1992
      21430
      3838
      5031
      2096
      24693
      15444
      2585
      3923
      5646
      2222
    
    
      1993
      21381
      4224
      5537
      2020
      23476
      15794
      2649
      4031
      5284
      2646
    
    
      1994
      22586
      4380
      6122
      2255
      25456
      18041
      3121
      4137
      5760
      2907
    
    
      1995
      24167
      4402
      5455
      2115
      28448
      15587
      3076
      3974
      5944
      3542
    
    
      1996
      25883
      4453
      5435
      2247
      31999
      16137
      3225
      3905
      6842
      3589
    
    
      1997
      29567
      4846
      6504
      2468
      31731
      18442
      4139
      4140
      9309
      4066
    
    
      1998
      31692
      5606
      6769
      2753
      35293
      18310
      4274
      3965
      10007
      4633
    
    
      1999
      32805
      5263
      7308
      2861
      35281
      19040
      4368
      4014
      10288
      4647
    
    
      2000
      34262
      5907
      6602
      2814
      35562
      22752
      4689
      4048
      10951
      4741
    
    
      2001
      33912
      6086
      6394
      2955
      35916
      19481
      4906
      4033
      11398
      4505
    
    
      2002
      34125
      6694
      5822
      2788
      36008
      19271
      5420
      4230
      12667
      4409
    
    
      2003
      33227
      6716
      7309
      2572
      36198
      19694
      5174
      4147
      11887
      4422
    
    
      2004
      32430
      6780
      7375
      2418
      34425
      19205
      5163
      3726
      11435
      3956
    
    
      2005
      33486
      6409
      7883
      2227
      35161
      19436
      4385
      4121
      10102
      4257
    
    
      2006
      32243
      6551
      6968
      2130
      32585
      19604
      3640
      3263
      10120
      4491
    
    
      2007
      33182
      6031
      6276
      2251
      33876
      17100
      3051
      3845
      8256
      4340
    
    
      2008
      34114
      7972
      5479
      2402
      32408
      15405
      3047
      3517
      7120
      4699
    
    
      2009
      29796
      8143
      4994
      2191
      24208
      13361
      3135
      3036
      5896
      3285
    
    
      2010
      26933
      5076
      3834
      2267
      19658
      10257
      1995
      2660
      3289
      3001
    
    
      2011
      23977
      2443
      3120
      1976
      17812
      6893
      1180
      2256
      1757
      2466
    
    
      2012
      17522
      1365
      1886
      1796
      14739
      4877
      671
      1624
      1379
      2185
    
    
      2013
      13990
      557
      851
      1475
      11002
      2456
      363
      536
      517
      1844
    
    
      2014
      11288
      175
      303
      1130
      7770
      1046
      224
      105
      222
      1253
    
    
      2015
      1040
      0
      3
      130
      752
      19
      8
      2
      1
      145
    
  

533 rows × 10 columns



In [30]:

    
# Disabled draft: each top producer's yearly count divided by the overall
# yearly count held in `counted_by_year`.
# NOTE(review): `counted_by_year` has 544 entries while `top_producer_df` has
# 533 rows (see the two len() outputs earlier in the notebook), so dividing by
# the positional `.values` array cannot line up row for row - presumably why
# this was commented out. An index-aligned division such as
# `top_producer_df.div(counted_by_year, axis=0)` would match on pub_year
# instead - TODO confirm before re-enabling.
# top_producer_df_percent = pd.DataFrame({
#     'madrid,spain': top_producer_df['madrid,spain'].divide(counted_by_year.values),
#     'barcelona,spain': top_producer_df['barcelona,spain'].divide(counted_by_year.values),
#     'mexico,mexico': top_producer_df['mexico,mexico'].divide(counted_by_year.values),
#     'buenos aires,argentina': top_producer_df['buenos aires,argentina'].divide(counted_by_year.values),
#     'sao paulo,brazil': top_producer_df['sao paulo,brazil'].divide(counted_by_year.values),
#     'santiago,chile': top_producer_df['santiago,chile'].divide(counted_by_year.values),
#     'rio de janeiro,brazil': top_producer_df['rio de janeiro,brazil'].divide(counted_by_year.values),
#     'bogota,colombia': top_producer_df['bogota,colombia'].divide(counted_by_year.values),
#     'lisbon,portugal': top_producer_df['lisbon,portugal'].divide(counted_by_year.values),
#     'valencia,spain': top_producer_df['valencia,spain'].divide(counted_by_year.values),
# })



In [ ]:

    
# Disabled: `top_producer_df_percent` is never defined while the cell that
# would build it remains commented out.
# top_producer_df_percent.plot.area(stacked=False)

	barcelona,spain	bogota,colombia	buenos aires,argentina	lisbon,portugal	madrid,spain	mexico,mexico	rio de janeiro,brazil	santiago,chile	sao paulo,brazil	valencia,spain
pub_year
1451	0	0	0	0	1	0	0	0	0	0
1452	0	0	0	0	0	0	0	0	1	0
1463	0	0	0	0	0	1	0	0	0	0
1480	1	0	0	0	0	0	0	0	0	0
1481	4	0	0	0	0	0	0	0	0	0
1482	2	0	0	0	0	0	0	0	0	1
1483	0	0	0	0	0	0	0	0	0	1
1484	4	0	0	0	0	0	0	0	0	4
1488	1	0	0	0	0	0	0	0	0	0
1490	0	0	0	0	1	0	0	0	0	2
1491	0	0	1	0	3	0	0	0	0	2
1492	2	0	0	1	1	0	0	0	0	0
1493	3	0	0	0	0	0	0	0	0	3
1494	9	0	0	0	0	0	0	0	0	0
1495	7	0	0	1	0	0	0	0	0	2
1496	0	0	0	0	0	0	0	0	0	2
1497	0	0	0	0	0	0	0	0	0	1
1498	2	0	0	0	0	0	0	0	0	2
1499	1	0	0	0	1	0	0	0	0	1
1500	0	0	0	0	0	0	0	0	0	5
1501	2	0	0	1	0	0	0	0	0	0
1502	0	0	0	0	0	0	0	0	0	1
1503	3	0	0	0	0	0	0	0	0	0
1504	2	0	0	1	1	0	0	0	0	0
1505	2	0	0	0	0	0	0	0	0	0
1506	0	0	0	1	0	0	0	0	0	1
1509	1	0	0	0	0	0	0	0	0	0
1510	7	0	0	0	0	0	0	0	0	7
1511	0	0	0	0	0	0	0	0	0	7
1512	0	0	0	0	0	0	0	0	0	1
...	...	...	...	...	...	...	...	...	...	...
1986	18658	3622	5327	2095	20293	13298	3420	3569	5342	1347
1987	17962	3159	5125	2387	18783	13501	3338	3759	5081	1713
1988	17281	3515	5053	2394	21151	14185	2945	3898	5210	1667
1989	16648	3624	4351	2391	20079	12806	2921	3877	4920	1531
1990	18135	4029	3925	2102	21042	13909	2512	3956	4344	1996
1991	18639	3616	4494	2021	22593	14425	2689	3563	5307	1942
1992	21430	3838	5031	2096	24693	15444	2585	3923	5646	2222
1993	21381	4224	5537	2020	23476	15794	2649	4031	5284	2646
1994	22586	4380	6122	2255	25456	18041	3121	4137	5760	2907
1995	24167	4402	5455	2115	28448	15587	3076	3974	5944	3542
1996	25883	4453	5435	2247	31999	16137	3225	3905	6842	3589
1997	29567	4846	6504	2468	31731	18442	4139	4140	9309	4066
1998	31692	5606	6769	2753	35293	18310	4274	3965	10007	4633
1999	32805	5263	7308	2861	35281	19040	4368	4014	10288	4647
2000	34262	5907	6602	2814	35562	22752	4689	4048	10951	4741
2001	33912	6086	6394	2955	35916	19481	4906	4033	11398	4505
2002	34125	6694	5822	2788	36008	19271	5420	4230	12667	4409
2003	33227	6716	7309	2572	36198	19694	5174	4147	11887	4422
2004	32430	6780	7375	2418	34425	19205	5163	3726	11435	3956
2005	33486	6409	7883	2227	35161	19436	4385	4121	10102	4257
2006	32243	6551	6968	2130	32585	19604	3640	3263	10120	4491
2007	33182	6031	6276	2251	33876	17100	3051	3845	8256	4340
2008	34114	7972	5479	2402	32408	15405	3047	3517	7120	4699
2009	29796	8143	4994	2191	24208	13361	3135	3036	5896	3285
2010	26933	5076	3834	2267	19658	10257	1995	2660	3289	3001
2011	23977	2443	3120	1976	17812	6893	1180	2256	1757	2466
2012	17522	1365	1886	1796	14739	4877	671	1624	1379	2185
2013	13990	557	851	1475	11002	2456	363	536	517	1844
2014	11288	175	303	1130	7770	1046	224	105	222	1253
2015	1040	0	3	130	752	19	8	2	1	145