In [1]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
records = pd.read_csv('../../data/fucking_final_dataset.csv')

In [3]:
records.head(1)


Out[3]:
Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1 Unnamed: 0.1.1 control_number title uniform_title author publisher pub_location pub_year translation prev_language title_auth_slug canonical_title canonical_author slug canonical_city canonical_country full_text_slug
0 0 0 0 0 1000686 chao NaN sauvajon, marc-gilbert escelicer madrid 1972 NaN fre chao|sauvajon, marc-gilbert chao sauvajon, marc-gilbert madrid,spain madrid spain chao,sauvajon, marc-gilbert,madrid,1972,escelicer

In [4]:
plt.rcParams['figure.figsize'] = (16.0, 8.0)

In [6]:
records.groupby('title_auth_slug').count()['control_number'].sort_values(inplace=False, ascending=False).ix[:50].plot(kind="bar")


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fe997b12128>

In [ ]:
len(records.groupby('title_auth_slug').count()['control_number'])

In [7]:
d = {k: len(v) for k, v in records.groupby("title_auth_slug").groups.items()}

def prob_dist(d):
    probs = {}
    for k, v in d.items():
        probs.setdefault(v, 0)
        probs[v] += 1
    return probs


probs = prob_dist(d)

In [8]:
plt.rcParams['figure.figsize'] = (12.0, 8.0)

In [9]:
plt.xscale("log")
plt.yscale("log")
plt.scatter(list(probs.keys()), list(probs.values()))


Out[9]:
<matplotlib.collections.PathCollection at 0x7f9b5ee4bda0>

In [7]:
top_slugs = records.groupby('title_auth_slug').count()['control_number'].sort_values(inplace=False, ascending=False).ix[:1000].index

In [13]:
top_slugs


Out[13]:
Index(['el ingenioso hidalgo don quijote de la mancha|cervantes saavedra, miguel de',
       'boletin informativo', 'el lazarillo de tormes',
       'novelas ejemplares|cervantes saavedra, miguel de',
       'don quijote de la mancha|cervantes saavedra, miguel de', 'boletin',
       'martin fierro|hernandez, jose', 'la iliada|homer',
       'la divina comedia|dante alighieri',
       'la isla del tesoro|stevenson, robert louis',
       'la celestina|rojas, fernando de',
       'constitucion politica de los estados unidos mexicanos|mexico',
       'codigo civil|espana', 'rimas y leyendas|becquer, gustavo adolfo',
       'la vida es sueno|calderon de la barca, pedro',
       'libro de buen amor|ruiz, juan', 'el principe|machiavelli, niccolo',
       'maria|isaacs, jorge', 'robinson crusoe|defoe, daniel',
       'las aventuras de tom sawyer|twain, mark',
       'marianela|perez galdos, benito',
       'la vuelta al mundo en ochenta dias|verne, jules', 'la odisea|homer',
       'pepita jimenez|valera, juan', 'poema de mio cid',
       'os lusiadas|camoes, luis de', 'fausto|goethe, johann wolfgang von',
       'alicia en el pais de las maravillas|carroll, lewis',
       'mujercitas|alcott, louisa may', 'fuenteovejuna|vega, lope de',
       'rimas|becquer, gustavo adolfo',
       'el principito|saint-exupery, antoine de', 'historia de espana',
       'cumbres borrascosas|bronte, emily',
       'don segundo sombra|guiraldes, ricardo',
       'don juan tenorio|zorrilla, jose', 'sagrada biblia',
       'los tres mosqueteros|dumas, alexandre',
       'platero y yo|jimenez, juan ramon',
       'el retrato de dorian gray|wilde, oscar',
       'el conde lucanor|juan manuel',
       'viaje al centro de la tierra|verne, jules',
       'veinte poemas de amor y una cancion desesperada|neruda, pablo',
       'madame bovary|flaubert, gustave', 'la regenta|alas, leopoldo',
       'la voragine|rivera, jose eustasio', 'las mil y una noches',
       'dona barbara|gallegos, romulo', 'butlleti informatiu',
       'el coronel no tiene quien le escriba|garcia marquez, gabriel'],
      dtype='object', name='title_auth_slug')

In [8]:
top_producers = records[records.title_auth_slug.isin(top_slugs)]

In [9]:
group_top_producers = top_producers.sort_values('pub_year').groupby(['title_auth_slug', 'pub_year']).count()['control_number']

In [10]:
top_producer_df1 = pd.DataFrame({
    'el ingenioso hidalgo don quijote de la mancha|cervantes saavedra, miguel de': group_top_producers.ix['el ingenioso hidalgo don quijote de la mancha|cervantes saavedra, miguel de'],
    'el lazarillo de tormes': group_top_producers.ix['novelas ejemplares,cervantes saavedra, miguel de'],
    'novelas ejemplares|cervantes saavedra, miguel de': group_top_producers.ix['don quijote de la mancha,cervantes saavedra, miguel de'],
    'don quijote de la mancha|cervantes saavedra, miguel de' group_top_producers.ix['martin fierro,hernandez, jose'],
    'martin fierro|hernandez, jose': group_top_producers.ix['la celestina,rojas, fernando de'],
    'la iliada|homer': group_top_producers.ix['libro de buen amor,ruiz, juan'],
    'la divina comedia|dante alighieri': group_top_producers.ix['la isla del tesoro,stevenson, robert louis'],
    'la isla del tesoro|stevenson, robert louis': group_top_producers.ix['constitucion politica de los estados unidos mexicanos,mexico'],
    'la celestina|rojas, fernando de': group_top_producers.ix['la vida es sueno,calderon de la barca, pedro'],
    'poema de mio cid,nan': group_top_producers.ix['poema de mio cid,nan'],
#     'orlando furioso|ariosto, lodovico': group_top_producers.ix['orlando furioso|ariosto, lodovico']
    }).fillna(0)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/index.py in get_value(self, series, key)
   4401             try:
-> 4402                 return _index.get_value_at(s, k)
   4403             except IndexError:

pandas/index.pyx in pandas.index.get_value_at (pandas/index.c:2408)()

pandas/src/util.pxd in util.get_value_at (pandas/index.c:15952)()

TypeError: 'str' object cannot be interpreted as an integer

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-10-ecd022c706df> in <module>()
     10 #     'la celestina|rojas, fernando de': group_top_producers.ix['la vida es sueno,calderon de la barca, pedro'],
     11 #     'poema de mio cid,nan': group_top_producers.ix['poema de mio cid,nan'],
---> 12     'orlando furioso|ariosto, lodovico': group_top_producers.ix['orlando furioso|ariosto, lodovico']
     13     }).fillna(0)

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/indexing.py in __getitem__(self, key)
     68             return self._getitem_tuple(key)
     69         else:
---> 70             return self._getitem_axis(key, axis=0)
     71 
     72     def _get_label(self, label, axis=0):

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
    936                     return self._get_loc(key, axis=axis)
    937 
--> 938             return self._get_label(key, axis=axis)
    939 
    940     def _getitem_iterable(self, key, axis=0):

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
     79                 return self.obj._xs(label, axis=axis)
     80             except:
---> 81                 return self.obj[label]
     82         elif (isinstance(label, tuple) and
     83                 isinstance(label[axis], slice)):

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/series.py in __getitem__(self, key)
    549     def __getitem__(self, key):
    550         try:
--> 551             result = self.index.get_value(self, key)
    552 
    553             if not np.isscalar(result):

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/index.py in get_value(self, series, key)
   4408                     raise InvalidIndexError(key)
   4409                 else:
-> 4410                     raise e1
   4411             except Exception:  # pragma: no cover
   4412                 raise e1

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/index.py in get_value(self, series, key)
   4392 
   4393         try:
-> 4394             return self._engine.get_value(s, k)
   4395         except KeyError as e1:
   4396             try:

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3204)()

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:2903)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3843)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12265)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12216)()

KeyError: 'orlando furioso|ariosto, lodovico'

In [ ]:
# Looking at orlando and the celestina

In [3]:
top_16th = records.sort_values('pub_year').groupby(['title_auth_slug', 'pub_year']).count()['control_number']

In [8]:
top_16th.ix['tragicomedia de calisto y melibea. enla ql se cotiene de mas de su agradable dulce estilo muchas sentencias filosofales: auisosmuy necessarios a macebos ... nueuamente anadido el tractado de centurio. [with woodcuts.] g.l']


Out[8]:
pub_year
1502    3
1518    1
1523    1
1525    1
1526    1
1529    1
1530    1
1534    1
1540    1
1545    1
1553    1
1558    1
1568    1
1570    1
Name: control_number, dtype: int64

In [11]:
top_16th.ix['tragicomedia de calisto y melibea|rojas, fernando de']


Out[11]:
pub_year
1539    1
1545    1
1568    1
1573    1
1595    1
1997    2
1998    1
1999    1
2000    1
2005    1
2009    2
Name: control_number, dtype: int64

In [9]:
top_16th.ix['la celestina|rojas, fernando de']


Out[9]:
pub_year
1886     1
1895     1
1900     1
1910     1
1913     6
1923     1
1925     1
1931     6
1935     1
1941     2
1942     1
1944     2
1945     1
1949     2
1951     2
1952     2
1954     2
1955     2
1956     1
1958     3
1959     8
1960     4
1961     4
1962     5
1963     8
1964     9
1965     2
1966     7
1967     8
1968    11
        ..
1985     7
1986     5
1987     6
1988     8
1989     5
1990     8
1991     5
1992     3
1993     5
1994     7
1995     5
1996    11
1997    10
1998     6
1999    16
2000     5
2001     5
2002    12
2003     2
2004     9
2005     6
2006     3
2007     6
2008     5
2009     5
2010     1
2011     4
2012     5
2013     2
2014     1
Name: control_number, dtype: int64

In [ ]:


In [27]:
top_producer_df1.plot()


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7faf2d3f67f0>

In [29]:
top_producer_df2 = pd.DataFrame({
    'lazarillo de tormes,nan': group_top_producers.ix['lazarillo de tormes,nan'],
    'la divina comedia,dante alighieri': group_top_producers.ix['la divina comedia,dante alighieri'],
    'maria,isaacs, jorge': group_top_producers.ix['maria,isaacs, jorge'],
    'pepita jimenez,valera, juan': group_top_producers.ix['pepita jimenez,valera, juan'],
    'el principe,machiavelli, niccolo': group_top_producers.ix['el principe,machiavelli, niccolo'],
    'robinson crusoe,defoe, daniel': group_top_producers.ix['robinson crusoe,defoe, daniel'],
    'marianela,perez galdos, benito': group_top_producers.ix['marianela,perez galdos, benito'],
    'don segundo sombra,guiraldes, ricardo': group_top_producers.ix['don segundo sombra,guiraldes, ricardo'],
    'episodios nacionales,perez galdos, benito': group_top_producers.ix['episodios nacionales,perez galdos, benito'],
    'rimas,becquer, gustavo adolfo': group_top_producers.ix['rimas,becquer, gustavo adolfo'],
}).fillna(0)

In [30]:
top_producer_df2.plot()


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fafdb24be10>

In [ ]:
top_producer_df3 = pd.DataFrame({
    'sagrada biblia,nan': group_top_producers.ix['sagrada biblia,nan'],
    'la regenta,alas, leopoldo': group_top_producers.ix['la regenta,alas, leopoldo'],
    'fausto,goethe, johann wolfgang von': group_top_producers.ix['fausto,goethe, johann wolfgang von'],
    'rimas y leyendas,becquer, gustavo adolfo': group_top_producers.ix['rimas y leyendas,becquer, gustavo adolfo'],
    'summa artishistoria general del arte,pijoan, jose': group_top_producers.ix['summa artishistoria general del arte,pijoan, jose'],
    'cien anos de soledad,garcia marquez, gabriel': group_top_producers.ix['cien anos de soledad,garcia marquez, gabriel'],
    'alicia en el pais de las maravillas,carroll, lewis': group_top_producers.ix['alicia en el pais de las maravillas,carroll, lewis'],
    'historia verdadera de la conquista de la nueva espana,diaz del castillo, bernal': group_top_producers.ix['historia verdadera de la conquista de la nueva espana,diaz del castillo, bernal'],
    'derecho civil espanol, comun y foral,castan tobenas, jose': group_top_producers.ix['derecho civil espanol, comun y foral,castan tobenas, jose'],
    'obras completas,ortega y gasset, jose': group_top_producers.ix['obras completas,ortega y gasset, jose'],
}).fillna(0)

In [ ]:
# top_producer_df3.plot()

In [31]:
top_producer_df4 = pd.DataFrame({
    'los tres mosqueteros,dumas, alexandre': group_top_producers.ix['los tres mosqueteros,dumas, alexandre'],
    'mujercitas,alcott, louisa may': group_top_producers.ix['mujercitas,alcott, louisa may'],
    'la voragine,rivera, jose eustasio': group_top_producers.ix['la voragine,rivera, jose eustasio'],
    'cumbres borrascosas,bronte, emily': group_top_producers.ix['cumbres borrascosas,bronte, emily'],
    'el principito,saint-exupery, antoine de': group_top_producers.ix['el principito,saint-exupery, antoine de'],
    'don juan tenorio,zorrilla, jose': group_top_producers.ix['don juan tenorio,zorrilla, jose'],
    'dona barbara,gallegos, romulo': group_top_producers.ix['dona barbara,gallegos, romulo'],
    'el conde lucanor,juan manuel': group_top_producers.ix['el conde lucanor,juan manuel'],
    'os lusiadas,camoes, luis de': group_top_producers.ix['os lusiadas,camoes, luis de'],
    'entremeses,cervantes saavedra, miguel de': group_top_producers.ix['entremeses,cervantes saavedra, miguel de'],
    'las aventuras de tom sawyer,twain, mark': group_top_producers.ix['las aventuras de tom sawyer,twain, mark'],
}).fillna(0)

In [32]:
top_producer_df4.plot()


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fafdb254b00>

In [ ]:
top_producer_df5 = pd.DataFrame({
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
}).fillna(0)

In [ ]:
# top_producer_df5.plot()