In [1]:
%matplotlib inline
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
records = pd.read_csv('../../data/fucking_final_dataset.csv')
records = records[records.pub_year > 1499]
records = records[records.pub_year < 1600]

In [4]:
records.head(1)


Out[4]:
Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1 Unnamed: 0.1.1 control_number title uniform_title author publisher pub_location pub_year translation prev_language title_auth_slug canonical_title canonical_author slug canonical_city canonical_country full_text_slug
571 571 670 670 670 14321112 libro de proprietatibus rerum en romancehystor... NaN bartholomaeus gaspar de avila, a costa de joan thomas fabio toledo 1529 NaN NaN libro de proprietatibus rerum en romancehystor... libro de proprietatibus rerum en romancehystor... bartholomaeus toledo,spain toledo spain libro de proprietatibus rerum en romancehystor...

In [5]:
len(records)


Out[5]:
15127

In [6]:
plt.rcParams['figure.figsize'] = (16.0, 8.0)

In [8]:
records.groupby('title_auth_slug').count()['control_number'].sort_values(inplace=False, ascending=False).ix[:50].plot(kind="bar")


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fa3f7289c18>

In [10]:
len(records.groupby('title_auth_slug').count()['control_number'])


Out[10]:
13823

In [11]:
d = {k: len(v) for k, v in records.groupby("title_auth_slug").groups.items()}

def prob_dist(d):
    probs = {}
    for k, v in d.items():
        probs.setdefault(v, 0)
        probs[v] += 1
    return probs


probs = prob_dist(d)

In [12]:
plt.rcParams['figure.figsize'] = (12.0, 8.0)

In [13]:
plt.xscale("log")
plt.yscale("log")
plt.scatter(list(probs.keys()), list(probs.values()))


Out[13]:
<matplotlib.collections.PathCollection at 0x7fa475e271d0>

In [14]:
top_slugs = records.groupby('title_auth_slug').count()['control_number'].sort_values(inplace=False, ascending=False).ix[:10].index

In [15]:
top_slugs


Out[15]:
Index(['orlando furioso|ariosto, lodovico',
       'tragicomedia de calisto y melibea. enla ql se cotiene de mas de su agradable dulce estilo muchas sentencias filosofales: auisosmuy necessarios a macebos ... nueuamente anadido el tractado de centurio. [with woodcuts.] g.l',
       'oratorio de religiosos y exercicio de virtuosos|guevara, antonio de',
       'tratado del amor de dios|fonseca, cristobal de',
       'os lusiadas|camoes, luis de',
       'vocabulario de las dos lenguas toscana y castellana|casas, cristobal de las',
       'el latino de repente|palmireno, juan lorenzo',
       'el cortesano|castiglione, baldassarre',
       'carcel de amor|san pedro, diego de',
       'las siete partidas del sabio rey don alonso el nono'],
      dtype='object', name='title_auth_slug')

In [13]:
top_producers = records[records.text_slug.isin(top_slugs)]

In [14]:
group_top_producers = top_producers.sort_values('pub_year').groupby(['text_slug', 'pub_year']).count()['control_number']

In [15]:
top_producer_df1 = pd.DataFrame({
    'tratado del amor de dios,fonseca, cristobal de': group_top_producers.ix['tratado del amor de dios,fonseca, cristobal de'],
    'la perfecta casada,leon, luis de': group_top_producers.ix['la perfecta casada,leon, luis de'],
    'breuissima relacion de la destruycion de las indias,casas, bartolome de las': group_top_producers.ix['breuissima relacion de la destruycion de las indias,casas, bartolome de las'],
    'de los nombres de christoen tres libros,leon, luis de': group_top_producers.ix['de los nombres de christoen tres libros,leon, luis de'],
    'vocabulario en lengua castellana y mexicana,molina, alonso de': group_top_producers.ix['vocabulario en lengua castellana y mexicana,molina, alonso de'],
    'orlando furioso,ariosto, lodovico': group_top_producers.ix['orlando furioso,ariosto, lodovico'],
    'arte de la lengua mexicana y castellana,molina, alonso de': group_top_producers.ix['arte de la lengua mexicana y castellana,molina, alonso de'],
    'las siete partidas del sabio rey don alonso el nono,nan': group_top_producers.ix['las siete partidas del sabio rey don alonso el nono,nan'],
    'tragicomedia de calisto y melibea': group_top_producers.ix['tragicomedia de calisto y melibea. enla ql se cotiene de mas de su agradable dulce estilo muchas sentencias filosofales: auisosmuy necessarios a macebos ... nueuamente anadido el tractado de centurio. [with woodcuts.] g.l,nan'],
    'oratorio de religiosos y exercicio de virtuosos,guevara, antonio de': group_top_producers.ix['oratorio de religiosos y exercicio de virtuosos,guevara, antonio de'],
    }).fillna(0)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/index.py in get_value(self, series, key)
   4401             try:
-> 4402                 return _index.get_value_at(s, k)
   4403             except IndexError:

pandas/index.pyx in pandas.index.get_value_at (pandas/index.c:2408)()

pandas/src/util.pxd in util.get_value_at (pandas/index.c:15952)()

TypeError: 'str' object cannot be interpreted as an integer

During handling of the above exception, another exception occurred:

KeyError                                  Traceback (most recent call last)
<ipython-input-15-079698e348a3> in <module>()
      2     'tratado del amor de dios,fonseca, cristobal de': group_top_producers.ix['tratado del amor de dios,fonseca, cristobal de'],
      3     'la perfecta casada,leon, luis de': group_top_producers.ix['la perfecta casada,leon, luis de'],
----> 4     'breuissima relacion de la destruycion de las indias,casas, bartolome de las': group_top_producers.ix['breuissima relacion de la destruycion de las indias,casas, bartolome de las'],
      5     'de los nombres de christoen tres libros,leon, luis de': group_top_producers.ix['de los nombres de christoen tres libros,leon, luis de'],
      6     'vocabulario en lengua castellana y mexicana,molina, alonso de': group_top_producers.ix['vocabulario en lengua castellana y mexicana,molina, alonso de'],

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/indexing.py in __getitem__(self, key)
     68             return self._getitem_tuple(key)
     69         else:
---> 70             return self._getitem_axis(key, axis=0)
     71 
     72     def _get_label(self, label, axis=0):

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
    936                     return self._get_loc(key, axis=axis)
    937 
--> 938             return self._get_label(key, axis=axis)
    939 
    940     def _getitem_iterable(self, key, axis=0):

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/indexing.py in _get_label(self, label, axis)
     79                 return self.obj._xs(label, axis=axis)
     80             except:
---> 81                 return self.obj[label]
     82         elif (isinstance(label, tuple) and
     83                 isinstance(label[axis], slice)):

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/series.py in __getitem__(self, key)
    549     def __getitem__(self, key):
    550         try:
--> 551             result = self.index.get_value(self, key)
    552 
    553             if not np.isscalar(result):

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/index.py in get_value(self, series, key)
   4408                     raise InvalidIndexError(key)
   4409                 else:
-> 4410                     raise e1
   4411             except Exception:  # pragma: no cover
   4412                 raise e1

/home/davebshow/.virtualenvs/scientific3/lib/python3.4/site-packages/pandas/core/index.py in get_value(self, series, key)
   4392 
   4393         try:
-> 4394             return self._engine.get_value(s, k)
   4395         except KeyError as e1:
   4396             try:

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:3204)()

pandas/index.pyx in pandas.index.IndexEngine.get_value (pandas/index.c:2903)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3843)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12265)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12216)()

KeyError: 'breuissima relacion de la destruycion de las indias,casas, bartolome de las'

In [20]:
top_producer_df1.plot()


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9e39a81588>

In [ ]:
top_producer_df5 = pd.DataFrame({
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
    '': group_top_producers.ix[''],
}).fillna(0)