In [1]:
# NOTE(review): %pylab star-imports numpy and matplotlib names into the
# interactive namespace (see this cell's output). No cell visible in this
# notebook appears to use them - consider explicit imports instead.
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import pandas as pd

In [3]:
# Load the tab-separated export; the aggregations below all read from `df`.
df = pd.read_csv(
    'data/output.tsv',
    delimiter="\t",
)

Publication locations


In [4]:
# Bucket every record by its publication location.
locs = df.groupby(by='pub_location')

In [5]:
# Per-location tally of non-null values in each remaining column.
locs_count = locs.count()

In [6]:
# Flag locations with strictly more than 500 non-null control numbers.
locs_count_mask = locs_count["control_number"].gt(500)

In [7]:
# Keep only the rows of the count table that passed the threshold.
top_cities = locs_count.loc[locs_count_mask]

In [8]:
# Number of locations that cleared the threshold.
top_cities.shape[0]


Out[8]:
891

In [9]:
# Write the filtered per-location counts to CSV.
top_cities.to_csv('data/aggs/top_cities.csv')

Publication years


In [10]:
# Bucket every record by its publication year.
years = df.groupby(by='pub_year')

In [11]:
# Per-year tally of non-null values in each remaining column.
year_count = years.count()

In [12]:
# Flag years with strictly more than 100 non-null control numbers.
year_count_mask = year_count["control_number"].gt(100)

In [13]:
# Keep only the years that passed the threshold.
top_years = year_count.loc[year_count_mask]

In [14]:
# Number of years that cleared the threshold.
top_years.shape[0]


Out[14]:
1847

In [15]:
# Write the filtered per-year counts to CSV.
top_years.to_csv('data/aggs/top_years.csv')

Authors


In [16]:
# Group records by author, then tally the non-null values in every other
# column for each author.
authors = df.groupby(by="author")
author_count = authors.count()

In [17]:
# Flag authors with strictly more than 100 non-null control numbers.
author_count_mask = author_count["control_number"].gt(100)

In [18]:
# Keep only the authors that passed the threshold.
top_authors = author_count.loc[author_count_mask]

In [19]:
# Number of authors that cleared the threshold.
top_authors.shape[0]


Out[19]:
8072

In [20]:
# Write the filtered per-author counts to CSV.
top_authors.to_csv('data/aggs/top_authors.csv')

Texts


In [21]:
# Group records by title, then tally the non-null values in every other
# column for each title.
texts = df.groupby(by="title")
text_count = texts.count()

In [22]:
# Flag titles with strictly more than 50 non-null control numbers.
text_count_mask = text_count["control_number"].gt(50)

In [23]:
# Keep only the titles that passed the threshold.
top_texts = text_count.loc[text_count_mask]

In [24]:
# Number of titles that cleared the threshold.
top_texts.shape[0]


Out[24]:
3843

In [25]:
# Write the filtered per-title counts to CSV.
top_texts.to_csv("data/aggs/top_texts.csv")

Translations


In [26]:
# Split the records by the value of the `translation` column.
trans = df.groupby(by="translation")

In [27]:
# Display the non-null count of every column for each value of `translation`.
trans.count()


Out[27]:
control_number title uniform_title author publisher pub_location pub_year prev_language
translation
False 12209457 12208899 379087 9371027 10875393 11008791 11284861 0
True 978788 978772 477838 876342 961041 961593 962432 935493

In [28]:
# Group by translation flag and previous language together.
# Rows whose `prev_language` is missing fall out of the groups (groupby drops
# missing keys by default); the counts table above shows `prev_language` is
# never populated when `translation` is False.
lang = df.groupby(by=['translation', 'prev_language'])

In [29]:
# Tally of non-null values per column for each (translation, prev_language) pair.
lang_count = lang.count()

In [30]:
# Flag (translation, prev_language) pairs with strictly more than 5 non-null
# control numbers.
mask = lang_count["control_number"].gt(5)

In [31]:
# Keep only the language pairs that passed the threshold.
top_langs = lang_count.loc[mask]

In [32]:
# Number of (translation, prev_language) pairs that cleared the threshold.
top_langs.shape[0]


Out[32]:
271

In [33]:
# Write the filtered per-language counts to CSV.
top_langs.to_csv("data/aggs/top_langs.csv")

In [34]:
# Group records by publication location and publication year together.
country_year = df.groupby(by=["pub_location", "pub_year"])

In [35]:
# Tally of non-null values per column for each (pub_location, pub_year) pair.
country_year_count = country_year.count()

In [36]:
# Flag (pub_location, pub_year) pairs with strictly more than 5 non-null
# control numbers.
cy_mask = country_year_count["control_number"].gt(5)

In [37]:
# Keep only the location/year pairs that passed the threshold.
top_cy = country_year_count.loc[cy_mask]

In [38]:
# Number of location/year pairs that cleared the threshold.
top_cy.shape[0]


Out[38]:
107189

In [39]:
# Write the filtered per-location, per-year counts to CSV.
top_cy.to_csv("data/aggs/cities_by_year.csv")

Coarse Graph


In [4]:
# Project the full table down to the four columns exported as edges.
edges = df.loc[:, ["title", "author", "pub_location", "pub_year"]]

In [5]:
# Write the edge table as a tab-separated file.
edges.to_csv("data/aggs/book_city_edges.tsv", sep="\t")

In [6]:
# Pull out the raw publication-location column on its own.
# Note: this rebinds `locs`, which earlier in the notebook held the
# groupby-by-location object.
locs = df.loc[:, "pub_location"]

In [7]:
# Write the location column as a tab-separated file.
# NOTE(review): the default for `header` in Series.to_csv differs between
# pandas versions - confirm whether a header row is expected downstream.
locs.to_csv("data/aggs/book_city_locs.tsv", sep="\t")

Refined Graph


In [4]:
# Project the full table down to the edge columns, leaving location out
# (location is loaded separately for this section).
edges = df.loc[:, ["title", "author", "pub_year"]]

In [6]:
# Load the cleaned locations table.
# NOTE(review): no cell visible in this notebook writes clean_locs.csv;
# presumably it is produced by an external cleaning step - confirm.
locs = pd.read_csv("data/aggs/clean_locs.csv")

In [7]:
# Row counts of the two frames about to be joined side by side.
edges.shape[0], locs.shape[0]


Out[7]:
(13188245, 13188245)

In [11]:
# Pair every edge row with its cleaned location.
# pd.concat(axis=1) aligns on index labels, not on row position, so verify
# that both frames have the same length and identical indexes first;
# otherwise the default outer join would silently produce NaN-padded rows.
assert len(edges) == len(locs), "edges and locs must have the same number of rows"
assert edges.index.equals(locs.index), "edges and locs must share the same index"
new_edges = pd.concat([edges, locs], axis=1)

In [14]:
# Write the combined edge/location table as a tab-separated file.
new_edges.to_csv("data/aggs/refined_edges.tsv", sep="\t")

In [ ]: