In [1]:
%pylab inline
In [2]:
import pandas as pd
In [3]:
# Load the tab-separated export; every section below reads from this frame.
df = pd.read_csv("data/output.tsv", delimiter="\t")
Publication locations
In [4]:
locs = df.groupby('pub_location')
In [5]:
locs_count = locs.count()
In [6]:
locs_count_mask = locs_count["control_number"] > 500
In [7]:
top_cities = locs_count[locs_count_mask]
In [8]:
len(top_cities)
Out[8]:
In [9]:
top_cities.to_csv('data/aggs/top_cities.csv')
Publication years
In [10]:
years = df.groupby('pub_year')
In [11]:
year_count = years.count()
In [12]:
year_count_mask = year_count["control_number"] > 100
In [13]:
top_years = year_count[year_count_mask]
In [14]:
len(top_years)
Out[14]:
In [15]:
top_years.to_csv('data/aggs/top_years.csv')
Authors
In [16]:
authors = df.groupby("author")
author_count = authors.count()
In [17]:
author_count_mask = author_count["control_number"] > 100
In [18]:
top_authors = author_count[author_count_mask]
In [19]:
len(top_authors)
Out[19]:
In [20]:
top_authors.to_csv('data/aggs/top_authors.csv')
Texts
In [21]:
texts = df.groupby("title")
text_count = texts.count()
In [22]:
text_count_mask = text_count["control_number"] > 50
In [23]:
top_texts = text_count[text_count_mask]
In [24]:
len(top_texts)
Out[24]:
In [25]:
top_texts.to_csv("data/aggs/top_texts.csv")
Translations
In [26]:
trans = df.groupby("translation")
In [27]:
trans.count()
Out[27]:
In [28]:
lang = df.groupby(['translation', 'prev_language'])
In [29]:
lang_count = lang.count()
In [30]:
mask = lang_count["control_number"] > 5
In [31]:
top_langs = lang_count[mask]
In [32]:
len(top_langs)
Out[32]:
In [33]:
top_langs.to_csv("data/aggs/top_langs.csv")
In [34]:
country_year = df.groupby(["pub_location", "pub_year"])
In [35]:
country_year_count = country_year.count()
In [36]:
cy_mask = country_year_count["control_number"] > 5
In [37]:
top_cy = country_year_count[cy_mask]
In [38]:
len(top_cy)
Out[38]:
In [39]:
top_cy.to_csv("data/aggs/cities_by_year.csv")
Coarse Graph
In [4]:
edges = df[["title", "author", "pub_location","pub_year"]]
In [5]:
edges.to_csv("data/aggs/book_city_edges.tsv", sep="\t")
In [6]:
locs = df["pub_location"]
In [7]:
locs.to_csv("data/aggs/book_city_locs.tsv", sep="\t")
Refined Graph
In [4]:
edges = df[["title", "author","pub_year"]]
In [6]:
locs = pd.read_csv("data/aggs/clean_locs.csv")
In [7]:
len(edges), len(locs)
Out[7]:
In [11]:
new_edges = pd.concat([edges, locs], axis=1)
In [14]:
new_edges.to_csv("data/aggs/refined_edges.tsv", sep="\t")
In [ ]: