notebook.community

Edit and run



In [1]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
import pandas as pd



In [3]:

    
# Load the tab-separated record dump into a DataFrame.
df = pd.read_csv(
    'data/output.tsv',
    sep="\t",
)

Publication locations



In [4]:

    
# Bucket the records by their publication location.
locs = df.groupby(by='pub_location')



In [5]:

    
# For every publication location, count the non-null entries in each column.
locs_count = locs.count()



In [6]:

    
# Flag locations with more than 500 non-null control_number entries.
locs_count_mask = locs_count["control_number"].gt(500)



In [7]:

    
# Keep only the rows (locations) selected by the boolean mask.
top_cities = locs_count.loc[locs_count_mask]



In [8]:

    
# Number of locations that passed the threshold.
top_cities.shape[0]









    Out[8]:





891



In [9]:

    
# Write the filtered per-location counts to disk as CSV.
top_cities.to_csv('data/aggs/top_cities.csv')

Publication years



In [10]:

    
# Bucket the records by their publication year.
years = df.groupby(by='pub_year')



In [11]:

    
# For every publication year, count the non-null entries in each column.
year_count = years.count()



In [12]:

    
# Flag years with more than 100 non-null control_number entries.
year_count_mask = year_count["control_number"].gt(100)



In [13]:

    
# Keep only the rows (years) selected by the boolean mask.
top_years = year_count.loc[year_count_mask]



In [14]:

    
# Number of years that passed the threshold.
top_years.shape[0]









    Out[14]:





1847



In [15]:

    
# Write the filtered per-year counts to disk as CSV.
top_years.to_csv('data/aggs/top_years.csv')

Authors



In [16]:

    
# Bucket the records by author, then count the non-null entries in each
# column for every author.
authors = df.groupby(by="author")
author_count = authors.count()



In [17]:

    
# Flag authors with more than 100 non-null control_number entries.
author_count_mask = author_count["control_number"].gt(100)



In [18]:

    
# Keep only the rows (authors) selected by the boolean mask.
top_authors = author_count.loc[author_count_mask]



In [19]:

    
# Number of authors that passed the threshold.
top_authors.shape[0]









    Out[19]:





8072



In [20]:

    
# Write the filtered per-author counts to disk as CSV.
top_authors.to_csv('data/aggs/top_authors.csv')

Texts



In [21]:

    
# Bucket the records by title, then count the non-null entries in each
# column for every title.
texts = df.groupby(by="title")
text_count = texts.count()



In [22]:

    
# Flag titles with more than 50 non-null control_number entries.
text_count_mask = text_count["control_number"].gt(50)



In [23]:

    
# Keep only the rows (titles) selected by the boolean mask.
top_texts = text_count.loc[text_count_mask]



In [24]:

    
# Number of titles that passed the threshold.
top_texts.shape[0]









    Out[24]:





3843



In [25]:

    
# Write the filtered per-title counts to disk as CSV.
top_texts.to_csv("data/aggs/top_texts.csv")

Translations



In [26]:

    
# Bucket the records by the value of the `translation` column.
trans = df.groupby(by="translation")



In [27]:

    
# Display, per `translation` value, the number of non-null entries in each column.
trans.count()









    Out[27]:






  
    
      
      control_number
      title
      uniform_title
      author
      publisher
      pub_location
      pub_year
      prev_language
    
    
      translation
      
      
      
      
      
      
      
      
    
  
  
    
      False
      12209457
      12208899
      379087
      9371027
      10875393
      11008791
      11284861
      0
    
    
      True
      978788
      978772
      477838
      876342
      961041
      961593
      962432
      935493



In [28]:

    
# Bucket the records by the (translation, prev_language) pair.
lang = df.groupby(by=['translation', 'prev_language'])



In [29]:

    
# For every (translation, prev_language) pair, count the non-null entries in each column.
lang_count = lang.count()



In [30]:

    
# Flag (translation, prev_language) pairs with more than 5 non-null
# control_number entries.
mask = lang_count["control_number"].gt(5)



In [31]:

    
# Keep only the rows (language pairs) selected by the boolean mask.
top_langs = lang_count.loc[mask]



In [32]:

    
# Number of (translation, prev_language) pairs that passed the threshold.
top_langs.shape[0]









    Out[32]:





271



In [33]:

    
# Write the filtered per-language-pair counts to disk as CSV.
top_langs.to_csv("data/aggs/top_langs.csv")



In [34]:

    
# Bucket the records by the (pub_location, pub_year) pair.
country_year = df.groupby(by=["pub_location", "pub_year"])



In [35]:

    
# For every (pub_location, pub_year) pair, count the non-null entries in each column.
country_year_count = country_year.count()



In [36]:

    
# Flag (pub_location, pub_year) pairs with more than 5 non-null
# control_number entries.
cy_mask = country_year_count["control_number"].gt(5)



In [37]:

    
# Keep only the rows (location/year pairs) selected by the boolean mask.
top_cy = country_year_count.loc[cy_mask]



In [38]:

    
# Number of (pub_location, pub_year) pairs that passed the threshold.
top_cy.shape[0]









    Out[38]:





107189



In [39]:

    
# Write the filtered per-location-and-year counts to disk as CSV.
top_cy.to_csv("data/aggs/cities_by_year.csv")

Coarse Graph



In [4]:

    
# Project the frame down to the four columns used for the coarse edge list.
edges = df.loc[:, ["title", "author", "pub_location", "pub_year"]]



In [5]:

    
# Write the edge columns to disk as a tab-separated file.
edges.to_csv("data/aggs/book_city_edges.tsv", sep="\t")



In [6]:

    
# Pull out the publication-location column on its own.
# NOTE: the name `locs` is also used for a groupby object earlier in this
# notebook; here it is bound to a single column instead.
locs = df.loc[:, "pub_location"]



In [7]:

    
# Write the publication-location column to disk as a tab-separated file.
locs.to_csv("data/aggs/book_city_locs.tsv", sep="\t")

Refined Graph



In [4]:

    
# Project the frame down to the edge columns, this time without pub_location.
edges = df.loc[:, ["title", "author", "pub_year"]]



In [6]:

    
# Read the locations from clean_locs.csv.
# NOTE(review): presumably a cleaned version of the pub_location column,
# one row per record in the same order — confirm against how the file is made.
locs = pd.read_csv(
    "data/aggs/clean_locs.csv",
)



In [7]:

    
# Row counts of the edge frame and the location frame, shown side by side.
edges.shape[0], locs.shape[0]









    Out[7]:





(13188245, 13188245)



In [11]:

    
# Join the edge columns and the location frame side by side (column-wise).
# pd.concat(axis=1) aligns rows on index labels rather than on position, so
# two frames whose indexes differ would silently yield NaN-padded rows.
# Equal lengths alone do not rule that out, so check the indexes explicitly
# and fail loudly instead of writing a corrupted edge list.
if not edges.index.equals(locs.index):
    raise ValueError(
        "edges and locs must share the same index to be joined row-for-row "
        "(got %d and %d rows)" % (len(edges), len(locs))
    )
new_edges = pd.concat([edges, locs], axis=1)



In [14]:

    
# Write the combined edge/location frame to disk as a tab-separated file.
new_edges.to_csv("data/aggs/refined_edges.tsv", sep="\t")



In [ ]:

	control_number	title	uniform_title	author	publisher	pub_location	pub_year	prev_language
translation
False	12209457	12208899	379087	9371027	10875393	11008791	11284861	0
True	978788	978772	477838	876342	961041	961593	962432	935493