In [30]:
    
# Standard library.
import json
from collections import Counter
# Third-party: pandas for tabular summaries / plotting, NLTK name corpora
# for first-name gender guessing further down.
import pandas as pd
from nltk.corpus import names
# Render matplotlib figures inline; set notebook-wide figure defaults.
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]
plt.style.use('ggplot')
    
In [4]:
    
# Load the raw corpus: a JSON-lines file, one article record per line.
with open('../txt/middlemarch.json') as f:
    mm = list(f)
    
In [5]:
    
# How many articles are there? (one JSON record per line of the file)
len(mm)
    
    Out[5]:
In [6]:
    
# Parse each JSON-lines record into a dict.
data = list(map(json.loads, mm))
    
In [7]:
    
# What kind of metadata is there for each article?
# FIX: the original comprehension had its `for` clauses reversed
# (`for key in article.keys() for article in data`); clauses nest
# outermost-first, so `article` was referenced before being bound —
# a NameError on a fresh kernel (or a stale-state wrong answer).
{key for article in data for key in article}
    
    
In [8]:
    
# What are those abbreviated fields? Inspect their values on the first
# article. Use .get() so a field missing from data[0] prints a
# placeholder instead of raising KeyError.
for field in ['la', 'no', 'sp', 'ty', 'vo']: 
    print(data[0].get(field, '(not present)'))
    
    
In [79]:
    
# What years are represented by these articles?
years = [article['year'] for article in data]
# Collapse each year to its decade (e.g. 1973 -> 1970).
decades = [(year // 10) * 10 for year in years]
# Plot the number of articles per decade.
hist = pd.Series(Counter(decades))
hist.plot(kind='bar')
    
    Out[79]:
    
In [83]:
    
# Summary statistics (count, mean, quartiles, ...) for publication years.
pd.Series(years).describe()
    
    Out[83]:
In [10]:
    
# What languages are represented? The 'la' field is a one-element list,
# so unwrap its first entry while filtering.
languages = [article['la'][0] for article in data if 'la' in article]
Counter(languages)
    
    Out[10]:
In [55]:
    
# How many distinct journals are represented here?
journals = [article['journal'] for article in data if 'journal' in article]
# (Counter(journals).most_common(20) would list the top 20 by frequency.)
len(set(journals))
    
    Out[55]:
In [12]:
    
# What are the top journals per decade?
# First, bucket the articles by decade of publication.
articlesByDecade = {decade: [] for decade in set(decades)}
for article in data: 
    thisDecade = article['year'] - (article['year'] % 10)
    articlesByDecade[thisDecade].append(article)
    
In [13]:
    
# Next, collect the journal names appearing in each decade, then keep
# the five most common journals per decade.
journalsByDecade = {
    decade: [article['journal']
             for article in articles if 'journal' in article]
    for decade, articles in articlesByDecade.items()
}
journalsByDecadeStats = {decade: Counter(journals).most_common(5)
                         for decade, journals in journalsByDecade.items()}
    
In [14]:
    
# Top five journals per decade, as {decade: [(journal, count), ...]}.
journalsByDecadeStats
    
    Out[14]:
In [15]:
    
# Extract the topic lists from every article that has one.
topics = [article['topics'] for article in data if 'topics' in article]
    
In [16]:
    
# How many articles have assigned topics?
len(topics)
    
    Out[16]:
In [17]:
    
# How many distinct topics are there in total?
# Flatten the per-article topic lists, then count occurrences.
allTopics = []
for topicList in topics:
    allTopics.extend(topicList)
countedTopics = Counter(allTopics)
len(countedTopics)
    
    Out[17]:
In [40]:
    
# What are the top topics, and how often do they appear?
# (Counts are raw occurrences across all topic-tagged articles.)
countedTopics.most_common(30)
    
    Out[40]:
In [19]:
    
# What about the top topics per decade?
# Collect each decade's topics, and count how many articles in each
# decade carry a topic list at all (used later for normalization).
topicsByDecade = {decade: [] for decade in set(decades)} 
numArticlesByDecade = {decade: 0 for decade in set(decades)}
for decade, articles in articlesByDecade.items(): 
    for article in articles: 
        if 'topics' not in article:
            continue
        numArticlesByDecade[decade] += 1
        topicsByDecade[decade].extend(article['topics'])
    
In [91]:
    
# Keep the 100 most common topics within each decade.
topicsByDecadeStats = {}
for decade, decadeTopics in topicsByDecade.items():
    topicsByDecadeStats[decade] = Counter(decadeTopics).most_common(100)
    
In [92]:
    
# Normalize each topic count by the number of topic-tagged articles in
# its decade, so decades with more articles don't dominate.
normTopicsbyDecadeStats = {
    decade: [(topic, count / numArticlesByDecade[decade])
             for topic, count in stats]
    for decade, stats in topicsByDecadeStats.items()
}
    
In [93]:
    
# Turn each decade's (topic, score) pairs into a plain topic -> score dict.
normTopicsByDecadeDict = {decade: dict(pairs)
                          for decade, pairs in normTopicsbyDecadeStats.items()}
    
In [94]:
    
# Build a topics-by-decades DataFrame of normalized scores; topics absent
# from a decade get 0 rather than NaN.
topicsDecadesDF = pd.DataFrame(normTopicsByDecadeDict).fillna(0)
    
In [95]:
    
# Peek at the first few topic rows.
topicsDecadesDF.head()
    
    Out[95]:
In [96]:
    
# Order the decade columns chronologically.
# NOTE(review): assumes every decade 1900-2010 appears in the data; a
# missing decade column would raise a KeyError here — confirm.
topicsDecadesDF = topicsDecadesDF[[1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]]
    
In [103]:
    
# Total normalized score for the 'Villains' topic across all decades.
# FIX: dropped the .sort_values() that preceded the label lookup —
# indexing a Series by label is order-independent, so the sort was
# wasted work with no effect on the result.
topicsDecadesDF.sum(axis=1)['Villains']
    
    Out[103]:
In [98]:
    
# Compare trends for "thematic" topics vs. "scholarly apparatus" topics
# across decades. FIX: removed the unused `ax =` binding on the first
# plot — nothing referenced it.
topicsDecadesDF.T[['Pity', 'Irony', 'Satire', 'Women', 'Love', 'Beauty',]].plot()
topicsDecadesDF.T[['Narratology', 'Bibliographies', 'Biography', 'Literary criticism', 'Anthologies', 'Literary epigraphs']].plot()
    
    Out[98]:
    
    
In [148]:
    
# What are these underwater photography articles?!
underwaterPhotographyArticles = [
    article for article in data
    if 'Underwater photography' in article.get('topics', [])
]
for article in underwaterPhotographyArticles: 
    print('Article %s' % article['id'])
    for key in ['title', 'journal', 'year']: 
        # '???' stands in for any missing field; output is identical to
        # branching on key presence.
        print('\t%s: %s' % (key, article.get(key, '???')))
    
    
In [57]:
    
# Make a list of authors (each 'author' value is itself a list of names).
authors = []
for article in data:
    if 'author' in article:
        authors.append(article['author'])
    
In [58]:
    
# Flatten the per-article author lists into one flat list of names.
authors = [name for authorList in authors for name in authorList]
    
In [60]:
    
# How many unique author names are there?
len(set(authors))
    
    Out[60]:
In [180]:
    
# Who are the rockstar Eliot scholars? Top ten authors by article count.
Counter(authors).most_common(10)
    
    Out[180]:
In [182]:
    
# Deduplicate author names before guessing genders.
uniqueAuthors = set(authors)
    
In [183]:
    
# This is a bit iffy, but we might be able to guess the gender of authors
# by looking at their first names, via NLTK's male/female name corpora.
# Lowercase for case-insensitive lookup, and store as sets: the gender
# loop below does a membership test per author, which is O(1) on a set
# versus a full scan of a ~3000-entry list.
maleNames = {name.lower() for name in names.words('male.txt')}
femaleNames = {name.lower() for name in names.words('female.txt')}
    
In [190]:
    
# Classify each unique author's first name against the name corpora,
# collecting the matched names per category; counts are derived from the
# collected lists so the two structures can never disagree.
# NOTE(review): author.split(' ')[0] assumes names are "First Last";
# "Last, First" entries would test the surname instead — confirm format.
genderCategories = ['Male', 'Female', 'Ambiguous', 'Unknown']
genderGuessedNames = {category: [] for category in genderCategories}
for author in uniqueAuthors: 
    firstName = author.split(' ')[0].lower()
    inMale = firstName in maleNames
    inFemale = firstName in femaleNames
    if inMale and inFemale:
        category = 'Ambiguous'
    elif inMale:
        category = 'Male'
    elif inFemale:
        category = 'Female'
    else:
        category = 'Unknown'
    genderGuessedNames[category].append(firstName)
genderGuesses = {category: len(found)
                 for category, found in genderGuessedNames.items()}
    
In [195]:
    
# Pie chart of the guessed gender distribution.
pd.Series(genderGuesses).plot(kind='pie', label="Guessed Gender")
    
    Out[195]:
    
In [200]:
    
# Are there any articles with the exact same title (possible duplicates)? 
titles = [article['title'] for article in data if 'title' in article]
# NOTE(review): this flatten assumes each 'title' value is a list (as the
# 'author' field is, per the flatten above); a plain-string title would be
# split into individual characters here — confirm against the data.
titles = [item for sublist in titles for item in sublist] # Flatten
    
In [201]:
    
# Titles that appear more than once suggest duplicate records.
Counter(titles).most_common(20)
    
    Out[201]: