In [30]:
import json
from collections import Counter
import pandas as pd
from nltk.corpus import names
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]
plt.style.use('ggplot')
In [4]:
# Load the data.
with open('../txt/middlemarch.json') as f:
    mm = f.readlines()
In [5]:
# How many articles are there?
len(mm)
Out[5]:
In [6]:
# Parse the data.
data = [json.loads(line) for line in mm]
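As a side note, this two-step load works because the file is in JSON Lines format (one JSON object per line). A minimal sketch of an equivalent single-pass load over the same path, which also skips any blank lines:

import json

data = []
with open('../txt/middlemarch.json') as f:
    for line in f:
        line = line.strip()
        if line:  # skip blank lines, if any
            data.append(json.loads(line))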
In [7]:
# What kind of metadata is there for each article?
set(key for article in data for key in article.keys())
In [8]:
# What are those abbreviated fields?
for field in ['la', 'no', 'sp', 'ty', 'vo']:
    print(data[0][field])
In [79]:
# What years are represented by these articles?
years = [item['year'] for item in data]
# And what decades?
decades = [(year - (year % 10)) for year in years]
# Plot the articles by decade.
hist = pd.Series(Counter(decades)).sort_index()  # sort_index keeps the decades in chronological order
hist.plot(kind='bar')
Out[79]:
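The two decade-flooring idioms used in this notebook, `year - (year % 10)` here and `(year // 10) * 10` below, are equivalent integer arithmetic; a quick illustrative check:

year = 1987
assert year - (year % 10) == (year // 10) * 10 == 1980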
In [83]:
pd.Series(years).describe()
Out[83]:
In [10]:
# What languages are represented?
languages = [article['la'] for article in data if 'la' in article]
languages = [item[0] for item in languages]  # Each 'la' value is a one-element list; unwrap it.
Counter(languages)
Out[10]:
In [55]:
# How many distinct journals are represented here? (Uncomment the last line for the most common ones.)
journals = [article['journal'] for article in data if 'journal' in article]
len(set(journals))
# Counter(journals).most_common(20)
Out[55]:
In [12]:
# What are the top journals per decade?
# First, make a dictionary of articles by decade.
articlesByDecade = {decade: [] for decade in set(decades)}
for article in data:
    thisDecade = (article['year'] // 10) * 10
    articlesByDecade[thisDecade].append(article)
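An equivalent build using collections.defaultdict, which avoids pre-seeding the dictionary with every decade (a sketch; same result, since every decade key here comes from the data itself):

from collections import defaultdict

articlesByDecade = defaultdict(list)
for article in data:
    articlesByDecade[(article['year'] // 10) * 10].append(article)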
In [13]:
# Next, make a dictionary of journal names by decade.
journalsByDecade = {decade: [] for decade in set(decades)}
for decade, articles in articlesByDecade.items():
    for article in articles:
        if 'journal' in article:
            journalsByDecade[decade].append(article['journal'])
journalsByDecadeStats = {decade: Counter(journals).most_common(5) for decade, journals in journalsByDecade.items()}
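For comparison, the same top-journals-per-decade tally can be sketched in pandas, assuming each record has a 'year' and, optionally, a 'journal' field:

journalsDF = pd.DataFrame([{'decade': (a['year'] // 10) * 10, 'journal': a['journal']}
                           for a in data if 'journal' in a])
journalsDF.groupby('decade')['journal'].value_counts().groupby(level='decade').head(5)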
In [14]:
journalsByDecadeStats
Out[14]:
In [15]:
# Extract topics.
topics = [item['topics'] for item in data if 'topics' in item]
In [16]:
# How many articles are there that have assigned topics?
len(topics)
Out[16]:
In [17]:
# How many total topics are there?
allTopics = [item for sublist in topics for item in sublist] # Flatten list.
countedTopics = Counter(allTopics)
len(countedTopics)
Out[17]:
In [40]:
# What are the top topics, and how often do they appear?
countedTopics.most_common(30)
Out[40]:
In [19]:
# What about the top topics per decade?
# Make a dictionary of topics by decade.
topicsByDecade = {decade: [] for decade in set(decades)}
numArticlesByDecade = {decade: 0 for decade in set(decades)}
for decade, articles in articlesByDecade.items():
    for article in articles:
        if 'topics' in article:
            numArticlesByDecade[decade] += 1
            for topic in article['topics']:
                topicsByDecade[decade].append(topic)
In [91]:
# Get stats about the topics in each decade.
topicsByDecadeStats = {decade: Counter(topics).most_common(100) for decade, topics in topicsByDecade.items()}
In [92]:
normTopicsByDecadeStats = {decade: [] for decade in set(decades)}
for decade in topicsByDecadeStats:
    for topic in topicsByDecadeStats[decade]:
        normTopicsByDecadeStats[decade].append((topic[0], topic[1] / numArticlesByDecade[decade]))
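The normalization divides each topic's raw count by the number of topic-bearing articles in that decade, so scores are comparable across decades of different sizes. Illustrative only, with made-up numbers:

# A topic counted 12 times in a decade with 60 topic-bearing
# articles gets a normalized score of 0.2.
assert 12 / 60 == 0.2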
In [93]:
# Convert each decade's list of (topic, score) pairs into a dict for DataFrame construction.
normTopicsByDecadeDict = {decade: {topic: score for topic, score in topics} for decade, topics in normTopicsByDecadeStats.items()}
In [94]:
topicsDecadesDF = pd.DataFrame(normTopicsByDecadeDict).fillna(0)
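When pd.DataFrame is given a dict of dicts like this, the outer keys (decades) become columns and the inner keys (topic names) become the row index, which is why individual decades can be selected as columns below. A toy illustration:

toy = pd.DataFrame({1970: {'Women': 0.2}, 1980: {'Women': 0.25, 'Irony': 0.1}}).fillna(0)
# toy has index ['Women', 'Irony'] and columns [1970, 1980]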
In [95]:
topicsDecadesDF.head()
Out[95]:
In [96]:
topicsDecadesDF = topicsDecadesDF[[1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]]
In [103]:
# How prominent is the 'Villains' topic overall, summed across decades?
topicsDecadesDF.sum(axis=1)['Villains']
Out[103]:
In [98]:
# Plot normalized trends: first some thematic topics, then some scholarly-apparatus topics.
topicsDecadesDF.T[['Pity', 'Irony', 'Satire', 'Women', 'Love', 'Beauty']].plot()
topicsDecadesDF.T[['Narratology', 'Bibliographies', 'Biography', 'Literary criticism', 'Anthologies', 'Literary epigraphs']].plot()
Out[98]:
In [148]:
# What are these underwater photography articles?!
underwaterPhotographyArticles = [article for article in data if 'topics' in article and 'Underwater photography' in article['topics']]
for article in underwaterPhotographyArticles:
    print('Article %s' % article['id'])
    for key in ['title', 'journal', 'year']:
        if key in article:
            print('\t%s: %s' % (key, article[key]))
        else:
            print('\t%s: ???' % key)
In [57]:
# Make a list of authors.
authors = [article['author'] for article in data if 'author' in article]
In [58]:
authors = [item for sublist in authors for item in sublist]  # Flatten: each 'author' field is a list of names.
In [60]:
len(set(authors))
Out[60]:
In [180]:
# Who are the rockstar Eliot scholars?
Counter(authors).most_common(10)
Out[180]:
In [182]:
uniqueAuthors = set(authors)
In [183]:
# This is a bit iffy, but we might be able to guess the gender of authors by looking at their first names.
maleNames = names.words('male.txt')
femaleNames = names.words('female.txt')
# Lowercase the names, and use sets so membership checks are fast and case-insensitive.
maleNames = {name.lower() for name in maleNames}
femaleNames = {name.lower() for name in femaleNames}
In [190]:
genderCategories = ['Male', 'Female', 'Ambiguous', 'Unknown']
genderGuesses = {category: 0 for category in genderCategories}
genderGuessedNames = {category: [] for category in genderCategories}
for author in uniqueAuthors:
    firstName = author.split(' ')[0].lower()
    if firstName in maleNames and firstName in femaleNames:
        genderGuesses['Ambiguous'] += 1
        genderGuessedNames['Ambiguous'].append(firstName)
    elif firstName in maleNames:
        genderGuesses['Male'] += 1
        genderGuessedNames['Male'].append(firstName)
    elif firstName in femaleNames:
        genderGuesses['Female'] += 1
        genderGuessedNames['Female'].append(firstName)
    else:
        genderGuesses['Unknown'] += 1
        genderGuessedNames['Unknown'].append(firstName)
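The same heuristic can be factored into a small function for reuse, with all the caveats noted above about first-name-based guessing. A sketch (guess_gender and genderGuessCounts are hypothetical names, not part of NLTK or the cells above):

def guess_gender(author):
    """Guess a gender label from an author's first name (very rough heuristic)."""
    first = author.split(' ')[0].lower()
    if first in maleNames and first in femaleNames:
        return 'Ambiguous'
    if first in maleNames:
        return 'Male'
    if first in femaleNames:
        return 'Female'
    return 'Unknown'

genderGuessCounts = Counter(guess_gender(author) for author in uniqueAuthors)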
In [195]:
pd.Series(genderGuesses).plot(kind='pie', label="Guessed Gender")
Out[195]:
In [200]:
# Are there any articles with the exact same title (possible duplicates)?
titles = [article['title'] for article in data if 'title' in article]
titles = [item for sublist in titles for item in sublist]  # Flatten: each 'title' field is a list.
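A sketch that filters the tally down to just the repeated titles, i.e. the duplicate candidates:

duplicateTitles = {title: count for title, count in Counter(titles).items() if count > 1}
len(duplicateTitles)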
In [201]:
Counter(titles).most_common(20)
Out[201]: