Statistics about the Full JSTOR Corpus


In [30]:
import json
from collections import Counter
import pandas as pd
from nltk.corpus import names
%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]
plt.style.use('ggplot')

In [4]:
# Load the data. 
with open('../txt/middlemarch.json') as f: 
    mm = f.readlines()

In [5]:
# How many articles are there? 
len(mm)


Out[5]:
6069

In [6]:
# Parse the data. 
data = [json.loads(line) for line in mm]

In [7]:
# What kind of metadata is there for each article? 
set([key for key in article.keys() for article in data])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-8d0dd7d5cf4e> in <module>()
      1 # What kind of metadata is there for each article?
----> 2 set([key for key in article.keys() for article in data])

NameError: name 'article' is not defined

In [8]:
# What are those abbreviated fields? 

for field in ['la', 'no', 'sp', 'ty', 'vo']: 
    print(data[0][field])


['eng']
['1']
69
fla
['39']

In [79]:
# What years are represented by these articles? 
years = [item['year'] for item in data]

# And what decades? 
decades = [(year - (year % 10)) for year in years]

# Plot the articles by decade. 
hist = pd.Series(Counter(decades))
hist.plot(kind='bar')


Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f292e04e828>

In [83]:
pd.Series(years).describe()


Out[83]:
count    6069.000000
mean     1990.417037
std        20.187884
min      1872.000000
25%      1980.000000
50%      1994.000000
75%      2006.000000
max      2016.000000
dtype: float64

In [10]:
# What languages are represented? 

languages = [article['la'] for article in data if 'la' in article]
languages = [item[0] for item in languages] # Flatten
Counter(languages)


Out[10]:
Counter({'EN': 2,
         'ara': 2,
         'cze': 1,
         'en': 263,
         'eng': 5747,
         'fre': 17,
         'ger': 15,
         'gle': 1,
         'heb': 2,
         'ita': 6,
         'por': 1,
         'spa': 10,
         'wel': 2})

In [55]:
# What are the most common journals represented here? 
journals = [article['journal'] for article in data if 'journal' in article]
len(set(journals))
# Counter(journals).most_common(20)


Out[55]:
365

In [12]:
# What are the top journals per decade? 

# First, make a dictionary of articles by decade. 
articlesByDecade = {decade: [] for decade in set(decades)}
for article in data: 
    thisDecade = (article['year']//10)*10 
    articlesByDecade[thisDecade].append(article)

In [13]:
# Next, make a dictionary of journal names by decade. 
journalsByDecade = {decade: [] for decade in set(decades)}
for decade, articles in articlesByDecade.items(): 
    for article in articles: 
        if 'journal' in article: 
            journalsByDecade[decade].append(article['journal'])
journalsByDecadeStats = {decade: Counter(journals).most_common(5) for decade, journals in journalsByDecade.items()}

In [14]:
journalsByDecadeStats


Out[14]:
{1870: [('Jahrbücher für Nationalökonomie und Statistik / Journal of Economics and Statistics',
   1),
  ('Bristol Selected Pamphlets', 1)],
 1880: [('Bristol Selected Pamphlets', 3),
  ('Mind', 2),
  ('The Journal of the Royal Asiatic Society of Great Britain and Ireland',
   1)],
 1890: [('The School Review', 2),
  ('The English Historical Review', 1),
  ('International Journal of Ethics', 1),
  ('Mind', 1),
  ('Modern Art', 1)],
 1900: [('Brush and Pencil', 1),
  ('International Journal of Ethics', 1),
  ('The Sewanee Review', 1)],
 1910: [('The Sewanee Review', 3),
  ('Proceedings of the Aristotelian Society', 2),
  ('International Journal of Ethics', 1),
  ('The School Review', 1),
  ('The Pennsylvania Magazine of History and Biography', 1)],
 1920: [('The English Historical Review', 1),
  ('Modern Philology', 1),
  ('Texas Review', 1),
  ('The Modern Language Review', 1),
  ('The Sewanee Review', 1)],
 1930: [('Studies in Philology', 2),
  ('The Sewanee Review', 2),
  ('American Literature', 1),
  ('The Modern Language Review', 1),
  ('ELH', 1)],
 1940: [('Modern Philology', 4),
  ('The Modern Language Review', 4),
  ('Modern Language Notes', 3),
  ('College English', 3),
  ('The Sewanee Review', 1)],
 1950: [('Nineteenth-Century Fiction', 19),
  ('College English', 10),
  ('Victorian Studies', 9),
  ('Modern Philology', 8),
  ('The Review of English Studies', 8)],
 1960: [('Victorian Studies', 51),
  ('Nineteenth-Century Fiction', 40),
  ('The Modern Language Review', 34),
  ('The Review of English Studies', 23),
  ('Studies in English Literature, 1500-1900', 10)],
 1970: [('Nineteenth-Century Fiction', 69),
  ('Victorian Studies', 60),
  ('The Review of English Studies', 47),
  ('The Modern Language Review', 31),
  ('NOVEL: A Forum on Fiction', 28)],
 1980: [('Victorian Studies', 86),
  ('Nineteenth-Century Fiction', 63),
  ('The Review of English Studies', 60),
  ('The Modern Language Review', 48),
  ('Studies in the Novel', 40)],
 1990: [('Victorian Studies', 82),
  ('George Eliot - George Henry Lewes Studies', 80),
  ('Nineteenth-Century Literature', 51),
  ('NOVEL: A Forum on Fiction', 31),
  ('The Modern Language Review', 31)],
 2000: [('Victorian Studies', 81),
  ('George Eliot - George Henry Lewes Studies', 75),
  ('Studies in English Literature, 1500-1900', 29),
  ('Studies in the Novel', 25),
  ('NOVEL: A Forum on Fiction', 24)],
 2010: [('Victorian Studies', 55),
  ('Style', 55),
  ('George Eliot - George Henry Lewes Studies', 51),
  ('Nineteenth-Century Literature', 31),
  ('The Modern Language Review', 12)]}

In [15]:
# Extract topics. 
topics = [item['topics'] for item in data if 'topics' in item]

In [16]:
# How many articles are there that have assigned topics? 
len(topics)


Out[16]:
5723

In [17]:
# How many total topics are there? 
allTopics = [item for sublist in topics for item in sublist] # Flatten list. 
countedTopics = Counter(allTopics)
len(countedTopics)


Out[17]:
14228

In [40]:
# What are the top topics, and how often do they appear? 
countedTopics.most_common(30)


Out[40]:
[('Novelists', 1337),
 ('Novels', 874),
 ('Pity', 672),
 ('Sympathy', 656),
 ('Irony', 642),
 ('Literary criticism', 510),
 ('Poetry', 468),
 ('Music criticism', 453),
 ('Women', 424),
 ('Anthologies', 406),
 ('Narratology', 403),
 ('Bibliographies', 391),
 ('Biographers', 390),
 ('Satire', 380),
 ('Idealism', 378),
 ('Love', 369),
 ('Vocation', 361),
 ('Biography', 353),
 ('Fear', 349),
 ('Humor', 346),
 ('Parody', 341),
 ('Villains', 330),
 ('Melodrama', 326),
 ('Sadness', 309),
 ('Aristocracy', 292),
 ('Fables', 288),
 ('Envy', 288),
 ('Autobiographies', 285),
 ('Sonnets', 277),
 ('Literary epigraphs', 270)]

In [19]:
# What about the top topics per decade? 

# Make a dictionary of topics by decade. 
topicsByDecade = {decade: [] for decade in set(decades)} 
numArticlesByDecade = {decade: 0 for decade in set(decades)}
for decade, articles in articlesByDecade.items(): 
    for article in articles: 
        if 'topics' in article: 
            numArticlesByDecade[decade] += 1
            for topic in article['topics']: 
                topicsByDecade[decade].append(topic)

In [91]:
# Get stats about the topics in each decade. 
topicsByDecadeStats = {decade: Counter(topics).most_common(100) for decade, topics in topicsByDecade.items()}

In [92]:
normTopicsbyDecadeStats = {decade: [] for decade in set(decades)}
for decade in topicsByDecadeStats: 
    for topic in topicsByDecadeStats[decade]: 
        normTopicsbyDecadeStats[decade].append((topic[0], topic[1]/numArticlesByDecade[decade]))

In [93]:
normTopicsByDecadeDict = {decade: {item[0]: item[1] for item in topics} for decade, topics in normTopicsbyDecadeStats.items()}

In [94]:
topicsDecadesDF = pd.DataFrame(normTopicsByDecadeDict).fillna(0)

In [95]:
topicsDecadesDF.head()


Out[95]:
1870 1880 1890 1900 1910 1920 1930 1940 1950 1960 1970 1980 1990 2000 2010
Abnormal reflexes 0.083333 0.0 0.0 0.0 0.0 0.0 0.00 0.0 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000
Absolute music 0.000000 0.0 0.0 0.0 0.0 0.0 0.08 0.0 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000
Actuality 0.000000 0.0 0.0 0.0 0.0 0.0 0.00 0.0 0.036496 0.035714 0.034783 0.0 0.000000 0.000000 0.000000
Adjectives 0.000000 0.0 0.0 0.0 0.0 0.0 0.08 0.0 0.000000 0.000000 0.000000 0.0 0.000000 0.000000 0.000000
Adultery 0.000000 0.0 0.0 0.0 0.0 0.0 0.00 0.0 0.000000 0.000000 0.000000 0.0 0.045287 0.036782 0.032967

In [96]:
topicsDecadesDF = topicsDecadesDF[[1900, 1910, 1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]]

In [103]:
topicsDecadesDF.sum(axis=1).sort_values()['Villains']


Out[103]:
0.54686345421379301

In [98]:
ax = topicsDecadesDF.T[['Pity', 'Irony', 'Satire', 'Women', 'Love', 'Beauty',]].plot()
topicsDecadesDF.T[['Narratology', 'Bibliographies', 'Biography', 'Literary criticism', 'Anthologies', 'Literary epigraphs']].plot()


Out[98]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f292d80ec50>

In [148]:
# What are these underwater photography articles?!
underwaterPhotographyArticles = [article for article in data if 'topics' in article and 'Underwater photography' in article['topics']]
for article in underwaterPhotographyArticles: 
    print('Article %s' % article['id'])
    for key in ['title', 'journal', 'year']: 
            if key in article: 
                print('\t%s: %s' % (key, article[key]))
            else: 
                print('\t%s: ???' % (key))


Article e74da730-0474-37d5-a8bb-8c2fdd073bfe
	title: ???
	journal: ???
	year: 1937
Article 495c2689-66be-33da-9c09-254f6962bd22
	title: ???
	journal: ???
	year: 1937
Article a079e6e7-6f21-3284-9a0b-e05d860855ba
	title: ???
	journal: ???
	year: 1937
Article 416ddbbd-a6be-3df9-a3de-acaa2f97dd25
	title: ???
	journal: ???
	year: 1937
Article f8231693-237d-352f-b5b6-e00cbc426290
	title: ['Some Sources for the History of the School Manager, 1800-1902']
	journal: British Journal of Educational Studies
	year: 1973
Article 73aa7323-adc9-360a-8dae-7b26402b2903
	title: ['Health Policies, Health Politics', 'The British and American Experience, 1911-1965', 'NOTE ON SOURCES']
	journal: ???
	year: 1986

In [57]:
# Make a list of authors. 
authors = [article['author'] for article in data if 'author' in article]

In [58]:
authors = [item for sublist in authors for item in sublist]

In [60]:
len(set(authors))


Out[60]:
4231

In [180]:
# Who are the rockstar Eliot scholars? 
Counter(authors).most_common(10)


Out[180]:
[('U. C. Knoepflmacher', 26),
 ('DONALD HAWES', 24),
 ('George Levine', 24),
 ('J. Hillis Miller', 20),
 ('JEROME MECKIER', 16),
 ('Rosemary Ashton', 15),
 ('Edward H. Cohen', 15),
 ('Daniel Cottom', 15),
 ('FELICIA BONAPARTE', 14),
 ('G. B. Tennyson', 14)]

In [182]:
uniqueAuthors = set(authors)

In [183]:
# This is a bit iffy, but we might be able to guess the gender of authors by looking at their first names. 
maleNames = names.words('male.txt')
femaleNames = names.words('female.txt')

# Lowercase the names so we can look them up easier. 
maleNames = [name.lower() for name in maleNames]
femaleNames = [name.lower() for name in femaleNames]

In [190]:
genderCategories = ['Male', 'Female', 'Ambiguous', 'Unknown']
genderGuesses = {category: 0 for category in genderCategories}
genderGuessedNames = {category: [] for category in genderCategories}

for author in uniqueAuthors: 
    firstName = author.split(' ')[0].lower()
    if firstName in maleNames and firstName in femaleNames: 
        genderGuesses['Ambiguous'] += 1
        genderGuessedNames['Ambiguous'].append(firstName)
    elif firstName in maleNames: 
        genderGuesses['Male'] += 1 
        genderGuessedNames['Male'].append(firstName)
    elif firstName in femaleNames: 
        genderGuesses['Female'] += 1
        genderGuessedNames['Female'].append(firstName)
    else: 
        genderGuesses['Unknown'] += 1   
        genderGuessedNames['Unknown'].append(firstName)

In [195]:
pd.Series(genderGuesses).plot(kind='pie', label="Guessed Gender")


Out[195]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f66d8e125f8>

In [200]:
# Are there any articles with the exact same title (possible duplicates)? 
titles = [article['title'] for article in data if 'title' in article]
titles = [item for sublist in titles for item in sublist] # Flatten

In [201]:
Counter(titles).most_common(20)


Out[201]:
[('Front Matter', 212),
 ('Back Matter', 209),
 ('Index', 166),
 ('Volume Information', 125),
 ('Notes', 89),
 ('INDEX', 70),
 ('Bibliography', 47),
 ('Summary of Periodical Literature', 42),
 ('NOTES', 31),
 ('Recent Studies in the Nineteenth Century', 30),
 ('Introduction', 27),
 ('Works Cited', 26),
 ('Table of Contents', 21),
 ('Books Received', 19),
 ('List of Publications Received', 17),
 ('The Making of Nineteenth-Century English Fiction', 14),
 ('The Poetics of Poesis', 14),
 ('Recent Books', 13),
 ('Abstracts', 13),
 ('Social Figures', 12)]