We have two datasets:
- frontpage_texts: the text boxes extracted from PDFs of the front pages of newspapers, downloaded from the Newseum
- newspapers: the metadata of the newspapers, also from the Newseum site

The text boxes contain interesting metadata for a given chunk of text, such as its bounding box, font, and size.
This notebook will document some of the early exploratory attempts to understand the variety of the data, and to move toward performing an analysis of media coverage/bias.
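For orientation, here is a hedged peek at a few of the text-box fields used throughout this notebook (the column names are taken from the code below; the exact schema may differ):

import pandas as pd

texts = pd.read_sql_table('frontpage_texts', 'postgres:///frontpages')
# Columns referenced later in this notebook: identifiers, raw text, font info, and page geometry
texts[['slug', 'date', 'text', 'fontface', 'fontsize',
       'avg_character_area', 'percent_of_page',
       'page_width', 'page_height']].head()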
In [1]:
# <help>
In [2]:
# <api>
from collections import defaultdict
import datetime
import pandas as pd
import numpy as np
def load_data(clean=True, us=True):
    df = pd.read_sql_table('frontpage_texts', 'postgres:///frontpages')
    df_newspapers = pd.read_sql_table('newspapers', 'postgres:///frontpages')
    if clean:
        df['text'] = df['text'].str.strip()
        df = df[df['text'].str.len() > 1]
        # This is the date that the Newseum had a "Day without News":
        # http://www.newseum.org/withoutnews/
        df = df[df.date != datetime.datetime(2017, 6, 5)]
        df = dedupe_text(df)
    if us:
        df_newspapers = df_newspapers[df_newspapers.country == 'USA']
        df = df[df.slug.isin(set(df_newspapers.slug))]
    df['page_height_round'] = df['page_height'].apply(int)
    df['page_width_round'] = df['page_width'].apply(int)
    df['page_width_round_10'] = df['page_width'].apply(lambda w: int(w/10)*10)
    df['page_height_round_10'] = df['page_height'].apply(lambda w: int(w/10)*10)
    df['aspect_ratio'] = np.round(df['page_width_round_10'] / df['page_height_round_10'], decimals=1)
    return df, df_newspapers

def dedupe_text(df):
    text_counts = df.groupby(['slug']).text.value_counts()
    duplicate_text = text_counts[text_counts > 1].reset_index(name='count').drop('count', axis=1)
    duplicate_text_dict = defaultdict(set)
    duplicate_text.apply(lambda row: duplicate_text_dict[row.slug].add(row.text), axis=1)
    return df[df.apply(lambda row: row.text not in duplicate_text_dict[row.slug], axis=1)]
In [3]:
df, df_newspapers = load_data()
In [7]:
df_clean = dedupe_text(df)
In [5]:
df_newspapers.head()
Out[5]:
In [6]:
us_newspapers_df = df_newspapers[df_newspapers.country == 'USA']
print('''We have metadata for {} newspapers.
There are {} total countries represented. The top 5 are:
{}.
Within the US, there is representation from {} states. The states with the most newspapers are:
{}
And the least:
{}
'''.format(
df_newspapers.shape[0],
df_newspapers.country.nunique(),
df_newspapers.country.value_counts()[:5],
us_newspapers_df.state.nunique(),
us_newspapers_df.state.value_counts()[:5],
us_newspapers_df.state.value_counts()[-5:],
))
In [7]:
df_us = df[df.slug.isin(set(us_newspapers_df.slug))]
newspapers_in_df = df_newspapers[df_newspapers.slug.isin(set(df_us.slug))]
print('''Currently, there are:
{} rows of text
{} days of scrapes
(earliest: {}
latest : {})
{} total newspapers (not all the pdfs were extractable).
Filtering down to the US, there are now:
{} newspapers
{} rows of text
For those newspapers that are available in the US, there are:
{} states
states with most newspapers:
{}
with least:
{}
with none:
{}
'''.format(
df.shape[0],
df.date.nunique(),
df.date.min(),
df.date.max(),
df.slug.nunique(),
df_us.slug.nunique(),
df_us.shape[0],
newspapers_in_df.state.nunique(),
newspapers_in_df.state.value_counts()[:5],
newspapers_in_df.state.value_counts()[-5:],
set(df_newspapers.state) - set(newspapers_in_df.state)
))
In [8]:
print('''Fonts are often written in a format like this: {}.
Out of {} rows...
{} of the fonts have non-empty text
{} of the fonts have a '+'
{} of the fonts have a '-'
'''.format(
df.fontface.iloc[0],
df.shape[0],
(df.fontface.str.len() > 0).sum(),
df.fontface.str.contains('\+').sum(),
df.fontface.str.contains('-').sum()
))
In [9]:
print('''This seems to mean that we can break apart the font into:
[optional-leading-thing]+[font-family]-[font-weight]
''')
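# Worked example (hypothetical font string, for illustration only):
#   'ABCDEF+Minion-Bold'.rpartition('+')  ->  ('ABCDEF', '+', 'Minion-Bold')
#   'Minion-Bold'.partition('-')          ->  ('Minion', '-', 'Bold')
# The pandas .str versions below return those three pieces as columns 0, 1, and 2.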
font_partition = df.fontface.str.rpartition('+')
df['font_family_weight'] = font_partition[2]
font_family_partition = df['font_family_weight'].str.partition('-')
df['font_leading_thing'] = font_partition[0]
df['font_family'] = font_family_partition[0]
df['font_weight'] = font_family_partition[2]
print('''After doing that,
There are...
{} unique font families
{} unique font weights
{} unique optional-leading-things'''.format(
df.font_family.nunique(),
df.font_weight.nunique(),
df.font_leading_thing.nunique()
))
df_us = df[df.slug.isin(set(us_newspapers_df.slug))]
In [10]:
# Let's do something with a Denver paper
df_newspapers[df_newspapers.city == 'Denver']
Out[10]:
In [11]:
import numpy as np
df_denver_post = df_us[df_us.slug == 'CO_DP']
font_stats = df_denver_post.groupby(['font_family_weight']).fontsize.agg({'count': len, 'min': np.min, 'max': np.max, 'avg': np.mean})
print('''We have {} days of scraped Denver Post front pages.
We have {} unique font family+weight combos. Here is a mapping from each combo to its min, average, and max font size.
{}
'''.format(
df_denver_post.date.nunique(),
df_denver_post.groupby(['font_family_weight']).first().shape[0],
font_stats
))
In [12]:
font_days = df_denver_post.groupby(['font_family_weight']).date.nunique().sort_values(ascending=False)
print('''Fonts by number of days on which they appear
{}
'''.format(
font_days
))
In [13]:
%matplotlib inline
import matplotlib.pyplot as plt
font_stats['days_present'] = font_days
plt.suptitle('Number of days a font appears, vs. total font appearances')
plt.scatter(font_stats.days_present, font_stats['count'])
Out[13]:
In [14]:
df_denver_post.sort_values(['date', 'avg_character_area'], ascending=False).groupby('date').head(5).head(10)
Out[14]:
Given a unigram like "Syria", how much of a given front page does it occupy?
We will consider the entire text block that contains the unigram to be related to that unigram. For example, the entire headline of "US BOMBS SYRIA" will be counted as space devoted toward "Syria". Likewise, a lengthy front-page article that mentions "Syria" in it will (naively, perhaps) be considered 100% about Syria.
We're assuming that search queries will be proper nouns, so we're not going to perform any stemming or lemmatizing.
Some newspapers, like the NYT, pack in more and smaller text, compared to tabloids where words are splashed in very large type across the page. This may still be of interest -- we do want to acknowledge the space devoted to "Syria" when it is splashed across the front of a tabloid -- but we may also want to develop a measure of relative importance so that a top-of-banner headline is weighted equally across all newspapers (a rough sketch of one such measure follows below).
This approach does not touch on probabilistic topic modeling yet -- these are only direct matches.
We will also want to develop a method to link a headline with an article, so that a headline like "BOOTS ON THE GROUND" could be linked to the follow-up article on Syria. This would also allow us to do some tangential but interesting accounting of which Associated Press articles get republished the most.
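Nothing here implements the relative-importance idea yet, but as a minimal sketch (assuming only the avg_character_area, slug, and date columns used later in this notebook, and a helper name of our own choosing), we could normalize every text block by the largest block on its own front page, so the dominant headline of a tabloid and of the NYT both score 1.0:

def add_relative_importance(df):
    # Largest average character area on each (newspaper, day) front page
    page_max = df.groupby(['slug', 'date'])['avg_character_area'].transform('max')
    df = df.copy()
    # 1.0 for the biggest headline on the page, smaller for everything else
    df['relative_importance'] = df['avg_character_area'] / page_max
    return df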
In [ ]:
# <api>
import pprint
import string
from nltk import word_tokenize
chars = set(string.ascii_letters)
def include_word(word):
    return sum([c in chars for c in word]) >= 3

def preprocess_text(text):
    lowered = text.strip().lower()
    lowered = ''.join(lowered.split('-\n'))
    lowered = lowered.replace('\n', ' ')
    words = word_tokenize(lowered)
    filtered_words = [word for word in words if include_word(word)]
    return filtered_words

def bag_of_words(text):
    '''Literally, this returns a set of the bag of words for fast single-token searches'''
    return set(preprocess_text(text))

def preprocess_all(texts):
    for text in texts:
        yield text, preprocess_text(text)
In [16]:
print('''For text preprocessing, we consider a few cases:
* Newlines should be stripped
* Everything should be lower-cased
* We should return a tokenized list
* Tokens with fewer than three ASCII letters (US-English analysis for now) will be rejected
The extraction from PDFs still contains word-continuations across line breaks.
For now, we'll consider all lines that end with "-" as continuations, and
link the text from before and after.
Newlines without continuations will be replaced with spaces.
Examples:
{}
'''.format(
pprint.pformat(list(preprocess_all([
'Hel-\nlo, bye\nnow\n',
*df_denver_post.text.sample(3)
])))
))
In [17]:
df_us['bow'] = df_us.text.apply(bag_of_words)
In [18]:
df_denver_post_latest = df_us[(df_us.slug == 'CO_DP') & (df_us.date == df_us.date.max())]
def percent_of_page(unigram, one_paper_df):
    unigram = unigram.lower().strip()
    lines_with_unigram = one_paper_df[one_paper_df.bow.apply(lambda bag: unigram in bag)]
    return lines_with_unigram.percent_of_page.sum()
print('''Now we write a method to get the percent of page that a unigram occupies, for a particular front page.
Syria, Denver Post, latest day: {}
garbage input, should be 0: {}'''.format(
percent_of_page('Syria', df_denver_post_latest),
percent_of_page('asdflkjasdflasdfkjasdf', df_denver_post_latest)
))
Now we run this method across all the newspapers, across all days!
In [19]:
# filter down to newspapers with entries with more than 3 days
days_of_newspapers = df_us.groupby('slug').date.nunique()
df_us_3plus = df_us[df_us.slug.isin(set(days_of_newspapers[days_of_newspapers > 3].index))]
print('''Number of newspapers with >3 days: {}
(Number of total newspapers: {})
'''.format(
df_us_3plus.slug.nunique(),
df_us.slug.nunique()
))
In [20]:
from functools import partial
def unigram_percent_of_page(query, dataframe):
    return dataframe.groupby(['slug', 'date']).apply(partial(percent_of_page, query))

def _reshape_percent_of_day_series(percent_of_page):
    return percent_of_page.reset_index().rename(columns={0: 'percent_of_page'})

def percent_of_page_by_day(percent_of_page_df):
    return _reshape_percent_of_day_series(percent_of_page_df).groupby('date').percent_of_page.mean()

def percent_of_papers_with_mention(percent_of_page_df, threshold=0):
    percents_by_paper_date = _reshape_percent_of_day_series(percent_of_page_df)
    greater_than_thresh = (percents_by_paper_date.groupby(['slug', 'date']).percent_of_page.max() > threshold).reset_index()
    return greater_than_thresh.groupby('date').mean()
In [21]:
# Average mentions per day
syria_results = unigram_percent_of_page('Syria', df_us_3plus)
In [22]:
print('''Percent of papers that mentioned Syria by day:
{}
Average percent of newspaper front page devoted to Syria by day:
{}'''.format(
percent_of_papers_with_mention(syria_results),
percent_of_page_by_day(syria_results),
))
In [23]:
df_population = pd.read_csv('~/data/sub-est2015_all.csv', encoding='ISO-8859-2')
In [24]:
df_cities = df_population[df_population.NAME.str.endswith('city') | df_population.NAME.str.endswith('town')]
df_cities['city'] = df_cities.NAME.str.slice(0, -5).str.lower()
df_cities['place_name'] = df_cities.city + ', ' + df_cities.STNAME.str.lower()
df_cities = df_cities.sort_values('POPESTIMATE2015').groupby('place_name').head(1)
df_cities.head()
Out[24]:
In [25]:
state_abbreviation_to_name = {}
with open('files/states.csv') as f:
    next(f)  # skip header
    for line in f:
        state, abbrev = line.strip().split(',')
        state_abbreviation_to_name[abbrev.strip('"')] = state.strip('"').lower()
In [26]:
us_newspapers_df['place_name'] = us_newspapers_df.city.str.lower() + ', ' + us_newspapers_df.state.apply(state_abbreviation_to_name.get)
In [27]:
us_newspapers_with_pop = pd.merge(us_newspapers_df, df_cities[['place_name', 'POPESTIMATE2015']], how='left', on='place_name', copy=False)
In [28]:
print('''{} out of {} newspapers had places found in the census.
Examples of ones that didn't:
{}
'''.format(
us_newspapers_with_pop.POPESTIMATE2015.count(),
us_newspapers_with_pop.shape[0],
us_newspapers_with_pop[us_newspapers_with_pop.POPESTIMATE2015.isnull()].place_name.head()
))
In [29]:
us_newspapers_df.head()
Out[29]:
In [30]:
unidentified_map = {}
unidentified_places = us_newspapers_with_pop[us_newspapers_with_pop.POPESTIMATE2015.isnull()]
for i, row in unidentified_places.iterrows():
    matches = (df_population.STNAME == row.state) & (df_population.NAME.str.lower().str.contains(row.city.lower()))
    if matches.sum() == 0:
        continue
    pops = df_population[matches].sort_values('POPESTIMATE2015').iloc[0]
    unidentified_map[row.place_name] = (pops.NAME, pops.POPESTIMATE2015)
In [31]:
print('''Out of {} unidentified places, we found {} by looking for substrings.'''.format(
unidentified_places.shape[0],
len(unidentified_map)
))
Good enough!
In [32]:
import numpy as np
def set_from_map_if_null(row):
    if pd.isnull(row.POPESTIMATE2015):
        return unidentified_map.get(row.place_name, [np.nan, np.nan])[1]
    return row.POPESTIMATE2015

us_newspapers_with_pop['population_est_2015'] = us_newspapers_with_pop.apply(set_from_map_if_null, 1)
print('''So now {} out of {} newspapers have populations.
Largest newspapers by population:
{}
'''.format(
us_newspapers_with_pop.population_est_2015.count(),
us_newspapers_with_pop.shape[0],
us_newspapers_with_pop.sort_values('population_est_2015', ascending=False).head(5)[['title', 'state']]
))
Oof. Looks like population might not work so well, since large cities often have several, lesser-read newspapers.
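A quick way to eyeball that concern is to count how many of our scraped papers land in the same census place:

# Number of newspapers per matched place; several per big city supports the concern above
us_newspapers_with_pop.groupby('place_name').title.count().sort_values(ascending=False).head(10)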
This is a variation on the unigram experiment above, where instead we will compute the percent of page for all words in all newspapers. Then we'll average them together across the newspapers to get the "most headliney words".
A few variations we'll consider, detailed in the code below: weighting each word by the average character area of its text box (by_char), by character area times the word's length (by_word_area), and by the area of the whole text block it sits in (by_block).
In [33]:
# First, without any idf weighting, we'll calculate the contribution of individual words
from collections import Counter
def vocab_weights_by_word(df):
    counter = Counter()
    for i, row in df.iterrows():
        for word in row.bow:
            # we won't multiply by the number of characters to get closer to "true" word real estate because we don't
            # care about the length of words. but we will divide by the total area of the page to normalize across
            # newspapers that are different sizes.
            counter[word] += row.avg_character_area
    return counter

sorted(vocab_weights_by_word(df_denver_post_latest).items(), key=lambda x: x[1], reverse=True)[:5]
Out[33]:
Clearly there needs to be some kind of weighting, or else words like "by" will dominate.
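One standard fix is inverse document frequency: weight each word by idf(w) = log(N / df(w)), so that words appearing in nearly every document (like "by") get a weight near zero. As a minimal sketch of applying such weights to the counter above (the helper name is ours; the actual idfs are built from the Reuters corpus in the next cell, and the weighting gets folded directly into vocab_weights_by_word further down):

from collections import Counter

def apply_idf(weights, idfs):
    # Words missing from the idf vocabulary get the maximum (i.e. rarest-word) idf
    max_idf = max(idfs.values())
    return Counter({word: weight * idfs.get(word, max_idf)
                    for word, weight in weights.items()})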
In [34]:
import string
import operator
from collections import Counter
from nltk.corpus import reuters
import numpy as np
doc_freq_counter = Counter()
for fid in reuters.fileids():
    bow = set(map(operator.methodcaller('lower'), reuters.words(fid)))
    bow = bow - set(string.punctuation) - set(string.digits)
    doc_freq_counter.update(bow)

idfs = {}
for word, count in doc_freq_counter.items():
    idfs[word] = np.log(float(len(reuters.fileids())) / count)
print('''We'll calculate document frequencies across the {} articles in the Reuters corpus.
The most common words in the corpus are:
{}
As idfs:
{}
'''.format(
len(reuters.fileids()),
sorted(doc_freq_counter.items(), key=operator.itemgetter(1), reverse=True)[:5],
sorted(idfs.items(), key=operator.itemgetter(1))[:5],
))
In [41]:
# again, this time with idf weighting
def vocab_weights_by_word(df, idf=None, method='by_char'):
    '''Methods:
    `by_char`: Average character size of the textbox in which a string is embedded
    `by_word_area`: Average character size * len of string
    `by_block`: Area of block in which string is embedded'''
    if method not in ['by_char', 'by_word_area', 'by_block']:
        raise ValueError('method needs to be one of "by_char", "by_word_area", "by_block"')
    counter = Counter()
    max_idf = max(idf.values()) if idf else 0  # used for words missing from the idf vocabulary
    for i, row in df.iterrows():
        for word in set(row.bow) - set(string.punctuation) - set(string.digits):
            # we won't multiply by the number of characters to get closer to "true" word real estate because we don't
            # care about the length of words. but we will divide by the total area of the page to normalize across
            # newspapers that are different sizes.
            if method in ['by_char', 'by_word_area']:
                weight = row.avg_character_area
                if method == 'by_word_area':
                    weight *= len(word)
            elif method == 'by_block':
                weight = row.percent_of_page
            if idf:
                weight *= idf.get(word, max_idf)
            counter[word] += weight
    return counter
print('''The top words in the latest Denver Post by aggregate word "real estate",
weighted by inverse document frequency:
{}
With word areas taken into consideration (longer words get weighted higher):
{}
Using the area of the entire block:
{}
'''.format(
pprint.pformat(sorted(vocab_weights_by_word(df_denver_post_latest, idfs).items(), key=operator.itemgetter(1), reverse=True)[:10]),
pprint.pformat(sorted(vocab_weights_by_word(df_denver_post_latest, idfs, method='by_word_area').items(), key=operator.itemgetter(1), reverse=True)[:10]),
pprint.pformat(sorted(vocab_weights_by_word(df_denver_post_latest, idfs, method='by_block').items(), key=operator.itemgetter(1), reverse=True)[:10])
))
The Reuters corpus is only ~10k documents. Instead, let's try to reverse-engineer document frequencies from the word ranks in a Google News word2vec model, using Zipf's law.
(I did this in another window, and found the results to be lackluster.)
I requested access to the Yahoo News n-grams corpus. Otherwise, we may need to get creative.
For now, let's compute document frequencies from the front-page texts in the dataset itself. The more days we gather, the better this will work.
In [36]:
import numpy as np
def make_idfs(docs):
    article_word_doc_counts = Counter()
    for doc in docs:
        article_word_doc_counts.update(doc)
    article_idfs = {}
    for word, count in article_word_doc_counts.items():
        article_idfs[word] = np.log(float(len(docs)) / count)
    return article_idfs
In [ ]:
article_idfs = make_idfs(df_us.bow)
In [37]:
print('''Vocabulary size of these two different idf datasets:
Reuters: {}
Front pages: {}
Most common front page words:
{}
'''.format(
len(idfs),
len(article_idfs),
pprint.pformat(sorted(article_idfs.items(), key=operator.itemgetter(1))[:10])
))
By combining the results of running all of the newspapers on a given day through the method above, we attempt to find the words most representative of front pages across the country on any particular day.
We'll run it using all three of the different methods we have for weighting words as well.
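The three cells below differ only in the method argument; as a hedged refactoring sketch (the helper name is ours), the shared loop-and-average step could live in one function:

import operator
from sklearn.feature_extraction import DictVectorizer

def top_words_for_day(day_df, idfs, method, n=10):
    # Per-paper word weights -> vectorize -> average across papers
    weights = {slug: vocab_weights_by_word(paper, idfs, method=method)
               for slug, paper in day_df.groupby('slug')}
    vectorizer = DictVectorizer(sparse=False)
    X = vectorizer.fit_transform(list(weights.values()))
    return sorted(zip(vectorizer.feature_names_, X.mean(axis=0)),
                  key=operator.itemgetter(1), reverse=True)[:n]

# e.g. top_words_for_day(df_us_3plus[df_us_3plus.date == df_us_3plus.date.max()], article_idfs, 'by_word_area')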
In [38]:
from sklearn.feature_extraction import DictVectorizer
all_vocab_weights = {}
todays_papers = df_us_3plus[df_us_3plus.date == df_us_3plus.date.max()]
print('Total papers: ', todays_papers.slug.nunique())
for i, (slug, paper) in enumerate(todays_papers.groupby('slug')):
    if i % 50 == 0:
        print('.', end='')
    all_vocab_weights[slug] = vocab_weights_by_word(paper, article_idfs, method='by_word_area')
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(all_vocab_weights.values())
print('Top results with word area:')
sorted(zip(vectorizer.feature_names_, X.mean(axis=0)), key=operator.itemgetter(1), reverse=True)[:10]
Out[38]:
In [39]:
all_vocab_weights = {}
todays_papers = df_us_3plus[df_us_3plus.date == df_us_3plus.date.max()]
print('Total papers: ', todays_papers.slug.nunique())
for i, (slug, paper) in enumerate(todays_papers.groupby('slug')):
    if i % 50 == 0:
        print('.', end='')
    all_vocab_weights[slug] = vocab_weights_by_word(paper, article_idfs, method='by_char')
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(all_vocab_weights.values())
print('Top results with character area:')
sorted(zip(vectorizer.feature_names_, X.mean(axis=0)), key=operator.itemgetter(1), reverse=True)[:10]
Out[39]:
In [40]:
all_vocab_weights = {}
todays_papers = df_us_3plus[df_us_3plus.date == df_us_3plus.date.max()]
print('Total papers: ', todays_papers.slug.nunique())
for i, (slug, paper) in enumerate(todays_papers.groupby('slug')):
    if i % 50 == 0:
        print('.', end='')
    all_vocab_weights[slug] = vocab_weights_by_word(paper, article_idfs, method='by_block')
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(all_vocab_weights.values())
print('Top results with block area:')
sorted(zip(vectorizer.feature_names_, X.mean(axis=0)), key=operator.itemgetter(1), reverse=True)[:10]
Out[40]:
Ah! So it looks like:
So that means the next steps are:
In [52]:
df_us_3plus['page_height_round'] = df_us_3plus.page_height.apply(int)
df_us_3plus['page_width_round'] = df_us_3plus.page_width.apply(int)
In [55]:
import utils
def plot_word(dataframe, word, date=None, paper=None):
    title = 'Appearances of {}'.format(word)
    if date:
        dataframe = dataframe[dataframe.date == date]
        title += ' on {}'.format(date)
    if paper:
        dataframe = dataframe[dataframe.slug == utils.slug_for_newspaper(paper)]
        title += ' in {}'.format(paper)
    relevant_df = dataframe[dataframe.bow.apply(lambda bow: word in bow)]
    grids = []
    for (date, slug), group in relevant_df.groupby(['date', 'slug']):
        # one intensity grid per (date, paper) group, on a shared page size
        grids.append(utils.make_intensity_grid(group, relevant_df.page_height_round.max(), relevant_df.page_width_round.max()))
    avg_intensity = sum([x / len(grids) for x in grids])
    return utils.plot_intensity(avg_intensity, title)

plot_word(df_us_3plus, 'syria')
Out[55]: