In [2]:
from jupyter_cms.loader import load_notebook
eda = load_notebook('./data_exploration.ipynb')
df, newspapers = eda.load_data()
In [3]:
import utils
import matplotlib.pyplot as plt
%matplotlib inline
In [4]:
import imp
imp.reload(utils)  # pick up any edits made to utils.py since the last run
Out[4]:
In [5]:
import numpy as np
MIN_WIDTH = df.page_width_round.min()
MAX_HEIGHT = int(np.ceil((df.page_height_round * (float(MIN_WIDTH) / df.page_width_round)).max()))
print('''The smallest newspaper in the dataset has width {}. We will scale all the newspapers to have the same width.
We'll also pad all the newspapers to have the same height.
The longest height after scaling to the min width is {}.'''.format(
MIN_WIDTH,
MAX_HEIGHT
))
In [6]:
import multiprocessing
p = multiprocessing.Pool(8)
# tokenize every front page's text in parallel
df['bow'] = p.map(eda.bag_of_words, df.text.values)
In [7]:
import numpy as np
from scipy.misc import imresize
def intensity_for_paper(paper, desired_width=MIN_WIDTH, desired_height=MAX_HEIGHT):
    paper_height, paper_width = paper.page_height_round.iloc[0], paper.page_width_round.iloc[0]
    grid = utils.make_intensity_grid(paper, paper_height, paper_width)
    # scale the page to the common width; the scaled height still varies by paper
    resized = imresize(grid, float(desired_width) / paper_width)
    scaled_height = resized.shape[0]
    # grow the array to the common height (resize zero-fills at the bottom)...
    resized.resize(desired_height, desired_width)
    # ...then shift the content down so the zero padding sits at the top instead
    height_offset = desired_height - scaled_height
    resized[height_offset:] = resized[:scaled_height]
    resized[:height_offset] = 0
    return resized
def intensity_map_for_query(query, papers_df):
    intensities = []
    # keep only the papers whose bag of words contains the query
    papers_with_query = papers_df[papers_df.bow.apply(lambda x: query in x)].groupby(['date', 'slug'])
    for _, paper in papers_with_query:
        intensity = intensity_for_paper(paper)
        intensities.append(intensity)
    # normalize by the total number of papers, not just the ones that match
    paper_count = papers_df.groupby(['date', 'slug']).first().shape[0]
    avg_intensity = sum([x / paper_count for x in intensities])
    if isinstance(avg_intensity, int):
        # sum() of an empty list is the int 0: no paper mentioned the query
        zs = np.zeros((MAX_HEIGHT, MIN_WIDTH))
        zs[0, 0] = .2  # so the plot isn't entirely zero
        return zs
    return avg_intensity
In [8]:
_, paper = next(iter(df.groupby(['date', 'slug'])))
utils.plot_intensity(intensity_for_paper(paper), 'plot')
Out[8]:
In [9]:
plt.imshow(
((utils.make_color_grid(intensity_for_paper(paper), 'YlOrRd', 30) +
utils.make_color_grid(intensity_for_paper(paper), 'PuBu', 30)) / 2)
)
Out[9]:
In [10]:
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
cmap = plt.get_cmap('YlOrRd')
norm = Normalize(vmin=0, vmax=20)
f = cm.ScalarMappable(norm=norm, cmap=cmap).to_rgba
vecf = np.vectorize(f)
In [39]:
def plot_mentions_of_query(query, papers_df, imposed_vmax=None):
    intensities = []
    vmax = 0
    nplots = papers_df.date.nunique()
    N_COL = 7
    rows = int(np.ceil(nplots / float(N_COL)))
    fig, axs = plt.subplots(rows, N_COL, figsize=(MIN_WIDTH/100 * N_COL, MAX_HEIGHT/100 * rows))
    # first pass: compute every day's intensity map and the shared color scale
    for i, (date, df_date) in enumerate(papers_df.groupby('date')):
        intensity = intensity_map_for_query(query.lower(), df_date)
        intensities.append((date, intensity))
        if intensity.max() > vmax:
            vmax = intensity.max()
    # second pass: plot them all against the same vmax so colors are comparable
    for i, (date, intensity) in enumerate(intensities):
        utils.plot_intensity(intensity, '{}'.format(date.strftime("%Y-%m-%d")), ax=axs.ravel()[i], vmax=(imposed_vmax or vmax))
    # hide the unused axes in the last row
    for j in range(i + 1, rows * N_COL):
        axs.ravel()[j].axis('off')
    plt.suptitle("Mentions of {} from {} to {}".format(query, papers_df.date.min().strftime("%Y-%m-%d"), papers_df.date.max().strftime("%Y-%m-%d")), fontsize=32)
    return vmax
These graphs weight mentions by font size. A bounding box is drawn around the entire text block containing the mention. So if the token of interest is "Syria" and a headline reads "US BOMBS SYRIA", the headline's bounding box gets a heavy, dark weight. If the word "Syria" appears in an article's body text, the whole article box gets a lighter weight, since article text tends to use smaller fonts than headlines.
I didn't down-weight articles by the length of their text, though an argument can be made that a mention of "Syria" in a 300-word article is less meaningful than one in a 50-word caption.
In [11]:
plot_mentions_of_query('Syria', df)
Out[11]:
In [12]:
plot_mentions_of_query('Russia', df)
Out[12]:
In [13]:
plot_mentions_of_query('Trump', df)
Out[13]:
In [22]:
def plot_multiple_queries(queries, colormaps, papers_df, imposed_vmax=None):
    if len(queries) != len(colormaps):
        raise ValueError('queries and colormaps need to have the same length')
    intensities = []
    vmax = 0
    nplots = papers_df.date.nunique()
    rows = int(np.ceil(nplots / 7.0))
    fig, axs = plt.subplots(rows, 7, figsize=(MIN_WIDTH/100 * 7, MAX_HEIGHT/100 * rows))
    # first pass: one intensity map per (day, query), tracking the shared vmax
    for i, (date, df_date) in enumerate(papers_df.groupby('date')):
        query_intensities = []
        for q, cmap_name in zip(queries, colormaps):
            intensity = intensity_map_for_query(q.lower(), df_date)
            query_intensities.append((cmap_name, intensity))
            if intensity.max() > vmax:
                vmax = intensity.max()
        intensities.append((date, query_intensities))
    # second pass: render each query in its own colormap
    images = []
    for date, q_ints in intensities:
        q_images = []
        for cmap_name, intensity in q_ints:
            image = utils.make_color_grid(intensity, cmap_name, imposed_vmax or vmax)
            q_images.append(image)
        # average across the colors to make the final blended image
        image = sum([im / len(q_images) for im in q_images])
        images.append((date, image))
    for i, (date, image) in enumerate(images):
        ax = axs.ravel()[i]
        ax.set_title('{}'.format(date.strftime("%Y-%m-%d")))
        ax.imshow(image)
    # hide the unused axes in the last row
    for j in range(i + 1, rows * 7):
        axs.ravel()[j].axis('off')
    query_colors = ["{} ({})".format(q, c) for q, c in zip(queries, colormaps)]
    plt.suptitle("Mentions of {} from {} to {}".format(', '.join(query_colors), papers_df.date.min().strftime("%Y-%m-%d"), papers_df.date.max().strftime("%Y-%m-%d")), fontsize=32)
    return vmax
In [23]:
plot_multiple_queries(['Gorsuch', 'Sessions'], ['Greens', 'Oranges'], df)
Out[23]:
In [24]:
plot_multiple_queries(['Korea', 'France'], ['Greens', 'Oranges'], df)
Out[24]:
In [42]:
import datetime
last_few_days = df[df.date == datetime.date(2017, 6, 13)]
In [46]:
from functools import partial
def isin(collection, query):
    return query in collection

# partial() pins query='bill'; apply() passes each row's bag of words as `collection`
last_few_days[last_few_days.bow.apply(partial(isin, query='bill'))]
Out[46]: