In [2]:
    
from jupyter_cms.loader import load_notebook
eda = load_notebook('./data_exploration.ipynb')
df, newspapers = eda.load_data()
    
In [6]:
    
import pandas as pd
pd.set_option('display.max_columns', 100)
    
In [7]:
    
df.head(3)
    
    Out[7]:
In [10]:
    
print('''Rows: {}
Dates: {} ({} - {})
'''.format(
    df.shape[0],
    df.date.nunique(),
    df.date.min(),
    df.date.max()
))
    
    
In [14]:
    
import spacy
nlp = spacy.load('en')
    
In [41]:
    
docs = []
for i, doc in enumerate(nlp.pipe(df.text, batch_size=10000, n_threads=7)):
    if i % 5000 == 0:
        print('.', end='')
    docs.append(doc)
    
    
In [268]:
    
def remove_token(t):
    # Drop tokens that are not purely alphabetic, as well as stopwords.
    return not t.is_alpha or t.is_stop
    
In [270]:
    
lemmas = []
for d in docs:
    d_lemmas = []
    for t in d:
        if not remove_token(t):
            d_lemmas.append(t.lemma_)
    
    lemmas.append(d_lemmas)
    
In [271]:
    
import itertools
df['lemmas'] = lemmas
    
In [346]:
    
newspaper_text = df.groupby(['date']).lemmas.apply(lambda x: list(itertools.chain(*x)))
newspapers_per_day = df.groupby(['date']).slug.nunique()
    
In [357]:
    
import sys
from collections import Counter
newspaper_tfs = []
# tf - number of times a word shows up in the current day's text
# doc_freqs - number of documents that contain a given word (not computed in this loop)
for i, d in enumerate(newspaper_text):
    if i % 10000 == 0:
        print('.', end='')
        sys.stdout.flush()
    tf = Counter(d)
    newspaper_tfs.append(tf)
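    
The loop above only fills in the per-day term frequencies; the document frequencies mentioned in the comment are not computed here (the chi-squared step below handles the cross-document comparison directly). If they were needed, a minimal sketch built on the same counters might look like this:
In [ ]:
    
# Hypothetical helper, not used below: document frequency = number of days
# whose counter contains a given word.
doc_freqs = Counter()
for tf in newspaper_tfs:
    doc_freqs.update(tf.keys())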
    
    
To detect anomalously high usage of a word on a day, we want several things.
Ultimately we also want a multi-level model that estimates two things in tandem: the presence of clusters of topics, as well as anomalous vocabulary usage within each topic.
Already, though, we can compare two numbers: how frequently a word appears in a given document versus how frequently it appears in the other documents that mention that word.
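A minimal sketch of that comparison, using the same sklearn chi-squared test applied below, but on hypothetical toy counts rather than the scraped data:
In [ ]:
    
# Rank words by how much their frequency in one "day" departs from the rest.
import numpy as np
from sklearn.feature_selection import chi2

toy_counts = np.array([
    [10, 1, 0],   # day 0
    [ 9, 0, 1],   # day 1
    [ 8, 7, 0],   # day 2: 'shark' spikes here
])
toy_words = np.array(['the', 'shark', 'quiet'])
labels = np.array([0, 0, 1])  # one-vs-rest: is this row day 2?

keyness, _ = chi2(toy_counts, labels)
print(toy_words[np.argsort(keyness)[::-1]])  # 'shark' ranks first
    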
In [398]:
    
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
    
In [423]:
    
fig = plt.figure(figsize=(12, 5))
plot1 = fig.add_subplot(131)
plot2 = fig.add_subplot(132)
plot3 = fig.add_subplot(133)
plot1.plot(range(len(newspaper_text)), [len(text) for text in newspaper_text])
plot1.set_xlabel("Day of scrape")
plot1.set_ylabel("Words")
plot2.plot(range(len(newspaper_tfs)), [len(tf) for tf in newspaper_tfs])
plot2.set_xlabel("Day of scrape")
plot2.set_ylabel("Unique words")
plot3.plot(range(len(newspapers_per_day)), newspapers_per_day.values)
plot3.set_xlabel("Day of scrape")
plot3.set_ylabel("Number of newspapers")
plt.tight_layout()
    
    
In [438]:
    
from sklearn.feature_extraction import DictVectorizer
# Sparse (days x vocabulary) matrix of per-day term counts.
dv = DictVectorizer()
X = dv.fit_transform(newspaper_tfs)
    
In [468]:
    
from sklearn.feature_selection import chi2
import numpy as np

def get_labels_for_day(day, N):
    # One-vs-rest indicator vector: 1 for the given day, 0 for every other day.
    arr = np.zeros(N)
    arr[day] = 1
    
    return arr
get_labels_for_day(2, 5)
    
    Out[468]:
In [ ]:
    
N = len(newspaper_tfs)
words = np.array(dv.get_feature_names())
    
In [502]:
    
top_words_by_day = []
for i in range(N):
    print('.', end='')
    sys.stdout.flush()
    
    keyness, _ = chi2(X, get_labels_for_day(i, N))
    ranking = np.argsort(keyness)[::-1]
    top_words = words[ranking]
    top_words_by_day.append(list(zip(top_words, keyness[ranking])))
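    
A quick sanity check on the result (the day index and slice size here are arbitrary):
In [ ]:
    
# Peek at the ten most distinctive words for the first scraped day.
top_words_by_day[0][:10]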
    
    
In [506]:
    
sum([sys.getsizeof(x) for x in top_words_by_day])
    
    Out[506]:
In [536]:
    
for date, top_words in zip(newspaper_meta, top_words_by_day):
    print('.', end='')
    sys.stdout.flush()
    
    date_str = pd.to_datetime(str(date)).strftime('%Y-%m-%d')
    
    with open('results/top-words/{}.csv'.format(date_str), 'w') as out:
        out.write('\n'.join([','.join([line[0], str(np.round(line[1], 2))]) for line in top_words]))
    
    
In [539]:
    
newspaper_day_text = df.groupby(['date', 'slug']).lemmas.apply(lambda x: list(itertools.chain(*x)))
    
In [549]:
    
newspaper_day_meta = df.groupby(['date', 'slug']).first().reset_index()[['date', 'slug']]
    
In [654]:
    
newspaper_day_tf = []
for lemmas in newspaper_day_text:
    # Term frequencies per (date, slug), dropping very short lemmas (<= 2 characters).
    newspaper_day_tf.append(Counter([lemma for lemma in lemmas if len(lemma) > 2]))
    
In [655]:
    
dv = DictVectorizer()
X = dv.fit_transform(newspaper_day_tf)
    
In [656]:
    
newspaper_day_tf = np.array(newspaper_day_tf)
    
In [620]:
    
def get_day(day):
    # Row indices (into newspaper_day_meta and X) for the day-th scraped date.
    date = newspaper_day_meta.date.unique()[day]
    return newspaper_day_meta[newspaper_day_meta.date == date].index
    
In [608]:
    
def get_slug_in_day(slug, day):
    # Position of a slug among the rows for the day-th scraped date.
    date = newspaper_day_meta.date.unique()[day]
    ndf = newspaper_day_meta[newspaper_day_meta.date == date].reset_index()
    return ndf[ndf.slug == slug].index[0]
    
In [615]:
    
def get_day_slugs(day):
    # All newspaper slugs that appear on the day-th scraped date.
    date = newspaper_day_meta.date.unique()[day]
    return newspaper_day_meta[newspaper_day_meta.date == date].slug.values
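    
A quick usage check on the helpers, assuming the cells above have run:
In [ ]:
    
# Row indices for the first scraped day, the slugs published that day,
# and the within-day position of one of them.
day_rows = get_day(0)
day_slugs = get_day_slugs(0)
get_slug_in_day(day_slugs[0], 0)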
    
In [ ]:
    
top_words_by_slug_day = []
words = np.array(dv.get_feature_names())
for i in range(N):
    print('.', end='')
    sys.stdout.flush()
    
    # Restrict the comparison universe to the newspapers scraped on day i.
    day_ix = get_day(i)
    X_universe = X[day_ix, :]
    
    day_slugs = get_day_slugs(i)
    M = len(day_slugs)
    for slug in day_slugs:
        j = get_slug_in_day(slug, i)
        
        # One newspaper vs. the rest of that day's papers.
        keyness, _ = chi2(X_universe, get_labels_for_day(j, M))
        ranking = np.argsort(np.nan_to_num(keyness))[::-1]
        top_words = words[ranking[:100]]
        top_words_by_slug_day.append(top_words)
    
    
    
In [ ]:
    
len(top_words_by_slug_day)
    