In [1]:
import os.path
from collections import Counter
import pandas as pd
from spacy.en import English
from newsparser.data import load_feeds
%matplotlib inline
In [2]:
# Instantiate the English class imported from spacy.en; it is called later
# as nlp(entry.content) to turn an entry's content into a token sequence.
nlp = English()
In [3]:
# Relative path of the data folder (one level above the working directory).
folder = os.path.join('..', 'data')
In [4]:
# Load the feeds found under `folder` via newsparser.data.load_feeds.
# The result is iterated and its items are used as dict keys below.
feeds = load_feeds(folder)
In [5]:
# Map every feed to whatever its get_entries() call returns.
entries = { feed: feed.get_entries() for feed in feeds }
In [6]:
# For each feed keep only the entries that carry a 'filter' record in
# entry.data and whose filter did not mark them as discarded.
filtered_entries = dict(
    (
        feed,
        [
            item
            for item in entries[feed]
            if 'filter' in item.data and not item.data['filter']['discarded']
        ],
    )
    for feed in feeds
)
In [8]:
# Call load_content() on every entry that passed the filter, feed by feed.
for feed in feeds:
    kept_entries = filtered_entries[feed]
    for entry in kept_entries:
        entry.load_content()
In [9]:
def create_counter(entry):
    """
    Build the relative frequency of every token text in entry.doc.

    The entry itself is NOT modified: the counter is returned and the
    caller decides where to store it (e.g. entry.counter).

    Parameters
    ----------
    entry : object with a ``doc`` attribute
        ``entry.doc`` must be iterable and yield items exposing ``.text``.

    Returns
    -------
    collections.Counter
        Maps each distinct token text to its share of all tokens, so the
        values sum to 1 (up to rounding). Empty for an empty ``doc``.
    """
    counts = Counter(token.text for token in entry.doc)
    # float() keeps this a true division even on interpreters where
    # int / int truncates (Python 2); on Python 3 it changes nothing.
    total = float(sum(counts.values()))
    # An empty doc yields an empty comprehension, so no division by zero.
    return Counter({text: count / total for text, count in counts.items()})
In [10]:
def intersection_dist(c1, c2):
    """
    Overlap between two weight mappings, capped at 1.

    For every key present in both c1 and c2 the smaller of the two
    weights is taken; those minima are summed and the sum is clamped
    to at most 1. Despite the name, larger values mean MORE overlap
    (identical normalised counters give 1, disjoint ones give 0).
    """
    shared_keys = set(c1) & set(c2)
    overlap = sum(min(c1[key], c2[key]) for key in shared_keys)
    # Same as min(1, overlap): keep the sum only when it is below the cap.
    return overlap if overlap < 1 else 1
In [11]:
def date_between(date, low=None, high=None):
    """
    Tell whether ``date`` lies in the closed interval [low, high].

    A bound left as None is simply not checked, so the interval is
    open-ended on that side (``high`` does NOT default to ``low``).

    Parameters
    ----------
    date : comparable or None
        Value to test. None is never inside any interval.
    low : comparable or None
        Inclusive lower bound; None means no lower bound.
    high : comparable or None
        Inclusive upper bound; None means no upper bound.

    Returns
    -------
    bool
        True when date is not None and satisfies every bound given.
    """
    # Bail out first so None is never compared against a bound.
    if date is None:
        return False
    if low is not None and not (low <= date):
        return False
    if high is not None and not (date <= high):
        return False
    return True
In [12]:
# Two-level index: feed -> entry.data['date'] -> list of that feed's
# filtered entries carrying that date value.
by_date_entries = {}
for feed in feeds:
    grouped = {}
    for entry in filtered_entries[feed]:
        # setdefault creates the bucket the first time a date is seen.
        grouped.setdefault(entry.data['date'], []).append(entry)
    by_date_entries[feed] = grouped
In [30]:
# Collect, across all feeds, the filtered entries dated 2015-11-05 and
# attach to each one its parsed document and its word-frequency counter.
# NOTE(review): create_date is not defined in any cell visible here -
# confirm where it comes from before running on a fresh kernel.
day_entries = []
for feed in feeds:
    for entry in filtered_entries[feed]:
        entry.date = create_date(entry)
        if not date_between(entry.date, '2015-11-05', '2015-11-05'):
            continue
        # Only entries of the selected day are parsed.
        entry.doc = nlp(entry.content)
        entry.counter = create_counter(entry)
        day_entries.append(entry)
In [34]:
# One row per unordered pair of distinct day entries, holding both
# entries' feedname/index and their intersection_dist score.
pair_columns = ['e1_feedname', 'e1_index', 'e2_feedname', 'e2_index', 'dist']
pair_rows = [
    [
        first.feedname, first.index, second.feedname, second.index,
        intersection_dist(first.counter, second.counter)
    ]
    # Counting from 1 makes day_entries[position:] start right after `first`,
    # so every pair is produced exactly once.
    for position, first in enumerate(day_entries, start=1)
    for second in day_entries[position:]
]
df = pd.DataFrame(pair_rows, columns=pair_columns)
In [64]:
# Truncate every dist to three decimals, then count how many pairs fall
# on each truncated value (the result is indexed by that value).
truncated_dist = df['dist'].apply(lambda value: int(value*1000)/1000)
dists = pd.DataFrame(truncated_dist).groupby('dist').size()
In [65]:
# Plot the number of entry pairs for each truncated dist value.
dists.plot()
Out[65]:
In [78]:
# Same plot restricted to the top of the range: dist values >= 0.95.
dists[0.95 <= dists.index].plot()
Out[78]:
In [77]:
# Same plot restricted to dist values between 0.6 and 0.99 inclusive.
dists[(0.6 <= dists.index) & (dists.index <= 0.99)].plot()
Out[77]:
In [76]:
# Fraction of entry pairs whose dist is at least 0.75 (the mean of a
# boolean Series is the share of True values).
# NOTE(review): this cell used to read from `df2`, which no visible cell
# defines, so it raised NameError on a fresh kernel. It now uses `df`, the
# only frame with a 'dist' column here - confirm that was the intent.
(df['dist'] >= 0.75).mean()
Out[76]: