In [1]:
import os.path
from collections import Counter

import pandas as pd
from spacy.en import English

from newsparser.data import load_feeds

%matplotlib inline

In [2]:
# Instantiate the spaCy English class imported above; `nlp` is called on
# entry.content further down to build entry.doc.
nlp = English()

In [3]:
# Relative path to the data directory (../data); passed to load_feeds below.
folder = os.path.join('..', 'data')

In [4]:
# load_feeds comes from newsparser.data. Later cells iterate over `feeds`
# several times and use each feed as a dict key, so it is presumably a
# re-iterable collection of hashable feed objects — TODO confirm.
feeds = load_feeds(folder)

In [5]:
# Map every feed to whatever feed.get_entries() returns for it.
entries = { feed: feed.get_entries() for feed in feeds }

In [6]:
# Per feed, keep only the entries that carry a 'filter' record in entry.data
# and whose record is not flagged as discarded.
filtered_entries = {}
for feed in feeds:
    filtered_entries[feed] = [
        entry
        for entry in entries[feed]
        if 'filter' in entry.data and not entry.data['filter']['discarded']
    ]

In [8]:
# Call load_content() on every entry that survived the filter. A later cell
# reads entry.content, which is presumably populated here — TODO confirm.
for feed in feeds:
    for entry in filtered_entries[feed]:
        entry.load_content()

In [9]:
def create_counter(entry):
    """
        Build the relative-frequency table of the tokens in entry.doc.

        entry.doc must already be set: it is iterated and the ``.text``
        of each item is counted.

        Returns a Counter mapping each token text to its share of all
        tokens (the values sum to 1). An empty doc gives an empty Counter.
        The counter is returned, not stored on the entry.
    """
    occurrences = Counter(token.text for token in entry.doc)
    total = sum(occurrences.values())

    # The comprehension body never runs for an empty doc, so total == 0
    # cannot cause a division by zero.
    return Counter({
        text: count / total
        for text, count in occurrences.items()
    })

In [10]:
def intersection_dist(c1, c2):
    """
        Overlap between two frequency tables.

        For every key present in both c1 and c2 the smaller of the two
        values is taken; these minima are summed and the result is capped
        at 1. With no shared key the result is 0.

        Note: despite the name, a larger value means the two tables
        overlap more.
    """
    shared_keys = set(c1) & set(c2)
    overlap = sum(min(c1[key], c2[key]) for key in shared_keys)
    return min(1, overlap)

In [11]:
def date_between(date, low=None, high=None):
    """
        Tell whether date lies between low and high (both inclusive).

        A bound left as None is not checked, so the range is open on
        that side. A date of None is never in range.
    """
    if date is None:
        return False

    in_range = True
    if low is not None:
        in_range = low <= date
    if high is not None:
        in_range = in_range and date <= high

    return in_range

In [12]:
# Group the retained entries of each feed by their entry.data['date'] value:
# by_date_entries[feed][date] is the list of entries carrying that date.
by_date_entries = {}

for feed in feeds:
    d = by_date_entries[feed] = {}

    for entry in filtered_entries[feed]:
        date = entry.data['date']
        # setdefault creates the bucket the first time a date is seen.
        d.setdefault(date, []).append(entry)

In [30]:
# Collect the entries dated 2015-11-05, parsing and counting only those.
# NOTE(review): create_date is not defined or imported in the visible part
# of this notebook — it must exist in the kernel for this cell to run.
day_entries = []

for feed in feeds:
    for entry in filtered_entries[feed]:
        # Every retained entry gets a .date, whether or not it is selected.
        entry.date = create_date(entry)

        if not date_between(entry.date, '2015-11-05', '2015-11-05'):
            continue

        # Only entries from the selected day are run through nlp().
        entry.doc = nlp(entry.content)
        entry.counter = create_counter(entry)
        day_entries.append(entry)

In [34]:
# One row per unordered pair of entries from the selected day: the
# feedname/index of both entries plus their intersection_dist score.
df = pd.DataFrame(
    [
        [
            first.feedname,
            first.index,
            second.feedname,
            second.index,
            intersection_dist(first.counter, second.counter),
        ]
        for position, first in enumerate(day_entries)
        # Pair each entry only with the ones after it, so every pair
        # appears once and no entry is compared with itself.
        for second in day_entries[position + 1:]
    ],
    columns=['e1_feedname', 'e1_index', 'e2_feedname', 'e2_index', 'dist'],
)

What is the threshold? The plots below show how the pairwise `dist` values are distributed, to help choose a cut-off.


In [64]:
# Truncate every dist to three decimals (int(x*1000)/1000) and count how many
# pairs fall on each truncated value; the result is indexed by that value.
dists = pd.DataFrame(df['dist'].apply(lambda x: int(x*1000)/1000)).groupby('dist').size()

In [65]:
# Number of pairs per truncated dist value, over the whole range.
dists.plot()


Out[65]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c9b92940>

In [78]:
# Same counts, restricted to truncated dist values of at least 0.95.
dists[0.95 <= dists.index].plot()


Out[78]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c99bfa90>

In [77]:
# Same counts, restricted to truncated dist values in [0.6, 0.99].
dists[(0.6 <= dists.index) & (dists.index <= 0.99)].plot()


Out[77]:
<matplotlib.axes._subplots.AxesSubplot at 0x1c99d59e8>

In [76]:
# Fraction of entry pairs whose dist score is at least 0.75.
# NOTE(review): this cell originally read `df2`, which is not defined anywhere
# in the visible notebook and fails on a fresh kernel. It now uses `df`, the
# only frame here with a 'dist' column — confirm that `df2` was that frame,
# since the recorded output below was produced from `df2`.
(df['dist'] >= 0.75).mean()


Out[76]:
0.00020854384795648527