In [ ]:
from testdataextractor.testdataextractor.extractor import Extractor
from summpy.summpy import lexrank
import pandas as pd

Get some data


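The loading cell below assumes that `Extractor.extract` returns a dict whose `'sentences'` entry maps ids like `'s0'`, `'s1'`, ... to per-sentence records carrying at least a `'text'` field and a `'comment'` field (empty for article-body sentences, a comment id such as `'c0'` otherwise). A minimal hypothetical sketch of that assumed shape, for orientation only:


In [ ]:
# Hypothetical illustration (not from the dataset) of the structure the
# loading code relies on: the real records come from testdataextractor.
example_article = {
    'sentences': {
        's0': {'text': u'First sentence of the article body.', 'comment': None},
        's1': {'text': u'A sentence from a reader comment.', 'comment': 'c0'},
    }
}
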
In [ ]:
testarticles = [1957284403, 1965754064, 233465322, 283147769, 362778020,
                37793736, 389321649, 540607195, 60134403, 887344770]
all_articles = []
all_sets_sentences = []
all_article_sentences = []
all_comments_sentences = []
all_groups = [] # one group of sentences per comment; the article text is appended as an extra group
for art in testarticles:
    ext = Extractor("../test_data/{0}.ofs.gold.xml".format(art))
    article = ext.extract(verbose=True)
    all_articles.append(article)
    
    df_article = pd.DataFrame.from_dict(article['sentences'], orient='index')
    sorted_indexes = ["s{0}".format(x) for x in range(len(article['sentences']))]
    sentences = list(df_article.ix[sorted_indexes, 'text'])
    
    ordered_sentences = df_article.ix[sorted_indexes]
    article_sentences_ix = ordered_sentences.ix[:,'comment'].isnull()
    
    art_sentences = ordered_sentences[article_sentences_ix]
    com_sentences = ordered_sentences[~article_sentences_ix]
    
    article_sentences = list(art_sentences['text'])
    comment_sentences = list(com_sentences['text'])
    
    groups_of_sentences = ordered_sentences.groupby(by='comment', sort=False)
    groupcount = len(groups_of_sentences.groups.keys())
    index = ["c{0}".format(i) for i in range(groupcount)]
    grouped_comments = []
    for g in index:
        com = " | ".join(list(groups_of_sentences.get_group(g)['text']))
        grouped_comments.append(com)

    grouped_comments.append(" | ".join(article_sentences))
    
    all_article_sentences.append(article_sentences)
    all_comments_sentences.append(comment_sentences)
    all_groups.append(grouped_comments)
    
    if df_article.ix['s2', 'text'] == sentences[2]:
        print "Extracted list of sentences is in a proper order."
        all_sets_sentences.append(sentences)
    else:
        print "Extracted list of sentences is unordered."

Sentence-sentence pairs with gensim


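The next cell builds an LSI model over the grouped sentences and then queries it with individual comment sentences. The pattern is the standard gensim one: build a `Dictionary` and bag-of-words corpus, fit an `LsiModel`, wrap the projected corpus in a `MatrixSimilarity` index, and project every query through the same dictionary and model before looking it up. A minimal, self-contained sketch of that pattern on toy documents (placeholders, not the article data):


In [ ]:
# Toy illustration of the gensim similarity pattern used below; the texts
# here are made up and only show the shape of the pipeline.
from gensim import corpora, models, similarities

toy_texts = [['cat', 'sits', 'on', 'mat'],
             ['dog', 'chases', 'cat'],
             ['stock', 'prices', 'fell']]
toy_dictionary = corpora.Dictionary(toy_texts)
toy_corpus = [toy_dictionary.doc2bow(t) for t in toy_texts]
toy_lsi = models.LsiModel(toy_corpus, id2word=toy_dictionary, num_topics=2)
toy_index = similarities.MatrixSimilarity(toy_lsi[toy_corpus])

# Queries must be mapped through the same dictionary and LSI model,
# otherwise the word ids would not line up with the trained space.
query = toy_lsi[toy_dictionary.doc2bow(['dog', 'chases', 'the', 'cat'])]
list(enumerate(toy_index[query]))  # cosine similarity against each toy document
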
In [ ]:
from gensim import corpora, models, similarities
from nltk.tokenize import TweetTokenizer
import nltk.stem
import math

def preprocess_docs(documents):
    tokenizer = TweetTokenizer()
    english_stemmer = nltk.stem.SnowballStemmer('english')

    texts = [tokenizer.tokenize(d) for d in documents]

    stemmed_texts = []
    for text in texts:
        stemmed_text = [english_stemmer.stem(t) for t in text]
        stemmed_texts.append(stemmed_text)
    return stemmed_texts
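# Example (illustrative): preprocess_docs(["Cats are running"]) returns
# [[u'cat', u'are', u'run']] -- one list of stemmed tokens per input document.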

def strong_similarities_and_appropriate_links_thresh(lsi_queries, index):
    '''
    Returns a similarity dictionary with all the sentences
    in lsi_queries, and their lists of strongest links tuples
    with the sentence id link and the similarity percentage.
    '''
    total_links = 0
    similarity_dict = {}

    for i, query in enumerate(lsi_queries):
        sims = index[query]
        
        strong_sims = [(j, sim) for j, sim in enumerate(sims) if sim > 0.999]

        similarity_dict[i] = strong_sims
        links = len(strong_sims)
        
        total_links += links

    # max_links is the ceiling of the average number of strong links per query sentence
    min_links = 1
    max_links = math.ceil(total_links/float(len(lsi_queries))) 
    thresh = (min_links, max_links) # non-inclusive
    return similarity_dict, thresh
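
# Worked example of the threshold (illustrative numbers): with 3 query
# sentences having 2, 5 and 1 strong links, total_links = 8 and
# max_links = ceil(8 / 3.0) = 3.0, so thresh = (1, 3.0); the pruning in
# perform_queries_and_get_links then keeps only queries with exactly
# 2 links, since both bounds are non-inclusive.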


def perform_queries_and_get_links(lsi_queries, index):
    s_dict, thresh = strong_similarities_and_appropriate_links_thresh(lsi_queries,
                                                                      index)
    pruned_dict = {sid: simils for sid, simils in s_dict.items()
                   if len(simils) > thresh[0] and len(simils) < thresh[1]}

    strong_sentences = len(pruned_dict)
    selected_pairs = sum(len(x) for x in pruned_dict.values())
    
    print "\n{0} strong sentences".format(strong_sentences)
    print "{0} total sentence-sentence pairs".format(selected_pairs)
    print "link-count threshold (non-inclusive): {0}".format(thresh)
    return pruned_dict

def find_links_between_in(documents, comments_sentences):    
    stemmed_texts = preprocess_docs(documents)
    dictionary = corpora.Dictionary(stemmed_texts)
    dictionary.filter_extremes(no_below=1, no_above=0.8)
    corpus = [dictionary.doc2bow(text) for text in stemmed_texts]

    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    index = similarities.MatrixSimilarity(lsi[corpus])

#     comment_start_index = len(article_sentences)
    stemmed_queries = preprocess_docs(comments_sentences)
    # Reuse the training dictionary so that query word ids line up with the
    # LSI model's id2word mapping; a fresh Dictionary would assign different ids.
    lsi_queries = [lsi[dictionary.doc2bow(text)] for text in stemmed_queries]
    similarity_dict = perform_queries_and_get_links(lsi_queries, index)
    return similarity_dict
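
# find_links_between_in returns a dict keyed by the query's position in
# comments_sentences; each value is a list of (index into documents,
# cosine similarity) tuples for that sentence's strongest matches.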

documents = all_groups[0]
comments_s = all_comments_sentences[0]
article_s = all_article_sentences[0]

In [ ]:
def find_sentence_links_in_all_articles():
    all_similarity_dicts = []
    for i, docs in enumerate(all_sets_sentences):
        comments = all_comments_sentences[i]
        article = all_article_sentences[i]
        print "\n\nARTICLE {0}".format(i)
        s_dict = find_links_between_in(docs, comments)
        all_similarity_dicts.append(s_dict)
    return all_similarity_dicts

%time all_similarity_dicts = find_sentence_links_in_all_articles()

In [ ]:
def output_top_sentence_pairs(s_dict, all_art_sentences,
                              all_sentences, 
                              all_comment_sentences):
    d = s_dict
    comment_start_index = len(all_art_sentences)
    sentences = all_sentences
    comment_sentences = all_comment_sentences
    
    for comment_sentence, links in d.items():
        query_id = comment_sentence + comment_start_index
        print "\nLink found"
        print query_id, [l[0] for l in links]
        print "s{0} is:\n{1}\nSimilar to:".format(query_id,
                                                  comment_sentences[comment_sentence].encode('utf8'))
        for l_id, prob in links:
            print "s{0}: {1}".format(l_id, sentences[l_id].encode('utf8'))

for i, _ in enumerate(all_articles):
    print "\nARTICLE {0}==========".format(i)
    output_top_sentence_pairs(all_similarity_dicts[i],
                              all_article_sentences[i],
                              all_sets_sentences[i],
                              all_comments_sentences[i])