In [ ]:
from testdataextractor.testdataextractor.extractor import Extractor
from summpy.summpy import lexrank
import pandas as pd
In [ ]:
testarticles = [1957284403, 1965754064, 233465322, 283147769, 362778020,
                37793736, 389321649, 540607195, 60134403, 887344770]
all_articles = []
all_sets_sentences = []
all_article_sentences = []
all_comments_sentences = []
all_groups = [] # each comment is a group, the article counts as a comment
for art in testarticles:
    ext = Extractor("../test_data/{0}.ofs.gold.xml".format(art))
    article = ext.extract(verbose=True)
    all_articles.append(article)

    # Sentences are keyed "s0", "s1", ... so reindex to recover document order.
    df_article = pd.DataFrame.from_dict(article['sentences'], orient='index')
    sorted_indexes = ["s{0}".format(x) for x in range(len(article['sentences']))]
    sentences = list(df_article.ix[sorted_indexes, 'text'])
    ordered_sentences = df_article.ix[sorted_indexes]

    # Article sentences carry no 'comment' id; everything else belongs to a comment.
    article_sentences_ix = ordered_sentences.ix[:, 'comment'].isnull()
    art_sentences = ordered_sentences[article_sentences_ix]
    com_sentences = ordered_sentences[~article_sentences_ix]
    article_sentences = list(art_sentences['text'])
    comment_sentences = list(com_sentences['text'])

    # Join each comment's sentences into one group document; the whole article
    # is appended as one extra group at the end.
    groups_of_sentences = ordered_sentences.groupby(by='comment', sort=False)
    groupcount = len(groups_of_sentences.groups)
    index = ["c{0}".format(i) for i in range(groupcount)]
    grouped_comments = []
    for g in index:
        com = " | ".join(list(groups_of_sentences.get_group(g)['text']))
        grouped_comments.append(com)
    grouped_comments.append(" | ".join(article_sentences))

    all_article_sentences.append(article_sentences)
    all_comments_sentences.append(comment_sentences)
    all_groups.append(grouped_comments)

    # Sanity check that the reindexing preserved the original sentence order.
    if df_article.ix['s2', 'text'] == sentences[2]:
        print "Extracted list of sentences is in a proper order."
        all_sets_sentences.append(sentences)
    else:
        print "Extracted list of sentences is unordered."
In [ ]:
from gensim import corpora, models, similarities
from nltk.tokenize import TweetTokenizer
import nltk.stem
import math
def preprocess_docs(documents):
    '''
    Tokenize each document with TweetTokenizer and Snowball-stem every token.
    '''
    tokenizer = TweetTokenizer()
    english_stemmer = nltk.stem.SnowballStemmer('english')
    texts = [tokenizer.tokenize(d) for d in documents]
    stemmed_texts = []
    for text in texts:
        stemmed_text = [english_stemmer.stem(t) for t in text]
        stemmed_texts.append(stemmed_text)
    return stemmed_texts
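
# Illustrative example: preprocess_docs(["Cats are running"]) tokenizes the string
# and stems each token (e.g. "Cats" -> "cat", "running" -> "run").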

def strong_similarities_and_appropriate_links_thresh(lsi_queries, index):
    '''
    Returns a similarity dictionary with all the sentences in lsi_queries
    and their lists of strongest-link tuples, each tuple holding the linked
    sentence id and the similarity score.
    '''
    total_links = 0
    similarity_dict = {}
    for i, query in enumerate(lsi_queries):
        sims = index[query]
        # Keep only near-perfect cosine similarities in the LSI space.
        strong_sims = [(j, sim) for j, sim in enumerate(sims) if sim > 0.999]
        similarity_dict[i] = strong_sims
        links = len(strong_sims)
        total_links += links
    min_links = 1
    # max_links is the average number of strong links per query sentence
    max_links = math.ceil(total_links / float(len(lsi_queries)))
    thresh = (min_links, max_links)  # non-inclusive bounds
    return similarity_dict, thresh

def perform_queries_and_get_links(lsi_queries, index):
    s_dict, thresh = strong_similarities_and_appropriate_links_thresh(lsi_queries,
                                                                      index)
    # Keep only sentences whose number of strong links falls strictly between the
    # thresholds: more than one link, fewer than the per-query average (rounded up).
    pruned_dict = {sid: simils for sid, simils in s_dict.items()
                   if len(simils) > thresh[0] and len(simils) < thresh[1]}
    strong_sentences = len(pruned_dict)
    selected_pairs = sum([len(x) for x in pruned_dict.values()])
    print "\n{0} strong sentences".format(strong_sentences)
    print "{0} total sentence-sentence pairs".format(selected_pairs)
    print thresh
    return pruned_dict

def find_links_between_in(documents, comments_sentences):
    stemmed_texts = preprocess_docs(documents)
    dictionary = corpora.Dictionary(stemmed_texts)
    dictionary.filter_extremes(no_below=1, no_above=0.8)
    corpus = [dictionary.doc2bow(text) for text in stemmed_texts]
    # Project the corpus into a 2-topic LSI space and index it for similarity queries.
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    index = similarities.MatrixSimilarity(lsi[corpus])
    # comment_start_index = len(article_sentences)
    stemmed_queries = preprocess_docs(comments_sentences)
    # Queries must be converted with the same dictionary the model was trained on,
    # so that token ids line up.
    lsi_queries = [lsi[dictionary.doc2bow(text)] for text in stemmed_queries]
    similarity_dict = perform_queries_and_get_links(lsi_queries, index)
    return similarity_dict
documents = all_groups[0]
comments_s = all_comments_sentences[0]
article_s = all_article_sentences[0]
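
# Illustrative single-article run using the variables set up above (article 0):
# index the grouped comments (plus the article as one group) and query them with
# the individual comment sentences. example_links is only a scratch name here.
example_links = find_links_between_in(documents, comments_s)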
In [ ]:
def find_sentence_links_in_all_articles():
    all_similarity_dicts = []
    for i, docs in enumerate(all_sets_sentences):
        comments = all_comments_sentences[i]
        article = all_article_sentences[i]
        print "\n\nARTICLE {0}".format(i)
        s_dict = find_links_between_in(docs, comments)
        all_similarity_dicts.append(s_dict)
    return all_similarity_dicts

%time all_similarity_dicts = find_sentence_links_in_all_articles()
In [ ]:
def output_top_sentence_pairs(s_dict, all_art_sentences,
                              all_sentences,
                              all_comment_sentences):
    d = s_dict
    # Comment sentences follow the article sentences in the global ordering,
    # so offset the query index to get the global sentence id.
    comment_start_index = len(all_art_sentences)
    sentences = all_sentences
    comment_sentences = all_comment_sentences
    for comment_sentence, links in d.items():
        s1_id = "s{0}".format(comment_sentence + comment_start_index)
        s2_id = "s{0}".format(links[0][0])
        print "\nLink found"
        print comment_sentence + comment_start_index, [l[0] for l in links]
        print "s{0} is:\n{1}\nSimilar to:".format(comment_sentence + comment_start_index,
                                                  comment_sentences[comment_sentence].encode('utf8'))
        for i, (l_id, prob) in enumerate(links):
            print "s{0}: {1}".format(l_id, sentences[l_id].encode('utf8'))

for i, _ in enumerate(all_articles):
    print "\nARTICLE {0}==========".format(i)
    output_top_sentence_pairs(all_similarity_dicts[i],
                              all_article_sentences[i],
                              all_sets_sentences[i],
                              all_comments_sentences[i])