In [1]:
from testdataextractor.testdataextractor.extractor import Extractor
from summpy.summpy import lexrank
import pandas as pd
In [57]:
testarticles = [1957284403,1965754064,233465322,283147769,362778020,37793736,389321649,540607195,60134403,887344770, ]
all_articles = []
all_sets_sentences = []
for art in testarticles:
ext = Extractor("../test_data/{0}.ofs.gold.xml".format(art))
article = ext.extract(verbose=True)
all_articles.append(article)
df_article = pd.DataFrame.from_dict(article['sentences'], orient='index')
sorted_indexes = [ "s{0}".format(x) for x in range(len(article['sentences'].values()))]
sentences = list(df_article.ix[sorted_indexes, 'text'])
if df_article.ix['s2', 'text'] == sentences[2]:
print "Extracted list of sentences is in a proper order."
all_sets_sentences.append(sentences)
else:
print "Extracted list of sentences is unordered."
In [22]:
all_ranked_sentences = []
all_matrxs = []
for sentences in all_sets_sentences:
ranked_sentences, similarity_mtrx = lexrank.lexrank(sentences)
all_ranked_sentences.append(ranked_sentences)
all_matrxs.append(similarity_mtrx)
In [23]:
all_ranked_dfs = []
all_similarity_dfs = []
for ranked_sentences, similarity_mtrx in zip(all_ranked_sentences, all_matrxs):
df_ranked_sentences = pd.DataFrame.from_dict(ranked_sentences,
orient='index')
df_similarity_mtrx = pd.DataFrame(similarity_mtrx)
all_ranked_dfs.append(df_ranked_sentences)
all_similarity_dfs.append(df_similarity_mtrx)
In [40]:
all_top_sents = []
all_bot_sents = []
for df_ranked_sentences in all_ranked_dfs:
mean_score = df_ranked_sentences.mean(axis=0)
min_score = df_ranked_sentences.min(axis=0)
    thresh = mean_score*.90 + min_score*.1
    top_sent = list(df_ranked_sentences[df_ranked_sentences > thresh].dropna().index)
    bottom_sent = list(df_ranked_sentences[df_ranked_sentences < thresh].dropna().index)
all_top_sents.append(top_sent)
all_bot_sents.append(bottom_sent)
print "{0} top sentences and {1} bottom_sentences".format(len(top_sent), len(bottom_sent))
Answering these questions should tell me whether the saliency scores that lexrank assigns to sentences are a good feature for finding links, or whether they are only useful for picking out the most important sentences.
In [41]:
for top_sent, bottom_sent, article in zip(all_top_sents, all_bot_sents, all_articles):
top_sent_set = { "s{0}".format(s) for s in top_sent }
bot_sent_set = { "s{0}".format(s) for s in bottom_sent }
both_top = 0
one_top = 0
both_bottom = 0
other = 0
link_dicts = article['links'].values()
total = len(link_dicts)*1.0
for l in link_dicts:
s_art = l['art_sentence']
s_com = l['com_sentence']
if s_art in top_sent_set and s_com in top_sent_set:
both_top += 1
one_top += 1
elif s_art in bot_sent_set and s_com in bot_sent_set:
both_bottom += 1
else:
other += 1
one_top += 1
print "TOP: {0}, BOTTOM: {1}\nMIXED: {2}, AT LEAST ONE TOP: {3}\n\n"\
.format(both_top/total,both_bottom/total,other/total, one_top/total)
This means the ranking can be used to restrict classification to pairs that contain at least one top-ranked sentence. Top-ranked sentences are the ones whose score lies above a threshold that depends on the minimum and the mean score.
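As a tiny sanity check of that rule, here is what the threshold does on made-up scores (the numbers below are invented for illustration):
In [ ]:
# Made-up LexRank scores, just to illustrate the 0.9*mean + 0.1*min threshold.
scores = pd.Series({'s0': 0.1, 's1': 0.2, 's2': 0.6})
thresh = scores.mean()*.90 + scores.min()*.1   # 0.9*0.3 + 0.1*0.1 = 0.28
print list(scores[scores > thresh].index)      # ['s2'] is the only "top" sentence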
In [52]:
from itertools import product
all_pairs = []
for top_sent, bottom_sent in zip(all_top_sents, all_bot_sents):
top_sent_set = { "s{0}".format(s) for s in top_sent }
bot_sent_set = { "s{0}".format(s) for s in bottom_sent }
pairs = list(product(top_sent_set, bot_sent_set))
all_pairs.append(pairs)
print len(pairs)
In [55]:
all_pruned_pairs = []
for index, pairs in enumerate(all_pairs):
art = all_articles[index]
sents = art['sentences']
    # drop pairs where both sentences carry the same 'comment' attribution
    pruned = [p for p in pairs
              if sents[p[0]].get('comment', 'none') !=
                 sents[p[1]].get('comment', 'none')]
all_pruned_pairs.append(pruned)
print len(pruned)
I still need to compute entailment and wordnet similarity for each pair, so I can use these features to keep only the pairs that are close enough, at least for the classification step.
nltk's wordnet interface exposes path_similarity, which returns a score saying how similar two word senses are, based on the shortest path between them in the hypernym hierarchy.
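A minimal sketch of that call (assuming the nltk wordnet corpus has been downloaded):
In [ ]:
# Path similarity between two hand-picked word senses; higher means the
# senses sit closer together in the wordnet hypernym hierarchy.
from nltk.corpus import wordnet as wn
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
print dog.path_similarity(cat)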
Semantic similarity is found at the word level, so I am going to have to build a vector with the semantic similarity of each word. ...
Get the most important words (i.e. the words with the largest tfidf)?
Do part-of-speech tagging on all the words, and convert the tags into the corresponding wordnet module attributes?
Find the semantic similarity between a pair of sentences by averaging?
Proposed approach to finding the links:
After this, the similarity vectors of the pair can be one of the training features for the algorithm. We could also add sentiment analysis as a feature. Another idea is to add something from Wikipedia: maybe build two similarity vectors, one from wordnet and one from Wikipedia (how many links away one term is from another), but only using the highest tfidf scoring words of both sentences.
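For the sentiment idea, TextBlob already exposes a sentiment score per text, so it would be cheap to try; a hedged sketch with made-up sentences:
In [ ]:
# TextBlob sentiment returns (polarity, subjectivity); these two numbers per
# sentence could be appended to the feature vector of a candidate pair.
from textblob import TextBlob
print TextBlob("I really love this article.").sentiment
print TextBlob("This argument is completely wrong.").sentiment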
In the paper [1], they use the Brown Corpus to get the weight for each word in a similarity vector.
However, I propose using my entire sentence corpus (so the article and comments context) to get the weights. There are two ways to do this:
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import numpy as np
import scipy.sparse as sp
from textblob import TextBlob
def get_weights_for_words_in_sentences(sentences):
allblobs = TextBlob('. '.join(sentences))
total_words = len(allblobs.words)
total_unique = len(allblobs.word_counts)
all_term_counts = np.array(allblobs.word_counts.values())
print "\nAll words: ", total_words, all_term_counts.sum()
print "Unique words: ", total_unique
print "Overall sentiment: ", allblobs.sentiment
infos = 1 - np.log(all_term_counts*1.0 + 1.0) / np.log(all_term_counts.sum() + 1.0)
info_dict = dict(zip(allblobs.word_counts.keys(), infos.tolist()))
return info_dict
def get_weights_for_sentences(sentences):
vectorizer = CountVectorizer()
# transformer = TfidfTransformer()
counts_matrix = vectorizer.fit_transform(sentences)
    # note: diff over the CSC indptr gives document frequencies (how many
    # sentences contain each term), not total occurrence counts
    all_term_counts = np.diff(sp.csc_matrix(counts_matrix, copy=False).indptr)
infos = 1 - np.log(all_term_counts*1.0 + 1.0) / np.log(all_term_counts.sum() + 1.0)
# print "total words"
print all_term_counts.sum()
info_dict = {w: infos[i] for i, w in enumerate(vectorizer.get_feature_names())}
return info_dict
info_dict = get_weights_for_sentences([s['text'] for s in all_articles[0]['sentences'].values()])
count_dict = get_weights_for_words_in_sentences([s['text'] for s in all_articles[0]['sentences'].values()])
count_frame = pd.DataFrame.from_dict(count_dict, orient='index')
frame = pd.DataFrame.from_dict(info_dict, orient='index')
count_dict.get('to', "not found")
# This shows the difference between the info dict obtained from
# text blob and from sklearn
# count_frame.sort_values(by=0, axis='index').ix[:11],\
# frame.sort_values(by=0, axis='index').ix[:11]
# This shows how the sklearn tokenizer is worse than textblobs...
# Well, not worse, but it finds way less words, and different words...
# Lets use the same tokenizer (textblob)
# print "Count dict differences"
# for x in count_dict:
# if x not in info_dict:
# print "{0} not in dict".format(x.encode('utf-8'))
# print "\n\nInfo dict differences"
# for x in info_dict:
# if x not in count_dict:
# print "{0} not in dict".format(x.encode('utf-8'))
In [356]:
from textblob import TextBlob, Word
from textblob.wordnet import VERB, NOUN, ADJ, ADV
import scipy.spatial.distance
import re
re_noun = re.compile('.*N[NPR].*')
re_adj = re.compile('.*JJ.*')
re_verb = re.compile('.*(VB|BE|DO|HV).*')
re_adv = re.compile('.*W?RB.*')
WORDNET_THRESH = 0.20
def pos_to_wordnetpos(pos):
'''
NN or NP or NR = NOUN
JJ = ADJECTIVE
VB or BE or DO or HV = VERB
WRB or RB = ADVERB
'''
if re_noun.search(pos):
return NOUN
elif re_adj.search(pos):
return ADJ
elif re_verb.search(pos):
return VERB
elif re_adv.search(pos):
return ADV
else:
return None
def word_to_synset(pos_tagged_word):
    '''
    Convert a (word, POS tag) pair into its most common wordnet synset,
    or None when the POS has no wordnet counterpart.
    '''
    w = pos_tagged_word[0]
    pos = pos_to_wordnetpos(pos_tagged_word[1])
    return disambiguate_synset(w, pos)
def disambiguate_synset(word, wordnet_pos):
    # naive sense disambiguation: take the first (most frequent) synset
    # wordnet returns for this word and POS, or None if there is none
    synsets = Word(word).get_synsets(wordnet_pos)
    if synsets:
        return synsets[0]
    return None
def pre_process_pair(blob1,blob2):
# get the sentence POS tags and create the wordnet objects
tagged_words1 = {w[0]: pos_to_wordnetpos(w[1]) for w in blob1.tags}
tagged_words2 = {w[0]: pos_to_wordnetpos(w[1]) for w in blob2.tags}
# create word set with unique words, and convert it to list for iteration
synsets1 = {word_to_synset(w) for w in blob1.tags}
synsets2 = {word_to_synset(w) for w in blob2.tags}
words_corpus = list(synsets1.union(synsets2).difference({None}))
return tagged_words1, tagged_words2, synsets1, synsets2, words_corpus
# function to find similarity of word with set of words
def similarity_with_words(synset1, blob, tag_dict, weight_dict):
    # find the word in `blob` whose sense is most similar to synset1,
    # then scale that similarity by the word's corpus weight
    max_sim = 0
    most_similar_w = ''
    for w in blob.words:
        synset2 = disambiguate_synset(w, tag_dict.get(w, None))
        if synset1 and synset2:
            if synset1 == synset2:
                max_sim = 1
                most_similar_w = w
                break
            wordnet_sim = synset1.path_similarity(synset2) or 0
            wordnet_sim = wordnet_sim if wordnet_sim > WORDNET_THRESH else 0
            if wordnet_sim > max_sim:
                max_sim = wordnet_sim
                most_similar_w = w
    # weight similarity using tfidf (or I can use real word frequencies)
    if most_similar_w != '':
        max_sim *= weight_dict.get(most_similar_w.lower(), 1.0)
    return max_sim
def find_pair_similarity(s1, s2, weight_dict):
    blob1 = TextBlob(s1)
    blob2 = TextBlob(s2)
    tagged_words1, tagged_words2, synsets1, synsets2, words_corpus = pre_process_pair(blob1, blob2)
    # print 'This is the word set:\n', ",".join([x._name for x in words_corpus])
    # form the similarity vectors, one entry per synset in the joint word set
    v1 = []
    v2 = []
    for synset in words_corpus:
        # note that I should have the synsets in my corpus...
        v1.append(similarity_with_words(synset, blob1, tagged_words1, weight_dict))
        v2.append(similarity_with_words(synset, blob2, tagged_words2, weight_dict))
    # scipy's cosine() is a distance, so convert it to a similarity
    similarity = 1 - scipy.spatial.distance.cosine(v1, v2)
    frame = pd.DataFrame([v1, v2])
    return similarity, frame
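A quick, hedged sanity check with a made-up pair; passing an empty weight dict makes every matched word fall back to a weight of 1.0, so only the wordnet part is exercised:
In [ ]:
sim, vectors = find_pair_similarity("The cat sat on the mat.",
                                    "A dog slept on the rug.",
                                    {})
print sim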
In [359]:
pairs = all_pruned_pairs[0]
article = all_articles[0]
def evaluate_links_in_article(article):
test_article_sentences = [s['text'] for s in article['sentences'].values()]
weight_dict = get_weights_for_words_in_sentences(
test_article_sentences
)
for l in article['links'].values():
links1 = l['art_sentence']
links2 = l['com_sentence']
s1 = article['sentences'][links1]['text']
s2 = article['sentences'][links2]['text']
        similarity, frame = find_pair_similarity(s1, s2, weight_dict)
similarity = 0 if np.isnan(similarity) else similarity
print similarity
evaluate_links_in_article(article)
In [331]:
np.isnan(similarity)
Out[331]:
In [ ]: