In [1]:
from testdataextractor.testdataextractor.extractor import Extractor
from summpy.summpy import lexrank
import pandas as pd

Get some data


In [57]:
testarticles = [1957284403,1965754064,233465322,283147769,362778020,37793736,389321649,540607195,60134403,887344770, ]
all_articles = []
all_sets_sentences = []
for art in testarticles:
    ext = Extractor("../test_data/{0}.ofs.gold.xml".format(art))
    article = ext.extract(verbose=True)
    all_articles.append(article)
    df_article = pd.DataFrame.from_dict(article['sentences'], orient='index')
    sorted_indexes = [ "s{0}".format(x) for x in range(len(article['sentences'].values()))]
    sentences = list(df_article.ix[sorted_indexes, 'text'])
    if df_article.ix['s2', 'text'] == sentences[2]:
        print "Extracted list of sentences is in a proper order."
        all_sets_sentences.append(sentences)
    else:
        print "Extracted list of sentences is unordered."


50  comments parsed.
190  sentences parsed.
140  links parsed.
Extracted list of sentences is in a proper order.
50  comments parsed.
203  sentences parsed.
102  links parsed.
Extracted list of sentences is in a proper order.
50  comments parsed.
290  sentences parsed.
199  links parsed.
Extracted list of sentences is in a proper order.
49  comments parsed.
178  sentences parsed.
92  links parsed.
Extracted list of sentences is in a proper order.
50  comments parsed.
170  sentences parsed.
79  links parsed.
Extracted list of sentences is in a proper order.
50  comments parsed.
184  sentences parsed.
1  links parsed.
Extracted list of sentences is in a proper order.
49  comments parsed.
278  sentences parsed.
25  links parsed.
Extracted list of sentences is in a proper order.
49  comments parsed.
156  sentences parsed.
26  links parsed.
Extracted list of sentences is in a proper order.
50  comments parsed.
196  sentences parsed.
14  links parsed.
Extracted list of sentences is in a proper order.
49  comments parsed.
178  sentences parsed.
2  links parsed.
Extracted list of sentences is in a proper order.

Feed data into lexrank

The summpy MIT licensed repo used for this task, returns a tuple with a dictionary with sentences index + score, and the similarity_matrix.


In [22]:
all_ranked_sentences = []
all_matrxs = []
for sentences in all_sets_sentences:
    ranked_sentences, similarity_mtrx = lexrank.lexrank(sentences)
    all_ranked_sentences.append(ranked_sentences)
    all_matrxs.append(similarity_mtrx)

See the data


In [23]:
all_ranked_dfs = []
all_similarity_dfs = []
for ranked_sentences, similarity_mtrx in zip(all_ranked_sentences, all_matrxs):
    df_ranked_sentences = pd.DataFrame.from_dict(ranked_sentences,
                                                 orient='index')
    df_similarity_mtrx = pd.DataFrame(similarity_mtrx)
    all_ranked_dfs.append(df_ranked_sentences)
    all_similarity_dfs.append(df_similarity_mtrx)

In [40]:
all_top_sents = []
all_bot_sents = []
for df_ranked_sentences in all_ranked_dfs:
    mean_score = df_ranked_sentences.mean(axis=0)
    min_score = df_ranked_sentences.min(axis=0)
    tresh = mean_score*.90 + min_score*.1
    top_sent = list(df_ranked_sentences[df_ranked_sentences > tresh].dropna().index)
    bottom_sent = list(df_ranked_sentences[df_ranked_sentences < tresh].dropna().index)
    all_top_sents.append(top_sent)
    all_bot_sents.append(bottom_sent)
    print "{0} top sentences and {1} bottom_sentences".format(len(top_sent), len(bottom_sent))


132 top sentences and 58 bottom_sentences
146 top sentences and 57 bottom_sentences
205 top sentences and 85 bottom_sentences
124 top sentences and 54 bottom_sentences
115 top sentences and 55 bottom_sentences
125 top sentences and 59 bottom_sentences
189 top sentences and 89 bottom_sentences
107 top sentences and 49 bottom_sentences
135 top sentences and 61 bottom_sentences
117 top sentences and 61 bottom_sentences
  • How many of the links are made entirely of top sentences?
  • How many are made of top and bottom sentences?
  • How many of just bottom sentences?

Answering these questions will probably allow me to know if the saliency of these sentences given by lexrank is a good feature for finding links, or if it is just useful for finding the most important ones and that's it.


In [41]:
for top_sent, bottom_sent, article in zip(all_top_sents, all_bot_sents, all_articles):
    top_sent_set = { "s{0}".format(s) for s in top_sent }
    bot_sent_set = { "s{0}".format(s) for s in bottom_sent }

    both_top = 0
    one_top = 0
    both_bottom = 0
    other = 0
    link_dicts = article['links'].values()
    total = len(link_dicts)*1.0
    for l in link_dicts:
        s_art = l['art_sentence']
        s_com = l['com_sentence']
        if s_art in top_sent_set and s_com in top_sent_set:
            both_top += 1
            one_top += 1
        elif s_art in bot_sent_set and s_com in bot_sent_set:
            both_bottom += 1
        else:
            other += 1
            one_top += 1

    print "TOP: {0}, BOTTOM: {1}\nMIXED: {2}, AT LEAST ONE TOP: {3}\n\n"\
    .format(both_top/total,both_bottom/total,other/total, one_top/total)


TOP: 0.435714285714, BOTTOM: 0.1
MIXED: 0.464285714286, AT LEAST ONE TOP: 0.9


TOP: 0.5, BOTTOM: 0.0882352941176
MIXED: 0.411764705882, AT LEAST ONE TOP: 0.911764705882


TOP: 0.48743718593, BOTTOM: 0.100502512563
MIXED: 0.412060301508, AT LEAST ONE TOP: 0.899497487437


TOP: 0.489130434783, BOTTOM: 0.0869565217391
MIXED: 0.423913043478, AT LEAST ONE TOP: 0.913043478261


TOP: 0.556962025316, BOTTOM: 0.0886075949367
MIXED: 0.354430379747, AT LEAST ONE TOP: 0.911392405063


TOP: 0.0, BOTTOM: 0.0
MIXED: 1.0, AT LEAST ONE TOP: 1.0


TOP: 0.6, BOTTOM: 0.04
MIXED: 0.36, AT LEAST ONE TOP: 0.96


TOP: 0.538461538462, BOTTOM: 0.115384615385
MIXED: 0.346153846154, AT LEAST ONE TOP: 0.884615384615


TOP: 0.357142857143, BOTTOM: 0.214285714286
MIXED: 0.428571428571, AT LEAST ONE TOP: 0.785714285714


TOP: 0.5, BOTTOM: 0.0
MIXED: 0.5, AT LEAST ONE TOP: 1.0


This means that this can be used to just classify pairs that contain at least one top ranked sentence. Top ranked sentences are the ones that are above a certain treshold which depends on the min value and the mean value.

So now I calculate the pairs


In [52]:
from itertools import product
all_pairs = []
for top_sent, bottom_sent in zip(all_top_sents, all_bot_sents):
    top_sent_set = { "s{0}".format(s) for s in top_sent }
    bot_sent_set = { "s{0}".format(s) for s in bottom_sent }
    
    pairs = list(product(top_sent_set, bot_sent_set))
    all_pairs.append(pairs)
    print len(pairs)


7656
8322
17425
6696
6325
7375
16821
5243
8235
7137

I can further prune this list

Remove all pairs that have the same comment, or are both from the article.


In [55]:
all_pruned_pairs = []
for index, pairs in enumerate(all_pairs):
    art = all_articles[index]
    sents = art['sentences']
    pruned = [p for p in pairs 
              if sents[p[0]].get('comment', 'none') != 
                 sents[p[1]].get('comment', 'none')]
    all_pruned_pairs.append(pruned)
    print len(pruned)


7019
7632
16735
6399
6185
7164
15950
4960
7856
6808

Maybe I can do some pruning by finding semantic similarity between pairs

I am going to need to calculate entailment and wordnet still for the pair, so I can use this features to just keep pairs that are close enough, at least for the classification.

nltk has wordnet class with path_similarity that returns a score of path similarity saying how similar two word senses are.

Finding semantic similarity is done at a word level. I am going to have to make a vector with the semantic similarity of each word. ...

What should I do first?

Get the most important words (a.k.a. words with largest tfidf)?

Do part of speech tagging on all the words, and convert the parts of speech into the module attributes from wordnet?

Find semantic similarity between a pair of sentences by averaging?

Proposed approach to finding the links:

  • Vectorize the two sentences together, to find weights for the words.
  • Make an array or a set of unique words (features present in the two sentence corpus).
  • For each sentence, make a semantic similarity vector like in [1]
    • It is important to note that they find word similarity differently than the normal wordnet approach. Because of the similarity error explained in 3.2.1 (animal is closer to boy than teacher in wordnet, when it clearly is not. This is because depth is not taken into account when comparing the path to the other word.)
  • Calculate the cosine distance between the two similarity vectors, to get a similarity value.

After this, the similarity vectors of the pair can be one of the training features for the algorithm. We can also then add sentiment analysis as a feature, maybe. Another idea for a feature is to add something from wikipedia. Maybe make two similarity vectors, one from wordnet and one from wikipedia (how many links away is a term from another term) but only with the highest tfidf scoring words in both sentences.

I should get a way to weigh words in my sentences first

In the paper [1], they use the Brown Corpus to get the weight for each word in a similarity vector.

However, I propose using my entire sentence corpus (so the article and comments context) to get the weights. There are two ways to do this:

  1. Use TFIDF vectorizer on all the sentences and get the weight like that. I have to find a way to not remove stopwords. Stopwords are not removed by default! This is great.
  2. Use Count vectorizer on all the sentences to get a dictionary of the words and their frequencies in the whole corpus and per sentence. Have to figure out how to do this. Stopwords are not removed by default! This is great.

In [ ]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import numpy as np
import scipy.sparse as sp
from textblob import TextBlob

def get_weights_for_words_in_sentences(sentences):
    allblobs = TextBlob('. '.join(sentences))

    total_words = len(allblobs.words)
    total_unique = len(allblobs.word_counts)
    all_term_counts = np.array(allblobs.word_counts.values())
    print "\nAll words: ", total_words, all_term_counts.sum()
    print "Unique words: ", total_unique
    print "Overall sentiment: ", allblobs.sentiment
    infos = 1 - np.log(all_term_counts*1.0 + 1.0)  / np.log(all_term_counts.sum() + 1.0)
    info_dict = dict(zip(allblobs.word_counts.keys(), infos.tolist()))
        
    return info_dict
    
def get_weights_for_sentences(sentences):
    vectorizer = CountVectorizer()
#     transformer = TfidfTransformer()
    
    counts_matrix = vectorizer.fit_transform(sentences)
    all_term_counts = np.diff(sp.csc_matrix(counts_matrix, copy=False).indptr)
    infos = 1 - np.log(all_term_counts*1.0 + 1.0)  / np.log(all_term_counts.sum() + 1.0)
#     print "total words"
    print all_term_counts.sum()
    info_dict = {w: infos[i] for i, w in enumerate(vectorizer.get_feature_names())}
    return info_dict
    

info_dict = get_weights_for_sentences([s['text'] for s in all_articles[0]['sentences'].values()])
count_dict = get_weights_for_words_in_sentences([s['text'] for s in all_articles[0]['sentences'].values()])
count_frame = pd.DataFrame.from_dict(count_dict, orient='index')
frame = pd.DataFrame.from_dict(info_dict, orient='index')
count_dict.get('to', "not found")
# This shows the difference between the info dict obtained from
# text blob and from sklearn
# count_frame.sort_values(by=0, axis='index').ix[:11],\
# frame.sort_values(by=0, axis='index').ix[:11]

# This shows how the sklearn tokenizer is worse than textblobs...
# Well, not worse, but it finds way less words, and different words...
# Lets use the same tokenizer (textblob)
# print "Count dict differences"
# for x in count_dict:
#     if x not in info_dict:
#         print "{0} not in dict".format(x.encode('utf-8'))

# print "\n\nInfo dict differences"
# for x in info_dict:
#     if x not in count_dict:
#         print "{0} not in dict".format(x.encode('utf-8'))

In [356]:
from textblob import TextBlob, Word
from textblob.wordnet import VERB, NOUN, ADJ, ADV
import scipy
import re

re_noun = re.compile('.*N[NPR].*')
re_adj = re.compile('.*JJ.*')
re_verb = re.compile('.*(VB|BE|DO|HV).*')    
re_adv = re.compile('.*W?RB.*')
WORDNET_TRESH = 0.20

def pos_to_wordnetpos(pos):
    '''    
    NN or NP or NR = NOUN
    JJ = ADJECTIVE
    VB or BE or DO or HV = VERB
    WRB or RB = ADVERB
    '''
    if re_noun.search(pos):
        return NOUN
    elif re_adj.search(pos):
        return ADJ
    elif re_verb.search(pos):
        return VERB
    elif re_adv.search(pos):
        return ADV
    else:
        return None
def word_to_synset(pos_tagged_word):
    '''    
    NN or NP or NR = NOUN
    JJ = ADJECTIVE
    VB or BE or DO or HV = VERB
    WRB or RB = ADVERB
    '''
    w = pos_tagged_word[0]    
    pos = pos_to_wordnetpos(pos_tagged_word[1])
    return dissambiguate_synset(w, pos)
    
def dissambiguate_synset(word, wordnet_pos):
    synset = Word(word).get_synsets(wordnet_pos)
    if synset.__class__ == list:
        if len(synset) > 0:
            return synset[0]
        else:
            return None
    else:
        return synset
    


def pre_process_pair(blob1,blob2):
    # get the sentence POS tags and create the wordnet objects
    tagged_words1 = {w[0]: pos_to_wordnetpos(w[1]) for w in blob1.tags}
    tagged_words2 = {w[0]: pos_to_wordnetpos(w[1]) for w in blob2.tags}

    # create word set with unique words, and convert it to list for iteration
    synsets1 = {word_to_synset(w) for w in blob1.tags}
    synsets2 = {word_to_synset(w) for w in blob2.tags}
    words_corpus = list(synsets1.union(synsets2).difference({None}))
    
    return tagged_words1, tagged_words2, synsets1, synsets2, words_corpus

# function to find similarity of word with set of words
def similarity_with_words(synset1, blob, tag_dict):
    max_sim = 0
    most_similar_w = ''
    for w in blob.words:
        synset2 = dissambiguate_synset(w, tag_dict.get(w, None))
        
        if synset1 and synset2:
            if synset1 == synset2:
                max_s = 1
                most_similar_w = w
                break

            wordnet_sim = synset1.path_similarity(synset2)
            wordnet_sim = wordnet_sim if wordnet_sim > WORDNET_TRESH else 0

            if wordnet_sim > max_sim:
                max_sim = wordnet_sim
                most_similar_w = w
    
    # weight similarity using tfidf (or I can use real word frequencies)
    if most_similar_w != '':
        weight = weight_dict[most_similar_w.lower()]
        max_sim *= weight
        
    return max_sim




def find_pair_similarity(s1, s2):
    blob1 = TextBlob(s1)
    blob2 = TextBlob(s2)


    tagged_words1, tagged_words2, synsets1, synsets2, words_corpus = pre_process_pair(blob1, blob2)
#     print 'This is the word set:\n', ",".join([x._name for x in words_corpus])
    # form similarity vectors
    s1 = []
    s2 = []
    for i,synset in enumerate(words_corpus):
        # note that I should have the synsets in my corpus...
        s1.append(similarity_with_words(synset, blob1, tagged_words1))
        s2.append(similarity_with_words(synset, blob2, tagged_words2))

    similarity = scipy.spatial.distance.cosine(s1, s2)
    frame = pd.DataFrame([s1,s2])
    return similarity, frame

In [359]:
pairs = all_pruned_pairs[0]
article = all_articles[0]



def evaluate_links_in_article(article):
    test_article_sentences = [s['text'] for s in article['sentences'].values()]
    weight_dict = get_weights_for_words_in_sentences(
        test_article_sentences
    )
    
    for l in article['links'].values():
        links1 = l['art_sentence']
        links2 = l['com_sentence']
        s1 = article['sentences'][links1]['text']
        s2 = article['sentences'][links2]['text']

        similarity, frame = find_pair_similarity(s1,s2)
        similarity = 0 if np.isnan(similarity) else similarity
        print similarity
        
evaluate_links_in_article(article)


All words:  3390 3390
Unique words:  1200
Overall sentiment:  Sentiment(polarity=0.06807116415781189, subjectivity=0.46131531767895456)
0.515438414211
0
0.292893218813
0.292893218813
0.409433130126
0
0
0.223268997742
0.336800393644
0
0.0
0.203641324793
0.227749052338
0.0528401429185
0.30211521803
0.309566995842
0.101337906284
0.355928203744
0
0.237067868631
0.31958309783
0.251907721689
0.42984051692
0.395945016269
0.295534110408
0.348301454361
0.432093351983
0.350293443763
0.331176304321
0.203807400799
0.172678304159
0.189911717257
0.0367224007294
0.584124613355
0.331176304321
0.294554115959
0
0.642079853053
0.00946572957378
1.0
0.584075044545
0.148316915633
0.0
0.345739134858
0.40008728279
0.00500628663813
0
0.162478012113
0
0.40068443388
0.0131605263558
0
0.0269794869138
0.547025186079
0.0269794869138
0.0769397260391
0.128327128258
0.348491027973
0.137442219463
0.445600591072
0.172541059473
0.316676702081
1.0
0.521103846749
0.219131190557
0.30998024278
0
0.565139678859
0
0.000919263383704
0.186672894908
0.488336242782
0.069949158744
0.331176304321
0.0528401429185
0.557392938354
0
0.439443298174
0.538574703994
0.140040269991
0.292893218813
0.017823999087
0.0143310452471
0.0
0.063542310407
0.413538096916
0
0.209555261018
0.158839549342
0.391500885087
0.256083377965
0.663064597801
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-359-a93905a3fa22> in <module>()
     20         print similarity
     21 
---> 22 evaluate_links_in_article(article)

<ipython-input-359-a93905a3fa22> in evaluate_links_in_article(article)
     16         s2 = article['sentences'][links2]['text']
     17 
---> 18         similarity, frame = find_pair_similarity(s1,s2)
     19         similarity = 0 if np.isnan(similarity) else similarity
     20         print similarity

<ipython-input-356-e8db1003ab72> in find_pair_similarity(s1, s2)
    105         # note that I should have the synsets in my corpus...
    106         s1.append(similarity_with_words(synset, blob1, tagged_words1))
--> 107         s2.append(similarity_with_words(synset, blob2, tagged_words2))
    108 
    109     similarity = scipy.spatial.distance.cosine(s1, s2)

<ipython-input-356-e8db1003ab72> in similarity_with_words(synset1, blob, tag_dict)
     67     most_similar_w = ''
     68     for w in blob.words:
---> 69         synset2 = dissambiguate_synset(w, tag_dict[w])
     70 
     71         if synset1 and synset2:

KeyError: u'Maybe'

In [331]:
np.isnan(similarity)


Out[331]:
True

In [323]:
x


Out[323]:
0.0

In [ ]: