Preliminary study for an experiment using WordNet as a thesaurus

Use WordNet distance between phrases to provide neighbours.

First, check how many of the NPs in my corpora occur in WordNet.
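
For orientation: NLTK's path_similarity scores two synsets as 1 / (1 + length of the shortest path between them in the hypernym taxonomy), so scores lie in (0, 1]. A quick sanity check on a classic pair (not part of the study itself):

In [ ]:
# dog and cat are both carnivores, four hypernym edges apart,
# so path similarity is 1 / (1 + 4) = 0.2
from nltk.corpus import wordnet as wn
wn.synset('dog.n.01').path_similarity(wn.synset('cat.n.01'))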


In [3]:
from nltk.corpus import wordnet as wn
from discoutils.tokens import DocumentFeature

In [4]:
phrases = set()
# collect every WordNet lemma whose name contains exactly one
# underscore, i.e. all two-word phrases
for s in wn.all_synsets():
    for lemma in s.lemmas():
        if lemma.name().count('_') == 1:
            phrases.add(lemma)

In [5]:
len(phrases)


Out[5]:
54584

In [6]:
mylist = list(phrases)[:10]

In [7]:
mylist[0].synset().path_similarity(mylist[1].synset())


Out[7]:
0.05555555555555555

In [9]:
# one DocumentFeature string per line
with open('../../thesisgenerator/features_in_labelled/all_features.txt') as inf:
    my_phrases = set(line.strip() for line in inf)

In [10]:
len(my_phrases)


Out[10]:
50000

In [11]:
formatted_phrases = []
# convert each DocumentFeature string to the underscore-joined
# form WordNet uses for multi-word lemma names
for p in my_phrases:
    p = DocumentFeature.from_string(p)
    f = '_'.join(t.text for t in p.tokens)
    formatted_phrases.append(f)
formatted_phrases[:10]


Out[11]:
['isabel',
 'eisenstein_portrayal',
 'parrot_figure_thing',
 'water_don',
 'guantanamera_cover_territory',
 'package_furry',
 'chase_hummie',
 'youhot',
 'prefect_condition',
 'eastern_variety']
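
Note how noisy the corpus NPs are ('prefect_condition' is presumably a misspelled 'perfect_condition', and 'youhot' is not a phrase at all); this alone will depress WordNet coverage.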

In [12]:
wn_phrases = [x.name().lower() for x in phrases]
wn_phrases[:10]


Out[12]:
['genus_boletus',
 'artificial_joint',
 'take_after',
 'deficit_spending',
 'merlangus_merlangus',
 'climbing_frame',
 'free-reed_instrument',
 'cerebrospinal_meningitis',
 'ringworm_shrub',
 'genus_mayaca']

In [13]:
shared = set(wn_phrases).intersection(set(formatted_phrases))

In [14]:
len(shared)


Out[14]:
363

In [15]:
list(shared)[:10]


Out[15]:
['occipital_lobe',
 'taste_sensation',
 'discount_rate',
 'taxi_dancer',
 'left_hemisphere',
 'occupational_therapy',
 'broom_closet',
 'social_worker',
 'shasta_daisy',
 'change_course']
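
That is a tiny fraction of the 50,000 corpus NPs. Computing the coverage explicitly (numbers taken from the cell outputs above):

In [ ]:
# 363 shared phrases out of 50000 corpus NPs
len(shared) / len(my_phrases)  # = 0.00726, i.e. about 0.7% coverage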

Now build a thesaurus from these shared phrases using path similarity


In [16]:
lemmata = [x for x in phrases if x.name().lower() in shared]
len(lemmata)


Out[16]:
365
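
Note there are 365 lemmata for only 363 shared names: a lemma name can appear under more than one synset. A quick way to find the duplicated names (a sketch, not run here):

In [ ]:
from collections import Counter

# names that belong to more than one synset account for the
# difference between 363 shared names and 365 lemma objects
counts = Counter(x.name().lower() for x in lemmata)
[name for name, c in counts.items() if c > 1]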

In [17]:
from collections import defaultdict
from itertools import combinations
import numpy as np


lemmata_index = {b:a for a,b in enumerate(lemmata)}
sims = np.zeros((len(lemmata), len(lemmata)))

for i, (lemma1, lemma2) in enumerate(combinations(lemmata, 2)):
    p1, p2 = lemmata_index[lemma1], lemmata_index[lemma2]
    sim = lemma1.synset().path_similarity(lemma2.synset())
    sims[p1, p2] = sim
    sims[p2, p1] = sim
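
The sims matrix is only half a thesaurus; the neighbours still need to be read off it. A minimal sketch of that step, assuming we want the top k most similar phrases per entry (k is an assumption, not from the original run):

In [ ]:
# hypothetical: rank each phrase's neighbours by path similarity;
# NaN entries (pairs with no connecting path) are treated as 0
k = 5
clean = np.nan_to_num(sims)
thesaurus = {}
for lemma, idx in lemmata_index.items():
    top = np.argsort(clean[idx])[::-1][:k]
    thesaurus[lemma.name()] = [(lemmata[j].name(), clean[idx, j]) for j in top]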

In [64]:
sims.mean(axis=0)


Out[64]:
array([             nan,   7.58196100e-06,   4.33254914e-06, ...,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00])
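
The NaN in the column means comes from the pairs where path_similarity returned None (stored as NaN above). The quick check below just confirms that combinations yields each unordered pair exactly once and never pairs an item with itself, which is why the diagonal of sims is still zero.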

In [66]:
list(combinations([1,2,3], 2))


Out[66]:
[(1, 2), (1, 3), (2, 3)]
