Preliminary study for an experiment using WordNet as a thesaurus

Use WordNet distance between phrases to provide neighbours.

First, check how many of the NPs in my corpora occur in WordNet.
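
For orientation: NLTK's path_similarity scores two synsets as 1 / (1 + length of the shortest path between them in the hypernym taxonomy), so scores lie in (0, 1]. A quick sanity check on a classic pair (not part of the study itself):

In [ ]:
# dog and cat are both carnivores, four hypernym edges apart,
# so path similarity is 1 / (1 + 4) = 0.2
from nltk.corpus import wordnet as wn
wn.synset('dog.n.01').path_similarity(wn.synset('cat.n.01'))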


In [3]:
from nltk.corpus import wordnet as wn
from discoutils.tokens import DocumentFeature

In [4]:
phrases = set()
# collect every WordNet lemma whose name contains exactly one
# underscore, i.e. all two-word phrases
for s in wn.all_synsets():
    for lemma in s.lemmas():
        if lemma.name().count('_') == 1:
            phrases.add(lemma)

In [5]:
len(phrases)


Out[5]:
54584

In [6]:
mylist = list(phrases)[:10]

In [7]:
mylist[0].synset().path_similarity(mylist[1].synset())


Out[7]:
0.05555555555555555

In [9]:
# one DocumentFeature string per line
with open('../../thesisgenerator/features_in_labelled/all_features.txt') as inf:
    my_phrases = set(line.strip() for line in inf)

In [10]:
len(my_phrases)


Out[10]:
50000

In [11]:
formatted_phrases = []
# convert each DocumentFeature string to the underscore-joined
# form WordNet uses for multi-word lemma names
for p in my_phrases:
    p = DocumentFeature.from_string(p)
    f = '_'.join(t.text for t in p.tokens)
    formatted_phrases.append(f)
formatted_phrases[:10]


Out[11]:
['isabel',
 'eisenstein_portrayal',
 'parrot_figure_thing',
 'water_don',
 'guantanamera_cover_territory',
 'package_furry',
 'chase_hummie',
 'youhot',
 'prefect_condition',
 'eastern_variety']
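
Note how noisy the corpus NPs are ('prefect_condition' is presumably a misspelled 'perfect_condition', and 'youhot' is not a phrase at all); this alone will depress WordNet coverage.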

In [12]:
wn_phrases = [x.name().lower() for x in phrases]
wn_phrases[:10]


Out[12]:
['genus_boletus',
 'artificial_joint',
 'take_after',
 'deficit_spending',
 'merlangus_merlangus',
 'climbing_frame',
 'free-reed_instrument',
 'cerebrospinal_meningitis',
 'ringworm_shrub',
 'genus_mayaca']

In [13]:
shared = set(wn_phrases).intersection(set(formatted_phrases))

In [14]:
len(shared)


Out[14]:
363

In [15]:
list(shared)[:10]


Out[15]:
['occipital_lobe',
 'taste_sensation',
 'discount_rate',
 'taxi_dancer',
 'left_hemisphere',
 'occupational_therapy',
 'broom_closet',
 'social_worker',
 'shasta_daisy',
 'change_course']
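
That is a tiny fraction of the 50,000 corpus NPs. Computing the coverage explicitly (numbers taken from the cell outputs above):

In [ ]:
# 363 shared phrases out of 50000 corpus NPs
len(shared) / len(my_phrases)  # = 0.00726, i.e. about 0.7% coverage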

Now build a thesaurus from these shared phrases using path similarity


In [16]:
lemmata = [x for x in phrases if x.name().lower() in shared]
len(lemmata)


Out[16]:
365
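
Note there are 365 lemmata for only 363 shared names: a lemma name can appear under more than one synset. A quick way to find the duplicated names (a sketch, not run here):

In [ ]:
from collections import Counter

# names that belong to more than one synset account for the
# difference between 363 shared names and 365 lemma objects
counts = Counter(x.name().lower() for x in lemmata)
[name for name, c in counts.items() if c > 1]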

In [17]:
from collections import defaultdict
from itertools import combinations
import numpy as np


lemmata_index = {b:a for a,b in enumerate(lemmata)}
sims = np.zeros((len(lemmata), len(lemmata)))

for i, (lemma1, lemma2) in enumerate(combinations(lemmata, 2)):
    p1, p2 = lemmata_index[lemma1], lemmata_index[lemma2]
    sim = lemma1.synset().path_similarity(lemma2.synset())
    sims[p1, p2] = sim
    sims[p2, p1] = sim
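
The sims matrix is only half a thesaurus; the neighbours still need to be read off it. A minimal sketch of that step, assuming we want the top k most similar phrases per entry (k is an assumption, not from the original run):

In [ ]:
# hypothetical: rank each phrase's neighbours by path similarity;
# NaN entries (pairs with no connecting path) are treated as 0
k = 5
clean = np.nan_to_num(sims)
thesaurus = {}
for lemma, idx in lemmata_index.items():
    top = np.argsort(clean[idx])[::-1][:k]
    thesaurus[lemma.name()] = [(lemmata[j].name(), clean[idx, j]) for j in top]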

In [64]:
sims.mean(axis=0)


Out[64]:
array([             nan,   7.58196100e-06,   4.33254914e-06, ...,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00])
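
The NaN in the column means comes from the pairs where path_similarity returned None (stored as NaN above). The quick check below just confirms that combinations yields each unordered pair exactly once and never pairs an item with itself, which is why the diagonal of sims is still zero.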

In [66]:
list(combinations([1,2,3], 2))


Out[66]:
[(1, 2), (1, 3), (2, 3)]
