In [3]:
from nltk.corpus import wordnet as wn
from discoutils.tokens import DocumentFeature
In [4]:
phrases = set()
# collect every WordNet lemma whose name is a two-word phrase (exactly one underscore)
for s in wn.all_synsets():
    for lemma in s.lemmas():
        if lemma.name().count('_') == 1:
            phrases.add(lemma)
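These are Lemma objects rather than strings; the underscore-joined surface form comes from .name() and the owning synset from .synset(), both of which the later cells rely on. A quick peek (a sketch cell, not part of the original run):
In [ ]:
# sketch: a few sample two-word lemmas and the synsets they belong to
[(l.name(), l.synset().name()) for l in list(phrases)[:5]]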
In [5]:
len(phrases)
Out[5]:
In [6]:
mylist = list(phrases)[:10]
In [7]:
mylist[0].synset().path_similarity(mylist[1].synset())
Out[7]:
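For reference, path_similarity is 1 / (1 + d), where d is the shortest hypernym-path distance between the two synsets; identical synsets score 1.0, and it returns None when no connecting path exists. A minimal illustration with two stock noun synsets:
In [ ]:
# identical synsets score 1.0; dog/cat share a nearby hypernym, so they score well below 1
dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
dog.path_similarity(dog), dog.path_similarity(cat)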
In [9]:
with open('../../thesisgenerator/features_in_labelled/all_features.txt') as inf:
    my_phrases = set(map(str.strip, inf.readlines()))
In [10]:
len(my_phrases)
Out[10]:
In [11]:
# convert each feature string to an underscore-joined phrase, matching WordNet's lemma format
formatted_phrases = []
for p in my_phrases:
    p = DocumentFeature.from_string(p)
    f = '_'.join(t.text for t in p.tokens)
    formatted_phrases.append(f)
formatted_phrases[:10]
Out[11]:
In [12]:
# lemma names are already underscore-joined; lowercase them for comparison
wn_phrases = [x.name().lower() for x in phrases]
wn_phrases[:10]
Out[12]:
In [13]:
shared = set(wn_phrases).intersection(set(formatted_phrases))
In [14]:
len(shared)
Out[14]:
In [15]:
list(shared)[:10]
Out[15]:
In [16]:
# keep the Lemma objects whose (lowercased) names occur in both sets
lemmata = [x for x in phrases if x.name().lower() in shared]
len(lemmata)
Out[16]:
In [17]:
from collections import defaultdict
from itertools import combinations
import numpy as np
lemmata_index = {lemma: i for i, lemma in enumerate(lemmata)}
sims = np.zeros((len(lemmata), len(lemmata)))
for lemma1, lemma2 in combinations(lemmata, 2):
    p1, p2 = lemmata_index[lemma1], lemmata_index[lemma2]
    sim = lemma1.synset().path_similarity(lemma2.synset())
    if sim is None:  # no connecting path (e.g. across parts of speech); count as zero
        sim = 0.0
    sims[p1, p2] = sim
    sims[p2, p1] = sim
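With the matrix filled in, one way to inspect it (a sketch, assuming the cells above have run) is to pull out the most similar pair of phrases:
In [ ]:
# sketch: indices of the largest entry give the most similar pair
i, j = np.unravel_index(sims.argmax(), sims.shape)
lemmata[i].name(), lemmata[j].name(), sims[i, j]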
In [64]:
# per-column mean similarity; the diagonal was left at zero, so self-similarity is excluded
sims.mean(axis=0)
Out[64]:
In [66]:
# sanity check: combinations yields each unordered pair exactly once
list(combinations([1, 2, 3], 2))
Out[66]:
In [ ]: