In [1]:
import sys
import time

import pandas as pd
from tqdm import tqdm

In [2]:
from pywsd.utils import lemmatize
from pywsd.lesk import synset_signatures


Warming up PyWSD (takes ~10 secs)... took 8.82946491241455 secs.

In [3]:
from wn import WordNet
# Single WordNet handle for the notebook; the signature-building loop
# iterates over wn.all_synsets().
wn = WordNet()

In [4]:
# Keyword arguments passed unchanged to every synset_signatures() call below.
shared_options = dict(remove_stopwords=True, to_lemmatize=True,
                      remove_numbers=True, lowercase=True, from_cache=False)

# One record (dict) per synset; turned into a DataFrame in a later cell.
all_signatures = []

start = time.time()
for synset in tqdm(wn.all_synsets()):
    # Identifier of the form "<offset zero-padded to 8 chars>-<pos>",
    # e.g. 00001740-n.
    offset_pos = '-'.join([str(synset.offset()).zfill(8), synset.pos()])

    # Three signature variants, computed in the order simple -> adapted -> original.
    simple = synset_signatures(synset, hyperhypo=True, adapted=False,
                               **shared_options)
    adapted = synset_signatures(synset, hyperhypo=True, adapted=True,
                                **shared_options)
    original = synset_signatures(synset, original_lesk=True,
                                 **shared_options)

    all_signatures.append({
        'name': synset.name(),
        'offset-pos': offset_pos,
        'original': original,
        'simple': simple,
        'adapted': adapted,
    })

# Report the wall-clock duration of the whole pass on stderr.
elapsed = time.time() - start
print('took {}'.format(elapsed), file=sys.stderr)


117659it [24:02, 81.55it/s] 
took 1443.1147258281708

In [7]:
# One row per synset record collected in all_signatures; preview the first rows.
df = pd.DataFrame(data=all_signatures)
df.head(5)


Out[7]:
adapted name offset-pos original simple
0 {perceive, living, entity, distinct, existence... entity.n.01 00001740-n {or, existence, inferred, have, distinct, livi... {perceive, living, entity, distinct, existence...
1 {process, physical, entity, existence, cause, ... physical_entity.n.01 00001930-n {physical, entity, existence, an, has, that} {process, physical, entity, existence, cause, ...
2 {relation, feature, psychological_feature, for... abstraction.n.06 00002137-n {features, concept, general, formed, a, from, ... {relation, feature, psychological_feature, for...
3 {necessary, part, piece, subject, water, depic... thing.n.12 00002452-n {entity, a, and, self-contained, separate} {necessary, part, piece, subject, water, depic...
4 {token, part, neighbour, wall, cast, whole, sn... object.n.01 00002684-n {entity, cast, a, can, visible, tangible, and,... {token, part, neighbour, wall, cast, whole, sn...

In [8]:
# Index the records by synset name, then flip the frame so that each synset
# name becomes a column and the remaining fields become the rows.
indexed_by_name = df.set_index('name')
pywsd_signatures = indexed_by_name.transpose()
pywsd_signatures.head(5)


Out[8]:
name entity.n.01 physical_entity.n.01 abstraction.n.06 thing.n.12 object.n.01 whole.n.02 congener.n.03 living_thing.n.01 organism.n.01 benthos.n.02 ... suggestively.r.01 synergistically.r.02 synergistically.r.01 synonymously.r.01 taxonomically.r.01 topologically.r.01 ulteriorly.r.01 vexatiously.r.01 wafer-thin.r.01 wrongfully.r.01
adapted {perceive, living, entity, distinct, existence... {process, physical, entity, existence, cause, ... {relation, feature, psychological_feature, for... {necessary, part, piece, subject, water, depic... {token, part, neighbour, wall, cast, whole, sn... {part, compare, whole, regard, living_thing, b... {use, shopkeeper, person, congener, frequently... {living, entity, biont, whole, organism, anima... {sitter, zooid, prokaryote, saprophytic_organi... {benthos, near, organism, sea, animal, plant, ... ... {suggestive, suggestively, manner, smile} {synergistic, drug, interactive, manner, syner... {synergistic, group, cooperative, manner, syne... {use, synonymous, two, term, manner, synonymou... {regard, related, taxonomy, taxonomically, clo... {topologically, point, view, topology} {ulterior, ulteriorly, manner} {manner, vexatiously, vexatious} {wafer-thin, cut, thin} {dismiss, imprison, employee, release, unfair,...
offset-pos 00001740-n 00001930-n 00002137-n 00002452-n 00002684-n 00003553-n 00003993-n 00004258-n 00004475-n 00005787-n ... 00515573-r 00515681-r 00515803-r 00515914-r 00516034-r 00516150-r 00516244-r 00516322-r 00516401-r 00516492-r
original {or, existence, inferred, have, distinct, livi... {physical, entity, existence, an, has, that} {features, concept, general, formed, a, from, ... {entity, a, and, self-contained, separate} {entity, cast, a, can, visible, tangible, and,... {regarded, single, entity, a, assemblage, is, ... {or, person, ), a, whole, (, as, the, another,... {or, living, entity, ), a, once, (} {or, develop, living, act, ), a, function, can... {or, at, ), a, organisms, near, (, the, animal... ... {a, in, suggestive, manner} {or, synergistic, drugs, ), a, (, muscles, int... {synergistic, or, group, ), a, (, cooperative,... {a, in, synonymous, manner} {taxonomy, regard, with, to} {view, from, topology, the, of, point} {an, in, ulterior, manner} {a, in, manner, vexatious} {very, thin} {or, unfair, unjust, manner, an, in}
simple {perceive, living, entity, distinct, existence... {process, physical, entity, existence, cause, ... {relation, feature, psychological_feature, for... {necessary, part, piece, subject, water, depic... {token, part, neighbour, wall, cast, whole, sn... {part, compare, whole, regard, living_thing, b... {use, shopkeeper, person, congener, frequently... {living, entity, biont, whole, organism, anima... {sitter, zooid, prokaryote, saprophytic_organi... {benthos, near, organism, sea, animal, plant, ... ... {suggestive, suggestively, manner, smile} {synergistic, drug, interactive, manner, syner... {synergistic, group, cooperative, manner, syne... {use, synonymous, two, term, manner, synonymou... {regard, related, taxonomy, taxonomically, clo... {topologically, point, view, topology} {ulterior, ulteriorly, manner} {manner, vexatiously, vexatious} {wafer-thin, cut, thin} {dismiss, imprison, employee, release, unfair,...

4 rows × 117659 columns


In [9]:
# Persist the transposed signature table as a pickle (protocol 2).
pywsd_signatures.to_pickle(
    'pywsd/data/signatures/signatures.pkl',
    protocol=2,
)

In [13]:
# Spot-check: show every stored field for a single synset column.
pywsd_signatures.loc[:, 'younger.s.01']


Out[13]:
adapted       {name, use, person, son, two, jr, jr., young, ...
offset-pos                                           02101382-s
original      {name, distinguish, son, persons, two, a, from...
simple        {name, use, person, son, two, jr, jr., young, ...
Name: younger.s.01, dtype: object

In [ ]: