First, let's create a dataset of word pairs that are either synonyms or not synonyms, using WordNet's synsets.
Note: "not synonyms" is not the same as antonyms; it simply means that for a pair (w1, w2) the word w2 is chosen from outside the set of synonyms of w1, i.e. we are performing negative sampling.
In [ ]:
import random
from itertools import combinations, chain
import gzip
import networkx as nx
from nltk.corpus import wordnet as wn
import matplotlib.pyplot as plt
from tqdm import tqdm
import editdistance
import pandas as pd
random.seed(0)
In [ ]:
def get_synsets(part_of_speeches=None, verbose=True):
    """
    Returns a dictionary where each key is a part of speech and each
    value is the list of all synsets for that POS. If `part_of_speeches`
    is `None`, verbs, nouns and adjectives are used.
    """
    if part_of_speeches is None:
        part_of_speeches = {'verb': 'v', 'noun': 'n', 'adjective': 'a'}
    pos_synsets = dict()
    for name, pos in part_of_speeches.items():
        pos_synsets[name] = list(wn.all_synsets(pos))
        if verbose:
            print(f"found {len(pos_synsets[name])} synsets for {name}")
    return pos_synsets
In [ ]:
pos_synsets = get_synsets()
In [ ]:
def _is_single_word(word):
    """
    Helper function for removing lemma names that contain multiple words
    separated by `-` or `_`.
    """
    return (('_' not in word) and ('-' not in word))


def get_syngraph_wordset(pos_synsets, verbose=True):
    """
    Uses the `pos_synsets` dictionary to create a dictionary with the
    same keys, where each value is a synonym graph built from the
    `.lemma_names()` of each synset, see: http://www.nltk.org/howto/wordnet.html
    """
    syn_graphs = dict()
    for pos, synsets in pos_synsets.items():
        syn_graphs[pos] = nx.Graph()
        for synset in synsets:
            lemma_names = [x for x in synset.lemma_names() if _is_single_word(x)]
            if len(lemma_names) > 1:
                syn_graphs[pos].add_edges_from(combinations(lemma_names, 2))
        if verbose:
            msg_fmt = "Found {} synonym pairs and {} unique words in {}"
            print(msg_fmt.format(len(syn_graphs[pos].edges),
                                 len(syn_graphs[pos].nodes),
                                 pos))
    return syn_graphs
In [ ]:
syn_graphs = get_syngraph_wordset(pos_synsets)
In [ ]:
print(list(syn_graphs['verb'].neighbors('change')))
In [ ]:
print(list(syn_graphs['noun'].neighbors('ocean')))
In [ ]:
print(list(syn_graphs['adjective'].neighbors('large')))
In [ ]:
def get_subgraph(graph, subset=['change', 'buy']):
    """
    Create a subgraph of `graph` containing only the nodes in `subset`
    and their neighbors.
    """
    nodes = []
    for node in subset:
        nodes.append(node)
        nodes.extend(graph.neighbors(node))
    subgraph = graph.subgraph(nodes)
    return subgraph
In [ ]:
subgraph = get_subgraph(syn_graphs['verb'], subset=['sell', 'buy', 'change'])
pos=nx.spring_layout(subgraph, iterations=150, k=1.5)
nx.draw(subgraph, pos=pos)
nx.draw_networkx_labels(subgraph, pos=pos, font_size=10)
plt.show()
# plt.savefig('graph.png')
In [ ]:
def create_dataset(syn_graphs, train_test_component_ratio=0.5,
                   negative_sampling_count=1):
    """
    Yields labelled (word1, word2) pairs for each POS graph: synonym pairs
    taken from the graph edges plus negatively sampled non-synonym pairs,
    split into train and test so that no word appears in both splits.
    """
    # per-POS multiplier on the train word budget
    weights = {'verb': 1.15,
               'adjective': 1.0,
               'noun': 1.0}
    for pos, graph in syn_graphs.items():
        edges = [(x, y) for (x, y) in graph.edges]
        all_words_len = len(graph.nodes)
        train_len = int(all_words_len * train_test_component_ratio)
        train_words = set()
        test_words = set()
        train_edges = list()
        test_edges = list()
        # assign edges to the train split until its word budget is reached,
        # the remaining edges go to the test split
        for (x, y) in edges:
            if len(train_words) < train_len * weights[pos]:
                train_edges.append((x, y))
                train_words.update({x, y})
            else:
                test_edges.append((x, y))
                test_words.update({x, y})
        print("{}: train words {}, test words {}, intersection {}, "
              "train only {}, test only {}".format(
                  pos, len(train_words), len(test_words),
                  len(train_words.intersection(test_words)),
                  len(train_words.difference(test_words)),
                  len(test_words.difference(train_words))))
        intersection = train_words.intersection(test_words)
        train_sampling = train_words.difference(intersection)
        test_sampling = test_words.difference(intersection)
        for (data, split) in zip([train_edges, test_edges], ['train', 'test']):
            for (w1, w2) in data:
                # skip near-identical spellings and pairs whose words
                # appear in both splits
                if editdistance.eval(w1, w2) < 2:
                    continue
                if (w1 in intersection) or (w2 in intersection):
                    continue
                pos_words = [w1]
                pos_words.extend(list(graph.neighbors(w1)))
                pos_words = set(pos_words)
                if split == 'train':
                    neg_words = list(train_sampling.difference(pos_words))
                else:
                    neg_words = list(test_sampling.difference(pos_words))
                neg_sampling = random.choices(neg_words, k=negative_sampling_count)
                yield {'word1': w1, 'word2': w2, 'synonym': 1, 'pos': pos, 'split': split}
                for w3 in neg_sampling:
                    yield {'word1': w1, 'word2': w3, 'synonym': 0, 'pos': pos, 'split': split}
In [ ]:
df = pd.DataFrame(list(create_dataset(syn_graphs, negative_sampling_count=10)))
In [ ]:
df.groupby(['pos', 'split', 'synonym']).size()
In [ ]:
df.loc[df.word1=='change'].head(20)
In [ ]:
df.to_csv('../datasets/synonym_dataset.csv.gz', compression='gzip')
In [ ]: