First, let's create a dataset of word pairs that are either synonyms or not synonyms, using WordNet's synsets.
Note: "not synonyms" is not the same as antonyms; it simply means that for a pair (w1, w2) the word w2 is chosen from outside the set of synonyms of w1, i.e. we are performing negative sampling.
In [ ]:
import random
from itertools import combinations, chain
import gzip
import networkx as nx
from nltk.corpus import wordnet as wn
import matplotlib.pyplot as plt
from tqdm import tqdm
import editdistance
import pandas as pd
random.seed(0)
In [ ]:
def get_synsets(part_of_speeches=None, verbose=True):
    """
    Returns a dictionary where each key is a part of speech and each
    value is the list of all synsets for that POS. If `part_of_speeches`
    is `None`, verbs, nouns and adjectives are used.
    """
    if part_of_speeches is None:
        part_of_speeches = {'verb': 'v', 'noun': 'n', 'adjective': 'a'}
    pos_synsets = dict()
    for name, pos in part_of_speeches.items():
        pos_synsets[name] = list(wn.all_synsets(pos))
        if verbose:
            print(f"found {len(pos_synsets[name])} synsets for {name}")
    return pos_synsets
In [ ]:
pos_synsets = get_synsets()
In [ ]:
def _is_single_word(word):
    """
    Helper function for removing lemma names that contain multiple words
    separated by `-` or `_`.
    """
    return (('_' not in word) and ('-' not in word))


def get_syngraph_wordset(pos_synsets, verbose=True):
    """
    Uses the `pos_synsets` dictionary to create a dictionary with the
    same keys, where each value is a synonym graph built from the
    `.lemma_names()` of each synset, see: http://www.nltk.org/howto/wordnet.html
    """
    syn_graphs = dict()
    for pos, synsets in pos_synsets.items():
        syn_graphs[pos] = nx.Graph()
        for synset in synsets:
            lemma_names = [x for x in synset.lemma_names() if _is_single_word(x)]
            if len(lemma_names) > 1:
                syn_graphs[pos].add_edges_from(combinations(lemma_names, 2))
        if verbose:
            msg_fmt = "Found {} synonym pairs and {} unique words in {}"
            print(msg_fmt.format(len(syn_graphs[pos].edges),
                                 len(syn_graphs[pos].nodes),
                                 pos))
    return syn_graphs
In [ ]:
syn_graphs = get_syngraph_wordset(pos_synsets)
In [ ]:
print(list(syn_graphs['verb'].neighbors('change')))
In [ ]:
print(list(syn_graphs['noun'].neighbors('ocean')))
In [ ]:
print(list(syn_graphs['adjective'].neighbors('large')))
In [ ]:
def get_subgraph(graph, subset=['change', 'buy']):
    """
    Create a subgraph of `graph` containing only the nodes in `subset`
    and their neighbors.
    """
    nodes = []
    for node in subset:
        nodes.append(node)
        nodes.extend(graph.neighbors(node))
    subgraph = graph.subgraph(nodes)
    return subgraph
In [ ]:
subgraph = get_subgraph(syn_graphs['verb'], subset=['sell', 'buy', 'change'])
pos=nx.spring_layout(subgraph, iterations=150, k=1.5)
nx.draw(subgraph, pos=pos)
nx.draw_networkx_labels(subgraph, pos=pos, font_size=10)
plt.show()
# plt.savefig('graph.png')
In [ ]:
def create_dataset(syn_graphs, train_test_component_ratio=0.5,
                   negative_sampling_count=1):
    """
    Yields labelled (word1, word2) pairs for each POS graph: synonym pairs
    taken from the graph edges plus negatively sampled non-synonym pairs,
    split into train and test so that no word appears in both splits.
    """
    # per-POS multiplier on the train word budget
    weights = {'verb': 1.15,
               'adjective': 1.0,
               'noun': 1.0}
    for pos, graph in syn_graphs.items():
        edges = [(x, y) for (x, y) in graph.edges]
        all_words_len = len(graph.nodes)
        train_len = int(all_words_len * train_test_component_ratio)
        train_words = set()
        test_words = set()
        train_edges = list()
        test_edges = list()
        # assign edges to the train split until its word budget is reached,
        # the remaining edges go to the test split
        for (x, y) in edges:
            if len(train_words) < train_len * weights[pos]:
                train_edges.append((x, y))
                train_words.update({x, y})
            else:
                test_edges.append((x, y))
                test_words.update({x, y})
        print("{}: train words {}, test words {}, intersection {}, "
              "train only {}, test only {}".format(
                  pos, len(train_words), len(test_words),
                  len(train_words.intersection(test_words)),
                  len(train_words.difference(test_words)),
                  len(test_words.difference(train_words))))
        intersection = train_words.intersection(test_words)
        train_sampling = train_words.difference(intersection)
        test_sampling = test_words.difference(intersection)
        for (data, split) in zip([train_edges, test_edges], ['train', 'test']):
            for (w1, w2) in data:
                # skip near-identical spellings and pairs whose words
                # appear in both splits
                if editdistance.eval(w1, w2) < 2:
                    continue
                if (w1 in intersection) or (w2 in intersection):
                    continue
                pos_words = [w1]
                pos_words.extend(list(graph.neighbors(w1)))
                pos_words = set(pos_words)
                if split == 'train':
                    neg_words = list(train_sampling.difference(pos_words))
                else:
                    neg_words = list(test_sampling.difference(pos_words))
                neg_sampling = random.choices(neg_words, k=negative_sampling_count)
                yield {'word1': w1, 'word2': w2, 'synonym': 1, 'pos': pos, 'split': split}
                for w3 in neg_sampling:
                    yield {'word1': w1, 'word2': w3, 'synonym': 0, 'pos': pos, 'split': split}
In [ ]:
df = pd.DataFrame(list(create_dataset(syn_graphs, negative_sampling_count=10)))
In [ ]:
df.groupby(['pos', 'split', 'synonym']).size()
In [ ]:
df.loc[df.word1=='change'].head(20)
In [ ]:
df.to_csv('../datasets/synonym_dataset.csv.gz', compression='gzip')
In [ ]: