In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import gensim
import os

from networkx.drawing.nx_agraph import graphviz_layout
from chinese_whispers import chinese_whispers, aggregate_clusters
from gensim.models.poincare import PoincareModel
from nltk.corpus import wordnet as wn

Construct the NetworkX graph

From a CSV file
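
Each line of the relations file is tab-separated: column 0 is a numeric identifier (skipped via usecols below), and columns 1 and 2 hold the hyponym and hypernym. A hypothetical example row:

0	computer science	science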


In [2]:
def display_taxonomy(graph):
    """ Display the taxonomy in a hierarchical layout """
    pos = graphviz_layout(graph, prog='dot', args="-Grankdir=LR")
    plt.figure(3, figsize=(48, 144))
    nx.draw(graph, pos, with_labels=True, arrows=True)
    plt.show()

In [3]:
# Construct the networkx graph
def process_input(taxonomy):
    """ Read the taxonomy and generate a networkx graph """

    # The relations file is tab-separated; columns 1 and 2 hold the hyponym and hypernym
    df = pd.read_csv(
        taxonomy,
        sep='\t',
        header=None,
        names=['hyponym', 'hypernym'],
        usecols=[1, 2],
    )
    
    G = nx.DiGraph()
    for hypernym, hyponym in zip(df['hypernym'], df['hyponym']):
        # Simplify compound terms by replacing whitespace with underscores
        hypernym = '_'.join(hypernym.split())
        hyponym = '_'.join(hyponym.split())
        G.add_edge(hypernym, hyponym)
    
    return G

In [4]:
taxo_path = 'taxi_output/simple_full/science/science_en.csv-relations.csv-taxo-knn1.csv'
gs_path = 'eval/taxi_eval_archive/gold_standard/science.taxo'

G_taxo = process_input(taxo_path)
G_gold = process_input(gs_path)

In [10]:
print('Nodes in GS:', len(set(G_gold.nodes())))
print('Nodes in G Taxo:', len(set(G_taxo.nodes())))


Nodes in GS: 452
Nodes in G Taxo: 307

In [9]:
new_nodes = set(G_gold.nodes()) - set(G_taxo.nodes())
len(new_nodes)


Out[9]:
146

Load Word Vectors


In [11]:
def load_vectors():
    """ Load word vectors. """

    embedding_dir = '/home/5aly/taxi/distributed_semantics/embeddings/'

    poincare_model = PoincareModel.load(embedding_dir + 'embeddings_poincare_wordnet')  # parent-cluster relationship
    own_model = gensim.models.KeyedVectors.load(embedding_dir + 'own_embeddings_w2v')  # family-cluster relationship

    return poincare_model, own_model

In [12]:
poincare_w2v, own_w2v = load_vectors()
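
As a quick sanity check, both models can be queried directly. The terms below are hypothetical examples and will raise a KeyError if they are absent from the respective vocabularies; the Poincaré model is indexed by WordNet synset names, the word2vec model by plain terms.

In [ ]:
# Hypothetical sanity check of the two embedding spaces
print(poincare_w2v.kv.similarity('physics.n.01', 'science.n.01'))
print(own_w2v.similarity('physics', 'chemistry'))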

Improving the Taxonomy with Distributional Semantics

Create a NetworkX graph for each node that contains only its children, then draw edges among the children according to their pairwise similarity under the word vectors.


In [13]:
def create_children_clusters(own_model, graph):
    """ This function returns a dictionary where corresponding to each key(node) is a graph of its children """
    
    clustered_graph = {}
    for node in graph.nodes():
        clustered_graph[node] = nx.Graph()
        successors = [s.lower() for s in graph.successors(node)]

        for successor in successors:
            try:
                for word, _ in own_model.most_similar(successor, topn=100):
                    if word.lower() in successors:
                        clustered_graph[node].add_edge(successor, word.lower())
            except KeyError:  # If the word is not in the vocabulary, fall back to the substring-based method
                successor_terms = successor.split('_')
                if node in successor_terms:
                    clustered_graph[node].add_node(successor)
    
    return clustered_graph

In [14]:
GC = create_children_clusters(own_w2v, G_taxo)


/home/acharya/anaconda3/envs/tax3/lib/python3.6/site-packages/ipykernel_launcher.py:11: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  # This is added back by InteractiveShellApp.init_path()
/home/acharya/anaconda3/envs/tax3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

In [ ]:
posI = graphviz_layout(GC['engineering'])
# plt.figure(2, figsize=(20, 20))
nx.draw(GC['engineering'], posI, with_labels=True, arrows=True)
plt.show()

Implementing the Chinese Whispers Algorithm

Adding new nodes

  • Loop through all the new nodes, i.e. those present in the gold standard but missing from the taxonomy.
  • For each new node, find the parent and family cluster in the graph that have the maximum similarity with it, as sketched below.
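
A minimal sketch of what happens for a single node's children graph, using the 'engineering' node from above: chinese_whispers labels the nodes, and aggregate_clusters groups them into family clusters.

In [ ]:
# Cluster the children of one node, then group them into families by label
gc = chinese_whispers(GC['engineering'], weighting='top', iterations=60)
for label, family in aggregate_clusters(gc).items():
    print(label, sorted(family))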

In [15]:
G_improved = G_taxo.copy()

In [16]:
def calculate_similarity(poincare_model, own_model, parent, family, node, exclude_parent, exclude_family):
    """ Score how well `node` fits under `parent` and its family cluster """

    # Similarity between the parent and the node, via their WordNet senses in the Poincaré model
    parent_similarity = 0
    if not exclude_parent:
        node_senses = [n_sense.name() for n_sense in wn.synsets(node) if node in n_sense.name()]
        parent_senses = [p_sense.name() for p_sense in wn.synsets(parent) if parent in p_sense.name()]
        for parent_sense in parent_senses:
            for node_sense in node_senses:
                try:
                    similarity = poincare_model.kv.similarity(parent_sense, node_sense)
                    if similarity > parent_similarity:
                        parent_similarity = similarity
                except KeyError as e:
                    if parent_sense in str(e):
                        break
                    else:
                        continue
    
    # Average similarity between the node and the members of the candidate family
    family_similarity = 0
    if not exclude_family:
        family_similarities = []
        for f_item in family:
            try:
                family_similarities.append(own_model.similarity(f_item, node))
            except KeyError as e:  # skip the terms not in vocabulary
                if node in str(e):
                    break
                else:
                    continue
        if len(family_similarities) > 0:
            family_similarity = sum(family_similarities) / len(family_similarities)
    
    # Final score is the average of both the similarities
    return (parent_similarity + family_similarity) / 2
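
For illustration, the scorer can be called directly; the parent, family, and node below are made-up examples that only show the call signature.

In [ ]:
# Hypothetical call: how well would 'thermodynamics' fit under 'physics',
# given a made-up family cluster?
calculate_similarity(poincare_w2v, own_w2v, 'physics', {'optics', 'mechanics'}, 'thermodynamics', False, False)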

In [17]:
# Attach each new node under the parent whose family cluster scores highest
for node in new_nodes:
    max_score = 0
    max_score_node = ''
    for p_node, graph in GC.items():
        gc = chinese_whispers(graph, weighting='top', iterations=60)
        for label, family in aggregate_clusters(gc).items():
            score = calculate_similarity(poincare_w2v, own_w2v, p_node, family, node, False, False)
            if score > max_score:
                max_score = score
                max_score_node = p_node
    G_improved.add_edge(max_score_node, node)


/home/acharya/anaconda3/envs/tax3/lib/python3.6/site-packages/ipykernel_launcher.py:26: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).
/home/acharya/anaconda3/envs/tax3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Tuning the nodes and the edges


In [18]:
def tune_result(g_improved):
    """ Tune the result: drop blank labels, re-attach isolated compound nodes under a matching hypernym term, and remove the remaining isolates """

    print('\nTuning the result...')

    if '' in g_improved.nodes():
        g_improved.remove_node('')

    hypernyms = {x[0] for x in g_improved.edges()}
    isolated_nodes = list(nx.isolates(g_improved))
    for isolated_node in isolated_nodes:
        terms = isolated_node.split('_')
        if terms[-1] in hypernyms:
            g_improved.add_edge(terms[-1], isolated_node)
        elif terms[0] in hypernyms:
            g_improved.add_edge(terms[0], isolated_node)
        else:
            g_improved.remove_node(isolated_node)

    return g_improved
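
A toy illustration of the re-attachment heuristic (all node names hypothetical): an isolated compound node is attached under its last term if that term is a known hypernym, then under its first term, and dropped if neither matches.

In [ ]:
# Toy example of tune_result on a hypothetical graph
toy = nx.DiGraph()
toy.add_edge('physics', 'astrophysics')
toy.add_node('physics_education')  # isolated compound node
tune_result(toy)
print(list(toy.edges()))  # 'physics_education' is re-attached under 'physics'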

In [19]:
tune_result(G_improved)
print('Tuned.')


Tuning the result...
Tuned.

Save the result


In [ ]:
def save_result(result, path):
    print('\nSaving the result...')
    df_improved = pd.DataFrame(list(result.edges()), columns=['hypernym', 'hyponym'])
    df_improved = df_improved[df_improved.columns.tolist()[::-1]]

    # Replace the underscores with blanks
    df_improved['hyponym'] = df_improved['hyponym'].apply(lambda x: x.replace('_', ' '))
    df_improved['hypernym'] = df_improved['hypernym'].apply(lambda x: x.replace('_', ' '))

    # Store the result
    output_path = os.path.join(
        'taxi_output', 'distributional_semantics',
        os.path.basename(path) + '-new_ds' + os.path.splitext(path)[-1]
    )
    df_improved.to_csv(output_path, sep='\t', header=False)
    print('Output saved at:', output_path)

    return output_path

In [ ]:
output_path = save_result(G_improved, taxo_path)

Results visualization

Clusters


In [ ]:
def visualize_clusters(graph):
    """ Clusterize the nodes of a particular domain in a given graph """
    graph_cluster = chinese_whispers(graph, weighting='top', iterations=60)
    
    # Visualize the clustering of graph_cluster using NetworkX (requires matplotlib)
    colors = [1. / graph_cluster.node[node]['label'] for node in graph_cluster.nodes()]
    fig = plt.gcf()
    fig.set_size_inches(20, 20)
    nx.draw_networkx(graph_cluster, cmap=plt.get_cmap('jet'), node_color=colors, font_color='black')
    plt.show()

In [ ]:
GC_improved = create_children_clusters(own_w2v, G_improved)

In [ ]:
domain = 'mechanical_engineering'

In [ ]:
# Original clusters
visualize_clusters(GC[domain])

In [ ]:
# Clusters after detaching (GC_detached is assumed to come from a detaching step not shown in this notebook)
visualize_clusters(GC_detached[domain])

In [ ]:
# Clusters after detaching and re-attaching the clusters
visualize_clusters(GC_improved[domain])

Taxonomy


In [ ]:
# View the original taxonomy
display_taxonomy(G_taxo)

In [ ]:
# View the modified taxonomy
display_taxonomy(G_improved)

In [ ]:
len(list(G_taxo.nodes()))

In [ ]:
len(list(G_improved.nodes()))
