In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import gensim
import os

from networkx.drawing.nx_agraph import graphviz_layout
from chinese_whispers import chinese_whispers, aggregate_clusters
from gensim.models.poincare import PoincareModel
from nltk.corpus import wordnet as wn

Construct the NetworkX graph

From a CSV file
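
Each line of the relations file is tab-separated: column 0 is a numeric identifier (skipped via usecols below), and columns 1 and 2 hold the hyponym and hypernym. A hypothetical example row:

0	computer science	science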


In [2]:
def display_taxonomy(graph):
    """ Display the taxonomy in a hierarchical layout """
    pos = graphviz_layout(graph, prog='dot', args="-Grankdir=LR")
    plt.figure(3, figsize=(48, 144))
    nx.draw(graph, pos, with_labels=True, arrows=True)
    plt.show()

In [3]:
# Construct the networkx graph
def process_input(taxonomy):
    """ Read the taxonomy and generate a networkx graph """

    # The relations file is tab-separated; columns 1 and 2 hold the hyponym and hypernym
    df = pd.read_csv(
        taxonomy,
        sep='\t',
        header=None,
        names=['hyponym', 'hypernym'],
        usecols=[1, 2],
    )
    
    G = nx.DiGraph()
    for hypernym, hyponym in zip(df['hypernym'], df['hyponym']):
        # Simplify compound terms by replacing whitespace with underscores
        hypernym = '_'.join(hypernym.split())
        hyponym = '_'.join(hyponym.split())
        G.add_edge(hypernym, hyponym)
    
    return G

In [4]:
taxo_path = 'taxi_output/simple_full/science/science_en.csv-relations.csv-taxo-knn1.csv'
gs_path = 'eval/taxi_eval_archive/gold_standard/science.taxo'

G_taxo = process_input(taxo_path)
G_gold = process_input(gs_path)

In [10]:
print('Nodes in GS:', len(set(G_gold.nodes())))
print('Nodes in G Taxo:', len(set(G_taxo.nodes())))


Nodes in GS: 452
Nodes in G Taxo: 307

In [9]:
new_nodes = set(G_gold.nodes()) - set(G_taxo.nodes())
len(new_nodes)


Out[9]:
146

Load Word Vectors


In [11]:
def load_vectors():
    """ Load word vectors. """

    embedding_dir = '/home/5aly/taxi/distributed_semantics/embeddings/'

    poincare_model = PoincareModel.load(embedding_dir + 'embeddings_poincare_wordnet')  # parent-cluster relationship
    own_model = gensim.models.KeyedVectors.load(embedding_dir + 'own_embeddings_w2v')  # family-cluster relationship

    return poincare_model, own_model

In [12]:
poincare_w2v, own_w2v = load_vectors()
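
As a quick sanity check, both models can be queried directly. The terms below are hypothetical examples and will raise a KeyError if they are absent from the respective vocabularies; the Poincaré model is indexed by WordNet synset names, the word2vec model by plain terms.

In [ ]:
# Hypothetical sanity check of the two embedding spaces
print(poincare_w2v.kv.similarity('physics.n.01', 'science.n.01'))
print(own_w2v.similarity('physics', 'chemistry'))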

Improving the Taxonomy with Distributional Semantics

Create a NetworkX graph for each node that contains only its children, then draw edges among the children according to their pairwise similarity under the word vectors.


In [13]:
def create_children_clusters(own_model, graph):
    """ This function returns a dictionary where corresponding to each key(node) is a graph of its children """
    
    clustered_graph = {}
    for node in graph.nodes():
        clustered_graph[node] = nx.Graph()
        successors = [s.lower() for s in graph.successors(node)]

        for successor in successors:
            try:
                for word, _ in own_model.most_similar(successor, topn=100):
                    if word.lower() in successors:
                        clustered_graph[node].add_edge(successor, word.lower())
            except KeyError:  # If the word is not in the vocabulary, fall back to the substring-based method
                successor_terms = successor.split('_')
                if node in successor_terms:
                    clustered_graph[node].add_node(successor)
    
    return clustered_graph

In [14]:
GC = create_children_clusters(own_w2v, G_taxo)


/home/acharya/anaconda3/envs/tax3/lib/python3.6/site-packages/ipykernel_launcher.py:11: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
  # This is added back by InteractiveShellApp.init_path()
/home/acharya/anaconda3/envs/tax3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

In [ ]:
posI = graphviz_layout(GC['engineering'])
# plt.figure(2, figsize=(20, 20))
nx.draw(GC['engineering'], posI, with_labels=True, arrows=True)
plt.show()

Implementing the Chinese Whispers Algorithm

Adding new nodes

  • Loop through all the new nodes, i.e. those present in the gold standard but missing from the taxonomy.
  • For each new node, find the parent and family cluster in the graph that have the maximum similarity with it, as sketched below.
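
A minimal sketch of what happens for a single node's children graph, using the 'engineering' node from above: chinese_whispers labels the nodes, and aggregate_clusters groups them into family clusters.

In [ ]:
# Cluster the children of one node, then group them into families by label
gc = chinese_whispers(GC['engineering'], weighting='top', iterations=60)
for label, family in aggregate_clusters(gc).items():
    print(label, sorted(family))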

In [15]:
G_improved = G_taxo.copy()

In [16]:
def calculate_similarity(poincare_model, own_model, parent, family, node, exclude_parent, exclude_family):
    """ Score how well `node` fits under `parent` and its family cluster """

    # Similarity between the parent and the node, via their WordNet senses in the Poincaré model
    parent_similarity = 0
    if not exclude_parent:
        node_senses = [n_sense.name() for n_sense in wn.synsets(node) if node in n_sense.name()]
        parent_senses = [p_sense.name() for p_sense in wn.synsets(parent) if parent in p_sense.name()]
        for parent_sense in parent_senses:
            for node_sense in node_senses:
                try:
                    similarity = poincare_model.kv.similarity(parent_sense, node_sense)
                    if similarity > parent_similarity:
                        parent_similarity = similarity
                except KeyError as e:
                    if parent_sense in str(e):
                        break
                    else:
                        continue
    
    # Average similarity between the node and the members of the candidate family
    family_similarity = 0
    if not exclude_family:
        family_similarities = []
        for f_item in family:
            try:
                family_similarities.append(own_model.similarity(f_item, node))
            except KeyError as e:  # skip the terms not in vocabulary
                if node in str(e):
                    break
                else:
                    continue
        if len(family_similarities) > 0:
            family_similarity = sum(family_similarities) / len(family_similarities)
    
    # Final score is the average of both the similarities
    return (parent_similarity + family_similarity) / 2
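
For illustration, the scorer can be called directly; the parent, family, and node below are made-up examples that only show the call signature.

In [ ]:
# Hypothetical call: how well would 'thermodynamics' fit under 'physics',
# given a made-up family cluster?
calculate_similarity(poincare_w2v, own_w2v, 'physics', {'optics', 'mechanics'}, 'thermodynamics', False, False)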

In [17]:
# Attach each new node under the parent whose family cluster scores highest
for node in new_nodes:
    max_score = 0
    max_score_node = ''
    for p_node, graph in GC.items():
        gc = chinese_whispers(graph, weighting='top', iterations=60)
        for label, family in aggregate_clusters(gc).items():
            score = calculate_similarity(poincare_w2v, own_w2v, p_node, family, node, False, False)
            if score > max_score:
                max_score = score
                max_score_node = p_node
    G_improved.add_edge(max_score_node, node)


/home/acharya/anaconda3/envs/tax3/lib/python3.6/site-packages/ipykernel_launcher.py:26: DeprecationWarning: Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).
/home/acharya/anaconda3/envs/tax3/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):

Tuning the nodes and the edges


In [18]:
def tune_result(g_improved):
    """ Tune the result: drop blank labels, re-attach isolated compound nodes under a matching hypernym term, and remove the remaining isolates """

    print('\nTuning the result...')

    if '' in g_improved.nodes():
        g_improved.remove_node('')

    hypernyms = {x[0] for x in g_improved.edges()}
    isolated_nodes = list(nx.isolates(g_improved))
    for isolated_node in isolated_nodes:
        terms = isolated_node.split('_')
        if terms[-1] in hypernyms:
            g_improved.add_edge(terms[-1], isolated_node)
        elif terms[0] in hypernyms:
            g_improved.add_edge(terms[0], isolated_node)
        else:
            g_improved.remove_node(isolated_node)

    return g_improved
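
A toy illustration of the re-attachment heuristic (all node names hypothetical): an isolated compound node is attached under its last term if that term is a known hypernym, then under its first term, and dropped if neither matches.

In [ ]:
# Toy example of tune_result on a hypothetical graph
toy = nx.DiGraph()
toy.add_edge('physics', 'astrophysics')
toy.add_node('physics_education')  # isolated compound node
tune_result(toy)
print(list(toy.edges()))  # 'physics_education' is re-attached under 'physics'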

In [19]:
tune_result(G_improved)
print('Tuned.')


Tuning the result...
Tuned.

Save the result


In [ ]:
def save_result(result, path):
    print('\nSaving the result...')
    df_improved = pd.DataFrame(list(result.edges()), columns=['hypernym', 'hyponym'])
    df_improved = df_improved[df_improved.columns.tolist()[::-1]]

    # Replace the underscores with blanks
    df_improved['hyponym'] = df_improved['hyponym'].apply(lambda x: x.replace('_', ' '))
    df_improved['hypernym'] = df_improved['hypernym'].apply(lambda x: x.replace('_', ' '))

    # Store the result
    output_path = os.path.join(
        'taxi_output', 'distributional_semantics',
        os.path.basename(path) + '-new_ds' + os.path.splitext(path)[-1]
    )
    df_improved.to_csv(output_path, sep='\t', header=False)
    print('Output saved at:', output_path)

    return output_path

In [ ]:
output_path = save_result(G_improved, taxo_path)

Results visualization

Clusters


In [ ]:
def visualize_clusters(graph):
    """ Clusterize the nodes of a particular domain in a given graph """
    graph_cluster = chinese_whispers(graph, weighting='top', iterations=60)
    
    # Visualize the clustering of graph_cluster using NetworkX (requires matplotlib)
    colors = [1. / graph_cluster.node[node]['label'] for node in graph_cluster.nodes()]
    fig = plt.gcf()
    fig.set_size_inches(20, 20)
    nx.draw_networkx(graph_cluster, cmap=plt.get_cmap('jet'), node_color=colors, font_color='black')
    plt.show()

In [ ]:
GC_improved = create_children_clusters(own_w2v, G_improved)

In [ ]:
domain = 'mechanical_engineering'

In [ ]:
# Original clusters
visualize_clusters(GC[domain])

In [ ]:
# Clusters after detaching (GC_detached is assumed to come from a detaching step not shown in this notebook)
visualize_clusters(GC_detached[domain])

In [ ]:
# Clusters after detaching and re-attaching the clusters
visualize_clusters(GC_improved[domain])

Taxonomy


In [ ]:
# View the original taxonomy
display_taxonomy(G_taxo)

In [ ]:
# View the modified taxonomy
display_taxonomy(G_improved)

In [ ]:
len(list(G_taxo.nodes()))

In [ ]:
len(list(G_improved.nodes()))
