In [1]:
import pandas as pd
import networkx as nx
import subprocess
import matplotlib.pyplot as plt
import gensim
import os
from networkx.drawing.nx_agraph import graphviz_layout
from chinese_whispers import chinese_whispers, aggregate_clusters
from gensim.models.poincare import PoincareModel
from nltk.corpus import wordnet as wn
In [2]:
def display_taxonomy(graph):
    """Render the taxonomy graph in a hierarchical left-to-right (dot) layout."""
    layout = graphviz_layout(graph, prog='dot', args="-Grankdir=LR")
    plt.figure(3, figsize=(48, 144))
    nx.draw(graph, layout, with_labels=True, arrows=True)
    plt.show()
In [3]:
# Construct the networkx graph
# Construct the networkx graph
def process_input(taxonomy):
    """Read a tab-separated taxonomy file and build a directed hypernym -> hyponym graph.

    Columns 1 and 2 of the file are interpreted as hyponym and hypernym
    respectively; multi-word terms are normalised by joining their
    whitespace-separated parts with underscores.
    """
    relations = pd.read_csv(
        taxonomy,
        sep='\t',
        header=None,
        names=['hyponym', 'hypernym'],
        usecols=[1, 2],
    )
    graph = nx.DiGraph()
    for hypernym, hyponym in zip(list(relations['hypernym']), list(relations['hyponym'])):
        # Simplify compound words by replacing whitespace runs with underscores
        if ' ' in hypernym:
            hypernym = '_'.join(hypernym.split())
        if ' ' in hyponym:
            hyponym = '_'.join(hyponym.split())
        graph.add_edge(hypernym, hyponym)
    return graph
In [4]:
# Paths to the generated (TAXI) taxonomy and the gold-standard taxonomy (science domain)
taxo_path = 'taxi_output/simple_full/science/science_en.csv-relations.csv-taxo-knn1.csv'
gs_path = 'eval/taxi_eval_archive/gold_standard/science.taxo'
G_taxo = process_input(taxo_path)
G_gold = process_input(gs_path)
In [10]:
# Compare vocabulary sizes of the gold standard and the generated taxonomy
print('Nodes in GS:', len(set(G_gold.nodes())))
print('Nodes in G Taxo:', len(set(G_taxo.nodes())))
In [9]:
# Gold-standard terms missing from the generated taxonomy —
# these are the nodes we try to attach below
new_nodes = set(G_gold.nodes()) - set(G_taxo.nodes())
len(new_nodes)
Out[9]:
In [11]:
def load_vectors(embedding_dir='/home/5aly/taxi/distributed_semantics/embeddings/'):
    """Load the pre-trained embedding models.

    Args:
        embedding_dir: directory containing the embedding files. Kept as a
            defaulted parameter so the hard-coded absolute path can be
            overridden without editing the function.

    Returns:
        (poincare_model, own_model): Poincare embeddings used for the
        parent-cluster relationship and word2vec KeyedVectors used for the
        family-cluster relationship.
    """
    # BUG FIX: removed the redundant `poincare_model = model = ...` double assignment
    poincare_model = PoincareModel.load(os.path.join(embedding_dir, 'embeddings_poincare_wordnet'))  # parent-cluster relationship
    own_model = gensim.models.KeyedVectors.load(os.path.join(embedding_dir, 'own_embeddings_w2v'))  # family-cluster relationship
    return poincare_model, own_model
In [12]:
# Load both embedding models once; reused by all similarity computations below
poincare_w2v, own_w2v = load_vectors()
For each node, create a networkx graph containing only its children. Draw edges between children whose word vectors are similar to one another.
In [13]:
def create_children_clusters(own_model, graph):
    """Build, for each node, an undirected graph over its (lower-cased) children.

    Two children are connected when one appears among the top-100 most similar
    words of the other in `own_model`. An out-of-vocabulary child is kept as an
    isolated node only when the parent term occurs among its compound parts.
    """
    clustered_graph = {}
    for parent in graph.nodes():
        children_graph = nx.Graph()
        children = [child.lower() for child in graph.successors(parent)]
        for child in children:
            try:
                neighbours = own_model.most_similar(child, topn=100)
            except KeyError:  # If the word is not in vocabulary, check using the substring based method
                if parent in child.split('_'):
                    children_graph.add_node(child)
            else:
                for word, _ in neighbours:
                    if word.lower() in children:
                        children_graph.add_edge(child, word.lower())
        clustered_graph[parent] = children_graph
    return clustered_graph
In [14]:
# Children-similarity graphs for every node of the generated taxonomy
GC = create_children_clusters(own_w2v, G_taxo)
In [ ]:
# Quick visual sanity check of one children graph.
# NOTE(review): graphviz_layout is called without prog='dot' here, so the
# default layout engine is used (unlike display_taxonomy) — confirm intended.
posI = graphviz_layout(GC['engineering'])
# plt.figure(2, figsize=(20, 20))
nx.draw(GC['engineering'], posI, with_labels=True, arrows=True)
plt.show()
In [15]:
# Work on a copy so the original taxonomy graph stays untouched
G_improved = G_taxo.copy()
In [16]:
def calculate_similarity(poincare_model, own_model, parent, family, node, exclude_parent, exclude_family):
    """Score how well `node` fits the cluster `family` under `parent`.

    The score is the average of:
      (a) the maximum Poincare similarity between any WordNet sense of
          `parent` and any sense of `node`, and
      (b) the mean word2vec similarity between `node` and the members of
          `family`.
    `exclude_parent` / `exclude_family` switch off the respective component;
    a skipped (or failed) component contributes 0 to the average.
    """
    # Similarity between the parent and a cluster
    parent_similarity = 0
    if not exclude_parent:
        # Restrict to synsets whose name contains the surface term itself
        node_senses = [n_sense.name() for n_sense in wn.synsets(node) if node in n_sense.name()]
        parent_senses = [p_sense.name() for p_sense in wn.synsets(parent) if parent in p_sense.name()]
        for parent_sense in parent_senses:
            for node_sense in node_senses:
                try:
                    similarity = poincare_model.kv.similarity(parent_sense, node_sense)
                    # Keep the maximum similarity over all sense pairs
                    if similarity > parent_similarity:
                        parent_similarity = similarity
                except KeyError as e:
                    # If the parent sense itself is missing from the Poincare
                    # vocabulary, no pair with it can succeed -> move on to the
                    # next parent sense; otherwise only this node sense is
                    # missing -> try the next node sense.
                    if parent_sense in str(e):
                        break
                    else:
                        continue
    # Similarity between a family and a cluster
    family_similarity = 0
    if not exclude_family:
        family_similarities = []
        for f_item in family:
            try:
                family_similarities.append(own_model.similarity(f_item, node))
            except KeyError as e:  # skip the terms not in vocabulary
                # If `node` itself is out of vocabulary, every remaining pair
                # would fail too -> stop early; otherwise skip this member only.
                if node in str(e):
                    break
                else:
                    continue
        if len(family_similarities) > 0:
            family_similarity = sum(family_similarities) / len(family_similarities)
    # Final score is the average of both the similarities
    return (parent_similarity + family_similarity) / 2
In [17]:
# Attach each gold-standard node missing from the taxonomy to the parent whose
# children cluster it matches best (Chinese Whispers clustering + similarity score).
for node in new_nodes:
    max_score = 0
    max_score_node = ''
    for p_node, graph in GC.items():
        gc = chinese_whispers(graph, weighting='top', iterations=60)
        for label, family in aggregate_clusters(gc).items():
            score = calculate_similarity(poincare_w2v, own_w2v, p_node, family, node, False, False)
            if score > max_score:
                max_score = score
                max_score_node = p_node
    # BUG FIX: only attach when some cluster actually scored above zero —
    # previously an edge from the blank node '' was added for unmatched nodes.
    if max_score_node:
        G_improved.add_edge(max_score_node, node)
In [18]:
def tune_result(g_improved):
    """Filter the results i.e. remove all the isolated nodes and nodes with blank labels"""
    print('\nTuning the result...')
    if '' in g_improved.nodes():
        g_improved.remove_node('')
    hypernyms = {edge[0] for edge in g_improved.edges()}
    for orphan in list(nx.isolates(g_improved)):
        parts = orphan.split('_')
        # Try to re-attach the orphan under a known hypernym that matches
        # its head (last) or first compound part; otherwise drop it.
        if parts[-1] in hypernyms:
            g_improved.add_edge(parts[-1], orphan)
        elif parts[0] in hypernyms:
            g_improved.add_edge(parts[0], orphan)
        else:
            g_improved.remove_node(orphan)
    return g_improved
In [19]:
# Remove blank/isolated nodes (modifies G_improved in place)
tune_result(G_improved)
print('Tuned.')
In [ ]:
def save_result(result, path):
    """Save the improved taxonomy's edges as a tab-separated file.

    Args:
        result: graph whose .edges() yields (hypernym, hyponym) pairs.
        path: path of the original taxonomy file; its basename and extension
            are reused to build the output file name.

    Returns:
        The path of the written file.
    """
    print('\nSaving the result...')
    df_improved = pd.DataFrame(list(result.edges()), columns=['hypernym', 'hyponym'])
    # Reverse the column order to (hyponym, hypernym), matching the input format
    df_improved = df_improved[df_improved.columns.tolist()[::-1]]
    # Replace the underscores with blanks
    df_improved['hyponym'] = df_improved['hyponym'].apply(lambda x: x.replace('_', ' '))
    df_improved['hypernym'] = df_improved['hypernym'].apply(lambda x: x.replace('_', ' '))
    # Store the result
    output_dir = os.path.join('taxi_output', 'distributional_semantics')
    # BUG FIX: create the output directory if missing, otherwise to_csv fails
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(
        output_dir,
        os.path.basename(path) + '-' + 'new_ds' + os.path.splitext(path)[-1]
    )
    # The row index is written on purpose: it reproduces the id column of the input format
    df_improved.to_csv(output_path, sep='\t', header=False)
    print('Output saved at:', output_path)
    return output_path
In [ ]:
# Persist the improved taxonomy next to the TAXI outputs
output_path = save_result(G_improved, taxo_path)
In [ ]:
def visualize_clusters(graph):
    """ Clusterize the nodes of a particular domain in a given graph """
    graph_cluster = chinese_whispers(graph, weighting='top', iterations=60)
    # Visualize the clustering of graph_cluster using NetworkX (requires matplotlib)
    # BUG FIX: Graph.node was removed in networkx 2.4 — use Graph.nodes instead
    colors = [1. / graph_cluster.nodes[node]['label'] for node in graph_cluster.nodes()]
    fig = plt.gcf()
    fig.set_size_inches(20, 20)
    nx.draw_networkx(graph_cluster, cmap=plt.get_cmap('jet'), node_color=colors, font_color='black')
    plt.show()
In [ ]:
# Rebuild the children-similarity graphs on the improved taxonomy
GC_improved = create_children_clusters(own_w2v, G_improved)
In [ ]:
# Domain whose clusters are visualized in the following cells
domain = 'mechanical_engineering'
In [ ]:
# Original clusters (from the unmodified taxonomy)
visualize_clusters(GC[domain])
In [ ]:
# Clusters after detaching
# NOTE(review): `GC_detached` is never defined in this notebook — this cell
# raises NameError on a fresh run; the detaching step appears to be missing.
visualize_clusters(GC_detached[domain])
In [ ]:
# Clusters after detaching and re-attaching the clusters (improved taxonomy)
visualize_clusters(GC_improved[domain])
In [ ]:
# View the original taxonomy
# NOTE(review): `G` is never defined in this notebook (graphs are G_taxo /
# G_gold / G_improved) — this raises NameError; `G_taxo` was probably intended.
display_taxonomy(G)
In [ ]:
# View the modified (improved) taxonomy
display_taxonomy(G_improved)
In [ ]:
# NOTE(review): `G` is undefined here (see above) — probably meant G_taxo
len(list(G.nodes()))
In [ ]:
# Node count after improvement, for comparison with the original taxonomy
len(list(G_improved.nodes()))
In [ ]: