Flags and settings.
In [1]:
SAVE_FIGURES = False
Imports and database setup.
In [2]:
from itertools import product
import pandas as pd
import seaborn as sb
import numpy as np
import networkx as nx
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic as wn_ic
%matplotlib inline
import matplotlib.pyplot as plt
from progressbar import ProgressBar
%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.mine import Model, Time, Source, Past, Durl
from brainscopypaste.db import Substitution
from brainscopypaste.utils import init_db, session_scope, memoized
from brainscopypaste.load import FAFeatureLoader
engine = init_db()
Build our data.
In [3]:
model = Model(Time.discrete, Source.majority, Past.last_bin, Durl.all, 1)
data = []
# Collect the ids of all substitutions mined with this model.
with session_scope() as session:
    substitutions = session.query(Substitution.id)\
        .filter(Substitution.model == model)
    print("Got {} substitutions for model {}"
          .format(substitutions.count(), model))
    substitution_ids = [id for (id,) in substitutions]

# Load each substitution in its own short-lived session
# and extract the fields we need.
for substitution_id in ProgressBar(term_width=80)(substitution_ids):
    with session_scope() as session:
        substitution = session.query(Substitution).get(substitution_id)
        source_token, destination_token = substitution.tokens
        source_lemma, destination_lemma = substitution.lemmas
        source_pos, destination_pos = substitution.tags
        data.append({'cluster_id': substitution.source.cluster.sid,
                     'destination_id': substitution.destination.sid,
                     'occurrence': substitution.occurrence,
                     'source_id': substitution.source.sid,
                     'source_token': source_token,
                     'destination_token': destination_token,
                     'source_pos': source_pos,
                     'destination_pos': destination_pos,
                     'source_lemma': source_lemma,
                     'destination_lemma': destination_lemma})
original_subs = pd.DataFrame(data)
del data
Assign proper weight to each substitution.
In [4]:
distances = original_subs.copy()
divide_weight_sum = lambda x: x / distances.loc[x.index].weight.sum()
# Weight is 1, at first.
distances['weight'] = 1
# Divided by the number of substitutions that share a durl.
distances['weight'] = distances\
.groupby(['destination_id', 'occurrence'])['weight']\
.transform(divide_weight_sum)
# Divided by the total weight of substitutions that share a cluster.
# (Using divide_weight_sum, which divides by the sum of weights rather
# than by the raw count, ensures that each group of substitutions
# sharing the same durl counts only once.)
distances['weight'] = distances\
.groupby('cluster_id')['weight']\
.transform(divide_weight_sum)
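For intuition, here is a minimal sketch of this two-stage normalisation on invented data (the column names mirror the real DataFrame, but the three rows and their ids are made up, using the pandas imported above): two substitutions sharing a durl first split a weight of 1 between them, and each cluster then ends up with a total weight of 1.
toy_subs = pd.DataFrame({'cluster_id': ['a', 'a', 'a'],
                         'destination_id': [1, 1, 2],
                         'occurrence': [0, 0, 0]})
toy_subs['weight'] = 1
toy_divide = lambda x: x / toy_subs.loc[x.index].weight.sum()
# Substitutions sharing a durl split a weight of 1: [0.5, 0.5, 1].
toy_subs['weight'] = toy_subs\
    .groupby(['destination_id', 'occurrence'])['weight']\
    .transform(toy_divide)
# The cluster then sums to 1, counting each durl once: [0.25, 0.25, 0.5].
toy_subs['weight'] = toy_subs\
    .groupby('cluster_id')['weight']\
    .transform(toy_divide)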
Get the undirected Free Association (FA) norms graph and invert its weights to use them as costs.
In [5]:
fa_loader = FAFeatureLoader()
avg_weight = np.mean([weight for _, _, weight
in fa_loader._undirected_norms_graph
.edges_iter(data='weight')])
fa_graph = nx.Graph()
fa_graph.add_weighted_edges_from(
[(w1, w2, avg_weight / weight) for w1, w2, weight
in fa_loader._undirected_norms_graph.edges_iter(data='weight')]
)
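As a sanity check on this inversion, here is a small sketch on a hypothetical three-edge graph (words and weights invented): an edge whose weight equals the average gets a cost of exactly 1, stronger associations become cheaper to traverse, weaker ones more expensive.
toy_norms = [('cat', 'dog', .3), ('dog', 'bone', .2), ('bone', 'fish', .1)]
toy_avg = np.mean([w for _, _, w in toy_norms])  # 0.2
toy_graph = nx.Graph()
toy_graph.add_weighted_edges_from([(w1, w2, toy_avg / w) for w1, w2, w
                                   in toy_norms])
# Costs: cat-dog 0.67, dog-bone 1.0, bone-fish 2.0.
print(nx.shortest_path_length(toy_graph, 'cat', 'fish', 'weight'))  # ~3.67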
Compute distances on the FA network, using lemmas if tokens are not found.
In [6]:
fa_distances = distances.copy()
fa_distances['weighted_distance'] = np.nan
fa_distances['distance'] = np.nan
@memoized
def fa_shortest_path(source, destination, weighted):
    return nx.shortest_path_length(fa_graph, source, destination,
                                   'weight' if weighted else None)


for i in ProgressBar(term_width=80)(fa_distances.index):
    # Use source token, or lemma if not found,
    # or skip this substitution if not found
    if fa_graph.has_node(fa_distances.loc[i].source_token):
        source_word = fa_distances.loc[i].source_token
    elif fa_graph.has_node(fa_distances.loc[i].source_lemma):
        source_word = fa_distances.loc[i].source_lemma
    else:
        continue

    # Use destination token, or lemma if not found,
    # or skip this substitution if not found
    if fa_graph.has_node(fa_distances.loc[i].destination_token):
        destination_word = fa_distances.loc[i].destination_token
    elif fa_graph.has_node(fa_distances.loc[i].destination_lemma):
        destination_word = fa_distances.loc[i].destination_lemma
    else:
        continue

    fa_distances.loc[i, 'weighted_distance'] = \
        fa_shortest_path(source_word, destination_word, weighted=True)
    fa_distances.loc[i, 'distance'] = \
        fa_shortest_path(source_word, destination_word, weighted=False)
Plot them.
In [7]:
def plot_metric(data, name, bin_count):
    distances = data[name]
    if bin_count <= 0:
        # A non-positive bin_count means one unit-width bin
        # per integer value of the metric.
        bin_count = int(distances.max() - distances.min() + 1)
        bins = np.arange(distances.min(), distances.max() + 2) - .5
        d_bins = pd.cut(distances, bins, right=False, labels=False)
    else:
        d_bins, bins = pd.cut(distances, bin_count, right=False,
                              labels=False, retbins=True)
    middles = (bins[:-1] + bins[1:]) / 2
    width = middles[1] - middles[0]

    # Compute bin values: each bar is the sum of substitution
    # weights falling in the bin, not a raw count.
    heights = np.zeros(bin_count)
    for i in range(bin_count):
        heights[i] = data[d_bins == i].weight.sum()

    # Plot.
    plt.bar(middles - width / 2, heights, width=width)
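To check the weighted-histogram logic, here is a quick sketch on invented values: two distances fall in the same bin, and the bar height for that bin is the sum of their weights (0.75), not their count.
toy_data = pd.DataFrame({'distance': [1.0, 1.2, 3.0],
                         'weight': [.5, .25, 1.]})
toy_bins, _ = pd.cut(toy_data.distance, 2, right=False,
                     labels=False, retbins=True)
print([toy_data[toy_bins == i].weight.sum() for i in range(2)])  # [0.75, 1.0]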
In [8]:
plot_metric(fa_distances, 'distance', -1)
In [9]:
plot_metric(fa_distances, 'weighted_distance', 20)
In [10]:
print('{} substitutions are not taken into account in the graphs above'
' (because they involve words unknown to FA).'
.format(len(fa_distances[np.isnan(fa_distances.distance)])))
The first graph shows that most substitutions do not involve immediate neighbours in the FA graph: the substituted word is rather about 3 hops away on average.
The second graph uses weighted distances, with weights (costs) scaled so that a link of average weight has a cost of 1. A distance of 2 therefore means travelling the equivalent of 2 average links (be it one link twice as expensive, or 4 links each half as expensive). Here too, most substitutions sit at a distance of about 2, meaning not immediate neighbours but rather the words just beyond them.
In [11]:
infocontent = wn_ic.ic('ic-brown.dat')
wordnet_poses = {'a', 'n', 'r', 's', 'v'}
def build_wordnet_metric(data, name, tipe, ic_based,
                         pos_based=False, distance_kws={}):
    if tipe not in ['distance', 'similarity']:
        raise ValueError
    pos_based = ic_based or pos_based
    wn_metrics = data.copy()
    wn_metrics['metric'] = np.nan
    skipped_no_synsets = 0
    skipped_unknown_pos = 0
    skipped_incongruent_pos = 0
    skipped_noic_pos = 0

    for i in ProgressBar(term_width=80)(data.index):
        if pos_based:
            # The measure needs a POS: skip substitutions whose POS
            # WordNet doesn't know, whose source and destination POS
            # differ, or (for IC-based measures) whose POS has no
            # information content data.
            pos = data.loc[i].source_pos[0].lower()
            if pos not in wordnet_poses:
                skipped_unknown_pos += 1
                continue
            if data.loc[i].destination_pos[0].lower() != pos:
                skipped_incongruent_pos += 1
                continue
            if ic_based and (pos not in infocontent.keys()):
                skipped_noic_pos += 1
                continue
        else:
            pos = None

        # Use source token, or lemma if not found
        source_synsets = wn.synsets(wn_metrics.loc[i].source_token, pos=pos)
        if len(source_synsets) == 0:
            source_synsets = wn.synsets(wn_metrics.loc[i].source_lemma,
                                        pos=pos)
        # Use destination token, or lemma if not found
        destination_synsets = wn.synsets(wn_metrics.loc[i].destination_token,
                                         pos=pos)
        if len(destination_synsets) == 0:
            destination_synsets = \
                wn.synsets(wn_metrics.loc[i].destination_lemma, pos=pos)
        # Skip this substitution if no corresponding synsets were found
        if len(source_synsets) == 0 or len(destination_synsets) == 0:
            skipped_no_synsets += 1
            continue

        def get_distance(s1, s2):
            distance_func = getattr(s1, name)
            if ic_based:
                return distance_func(s2, infocontent, **distance_kws)
            else:
                return distance_func(s2, **distance_kws)

        distances = [get_distance(s1, s2)
                     for s1, s2 in product(source_synsets,
                                           destination_synsets)
                     if get_distance(s1, s2) is not None]
        if len(distances) != 0:
            if tipe == 'distance':
                wn_metrics.loc[i, 'metric'] = np.min(distances)
            else:
                wn_metrics.loc[i, 'metric'] = np.max(distances)
        else:
            wn_metrics.loc[i, 'metric'] = np.nan

    if pos_based:
        print('Skipped {} substitutions because their '
              'source pos was not a WordNet pos'
              .format(skipped_unknown_pos))
        print('Skipped {} substitutions because their '
              'source and destination pos were different'
              .format(skipped_incongruent_pos))
        if ic_based:
            print('Skipped {} substitutions because their '
                  'source pos was not in {}'
                  .format(skipped_noic_pos, infocontent.keys()))
    print('Skipped {} substitutions because no synsets were found'
          .format(skipped_no_synsets))
    return wn_metrics
WordNet defines all sorts of distances and similarities between synsets. We try all of them to see what they look like.
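For reference, a quick example of what a few of these measures return on a single hand-picked pair of synsets (using the wn and infocontent objects loaded above; exact values depend on the WordNet and information-content versions):
dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(dog.path_similarity(cat))              # bounded in (0, 1]
print(dog.wup_similarity(cat))               # bounded in (0, 1]
print(dog.lch_similarity(cat))               # not bounded by 1
print(dog.res_similarity(cat, infocontent))  # IC-based, not bounded by 1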
In [12]:
jcn_similarities = build_wordnet_metric(distances, 'jcn_similarity',
'similarity', True)
In [13]:
plot_metric(jcn_similarities.loc[jcn_similarities.metric <= 1],
'metric', 20)
In [14]:
lin_similarities = build_wordnet_metric(distances, 'lin_similarity',
'similarity', True)
In [15]:
plot_metric(lin_similarities, 'metric', 20)
In [16]:
res_similarities = build_wordnet_metric(distances, 'res_similarity',
'similarity', True)
In [17]:
plot_metric(res_similarities.loc[res_similarities.metric <= 100],
'metric', 20)
In [18]:
lch_similarities = build_wordnet_metric(distances, 'lch_similarity',
'similarity', False, pos_based=True)
In [19]:
plot_metric(lch_similarities, 'metric', 20)
In [20]:
path_similarities = build_wordnet_metric(distances, 'path_similarity',
'similarity', False)
In [21]:
plot_metric(path_similarities, 'metric', 20)
In [22]:
shortest_path_distances = build_wordnet_metric(
distances, 'shortest_path_distance', 'distance', False, pos_based=True,
distance_kws={'simulate_root': True})
In [23]:
plot_metric(shortest_path_distances, 'metric', 20)
In [24]:
wup_similarities = build_wordnet_metric(distances, 'wup_similarity',
'similarity', False)
In [25]:
plot_metric(wup_similarities, 'metric', 20)
Not all of these WordNet distances and similarities are easy to interpret. The Lin (3.2), Path (3.5), and Wu-Palmer (3.7) similarities are between 0 and 1, so they give a sense of how similar or dissimilar two words are. The Shortest Path Distance (3.6) is the distance in the hypernym/hyponym graph, so it is also interpretable. The Jiang-Conrath (3.1), Resnik (3.3), and Leacock-Chodorow (3.4) similarities don't seem to be bounded, so they are more difficult to interpret.
One more problem: most of these measures only accept words with the exact same POS, which filters out about 1,000 of the ~6,000 substitutions.
Still, there does seem to be a group of substitutions where the two words are very similar, together with a thick tail of substitutions where the words are increasingly different (low similarities, high distances).