In [131]:
%cd ~/NetBeansProjects/ExpLosion/
%load_ext autoreload
from notebooks.common_imports import *
from gui.output_utils import *
# swap the project's bootstrap implementation in for seaborn's CI resampler
sns.timeseries.algo.bootstrap = my_bootstrap
sns.categorical.bootstrap = my_bootstrap
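Here `my_bootstrap` comes from `gui.output_utils`; the two assignments monkey-patch it over the `bootstrap` name that seaborn's timeseries and categorical modules resolve at call time, so every confidence interval drawn below is computed by the project's own resampler. A minimal sketch of the shape such a replacement has to have, assuming it must stay call-compatible with seaborn's `algorithms.bootstrap` (illustrative only, not the project's actual code):

import numpy as np

def my_bootstrap_sketch(*args, n_boot=1000, func=np.mean, random_seed=None, **kwargs):
    # draw n_boot resamples with replacement and apply the statistic to each
    rs = np.random.RandomState(random_seed)
    n = len(args[0])
    return np.array([func(*(np.asarray(a)[rs.randint(0, n, n)] for a in args))
                     for _ in range(n_boot)])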
In [2]:
s = {'document_features_ev': 'SVO',
     'document_features_tr': 'J+N+V+SVO',
     'expansions__allow_overlap': 0,
     'expansions__decode_handler': 'SignifiedOnlyFeatureHandler',
     'expansions__entries_of_id': None,
     'expansions__k': 3,
     'expansions__neighbour_strategy': 'linear',
     'expansions__noise': 0.0,
     'expansions__use_random_neighbours': 0,
     'expansions__use_similarity': 0,
     'expansions__vectors__algorithm__in': ['word2vec'],
     'expansions__vectors__composer__in': ['Verb', 'CopyObj', 'CopySubj', 'Add', 'Mult', 'FAdd', 'FMult'],
     'expansions__vectors__dimensionality': 100,
     'expansions__vectors__rep': 0,
     'expansions__vectors__unlabelled_percentage__in': [15, 100],
     'labelled': 'amazon_grouped-tagged'}
ids = Experiment.objects.filter(**s).values_list('id', flat=True)
print(ids, 'total', len(ids))
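The next cell passes these ids to `dataframe_from_exp_ids`, a project helper. A hypothetical sketch of what it has to produce, assuming each experiment stores a list of per-fold accuracy scores and each requested Django-style lookup becomes a column (the attribute names below are assumptions, not the real implementation):

def dataframe_from_exp_ids_sketch(ids, fields):
    rows = []
    for exp in Experiment.objects.filter(id__in=ids):
        for score in exp.accuracy_scores:  # assumed attribute
            row = {'Accuracy': score}
            for column, lookup in fields.items():
                obj = exp
                for part in lookup.split('__'):
                    obj = getattr(obj, part)  # walk the Django relation
                row[column] = obj
            rows.append(row)
    return pd.DataFrame(rows)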
In [3]:
fields = {'unlab': 'expansions__vectors__unlabelled',
          'algo': 'expansions__vectors__algorithm',
          'Composer': 'expansions__vectors__composer',
          'percent': 'expansions__vectors__unlabelled_percentage'}
df = dataframe_from_exp_ids(ids, fields)
# label each run with its unlabelled corpus and the percentage of it used
df['corpus'] = ['%s-%s' % (a, b) for a, b in zip(df.unlab, df.percent)]
df.drop(['unlab', 'percent'], axis=1, inplace=True)
with sns.color_palette("cubehelix", 4):
    g = sns.factorplot(y='Accuracy', x='Composer', hue='corpus',  # col='unlab',
                       hue_order=sort_df_by(df, 'corpus'), aspect=2,
                       data=df, kind='bar', ci=68)
for ax in g.axes.flat:
    # draw the random-vector baseline on every facet
    ax.axhline(random_vect_baseline(), c='k')
    # ax.axhline(nondistributional_baseline(**settings_of(ids[0])), c='k')
plt.savefig('plot-vps-corpus-size.pdf', format='pdf', dpi=300, bbox_inches='tight', pad_inches=0.1)
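`sort_df_by` and `random_vect_baseline` are project helpers: from the way they are used, the former must return the `corpus` categories in some fixed order and the latter the accuracy of a classifier given random neighbour vectors. A stand-in for `sort_df_by` consistent with the call above (one plausible ordering rule, not the project's actual one):

def sort_df_by_sketch(df, col):
    # order hue categories by their mean Accuracy, lowest first
    return list(df.groupby(col)['Accuracy'].mean().sort_values().index)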
In [4]:
# now compare composition algorithms on the full wiki corpus only
s['expansions__vectors__algorithm__in'] = ['word2vec', 'glove', 'count_windows']
s['expansions__vectors__composer__in'] = ['Verb', 'CopyObj', 'Add', 'Mult', 'FAdd', 'FMult']
s['expansions__vectors__unlabelled'] = 'wiki'
s['expansions__vectors__unlabelled_percentage__in'] = [100]
ids = Experiment.objects.filter(**s).values_list('id', flat=True)
print(ids, 'total', len(ids))
fields = {'algo': 'expansions__vectors__algorithm',
          'Composer': 'expansions__vectors__composer'}
df = dataframe_from_exp_ids(ids, fields, abbreviate=False)
with sns.color_palette("cubehelix", 4):
    g = sns.factorplot(y='Accuracy', x='Composer', hue='algo',
                       data=df, kind='bar', aspect=2, ci=68)
plt.axhline(random_vect_baseline(), c='k')
plt.savefig('plot-vps-algorithm.pdf', format='pdf', dpi=300, bbox_inches='tight', pad_inches=0.1)
In [39]:
import re
from collections import defaultdict
from scipy.stats import mode
start_pattern = re.compile(r'Found a DF of shape .*? in HDF file (.*)')
count_pattern = re.compile(r'Training matrix for (.*?) from (\d+) SVOs')
d = defaultdict(list)
# log file produced by running categorical_composers.py
with open('../FeatureExtractionToolkit/categorical/categorical.o6208386.txt') as infile:
    curr_vectors = 'WTF'  # sentinel until the first HDF-file line is seen
    for line in infile:
        line = line.strip()
        match = start_pattern.search(line)
        if match:
            curr_vectors = match.groups()[0]
        match = count_pattern.search(line)
        if match:
            count = int(match.groups()[1])
            if count > 500 and 'glove' in curr_vectors:
                print(match.groups())
            d[curr_vectors].append(count)
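The two patterns can be sanity-checked against made-up log lines (illustrative strings, not copied from the real log):

sample_df_line = 'Found a DF of shape (1000, 100) in HDF file wiki-glove-100'
sample_count_line = 'Training matrix for sell/V from 1532 SVOs'
assert start_pattern.search(sample_df_line).groups() == ('wiki-glove-100',)
assert count_pattern.search(sample_count_line).groups() == ('sell/V', '1532')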
In [35]:
for key, val in d.items():
    print(key)
    print(pd.Series(val).describe())
    print(np.mean(val), np.percentile(val, 2.5), np.percentile(val, 97.5), np.sum(val))
    print('median', np.median(val), 'mode', mode(val)[0][0])
In [132]:
from discoutils.thesaurus_loader import Vectors as vv
import random
# v1: wiki word2vec-100 vectors composed with the categorical CopyObj composer
v1 = vv.from_tsv('../FeatureExtractionToolkit/categorical/AN_NN_wiki-w2v-100_CopyObj.events.filtered.strings', allow_lexical_overlap=False)
# v2: the same unigram vectors composed by simple addition
v2 = vv.from_tsv('../FeatureExtractionToolkit/word2vec_vectors/composed/AN_NN_word2vec-wiki_100percent-rep0_Add.events.filtered.strings', allow_lexical_overlap=False)
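`allow_lexical_overlap=False` makes the loader discard neighbours that share a lemma with the query entry, so a composed phrase is not trivially "similar" to its own constituents. A sketch of what that filter amounts to on the `word/POS_word/POS` entry format used here (an assumption about the exact rule, not the library's code):

def overlaps_sketch(query, neighbour):
    # split 'brother/N_sell/V_drug/N' into its lemmas and test for overlap
    lemmas = lambda entry: {t.split('/')[0] for t in entry.split('_')}
    return bool(lemmas(query) & lemmas(neighbour))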
In [138]:
from discoutils.tokens import DocumentFeature
vocab1 = []
for x in v1.keys():
    doc_feat = DocumentFeature.from_string(x)
    # keep SVO phrases and single nouns/adjectives/verbs
    if doc_feat.type == 'SVO' or (doc_feat.type == '1-GRAM' and doc_feat.tokens[0].pos in 'NJV'):
        vocab1.append(x)
v1.init_sims(n_neighbors=75, vocab=vocab1)
v2.init_sims(n_neighbors=75, vocab=vocab1)
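`init_sims` precomputes the nearest-neighbour structures over the restricted vocabulary; conceptually it fits a k-NN index over the selected rows, along these lines (a scikit-learn sketch of the idea, not discoutils' actual implementation):

from sklearn.neighbors import NearestNeighbors

def knn_index_sketch(matrix, vocab, k=75):
    # matrix: ndarray with one row per vocab entry (hypothetical stand-ins)
    nn = NearestNeighbors(n_neighbors=k + 1).fit(matrix)
    dist, idx = nn.kneighbors(matrix)
    # drop each entry's zero-distance match with itself
    return {vocab[i]: [(vocab[j], d) for j, d in zip(idx[i][1:], dist[i][1:])]
            for i in range(len(vocab))}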
In [139]:
random.seed(0)
# sample 15 multi-word (composed) entries
words = random.sample([x for x in v1.keys() if x.count('_')], 15)
df = compare_neighbours([v1, v2], ['copyobj', 'add'], words)
df
Out[139]:
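`compare_neighbours` is another project helper; judging by its use, it must line up each query word's top neighbours under the two vector sets. A hypothetical stand-in:

def compare_neighbours_sketch(vector_sets, names, words, k=1):
    # one row per query word, one column per vector set
    data = {name: [v.get_nearest_neighbours(w)[:k] for w in words]
            for name, v in zip(names, vector_sets)}
    return pd.DataFrame(data, index=words)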
In [141]:
v1.get_nearest_neighbours('contain/V')[:5]
Out[141]:
In [142]:
v1.get_nearest_neighbours('receive/V')[:5]
Out[142]:
In [143]:
v2.get_nearest_neighbours('contain/V')[:5]
Out[143]:
In [144]:
pd.set_option('display.max_colwidth', 85)
print(pd.DataFrame(df.copyobj).to_latex())
In [124]:
print(pd.DataFrame(df['add']).to_latex())
In [125]:
# compare vector norms: first 200 rows (unigrams) vs last 200 rows (verb phrases),
# assuming the matrix is ordered with unigrams first and composed phrases last
some_unigrams = v1.matrix[:200, :]
some_vps = v1.matrix[-200:, :]
uni_norms = np.linalg.norm(some_unigrams, axis=1)
vp_norms = np.linalg.norm(some_vps, axis=1)
In [126]:
plt.hist(uni_norms, bins=30, color='g', alpha=0.5);
plt.figure()
plt.hist(vp_norms, bins=30, color='r', alpha=0.5);
In [127]:
from sklearn import preprocessing
from discoutils.thesaurus_loader import DenseVectors
# unit-normalise each row so unigram and phrase vectors have comparable norms
x_scaled = preprocessing.normalize(v1.df.values)
scaled_df = pd.DataFrame(x_scaled, index=v1.df.index, columns=v1.df.columns)
v3 = DenseVectors(scaled_df, allow_lexical_overlap=True)
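The point of unit-normalising: for unit vectors, squared Euclidean distance is a monotone function of cosine similarity (||a - b||^2 = 2 - 2*cos(a, b)), so a Euclidean nearest-neighbour index over the scaled matrix returns cosine neighbours, and the norm gap between unigrams and phrases seen in the histograms above stops distorting the rankings. A quick numerical check:

rng = np.random.RandomState(0)
a, b = rng.rand(100), rng.rand(100)
a /= np.linalg.norm(a)
b /= np.linalg.norm(b)
assert np.isclose(np.linalg.norm(a - b) ** 2, 2 - 2 * np.dot(a, b))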
In [128]:
v3.init_sims(n_neighbors=5, vocab=vocab1)
In [129]:
v3.get_nearest_neighbours('squibb/N_sell/V_drug/N')
Out[129]:
In [130]:
v3.get_nearest_neighbours('sell/V')
Out[130]: