In [1]:
%cd ~/NetBeansProjects/ExpLosion/
from notebooks.common_imports import *
from gui.user_code import get_demsar_diagram, get_demsar_params
from gui.output_utils import *
# monkey-patch seaborn so both timeseries and categorical plots use our bootstrap
sns.timeseries.algo.bootstrap = my_bootstrap
sns.categorical.bootstrap = my_bootstrap
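`my_bootstrap` is defined in `notebooks.common_imports` and not shown here. Purely as an illustration, a hypothetical stand-in matching seaborn's `algorithms.bootstrap` keyword interface (`n_boot`, `func`, `random_seed`) might look like this; it ignores `units` and smoothing, and it is not the project's actual implementation:

In [ ]:
import numpy as np

def my_bootstrap_sketch(*args, **kwargs):
    """Hypothetical drop-in for seaborn.algorithms.bootstrap:
    resample rows with replacement and apply the aggregation function."""
    n_boot = kwargs.get('n_boot', 1000)
    func = kwargs.get('func', np.mean)
    rs = np.random.RandomState(kwargs.get('random_seed'))
    n = len(args[0])
    boots = []
    for _ in range(n_boot):
        idx = rs.randint(0, n, n)  # sample indices with replacement
        boots.append(func(*(np.asarray(a)[idx] for a in args)))
    return np.array(boots)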
In [2]:
# modified from http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(gold, pred, title='Confusion matrix', cmap=plt.cm.Blues):
    cm = confusion_matrix(gold, pred)
    # normalise each row to sum to 1, i.e. per-true-class proportions
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    target_names = unique_labels(gold, pred)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
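A quick sanity check of the function on toy labels (illustrative only, not data from the experiments below):

In [ ]:
plot_confusion_matrix(['pos', 'pos', 'neg', 'neg'],
                      ['pos', 'neg', 'neg', 'neg'],
                      title='Toy example')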
In [77]:
query_dict = {'expansions__vectors__rep': 0,
              'expansions__k': 3,
              'labelled': 'amazon_grouped-tagged',
              'expansions__use_similarity': 0,
              'expansions__neighbour_strategy': 'linear',
              'expansions__vectors__dimensionality': 100,
              'document_features_ev': 'AN+NN',
              'document_features_tr': 'J+N+AN+NN',
              'expansions__allow_overlap': False,
              'expansions__entries_of': None,
              'expansions__vectors__algorithm': 'glove',
              'expansions__vectors__composer__in': ['Left'],
              'expansions__vectors__unlabelled': 'wiki',
              'expansions__vectors__unlabelled_percentage': 100,
              'expansions__decode_handler': 'SignifiedOnlyFeatureHandler',
              'expansions__noise': 0}
ids = Experiment.objects.filter(**query_dict).order_by('expansions__vectors__unlabelled_percentage',
                                                       'expansions__vectors__composer').values_list('id', flat=True)
In [78]:
ids
Out[78]:
In [79]:
get_ci(ids[0])[:-1]
Out[79]:
In [80]:
results = Results.objects.get(id=ids[0], classifier='MultinomialNB')
pred = results.predictions
gold = results.gold
In [81]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.multiclass import unique_labels
print(classification_report(gold, pred))
In [95]:
sns.set_style('white')
plot_confusion_matrix(gold, pred)
In [2]:
path = '../FeatureExtractionToolkit/word2vec_vectors/composed/AN_NN_word2vec-wiki_100percent-rep0_Add.events.filtered.strings.kmeans2000'
df = pd.read_hdf(path, key='clusters')
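The clusters file is only read here. A hypothetical sketch of how such a cluster-assignment file could be produced (toy phrases and random vectors; the real file clusters composed phrase vectors with k=2000):

In [ ]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

toy_phrases = ['good/J_movie/N', 'bad/J_guy/N', 'film/N', 'actor/N']
toy_vectors = np.random.rand(len(toy_phrases), 100)
km = KMeans(n_clusters=2, random_state=0).fit(toy_vectors)  # k=2000 in the real file
# one-column DataFrame indexed by phrase, as read back above
pd.DataFrame({'clusters': km.labels_}, index=toy_phrases).to_hdf('toy.h5', key='clusters', mode='w')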
In [3]:
counts = df.clusters.value_counts()
g = sns.distplot(counts.values, kde_kws={'cut': True})
g.set(xlim=(0, None))  # cluster sizes cannot be negative
plt.title('Distribution of cluster sizes, k=2000');
In [4]:
counts.describe()
Out[4]:
In [5]:
df[df.clusters == counts.idxmin()].head(20)  # a sample of the smallest cluster
Out[5]:
In [6]:
df[df.clusters == 5]
# clusters 5 (negative sentiment), 2 (royalty) and 8 (cheap, expensive) are very sensible
# clusters 3 ('arm'), 1 ('product'), 15 (hot) and 16 (playing) are dominated by a single word,
# which may span multiple senses (e.g. hot water, hot waiter)
# cluster 6 (grand slam, grand prix, door slam) is dominated by a few words, with a polysemous word bridging senses
# cluster 10 - film characters + misc
# cluster 11 - sentiment, a mix of positive and negative
# cluster 13 - named entities
# cluster 14 - arch, tower, veranda + related words + other senses (arch enemy)
Out[6]:
In [22]:
from collections import Counter
# most frequent first words (modifiers) among the phrases in cluster 5
Counter([str(x).split('_')[0] for x in df[df.clusters == 5].index]).most_common(10)
Out[22]:
In [8]:
from discoutils.thesaurus_loader import Vectors as vv
# not quite the same vectors (15% vs 100%), but that's all I've got on this machine
v = vv.from_tsv('../FeatureExtractionToolkit/word2vec_vectors/composed/AN_NN_word2vec-wiki_15percent-rep0_Add.events.filtered.strings')
In [11]:
v.init_sims(n_neighbors=30)
In [24]:
v.get_nearest_neighbours('bad/J')[:5]
Out[24]:
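`init_sims` and `get_nearest_neighbours` come from `discoutils`. A rough sketch of the underlying idea, assuming unit-normalised vectors and sklearn's `NearestNeighbors` (an illustration, not the library's actual implementation):

In [ ]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

words = ['good/J', 'bad/J', 'film/N']  # toy vocabulary
matrix = np.random.rand(len(words), 100)
matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)  # unit length: cosine is monotone in euclidean

nn = NearestNeighbors(n_neighbors=2).fit(matrix)  # cf. v.init_sims(n_neighbors=30)
dist, idx = nn.kneighbors(matrix[words.index('bad/J')].reshape(1, -1))
# convert euclidean distance back to cosine similarity; the closest hit is the word itself
[(words[i], 1 - d ** 2 / 2) for i, d in zip(idx[0], dist[0])]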
In [34]:
cluster_num = df.loc['good/J_guy/N', 'clusters']
print(cluster_num)
df[df.clusters == cluster_num]
Out[34]:
In [29]:
# most frequent first words (modifiers) in this cluster
Counter([str(x).split('_')[0] for x in df[df.clusters == cluster_num].index]).most_common(10)
Out[29]:
In [47]:
cluster_num = df.loc['good/J_movie/N', 'clusters']
print(cluster_num)
df[df.clusters == cluster_num]
Out[47]:
In [46]:
# most frequent second words (heads) in this cluster
Counter([str(x).split('_')[1] for x in df[df.clusters == cluster_num].index]).most_common(10)
Out[46]:
In [48]:
df[df.clusters == counts.idxmax()]  # the largest cluster: these appear to be names, 99% unigrams
Out[48]:
In [49]:
path = '../FeatureExtractionToolkit/socher_vectors/composed/AN_NN_turian_Socher.events.filtered.strings.kmeans2000'
ddf = pd.read_hdf(path, key='clusters')
In [57]:
cluster_num = ddf.loc['bad/J_guy/N', 'clusters']
print(cluster_num)
ddf[ddf.clusters == cluster_num]
Out[57]:
In [60]:
# most frequent second words (heads) in this cluster
Counter([str(x).split('_')[1] for x in ddf[ddf.clusters == cluster_num].index]).most_common(10)
Out[60]:
In [92]:
gaps = []
for r in Results.objects.filter(classifier=CLASSIFIER):
    gap = r.accuracy_mean - r.macrof1_mean
    if abs(gap) > 0.1:
        # flag experiments where accuracy and macro-F1 disagree substantially
        print(r.id.id)
    gaps.append(gap)
plt.hist(gaps);
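The gap between accuracy and macro-averaged F1 is largest under class imbalance: accuracy rewards the majority class, while macro-F1 weights all classes equally. A toy illustration with sklearn (synthetic labels, not the experiment data):

In [ ]:
from sklearn.metrics import accuracy_score, f1_score

# 90 majority-class items, 10 minority; predict the majority class every time
gold_toy = ['a'] * 90 + ['b'] * 10
pred_toy = ['a'] * 100
accuracy_score(gold_toy, pred_toy), f1_score(gold_toy, pred_toy, average='macro')
# -> 0.9 accuracy, but macro-F1 is only ~0.47 because class 'b' scores 0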
In [9]:
from discoutils.thesaurus_loader import Vectors
from discoutils.tokens import DocumentFeature
v = Vectors.from_tsv('../FeatureExtractionToolkit/word2vec_vectors/word2vec-wiki-15perc.unigr.strings.rep0')
In [66]:
from random import sample
sampled_words = sample(list(v.keys()), 5000)
v.init_sims(n_neighbors=100)
In [67]:
data = []
for w in sampled_words:
    doc_feat = DocumentFeature.from_string(w)
    if doc_feat.tokens[0].pos == 'N' and np.random.uniform() < 0.8:
        # too many nouns, randomly drop 80% of them
        continue
    neigh = v.get_nearest_neighbours(w)
    for rank, (n, sim) in enumerate(neigh):
        # POS of the query word followed by POS of its neighbour, e.g. 'JN'
        pospos = doc_feat.tokens[0].pos + DocumentFeature.from_string(n).tokens[0].pos
        data.append([pospos, sim, rank])
In [68]:
df = pd.DataFrame(data, columns='pospos sim rank'.split())
mask = df.pospos.str.len() == 2  # keep only well-formed single-character POS pairs
df = df[mask]
df.pospos.value_counts(), df.shape
Out[68]:
In [69]:
g = sns.FacetGrid(df, col='pospos', col_wrap=3);
g.map(plt.hist, 'sim');
In [70]:
g = sns.FacetGrid(df, col='pospos', col_wrap=3);
g.map(plt.hist, 'rank');