In [1]:
%pylab inline
# pylab provides the bare plotting calls (hist, bar, plot, subplot, ...) used below
import os
import numpy as np
import scipy.stats
from sklearn.metrics import roc_auc_score, average_precision_score
import pmf
In [2]:
# load the train/test track IDs and the tag vocabulary
train_tracks = list()
with open('train_tracks.txt', 'rb') as f:
    for line in f:
        train_tracks.append(line.strip())

test_tracks = list()
with open('test_tracks.txt', 'rb') as f:
    for line in f:
        test_tracks.append(line.strip())

tags = list()
with open('voc.txt', 'rb') as f:
    for line in f:
        tags.append(line.strip())
In [3]:
def construct_pred_mask(tags_predicted, predictat):
    """Binary mask keeping only the top-`predictat` scoring tags per track."""
    n_samples, n_tags = tags_predicted.shape
    rankings = np.argsort(-tags_predicted, axis=1)[:, :predictat]
    tags_predicted_binary = np.zeros_like(tags_predicted, dtype=bool)
    for i in xrange(n_samples):
        tags_predicted_binary[i, rankings[i]] = True
    return tags_predicted_binary


def per_tag_prec_recall(tags_predicted_binary, tags_true_binary):
    """Per-tag precision and recall; recall is only computed for tags that occur."""
    mask = np.logical_and(tags_predicted_binary, tags_true_binary)
    prec = mask.sum(axis=0) / (tags_predicted_binary.sum(axis=0) + np.spacing(1))
    tags_true_count = tags_true_binary.sum(axis=0).astype(float)
    idx = (tags_true_count > 0)
    recall = mask.sum(axis=0)[idx] / tags_true_count[idx]
    return prec, recall


def aroc_ap(tags_true_binary, tags_predicted):
    """Per-tag AROC and average precision, skipping tags with no positive example."""
    n_tags = tags_true_binary.shape[1]
    auc = list()
    aprec = list()
    for i in xrange(n_tags):
        if np.sum(tags_true_binary[:, i]) != 0:
            auc.append(roc_auc_score(tags_true_binary[:, i], tags_predicted[:, i]))
            aprec.append(average_precision_score(tags_true_binary[:, i], tags_predicted[:, i]))
    return auc, aprec


def print_out_metrics(tags_true_binary, tags_predicted, predictat):
    """Print precision@k, recall@k, F-score, AROC, and AP (with standard errors)."""
    tags_predicted_binary = construct_pred_mask(tags_predicted, predictat)
    prec, recall = per_tag_prec_recall(tags_predicted_binary, tags_true_binary)
    mprec, mrecall = np.mean(prec), np.mean(recall)
    print 'Precision = %.3f (%.3f)' % (mprec, np.std(prec) / np.sqrt(prec.size))
    print 'Recall = %.3f (%.3f)' % (mrecall, np.std(recall) / np.sqrt(recall.size))
    print 'F-score = %.3f' % (2 * mprec * mrecall / (mprec + mrecall))
    auc, aprec = aroc_ap(tags_true_binary, tags_predicted)
    print 'AROC = %.3f (%.3f)' % (np.mean(auc), np.std(auc) / np.sqrt(len(auc)))
    print 'AP = %.3f (%.3f)' % (np.mean(aprec), np.std(aprec) / np.sqrt(len(aprec)))
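As a quick sanity check, these helpers can be exercised on a small random toy matrix (a hypothetical example, not part of the original experiments; the sizes and seed below are arbitrary):

# hypothetical toy check of the evaluation helpers above
toy_rng = np.random.RandomState(0)
toy_true = (toy_rng.rand(8, 5) > 0.5)   # 8 "tracks", 5 "tags", boolean ground truth
toy_scores = toy_rng.rand(8, 5)         # fake per-tag scores
toy_mask = construct_pred_mask(toy_scores, 2)
toy_prec, toy_recall = per_tag_prec_recall(toy_mask, toy_true)
print 'toy precision@2 = %.3f, recall@2 = %.3f' % (np.mean(toy_prec), np.mean(toy_recall))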
In [4]:
# keep only the test tracks with at least 20 active tags
y_test = None
test_tracks_selected = list()
for tid in test_tracks:
    tdir = os.path.join('vq_hist', '/'.join(tid[2:5]))
    bot = np.load(os.path.join(tdir, '%s_BoT.npy' % tid))
    if (bot > 0).sum() >= 20:
        test_tracks_selected.append(tid)
        if y_test is None:
            y_test = bot
        else:
            y_test = np.vstack((y_test, bot))
In [5]:
# distribution of the number of active tags per selected test track
hist(np.sum((y_test > 0), axis=1), bins=50)
pass
In [6]:
K = 512  # codebook size: number of VQ codewords in each track's histogram
In [7]:
n_subset = 10000
np.random.seed(98765)
train_tracks_subset = np.random.choice(train_tracks, size=n_subset, replace=False)
In [8]:
D = K + len(tags)
X = np.empty((n_subset, D), dtype=np.int16)
for (i, tid) in enumerate(train_tracks_subset):
    tdir = os.path.join('vq_hist', '/'.join(tid[2:5]))
    vq = np.load(os.path.join(tdir, '%s_K%d.npy' % (tid, K))).ravel()
    bot = np.load(os.path.join(tdir, '%s_BoT.npy' % tid))
    # binarize the bag-of-tags counts and concatenate: [VQ histogram | binary BoT]
    bot[bot > 0] = 1
    X[i] = np.hstack((vq, bot))
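The same per-track loading pattern (VQ histogram concatenated with a binarized bag-of-tags vector) reappears in the out-of-core fit further down; as a sketch, it could be factored into a small helper (load_track_features is hypothetical, not part of the original code):

# hypothetical helper factoring out the per-track feature loading used above;
# assumes the same 'vq_hist' directory layout and file naming as in this notebook
def load_track_features(tid, K, binarize_bot=True):
    tdir = os.path.join('vq_hist', '/'.join(tid[2:5]))
    vq = np.load(os.path.join(tdir, '%s_K%d.npy' % (tid, K))).ravel()
    bot = np.load(os.path.join(tdir, '%s_BoT.npy' % tid))
    if binarize_bot:
        bot = (bot > 0).astype(bot.dtype)
    return np.hstack((vq, bot))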
In [9]:
# feature vector (VQ histogram + binary BoT) for one example training track
bar(np.arange(D), X[1000])
Out[9]:
In [10]:
X_test = np.empty((len(test_tracks_selected), K), dtype=np.int16)
for (i, tid) in enumerate(test_tracks_selected):
    tdir = os.path.join('vq_hist', '/'.join(tid[2:5]))
    vq = np.load(os.path.join(tdir, '%s_K%d.npy' % (tid, K))).ravel()
    X_test[i] = vq
In [11]:
n_components = 100
coder = pmf.PoissonMF(n_components=n_components, random_state=98765, verbose=True)
In [12]:
coder.fit(X)
Out[12]:
In [13]:
# randomly plot 30 "topics" (rows of the expected components Eb)
indices = np.random.choice(n_components, size=30, replace=False)
figure(figsize=(45, 15))
for i in xrange(30):
    subplot(10, 3, i + 1)
    topic = coder.Eb[indices[i]].copy()
    # rescale the BoT dimensions so they are visible next to the VQ dimensions
    topic[K:] /= topic[K:].max()
    topic[K:] *= topic[:K].max()
    bar(np.arange(D), topic)
    axvline(x=K, color='red')
    title('Component #%d' % indices[i])
#savefig('dict.eps')
In [14]:
tagger = pmf.PoissonMF(n_components=n_components, random_state=98765, verbose=True)
In [15]:
# transfer only the codeword (first K) dimensions of the learned components
tagger.set_components(coder.gamma_b[:, :K], coder.rho_b[:, :K])
Out[15]:
In [16]:
Et = tagger.transform(X_test)
In [17]:
# normalize the inferred topic weights, then score tags with the BoT part of Eb
Et /= Et.sum(axis=1, keepdims=True)
tags_predicted = Et.dot(coder.Eb[:, K:])
print tags_predicted.min(), tags_predicted.max()

# subtract a multiple of each tag's mean score (per-column recentering)
div_factor = 3
tags_predicted = tags_predicted - div_factor * np.mean(tags_predicted, axis=0)
In [18]:
predictat = 20
tags_true_binary = (y_test > 0)
print_out_metrics(tags_true_binary, tags_predicted, predictat)
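The transfer-and-predict steps above (copy the codeword part of the fitted components into a fresh PoissonMF, infer the topic weights Et for the test histograms, then score tags with the bag-of-tags part of Eb) are repeated below for both online models. As a sketch, they could be wrapped in a helper; predict_tags below is hypothetical and simply strings together the same pmf calls used in this notebook:

# hypothetical helper wrapping the transfer-and-predict steps above;
# assumes the fitted model exposes gamma_b, rho_b and Eb with the first K columns
# corresponding to the VQ codewords and the remaining columns to the tags
def predict_tags(fitted_coder, X_test, K, n_components, div_factor=3):
    tagger = pmf.PoissonMF(n_components=n_components, random_state=98765, verbose=False)
    tagger.set_components(fitted_coder.gamma_b[:, :K], fitted_coder.rho_b[:, :K])
    Et = tagger.transform(X_test)
    Et = Et / Et.sum(axis=1, keepdims=True)
    scores = Et.dot(fitted_coder.Eb[:, K:])
    return scores - div_factor * np.mean(scores, axis=0)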
In [19]:
n_components = 100
online_coder = pmf.OnlinePoissonMF(n_components=n_components, batch_size=500, n_pass=1,
random_state=98765, verbose=True)
In [20]:
online_coder.fit(X, est_total=len(train_tracks))
Out[20]:
In [21]:
# trace of the variational bound recorded during the online fit
plot(online_coder.bound)
pass
In [22]:
# entropy of each learned component (flatter components have higher entropy)
ents = np.zeros((n_components, ))
for k in xrange(n_components):
    ents[k] = scipy.stats.entropy(online_coder.Eb[k])
In [23]:
idx = np.argsort(-ents)
In [24]:
plot(ents[idx], '-o')
pass
In [25]:
tagger = pmf.PoissonMF(n_components=n_components, random_state=98765, verbose=True)
In [26]:
tagger.set_components(online_coder.gamma_b[:, :K], online_coder.rho_b[:, :K])
Out[26]:
In [27]:
Et = tagger.transform(X_test)
In [28]:
Et /= Et.sum(axis=1, keepdims=True)
tags_predicted = Et.dot(online_coder.Eb[:, K:])
n_samples, n_tags = tags_predicted.shape
print tags_predicted.min(), tags_predicted.max()
div_factor = 3
tags_predicted = tags_predicted - div_factor * np.mean(tags_predicted, axis=0)
In [29]:
predictat = 20
tags_true_binary = (y_test > 0)
print_out_metrics(tags_true_binary, tags_predicted, predictat)
In [30]:
# very naive implementation of out-of-core fitting for stochastic PMF:
# stream mini-batches of tracks from disk instead of materializing the full data matrix
def ooc_fit(obj, train_tracks, K, n_feats):
    n_samples = len(train_tracks)
    obj._scale = float(n_samples) / obj.batch_size
    obj._init_components(n_feats)
    obj.bound = list()
    for count in xrange(obj.n_pass):
        print 'Iteration %d: passing through the data...' % count
        indices = np.arange(n_samples)
        if obj.shuffle:
            np.random.shuffle(indices)
        for (i, istart) in enumerate(xrange(0, n_samples, obj.batch_size), 1):
            print 'Mini-batch %d:' % i
            iend = min(istart + obj.batch_size, n_samples)
            obj.set_learning_rate(iter=i)
            mini_batch = np.zeros((iend - istart, n_feats))
            for s in xrange(iend - istart):
                tid = train_tracks[indices[istart + s]]
                #print '\tRead in track: %s' % tid
                tdir = os.path.join('vq_hist', '/'.join(tid[2:5]))
                vq = np.load(os.path.join(tdir, '%s_K%d.npy' % (tid, K))).ravel()
                bot = np.load(os.path.join(tdir, '%s_BoT.npy' % tid))
                bot[bot > 0] = 1
                mini_batch[s] = np.hstack((vq, bot))
            obj.partial_fit(mini_batch)
            obj.bound.append(obj._stoch_bound(mini_batch))
    return obj
In [31]:
n_components = 100
batch_size = 1000
online_coder_full = pmf.OnlinePoissonMF(n_components=n_components, batch_size=batch_size, n_pass=1,
random_state=98765, verbose=True)
In [32]:
online_coder_full = ooc_fit(online_coder_full, train_tracks, K, D)
In [33]:
plot(online_coder_full.bound)
pass
In [34]:
ents = np.zeros((n_components, ))
for k in xrange(n_components):
    ents[k] = scipy.stats.entropy(online_coder_full.Eb[k])
In [35]:
idx = np.argsort(-ents)
In [36]:
plot(ents[idx], '-o')
pass
In [37]:
tagger = pmf.PoissonMF(n_components=n_components, random_state=98765, verbose=True)
In [38]:
tagger.set_components(online_coder_full.gamma_b[:, :K], online_coder_full.rho_b[:, :K])
Out[38]:
In [39]:
Et = tagger.transform(X_test)
In [40]:
Et /= Et.sum(axis=1, keepdims=True)
tags_predicted = Et.dot(online_coder_full.Eb[:, K:])
print tags_predicted.min(), tags_predicted.max()
div_factor = 3
tags_predicted = tags_predicted - div_factor * np.mean(tags_predicted, axis=0)
In [41]:
predictat = 20
tags_true_binary = (y_test > 0)
print_out_metrics(tags_true_binary, tags_predicted, predictat)