In [1]:
import numpy as np
import pickle
from scipy.stats import skew
from sklearn.metrics.pairwise import pairwise_distances
%matplotlib inline
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
In [2]:
# Load the pickled (X_list, Y, Yaudio) tuple.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# The context manager guarantees the file handle is closed after loading.
with open('../data/lda_data_8.pickle', 'rb') as f:
    X_list, Y, Yaudio = pickle.load(f)
# Join the arrays in X_list side by side (along axis 1) into one matrix.
X = np.concatenate(X_list, axis=1)
print(X.shape)
In [3]:
# Pairwise Mahalanobis distances computed from X alone (no second array given).
D = pairwise_distances(
    X,
    metric='mahalanobis',
)
In [6]:
# Histogram over every entry of D (flattened), 100 bins.
all_distances = D.ravel()
plt.hist(all_distances, bins=100);
In [7]:
def n_occurrence_from_D(D, k=10, n_items=None):
    """Count how often each item appears among the k nearest neighbours of other items.

    Parameters
    ----------
    D : 2-D array-like
        Distance matrix; row i holds the distances from item i to every column item.
    k : int, optional
        Number of nearest neighbours taken from each row (default 10).
    n_items : int or None, optional
        Minimum length of the returned count vector. Defaults to ``len(D)``.

    Returns
    -------
    N_k : ndarray of int
        ``N_k[j]`` is the number of rows that have item j among their k
        nearest neighbours (the k-occurrence of item j).

    Notes
    -----
    For a square ``D`` the item itself is excluded by index rather than by
    assuming it is sorted first: when other items lie at the same distance as
    the item itself (e.g. duplicated points at distance 0), the item is not
    guaranteed to come first in the sort order.
    For a non-square ``D`` the first sorted column is dropped, as before.
    """
    D = np.asarray(D)
    if n_items is None:
        n_items = len(D)
    # A stable sort breaks ties by column index, so results are reproducible.
    sort_idx = np.argsort(D, axis=1, kind='mergesort')
    n_rows, n_cols = sort_idx.shape
    if n_rows == n_cols and n_rows > 0:
        # Drop each row's own index wherever it landed in the ordering;
        # exactly one entry per row is removed, so the reshape is safe.
        not_self = sort_idx != np.arange(n_rows)[:, np.newaxis]
        neighbours = sort_idx[not_self].reshape(n_rows, n_cols - 1)[:, :k]
    else:
        # Not a self-distance matrix: keep the original "skip first column" rule.
        neighbours = sort_idx[:, 1:(k + 1)]
    N_k = np.bincount(neighbours.ravel(), minlength=n_items)
    return N_k
In [8]:
# k = average number of samples per country
uniq_countries, uniq_counts = np.unique(Y, return_counts=True)
# Builtin int replaces np.int, which was only an alias for it and has been
# removed from recent NumPy releases.
k = int(np.round(np.mean(uniq_counts)))
print(k)
In [9]:
# k-occurrence of every item, then summary statistics of its distribution.
N_k = n_occurrence_from_D(D, k=k)
n_total = float(len(N_k))
# Items with more than this many occurrences are reported as hubs.
hub_threshold = 1000
n_hubs = np.count_nonzero(N_k > hub_threshold)
# Items that never appear in any neighbour list are reported as orphans.
n_orphans = np.count_nonzero(N_k == 0)
# Format strings print the same text under Python 2 and Python 3.
print('hubness {}'.format(skew(N_k)))
print('median {}'.format(np.median(N_k)))
print('std {}'.format(np.std(N_k)))
print('hubs {} {}'.format(n_hubs, n_hubs / n_total))
print('orphans {} {}'.format(n_orphans, n_orphans / n_total))
In [10]:
# Histogram of the k-occurrence counts.
plt.figure()
plt.hist(N_k, bins=100)
plt.xlabel('k-occurrence N_k')
plt.ylabel('Number of items')
# The same counts sorted in ascending order.
plt.figure()
plt.plot(np.sort(N_k))
plt.xlabel('Item rank (sorted by N_k)')
plt.ylabel('k-occurrence N_k');
Out[10]:
In [15]:
# Indices of items whose k-occurrence exceeds 1000 ("large hubs").
large_hubs_idx = np.where(N_k > 1000)[0]
print(len(large_hubs_idx))
# Count large hubs per label; assumes Y supports NumPy integer-array
# indexing (TODO confirm it is an ndarray).
labs, counts = np.unique(Y[large_hubs_idx], return_counts=True)
plt.figure(figsize=(15, 5))
plt.bar(range(len(labs)), counts, align='center')
plt.xticks(range(len(labs)), labs, rotation=90)
plt.ylabel('Count')
# Trailing semicolon keeps the Text repr out of the cell output.
plt.xlabel('Country origins for recordings identified as large hubs');
Out[15]:
In [ ]: