In [1]:

    
import numpy as np
import pickle
from scipy.stats import skew
from sklearn.metrics.pairwise import pairwise_distances

%matplotlib inline
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

Load data



In [2]:

    
# Load the pickled dataset: a 3-tuple whose first element is a sequence of
# arrays (concatenated below) followed by the objects bound to Y and Yaudio.
# NOTE(security): unpickling can execute arbitrary code -- only load trusted files.
with open('../data/lda_data_8.pickle', 'rb') as data_file:
    X_list, Y, Yaudio = pickle.load(data_file)

# Join the arrays in X_list along axis 1 into a single matrix.
X = np.concatenate(X_list, axis=1)
print(X.shape)









    



(8200, 381)

Pairwise distances



In [3]:

    
# Pairwise Mahalanobis distances between the rows of X.
# NOTE(review): no inverse covariance (VI) is passed, so it is presumably
# estimated from X by the library -- confirm for the installed version.
D = pairwise_distances(X, metric='mahalanobis')



In [6]:

    
# Histogram over every entry of D (the matrix is flattened as-is).
plt.hist(D.ravel(), bins=100)
plt.xlabel('Mahalanobis distance')
plt.ylabel('Count');

n-occurrence and stats



In [7]:

    
def n_occurrence_from_D(D, k=10, n_items=None):
    """Count how often each item occurs among the k nearest neighbours of other items.

    Parameters
    ----------
    D : array-like, shape (n, n)
        Pairwise distance matrix in which row ``i`` and column ``i`` refer to
        the same item.
    k : int, optional
        Number of nearest neighbours taken for every row (default 10).
    n_items : int or None, optional
        Minimum length of the returned vector; defaults to ``len(D)``.

    Returns
    -------
    N_k : ndarray of int
        ``N_k[j]`` is the number of rows that list item ``j`` among their
        ``k`` nearest neighbours. An item is never counted as its own
        neighbour, even when duplicates or ties mean it is not the first
        entry of its sorted row.
    """
    D = np.asarray(D)
    if n_items is None:
        n_items = len(D)

    # Stable sort: ties are broken by column index, so results are deterministic.
    order = np.argsort(D, axis=1, kind='mergesort')

    # Keep k + 1 candidates per row and then drop exactly one of them.
    candidates = order[:, :k + 1]
    n_rows, n_cand = candidates.shape
    if n_cand == 0:
        # Nothing to count (empty matrix or no candidates requested).
        return np.zeros(n_items, dtype=np.intp)

    # Mark the position of the item itself in each row of candidates.
    drop = candidates == np.arange(n_rows)[:, np.newaxis]
    # If the item itself is not among the candidates (more than k other items
    # tie with it), drop the farthest candidate instead so k entries remain.
    drop[~drop.any(axis=1), -1] = True

    neighbours = candidates[~drop].reshape(n_rows, n_cand - 1)
    return np.bincount(neighbours.ravel(), minlength=n_items)



In [8]:

    
# k = average number of samples per country
uniq_countries, uniq_counts = np.unique(Y, return_counts=True)
k = np.int(np.round(np.mean(uniq_counts)))
print k



In [9]:

    
N_k = n_occurrence_from_D(D, k=k)

# Hubs: items occurring in more than 1000 neighbour lists.
# Orphans: items occurring in none. Each mask is evaluated once.
n_hubs = np.count_nonzero(N_k > 1000)
n_orphans = np.count_nonzero(N_k == 0)
n_total = float(len(N_k))

# The skewness of the occurrence counts is reported as 'hubness'.
print('hubness %s' % (skew(N_k),))
print('median %s' % (np.median(N_k),))
print('std %s' % (np.std(N_k),))
print('hubs %s %s' % (n_hubs, n_hubs / n_total))
print('orphans %s %s' % (n_orphans, n_orphans / n_total))









    



hubness 10.5914304167
median 1.0
std 348.006117132
hubs 129 0.0157317073171
orphans 3332 0.406341463415



In [10]:

    
# Histogram of the occurrence counts N_k.
plt.figure()
plt.hist(N_k, bins=100)
plt.xlabel('N_k')
plt.ylabel('Number of items')

# The same counts sorted in ascending order.
plt.figure()
plt.plot(np.sort(N_k))
plt.xlabel('Item rank (ascending N_k)')
plt.ylabel('N_k');  # trailing ';' keeps the object repr out of the cell output









    Out[10]:





[<matplotlib.lines.Line2D at 0x7f91173fa350>]

Large hubs



In [15]:

    
# Indices of items that occur in more than 1000 neighbour lists.
large_hubs_idx = np.where(N_k > 1000)[0]
print(len(large_hubs_idx))
# Number of large hubs per distinct label in Y.
labs, counts = np.unique(Y[large_hubs_idx], return_counts=True)

# One bar per label, with the labels written vertically on the x axis.
plt.figure(figsize=(15, 5))
plt.bar(range(len(labs)), counts, align='center')
plt.xticks(range(len(labs)), labs, rotation=90)
plt.ylabel('Count')
plt.xlabel('Country origins for recordings identified as large hubs');  # ';' hides the Text repr









    



129






    Out[15]:





<matplotlib.text.Text at 0x7f9139b32490>



In [ ]: