In [6]:

    
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

%matplotlib inline
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')
import scripts.classification as classification
import scripts.outliers as outliers









    



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Sample 80% of the dataset, for 10 times

Let's sample only 80% of the recordings each time (in a stratified manner) so that the set of recordings considered for each country is changed every time.



In [ ]:

    
results_file = '../data/lda_data_8.pickle'
n_iters = 10
for n in range(n_iters):
    print "iteration %d" % n
    print results_file
    X, Y, Yaudio = classification.load_data_from_pickle(results_file)
    # get only 80% of the dataset.. to vary the choice of outliers
    X, _, Y, _ = train_test_split(X, Y, train_size=0.8, stratify=Y)
    print X.shape, Y.shape
    # outliers
    print "detecting outliers..."
    df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)
    outliers.print_most_least_outliers_topN(df_global, N=10)
    
    # write output
    print "writing file"
    df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)



In [3]:

    
n_iters = 10
ranked_countries = pd.DataFrame()
ranked_outliers = pd.DataFrame()
for n in range(n_iters):
    df_global = pd.read_csv('../data/outliers_'+str(n)+'.csv')
    df_global = df_global.sort_values('Outliers', axis=0, ascending=False).reset_index()
    ranked_countries = pd.concat([ranked_countries, df_global['Country']], axis=1)
    ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)
ranked_countries_arr = ranked_countries.get_values()

Estimate precision at K

First get the ground truth from a majority vote on the top K=10 positions.



In [5]:

    
# majority voting + precision at K
K_vote = 10
country_vote = Counter(ranked_countries_arr[:K_vote, :].ravel())



In [8]:

    
df_country_vote = pd.DataFrame.from_dict(country_vote, orient='index').reset_index()
df_country_vote.sort_values(0, ascending=False)









    Out[8]:






  
    
      
      index
      0
    
  
  
    
      0
      Pakistan
      10
    
    
      2
      Chad
      10
    
    
      5
      Gambia
      10
    
    
      10
      Ivory Coast
      10
    
    
      12
      Botswana
      10
    
    
      6
      Nepal
      9
    
    
      13
      Benin
      8
    
    
      8
      Senegal
      7
    
    
      9
      French Guiana
      7
    
    
      4
      El Salvador
      5
    
    
      11
      Mozambique
      5
    
    
      7
      Uganda
      4
    
    
      1
      Bhutan
      3
    
    
      3
      Liberia
      2



In [9]:

    
def precision_at_k(array, gr_truth, k):
    return len(set(array[:k]) & set(gr_truth[:k])) / float(k)
    
k = 10
ground_truth = df_country_vote['index'].get_values()
p_ = []
for j in range(ranked_countries_arr.shape[1]):
    p_.append(precision_at_k(ranked_countries_arr[:, j], ground_truth, k))
p_ = np.array(p_)



In [10]:

    
print 'mean', np.mean(p_) 
print 'std', np.std(p_)









    



mean 0.67
std 0.0640312423743



In [11]:

    
print p_









    



[ 0.6  0.7  0.7  0.6  0.6  0.7  0.8  0.6  0.7  0.7]



In [ ]:

	index	0
0	Pakistan	10
2	Chad	10
5	Gambia	10
10	Ivory Coast	10
12	Botswana	10
6	Nepal	9
13	Benin	8
8	Senegal	7
9	French Guiana	7
4	El Salvador	5
11	Mozambique	5
7	Uganda	4
1	Bhutan	3
3	Liberia	2