In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

%matplotlib inline
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')
import scripts.classification as classification
import scripts.outliers as outliers


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Sample 80% of the dataset, for 10 times

Let's sample only 80% of the recordings each time (in a stratified manner) so that the set of recordings considered for each country is changed every time.


In [ ]:
results_file = '../data/lda_data_8.pickle'
n_iters = 10
for n in range(n_iters):
    print "iteration %d" % n
    print results_file
    X, Y, Yaudio = classification.load_data_from_pickle(results_file)
    # get only 80% of the dataset.. to vary the choice of outliers
    X, _, Y, _ = train_test_split(X, Y, train_size=0.8, stratify=Y)
    print X.shape, Y.shape
    # outliers
    print "detecting outliers..."
    df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)
    outliers.print_most_least_outliers_topN(df_global, N=10)
    
    # write output
    print "writing file"
    df_global.to_csv('../data/outliers_'+str(n)+'.csv', index=False)

In [3]:
n_iters = 10
ranked_countries = pd.DataFrame()
ranked_outliers = pd.DataFrame()
for n in range(n_iters):
    df_global = pd.read_csv('../data/outliers_'+str(n)+'.csv')
    df_global = df_global.sort_values('Outliers', axis=0, ascending=False).reset_index()
    ranked_countries = pd.concat([ranked_countries, df_global['Country']], axis=1)
    ranked_outliers = pd.concat([ranked_outliers, df_global['Outliers']], axis=1)
ranked_countries_arr = ranked_countries.get_values()

Estimate precision at K

First get the ground truth from a majority vote on the top K=10 positions.


In [5]:
# majority voting + precision at K
K_vote = 10
country_vote = Counter(ranked_countries_arr[:K_vote, :].ravel())

In [8]:
df_country_vote = pd.DataFrame.from_dict(country_vote, orient='index').reset_index()
df_country_vote.sort_values(0, ascending=False)


Out[8]:
index 0
0 Pakistan 10
2 Chad 10
5 Gambia 10
10 Ivory Coast 10
12 Botswana 10
6 Nepal 9
13 Benin 8
8 Senegal 7
9 French Guiana 7
4 El Salvador 5
11 Mozambique 5
7 Uganda 4
1 Bhutan 3
3 Liberia 2

In [9]:
def precision_at_k(array, gr_truth, k):
    return len(set(array[:k]) & set(gr_truth[:k])) / float(k)
    
k = 10
ground_truth = df_country_vote['index'].get_values()
p_ = []
for j in range(ranked_countries_arr.shape[1]):
    p_.append(precision_at_k(ranked_countries_arr[:, j], ground_truth, k))
p_ = np.array(p_)

In [10]:
print 'mean', np.mean(p_) 
print 'std', np.std(p_)


mean 0.67
std 0.0640312423743

In [11]:
print p_


[ 0.6  0.7  0.7  0.6  0.6  0.7  0.8  0.6  0.7  0.7]

In [ ]: