In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
%matplotlib inline
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../')
import scripts.classification as classification
import scripts.outliers as outliers
Let's sample only 80% of the recordings in each iteration (in a stratified manner) so that the set of recordings considered for each country changes every time.
In [ ]:
results_file = '../data/lda_data_8.pickle'
n_iters = 10

# The same pickle is used for every iteration, so read it once up front
# instead of re-loading it inside the loop.
# NOTE(review): assumes load_data_from_pickle is a plain read with no
# per-call randomness -- confirm in scripts/classification.py.
print(results_file)
X_all, Y_all, Yaudio = classification.load_data_from_pickle(results_file)

for n in range(n_iters):
    print("iteration %d" % n)
    # Keep a stratified 80% of the dataset to vary the choice of outliers.
    # Seeding with the iteration index gives every iteration a different
    # subsample while keeping the whole experiment reproducible on re-run.
    X, _, Y, _ = train_test_split(
        X_all, Y_all, train_size=0.8, stratify=Y_all, random_state=n)
    print("%s %s" % (X.shape, Y.shape))

    # outliers
    print("detecting outliers...")
    df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)
    outliers.print_most_least_outliers_topN(df_global, N=10)

    # write output: one CSV per iteration, read back by the ranking cell below
    print("writing file")
    df_global.to_csv('../data/outliers_%d.csv' % n, index=False)
In [3]:
n_iters = 10

# Collect one column per iteration and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous
# columns on every pass.
country_columns = []
outlier_columns = []
for n in range(n_iters):
    df_global = pd.read_csv('../data/outliers_%d.csv' % n)
    # Order rows by descending 'Outliers' value so that row position is the
    # rank of the country within this iteration.
    df_global = df_global.sort_values('Outliers', axis=0, ascending=False).reset_index()
    country_columns.append(df_global['Country'])
    outlier_columns.append(df_global['Outliers'])

# Rows are rank positions, columns are iterations.
ranked_countries = pd.concat(country_columns, axis=1)
ranked_outliers = pd.concat(outlier_columns, axis=1)

# .values replaces DataFrame.get_values(), which was deprecated in
# pandas 0.25 and removed in pandas 1.0.
ranked_countries_arr = ranked_countries.values
First get the ground truth from a majority vote on the top K=10 positions.
In [5]:
# Majority voting (used below for precision at K): count how often each
# country appears within the first K_vote rank positions over all iterations.
K_vote = 10
top_ranked = ranked_countries_arr[:K_vote, :]
country_vote = Counter(top_ranked.ravel())
In [8]:
# Tabulate the votes: column 'index' holds the country, column 0 its count.
# The sorted frame is assigned back (sort_values returns a new object), so
# that later cells reading df_country_vote see the countries ranked by votes
# rather than in the Counter's arbitrary order.
df_country_vote = (
    pd.DataFrame.from_dict(country_vote, orient='index')
    .reset_index()
    .sort_values(0, ascending=False)
)
df_country_vote
Out[8]:
In [9]:
def precision_at_k(array, gr_truth, k):
    """Return the overlap between the top-k of two rankings, as a fraction of k.

    Parameters
    ----------
    array : sequence
        Ranked items to evaluate; only the first ``k`` entries are used.
    gr_truth : sequence
        Reference ranking; only the first ``k`` entries are used.
    k : int
        Cut-off position. Must be positive.

    Returns
    -------
    float
        Number of distinct items shared by ``array[:k]`` and
        ``gr_truth[:k]``, divided by ``k``. Order within the top-k is
        ignored and duplicates count once.

    Raises
    ------
    ValueError
        If ``k`` is not positive (k == 0 would divide by zero and a negative
        k would produce a meaningless negative score).
    """
    if k <= 0:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    hits = set(array[:k]).intersection(gr_truth[:k])
    return len(hits) / float(k)
k = 10

# The ground truth must be the countries ordered by number of votes, because
# precision_at_k only looks at the first k entries. Sort explicitly here
# instead of relying on the row order df_country_vote happens to have.
# (.values replaces Series.get_values(), removed in pandas 1.0.)
ground_truth = df_country_vote.sort_values(0, ascending=False)['index'].values

# One precision score per iteration (each column of ranked_countries_arr).
p_ = np.array([
    precision_at_k(ranked_countries_arr[:, j], ground_truth, k)
    for j in range(ranked_countries_arr.shape[1])
])
In [10]:
# Summary of precision-at-k across iterations. Single-argument print(...) is
# valid on both Python 2 and Python 3 and prints the same text as before.
print("mean %s" % np.mean(p_))
print("std %s" % np.std(p_))
In [11]:
# Per-iteration precision-at-k values (parenthesised form works on Python 2 and 3).
print(p_)
In [ ]: