In [1]:
import numpy as np
import pickle
from scipy.stats import pearsonr
import sys
%matplotlib inline
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2
sys.path.append('../')
import scripts.outliers as outliers
In [5]:
# Input locations, relative to the notebook's directory.
DATA_FILE = '../data/lda_data_8.pickle'
METADATA_FILE = '../data/metadata.csv'
# load_data comes from the project module scripts/outliers.py and returns three
# items; only `dataset` is unpacked further in this cell.
# NOTE(review): the contents of `ddf` and `w_dict` are not visible here; see
# scripts/outliers.py for their meaning.
dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)
# `dataset` unpacks into exactly three parts.
X_list, Y, Yaudio = dataset
# Join the arrays in X_list along axis 1 into a single array.
# Presumably each entry is one feature block with rows aligned to Y; TODO confirm.
X = np.concatenate(X_list, axis=1)
In [6]:
# Build the outlier summary from the concatenated array X and the labels Y.
# NOTE(review): chi2thr=0.999 is presumably a chi-square quantile used as the
# outlier cut-off, and MD presumably holds Mahalanobis distances; confirm
# against get_outliers_df in scripts/outliers.py.
df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)
# Bare last expression: the notebook renders the first rows of the summary.
df_global.head()
Out[6]:
In [10]:
# Pearson correlation between the 'Outliers' and 'N_Country' columns of
# df_global, followed by a scatter plot of the same two columns.
corr, pval = pearsonr(df_global['Outliers'], df_global['N_Country'])
# Single-argument print(...) with %-formatting emits the same text on
# Python 2 and Python 3; the original `print 'x', y` statement form is a
# SyntaxError on Python 3 kernels.
print('correlation %s' % (corr,))
print('p-value %s' % (pval,))
plt.scatter(df_global['Outliers'], df_global['N_Country'])
plt.xlabel('Outliers')
# Trailing semicolon keeps the notebook from echoing the Text object's repr.
plt.ylabel('N');
In [ ]: