In [1]:
import numpy as np
import pandas as pd
import pickle
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt
# import here in this order otherwise crashes the kernel,
# something wrong with shapely and pysal,
# shapely needs to be imported before pysal?
from mpl_toolkits.basemap import Basemap
from shapely.geometry import Point, Polygon
import sys
sys.path.append('../')
import scripts.outliers as outliers
import scripts.utils as utils
import scripts.interactive_plot as interactive_plot
In [ ]:
# Input files: a pickle file with the feature data and a CSV with the metadata.
DATA_FILE = '../data/lda_data_8.pickle'
METADATA_FILE = '../data/metadata.csv'
# load_data returns three objects, unpacked here as dataset, ddf and w_dict.
# NOTE(review): ddf is presumably the metadata DataFrame and w_dict a set of
# spatial weights used for the local outliers -- confirm in scripts/outliers.py.
dataset, ddf, w_dict = outliers.load_data(DATA_FILE, METADATA_FILE)
# dataset is a 3-tuple: a list of feature arrays (one per feature family),
# plus Y and Yaudio (presumably country labels and audio identifiers -- confirm).
X_list, Y, Yaudio = dataset
# Join the per-feature arrays along axis 1 into a single matrix X.
X = np.concatenate(X_list, axis=1)
In [6]:
# Global outliers over the concatenated feature matrix X.
# get_outliers_df returns three values, unpacked here as a summary frame, a
# threshold and MD; entries of MD above the threshold are treated as outliers.
# NOTE(review): MD is presumably the Mahalanobis distance and chi2thr the
# chi-square quantile used to derive the threshold -- confirm in scripts/outliers.py.
df_global, threshold, MD = outliers.get_outliers_df(X, Y, chi2thr=0.999)
outliers.print_most_least_outliers_topN(df_global, N=10)

# Build the boolean outlier mask once and reuse it for the figure and the count.
is_outlier = MD > threshold
tab_all = interactive_plot.plot_outliers_world_figure(MD, is_outlier, ddf)

# A parenthesised single-argument print produces the same output on Python 2
# and Python 3; the bare print statement is a SyntaxError on Python 3.
print("n outliers " + str(np.count_nonzero(is_outlier)))
In [10]:
# Outliers per feature family: repeat the outlier analysis on each array of
# X_list separately and collect one world figure per feature.
feat = X_list
feat_labels = ['rhythm', 'melody', 'timbre', 'harmony']
tabs_feat = []
for i, XX in enumerate(feat):
    # Single-argument print call: identical output on Python 2 and Python 3.
    print('outliers ' + feat_labels[i])
    # Use loop-local names so the global `threshold` and `MD` computed over the
    # full feature matrix are not silently overwritten by the last feature.
    df_feat, threshold_feat, MD_feat = outliers.get_outliers_df(XX, Y, chi2thr=0.999)
    outliers.print_most_least_outliers_topN(df_feat, N=5)
    tabs_feat.append(
        interactive_plot.plot_outliers_world_figure(MD_feat, MD_feat > threshold_feat, ddf)
    )
Save the interactive plot of music outliers to an HTML file.
In [42]:
# Combine the global figure and the per-feature figures and write them to the
# demo HTML page.
interactive_plot.plot_tabs(
    tab_all,
    tabs_feat,
    out_file="../demo/outliers.html",
)
In [11]:
# Local outliers: same inputs as the global analysis plus w_dict from load_data.
# NOTE(review): w_dict presumably maps each country to its spatial neighbours,
# so outliers are judged relative to neighbouring countries -- confirm in
# scripts/outliers.py.
df_local = outliers.get_local_outliers_df(X, Y, w_dict)
# Print the 10 entries at each end of the local outlier ranking.
outliers.print_most_least_outliers_topN(df_local, N=10)
First, cluster the recordings into K clusters (the best K is selected based on the silhouette score).
In [22]:
# Cluster the recordings; with bestncl=None the number of clusters is not fixed
# and candidates between min_ncl and max_ncl are considered (per the notebook
# text, the best K is chosen by silhouette score).
centroids, cl_pred = outliers.get_country_clusters(X, bestncl=None, min_ncl=10, max_ncl=30)
# Store each recording's cluster assignment alongside its metadata.
# Note: this adds/overwrites the 'Clusters' column of ddf in place.
ddf['Clusters'] = cl_pred
# Number of distinct clusters actually assigned. The parenthesised print call
# works on both Python 2 and Python 3.
print(len(np.unique(cl_pred)))
outliers.print_clusters_metadata(ddf, cl_pred)
Get histogram of cluster mappings for each country.
In [12]:
# Per-country cluster frequencies computed from X, Y and the cluster centroids.
# `centroids` is defined by the get_country_clusters cell, which must be run
# before this one.
cluster_freq = utils.get_cluster_freq_linear(X, Y, centroids)
# Show only the first rows as the cell output.
cluster_freq.head()
Out[12]:
In [ ]: