Objectives
Importing libraries
In [150]:
%matplotlib inline
import time
import warnings
import multiprocessing

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from scipy.spatial.distance import cdist, pdist

from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.covariance import EmpiricalCovariance, MinCovDet
from sklearn.datasets import load_digits, make_blobs
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import pairwise_distances_argmin
from sklearn.preprocessing import scale

# Silence deprecation/convergence warnings for readability
warnings.filterwarnings('ignore')
Loading the different datasets:
In [151]:
%%time
known = pd.read_csv('../data/known.csv')
rogues = pd.read_csv('../data/rogues.csv')
transactions = pd.read_csv('../data/edges.csv').drop('Unnamed: 0', axis=1)
# Drop the stray index column and fill missing values with 0
df = pd.read_csv('../data/features_full.csv').drop('Unnamed: 0', axis=1).fillna(0)
df = df.set_index(['nodes'])
# Standardize the features to zero mean and unit variance
data = scale(df.values)
n_sample = 10000
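Note: the features are standardized with scale (zero mean, unit variance per column) before clustering. K-means relies on Euclidean distances, so without scaling the features with the largest raw ranges would dominate the objective.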
Exploring clustering methods on the node-features dataset
First, a very simple k-means model
In [3]:
# Define the estimator; here n_clusters=6 and n_init=10
#kmeans = KMeans(init='k-means++', n_clusters=6, n_init=10)
In [4]:
#kmeans.fit(data)
Code adapted from http://www.slideshare.net/SarahGuido/kmeans-clustering-with-scikitlearn#notes-panel
In [160]:
#Quick PCA for k selection
X = PCA(n_components=2).fit_transform(data)
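Projecting onto the first two principal components keeps the directions of largest variance and makes the clusters plottable; the k-selection below is run on this 2-D projection rather than on the full feature space.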
In [162]:
%%time
# Determine the k range to explore
k_range = range(1, 14)
# Fit a k-means model for each k
k_means_var = [KMeans(n_clusters=k).fit(X) for k in k_range]
# Pull out the centroids of each model
# (do not reuse X as the loop variable: it would shadow the projected data)
centroids = [model.cluster_centers_ for model in k_means_var]
In [156]:
%%time
# Calculate the Euclidean distance from each point to each centroid
k_euclid = [cdist(X, cent, 'euclidean') for cent in centroids]
dist = [np.min(ke, axis=1) for ke in k_euclid]
# Total within-cluster sum of squares
wcss = [sum(d**2) for d in dist]
# The total sum of squares
tss = sum(pdist(X)**2)/X.shape[0]
# The between-cluster sum of squares
bss = tss - np.array(wcss)
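This uses the classical decomposition TSS = WCSS + BSS: the total sum of squares splits into a within-cluster part and a between-cluster part, so bss/tss is the fraction of variance explained by the partition. It increases monotonically with k, and the "elbow" of the curve below is a common heuristic for picking k.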
In [ ]:
%%time
plt.plot(k_range, bss/tss, '-bo')
plt.xlabel('number of clusters')
plt.ylabel('% of variance explained')
plt.title('Variance explained vs. k')
plt.grid(True)
plt.show()
In [ ]:
# Rule of thumb: k ≈ sqrt(n/2)
np.sqrt(data.shape[0]/2)
This rule-of-thumb estimate of k is far larger than what the elbow curve suggests, which is odd.
Code adapted from the scikit-learn documentation (KMeans vs. MiniBatchKMeans comparison)
In [6]:
batch_size = 10
n_clusters = 6
#PCA
X = PCA(n_components=2).fit_transform(data)
##############################################################################
# Compute clustering with KMeans
k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10, random_state=2)
t0 = time.time()
k_means.fit(X)
t_batch = time.time() - t0
k_means_labels = k_means.labels_
k_means_cluster_centers = k_means.cluster_centers_
k_means_labels_unique = np.unique(k_means_labels)
##############################################################################
# Compute clustering with MiniBatchKMeans
mbk = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters, batch_size=batch_size,
                      n_init=10, max_no_improvement=10, verbose=0, random_state=2)
t0 = time.time()
mbk.fit(X)
t_mini_batch = time.time() - t0
mbk_means_labels = mbk.labels_
mbk_means_cluster_centers = mbk.cluster_centers_
mbk_means_labels_unique = np.unique(mbk_means_labels)
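MiniBatchKMeans updates the centroids from small random subsets of the data (here batch_size=10) instead of the full dataset at each iteration, which makes it much faster on large datasets at the price of a slightly worse inertia. The plots below compare the two partitions and highlight the points they assign differently.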
In [7]:
##############################################################################
# Plot result
fig = plt.figure(figsize=(15, 5))
colors = ['#4EACC5', '#FF9C34', '#4E9A06','#FF0000','#800000','purple']
#fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
# We want to have the same colors for the same cluster from the
# MiniBatchKMeans and the KMeans algorithm. Let's pair the cluster centers per
# closest one.
order = pairwise_distances_argmin(k_means_cluster_centers,
mbk_means_cluster_centers)
# KMeans
ax = fig.add_subplot(1, 3, 1)
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.', markersize=10)
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=6)
ax.set_title('KMeans')
ax.set_xticks(())
ax.set_yticks(())
#plt.text(10,10, 'train time: %.2fs\ninertia: %f' % (
#t_batch, k_means.inertia_))
# Plot result
# MiniBatchKMeans
ax = fig.add_subplot(1, 3, 2)
for k, col in zip(range(n_clusters), colors):
    my_members = mbk_means_labels == order[k]
    cluster_center = mbk_means_cluster_centers[order[k]]
    ax.plot(X[my_members, 0], X[my_members, 1], 'w',
            markerfacecolor=col, marker='.', markersize=10)
    ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
            markeredgecolor='k', markersize=6)
ax.set_title('MiniBatchKMeans')
ax.set_xticks(())
ax.set_yticks(())
#plt.text(-5, 10, 'train time: %.2fs\ninertia: %f' %
#(t_mini_batch, mbk.inertia_))
# Plot result
# Initialise the `different` array to all False
# (the original `mbk_means_labels == 4` is not all False with 6 clusters)
different = np.zeros(mbk_means_labels.shape, dtype=bool)
ax = fig.add_subplot(1, 3, 3)
for k in range(n_clusters):
    different += ((k_means_labels == k) != (mbk_means_labels == order[k]))
identic = np.logical_not(different)
ax.plot(X[identic, 0], X[identic, 1], 'w',
        markerfacecolor='#bbbbbb', marker='.')
ax.plot(X[different, 0], X[different, 1], 'w',
        markerfacecolor='m', marker='.')
ax.set_title('Difference')
ax.set_xticks(())
ax.set_yticks(())
plt.show()
In [14]:
fig2 = plt.figure(figsize=(15, 10))
for k, col in zip(range(n_clusters), colors):
    my_members = k_means_labels == k
    cluster_center = k_means_cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], 'w',
             markerfacecolor=col, marker='.', markersize=13)
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=8)
plt.title('KMeans')
plt.show()
Objectives:
Explain the Mahalanobis distance
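For a point x and a distribution with mean μ and covariance matrix Σ, the Mahalanobis distance is d(x) = sqrt((x − μ)ᵀ Σ⁻¹ (x − μ)). It measures how far x is from the centre in units of the distribution's own spread, accounting for correlations between features, which makes it a natural outlier score. A minimal sketch of the computation (the point cloud pts is a hypothetical stand-in for the projected data):
In [ ]:
import numpy as np

def mahalanobis_dist(x, mean, cov):
    # sqrt((x - mean)^T . cov^{-1} . (x - mean))
    diff = x - mean
    return np.sqrt(np.dot(np.dot(diff, np.linalg.inv(cov)), diff))

# Hypothetical 2-D point cloud standing in for the PCA projection
pts = np.random.RandomState(0).randn(500, 2)
print(mahalanobis_dist(np.array([2.0, 0.0]), pts.mean(axis=0), np.cov(pts, rowvar=False)))
Note that scikit-learn's EmpiricalCovariance.mahalanobis and MinCovDet.mahalanobis, used below, return the squared distances, hence the np.sqrt around them.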
In [56]:
X = PCA(n_components=2).fit_transform(data)
# Compare the empirical covariance estimator with a robust (MCD) estimator on the projected data
emp_cov = EmpiricalCovariance().fit(X)
robust_cov = MinCovDet().fit(X)
###############################################################################
# Display results
fig = plt.figure(figsize=(15, 8))
plt.subplots_adjust(hspace=-.1, wspace=.4, top=.95, bottom=.05)
# Show data set
subfig1 = plt.subplot(1, 1, 1)
inlier_plot = subfig1.scatter(X[:, 0], X[:, 1],
color='black', label='points')
subfig1.set_xlim(subfig1.get_xlim()[0], 11.)
subfig1.set_title("Mahalanobis distances of a contaminated data set:")
# Show contours of the distance functions
xx, yy = np.meshgrid(np.linspace(plt.xlim()[0], plt.xlim()[1], 100),
np.linspace(plt.ylim()[0], plt.ylim()[1], 100))
zz = np.c_[xx.ravel(), yy.ravel()]
mahal_emp_cov = emp_cov.mahalanobis(zz)
mahal_emp_cov = mahal_emp_cov.reshape(xx.shape)
emp_cov_contour = subfig1.contour(xx, yy, np.sqrt(mahal_emp_cov),
cmap=plt.cm.PuBu_r,
linestyles='dashed')
mahal_robust_cov = robust_cov.mahalanobis(zz)
mahal_robust_cov = mahal_robust_cov.reshape(xx.shape)
robust_contour = subfig1.contour(xx, yy, np.sqrt(mahal_robust_cov),
cmap=plt.cm.YlOrBr_r, linestyles='dotted')
plt.xticks(())
plt.yticks(())
plt.show()
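The dashed contours come from the empirical covariance, which is pulled towards outliers; the dotted contours come from the Minimum Covariance Determinant estimator, which fits the covariance on the most concentrated subset of the points and is therefore robust to contamination. Points that sit far out on the robust contours are candidate anomalies, which is what we are after with the rogue nodes.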
In [16]:
k_means = KMeans(init='random', n_clusters=6, n_init=10, random_state=2)
clusters = k_means.fit_predict(data)
In [ ]:
df['clusters'] = clusters
In [ ]:
tagged = pd.merge(known, df, left_on='id', how='inner', right_index=True)
rogues_tag = pd.merge(rogues, df, left_on='id', how='inner', right_index=True)
In [149]:
distrib = pd.DataFrame(df.groupby('clusters').count()
                       .apply(lambda x: 100 * x / float(x.sum()))['total_degree'].values,
                       columns=['Global'])
distrib['Known'] = tagged.groupby('clusters').count().apply(lambda x: 100 * x / float(x.sum()))['id']
distrib['Rogues'] = rogues_tag.groupby('clusters').count().apply(lambda x: 100 * x / float(x.sum()))['id']
distrib['Clusters'] = distrib.index
distrib
distrib.get(['Global', 'Known', 'Rogues', 'Clusters']).groupby(['Clusters']).mean().plot(
    kind='bar', title='Cluster Distribution per Population');
In [138]:
# Several insights: feature means per cluster hint at the nature of each cluster
df.groupby('clusters').mean()
A helper function returning the cluster of a given node
In [146]:
# Look up the cluster label of a node in the feature dataframe
def get_cluster(node, df):
    return df.loc[node].clusters
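Usage, with a hypothetical node id:
In [ ]:
# 'node_42' is a hypothetical id; returns the cluster label stored in df
get_cluster('node_42', df)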
In [148]:
%%time
# Tag the `from` node of each transaction with its cluster
# (%%time must be the first line of the cell)
transactions['cluster_from'] = transactions['from'].map(lambda x: get_cluster(x, df))
In [ ]:
%%time
# Tag the `to` node of each transaction with its cluster
transactions['cluster_to'] = transactions['to'].map(lambda x: get_cluster(x, df))
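Mapping a Python function over every row is slow on a large edge list; a vectorized alternative would be to merge transactions with df['clusters'] on the node id, once for each endpoint.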