Here we generate a and load the data and set the relevant parameters


In [8]:
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans
# Active in line mode 
%pylab inline  
# This is for making plots appeared in the document and not as an external window


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['mean', 'cov']
`%matplotlib` prevents importing * from pylab and numpy

In [9]:
#Quantityt of data 
N_total = 1000 
# Percentage of data is that is anomalous 
p_anomaly = 0.10
N_normal = (1 - p_anomaly) * N_total
N_anomal = p_anomaly * N_total
# Make the number integeres
N_normal = int(N_normal)
N_anomal = int(N_anomal)

In [27]:
# Generate anomaly data
mean = np.array([5, 5])
cov = np.array([[1, 0], [0, 1]])
data_anomal = np.random.multivariate_normal(mean, cov, N_anomal)

In [28]:
# Generate normal data 
mean = np.array([0, 0])
cov = np.array([[1, 0], [0, 1]])
data_normal = np.random.multivariate_normal(mean, cov, N_normal)

In [29]:
# Gather the data in one vector
data = np.concatenate([data_normal, data_anomal])

In [30]:
print data_normal.shape
print data_anomal.shape
print data.shape


(900, 2)
(100, 2)
(1000, 2)

In [31]:
plt.plot(data_normal[:,0], data_normal[:,1], '*r')
plt.plot(data_anomal[:,0], data_anomal[:, 1], '*b')
plt.show()


This is an exmaple of header

Now we use the K-means algorithm to cluster the data


In [51]:
clusters = 2
anomaly_distance = 4
k_means = KMeans(n_clusters=clusters, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances=True)

In [52]:
k_means.fit(data)


Out[52]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=0)

In [53]:
cluster_centers = k_means.cluster_centers_

In [54]:
data_labels = k_means.labels_

In [57]:
circle_1 = plt.Circle(cluster_centers[0], anomaly_distance, color='b', fill=False)
circle_2 = plt.Circle(cluster_centers[1], anomaly_distance, color='b', fill=False)
plt.plot(data[:,0], data[:, 1], '*')
fig = plt.gcf()
fig.gca().add_artist(circle_1)
fig.gca().add_artist(circle_2)
plt.show()



In [ ]: