In [8]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Active in line mode
%pylab inline
# This is for making plots appeared in the document and not as an external window
In [9]:
# Total number of samples to generate
N_total = 1000
# Fraction of the samples that are anomalous
p_anomaly = 0.10
# Derive integer sample counts. The anomalous count is rounded to the nearest
# integer and the normal count is the remainder, so the two always add up to
# N_total. Truncating each product independently with int() can silently drop
# a sample because of floating-point error (e.g. 0.29 * 100 evaluates to
# 28.999999999999996, which int() turns into 28).
N_anomal = int(round(p_anomaly * N_total))
N_normal = N_total - N_anomal
In [27]:
# Anomalous population: N_anomal draws from a 2-D Gaussian centred at (5, 5)
# with an identity covariance matrix.
mean = np.array((5, 5))
cov = np.eye(2, dtype=int)
data_anomal = np.random.multivariate_normal(mean=mean, cov=cov, size=N_anomal)
In [28]:
# Normal population: N_normal draws from a 2-D Gaussian centred at the origin
# with an identity covariance matrix.
mean = np.zeros(2, dtype=int)
cov = np.eye(2, dtype=int)
data_normal = np.random.multivariate_normal(mean=mean, cov=cov, size=N_normal)
In [29]:
# Stack both populations row-wise into a single array:
# the normal samples come first, followed by the anomalous ones.
data = np.vstack((data_normal, data_anomal))
In [30]:
# Sanity check: show the shape of each population and of the combined array.
# The parenthesised single-argument form prints the same text on Python 2 and
# is valid syntax on Python 3, unlike the bare `print x` statement.
print(data_normal.shape)
print(data_anomal.shape)
print(data.shape)
In [31]:
# Scatter plot of the two populations: normal samples as red stars, anomalous
# samples as blue stars. Labels, legend and title make the figure readable on
# its own, without having to look at the code for the colour coding.
plt.plot(data_normal[:, 0], data_normal[:, 1], '*r', label='normal')
plt.plot(data_anomal[:, 0], data_anomal[:, 1], '*b', label='anomalous')
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Generated data: normal vs. anomalous samples')
plt.legend()
plt.show()
In [51]:
# Number of clusters K-means is asked to find
clusters = 2
# Radius of the circle drawn around each cluster centre in the final plot
anomaly_distance = 4
# `precompute_distances` is intentionally not passed: it was deprecated in
# scikit-learn 0.23 and removed in 1.0, where passing it raises a TypeError.
# Older releases default it to 'auto', so omitting it is accepted everywhere.
k_means = KMeans(n_clusters=clusters, init='k-means++', n_init=10, max_iter=300, tol=0.0001)
In [52]:
# Fit the K-means estimator to the combined (normal + anomalous) data set.
k_means.fit(data)
Out[52]:
In [53]:
# Centres found by the fit; indexed per cluster (row 0, row 1, ...) when the
# circles are drawn in the plotting cell.
cluster_centers = k_means.cluster_centers_
In [54]:
# Labels produced by the fit (per scikit-learn's KMeans API, the index of the
# cluster each sample was assigned to).
data_labels = k_means.labels_
In [57]:
# Plot every sample and draw a circle of radius `anomaly_distance` around each
# K-means centre.
plt.plot(data[:, 0], data[:, 1], '*')
fig = plt.gcf()
ax = fig.gca()
# Loop over all centres so the cell keeps working when `clusters` is not 2.
for center in cluster_centers:
    # add_patch (unlike add_artist) registers the circle with the axes' data
    # limits, so autoscaling can keep the whole circle in view.
    ax.add_patch(plt.Circle(center, anomaly_distance, color='b', fill=False))
# Equal scaling on both axes so the circles are not drawn as ellipses.
ax.set_aspect('equal')
ax.autoscale_view()
plt.show()
In [ ]: