In [51]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import csv

%matplotlib inline

In [52]:
x=[]
y=[]

In [53]:
with open('userdata.csv', 'rb') as csvf:
    reader = csv.reader(csvf, delimiter=',')
    headers = next(reader)
    for row in reader:
        try:
            x.append(float(row[5]))
            y.append(float(row[7]))
        except ValueError,e:
            print "error",e,"on line",row

In [55]:
data=[]
for i in range(0,34):
  data.append([x[i],y[i]])

In [60]:
plt.figure(figsize=(6,6))

plt.xlabel("cost",fontsize=14)
plt.ylabel("duration", fontsize=14)

plt.title("Before Clustering ", fontsize=20)

plt.plot(x, y, 'k.', color='#0080ff', markersize=30, alpha=0.6)

plt.show()



In [61]:
kmeans = KMeans(init='k-means++', n_clusters=3, n_init=10)

# kmeans = KMeans(init='random', n_clusters=3, n_init=10)

kmeans.fit(data)


Out[61]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=3, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [63]:
plt.figure(figsize=(6,6))

plt.xlabel("cost",fontsize=14)
plt.ylabel("duration", fontsize=14)

plt.title("After K-Means Clustering", fontsize=20)

plt.plot(x, y, 'k.', color='#ffaaaa', markersize=45, alpha=0.6)

# Plot the centroids as a blue X
centroids = kmeans.cluster_centers_

plt.scatter(centroids[:, 0], centroids[:, 1], marker='x', s=200,
  linewidths=3, color='b', zorder=10)
plt.show()