In [21]:
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import csv

%matplotlib inline

In [22]:
x=[]
y=[]

with open('userdata.csv', 'rb') as csvf:
    reader = csv.reader(csvf, delimiter=',')
    headers = next(reader)
    for row in reader:
        try:
            x.append(float(row[5]))
            y.append(float(row[7]))
        except ValueError,e:
            print "error",e,"on line",row

In [23]:
data=[]
for i in range(0,34):
  data.append([x[i],y[i]])

In [41]:
dbscan = DBSCAN(random_state=111)

In [42]:
dbscan


Out[42]:
DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    min_samples=5, p=None, random_state=111)

In [43]:
dbscan.fit(data)


C:\Users\faye\Anaconda2\lib\site-packages\sklearn\cluster\dbscan_.py:106: DeprecationWarning: The parameter random_state is deprecated in 0.16 and will be removed in version 0.18. DBSCAN is deterministic except for rare border cases.
  category=DeprecationWarning)
Out[43]:
DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
    min_samples=5, p=None, random_state=111)

In [55]:
dbscan.labels_


Out[55]:
array([ 0,  0,  0, -1, -1, -1,  1, -1,  1,  1,  0,  1,  1,  1, -1, -1, -1,
       -1, -1,  1, -1, -1,  2,  2, -1,  0, -1, -1,  2,  2, -1, -1,  2, -1], dtype=int64)

In [78]:
for i in range(0, 34):
    if dbscan.labels_[i] == 0:
        c1 = plt.scatter(data[i][0],data[i][1],c='r',marker='+', s=200)
    elif dbscan.labels_[i] == 1:
        c2 = plt.scatter(data[i][0],data[i][1],c='g',marker='o', s=200)
    elif dbscan.labels_[i] == 2:
        c3 = plt.scatter(data[i][0],data[i][1],c='y',marker='x', s=200)
    elif dbscan.labels_[i] == -1:
        c4 = plt.scatter(data[i][0],data[i][1],c='b',marker='*', s=200)

plt.legend([c1, c2, c3, c4], ['Cluster 1', 'Cluster 2','Cluster 3','Noise'])
plt.title('DBSCAN finds 3 clusters and noise')
plt.show()



In [ ]: