In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
import warnings
warnings.simplefilter('ignore')
In [52]:
# The water-treatment data file ships without a header row. Without header=None
# pandas would use the first observation as column names and silently drop it
# from the data; with it, the columns are labelled by position (0, 1, 2, ...).
df=pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/water-treatment/water-treatment.data', header=None)
In [53]:
# Display the (rows, columns) dimensions of the freshly loaded frame.
df.shape
Out[53]:
1 Q-E (input flow to plant)
2 ZN-E (input Zinc to plant)
3 PH-E (input pH to plant)
4 DBO-E (input Biological demand of oxygen to plant)
5 DQO-E (input chemical demand of oxygen to plant)
6 SS-E (input suspended solids to plant)
7 SSV-E (input volatile suspended solids to plant)
8 SED-E (input sediments to plant)
9 COND-E (input conductivity to plant)
10 PH-P (input pH to primary settler)
11 DBO-P (input Biological demand of oxygen to primary settler)
12 SS-P (input suspended solids to primary settler)
13 SSV-P (input volatile suspended solids to primary settler)
14 SED-P (input sediments to primary settler)
15 COND-P (input conductivity to primary settler)
16 PH-D (input pH to secondary settler)
17 DBO-D (input Biological demand of oxygen to secondary settler)
18 DQO-D (input chemical demand of oxygen to secondary settler)
19 SS-D (input suspended solids to secondary settler)
20 SSV-D (input volatile suspended solids to secondary settler)
21 SED-D (input sediments to secondary settler)
22 COND-D (input conductivity to secondary settler)
23 PH-S (output pH)
24 DBO-S (output Biological demand of oxygen)
25 DQO-S (output chemical demand of oxygen)
26 SS-S (output suspended solids)
27 SSV-S (output volatile suspended solids)
28 SED-S (output sediments)
29 COND-S (output conductivity)
30 RD-DBO-P (performance input Biological demand of oxygen in primary settler)
31 RD-SS-P (performance input suspended solids to primary settler)
32 RD-SED-P (performance input sediments to primary settler)
33 RD-DBO-S (performance input Biological demand of oxygen to secondary settler)
34 RD-DQO-S (performance input chemical demand of oxygen to secondary settler)
35 RD-DBO-G (global performance input Biological demand of oxygen)
36 RD-DQO-G (global performance input chemical demand of oxygen)
37 RD-SS-G (global performance input suspended solids)
38 RD-SED-G (global performance input sediments)
In [70]:
# Label the 39 columns by position: 0, 1, ..., 38.
headers = list(range(39))
df.columns = headers
In [62]:
# Display one randomly chosen row as a quick sanity check of the relabelled frame.
df.sample()
Out[62]:
In [56]:
# '?' is this file's missing-value marker; turn it into NaN so the columns can
# later be cast to float and imputed. A single replace is enough: the previous
# nested call built the cleaned frame and then passed that whole frame back in
# as the replacement value, which did the work twice and depended on how pandas
# happens to align a DataFrame-valued replacement.
df = df.replace('?', np.nan)
In [97]:
# df.isnull().sum()
In [79]:
# Cast the measurement columns (labels 1..38) from string to float in one call;
# column 0 is deliberately left untouched.
df = df.astype({col: float for col in range(1, 39)})
In [84]:
# Impute missing values in each measurement column (labels 1..38) with that
# column's mean. The mean is used as-is: wrapping it in int() truncated the
# fill value (a mean of 7.8 was written as 7, a mean below 1 as 0) and raised
# ValueError for a column whose mean is NaN (i.e. a column with no observed values).
for column in range(1,39):
    df[column] = df[column].fillna(df[column].mean())
In [91]:
# Remove column 0, the only column not converted to float, so that just the
# numeric measurement columns remain for clustering.
df = df.drop(labels=[0], axis="columns")
In [92]:
# Hold out 40% of the rows. The split is seeded so that Restart & Run All
# reproduces the same train/test partition (and therefore the same clusters).
# NOTE(review): train_test_split is imported at the top of the notebook from
# sklearn.cross_validation, which newer scikit-learn releases no longer ship;
# sklearn.model_selection is its replacement.
train, test = train_test_split(df, test_size=0.4, random_state=42)
In [93]:
# Display the dimensions of the training portion of the split.
train.shape
Out[93]:
In [94]:
# Display the dimensions of the held-out portion of the split.
test.shape
Out[94]:
In [95]:
# matplotlib format strings ("<colour><marker>"), indexed by cluster label.
# Every entry is a distinct single-letter colour with a point marker, so no two
# clusters are drawn alike. The previous list repeated 'r.' and 'g.', and its
# '-c.' entry reused cyan while also adding a line style.
# The list still has eight entries, and the first five are unchanged.
colors = ["g.", "r.", "c.", "y.", "k.", "m.", "b.", "w."]
In [99]:
# Fit a 6-cluster K-means model on the training rows.
# K-means starts from random centroids, so the estimator is seeded to make the
# centroids and label numbering reproducible across runs.
kmeans6 = KMeans(n_clusters=6, random_state=42)
kmeans6.fit(train)
# One row per cluster centre, one column per feature in `train`.
centroids = kmeans6.cluster_centers_
# Cluster index assigned to each row of `train`, in row order.
labels = kmeans6.labels_
In [101]:
# Display the coordinates of the first cluster centre.
centroids[0]
Out[101]:
In [106]:
# Display the cluster index assigned to each training row.
labels
Out[106]:
In [118]:
# Switch to a second dataset (a heights/weights/genders CSV). This rebinds `df`,
# so the water-treatment frame prepared earlier is no longer reachable by that name.
df=pd.read_csv("https://dl.dropboxusercontent.com/u/75194/stats/data/01_heights_weights_genders.csv")
In [122]:
# Display five randomly chosen rows of the newly loaded frame.
df.sample(5)
Out[122]:
In [125]:
# Drop the "Gender" column so only the remaining columns are clustered.
# Rebinding (instead of inplace=True) with errors="ignore" keeps the cell
# idempotent: re-running it after the column is gone no longer raises.
df = df.drop("Gender", axis=1, errors="ignore")
In [127]:
# Display five random rows again to confirm the "Gender" column is gone.
df.sample(5)
Out[127]:
In [128]:
# Hold out 40% of the rows; seeded so the same partition is produced on every run.
# This rebinds `train` and `test` from the earlier split.
train, test = train_test_split(df, test_size=0.4, random_state=42)
In [132]:
# Fit a 2-cluster K-means model on the training rows.
# NOTE: the variable keeps its earlier name `kmeans6` although this model has
# only two clusters. It is seeded so the centroids and the 0/1 label numbering
# are reproducible across runs.
kmeans6 = KMeans(n_clusters=2, random_state=42)
kmeans6.fit(train)
# One row per cluster centre, one column per feature in `train`.
centroids = kmeans6.cluster_centers_
# Cluster index (0 or 1) assigned to each row of `train`, in row order.
labels = kmeans6.labels_
In [133]:
# Display the coordinates of the two fitted cluster centres.
centroids
Out[133]:
In [134]:
# Display the cluster index assigned to each training row.
labels
Out[134]:
In [137]:
def toList(df):
    """Return the rows of ``df`` as a list of lists, in row order.

    Each inner list holds one row's values in column order. The frame's index
    and column labels are not part of the result. An empty frame yields ``[]``.

    Rows are read with ``itertuples`` rather than one ``DataFrame.get_value``
    call per cell: ``get_value`` was deprecated in pandas 0.21 and removed in
    1.0, and label-based scalar lookups misbehave when index labels repeat.
    """
    return [list(row) for row in df.itertuples(index=False, name=None)]
In [138]:
# Convert the training frame into a plain list of row lists for plotting.
lis=toList(train)
In [139]:
# Plot the training points coloured by cluster, then overlay the centroids.
# Points are drawn with one plt.plot call per cluster instead of one call per
# point, which creates far fewer artists and renders much faster.
points = np.asarray(lis)
point_labels = np.asarray(labels)
for cluster in np.unique(point_labels):
    members = points[point_labels == cluster]
    # linestyle="none" keeps the points as isolated markers even if the format
    # string for this cluster happens to include a line style.
    plt.plot(members[:, 0], members[:, 1], colors[cluster], markersize = 10, linestyle = "none")
# Centroids are drawn last and with a high zorder so they stay visible on top.
plt.scatter(centroids[:, 0],centroids[:, 1], marker = "x", s=150, linewidths = 5, zorder = 10)
# Label the axes with the first two column names of the frame `lis` was built from.
plt.xlabel(str(train.columns[0]))
plt.ylabel(str(train.columns[1]))
plt.show()
In [ ]: