In [ ]:
'''
In this program you can run different clustering methods on a variety of data sets.
Author: Mohamadjavad Bahmani <mohamadjavad.bahmani@gmail.com>.
'''
# Path of the data set to load; the file is expected to sit beside this code file.
DATA = "iris.data"
# Import the required libraries
#================================================
from pandas import read_table
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
import seaborn
#================================================
def GetData():
    """Load the file named by DATA into a DataFrame and drop its fifth column.

    The file is parsed as comma-separated text without a header row;
    '?' entries are read as missing values.

    Returns:
        The parsed frame with the column at position 4 removed
        (presumably the class label of the iris data -- verify for other
        data sets).
    """
    parse_options = dict(
        encoding='latin-1',
        sep=',',
        skipinitialspace=True,
        na_values=['?'],
        index_col=None,
        header=None,
    )
    table = read_table(DATA, **parse_options)
    # Remove the fifth column in place so only the remaining columns are returned.
    fifth_column = table.columns[4]
    del table[fifth_column]
    return table
#================================================
def GetFeatures(frame):
    """Turn the loaded frame into a cleaned, scaled float feature matrix.

    Args:
        frame: Tabular data (e.g. the DataFrame returned by GetData) whose
            entries are all convertible to float.

    Returns:
        A float array in which missing values have been replaced by the
        mean of their column and the result has been passed through
        sklearn.preprocessing.scale.
    """
    from sklearn.preprocessing import scale
    try:
        # Current home of the imputer; the old preprocessing.Imputer class
        # was removed in scikit-learn 0.22.
        from sklearn.impute import SimpleImputer
    except ImportError:
        # Fall back to the legacy class on old scikit-learn installations.
        from sklearn.preprocessing import Imputer as SimpleImputer

    # Convert the data to float. The builtin float is used because the
    # np.float alias was removed in NumPy 1.24.
    arr = np.array(frame, dtype=float)
    # Fill missing values with the column mean.
    imputer = SimpleImputer(strategy='mean')
    arr = imputer.fit_transform(arr)
    # Normalize the entire data set.
    arr = scale(arr)
    return arr
#=================================================
def ReduceDimension(X):
    """Yield a labelled two-component FastICA projection of X.

    Args:
        X: Feature matrix handed to FastICA.fit_transform.

    Yields:
        A single ('ICA', first_component, second_component) tuple, where
        the two components are columns 0 and 1 of the transformed data.
    """
    from sklearn.decomposition import FastICA

    projected = FastICA(n_components=2).fit_transform(X)
    first_component = projected[:, 0]
    second_component = projected[:, 1]
    yield 'ICA', first_component, second_component
#=================================================
def Learning(X):
    """Yield a labelled agglomerative clustering of X with three clusters.

    Args:
        X: Feature matrix handed to AgglomerativeClustering.fit_predict.

    Yields:
        A single (label, cluster_ids) tuple, where cluster_ids is the
        result of fit_predict on X.
    """
    from sklearn.cluster import AgglomerativeClustering

    cluster_ids = AgglomerativeClustering(n_clusters=3).fit_predict(X)
    yield 'Agglomerative clusters(n=3)', cluster_ids
#=================================================
def Plot(Xs, predictions):
    """Show a grid of scatter plots, one per (projection, clustering) pair.

    Args:
        Xs: Iterable of (row_label, x_values, y_values) tuples; each one
            becomes a row of the grid.
        predictions: Iterable of (col_label, y_pred) tuples; each one
            becomes a column of the grid, with y_pred used to colour the
            points.

    The figure is displayed with plt.show() and closed afterwards; nothing
    is returned.
    """
    # Materialize both inputs: their lengths are needed for the grid, and
    # predictions is iterated once per row, which would exhaust a generator.
    Xs = list(Xs)
    predictions = list(predictions)

    # We will use subplots to display the results in a grid
    nrows = len(Xs)
    ncols = len(predictions)
    fig = plt.figure(figsize=(16, 8))
    # The window title is set through the figure manager;
    # FigureCanvas.set_window_title was removed in Matplotlib 3.6.
    manager = getattr(fig.canvas, 'manager', None)
    if manager is not None:
        manager.set_window_title('Clustering data from ' + DATA)

    # Fill the grid row by row
    for row, (row_label, X_x, X_y) in enumerate(Xs):
        for col, (col_label, y_pred) in enumerate(predictions):
            ax = plt.subplot(nrows, ncols, row * ncols + col + 1)
            if row == 0:
                plt.title(col_label)
            if col == 0:
                plt.ylabel(row_label)
            # Plot the decomposed input data and use the predicted
            # cluster index as the value in a color map. The builtin float
            # replaces the np.float alias removed in NumPy 1.24.
            colors = np.asarray(y_pred, dtype=float)
            plt.scatter(X_x, X_y, c=colors, cmap='prism', alpha=0.5)
            # Set the axis tick locator to reduce the number of ticks
            ax.xaxis.set_major_locator(MaxNLocator(nbins=4))
            ax.yaxis.set_major_locator(MaxNLocator(nbins=4))

    # Let matplotlib handle the subplot layout
    plt.tight_layout()
    plt.show()
    plt.close()
if __name__ == '__main__':
    # Step 1: read the data set named by DATA.
    raw_frame = GetData()
    print('1-Downloading data from {} is done'.format(DATA))

    # Step 2: build the feature matrix, then report the frame's dimensions.
    features = GetFeatures(raw_frame)
    sample_count, attribute_count = raw_frame.shape
    print("2-Processing {} samples with {} attributes is done".format(sample_count, attribute_count))

    # Step 3: collect the 2-D projections used as plot rows.
    projections = [*ReduceDimension(features)]
    print("3-Reducing dimensionality is done")

    # Step 4: collect the clustering results used as plot columns.
    clusterings = [*Learning(features)]
    print("4-Evaluating clustering learners is done")

    # Step 5: display the results.
    Plot(projections, clusterings)
    print("5-ploting results is done")