The emphasis of the last notebook was on the accuracy of Horn's algorithm. We analyzed the iris and crab datasets, and also a mixture of Gaussians.
Now we turn to the method's performance. The current gradient descent implementation (a port of the original code provided by Horn) is quite simple: the only stopping criterion is the number of iterations. Future work will involve implementing a more complete gradient descent, but for now we'll use the iteration count to test its performance.
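For reference, a fuller stopping rule would combine the iteration cap with a convergence tolerance. The sketch below is only illustrative; `grad`, `eta`, `max_steps` and `tol` are hypothetical names, not the actual `HornAlg.graddesc` interface:
In [ ]:
import numpy as np

def graddesc_sketch(grad, x0, eta=0.1, max_steps=200, tol=1e-6):
    # gradient descent with two stopping criteria: the iteration cap
    # used by the current port, plus a tolerance on the update size
    x = np.asarray(x0, dtype=float)
    for step in range(max_steps):
        update = eta * grad(x)
        x = x - update
        if np.linalg.norm(update) < tol:  # converged early
            break
    return x, step + 1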
Since we're only testing performance, accuracy is not essential, so we'll use only artificial data, where it's easy to scale dimensionality and cardinality and observe their effect on running time. Furthermore, raw data will be fed to the algorithm instead of the PCA representation. Also, while the algorithm proper is only the convergence part, we'll also time the assignment step for each test case.
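Throughout, timing is done through the `timeit=True` flag of the MyML routines; conceptually it amounts to a wall-clock wrapper like the hypothetical helper below (`timed` is not part of MyML, just a sketch of the pattern):
In [ ]:
import time

def timed(fn, *args, **kwargs):
    # hypothetical helper: run fn and return (result, elapsed seconds)
    start = time.time()
    result = fn(*args, **kwargs)
    return result, time.time() - start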
In [1]:
%pylab inline
In [2]:
import sklearn
from sklearn import preprocessing,decomposition,datasets
import seaborn as sns
In [13]:
%cd /home/chiroptera/workspace/QCThesis/MyML/cluster/
import Horn as HornAlg
reload(HornAlg)
In [14]:
import MyML.metrics.accuracy as determine_ci
reload(determine_ci)
In [15]:
# These are the "Tableau 20" colors as RGB.
tableau20 = [(31, 119, 180), (174, 199, 232), (255, 127, 14), (255, 187, 120),
(44, 160, 44), (152, 223, 138), (214, 39, 40), (255, 152, 150),
(148, 103, 189), (197, 176, 213), (140, 86, 75), (196, 156, 148),
(227, 119, 194), (247, 182, 210), (127, 127, 127), (199, 199, 199),
(188, 189, 34), (219, 219, 141), (23, 190, 207), (158, 218, 229)]
# Scale the RGB values to the [0, 1] range, which is the format matplotlib accepts.
for i in range(len(tableau20)):
    r, g, b = tableau20[i]
    tableau20[i] = (r / 255., g / 255., b / 255.)
We'll use a random sigma and a fixed number of 200 steps for all runs.
In [16]:
n_magnitude=[1,2,3,4] # 10**n_magnitude data points
n_features=[2,3,4,5]
centers=4
steps=200
sigma=np.random.rand()
minD=2
In [17]:
timings_clust=np.zeros((len(n_magnitude),len(n_features)))
timings_assign=np.zeros((len(n_magnitude),len(n_features)))
for i,mag in enumerate(n_magnitude):
    n_samples=10**mag
    for j,dim in enumerate(n_features):
        print "\nGenerating dataset with ",n_samples," samples, ",dim," features."
        x_Gauss,x_assign=sklearn.datasets.make_blobs(n_samples=n_samples,n_features=dim,centers=centers)
        print "Clustering..."
        gaussD,V,E,elapsed=HornAlg.graddesc(x_Gauss,sigma=sigma,steps=steps,timeit=True)
        print "Took ",elapsed," seconds"
        timings_clust[i,j]=elapsed
        print "Assigning..."
        assignment,elapsed=HornAlg.fineCluster(gaussD,minD,potential=V,timeit=True)
        timings_assign[i,j]=elapsed
        print "Took ",elapsed," seconds"
In [7]:
# timings (seconds) for the clustering and assignment phases,
# hard-coded from a prior run of the cell above
clust=array([[ 3.53820000e-01, 4.11391000e-01, 3.85114000e-01, 4.29747000e-01],
             [ 2.95465000e+00, 3.32259300e+00, 3.74372000e+00, 4.14382300e+00],
             [ 5.28406660e+01, 6.02932620e+01, 6.82256710e+01, 8.15232120e+01],
             [ 3.00967826e+03, 3.41834283e+03, 3.95628906e+03, 4.91818584e+03]])
assign=array([[ 1.59000000e-04, 1.58000000e-04, 1.56000000e-04, 1.57000000e-04],
              [ 5.19000000e-04, 5.06000000e-04, 5.21000000e-04, 5.16000000e-04],
              [ 7.67200000e-03, 8.27300000e-03, 7.77000000e-03, 8.07000000e-03],
              [ 3.95657000e-01, 4.00728000e-01, 4.00611000e-01, 4.26076000e-01]])
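As a quick sanity check (an addition here, not part of the original analysis), fitting a line to log10(time) against log10(n_samples) estimates the empirical scaling exponent of the clustering phase:
In [ ]:
for j in range(len(n_features)):
    # slope of log10(time) vs log10(n_samples) ~ empirical exponent
    slope, intercept = np.polyfit(n_magnitude, np.log10(clust[:, j]), 1)
    print('%d features: time ~ n_samples^%.2f' % (n_features[j], slope))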
In [59]:
fig = pyplot.figure(figsize=(16,18))
for i in range(4):
    ax = fig.add_subplot(3,2,i+1)
    ax.set_title(str(10**n_magnitude[i]) + ' samples')
    ax.set_xlabel('Features')
    ax.set_ylabel('Time [s]')
    ax.plot(n_features,clust[i], color=tableau20[i*2], lw=1)
    ax.set_yscale('linear')

ax = fig.add_subplot(3,2,5)
ax.set_title('Influence of cardinality')
ax.set_xlabel('Samples [powers of 10]')
ax.set_ylabel('Time [s]')
for i in range(4):
    ax.plot(n_magnitude,clust[:,i], color=tableau20[i*2], lw=1, label=str(n_features[i])+' features')
ax.set_yscale('linear')
ax.legend(loc='best')

ax = fig.add_subplot(3,2,6)
ax.set_title('Influence of cardinality (log scale)')
ax.set_xlabel('Samples [powers of 10]')
ax.set_ylabel('Time [s]')
for i in range(4):
    ax.plot(n_magnitude,clust[:,i], color=tableau20[i*2], lw=1, label=str(n_features[i])+' features')
ax.set_yscale('log')
ax.legend(loc='best')
In [53]:
feature_inc=np.zeros((4,3))
for i in range(1,4):
    feature_inc[:,i-1]=clust[:,i]/clust[:,i-1]

sample_inc=np.zeros((3,4))
for i in range(1,4):
    sample_inc[i-1,:]=clust[i,:]/clust[i-1,:]
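The same ratios can be computed without explicit loops using array slicing; this is an equivalent vectorized form of the cell above:
In [ ]:
feature_inc = clust[:, 1:] / clust[:, :-1]  # ratio when moving to the next feature count
sample_inc = clust[1:, :] / clust[:-1, :]   # ratio when multiplying samples by 10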
In [56]:
import pandas as pd
In [85]:
feature_inc_pd=pd.DataFrame(feature_inc, index=n_magnitude, columns=n_features[1:])
sample_inc_pd=pd.DataFrame(sample_inc, index=n_magnitude[1:], columns=n_features)
print '\n','time ratio when adding a feature (rows: 10^x samples; read across)','\n',feature_inc_pd
print '\n','time ratio when multiplying samples by 10 (columns: features; read down)','\n',sample_inc_pd