In [49]:
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt1
import timeit
import sys
import os
from sklearn.cross_validation import KFold
from collections import OrderedDict
import operator
import random
from sklearn.cluster import KMeans
import numpy as np
import scipy.linalg as LA
import scipy.sparse
import sklearn.utils.arpack as SLA
from sklearn.base import ClassifierMixin
from sklearn.base import BaseEstimator
from sklearn.manifold import spectral_embedding
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel
import sklearn.metrics.pairwise as pairwise
from sklearn import decomposition as pca
from scipy import interpolate as ip
import sklearn.mixture as mixture
import sys
from sklearn.metrics.pairwise import chi2_kernel
from sklearn.neighbors import DistanceMetric
from pyspark.sql import SQLContext
from pyspark.sql.types import *
%matplotlib inline
# Synthetic data: 10,000 points in 50-d, 3 Gaussian blobs (std 3.5).
# random_state is pinned so the train/test split and the error rates
# computed below are reproducible across kernel restarts (it was None,
# which made every run of this stochastic analysis different).
dataX, dataY = datasets.make_blobs(n_samples=10000, n_features=50, centers=3,
                                   cluster_std=3.5, center_box=(-10.0, 10.0),
                                   shuffle=True, random_state=42)
def labelremover(X, y):
    """Blank out all labels except the first occurrence of each class.

    Simulates a semi-supervised setting: every entry of ``y`` is replaced
    by -1 ("unlabeled") except for one representative point per class --
    the first index at which each class value appears.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix.
    y : ndarray of shape (n_samples,)
        Integer class labels.

    Returns
    -------
    (newY, knownX, knownY) : tuple
        newY   -- copy of ``y`` with every entry set to -1 except the
                  known points, which keep their original label.
        knownX -- feature rows of the known points (one per class,
                  ordered by sorted class value).
        knownY -- labels of the known points, same order as knownX.
    """
    classes = np.unique(y)
    # First index at which each class occurs.  dtype=int so the values can
    # be used directly as indices -- the original built a float array here,
    # and float indexing raises in modern NumPy.  Indexing by position in
    # `classes` (rather than by label value) also works for non-contiguous
    # label sets; for labels 0..k-1 the result is identical.
    points = np.array([np.where(y == c)[0][0] for c in classes], dtype=int)
    newY = np.copy(y)
    newY[:] = -1                 # mark every point unlabeled...
    newY[points] = y[points]     # ...then restore one labeled point per class
    knownX = X[points]
    knownY = y[points]
    # Parenthesized single-argument print: identical output on Py2 and Py3.
    print("These are labels of known points: " + str(knownY))
    return (newY, knownX, knownY)
# 80/20 holdout: first 8000 samples for training, last 2000 for testing.
trainX, trainY = dataX[:8000], dataY[:8000]
testX, testY = dataX[8000:], dataY[8000:]
# Strip the training labels down to one known point per class.
newtrainY, knownX, knownY = labelremover(trainX, trainY)
In [50]:
# Standalone (single-machine) reference implementation.
# FergusPropagation is brought into scope by exec'ing its source file.
# NOTE(review): hardcoded absolute path -- machine-specific.
with open('/home/madhura/Computational_Olfaction/fergus-ssl/src/fergus_propagation.py') as fh:
    standalone_src = fh.read()
exec(standalone_src)
fp = FergusPropagation()
fp.fit(trainX, newtrainY)
predicted_labels = fp.predict(testX)
In [51]:
#distributed code
%run LabelPropagationDistributed.ipynb
lpd = LabelPropagationDistributed()
dX = sc.parallelize(trainX)
dy = sc.parallelize(newtrainY)
lpd.fit(dX,dy)
plabels_ = lpd.predict(sc.parallelize(testX))
In [52]:
# Ground-truth labels on the training set (first two feature dimensions).
plt.scatter(trainX[:, 0], trainX[:, 1], c=trainY, marker='o', cmap='ocean')
Out[52]:
In [53]:
# Labels inferred by the distributed model on the training set.
train_labels_dist = np.array(lpd.labels_.collect())
plt.scatter(trainX[:, 0], trainX[:, 1], c=train_labels_dist, cmap='ocean')
Out[53]:
In [54]:
# Labels inferred by the standalone model on the training set.
plt.scatter(trainX[:, 0], trainX[:, 1], c=fp.labels_, marker='o', cmap='ocean')
Out[54]:
In [55]:
# Ground-truth labels on the held-out test set.
plt.scatter(testX[:, 0], testX[:, 1], c=testY, marker='o', cmap='ocean')
Out[55]:
In [56]:
# Labels predicted by the distributed model on the test set.
test_labels_dist = np.array(plabels_.collect())
plt.scatter(testX[:, 0], testX[:, 1], c=test_labels_dist, cmap='ocean')
Out[56]:
In [57]:
# Labels predicted by the standalone model on the test set.
plt.scatter(testX[:, 0], testX[:, 1], c=predicted_labels, marker='o', cmap='ocean')
Out[57]:
In [58]:
# Number of training points the standalone model mislabels.
np.count_nonzero(trainY != fp.labels_)
Out[58]:
In [59]:
# Number of training points the distributed model mislabels.
np.count_nonzero(trainY != np.array(lpd.labels_.collect()))
Out[59]:
In [60]:
# Number of test points the distributed model mispredicts.
np.count_nonzero(testY != np.array(plabels_.collect()))
Out[60]:
In [61]:
# Number of test points the standalone model mispredicts.
np.count_nonzero(testY != predicted_labels)
Out[61]:
In [62]:
def getcount(gt, new):
    """Return the fraction of positions where prediction `new` disagrees
    with ground truth `gt` (i.e. the misclassification rate)."""
    mismatch_count = np.count_nonzero(np.asarray(gt) != np.asarray(new))
    return mismatch_count / float(len(gt))
print "Training Error in standalone:",
print getcount(trainY, fp.labels_)
print "Training Error in distributed:",
print getcount(trainY, lpd.labels_.collect())
print "Test Error in standalone:",
print getcount(testY, predicted_labels)
print "Test Error in distributed:",
print getcount(testY, plabels_.collect())