In [63]:
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt1
import timeit
import sys
import os
from sklearn.cross_validation import KFold
from collections import OrderedDict
import operator
import random
from sklearn.cluster import KMeans
import numpy as np
import scipy.linalg as LA
import scipy.sparse
import sklearn.utils.arpack as SLA
from sklearn.base import ClassifierMixin
from sklearn.base import BaseEstimator
from sklearn.manifold import spectral_embedding
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel
import sklearn.metrics.pairwise as pairwise
from sklearn import decomposition as pca
from scipy import interpolate as ip
import sklearn.mixture as mixture
import sys
from sklearn.metrics.pairwise import chi2_kernel
from sklearn.neighbors import DistanceMetric
from pyspark.sql import SQLContext
from pyspark.sql.types import *
%matplotlib inline

dataX,dataY=datasets.make_blobs(n_samples=20000, n_features=50, centers=3, cluster_std=3.5, center_box=(-10.0, 10.0), shuffle=True, random_state=None)

def labelremover(X,y):
    newX1 = np.around(X,decimals=2)
    newY1=np.copy(y)
    dim = X.shape[1]
    points = np.array(np.empty(len(np.unique(y))))
    knownX = np.empty((len(points),dim))
    knownY = np.empty(len(points))
    for i in np.unique(y):
        points[i] = np.where(y==(i))[0][0]
    for j in np.arange(0,len(newY1)):
        newY1[j]=-1
    for k in np.unique(y):
        newY1[points[k]] = y[points[k]]
    knownX = X[[i for i in points]]
    knownY = y[[i for i in points]]
    print "These are labels of known points: "+ str(knownY)
    return (newY1, knownX, knownY)

trainX = dataX[0:18000,:]
trainY = dataY[0:18000]
testX = dataX[18000:20000,:]
testY = dataY[18000:20000]


newtrainY, knownX, knownY = labelremover(trainX,trainY)


These are labels of known points: [0 1 2]
/home/madhura/.local/lib/python2.7/site-packages/IPython/kernel/__main__.py:46: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

In [64]:
#standalone code
with open('/home/madhura/Computational_Olfaction/fergus-ssl/src/fergus_propagation.py') as source_file:
    exec(source_file.read())

fp = FergusPropagation()
fp.fit(trainX,newtrainY)
predicted_labels = fp.predict(testX)

In [65]:
#distributed code
%run LabelPropagationDistributed.ipynb
lpd = LabelPropagationDistributed()
dX = sc.parallelize(trainX)
dy = sc.parallelize(newtrainY)
lpd.fit(dX,dy)

plabels_ = lpd.predict(sc.parallelize(testX))

In [66]:
plt.scatter(trainX[:, 0], trainX[:, 1], marker='o', c=trainY, cmap = ('ocean'))


Out[66]:
<matplotlib.collections.PathCollection at 0x7f58532cb550>

In [67]:
plt.scatter(trainX[:,0], trainX[:,1], c=np.array(lpd.labels_.collect()), cmap = (('ocean')))


Out[67]:
<matplotlib.collections.PathCollection at 0x7f5853252510>

In [68]:
plt.scatter(trainX[:, 0], trainX[:, 1], marker='o', c=fp.labels_, cmap = ('ocean'))


Out[68]:
<matplotlib.collections.PathCollection at 0x7f5853191110>

In [69]:
plt.scatter(testX[:, 0], testX[:, 1], marker='o', c=testY, cmap = ('ocean'))


Out[69]:
<matplotlib.collections.PathCollection at 0x7f58530c7490>

In [70]:
plt.scatter(testX[:,0], testX[:,1], c=np.array(plabels_.collect()), cmap = (('ocean')))


Out[70]:
<matplotlib.collections.PathCollection at 0x7f58530061d0>

In [71]:
plt.scatter(testX[:, 0], testX[:, 1], marker='o', c=predicted_labels, cmap = ('ocean'))


Out[71]:
<matplotlib.collections.PathCollection at 0x7f5852f17c10>

In [72]:
np.where(trainY!=fp.labels_)[0].shape[0]


Out[72]:
353

In [73]:
np.where(trainY!=np.array(lpd.labels_.collect()))[0].shape[0]


Out[73]:
3331

In [74]:
np.where(testY!=np.array(plabels_.collect()))[0].shape[0]


Out[74]:
387

In [75]:
np.where(testY!=predicted_labels)[0].shape[0]


Out[75]:
49

In [76]:
def getcount(gt,new):
    diff = np.where(gt!=new)[0].shape[0]
    return (float(diff)/float(len(gt)))
    

print "Training Error in standalone:",
print getcount(trainY, fp.labels_)
print "Training Error in distributed:",
print getcount(trainY, lpd.labels_.collect())
print "Test Error in standalone:",
print getcount(testY, predicted_labels)
print "Test Error in distributed:",
print getcount(testY, plabels_.collect())


Training Error in standalone: 0.0196111111111
Training Error in distributed: 0.185055555556
Test Error in standalone: 0.0245
Test Error in distributed: 0.1935