In [1]:
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt1
import timeit
import sys
import os
from sklearn.cross_validation import KFold
from collections import OrderedDict
import operator
import random
from sklearn.cluster import KMeans
import numpy as np
import scipy.linalg as LA
import scipy.sparse
import sklearn.utils.arpack as SLA
from sklearn.base import ClassifierMixin
from sklearn.base import BaseEstimator
from sklearn.manifold import spectral_embedding
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel
import sklearn.metrics.pairwise as pairwise
from sklearn import decomposition as pca
from scipy import interpolate as ip
import sklearn.mixture as mixture
import sys
from sklearn.metrics.pairwise import chi2_kernel
from sklearn.neighbors import DistanceMetric
from pyspark.sql import SQLContext
from pyspark.sql.types import *
%matplotlib inline

dataX,dataY=datasets.make_blobs(n_samples=1000, n_features=50, centers=2, cluster_std=3.5, center_box=(-10.0, 10.0), shuffle=True, random_state=None)

def labelremover(X,y):
    newX1 = np.around(X,decimals=2)
    newY1=np.copy(y)
    dim = X.shape[1]
    points = np.array(np.empty(len(np.unique(y))))
    knownX = np.empty((len(points),dim))
    knownY = np.empty(len(points))
    for i in np.unique(y):
        points[i] = np.where(y==(i))[0][0]
    for j in np.arange(0,len(newY1)):
        newY1[j]=-1
    for k in np.unique(y):
        newY1[points[k]] = y[points[k]]
    knownX = X[[i for i in points]]
    knownY = y[[i for i in points]]
    print "These are labels of known points: "+ str(knownY)
    return (newY1, knownX, knownY)

trainX = dataX[0:800,:]
trainY = dataY[0:800]
testX = dataX[800:1000,:]
testY = dataY[800:1000]


newtrainY, knownX, knownY = labelremover(trainX,trainY)


These are labels of known points: [0 1]
/home/madhura/.local/lib/python2.7/site-packages/IPython/kernel/__main__.py:46: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

In [ ]:
#standalone code
with open('/home/madhura/Computational_Olfaction/fergus-ssl/src/fergus_propagation.py') as source_file:
    exec(source_file.read())

fp = FergusPropagation()
fp.fit(trainX,newtrainY)
predicted_labels = fp.predict(testX)

In [8]:
#distributed code
%run LabelPropagationDistributed.ipynb
lpd = LabelPropagationDistributed()
dX = sc.parallelize(trainX)
dy = sc.parallelize(newtrainY)
lpd.fit(dX,dy)

plabels_ = lpd.predict(sc.parallelize(testX))

In [9]:
plt.scatter(trainX[:, 0], trainX[:, 1], marker='o', c=trainY, cmap = ('ocean'))


Out[9]:
<matplotlib.collections.PathCollection at 0x7f585807ba50>

In [10]:
plt.scatter(trainX[:,0], trainX[:,1], c=np.array(lpd.labels_.collect()), cmap = (('ocean')))


Out[10]:
<matplotlib.collections.PathCollection at 0x7f5853f5ffd0>

In [11]:
plt.scatter(trainX[:, 0], trainX[:, 1], marker='o', c=fp.labels_, cmap = ('ocean'))


Out[11]:
<matplotlib.collections.PathCollection at 0x7f5853e95510>

In [12]:
plt.scatter(testX[:, 0], testX[:, 1], marker='o', c=testY, cmap = ('ocean'))


Out[12]:
<matplotlib.collections.PathCollection at 0x7f5853dc7210>

In [13]:
plt.scatter(testX[:,0], testX[:,1], c=np.array(plabels_.collect()), cmap = (('ocean')))


Out[13]:
<matplotlib.collections.PathCollection at 0x7f5853d71790>

In [14]:
plt.scatter(testX[:, 0], testX[:, 1], marker='o', c=predicted_labels, cmap = ('ocean'))


Out[14]:
<matplotlib.collections.PathCollection at 0x7f5853c90b10>

In [23]:
np.where(trainY!=fp.labels_)[0].shape[0]


Out[23]:
8

In [21]:
np.where(trainY!=np.array(lpd.labels_.collect()))[0].shape[0]


Out[21]:
0

In [25]:
np.where(testY!=np.array(plabels_.collect()))[0].shape[0]


Out[25]:
0

In [26]:
np.where(testY!=predicted_labels)[0].shape[0]


Out[26]:
2