In [1]:
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt1
import timeit
import sys
import os
from sklearn.cross_validation import KFold
from collections import OrderedDict
import operator
import random
from sklearn.cluster import KMeans
import numpy as np
import scipy.linalg as LA
import scipy.sparse
import sklearn.utils.arpack as SLA
from sklearn.base import ClassifierMixin
from sklearn.base import BaseEstimator
from sklearn.manifold import spectral_embedding
from pyspark.mllib.clustering import GaussianMixture, GaussianMixtureModel
import sklearn.metrics.pairwise as pairwise
from sklearn import decomposition as pca
from scipy import interpolate as ip
import sklearn.mixture as mixture
import sys
from sklearn.metrics.pairwise import chi2_kernel
from sklearn.neighbors import DistanceMetric
from pyspark.sql import SQLContext
from pyspark.sql.types import *
%matplotlib inline

dataX,dataY=datasets.make_blobs(n_samples=5000, n_features=20, centers=3, cluster_std=1.5, center_box=(-10.0, 10.0), shuffle=True, random_state=None)

def labelremover(X,y):
    newX1 = np.around(X,decimals=2)
    newY1=np.copy(y)
    dim = X.shape[1]
    points = np.array(np.empty(len(np.unique(y))))
    knownX = np.empty((len(points),dim))
    knownY = np.empty(len(points))
    for i in np.unique(y):
        points[i] = np.where(y==(i))[0][0]
    for j in np.arange(0,len(newY1)):
        newY1[j]=-1
    for k in np.unique(y):
        newY1[points[k]] = y[points[k]]
    knownX = X[[i for i in points]]
    knownY = y[[i for i in points]]
    print "These are labels of known points: "+ str(knownY)
    return (newY1, knownX, knownY)

trainX = dataX[0:4800,:]
trainY = dataY[0:4800]
testX = dataX[4800:5000,:]
testY = dataY[4800:5000]


newtrainY, knownX, knownY = labelremover(trainX,trainY)


These are labels of known points: [0 1 2]
/home/madhura/.local/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/madhura/.local/lib/python2.7/site-packages/IPython/kernel/__main__.py:46: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future

In [100]:
#standalone code
with open('/home/madhura/Computational_Olfaction/fergus-ssl/src/fergus_propagation.py') as source_file:
    exec(source_file.read())

fp = FergusPropagation()
fp.fit(trainX,newtrainY)
predicted_labels = fp.predict(testX)


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-100-08633747acaf> in <module>()
      4 
      5 fp = FergusPropagation()
----> 6 fp.fit(trainX,newtrainY)
      7 predicted_labels = fp.predict(testX)

<string> in fit(self, X, y)

/usr/lib/python2.7/dist-packages/numpy/lib/twodim_base.pyc in diag(v, k)
    277     if len(s) == 1:
    278         n = s[0]+abs(k)
--> 279         res = zeros((n, n), v.dtype)
    280         if k >= 0:
    281             i = k

MemoryError: 

In [4]:
#distributed code
%run LabelPropagationDistributed.ipynb
lpd = LabelPropagationDistributed(numBins = 7)
#from LabelPropagationDistributed import LabelPropagationDistributed as LPD
#lpd = LPD(sc=sc, sqlContext = sqlContext, numBins = 6)
dX = sc.parallelize(trainX)
dy = sc.parallelize(newtrainY)
lpd.fit(dX,dy)

plabels_ = lpd.predict(sc.parallelize(testX))

In [ ]:
lpd = LPD(sc=sc, sqlContext = sqlContext, numBins = 3)
dataX = np.array([[1,1], [2,3], [3,1], [4,10], [5,12], [6,13]])
dataY = np.array([0,0,0,1,1,1])
newdataY = np.array([0,-1,-1,-1,-1,1])
testX = np.array([[1,-1], [3,-0.5],[7,5]])
testY = np.array([0,0,1])
dX = sc.parallelize(dataX)
dy = sc.parallelize(newdataY)
lpd.fit(dX,dy)
plabels_ = lpd.predict(sc.parallelize(testX))
plt.scatter(dataX[:, 0], dataX[:, 1], marker='o', c=dataY, cmap = ('GnBu'))
plt.scatter(dataX[:,0], dataX[:,1], marker = 'o', c=np.array(lpd.labels_.collect()), cmap = (('GnBu')))
plt.scatter(testX[:, 0], testX[:, 1], marker='o', c=testY, cmap = ('GnBu'))
plt.scatter(testX[:,0], testX[:,1], c=np.array(plabels_.collect()), cmap = (('GnBu')))

In [9]:
plt.scatter(np.array([1,2,3,4,5,6]), np.array([1,3,1,10,12,13]), marker = 'o', c = np.array([0,0,0,1,1,1]), cmap = ('ocean'))


Out[9]:
<matplotlib.collections.PathCollection at 0x7f8b09417150>

In [10]:
plt.scatter(np.array([1,3,7]), np.array([-1,-0.5,15]), marker = 'o', c = np.array([0,0,1]), cmap = ('ocean'))


Out[10]:
<matplotlib.collections.PathCollection at 0x7f8b092c9ad0>

In [120]:
plt.scatter(trainX[:, 0], trainX[:, 1], marker='o', c=trainY, cmap = ('ocean'))


Out[120]:
<matplotlib.collections.PathCollection at 0x7f5851e69b50>

In [121]:
plt.scatter(trainX[:,0], trainX[:,1], c=np.array(lpd.labels_.collect()), cmap = (('ocean')))


Out[121]:
<matplotlib.collections.PathCollection at 0x7f5851da2b90>

In [ ]:
plt.scatter(trainX[:, 0], trainX[:, 1], marker='o', c=fp.labels_, cmap = ('ocean'))

In [122]:
plt.scatter(testX[:, 0], testX[:, 1], marker='o', c=testY, cmap = ('ocean'))


Out[122]:
<matplotlib.collections.PathCollection at 0x7f5851cce7d0>

In [123]:
plt.scatter(testX[:,0], testX[:,1], c=np.array(plabels_.collect()), cmap = (('ocean')))


Out[123]:
<matplotlib.collections.PathCollection at 0x7f5851c75d50>

In [ ]:
plt.scatter(testX[:, 0], testX[:, 1], marker='o', c=predicted_labels, cmap = ('ocean'))

In [ ]:
np.where(trainY!=fp.labels_)[0].shape[0]

In [124]:
np.where(trainY!=np.array(lpd.labels_.collect()))[0].shape[0]


Out[124]:
3164

In [125]:
np.where(testY!=np.array(plabels_.collect()))[0].shape[0]


Out[125]:
1311

In [ ]:
np.where(testY!=predicted_labels)[0].shape[0]

In [ ]:
def getcount(gt,new):
    diff = np.where(gt!=new)[0].shape[0]
    return (float(diff)/float(len(gt)))
    

print "Training Error in standalone:",
print getcount(trainY, fp.labels_)
print "Training Error in distributed:",
print getcount(trainY, lpd.labels_.collect())
print "Test Error in standalone:",
print getcount(testY, predicted_labels)
print "Test Error in distributed:",
print getcount(testY, plabels_.collect())