MNIST - Handwriting Recognition


In [1]:
from com.yahoo.ml.caffe.DisplayUtils import *

Training Data


In [2]:
df = sqlContext.read.parquet('mnist_test_dataframe')

In [3]:
show_df(df)


Out[3]:
Index     Label  Image
00000000      7  [digit image]
00000001      2  [digit image]
00000002      1  [digit image]
00000003      0  [digit image]
00000004      4  [digit image]
00000005      1  [digit image]
00000006      4  [digit image]
00000007      9  [digit image]
00000008      5  [digit image]
00000009      9  [digit image]
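
As a quick sanity check (not part of the original run), the DataFrame's schema and row count can be inspected directly; the exact column names depend on how the parquet file was written.

In [ ]:
# Sketch: inspect the MNIST DataFrame loaded above.
df.printSchema()
df.count()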

MNIST Network


In [4]:
show_network('lenet_dataframe_train_test.prototxt','LR')


Out[4]:
[rendered graph of the LeNet network, drawn left-to-right ('LR')]
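
The graph above is rendered from the prototxt file; to read the underlying layer definitions as text, the file can simply be printed (a plain-Python sketch, not in the original notebook):

In [ ]:
# Show the first 500 characters of the network definition.
with open('lenet_dataframe_train_test.prototxt') as fp:
    print(fp.read()[:500])
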
Training


In [5]:
from com.yahoo.ml.caffe.CaffeOnSpark import *
from com.yahoo.ml.caffe.Config import *
from com.yahoo.ml.caffe.DataSource import *
from pyspark.sql import DataFrame
from pyspark.mllib.linalg import Vectors
from pyspark.sql import Row

In [6]:
cos=CaffeOnSpark(sc,sqlContext)

In [7]:
args={}
args['conf']='lenet_dataframe_solver.prototxt'  # Caffe solver definition
args['model']='file:///tmp/lenet.model'         # where the trained model is saved
args['label']='label'                           # label column in the DataFrame
args['output']='file:outputlenet'               # output location for results
args['devices']='1'                             # devices per Spark executor
args['outputFormat']='json'
args['train']='True'

In [8]:
cfg=Config(sc,args)
dl_train_source = DataSource(sc).getSource(cfg,True)

In [ ]:
cos.train(dl_train_source)
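
Training writes the model to the path given in args['model']. Assuming a local file: URI and a driver-side filesystem, its presence can be checked afterwards with plain Python (a hedged sketch, not in the original notebook):

In [ ]:
import os.path
os.path.exists('/tmp/lenet.model')  # True once training has completed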

Test


In [ ]:
cfg=Config(sc)  # this time, set configuration attributes directly instead of passing an args dict
cfg.protoFile='lenet_dataframe_solver.prototxt'
cfg.modelPath = 'file:/tmp/lenet.model'
cfg.label = 'label'
cfg.outputPath = 'file:outputlenet'
cfg.devices = 1
cfg.outputFormat = 'json'
cfg.isTest = True
cfg.clusterSize = 1

In [8]:
dl_test_source = DataSource(sc).getSource(cfg,False)

In [9]:
test_result=cos.test(dl_test_source)

In [10]:
test_result


Out[10]:
{u'accuracy': [0.9896000057458878], u'loss': [0.029420489565090975]}
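
Each metric comes back as a single-element list keyed by the name of the corresponding output blob, so the scalars can be pulled out directly:

In [ ]:
print('accuracy: %.4f' % test_result['accuracy'][0])
print('loss: %.4f' % test_result['loss'][0])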

Feature Extraction


In [11]:
args['conf']='lenet_dataframe_solver.prototxt'
args['model']='file:/tmp/lenet.model'
args['features']='accuracy,ip2'  # output blobs to extract for each sample
args['label']='label'
args['output']='file:outputlenet'
args['devices']='1'
args['outputFormat']='json'

In [12]:
cfg=Config(sc,args)
dl_features_source = DataSource(sc).getSource(cfg,False)

In [13]:
f=cos.features(dl_features_source)

In [14]:
f.show(10)


+--------+--------+--------------------+-----+
|SampleID|accuracy|                 ip2|label|
+--------+--------+--------------------+-----+
|00005000|   [1.0]|[-6.6938324, -4.3...|[3.0]|
|00005001|   [1.0]|[-5.6929913, -7.2...|[9.0]|
|00005002|   [1.0]|[-5.7649927, -7.6...|[9.0]|
|00005003|   [1.0]|[-6.02533, -7.319...|[8.0]|
|00005004|   [1.0]|[-5.310922, -2.56...|[4.0]|
|00005005|   [1.0]|[-1.5654247, 13.9...|[1.0]|
|00005006|   [1.0]|[15.14268, -2.279...|[0.0]|
|00005007|   [1.0]|[3.4812944, -5.75...|[6.0]|
|00005008|   [1.0]|[15.894955, -4.28...|[0.0]|
|00005009|   [1.0]|[-4.839698, -5.87...|[9.0]|
+--------+--------+--------------------+-----+
only showing top 10 rows


In [15]:
f.take(1)


Out[15]:
[Row(SampleID=u'00005000', accuracy=[1.0], ip2=[-6.6938323974609375, -4.3649163246154785, 2.0613934993743896, 19.778804779052734, -4.449943542480469, 2.26839280128479, -10.026854515075684, -0.5028088688850403, 6.63890266418457, -1.0265761613845825], label=[3.0])]

In [16]:
def maxScoreAndIndex(array_of_scores):
    # Return (index, score) of the largest score; the index is the predicted digit class.
    return max(enumerate(array_of_scores), key=lambda x: x[1])
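
A quick illustration with made-up scores: enumerate pairs each score with its position, and max(..., key=lambda x: x[1]) keeps the pair whose score is largest.

In [ ]:
maxScoreAndIndex([0.1, 2.5, -0.3])  # (1, 2.5): class 1 has the highest score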

In [17]:
g = sqlContext.createDataFrame(f.map(lambda row: (
            row.SampleID,
            row.accuracy[0],
            row.ip2,
            maxScoreAndIndex(row.ip2)[1],
            maxScoreAndIndex(row.ip2)[0],
            int(row.label[0]))), ["SampleID", "Accuracy", "Scores", "MaxScore", "Prediction", "Label"])

In [18]:
g.toPandas()[:10]


Out[18]:
SampleID Accuracy Scores MaxScore Prediction Label
0 00005000 1 [-6.69383239746, -4.36491632462, 2.06139349937... 19.778805 3 3
1 00005001 1 [-5.69299125671, -7.22875642776, -2.3231837749... 13.015666 9 9
2 00005002 1 [-5.76499271393, -7.6131439209, -2.30362820625... 19.091763 9 9
3 00005003 1 [-6.02533006668, -7.3196144104, 6.79601430893,... 24.353712 8 8
4 00005004 1 [-5.31092214584, -2.56380319595, -3.6245796680... 22.056751 4 4
5 00005005 1 [-1.56542468071, 13.9534778595, 0.674106001854... 13.953478 1 1
6 00005006 1 [15.1426801682, -2.27935791016, 5.51005029678,... 15.142680 0 0
7 00005007 1 [3.48129439354, -5.753033638, 0.46099793911, -... 21.943726 6 6
8 00005008 1 [15.8949546814, -4.28664875031, 3.87462425232,... 15.894955 0 0
9 00005009 1 [-4.83969783783, -5.87413311005, -5.1123042106... 15.611836 9 9
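
Since Prediction and Label are ordinary columns of g, the overall argmax accuracy over this feature set can be computed with a DataFrame filter (a sketch, not part of the original notebook):

In [ ]:
g.filter(g.Prediction == g.Label).count() / float(g.count())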

Logistic Regression using MLlib


In [ ]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

In [ ]:
data = f.map(lambda row: LabeledPoint(row.label[0], Vectors.dense(row.ip2)))

In [ ]:
lr = LogisticRegressionWithLBFGS.train(data, numClasses=10, iterations=10)

In [ ]:
predictions = lr.predict(data.map(lambda pt : pt.features))

In [ ]:
predictions.take(5)
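
To judge the MLlib model, the standard pattern from the Spark documentation is to pair each point's label with its prediction and measure the error rate. Note this evaluates on the same features the model was trained on, since no held-out split is made here (a sketch, not part of the original notebook):

In [ ]:
labelsAndPreds = data.map(lambda pt: (pt.label, lr.predict(pt.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(data.count())
trainErr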