In [1]:
# Imports for the CaffeOnSpark image-captioning example.
# NOTE(review): this notebook targets Python 2 (`izip_longest` was renamed to
# `zip_longest` in Python 3) and assumes a pyspark driver that provides `sc`.
# Fix: removed a duplicated `from com.yahoo.ml.caffe.Config import *` line.
from examples.ImageCaption import *
from pyspark.sql import SQLContext
from pyspark import SparkConf,SparkContext
from itertools import izip_longest
from com.yahoo.ml.caffe.tools.DFConversions import *
from com.yahoo.ml.caffe.tools.Vocab import *
from com.yahoo.ml.caffe.Config import *
from com.yahoo.ml.caffe.CaffeOnSpark import *
from com.yahoo.ml.caffe.DataSource import *
import json

Create the input embedding DataFrame from the COCO dataset


In [2]:
# Convert the COCO training annotations into an (image, caption) DataFrame,
# build a caption vocabulary, embed the captions, and persist the result as
# Parquet for the training cells below.
# NOTE(review): `sc` (SparkContext) is assumed to be provided by the pyspark
# shell/driver — it is never created in this notebook.
conv=DFConversions(sc)
# Second argument (1) — presumably a partition/split count; TODO confirm
# against the DFConversions API.
df_image_caption=conv.Coco2ImageCaptionFile("/tmp/coco/annotations/captions_train2014.json",1)
vocab=Vocab(sc)
# Build a vocabulary (up to 8800 entries) from the "caption" column.
vocab.genFromData(df_image_caption,"caption",8800)
# 20 — presumably the maximum caption length in tokens; TODO confirm.
df_embedding = conv.ImageCaption2Embedding("/tmp/coco/images/train2014", df_image_caption, vocab,20)
df_embedding.write.parquet("/tmp/coco/parquet/df_embedded_train2014")

Train the image model (CaffeNet)


In [2]:
# Train the reference CaffeNet image model via CaffeOnSpark.
# `cos` is reused by the LSTM-training cell below.
cos=CaffeOnSpark(sc)
args={}
args['conf']='CaffeOnSpark/data/bvlc_reference_solver.prototxt'   # solver definition
args['model']='file:///tmp/coco/bvlc_reference_caffenet.caffemodel'  # model output location
args['devices']='1'       # presumably devices per executor — TODO confirm
args['clusterSize']='1'   # presumably number of executors — TODO confirm
cfg=Config(sc,args)
# True — presumably selects the training (vs. test) data source; TODO confirm
# against DataSource.getSource.
dl_train_image = DataSource(sc).getSource(cfg,True)
cos.train(dl_train_image)

Train the LSTM caption model


In [ ]:
# Train the LRCN (LSTM) captioning model, initializing from the CaffeNet
# weights trained above. Depends on `cos` from the previous training cell.
args={}
args['conf']='CaffeOnSpark/data/lrcn_solver.prototxt'       # LSTM solver definition
args['model']='file:///tmp/coco/parquet/lrcn_coco.model'    # model output location
args['devices']='1'
args['clusterSize']='1'
# Warm-start from the CaffeNet weights produced by the previous cell.
args['weights']='/tmp/coco/bvlc_reference_caffenet.caffemodel'
args['resize']='True'   # presumably resize input images — TODO confirm
cfg=Config(sc,args)
dl_train_lstm = DataSource(sc).getSource(cfg,True)
cos.train(dl_train_lstm)

Test the model


In [4]:
# Build test-set embeddings from the demo annotations and read them back.
# NOTE(review): the vocabulary is regenerated here from the *test* captions;
# if it differs from the training vocabulary the token ids may not match the
# trained model — confirm this is intended.
conv=DFConversions(sc)
vocab=Vocab(sc)
df_image_caption_test=conv.Coco2ImageCaptionFile("/tmp/coco/annotations/captions_demo.json",1)
vocab.genFromData(df_image_caption_test,"caption",8800)
df_embedding = conv.ImageCaption2Embedding("/tmp/coco/images/demo/", df_image_caption_test, vocab,20)
df_embedding.write.parquet("/tmp/coco/parquet/df_embedded_test")
# NOTE(review): `sqlContext` is assumed to be provided by the pyspark shell —
# SQLContext is imported at the top but never instantiated in this notebook.
df_embedded_input = sqlContext.read.parquet("/tmp/coco/parquet/df_embedded_test")

In [17]:
# Select the image columns plus ids to feed into caption prediction.
df_images = df_embedded_input.select("data.image","data.height", "data.width", "id")
model="/tmp/coco/parquet/lrcn_coco.model"                      # trained LRCN model
imagenet="CaffeOnSpark/data/lstm_deploy.prototxt"              # image-feature deploy net
lstmnet="CaffeOnSpark/data/lrcn_word_to_preds.deploy.prototxt" # word-prediction deploy net
# NOTE(review): this rebinds `vocab` (previously a Vocab object) to a file
# path string — later cells must expect the path, not the object.
vocab="CaffeOnSpark/data/vocab.txt"

In [18]:
# Generate captions for the test images; `get_predictions` comes from
# examples.ImageCaption (star-imported at the top).
df_results = get_predictions(sqlContext,df_images,model,imagenet,lstmnet,vocab)
df_results.show()


+------+--------------------+
|    id|          prediction|
+------+--------------------+
|674390|A snowboarder is ...|
|687894|A man riding a sk...|
|623893|A kitchen with a ...|
|598423|A man is doing a ...|
|632450|A woman is playin...|
|754187|A man is holding ...|
|706813|A zebra standing ...|
| 86667|A city street wit...|
|202165|A baseball player...|
| 83551|A woman is sittin...|
+------+--------------------+


In [19]:
# Attach predictions to the embedded input rows by image id.
# NOTE(review): an equi-join on two DataFrames keeps both `id` columns, so
# `df` contains a duplicated `id` — downstream code must disambiguate.
df=df_embedded_input.join(df_results, df_embedded_input.id == df_results.id)

In [20]:
# Render images alongside their predicted captions.
# NOTE(review): import placed here rather than in the top import cell —
# consider moving it up so all dependencies are visible in one place.
from com.yahoo.ml.caffe.DisplayUtils import *
show_captions(df)


Out[20]:
Image IdImagePrediction
674390A snowboarder is riding a trick on a snowy slope.
687894A man riding a skateboard down a street.
623893A kitchen with a white refrigerator and a stove.
598423A man is doing a trick on a skateboard.
632450A woman is playing tennis on a tennis court.
754187A man is holding a picture of a man.
706813A zebra standing in the grass with trees and trees.
86667A city street with a clock tower in the background.
202165A baseball player is holding a bat and a baseball bat.
83551A woman is sitting on a table with a dog.

In [ ]: