This notebook demonstrates how to import the VGG-19 model from Caffe into SystemML and use that model for image classification. The VGG-19 model has been trained on the ImageNet dataset (1000 classes with ~14M images). If the image to be classified belongs to one of the classes VGG-19 was trained on, the accuracy will be higher. We expect the prediction for any image made through SystemML using the VGG-19 model to be similar to the prediction made through Caffe using the VGG-19 model directly.
In [ ]:
# Show the metadata pip has recorded for the installed systemml package.
!pip show systemml
In [ ]:
from systemml import MLContext

# Create the SystemML context from `sc` (expected to be the SparkContext that
# the notebook environment provides) and report which build is in use.
ml = MLContext(sc)
print("SystemML Built-Time:" + ml.buildTime())
print(ml.info())
In [ ]:
# Workaround for Python 2.7.13 to avoid certificate validation issue while downloading any file.
# WARNING: this replaces the factory used for default HTTPS contexts, so HTTPS
# certificate verification is switched off for the rest of this process.
# Downloads are then no longer protected against tampering; only run this in a
# trusted environment.
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context
In [ ]:
# Download caffemodel and proto files
def downloadAndConvertModel(downloadDir='.', trained_vgg_weights='trained_vgg_weights'):
    """Download the VGG-19 Caffe files and convert the weights for SystemML.

    The deploy, network and solver proto files and the pretrained
    ``.caffemodel`` are stored in ``downloadDir``; ``labels.txt`` is stored in
    ``downloadDir/trained_vgg_weights``.  The caffemodel is then converted
    with ``systemml.convert_caffemodel`` into that same sub-directory.

    Uses the notebook-global ``sc`` for the conversion step.

    Args:
        downloadDir: Directory that receives the downloaded files.
        trained_vgg_weights: Name of the sub-directory of ``downloadDir`` that
            receives the labels file and the converted weights.

    Raises:
        OSError: If the weights directory cannot be created for any reason
            other than it already existing as a directory.
    """
    import errno
    import os
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2
    # Imported up front so that a missing package is reported before the
    # large download below is started.
    import systemml as sml

    vgg19_url = 'https://raw.githubusercontent.com/apache/systemml/master/scripts/nn/examples/caffe2dml/models/imagenet/vgg19/'
    labels_url = 'https://raw.githubusercontent.com/apache/systemml/master/scripts/nn/examples/caffe2dml/models/imagenet/labels.txt'
    caffemodel_url = 'http://www.robots.ox.ac.uk/~vgg/software/very_deep/caffe/VGG_ILSVRC_19_layers.caffemodel'

    deploy_proto = 'VGG_ILSVRC_19_layers_deploy.proto'
    network_proto = 'VGG_ILSVRC_19_layers_network.proto'
    solver_proto = 'VGG_ILSVRC_19_layers_solver.proto'
    caffemodel = 'VGG_ILSVRC_19_layers.caffemodel'

    # Step 1: Download the VGG-19 model and other files.
    # Create directory, if exists don't error out.  The existence check must
    # look at the directory that was requested (inside downloadDir), not at a
    # same-named entry in the current working directory.
    weights_dir = os.path.join(downloadDir, trained_vgg_weights)
    try:
        os.makedirs(weights_dir)
    except OSError as exc:  # Python >2.5
        if exc.errno != errno.EEXIST or not os.path.isdir(weights_dir):
            raise

    # Download deployer, network, solver proto files.
    for proto_name in (deploy_proto, network_proto, solver_proto):
        urlretrieve(vgg19_url + proto_name, os.path.join(downloadDir, proto_name))

    # Get labels for data
    urlretrieve(labels_url, os.path.join(weights_dir, 'labels.txt'))

    # The following call downloads a model file of about 500 MB, so depending
    # on your network it may take a while.
    urlretrieve(caffemodel_url, os.path.join(downloadDir, caffemodel))

    # Step 2: Convert the caffemodel to trained_vgg_weights directory
    sml.convert_caffemodel(sc, os.path.join(downloadDir, deploy_proto),
                           os.path.join(downloadDir, caffemodel), weights_dir)
In [ ]:
# Print top K indices and probability
def printTopK(prob, label, k):
    """Print the indices and values of the ``k`` largest entries in row 0 of ``prob``.

    Args:
        prob: Two-dimensional NumPy array of scores; only row 0 is reported.
        label: Text printed at the start of each output line.
        k: Number of top entries to print.
    """
    # Negating the scores makes argsort return a descending order.  Compute
    # it once and reuse it for both output lines.
    top_idx = np.argsort(-prob)[0, :k]
    print(label, 'Top ', k, ' Index : ', top_idx)
    print(label, 'Top ', k, ' Probability : ', prob[0, top_idx])
In [ ]:
import os
def getCaffeLabel(url, printTopKData, topK, size=(224,224), modelDir='trained_vgg_weights'):
    """Classify the image at ``url`` with the VGG-19 model through Caffe.

    The image is downloaded to ``test.jpg`` in the current directory, resized
    to ``size``, multiplied by 255 and passed to a ``caffe.Classifier`` built
    from ``VGG_ILSVRC_19_layers_deploy.proto`` and
    ``VGG_ILSVRC_19_layers.caffemodel``, both looked up in the current
    working directory.

    Args:
        url: Location of the image to classify.
        printTopKData: If truthy, print the top-K indices and probabilities
            of the Caffe prediction.
        topK: Number of entries to print when ``printTopKData`` is set.
        size: Target dimensions handed to ``caffe.io.resize_image``.
        modelDir: Directory that contains ``labels.txt``.

    Returns:
        A tuple ``(net, labels)`` with the ``caffe.Classifier`` that was used
        and the list of label values looked up for the predicted classes.
    """
    import caffe
    import pandas as pd
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    urlretrieve(url, 'test.jpg')
    image = caffe.io.resize_image(caffe.io.load_image('test.jpg'), size)
    # ``np.float`` was only an alias of the builtin ``float`` and no longer
    # exists in current NumPy releases; the builtin yields the same dtype.
    image = [(image * 255).astype(float)]

    deploy_file = 'VGG_ILSVRC_19_layers_deploy.proto'
    caffemodel_file = 'VGG_ILSVRC_19_layers.caffemodel'
    net = caffe.Classifier(deploy_file, caffemodel_file)

    caffe_prob = net.predict(image)
    caffe_prediction = caffe_prob.argmax(axis=1)
    if printTopKData:
        printTopK(caffe_prob, 'Caffe', topK)

    labels = pd.read_csv(os.path.join(modelDir, 'labels.txt'), names=['index', 'label'])
    # NOTE(review): ``labels.index`` is the DataFrame's row index, not the
    # column named 'index', so the label is selected by row position.
    # Confirm against the layout of labels.txt that this is intended.
    caffe_prediction_labels = [
        labels[labels.index == x][['label']].values[0][0] for x in caffe_prediction
    ]
    return net, caffe_prediction_labels
This function classifies the images specified through a list of URLs.
urls: List of image URLs.
printTokKData (default False): Whether to print the top K indices and probabilities.
topK (default 5): Number of top elements to display.
caffeInstalled (default False): Whether Caffe is installed. If it is, each image is also classified through Caffe (with the top K probabilities and indices printed when printTokKData is set).
In [ ]:
import numpy as np
import urllib
from systemml.mllearn import Caffe2DML
import systemml as sml
# Keep the download directory at the current directory: the solver file names
# the network file without a path, so the network file is searched for in the
# current directory and any other setting causes a "network file not found"
# issue.
downloadDir = '.'  # rather than e.g. '/home/asurve/caffe_models'
trained_vgg_weights = 'trained_vgg_weights'
# Input shape handed to the model (presumably channels, height, width);
# `size` is made up of its last two entries.
img_shape = (3, 224, 224)
size = img_shape[1:]
def classifyImages(urls,printTokKData=False, topK=5, caffeInstalled=False):
    """Classify every image in ``urls`` with the VGG-19 model through SystemML.

    On each call the model files are downloaded and converted with
    ``downloadAndConvertModel``; a ``Caffe2DML`` model is then created from
    the solver file and loaded with the converted weights.  Each image is
    downloaded to ``inputTest.jpg``, displayed in the notebook and classified.

    Reads the module-level settings ``downloadDir``, ``trained_vgg_weights``,
    ``img_shape`` and ``size`` and the notebook-global ``sqlCtx``.

    Args:
        urls: Iterable of image URLs.
        printTokKData: If truthy, also print the top-K indices and
            probabilities of every prediction.
        topK: Number of entries to print when ``printTokKData`` is set.
        caffeInstalled: If truthy, additionally classify each image through
            Caffe and feed Caffe's preprocessed input through SystemML for
            comparison.
    """
    # The two ``Image`` classes get distinct aliases so neither import hides
    # the other.
    from IPython.display import Image as IPythonImage, display
    from PIL import Image as PILImage
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Directory the conversion step writes to; load the weights from there.
    weights_dir = os.path.join(downloadDir, trained_vgg_weights)

    downloadAndConvertModel(downloadDir, trained_vgg_weights)
    vgg = Caffe2DML(sqlCtx, solver=os.path.join(downloadDir,'VGG_ILSVRC_19_layers_solver.proto'), input_shape=img_shape)
    vgg.load(weights_dir)

    outFile = 'inputTest.jpg'
    for url in urls:
        urlretrieve(url, outFile)
        display(IPythonImage(filename=outFile))
        print("Prediction of above image to ImageNet Class using")

        ## Do image classification through SystemML processing
        input_image = sml.convertImageToNumPyArr(
            PILImage.open(outFile), img_shape=img_shape,
            color_mode='BGR', mean=sml.getDatasetMean('VGG_ILSVRC_19_2014'))
        print("Image preprocessed through SystemML :: ", vgg.predict(input_image)[0])
        # Use the function's own argument here, not a same-sounding global.
        if printTokKData:
            sysml_proba = vgg.predict_proba(input_image)
            printTopK(sysml_proba, 'SystemML BGR', topK)

        if caffeInstalled:
            net, caffeLabel = getCaffeLabel(url, printTokKData, topK, size, weights_dir)
            print("Image classification through Caffe :: ", caffeLabel[0])
            # Feed the input Caffe actually used through SystemML.
            caffe_input = np.matrix(net.blobs['data'].data.flatten())
            print("Caffe input data through SystemML :: ", vgg.predict(caffe_input)[0])
            if printTokKData:
                sysml_proba = vgg.predict_proba(caffe_input)
                printTopK(sysml_proba, 'With Caffe input data', topK)
There are a couple of parameters to set, depending on what you are looking for.
In [ ]:
# Switches for this run.
printTopKData = False   # print top-K indices and probabilities
topK = 5                # number of top entries to print
caffeInstalled = False  # set to True when Caffe is installed

# Images to classify.
urls = [
    'https://upload.wikimedia.org/wikipedia/commons/thumb/5/58/MountainLion.jpg/312px-MountainLion.jpg',
    'https://s-media-cache-ak0.pinimg.com/originals/f2/56/59/f2565989f455984f206411089d6b1b82.jpg',
    'http://i2.cdn.cnn.com/cnnnext/dam/assets/161207140243-vanishing-elephant-closeup-exlarge-169.jpg',
    'http://wallpaper-gallery.net/images/pictures-of-lilies/pictures-of-lilies-7.jpg',
    'https://cdn.pixabay.com/photo/2012/01/07/21/56/sunflower-11574_960_720.jpg',
    'https://image.shutterstock.com/z/stock-photo-bird-nest-on-tree-branch-with-five-blue-eggs-inside-108094613.jpg',
    'https://i.ytimg.com/vi/6jQDbIv0tDI/maxresdefault.jpg',
    'https://cdn.pixabay.com/photo/2016/11/01/23/53/cat-1790093_1280.jpg',
]

classifyImages(urls, printTopKData, topK, caffeInstalled)
In [ ]: