This notebook demonstrates importing the VGG-19 model from Caffe into SystemML and using that model for image classification. The VGG-19 model has been trained on the ImageNet dataset (1000 classes, ~14M images). If the image to be predicted belongs to one of the classes VGG-19 was trained on, accuracy will be higher. We expect the prediction for any image made through SystemML using the VGG-19 model to be similar to the prediction made through Caffe using the VGG-19 model directly.
In [1]:
# Show the metadata of the installed systemml package to confirm it is available to this kernel.
!pip show systemml
In [2]:
from systemml import MLContext

# `sc` is not defined in this notebook; presumably it is the SparkContext
# that the PySpark kernel injects into the namespace -- confirm for your setup.
ml = MLContext(sc)

# Report which SystemML build this kernel is actually running against.
print("SystemML Built-Time:" + ml.buildTime())
print(ml.info())
In [3]:
# Workaround for Python 2.7.13 to avoid certificate validation issue while downloading any file.
# NOTE: this turns off HTTPS certificate verification for every default HTTPS
# context created in this process afterwards, not just for the downloads below.
import ssl

if hasattr(ssl, '_create_unverified_context'):
    # The target environment verifies certificates by default: swap the factory
    # for default HTTPS contexts with the non-verifying one.
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context
# Otherwise: legacy Python that doesn't verify HTTPS certificates by default,
# so there is nothing to patch.
In [4]:
# Create label.txt file
def createLabelFile(fileName):
    """Write the two-class label file used by the retrained model.

    The file contains one `id,"name"` record per line: 1 for "Cat" and
    2 for "Dog". An existing file at `fileName` is overwritten.

    Args:
        fileName: Path of the label file to create.

    Returns:
        None.
    """
    # The context manager closes the handle even when a write fails, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(fileName, 'w') as label_file:
        label_file.write('1,"Cat" \n')
        label_file.write('2,"Dog" \n')
In [5]:
# Download caffemodel and proto files
def downloadAndConvertModel(downloadDir='.', trained_vgg_weights='trained_vgg_weights'):
    """Download the VGG-19 proto files, write the label file and convert the caffemodel.

    Steps performed:
      1. Create `<downloadDir>/<trained_vgg_weights>` (an existing directory is fine).
      2. Download the deploy, network and solver proto files into `downloadDir`.
      3. Write `labels.txt` into the weights directory via createLabelFile().
      4. Call `systemml.convert_caffemodel` to convert
         `<downloadDir>/VGG_ILSVRC_19_layers.caffemodel` into the weights directory.

    The caffemodel download itself is commented out, so that file must already
    exist in `downloadDir` (or the download line must be uncommented) before
    step 4 can succeed. Several manual edits of the downloaded proto files are
    still required afterwards; see the TODO notes in the body.

    Relies on the notebook-global `sc`.

    Args:
        downloadDir: Directory that receives the proto files and holds the caffemodel.
        trained_vgg_weights: Name of the sub-directory of `downloadDir` that
            receives the label file and the converted weights.

    Returns:
        None.

    Raises:
        OSError: If the weights directory cannot be created for a reason other
            than it already existing as a directory.
    """
    # Step 1: Download the VGG-19 model and other files.
    import errno
    import os
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    weights_dir = os.path.join(downloadDir, trained_vgg_weights)

    # Create directory, if exists don't error out
    try:
        os.makedirs(weights_dir)
    except OSError as exc:  # Python >2.5
        # Check the directory that was actually requested. The previous code
        # tested `trained_vgg_weights` relative to the current directory, which
        # is the wrong path whenever downloadDir is not '.'.
        if not (exc.errno == errno.EEXIST and os.path.isdir(weights_dir)):
            raise

    base_url = 'https://raw.githubusercontent.com/apache/systemml/master/scripts/nn/examples/caffe2dml/models/imagenet/vgg19/'

    # Download deployer, network, solver proto and label files.
    urlretrieve(base_url + 'VGG_ILSVRC_19_layers_deploy.proto',
                os.path.join(downloadDir, 'VGG_ILSVRC_19_layers_deploy.proto'))

    urlretrieve(base_url + 'VGG_ILSVRC_19_layers_network.proto',
                os.path.join(downloadDir, 'VGG_ILSVRC_19_layers_network.proto'))
    # TODO: After downloading network file (VGG_ILSVRC_19_layers_network.proto), change num_output from 1000 to 2

    urlretrieve(base_url + 'VGG_ILSVRC_19_layers_solver.proto',
                os.path.join(downloadDir, 'VGG_ILSVRC_19_layers_solver.proto'))
    # TODO: set values as described below in VGG_ILSVRC_19_layers_solver.proto (possibly through APIs whenever available)
    # test_iter: 100
    # stepsize: 40
    # max_iter: 200

    # Create labels for data
    ### 1,"Cat"
    ### 2,"Dog"
    createLabelFile(os.path.join(weights_dir, 'labels.txt'))

    # TODO: The following line is commented out because it downloads a ~500 MB file; uncomment it and run if you need the download.
    # urlretrieve('http://www.robots.ox.ac.uk/~vgg/software/very_deep/caffe/VGG_ILSVRC_19_layers.caffemodel', os.path.join(downloadDir,'VGG_ILSVRC_19_layers.caffemodel'))

    # Step 2: Convert the caffemodel to trained_vgg_weights directory
    import systemml as sml
    sml.convert_caffemodel(sc,
                           os.path.join(downloadDir, 'VGG_ILSVRC_19_layers_deploy.proto'),
                           os.path.join(downloadDir, 'VGG_ILSVRC_19_layers.caffemodel'),
                           weights_dir)
    return
In [6]:
# Print top K indices and probability
def printTopK(prob, label, k):
    """Print the k highest-scoring column indices of row 0 of `prob` and their values.

    Args:
        prob: 2-D NumPy array; only row 0 is reported.
        label: Prefix printed at the start of both output lines.
        k: Number of top entries to print.
    """
    # Negating the scores makes argsort order them from highest to lowest.
    top_indices = np.argsort(-prob)[0, :k]
    print(label, 'Top ', k, ' Index : ', top_indices)
    print(label, 'Top ', k, ' Probability : ', prob[0, top_indices])
In [7]:
import numpy as np
import urllib
from systemml.mllearn import Caffe2DML
import systemml as sml
def classifyImages(urls,img_shape=(3, 224, 224), printTokKData=False, topK=5, downloadDir='.', trained_vgg_weights='trained_vgg_weights'):
    """Download each image, display it and print the prediction of a saved model.

    A Caffe2DML model is built from `<downloadDir>/VGG_ILSVRC_19_layers_solver.proto`
    and its weights are loaded from `trained_vgg_weights`. Every URL is
    downloaded to 'inputTest.jpg' in the current directory (overwritten on each
    iteration), shown inline, preprocessed with
    `sml.convertImageToNumPyArr(..., color_mode='BGR', mean=...)` and passed to
    `predict`.

    Relies on the notebook-globals `sqlCtx`, `Caffe2DML`, `sml` and `printTopK`.

    Args:
        urls: Iterable of image URLs to classify.
        img_shape: Shape forwarded to Caffe2DML as `input_shape` and to the
            image preprocessing.
        printTokKData: When truthy, also print the top-`topK` indices and
            probabilities from `predict_proba`. The spelling is historical and
            kept so existing keyword callers continue to work.
        topK: Number of top entries to print when `printTokKData` is truthy.
        downloadDir: Directory containing the solver proto file.
        trained_vgg_weights: Path passed as-is to `vgg.load`.

    Returns:
        None; all results are printed or displayed.
    """
    import os
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    # Both IPython.display and PIL export a class called `Image`; alias them so
    # neither shadows the other.
    from IPython.display import Image as DisplayImage, display
    from PIL import Image as PILImage

    solver_file = os.path.join(downloadDir, 'VGG_ILSVRC_19_layers_solver.proto')
    vgg = Caffe2DML(sqlCtx, solver=solver_file, input_shape=img_shape)
    vgg.load(trained_vgg_weights)

    for url in urls:
        outFile = 'inputTest.jpg'
        urlretrieve(url, outFile)
        display(DisplayImage(filename=outFile))
        print("Prediction of above image to ImageNet Class using")

        ## Do image classification through SystemML processing
        input_image = sml.convertImageToNumPyArr(PILImage.open(outFile), img_shape=img_shape,
                                                 color_mode='BGR',
                                                 mean=sml.getDatasetMean('VGG_ILSVRC_19_2014'))
        print("Image preprocessed through SystemML :: ", vgg.predict(input_image)[0])
        # Use the parameter itself. The previous code read a global named
        # `printTopKData`, so the argument passed by the caller was ignored.
        if printTokKData:
            sysml_proba = vgg.predict_proba(input_image)
            printTopK(sysml_proba, 'SystemML BGR', topK)
In [8]:
from pyspark.ml.linalg import Vectors
import os
import systemml as sml
def getLabelFeatures(filename, train_dir, img_shape):
    """Turn one training image into a `(label, features)` tuple.

    The label is derived from the file name (case-insensitive prefix):
    1 for names starting with 'cat', 2 for names starting with 'dog'.
    The features are row 0 of the array returned by
    `sml.convertImageToNumPyArr`, wrapped in a dense pyspark.ml vector.

    Relies on the notebook-globals `os`, `sml` and `Vectors`.

    Args:
        filename: Name of the image file inside `train_dir`.
        train_dir: Directory containing the training images.
        img_shape: Shape forwarded to `sml.convertImageToNumPyArr`.

    Returns:
        Tuple `(label, vec)`.

    Raises:
        ValueError: If the file name starts with neither 'cat' nor 'dog'.
    """
    # Decide the label first, so an unexpected file is rejected before any image
    # is opened and decoded.
    lowered_name = filename.lower()
    if lowered_name.startswith('cat'):
        label = 1
    elif lowered_name.startswith('dog'):
        label = 2
    else:
        raise ValueError('Expected the filename to start with either cat or dog')

    from PIL import Image
    # The context manager releases the file handle as soon as the pixels have
    # been converted; this function runs once per training image.
    with Image.open(os.path.join(train_dir, filename)) as image:
        pixels = sml.convertImageToNumPyArr(image, img_shape=img_shape)
    vec = Vectors.dense(pixels[0, :])
    return (label, vec)
In [9]:
from pyspark.sql.functions import rand
import os
def createTrainingDF(train_dir, train_data_file, img_shape):
    """Build a shuffled Spark DataFrame of `(label, features)` rows from a directory of images.

    Every entry returned by `os.listdir(train_dir)` is passed to
    getLabelFeatures(); the resulting rows are ordered by `rand()`.

    Relies on the notebook-globals `sc`, `os`, `rand` and `getLabelFeatures`.

    Args:
        train_dir: Directory whose entries are the training images.
        train_data_file: Parquet path for an optional cached copy; only used by
            the commented-out write below.
        img_shape: Shape forwarded to getLabelFeatures().

    Returns:
        DataFrame with columns 'label' and 'features'.
    """
    list_jpeg_files = os.listdir(train_dir)
    # 10 files per partition. Clamp to at least one partition: the previous
    # int(len/10) evaluated to 0 for directories with fewer than 10 files, which
    # is not a usable slice count for parallelize().
    num_partitions = max(1, len(list_jpeg_files) // 10)
    train_df = (sc.parallelize(list_jpeg_files, num_partitions)
                  .map(lambda filename: getLabelFeatures(filename, train_dir, img_shape))
                  .toDF(['label', 'features'])
                  .orderBy(rand()))
    # Optional: but helps separate conversion-related work from training
    # train_df.write.parquet(train_data_file) # 'kaggle-cats-dogs.parquet'
    return train_df
In [10]:
def readTrainingDF(train_dir, train_data_file):
    """Load a training DataFrame from a parquet file.

    Relies on the notebook-global `sqlContext`.

    Args:
        train_dir: Accepted but not used by this function.
        train_data_file: Path of the parquet data to read.

    Returns:
        The DataFrame returned by `sqlContext.read.parquet`.
    """
    return sqlContext.read.parquet(train_data_file)
In [11]:
# downloadAndConvertModel(downloadDir, trained_vgg_weights)
# TODO: Take "TODO" actions mentioned in the downloadAndConvertModel() function after calling downloadAndConvertModel() function.
In [12]:
def retrainModel(img_shape, downloadDir, trained_vgg_weights, train_dir, train_data_file, vgg_new_model):
    """Fit the VGG-19 network on the images in `train_dir` and save the result.

    Builds the training DataFrame with createTrainingDF(), creates a Caffe2DML
    model from the solver proto in `downloadDir`, loads the weights from
    `<downloadDir>/<trained_vgg_weights>` with `ignore_weights=['fc8']`,
    fits the model and saves it to `vgg_new_model`.

    Relies on the notebook-globals `sqlCtx`, `os` and `Caffe2DML`.

    Args:
        img_shape: Shape forwarded as `input_shape` and to the image conversion.
        downloadDir: Directory holding the solver proto and the weights directory.
        trained_vgg_weights: Name of the weights directory inside `downloadDir`.
        train_dir: Directory containing the training images.
        train_data_file: Parquet path forwarded to createTrainingDF().
        vgg_new_model: Destination passed to `save` for the fitted model.

    Returns:
        The fitted Caffe2DML model.
    """
    # downloadAndConvertModel() deliberately stays commented out: it has to be
    # run separately beforehand, and manual actions are needed after calling it.
    # downloadAndConvertModel(downloadDir, trained_vgg_weights)
    # TODO: Take "TODO" actions mentioned in the downloadAndConvertModel() function after calling that function.

    training_df = createTrainingDF(train_dir, train_data_file, img_shape)
    ## Alternative: read the DataFrame back if it has already been written/converted
    # training_df = readTrainingDF(train_dir, train_data_file)

    # Load the model
    solver_path = os.path.join(downloadDir, 'VGG_ILSVRC_19_layers_solver.proto')
    model = Caffe2DML(sqlCtx, solver=solver_path, input_shape=img_shape)
    weights_path = os.path.join(downloadDir, trained_vgg_weights)
    # NOTE(review): 'fc8' is presumably the final layer, skipped so it can be
    # re-learned for the two-class problem -- confirm against the network proto.
    model.load(weights=weights_path, ignore_weights=['fc8'])
    model.set(debug=True).setExplain(True)

    # Train the model using new data
    model.fit(training_df)

    # Save the trained model
    model.save(vgg_new_model)
    return model
In [ ]:
import numpy as np
import urllib
from systemml.mllearn import Caffe2DML
import systemml as sml
def classifyImagesWTransfLearning(urls, model, img_shape=(3, 224, 224), printTokKData=False, topK=5):
    """Download each image, display it and print the prediction of an in-memory model.

    Every URL is downloaded to 'inputTest.jpg' in the current directory
    (overwritten on each iteration), shown inline, preprocessed with
    `sml.convertImageToNumPyArr(..., color_mode='BGR', mean=...)` and passed to
    `model.predict`.

    Relies on the notebook-globals `sml` and `printTopK`.

    Args:
        urls: Iterable of image URLs to classify.
        model: Fitted model exposing `predict` and `predict_proba`.
        img_shape: Shape forwarded to the image preprocessing.
        printTokKData: When truthy, also print the top-`topK` indices and
            probabilities from `predict_proba`. The spelling is historical and
            kept so existing keyword callers continue to work.
        topK: Number of top entries to print when `printTokKData` is truthy.

    Returns:
        None; all results are printed or displayed.
    """
    # urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    # Both IPython.display and PIL export a class called `Image`; alias them so
    # neither shadows the other.
    from IPython.display import Image as DisplayImage, display
    from PIL import Image as PILImage

    for url in urls:
        outFile = 'inputTest.jpg'
        urlretrieve(url, outFile)
        display(DisplayImage(filename=outFile))
        print("Prediction of above image to ImageNet Class using")

        ## Do image classification through SystemML processing
        input_image = sml.convertImageToNumPyArr(PILImage.open(outFile), img_shape=img_shape,
                                                 color_mode='BGR',
                                                 mean=sml.getDatasetMean('VGG_ILSVRC_19_2014'))
        print("Image preprocessed through SystemML :: ", model.predict(input_image)[0])
        # Use the parameter itself. The previous code read a global named
        # `printTopKData`, so the argument passed by the caller was ignored.
        if printTokKData:
            sysml_proba = model.predict_proba(input_image)
            printTopK(sysml_proba, 'SystemML BGR', topK)
There are a couple of parameters to set, depending on what you are looking for.
In [ ]:
# ImageNet specific parameters
img_shape = (3, 224, 224)

# Keep downloadDir at the current directory: the solver file names the network
# file without a path, so the network file is looked up in the current directory
# and any other setting causes a "network file not found" issue.
downloadDir = '.' # /home/asurve/caffe_models'
trained_vgg_weights = 'trained_vgg_weights'

# Machine-specific location of the cats-vs-dogs training images; adjust for your environment.
train_dir = '/home/asurve/data/keggle/dogs_vs_cats_2/train'
train_data_file = 'kaggle-cats-dogs.parquet'
vgg_new_model = 'kaggle-cats-dogs-model_2'

# Reporting options for the predictions
printTopKData=True
topK=5

# Images to classify once the model has been retrained
urls = [
    'http://cdn3-www.dogtime.com/assets/uploads/gallery/goldador-dog-breed-pictures/puppy-1.jpg',
    'https://lh3.googleusercontent.com/-YdeAa1Ff4Ac/VkUnQ4vuZGI/AAAAAAAAAEg/nBiUn4pp6aE/w800-h800/images-6.jpeg',
    'https://upload.wikimedia.org/wikipedia/commons/thumb/5/58/MountainLion.jpg/312px-MountainLion.jpg',
]

# Retrain on the cat/dog images, then classify the sample URLs with the in-memory model.
vgg = retrainModel(img_shape, downloadDir, trained_vgg_weights, train_dir, train_data_file, vgg_new_model)
classifyImagesWTransfLearning(urls, vgg, img_shape, printTopKData, topK)
In [ ]:
img_shape = (3, 224, 224)

# Reporting options for the predictions
printTopKData=True
topK=5

# Keep downloadDir at the current directory: the solver file names the network
# file without a path, so the network file is looked up in the current directory
# and any other setting causes a "network file not found" issue.
downloadDir = '.' # /home/asurve/caffe_models'

# Weights location handed to classifyImages(); same name the retraining cell saves its model under.
trained_vgg_weights = 'kaggle-cats-dogs-model_2'

urls = [
    'http://cdn3-www.dogtime.com/assets/uploads/gallery/goldador-dog-breed-pictures/puppy-1.jpg',
    'https://lh3.googleusercontent.com/-YdeAa1Ff4Ac/VkUnQ4vuZGI/AAAAAAAAAEg/nBiUn4pp6aE/w800-h800/images-6.jpeg',
    'https://upload.wikimedia.org/wikipedia/commons/thumb/5/58/MountainLion.jpg/312px-MountainLion.jpg',
]

classifyImages(urls,img_shape, printTopKData, topK, downloadDir, trained_vgg_weights)
In [ ]: