Here we try to fool other models using the same adversarial images



In [1]:

    
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

import seaborn as sns

import numpy as np
import matplotlib.pyplot as plt

Just import the code



In [2]:

    
def build_graph():
  """Rebuild the MNIST convnet graph so a previously dumped model can be restored.

  The network is two 5x5 conv + 2x2 max-pool stages, a 1024-unit fully
  connected layer with dropout and a 10-way output layer. Variables are
  created in a fixed order (weights then biases, layer by layer).

  Args:
    None

  Returns:
    sess          : tf.InteractiveSession(), created after the graph is built
    x             : tf.placeholder(), float32 input of shape [None, 784]
    y_            : tf.placeholder(), float32 labels of shape [None, 10]
    y_pred        : softmax of the output logits
    keep_prob     : tf.placeholder(), float32 argument of the dropout op
    cross_entropy : mean softmax cross entropy between y_ and the logits

  Example:
    >>> build_graph()
  """
  def make_weights(shape):
    """Weight variable drawn from a truncated normal (stddev 0.1)."""
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

  def make_biases(shape):
    """Bias variable filled with the constant 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

  def conv_relu_pool(inputs, kernel_shape):
    """5x5-style conv (stride 1, SAME) + bias + relu, then 2x2 max pool."""
    # Weights are created before biases to keep the variable creation order.
    kernel = make_weights(kernel_shape)
    offset = make_biases(kernel_shape[-1:])
    convolved = tf.nn.conv2d(inputs, kernel,
                             strides=[1, 1, 1, 1], padding='SAME')
    activated = tf.nn.relu(convolved + offset)
    return tf.nn.max_pool(activated, ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1], padding='SAME')

  x = tf.placeholder(tf.float32, shape=[None, 784])
  y_ = tf.placeholder(tf.float32, shape=[None, 10])

  # Two conv + pool stages on the image reshaped to 28x28 with one channel
  images = tf.reshape(x, [-1, 28, 28, 1])
  stage1 = conv_relu_pool(images, [5, 5, 1, 32])
  stage2 = conv_relu_pool(stage1, [5, 5, 32, 64])

  # First fully connected layer on the flattened 7x7x64 feature maps
  flat_size = 7 * 7 * 64
  fc1_weights = make_weights([flat_size, 1024])
  fc1_biases = make_biases([1024])
  flattened = tf.reshape(stage2, [-1, flat_size])
  hidden = tf.nn.relu(tf.matmul(flattened, fc1_weights) + fc1_biases)

  keep_prob = tf.placeholder(tf.float32)
  hidden_dropped = tf.nn.dropout(hidden, keep_prob)

  # Output layer: logits, softmax probabilities and the training loss
  fc2_weights = make_weights([1024, 10])
  fc2_biases = make_biases([10])
  logits = tf.matmul(hidden_dropped, fc2_weights) + fc2_biases
  y_pred = tf.nn.softmax(logits)
  cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=logits))

  sess = tf.InteractiveSession()
  return sess, x, y_, y_pred, keep_prob, cross_entropy

def generate_adversarial(model_path, img_list, target_class, eta=0.001, 
        threshold=0.99, save_path=None, file_name='adversarial', verbose=0):
  """Generate adversarial images that the restored model labels as `target_class`.

  Each image is updated by gradient descent on the cross entropy between the
  model output and the one-hot `target_class` label until the model's
  confidence for `target_class` reaches `threshold`. The gradient and some
  parts of the graph are needed during the iterations, hence the loop is not
  packed into a helper function.

  Args:
    model_path: `string`, the path to the checkpoint of the previous model
    img_list: `np.ndarray`, the images to perturb, one flattened 28x28 image
              (784 values) per row
    target_class: `int`, the wanted label
    eta: `float`, learning rate (or step size), default: 0.001
    threshold: `float`, the confidence we want to fool, default: 0.99 (99%)
    save_path: `string`, prefix (e.g. the path to the img/ folder, ending with
               '/') prepended to the figure file names; when None the figures
               are not written to disk, default: None
    file_name: `string`, the name for saving file, default:'adversarial'
    verbose: `int`, verbose=0, omit the training graphs, default: 0

  Returns:
    `np.array`: the final adversarial image for each img in img_list, stacked
                along the first axis (one (1, 784) entry per image)

  Example:
    >>> generate_adversarial(model_path='../model/MNIST.ckpt', 
                img_list=img_list, target_class=6, eta=0.01, threshold=0.99, 
                save_path='../img/', file_name='adversarial', verbose=1)
    np.ndarray(...)
  """
  sess, x, y_, y_pred, keep_prob, cross_entropy = build_graph()

  sess.run(tf.global_variables_initializer())
  tf.train.Saver().restore(sess, model_path)
  print('load model from', model_path)

  probabilities = y_pred
  # Gradient of the loss with respect to the input pixels (not the weights)
  img_gradient = tf.gradients(cross_entropy, x)[0]

  num_images = img_list.shape[0]
  adversarial_img_list = list()

  # One-hot label of the wanted class; identical for every image
  adversarial_label = np.zeros((1, 10))
  adversarial_label[:, target_class] = 1

  # generate versus figure: one row of (original, delta, adversarial) per
  # image, 4 inches of height per row
  sns.set_style('white')
  versus_fig = plt.figure(figsize=(9, 4 * max(num_images, 1)))

  for img_index in range(0, num_images):
    adversarial_img = img_list[img_index: img_index+1].copy()

    confidence = 0
    iter_num = 0
    prob_history = list()
    # NOTE: the confidence is measured before each step, so the returned
    # image has received one more step than the last recorded confidence.
    while confidence < threshold:
      probabilities_val = probabilities.eval(
          feed_dict={x: adversarial_img, keep_prob: 1.0}, session=sess)
      # Track the class that was actually requested, not a fixed digit
      confidence = probabilities_val[0, target_class]
      prob_history.append(probabilities_val[0])

      gradient = img_gradient.eval(
          feed_dict={x: adversarial_img, y_: adversarial_label,
                     keep_prob: 1.0},
          session=sess)
      adversarial_img -= eta * gradient
      iter_num += 1
    print('generate adversarial image after', iter_num, 'iterations')

    # Class the model assigns to the unmodified image; used as the
    # "before" class in the titles
    source_class = int(np.argmax(prob_history[0]))

    def confidence_title(probs):
      """Title text with the confidences for the source and target class."""
      return ('Confidence for ' + str(source_class) + ': '
              + '{:.4f}'.format(probs[source_class])
              + '\nConfidence for ' + str(target_class) + ': '
              + '{:.4f}'.format(probs[target_class]))

    # generate versus figure

    ax1 = versus_fig.add_subplot(num_images, 3, 3*img_index+1)
    ax1.axis('off')
    ax1.imshow(img_list[img_index].reshape([28, 28]), 
              interpolation=None, cmap=plt.cm.gray)
    ax1.title.set_text(confidence_title(prob_history[0]))

    ax2 = versus_fig.add_subplot(num_images, 3, 3*img_index+2)
    ax2.axis('off')
    ax2.imshow((adversarial_img - img_list[img_index]).reshape([28, 28]),
                interpolation=None, cmap=plt.cm.gray)
    ax2.title.set_text('Delta')

    ax3 = versus_fig.add_subplot(num_images, 3, 3*img_index+3)
    ax3.axis('off')
    ax3.imshow((adversarial_img).reshape([28, 28]), 
                interpolation=None, cmap=plt.cm.gray)
    ax3.title.set_text(confidence_title(prob_history[-1]))

    print("Difference Measure:", 
                      np.sum((adversarial_img - img_list[img_index]) ** 2))
    adversarial_img_list.append(adversarial_img)

    if verbose != 0:
      sns.set_style('whitegrid')
      colors_list = sns.color_palette("Paired", 10)
      # generate Iteration figure: one confidence curve per class
      history = np.array(prob_history)

      fig = plt.figure(figsize=(10, 6))
      ax = fig.add_subplot(111)

      for i, record in enumerate(history.T):
          ax.plot(record, color=colors_list[i])

      # The loop variable must not be called `x`: in Python 2 it would leak
      # out of the comprehension and overwrite the input placeholder.
      ax.legend([str(digit) for digit in range(0, 10)], 
                  loc='center left', bbox_to_anchor=(1.01, 0.5), fontsize=14)
      ax.set_xlabel('Iteration')
      ax.set_ylabel('Prediction Confidence')
      if save_path is not None:
        fig.savefig(save_path + file_name + str(img_index) + '_iter.png')

  versus_fig.tight_layout()
  if save_path is not None:
    versus_fig.savefig(save_path + file_name + '_versus.png')
  return np.array(adversarial_img_list)

Here we randomly select 10 images of the digit 2 from mnist.test as input



In [3]:

    
from tensorflow.examples.tutorials.mnist import input_data
# Load the MNIST data sets with one-hot encoded labels.
mnist = input_data.read_data_sets('/tmp/tensorflow/mnist/input_data', one_hot=True)









    



Extracting /tmp/tensorflow/mnist/input_data/train-images-idx3-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data/train-labels-idx1-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data/t10k-images-idx3-ubyte.gz
Extracting /tmp/tensorflow/mnist/input_data/t10k-labels-idx1-ubyte.gz



In [4]:

    
%matplotlib inline



In [5]:

    
# Indices of all test images whose one-hot label marks the digit 2
index_mask = np.where(mnist.test.labels[:, 2])[0]
# Sample without replacement so that the 10 selected images are distinct
subset_mask = np.random.choice(index_mask, 10, replace=False)
origin_images = mnist.test.images[subset_mask]
origin_labels = mnist.test.labels[subset_mask]

Call the function to get the result



In [6]:

    
# Push every selected image towards class 6 until the model is 99% confident.
ad_img = generate_adversarial(
    model_path='../model/MNIST.ckpt',
    img_list=origin_images,
    target_class=6,
    eta=0.01,
    threshold=0.99,
    save_path='../img/',
    file_name='adversarial',
    verbose=0)









    



load model from ../model/MNIST.ckpt
generate adversarial image after 243 iterations
Difference Measure: 9.09434
generate adversarial image after 343 iterations
Difference Measure: 7.69538
generate adversarial image after 177 iterations
Difference Measure: 3.8366
generate adversarial image after 608 iterations
Difference Measure: 11.3014
generate adversarial image after 343 iterations
Difference Measure: 7.69525
generate adversarial image after 262 iterations
Difference Measure: 5.95166
generate adversarial image after 75 iterations
Difference Measure: 2.93704
generate adversarial image after 750 iterations
Difference Measure: 9.09427
generate adversarial image after 420 iterations
Difference Measure: 5.42159
generate adversarial image after 108 iterations
Difference Measure: 5.6366

Let's try to feed these adversarial images to different models



In [3]:

    
from sklearn import svm, metrics



In [4]:

    
# Train/test images and their one-hot labels for the scikit-learn models
train_images, train_labels = mnist.train.images[:], mnist.train.labels[:]
test_images, test_labels = mnist.test.images[:], mnist.test.labels[:]



In [8]:

    
# Convert each one-hot label row to its integer class id. argmax over the
# class axis is vectorized, unlike a Python lambda applied row by row.
train_labels = np.argmax(train_labels, axis=1)
test_labels = np.argmax(test_labels, axis=1)

The first one is an SVM (using SVC in scikit-learn). As the training process is slow, we only use the first 10000 training images here



In [10]:

    
# SVM classifier with otherwise default hyper-parameters; verbose=True makes
# the underlying LibSVM solver print its output during fit.
classifier = svm.SVC(probability=True, verbose=True)



In [15]:

    
# Fit on the first 10000 training samples only to keep the training time down.
classifier.fit(train_images[:10000], train_labels[:10000])









    



[LibSVM]





    Out[15]:





SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True)



In [33]:

    
# Score the SVM on the whole test set and show which digits get confused.
pred_labels = classifier.predict(test_images)
confusion = metrics.confusion_matrix(test_labels, pred_labels)
print("Confusion matrix:\n%s" % confusion)









    



Confusion matrix:
[[ 962    0    1    0    0    8    6    1    1    1]
 [   0 1121    2    3    0    2    3    0    4    0]
 [  15    3  924    9   12    3   20   15   28    3]
 [   2    2   20  927    0   20    3   12   19    5]
 [   1    6    7    0  899    0   13    2    2   52]
 [  10    8    5   44    9  781   15    3   12    5]
 [  11    3    6    1    8   13  915    0    1    0]
 [   2   21   21    5   11    1    0  938    5   24]
 [   6    9    9   19    8   33   13    8  854   15]
 [  13    9    1   11   34   10    1   16    4  910]]



In [34]:

    
# Per-class precision / recall / f1 of the test-set predictions
report = metrics.classification_report(test_labels, pred_labels)
print("Classification report for classifier %s:\n%s\n" % (classifier, report))









    



Classification report for classifier SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=True):
             precision    recall  f1-score   support

          0       0.94      0.98      0.96       980
          1       0.95      0.99      0.97      1135
          2       0.93      0.90      0.91      1032
          3       0.91      0.92      0.91      1010
          4       0.92      0.92      0.92       982
          5       0.90      0.88      0.89       892
          6       0.93      0.96      0.94       958
          7       0.94      0.91      0.93      1028
          8       0.92      0.88      0.90       974
          9       0.90      0.90      0.90      1009

avg / total       0.92      0.92      0.92     10000



In [27]:

    
# Flatten each adversarial image to one feature row. Unlike np.squeeze, this
# keeps the sample axis even when there is only a single image.
pred_labels = classifier.predict(ad_img.reshape(len(ad_img), -1))



In [28]:

    
# Display the SVM's predicted class for each adversarial image.
pred_labels









    Out[28]:





array([2, 2, 2, 2, 2, 2, 2, 2, 2, 4])

These images cannot fool the SVM, so let's try a RandomForest



In [35]:

    
from sklearn.ensemble import RandomForestClassifier



In [36]:

    
# Random forest with 200 trees; this rebinds `classifier`, replacing the SVM.
classifier = RandomForestClassifier(n_estimators=200)



In [39]:

    
# Unlike the SVM above, the forest is fit on the whole training set.
classifier.fit(train_images, train_labels)









    Out[39]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)



In [40]:

    
# Score the random forest on the whole test set.
pred_labels = classifier.predict(test_images)
confusion = metrics.confusion_matrix(test_labels, pred_labels)
report = metrics.classification_report(test_labels, pred_labels)
print("Confusion matrix:\n%s" % confusion)
print("Classification report for classifier %s:\n%s\n" % (classifier, report))









    



Confusion matrix:
[[ 968    0    0    0    0    3    4    1    3    1]
 [   0 1125    2    2    0    2    2    0    1    1]
 [   5    0 1000    7    3    0    3    8    6    0]
 [   0    0   11  974    0    6    0    9    8    2]
 [   1    0    1    0  956    0    5    0    2   17]
 [   2    0    0   11    4  859    6    2    5    3]
 [   6    3    0    0    3    4  938    0    4    0]
 [   1    3   22    2    1    0    0  986    2   11]
 [   3    0    5    8    3    6    4    4  931   10]
 [   6    6    3    8   14    1    1    4    6  960]]
Classification report for classifier RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False):
             precision    recall  f1-score   support

          0       0.98      0.99      0.98       980
          1       0.99      0.99      0.99      1135
          2       0.96      0.97      0.96      1032
          3       0.96      0.96      0.96      1010
          4       0.97      0.97      0.97       982
          5       0.98      0.96      0.97       892
          6       0.97      0.98      0.98       958
          7       0.97      0.96      0.97      1028
          8       0.96      0.96      0.96       974
          9       0.96      0.95      0.95      1009

avg / total       0.97      0.97      0.97     10000



In [41]:

    
# Flatten each adversarial image to one feature row. Unlike np.squeeze, this
# keeps the sample axis even when there is only a single image.
pred_labels = classifier.predict(ad_img.reshape(len(ad_img), -1))



In [42]:

    
# Display the random forest's predicted class for each adversarial image.
pred_labels









    Out[42]:





array([2, 8, 8, 2, 2, 2, 2, 2, 2, 8])

Even though the noise does confuse the classifier, the predicted label is not the one we want

How about a common LeNet?



In [7]:

    
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from keras import backend as K









    



Using TensorFlow backend.



In [18]:

    
# input image dimensions
img_rows, img_cols = 28, 28
# number of convolutional filters to use
nb_filters = 32
# size of pooling area for max pooling
pool_size = (2, 2)
# convolution kernel size
kernel_size = (3, 3)

input_shape = (img_rows, img_cols, 1)

batch_size = 128
nb_classes = 10
nb_epoch = 50



In [19]:

    
# Reshape the flat images to the network's input_shape. The sample count is
# inferred with -1 instead of being hard-coded per split. Labels stay one-hot.
train_images = mnist.train.images.reshape((-1,) + input_shape)
train_labels = mnist.train.labels
test_images = mnist.test.images.reshape((-1,) + input_shape)
test_labels = mnist.test.labels

valid_images = mnist.validation.images.reshape((-1,) + input_shape)
valid_labels = mnist.validation.labels



In [20]:

    
# Two 3x3 conv layers, one max pool, then a 128-unit dense layer and a
# softmax over the classes, with dropout after the pooling and dense stages.
model = Sequential([
    Convolution2D(nb_filters, kernel_size[0], kernel_size[1],
                  border_mode='valid',
                  input_shape=input_shape),
    Activation('relu'),
    Convolution2D(nb_filters, kernel_size[0], kernel_size[1]),
    Activation('relu'),
    MaxPooling2D(pool_size=pool_size),
    Dropout(0.25),

    Flatten(),
    Dense(128),
    Activation('relu'),
    Dropout(0.5),
    Dense(nb_classes),
    Activation('softmax'),
])

model.compile(loss='categorical_crossentropy',
              optimizer='adadelta',
              metrics=['accuracy'])

# Train silently, validating on the held-out validation split.
model.fit(train_images, train_labels,
          batch_size=batch_size, nb_epoch=nb_epoch,
          verbose=0, validation_data=(valid_images, valid_labels))









    Out[20]:





<keras.callbacks.History at 0x7fcfe5ccb7b8>



In [21]:

    
# Evaluate on the test set; score[0] is the loss and score[1] the accuracy
# metric requested in model.compile.
score = model.evaluate(test_images, test_labels, verbose=0)
print('Test score:', score[0])
print('Test accuracy:', score[1])









    



Test score: 0.0306128276816
Test accuracy: 0.991



In [22]:

    
# Class probabilities for every adversarial image. The batch size is inferred
# with -1 so that any number of images works.
model.predict(ad_img.reshape((-1,) + input_shape))









    Out[22]:





array([[  6.00433941e-06,   5.20795879e-07,   9.96605277e-01,
          3.36286286e-03,   2.46108982e-08,   2.73512697e-08,
          6.82997436e-10,   2.81186857e-07,   2.22933668e-05,
          2.79486949e-06],
       [  6.19925868e-06,   5.65824390e-04,   9.87202704e-01,
          1.11195054e-02,   2.31055110e-05,   3.90526202e-06,
          5.11477174e-06,   7.03725324e-04,   3.63981846e-04,
          5.98347060e-06],
       [  5.38399902e-08,   5.13083578e-05,   9.97262955e-01,
          2.65042088e-03,   3.31562120e-07,   2.84679780e-10,
          2.38823901e-07,   3.39144353e-05,   7.66287201e-07,
          1.51679924e-09],
       [  4.54879628e-05,   2.78240349e-02,   8.73205364e-01,
          9.19484869e-02,   1.77355628e-06,   1.03264625e-04,
          4.38309326e-06,   3.09151085e-03,   3.74361919e-03,
          3.20698891e-05],
       [  6.19954244e-06,   5.65640396e-04,   9.87206459e-01,
          1.11163044e-02,   2.31009508e-05,   3.90349351e-06,
          5.11413782e-06,   7.03264377e-04,   3.63991217e-04,
          5.98202132e-06],
       [  7.06175997e-05,   4.19604324e-10,   9.61663306e-01,
          1.33769892e-04,   1.76799564e-09,   3.40369468e-07,
          1.09665926e-07,   6.17409341e-08,   3.81158739e-02,
          1.60112504e-05],
       [  4.69411121e-10,   2.28347216e-10,   9.99999642e-01,
          4.72751331e-08,   2.26244712e-09,   1.48985671e-10,
          3.15392157e-08,   6.54405807e-10,   2.98406178e-07,
          4.24915962e-12],
       [  1.15019027e-02,   1.12500635e-03,   8.94971132e-01,
          2.77649779e-02,   1.93419037e-04,   1.31618662e-03,
          2.16246059e-04,   6.09698966e-02,   7.99867499e-04,
          1.14144932e-03],
       [  7.30261218e-06,   2.59909919e-03,   9.97068584e-01,
          2.48752622e-04,   2.57733348e-07,   1.48156962e-08,
          1.49892728e-07,   5.54841572e-05,   1.32923824e-05,
          7.02647094e-06],
       [  8.21429892e-13,   1.51245126e-06,   9.99998450e-01,
          2.78696692e-08,   5.67495703e-12,   1.25209806e-13,
          9.22020620e-12,   2.70595612e-11,   8.14725176e-09,
          2.94933728e-13]], dtype=float32)



In [23]:

    
# Predicted class for every adversarial image. The batch size is inferred
# with -1 so that any number of images works.
model.predict_classes(ad_img.reshape((-1,) + input_shape))









    



10/10 [==============================] - 0s






    Out[23]:





array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

Here we try to fool other models using the same adversarial images

Just import the code

Here we randomly select 10 images of the digit 2 from mnist.test as input

Call the function to get the result

Let's try to feed these adversarial images to different models

The first one is an SVM (using SVC in scikit-learn). As the training process is slow, we only use the first 10000 training images here

These images cannot fool the SVM, so let's try a RandomForest

Even though the noise does confuse the classifier, the predicted label is not the one we want

How about a common LeNet?

From these results, we can conclude that these adversarial images do not transfer to different models