In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals  

import numpy as np
import pandas as pd
from sklearn import linear_model, preprocessing, cluster
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.linalg as slin
import scipy.sparse.linalg as sparselin
import scipy.sparse as sparse
import IPython

import tensorflow as tf

from inceptionModel import BinaryInceptionModel
from logisticRegressionWithLBFGS import LogisticRegressionWithLBFGS
from binaryLogisticRegressionWithLBFGS import BinaryLogisticRegressionWithLBFGS

from load_animals import load_animals

import experiments
from image_utils import plot_flat_bwimage, plot_flat_bwgrad, plot_flat_colorimage, plot_flat_colorgrad
 
from dataset import DataSet
from tensorflow.contrib.learn.python.learn.datasets import base

sns.set(color_codes=True)


Using TensorFlow backend.

In [2]:
def reverse_preprocess(x):
    x /= 2.
    x += 0.5
    return x

Setup


In [4]:
num_train_ex_per_class = 900
num_test_ex_per_class = 300

model_name = 'animals_%s_%s' % (num_train_ex_per_class, num_test_ex_per_class)
# image_data_sets = load_animals(num_train_ex_per_class=num_train_ex_per_class, num_test_ex_per_class=num_test_ex_per_class)

In [10]:
train_f = np.load('/srv/scratch/pangwei/influence_data/%s_inception_features_train.npz' % model_name)
train = DataSet(train_f['inception_features_val'], train_f['labels'])
test_f = np.load('/srv/scratch/pangwei/influence_data/%s_inception_features_test.npz' % model_name)
test = DataSet(test_f['inception_features_val'], test_f['labels'])
validation = None

data_sets = base.Datasets(train=train, validation=validation, test=test)

input_dim = 2048
weight_decay = 0.01
batch_size = 900
initial_learning_rate = 0.001 
keep_probs = None
decay_epochs = [1000, 10000]
max_lbfgs_iter = 1000
num_classes = 10

tf.reset_default_graph()

model = LogisticRegressionWithLBFGS(
    input_dim=input_dim,
    weight_decay=weight_decay,
    max_lbfgs_iter=max_lbfgs_iter,
    num_classes=num_classes, 
    batch_size=batch_size,
    data_sets=data_sets,
    initial_learning_rate=initial_learning_rate,
    keep_probs=keep_probs,
    decay_epochs=decay_epochs,
    mini_batch=False,
    train_dir='data',
    log_dir='log',
    model_name='animals_inception_onlytop_poisoned')

model.train()
weights = model.sess.run(model.weights)
np.save('data/inception_weights_%s' % model_name, weights)

Y_pred = model.sess.run(model.preds, feed_dict=model.all_test_feed_dict)
Y_test = model.data_sets.test.labels


Total number of parameters: 20480
Using normal model
LBFGS training took [72] iter.
After training with LBFGS: 
Train loss (w reg) on all data: 0.117541
Test loss (w/o reg) on all data: 0.202384
Train acc on all data:  0.991
Test acc on all data:   0.943666666667
Norm of the mean of gradients: 1.41374e-05
Norm of the params: 3.29989

Look for good test_idx


In [9]:
model.data_sets.test.labels


Out[9]:
array([ 5.,  2.,  2., ...,  8.,  6.,  6.])

In [10]:
image_data_sets.test.labels


Out[10]:
array([ 0.,  5.,  8., ...,  3.,  6.,  5.])

In [12]:
assert np.all(model.data_sets.test.labels == image_data_sets.test.labels)
assert np.all(model.data_sets.train.labels == image_data_sets.train.labels)

In [19]:
for test_idx in range(200,400):    
    if np.max(Y_pred[test_idx, :]) > 0.99: continue
#     if Y_test[test_idx] != 0: continue
#     test_idx = 0
    print(test_idx, Y_pred[test_idx,  Y_test[test_idx]])
    plt.imshow((np.reshape(image_data_sets.test.x[test_idx, :], [299, 299, 3]) + 1) / 2, interpolation='none')  
    plt.axis('off')
    plt.tight_layout()
    plt.show()


/users/pangwei/miniconda3/envs/py27/lib/python2.7/site-packages/ipykernel/__main__.py:5: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
201 0.988933
203 0.618373
210 0.594774
213 0.857126
216 0.807488
224 0.806654
225 0.966291
226 0.981206
232 0.94569
233 0.748557
235 0.987567
237 0.24144
240 0.836723
243 0.98813
244 0.0263649
254 0.98114
256 0.82994
258 0.987387
259 0.979866
260 0.941088
262 0.968038
264 0.82492
268 0.98047
272 0.907655
273 0.984172
274 0.902911
275 0.950758
276 0.977823
279 0.96003
281 0.962839
285 0.98852
286 0.944731
292 0.98614
296 0.126662
297 0.947959
298 0.975673
301 0.0576257
303 0.864869
304 0.980167
308 0.976438
311 0.751896
312 0.983653
314 0.959349
315 0.427759
324 0.813789
331 0.966236
332 0.947394
336 0.989227
339 0.989627
340 0.923174
343 0.980296
344 0.911273
346 0.853698
349 0.134631
350 0.986109
352 0.40651
354 0.971893
357 0.985063
361 0.984606
363 0.967646
367 0.0131675
368 0.584983
369 0.838594
376 0.0172747
381 0.840271
382 0.980806
383 0.94746
384 0.700287
385 0.465184
386 0.79191
387 0.64753
391 0.357741
392 0.92911
394 0.962392
395 0.969534
396 0.662224
397 0.962612

In [25]:
test_indices = [11, 41, 54, 66, 85, 88, 100, 125, 173]

In [26]:
for test_idx in test_indices:
    print(test_idx, Y_pred[test_idx,  Y_test[test_idx]])
    plt.imshow((np.reshape(image_data_sets.test.x[test_idx, :], [299, 299, 3]) + 1) / 2, interpolation='none')  
    plt.axis('off')
    plt.tight_layout()
    plt.show()


/users/pangwei/miniconda3/envs/py27/lib/python2.7/site-packages/ipykernel/__main__.py:2: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  from ipykernel import kernelapp as app
11 0.961979
41 0.975805
54 0.985501
66 0.986604
85 0.984716
88 0.986016
100 0.926866
125 0.976268
173 0.951763

Attack


In [14]:
# Test indices: [11, 41, 54, 66, 85, 88, 100, 125, 173]
test_idx = 41
model_string = 'animals_%s_%s_inception' % (num_train_ex_per_class, num_test_ex_per_class)

# train_dict = np.load('/srv/scratch/pangwei/influence_data/animals_900_300_inception_inception_features_poisoned_train_influence_poison-MAI-replace-0.1_testidx_[21].npz')
train_dict = np.load(
    '/srv/scratch/pangwei/influence_data/%s_inception_features_poisoned_train_influence_poison-maxgrad-linf-replace-0.01_testidx_[%s].npz'\
    % (model_string, test_idx))
# train_dict = np.load('/srv/scratch/pangwei/influence_data/%s_features_train.npz' % model_string)
### Normal
# train_dict = np.load('data/%s_features_train.npz' % model_string)

train = DataSet(np.reshape(train_dict['inception_features_val'], [-1, 2048]), train_dict['labels'])
test_dict = np.load('/srv/scratch/pangwei/influence_data/%s_features_test.npz' % model_string)
test = DataSet(np.reshape(test_dict['inception_features_val'], [-1, 2048]), test_dict['labels'])
validation = None

data_sets = base.Datasets(train=train, validation=validation, test=test)

modify_type = 'replace'
assert(all(image_data_sets.train.labels == data_sets.train.labels))
assert(all(image_data_sets.test.labels == data_sets.test.labels))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-b11e8008ff11> in <module>()
     19 
     20 modify_type = 'replace'
---> 21 assert(all(image_data_sets.train.labels == data_sets.train.labels))
     22 assert(all(image_data_sets.test.labels == data_sets.test.labels))

NameError: name 'image_data_sets' is not defined

In [15]:
tf.reset_default_graph()

model = LogisticRegressionWithLBFGS(
    input_dim=input_dim,
    weight_decay=weight_decay,
    max_lbfgs_iter=max_lbfgs_iter,
    num_classes=num_classes, 
    batch_size=batch_size,
    data_sets=data_sets,
    initial_learning_rate=initial_learning_rate,
    keep_probs=keep_probs,
    decay_epochs=decay_epochs,
    mini_batch=False,
    train_dir='data',
    log_dir='log',
    model_name='animals_inception_onlytop_poisoned')

model.train()

# Y_test = model.data_sets.test.labels
attacked_Y_pred = model.sess.run(model.preds, feed_dict=model.all_test_feed_dict)


Total number of parameters: 20480
Using normal model
LBFGS training took [69] iter.
After training with LBFGS: 
Train loss (w reg) on all data: 0.121604
Test loss (w/o reg) on all data: 0.209222
Train acc on all data:  0.989222222222
Test acc on all data:   0.939
Norm of the mean of gradients: 1.97514e-05
Norm of the params: 3.35533

In [16]:
classes = ['dog', 'cat', 'bird', 'fish', 'horse', 'monkey', 'zebra', 'panda', 'lemur', 'wombat']

print(test_idx)
print('orig     correct pred: %s' % Y_pred[test_idx, int(Y_test[test_idx])])
print('attacked correct pred: %s' % attacked_Y_pred[test_idx, int(Y_test[test_idx])])
print('new prediction: %s, %s' % (
        np.max(attacked_Y_pred[test_idx, :]), 
        classes[np.argmax(attacked_Y_pred[test_idx, :])]))
print(Y_pred[test_idx])
print(attacked_Y_pred[test_idx])
# plt.subplots(figsize=(3,3))
# plt.imshow((np.reshape(image_data_sets.test.x[test_idx, :], [299, 299, 3]) + 1) / 2, interpolation='none')  
# plt.axis('off')
# plt.title('96.2% horse to 99.7% dog')
# plt.tight_layout()
# plt.savefig("figs/attack-horse.png", dpi=300, bbox_inches='tight')


41
orig     correct pred: 0.975805
attacked correct pred: 0.0129939
new prediction: 0.981997, monkey
[  1.79548434e-03   7.01588229e-04   1.19656604e-03   4.00028424e-04
   1.91497209e-04   1.55262621e-02   2.27576075e-03   1.05298776e-03
   9.75804508e-01   1.05533306e-03]
[  8.13044084e-04   4.52055916e-04   1.18824840e-03   2.78284570e-04
   1.25428356e-04   9.81996834e-01   1.16383343e-03   4.58468305e-04
   1.29938694e-02   5.29802579e-04]

In [ ]:
Y_pred_correct = np.zeros([len(Y_test)])
for idx, label in enumerate(Y_test):
    Y_pred_correct[idx] = Y_pred[idx, int(label)]

In [50]:
poisoned_Y_pred_correct = Y_pred_correct

In [46]:
orig_Y_pred_correct = Y_pred_correct

In [69]:
sns.distplot(orig_Y_pred_correct, kde=False)
sns.distplot(poisoned_Y_pred_correct, kde=False)


Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2e30094950>

In [68]:
sns.distplot(orig_Y_pred_correct - Y_pred_correct, kde=False)


Out[68]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2e7c066810>

In [64]:
sns.distplot(orig_Y_pred_correct - poisoned_Y_pred_correct, kde=False)


Out[64]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f2e583acd90>

In [65]:
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx])

test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)


True: 0.0
Predicted: [  1.48447621e-09   3.59972619e-05   1.84162855e-04   9.94826734e-01
   2.60378001e-03   1.22097656e-04   3.62382380e-05   1.70509284e-05
   1.00722882e-05   2.16390658e-03]

Pictures of animals and their predictions (without poisoning)


In [61]:
test_idx = 1
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx, :])

test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)


True: 0.0
Predicted: [  9.93492901e-01   7.68667014e-05   5.92689728e-04   3.35619436e-03
   7.16991839e-04   3.13720084e-04   2.36264925e-04   8.47417905e-05
   2.30153655e-05   1.10662507e-03]

In [237]:
test_idx = 15
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx])
log_loss = -np.log(Y_test[test_idx] * Y_pred[test_idx] + (1 - Y_test[test_idx]) * (1 - Y_pred[test_idx]))
print('Log loss: %s' % log_loss)

test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)


True: 1.0
Predicted: 0.999451
Log loss: 0.000549049874037

In [241]:
test_idx = 33
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx])
log_loss = -np.log(Y_test[test_idx] * Y_pred[test_idx] + (1 - Y_test[test_idx]) * (1 - Y_pred[test_idx]))
print('Log loss: %s' % log_loss)

test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)


True: 1.0
Predicted: 0.888253
Log loss: 0.118498696429

In [242]:
sort_idx = 484
np.sort(Y_pred)[sort_idx]
test_idx = np.argsort(Y_pred)[sort_idx]
print(test_idx)
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx])
log_loss = -np.log(Y_test[test_idx] * Y_pred[test_idx] + (1 - Y_test[test_idx]) * (1 - Y_pred[test_idx]))
print('Log loss: %s' % log_loss)

test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)


825
True: 0.0
Predicted: 0.0414969
Log loss: 0.0423824617834

In [73]:
test_idx = 45
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx])

test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)


True: 0.0
Predicted: 0.000371538

How different are the poisoned examples vs. normal variation?


In [78]:
np.min(image_data_sets.train.x)


Out[78]:
-1.0

In [79]:
centers


Out[79]:
array([[-0.15708911, -0.14228433,  0.01859124, ...,  0.14652792,
        -0.08452277,  0.00155818],
       [-0.10103229, -0.01307835, -0.14758964, ..., -0.05739734,
        -0.01313132,  0.04331716]])

In [80]:
label_indices


Out[80]:
array([False, False, False, False,  True,  True,  True,  True,  True,
        True, False, False,  True,  True,  True,  True, False,  True,
       False, False, False, False,  True, False, False, False, False,
        True,  True, False, False, False,  True, False,  True, False,
       False,  True, False,  True, False, False,  True, False,  True,
       False, False, False,  True, False,  True,  True, False,  True,
        True,  True,  True,  True, False,  True,  True,  True, False,
       False,  True,  True, False, False,  True,  True, False, False,
        True,  True,  True,  True, False, False, False,  True, False,
        True,  True, False,  True,  True,  True, False,  True,  True,
        True, False, False,  True, False, False, False, False, False, False], dtype=bool)

In [81]:
distances.shape


Out[81]:
(1000, 268203)

In [16]:
# Compute average l2 distance of each training example to their 
# cluster center
dim = image_data_sets.train.x.shape[1]
centers = np.zeros([2, dim])
avg_dist = np.zeros([2])
for label in [0, 1]:
    label_indices = (image_data_sets.train.labels == label)
    centers[label, :] = np.mean(image_data_sets.train.x[label_indices, :], axis=0)
    dists = image_data_sets.train.x[label_indices, :] - centers[label]
    avg_dist[label] = np.mean(np.linalg.norm(dists, axis=1))

avg_dist


Out[16]:
array([ 263.68387306,  257.04856512])

In [17]:
# Compute l2 distance of a perturbed example to the original example
# Each pixel is modified by (2.0 / 255)
perturbed_distance = np.sqrt(dim * 2.0 / 255)
perturbed_distance


Out[17]:
45.864506333072754

In [82]:
### Feature space
normal_train_dict = np.load('data/%s_features_train.npz' % model_string)
X = normal_train_dict['inception_features_val']
Y = normal_train_dict['labels']

dim = X.shape[1]
centers = np.zeros([2, dim])
avg_dist = np.zeros([2])
for label in [0, 1]:
    label_indices = (Y == label)
    centers[label, :] = np.mean(X[label_indices, :], axis=0)
    dists = X[label_indices, :] - centers[label]
    norms = np.linalg.norm(dists, axis=1)
    avg_dist[label] = np.mean(np.linalg.norm(dists, axis=1))
    sns.distplot(norms)

avg_dist


Out[82]:
array([ 19.92613171,  22.4012713 ])

In [83]:
normal_train_dict = np.load('data/%s_features_train.npz' % model_string)
X = normal_train_dict['inception_features_val']
Y = normal_train_dict['labels']

poisoned_train_dict = np.load('data/normal_dog_2000_1000_inception_inception_features_poisoned_train_influence_poison-maxgrad-linf-replace-0.05_testidx_825.npz')
X_poison = poisoned_train_dict['inception_features_val']
Y_poison = poisoned_train_dict['labels']

In [84]:
assert all(Y == Y_poison)
modified_idx = np.where(~np.all(X == X_poison, axis=1))[0]
dists = X_poison[modified_idx, :] - X[modified_idx, :]
np.mean(np.linalg.norm(dists, axis=1))


Out[84]:
23.580151

In [85]:
for label in [0, 1]:
    label_indices = (Y_poison == label)
    centers[label, :] = np.mean(X_poison[label_indices, :], axis=0)        
    dists = X_poison[label_indices, :] - centers[label]
    norms = np.linalg.norm(dists, axis=1)
    avg_dist[label] = np.mean(np.linalg.norm(dists, axis=1))

X_unmodified = X_poison[~modified_idx, :]
Y_unmodified = Y_poison[~modified_idx]
for label in [0, 1]:
    label_indices = (Y_unmodified == label)
    dists = X_unmodified[label_indices, :] - centers[label]
    avg_dist[label] = np.mean(np.linalg.norm(dists, axis=1))
    norms = np.linalg.norm(dists, axis=1)
    sns.distplot(norms)

X_modified = X_poison[modified_idx, :]
Y_modified = Y_poison[modified_idx]
for label in [0, 1]:
    label_indices = (Y_modified == label)
    dists = X_modified[label_indices, :] - centers[label]
    avg_dist[label] = np.mean(np.linalg.norm(dists, axis=1))
    norms = np.linalg.norm(dists, axis=1)
    sns.distplot(norms)
avg_dist


Out[85]:
array([ 31.01453623,  29.19473356])

In [87]:
np.linalg.norm(centers[0, :] - centers[1, :])


Out[87]:
20.578502823118829

Visualize poisoned training examples


In [13]:
f = np.load('data/skewed_dog_inception_poisoned_data_sets_testidx_403.npy')

In [19]:
poisoned_train = f[0]
validation = f[1]
test = f[2]
poisoned_data_sets = poisoned_data_sets = base.Datasets(train=poisoned_train, validation=validation, test=test)

assert np.all(poisoned_data_sets.test.x == image_data_sets.test.x)
assert all(poisoned_data_sets.test.labels == image_data_sets.test.labels)
assert all(poisoned_data_sets.train.labels == image_data_sets.train.labels)
assert (np.max(np.abs(poisoned_data_sets.train.x - image_data_sets.train.x)) * 255 / 2) <= 1.0001

In [18]:
np.min(poisoned_data_sets.train.x)


Out[18]:
-1.0

In [16]:
# Make sure that the poisoned data sets are at most 1 different from the orig data sets in each pixel
np.max(np.abs(poisoned_data_sets.train.x - image_data_sets.train.x)) * 255 / 2


Out[16]:
2.0000074803829193

In [24]:
train_idx = 1225

orig_train_image = reverse_preprocess(np.copy(image_data_sets.train.x[train_idx, :]))
orig_train_label = image_data_sets.train.labels[train_idx]
plot_flat_colorimage(orig_train_image, orig_train_label, side=299)

poisoned_train_image = reverse_preprocess(np.copy(poisoned_data_sets.train.x[train_idx, :]))
poisoned_train_label = poisoned_data_sets.train.labels[train_idx]
plot_flat_colorimage(poisoned_train_image, poisoned_train_label, side=299)



In [34]:
diff = poisoned_train_image - orig_train_image

diff < 0


Out[34]:
array([ True,  True,  True, ..., False,  True,  True], dtype=bool)

In [36]:
diff


Out[36]:
array([ 0.00392157,  0.00392157,  0.00392157, ...,  0.        ,
        0.00392157,  0.00392157], dtype=float32)

In [39]:
poisoned_train_image.shape


Out[39]:
(268203,)

In [54]:
diff = poisoned_train_image - orig_train_image
diff[diff < 0] = 0
reshaped_diff = np.reshape(diff, [299, 299, 3])
# print(reshaped_diff[:10, :10, :])
plot_flat_colorgrad(diff * 10, side=299)


The change in model behavior could be because of a change in the logistic regression weights, or a change in the Inception features generated.

Seems like both change...?

How different are the learned weights?


In [88]:
# Train normal model
train_dict = np.load('data/skewed_dog_inception_morereg_inception_features_train.npz')
train = DataSet(np.reshape(train_dict['inception_features_val'], [-1, 2048]), train_dict['labels'])
test_dict = np.load('data/skewed_dog_inception_morereg_inception_features_test.npz')
test = DataSet(np.reshape(test_dict['inception_features_val'], [-1, 2048]), test_dict['labels'])
validation = None
data_sets = base.Datasets(train=train, validation=validation, test=test)
assert(all(image_data_sets.train.labels == data_sets.train.labels))
assert(all(image_data_sets.test.labels == data_sets.test.labels))

tf.reset_default_graph()

model = BinaryLogisticRegressionWithLBFGS(
    input_dim=input_dim,
    weight_decay=weight_decay,
    max_lbfgs_iter=max_lbfgs_iter,
    num_classes=num_classes, 
    batch_size=batch_size,
    data_sets=data_sets,
    initial_learning_rate=initial_learning_rate,
    keep_probs=keep_probs,
    decay_epochs=decay_epochs,
    mini_batch=False,
    train_dir='data',
    log_dir='log',
    model_name='skewed_dog_inception_onlytop_poisoned')

model.train()
normal_weights = model.sess.run(model.params)[0]

# Train poisoned model
test_idx = 403

train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_influence_testidx_%s.npz' % test_idx) 
train = DataSet(np.reshape(train_dict['inception_features_val'], [-1, 2048]), train_dict['labels'])
test_dict = np.load('data/skewed_dog_inception_morereg_inception_features_test.npz')
test = DataSet(np.reshape(test_dict['inception_features_val'], [-1, 2048]), test_dict['labels'])
validation = None
data_sets = base.Datasets(train=train, validation=validation, test=test)
assert(all(image_data_sets.train.labels == data_sets.train.labels))
assert(all(image_data_sets.test.labels == data_sets.test.labels))
tf.reset_default_graph()

model = BinaryLogisticRegressionWithLBFGS(
    input_dim=input_dim,
    weight_decay=weight_decay,
    max_lbfgs_iter=max_lbfgs_iter,
    num_classes=num_classes, 
    batch_size=batch_size,
    data_sets=data_sets,
    initial_learning_rate=initial_learning_rate,
    keep_probs=keep_probs,
    decay_epochs=decay_epochs,
    mini_batch=False,
    train_dir='data',
    log_dir='log',
    model_name='skewed_dog_inception_onlytop_poisoned')

model.train()
poisoned_weights = model.sess.run(model.params)[0]


Total number of parameters: 2048
Using normal model
LBFGS training took [30] iter.
After training with LBFGS: 
Train loss on all data: 0.0123257
Train acc on all data:  1.0
Test acc on all data:   0.982857142857
Norm of the mean of gradients: 1.30656e-06
Norm of the params: 1.23108
Total number of parameters: 2048
Using normal model
LBFGS training took [34] iter.
After training with LBFGS: 
Train loss on all data: 0.014132
Train acc on all data:  1.0
Test acc on all data:   0.778571428571
Norm of the mean of gradients: 1.07193e-06
Norm of the params: 1.34328

In [89]:
plt.scatter(normal_weights, poisoned_weights)


Out[89]:
<matplotlib.collections.PathCollection at 0x7faddebdae10>

In [90]:
poisoned_weights.shape


Out[90]:
(2048,)

How different are the Inception features?


In [79]:
# Normal
normal_train_dict = np.load('data/skewed_dog_inception_morereg_inception_features_train.npz')

# Poisoned
test_idx = 10
poisoned_train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_influence_testidx_%s.npz' % test_idx) 
# poisoned_train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_random_testidx_None.npz')
# poisoned_train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_traingrad_testidx_None.npz')
# poisoned_train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_mirror_testidx_%s.npz' % test_idx)

# poisoned_train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_testidx_None.npz')

X_train_normal = normal_train_dict['inception_features_val']
X_train_poisoned = poisoned_train_dict['inception_features_val']

Y_train_normal = normal_train_dict['labels']
Y_train_poisoned = poisoned_train_dict['labels']
assert(all(Y_train_normal == Y_train_poisoned))
# np.reshape(train_dict['inception_features_val'], [-1, 2048]

In [80]:
diff = X_train_normal - X_train_poisoned
diff_norm = np.linalg.norm(diff, axis=1)
normal_norm = np.linalg.norm(X_train_normal, axis=1)
poisoned_norm = np.linalg.norm(X_train_poisoned, axis=1)

In [87]:
plt.scatter(normal_norm, poisoned_norm)
plt.xlim(15, 55)
plt.ylim(15, 55)


Out[87]:
(15, 55)

In [82]:
sns.distplot(diff_norm)
sns.distplot(normal_norm)
sns.distplot(poisoned_norm)

# Blue: Differences
# Green: Original
# Red: Poisoned


Out[82]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fad9c72d850>

In [ ]: