In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import os
import numpy as np
import pandas as pd
from sklearn import linear_model, preprocessing, cluster
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.linalg as slin
import scipy.sparse.linalg as sparselin
import scipy.sparse as sparse
import IPython
import copy
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.datasets import base
from influence.inceptionModel import BinaryInceptionModel
from influence.binaryLogisticRegressionWithLBFGS import BinaryLogisticRegressionWithLBFGS
import influence.experiments as experiments
from influence.image_utils import plot_flat_bwimage, plot_flat_bwgrad, plot_flat_colorimage, plot_flat_colorgrad
from influence.dataset import DataSet
from influence.dataset_poisoning import generate_inception_features
from load_animals import load_animals, load_dogfish_with_koda
# Apply seaborn's default styling to the figures drawn below. Per the seaborn
# docs, color_codes=True remaps matplotlib's one-letter color codes ('b', 'g',
# 'r', ...) to the seaborn palette.
sns.set(color_codes=True)
In [11]:
# Load the raw dog/fish images (used further down for plotting and pixel-space
# comparisons) and the saved Inception feature files for the same data set.
image_data_sets = load_dogfish_with_koda()
dataset_name = 'dogfish_koda'

# np.load on an .npz returns a lazy NpzFile that keeps the archive open until it
# is closed. The arrays are read inside the `with` blocks, so each file handle is
# released as soon as its DataSet has been built.
with np.load('output/%s_inception_features_new_train.npz' % dataset_name) as train_f:
    train = DataSet(train_f['inception_features_val'], train_f['labels'])
with np.load('output/%s_inception_features_new_test.npz' % dataset_name) as test_f:
    test = DataSet(test_f['inception_features_val'], test_f['labels'])

validation = None  # no validation split is passed to the model
data_sets = base.Datasets(train=train, validation=validation, test=test)

# Keep an independent copy of the training features, plus aliases for the labels.
orig_X_train = np.copy(data_sets.train.x)
Y_train = data_sets.train.labels
Y_test = data_sets.test.labels
In [12]:
# Hyperparameters for the logistic-regression model fitted on the feature vectors.
input_dim, num_classes = 2048, 2
weight_decay = 0.001
max_lbfgs_iter = 1000
batch_size = 30
initial_learning_rate = 0.001
decay_epochs = [1000, 10000]
keep_probs = None

# Clear TensorFlow's default graph before the model is constructed.
tf.reset_default_graph()

model = BinaryLogisticRegressionWithLBFGS(
    data_sets=data_sets,
    model_name='%s_inception_onlytop' % dataset_name,
    train_dir='output_ipynb',
    log_dir='log',
    input_dim=input_dim,
    num_classes=num_classes,
    weight_decay=weight_decay,
    max_lbfgs_iter=max_lbfgs_iter,
    batch_size=batch_size,
    initial_learning_rate=initial_learning_rate,
    decay_epochs=decay_epochs,
    keep_probs=keep_probs,
    mini_batch=False)
model.train()

# Snapshot the fitted weights and the model outputs on the full train and test
# feeds; later cells compare against these "original" predictions.
weights = model.sess.run(model.weights)
orig_Y_train_pred = model.sess.run(model.preds, feed_dict=model.all_train_feed_dict)
orig_Y_pred = model.sess.run(model.preds, feed_dict=model.all_test_feed_dict)
In [6]:
# Saved output of one attack run. Alternative runs that can be swapped in:
# iter_f = np.load('output/dogfish_koda_inception_wd-0.001_attack_normal_loss_testidx-all_dogfish_koda_trainidx-[1141]_stepsize-0.01_proj_iter-1000.npz')
# iter_f = np.load('output/dogfish_koda_inception_wd-0.001_attack_normal_loss_testidx-all_dogfish_koda_trainidx-[1141]_stepsize-0.005_proj_iter-500.npz')
iter_f = np.load('output/dogfish_koda_inception_wd-0.001_attack_normal_loss_testidx-all_dogfish_koda_trainidx-[1141]_stepsize-0.005_proj_iter-2000.npz')

# Print the stored test predictions, then the number of rows whose first column
# is below 0.5.
attack_test_pred = iter_f['test_pred']
print(attack_test_pred)
print(np.sum(attack_test_pred[:, 0] < 0.5))
In [13]:
# Show the test images whose post-attack prediction (first column of
# iter_f['test_pred']) is below 0.5, titled "original -> post-attack".

# Read the array once: indexing an NpzFile loads the entry from the archive on
# every access, which the loop previously did several times per test point.
test_pred = iter_f['test_pred']
flipped_test_indices = np.flatnonzero(test_pred[:, 0] < 0.5)

fig, axs = plt.subplots(4, 4, figsize=(16, 16))
grid_axes = axs.ravel()  # row-major, i.e. the same fill order as row/col counters
if len(flipped_test_indices) > len(grid_axes):
    print('Showing only the first %s of %s matching test points' % (
        len(grid_axes), len(flipped_test_indices)))

# zip() stops at the last free panel, so more matches than panels can no longer
# index past the end of the 4x4 grid.
for ax, i in zip(grid_axes, flipped_test_indices):
    print('test idx: %s' % i)
    print(test_pred[i, 0])
    # ax.imshow((np.reshape(iter_f['poisoned_X_train_image'], [299, 299, 3]) + 1) / 2, interpolation='none')
    ax.imshow((np.reshape(image_data_sets.test.x[i, :], [299, 299, 3]) + 1) / 2, interpolation='none')
    ax.axis('off')
    ax.set_title(
        '%s -> %s' % (
            orig_Y_pred[i, 0],  # Change label if using 300 test set
            test_pred[i, 0]))
# plt.savefig("figs/attack-dog-grid.png", dpi=300, bbox_inches='tight')
In [14]:
# This figure only makes sense when exactly one training image was poisoned.
assert len(iter_f['indices_to_poison']) == 1
train_idx = iter_f['indices_to_poison'][0]

clean_image = image_data_sets.train.x[train_idx, :]
poisoned_image = iter_f['poisoned_X_train_image']

fig, axs = plt.subplots(1, 3, figsize=(15, 6))

# Perturbation re-centered on 0.5 and scaled by 255 so that it is visible.
diff = 0.5 - (255 * (0.5 - (((poisoned_image - clean_image) + 1) / 2)))

# Left to right: clean image, amplified difference, poisoned image. The
# (x + 1) / 2 shift is applied to both images before display.
panels = (
    (np.reshape(clean_image, [299, 299, 3]) + 1) / 2,
    np.reshape(diff, [299, 299, 3]),
    (np.reshape(poisoned_image, [299, 299, 3]) + 1) / 2,
)
for ax, panel in zip(axs, panels):
    ax.imshow(panel, interpolation='none')
    ax.axis('off')

# Largest absolute pixel change (scaled by 255 / 2) and the poisoned image's range.
print(np.max(np.abs(clean_image - poisoned_image)) * 255 / 2)
print(np.max(poisoned_image))
print(np.min(poisoned_image))
# plt.savefig("figs/attack-before-after.png", dpi=300, bbox_inches='tight')
In [15]:
### Run the original model to make sure that it gets stuff correct
dataset_name = 'dogfish_koda'

# np.load on an .npz returns a lazy NpzFile that keeps the archive open until it
# is closed. The arrays are read inside the `with` blocks, so each file handle is
# released as soon as its DataSet has been built.
with np.load('output/%s_inception_features_new_train.npz' % dataset_name) as train_f:
    train = DataSet(train_f['inception_features_val'], train_f['labels'])
with np.load('output/%s_inception_features_new_test.npz' % dataset_name) as test_f:
    test = DataSet(test_f['inception_features_val'], test_f['labels'])

validation = None  # no validation split is passed to the model
data_sets = base.Datasets(train=train, validation=validation, test=test)

# Keep an independent copy of the training features, plus aliases for the labels.
orig_X_train = np.copy(data_sets.train.x)
Y_train = data_sets.train.labels
Y_test = data_sets.test.labels

# Hyperparameters for the logistic-regression model fitted on the feature vectors.
input_dim = 2048
weight_decay = 0.001
batch_size = 30
initial_learning_rate = 0.001
keep_probs = None
decay_epochs = [1000, 10000]
max_lbfgs_iter = 1000
num_classes = 2

# Clear TensorFlow's default graph before the model is constructed.
tf.reset_default_graph()

model = BinaryLogisticRegressionWithLBFGS(
    input_dim=input_dim,
    weight_decay=weight_decay,
    max_lbfgs_iter=max_lbfgs_iter,
    num_classes=num_classes,
    batch_size=batch_size,
    data_sets=data_sets,
    initial_learning_rate=initial_learning_rate,
    keep_probs=keep_probs,
    decay_epochs=decay_epochs,
    mini_batch=False,
    train_dir='output_ipynb',
    log_dir='log',
    model_name='%s_inception_onlytop' % dataset_name)

model.train()

# Snapshot the fitted weights and the model outputs on the full train and test feeds.
weights = model.sess.run(model.weights)
orig_Y_train_pred = model.sess.run(model.preds, feed_dict=model.all_train_feed_dict)
orig_Y_pred = model.sess.run(model.preds, feed_dict=model.all_test_feed_dict)
In [16]:
indices_to_poison = iter_f['indices_to_poison']
print('Original training predictions on the poisoned indices:')
print(orig_Y_train_pred[indices_to_poison])

# For every training point, pick the prediction in the column of its own label.
# One (row, column) pair per example replaces the explicit Python loop; the
# float64 cast keeps the dtype that the np.zeros buffer used to give.
num_train = orig_Y_train_pred.shape[0]
true_label_columns = Y_train.astype(int)
orig_Y_train_correct_preds = orig_Y_train_pred[
    np.arange(num_train), true_label_columns].astype(np.float64)
In [17]:
# Display orig_Y_train_correct_preds in ascending order (np.sort returns a
# sorted copy; the array itself is left unchanged).
np.sort(orig_Y_train_correct_preds)
Out[17]:
In [18]:
# Then swap in the feature representation that's stored
poisoned_X_train_subset = iter_f['poisoned_X_train_inception_features']
# The labels saved with the attack must match the labels at the poisoned indices.
assert np.all(Y_train[indices_to_poison] == iter_f['Y_train'])
print('Poisoning train_idx %s' % indices_to_poison)

# Edit a copy of the training features, then hand it to the model and refit.
poisoned_X_train = np.copy(data_sets.train.x)
poisoned_X_train[indices_to_poison, :] = poisoned_X_train_subset

model.update_train_x(poisoned_X_train)
model.train()

print('New training predictions on the poisoned indices:')
modified_Y_train_pred = model.sess.run(model.preds, feed_dict=model.all_train_feed_dict)
print(modified_Y_train_pred[indices_to_poison])

# Count the training points whose prediction for their own label is >= 0.5.
true_label_columns = Y_train.astype(int)
own_label_preds = modified_Y_train_pred[
    np.arange(modified_Y_train_pred.shape[0]), true_label_columns]
num_correct = int(np.sum(own_label_preds >= 0.5))
print(num_correct)
In [30]:
# Just to be extra sure, re-create the data set
poisoned_train = DataSet(poisoned_X_train, Y_train)
poisoned_data_sets = base.Datasets(
    train=poisoned_train,
    validation=validation,
    test=test)

# Clear TensorFlow's default graph before the new model is constructed.
tf.reset_default_graph()

# Same settings as the earlier logistic-regression model; only the data differs.
poisoned_model_kwargs = dict(
    data_sets=poisoned_data_sets,
    model_name='%s_inception_onlytop' % dataset_name,
    train_dir='output_ipynb',
    log_dir='log',
    input_dim=input_dim,
    num_classes=num_classes,
    weight_decay=weight_decay,
    max_lbfgs_iter=max_lbfgs_iter,
    batch_size=batch_size,
    initial_learning_rate=initial_learning_rate,
    decay_epochs=decay_epochs,
    keep_probs=keep_probs,
    mini_batch=False,
)
model = BinaryLogisticRegressionWithLBFGS(**poisoned_model_kwargs)
model.train()
In [19]:
### Now check if we generated the Inception features correctly
img_side, num_channels = 299, 3

# Clear TensorFlow's default graph, then reload the raw images (same order as
# before: graph reset first, image loading second).
tf.reset_default_graph()
image_data_sets = load_dogfish_with_koda()

full_model_name = '%s_inception_wd-%s' % (dataset_name, weight_decay)
full_model = BinaryInceptionModel(
    model_name=full_model_name,
    data_sets=image_data_sets,
    train_dir='output_ipynb',
    log_dir='log',
    img_side=img_side,
    num_channels=num_channels,
    num_classes=num_classes,
    weight_decay=weight_decay,
    batch_size=batch_size,
    initial_learning_rate=initial_learning_rate,
    decay_epochs=decay_epochs,
    keep_probs=keep_probs,
    mini_batch=True)
full_model.train()
In [20]:
# Recompute features for the poisoned image with the full model so they can be
# compared with the features saved in iter_f.
# NOTE(review): presumably this runs the image through full_model's Inception
# layers; generate_inception_features is defined in influence.dataset_poisoning - confirm there.
poisoned_X_train_from_image = generate_inception_features(
full_model,
iter_f['poisoned_X_train_image'],
iter_f['Y_train'])
In [21]:
# True only if every recomputed feature equals the stored one exactly; the
# comparison is element-wise `==` with no floating-point tolerance.
np.all(poisoned_X_train_from_image == iter_f['poisoned_X_train_inception_features'])
Out[21]:
In [34]:
### Finally, add the poisoned image and retrain the full model as a triple-check
# Work on a copy (as is done for the feature matrix earlier). Assigning
# image_data_sets.train.x directly only creates an alias, so writing the
# poisoned pixels through it would overwrite the clean image in the loaded
# data set and make later clean-vs-poisoned comparisons meaningless.
# The copy temporarily doubles the memory held by the training images.
poisoned_image_X_train = np.copy(image_data_sets.train.x)
image_Y_train = image_data_sets.train.labels
indices_to_poison = iter_f['indices_to_poison']

# The labels saved with the attack must match the labels at the poisoned indices.
assert np.all(image_Y_train[indices_to_poison] == iter_f['Y_train'])
poisoned_image_X_train[indices_to_poison, :] = iter_f['poisoned_X_train_image']

full_model.update_train_x(poisoned_image_X_train)
full_model.train()
In [53]:
# Reload the raw images and the saved feature files so the distance analysis
# below starts from freshly loaded data.
image_data_sets = load_dogfish_with_koda()
dataset_name = 'dogfish_koda'

# np.load on an .npz returns a lazy NpzFile that keeps the archive open until it
# is closed. The arrays are read inside the `with` blocks, so each file handle is
# released as soon as its DataSet has been built.
with np.load('output/%s_inception_features_new_train.npz' % dataset_name) as train_f:
    train = DataSet(train_f['inception_features_val'], train_f['labels'])
with np.load('output/%s_inception_features_new_test.npz' % dataset_name) as test_f:
    test = DataSet(test_f['inception_features_val'], test_f['labels'])

validation = None  # no validation split is used
data_sets = base.Datasets(train=train, validation=validation, test=test)
In [54]:
# Scalar index of the poisoned training example (first entry of indices_to_poison).
idx_to_poison = indices_to_poison[0]
In [55]:
# Display the training label stored for the poisoned example.
data_sets.train.labels[idx_to_poison]
Out[55]:
In [70]:
# Split the training features by label (0 -> "dog", 1 -> "fish", following the
# centroid names) and take the per-class mean vector.
dog_features = data_sets.train.x[data_sets.train.labels == 0, :]
fish_features = data_sets.train.x[data_sets.train.labels == 1, :]
true_dog_centroid = np.mean(dog_features, axis=0)
true_fish_centroid = np.mean(fish_features, axis=0)

# Overlay, per class, the distribution of Euclidean distances to the own centroid.
dog_dist_to_centroid = np.linalg.norm(dog_features - true_dog_centroid, axis=1)
fish_dist_to_centroid = np.linalg.norm(fish_features - true_fish_centroid, axis=1)
sns.distplot(dog_dist_to_centroid)
sns.distplot(fish_dist_to_centroid)
Out[70]:
In [71]:
# Mean Euclidean distance from the label-1 training features to true_fish_centroid.
np.mean(np.linalg.norm(data_sets.train.x[data_sets.train.labels == 1, :] - true_fish_centroid, axis=1))
Out[71]:
In [59]:
# Euclidean distance from the stored (data_sets) features of the poisoned
# example to true_dog_centroid.
np.linalg.norm(data_sets.train.x[idx_to_poison, :] - true_dog_centroid)
Out[59]:
In [60]:
# Euclidean distance from the poisoned features saved in iter_f to true_dog_centroid.
np.linalg.norm(iter_f['poisoned_X_train_inception_features'] - true_dog_centroid)
Out[60]:
In [61]:
# Euclidean distance from the stored (data_sets) features of the poisoned
# example to true_fish_centroid.
np.linalg.norm(data_sets.train.x[idx_to_poison, :] - true_fish_centroid)
Out[61]:
In [62]:
# Euclidean distance from the poisoned features saved in iter_f to true_fish_centroid.
np.linalg.norm(iter_f['poisoned_X_train_inception_features'] - true_fish_centroid)
Out[62]:
In [63]:
# How far the example moved in feature space: norm of (poisoned features saved
# in iter_f) minus (features stored in data_sets for the same index).
np.linalg.norm(iter_f['poisoned_X_train_inception_features'] - data_sets.train.x[idx_to_poison, :])
Out[63]:
In [64]:
# L_infty distance in pixel space: the largest absolute per-element difference
# between the poisoned image and the image stored at idx_to_poison.
# NOTE(review): the value is in the stored pixel scale, which the plotting code
# treats as roughly [-1, 1] via (x + 1) / 2 - confirm before quoting it in 0-255 units.
np.max(np.abs(iter_f['poisoned_X_train_image'] - image_data_sets.train.x[idx_to_poison, :]))
Out[64]:
In [66]:
# L2 distance in pixel space: norm of the difference between the poisoned image
# and the image stored at idx_to_poison.
np.linalg.norm(iter_f['poisoned_X_train_image'] - image_data_sets.train.x[idx_to_poison, :])
Out[66]:
In [69]:
# Same per-class centroid analysis as for the features, but on the raw pixel vectors.
dog_images = image_data_sets.train.x[image_data_sets.train.labels == 0, :]
fish_images = image_data_sets.train.x[image_data_sets.train.labels == 1, :]
true_dog_pixel_centroid = np.mean(dog_images, axis=0)
true_fish_pixel_centroid = np.mean(fish_images, axis=0)

# Overlay, per class, the distribution of Euclidean distances to the own pixel centroid.
dog_pixel_dist = np.linalg.norm(dog_images - true_dog_pixel_centroid, axis=1)
fish_pixel_dist = np.linalg.norm(fish_images - true_fish_pixel_centroid, axis=1)
sns.distplot(dog_pixel_dist)
sns.distplot(fish_pixel_dist)
Out[69]:
In [85]:
from sklearn.decomposition import PCA

# Label-1 training features, shifted so that each column has zero mean.
X = data_sets.train.x[data_sets.train.labels == 1, :]
X = X - np.mean(X, axis=0)

# Fit a 700-component PCA on the centered rows (fit() returns the estimator).
# NOTE(review): no random_state is passed; if sklearn selects a randomized
# solver for this shape, the fit is not reproducible run to run - confirm.
pca = PCA(n_components=700).fit(X)

# Per-row norm of the projection onto the fitted components.
X_in_PC = np.linalg.norm(pca.transform(X), axis=1)
# NOTE(review): this is a difference of two norms (full norm minus projected
# norm), not the norm of the residual vector - confirm that is the intended quantity.
X_full_norm = np.linalg.norm(X, axis=1)
X_out_PC = X_full_norm - X_in_PC
In [86]:
# Distribution of the per-row projected norms (X_in_PC).
sns.distplot(X_in_PC)
Out[86]:
In [87]:
# Distribution of the per-row "full norm minus projected norm" values (X_out_PC).
sns.distplot(X_out_PC)
Out[87]:
In [88]:
# Locate the poisoned feature vector relative to the PCA subspace, using the
# same two quantities as X_in_PC / X_out_PC.
#
# The PCA was fit on label-1 features *after* their mean had been subtracted by
# hand, so pca.mean_ is ~0 and pca.transform() does not center its input.
# The poisoned features therefore have to be shifted by that same class mean
# first; otherwise the printed numbers are not comparable with the
# X_in_PC / X_out_PC distributions.
poisoned_X = iter_f['poisoned_X_train_inception_features']
fish_feature_mean = np.mean(data_sets.train.x[data_sets.train.labels == 1, :], axis=0)
poisoned_X_centered = poisoned_X - fish_feature_mean

poisoned_in_PC = np.linalg.norm(pca.transform(poisoned_X_centered))
print(poisoned_in_PC)
print(np.linalg.norm(poisoned_X_centered) - poisoned_in_PC)