In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from __future__ import division
from __future__ import print_function
from __future__ import absolute_import
from __future__ import unicode_literals
import numpy as np
import pandas as pd
from sklearn import linear_model, preprocessing, cluster
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.linalg as slin
import scipy.sparse.linalg as sparselin
import scipy.sparse as sparse
import IPython
import tensorflow as tf
from inceptionModel import BinaryInceptionModel
from logisticRegressionWithLBFGS import LogisticRegressionWithLBFGS
from binaryLogisticRegressionWithLBFGS import BinaryLogisticRegressionWithLBFGS
from load_animals import load_animals
import experiments
from image_utils import plot_flat_bwimage, plot_flat_bwgrad, plot_flat_colorimage, plot_flat_colorgrad
from dataset import DataSet
from tensorflow.contrib.learn.python.learn.datasets import base
sns.set(color_codes=True)
In [2]:
def reverse_preprocess(x):
    """Invert Inception-style preprocessing: map values from [-1, 1] back to [0, 1].

    Fixed: the original used in-place `/=` and `+=`, which silently mutated the
    caller's numpy array (callers had to defensively `np.copy` first). This
    version is pure and returns a new value; the returned result is unchanged.
    """
    return x / 2.0 + 0.5
In [4]:
# Dataset size configuration; model_name keys the cached feature files loaded below.
num_train_ex_per_class = 900
num_test_ex_per_class = 300
model_name = 'animals_%s_%s' % (num_train_ex_per_class, num_test_ex_per_class)
# image_data_sets = load_animals(num_train_ex_per_class=num_train_ex_per_class, num_test_ex_per_class=num_test_ex_per_class)
# NOTE(review): the load_animals call above is commented out, but many later
# cells reference image_data_sets -- a fresh Restart & Run All will fail on it.
In [10]:
# Load precomputed Inception features (2048-dim, per input_dim below) for
# train/test from absolute scratch paths.
# NOTE(review): hard-coded absolute path -- consider a configurable DATA_DIR.
train_f = np.load('/srv/scratch/pangwei/influence_data/%s_inception_features_train.npz' % model_name)
train = DataSet(train_f['inception_features_val'], train_f['labels'])
test_f = np.load('/srv/scratch/pangwei/influence_data/%s_inception_features_test.npz' % model_name)
test = DataSet(test_f['inception_features_val'], test_f['labels'])
validation = None
data_sets = base.Datasets(train=train, validation=validation, test=test)
# Hyperparameters for the top-layer logistic regression trained on the features.
input_dim = 2048
weight_decay = 0.01
batch_size = 900
initial_learning_rate = 0.001
keep_probs = None
decay_epochs = [1000, 10000]
max_lbfgs_iter = 1000
num_classes = 10
tf.reset_default_graph()
model = LogisticRegressionWithLBFGS(
input_dim=input_dim,
weight_decay=weight_decay,
max_lbfgs_iter=max_lbfgs_iter,
num_classes=num_classes,
batch_size=batch_size,
data_sets=data_sets,
initial_learning_rate=initial_learning_rate,
keep_probs=keep_probs,
decay_epochs=decay_epochs,
mini_batch=False,
train_dir='data',
log_dir='log',
model_name='animals_inception_onlytop_poisoned')
model.train()
# Save the learned weights and cache test-set predictions for the cells below.
weights = model.sess.run(model.weights)
np.save('data/inception_weights_%s' % model_name, weights)
Y_pred = model.sess.run(model.preds, feed_dict=model.all_test_feed_dict)
Y_test = model.data_sets.test.labels
In [9]:
# Inspect the feature-file labels (output rendered by the notebook).
model.data_sets.test.labels
Out[9]:
In [10]:
image_data_sets.test.labels
Out[10]:
In [12]:
# Sanity check: feature-file labels must line up with the raw image dataset.
assert np.all(model.data_sets.test.labels == image_data_sets.test.labels)
assert np.all(model.data_sets.train.labels == image_data_sets.train.labels)
In [19]:
# Browse test examples 200-400 where the model is NOT already confident
# (max softmax <= 0.99), to shortlist candidate attack targets.
for test_idx in range(200,400):
if np.max(Y_pred[test_idx, :]) > 0.99: continue
# if Y_test[test_idx] != 0: continue
# test_idx = 0
print(test_idx, Y_pred[test_idx, Y_test[test_idx]])
# Images are stored in [-1, 1]; (x + 1) / 2 maps them back to [0, 1] for imshow.
plt.imshow((np.reshape(image_data_sets.test.x[test_idx, :], [299, 299, 3]) + 1) / 2, interpolation='none')
plt.axis('off')
plt.tight_layout()
plt.show()
In [25]:
# Hand-picked target test indices (chosen from the browsing cell above).
test_indices = [11, 41, 54, 66, 85, 88, 100, 125, 173]
In [26]:
# Show the true-class confidence and the image for each chosen target.
for test_idx in test_indices:
print(test_idx, Y_pred[test_idx, Y_test[test_idx]])
plt.imshow((np.reshape(image_data_sets.test.x[test_idx, :], [299, 299, 3]) + 1) / 2, interpolation='none')
plt.axis('off')
plt.tight_layout()
plt.show()
In [14]:
# Test indices: [11, 41, 54, 66, 85, 88, 100, 125, 173]
test_idx = 41
model_string = 'animals_%s_%s_inception' % (num_train_ex_per_class, num_test_ex_per_class)
# train_dict = np.load('/srv/scratch/pangwei/influence_data/animals_900_300_inception_inception_features_poisoned_train_influence_poison-MAI-replace-0.1_testidx_[21].npz')
# Load poisoned training features produced by the maxgrad-linf attack
# (step 0.01, replace mode) targeting test_idx above.
train_dict = np.load(
'/srv/scratch/pangwei/influence_data/%s_inception_features_poisoned_train_influence_poison-maxgrad-linf-replace-0.01_testidx_[%s].npz'\
% (model_string, test_idx))
# train_dict = np.load('/srv/scratch/pangwei/influence_data/%s_features_train.npz' % model_string)
### Normal
# train_dict = np.load('data/%s_features_train.npz' % model_string)
train = DataSet(np.reshape(train_dict['inception_features_val'], [-1, 2048]), train_dict['labels'])
test_dict = np.load('/srv/scratch/pangwei/influence_data/%s_features_test.npz' % model_string)
test = DataSet(np.reshape(test_dict['inception_features_val'], [-1, 2048]), test_dict['labels'])
validation = None
data_sets = base.Datasets(train=train, validation=validation, test=test)
modify_type = 'replace'
# The attack only perturbs features; labels must be untouched.
assert(all(image_data_sets.train.labels == data_sets.train.labels))
assert(all(image_data_sets.test.labels == data_sets.test.labels))
In [15]:
# Retrain the same top-layer model, this time on the poisoned features,
# and cache its test-set predictions as attacked_Y_pred.
tf.reset_default_graph()
model = LogisticRegressionWithLBFGS(
input_dim=input_dim,
weight_decay=weight_decay,
max_lbfgs_iter=max_lbfgs_iter,
num_classes=num_classes,
batch_size=batch_size,
data_sets=data_sets,
initial_learning_rate=initial_learning_rate,
keep_probs=keep_probs,
decay_epochs=decay_epochs,
mini_batch=False,
train_dir='data',
log_dir='log',
model_name='animals_inception_onlytop_poisoned')
model.train()
# Y_test = model.data_sets.test.labels
attacked_Y_pred = model.sess.run(model.preds, feed_dict=model.all_test_feed_dict)
In [16]:
# Compare original vs. post-attack predictions on the target test point.
classes = ['dog', 'cat', 'bird', 'fish', 'horse', 'monkey', 'zebra', 'panda', 'lemur', 'wombat']
print(test_idx)
print('orig correct pred: %s' % Y_pred[test_idx, int(Y_test[test_idx])])
print('attacked correct pred: %s' % attacked_Y_pred[test_idx, int(Y_test[test_idx])])
# Most-confident class of the attacked model -- the label the attack flipped to.
print('new prediction: %s, %s' % (
np.max(attacked_Y_pred[test_idx, :]),
classes[np.argmax(attacked_Y_pred[test_idx, :])]))
print(Y_pred[test_idx])
print(attacked_Y_pred[test_idx])
# plt.subplots(figsize=(3,3))
# plt.imshow((np.reshape(image_data_sets.test.x[test_idx, :], [299, 299, 3]) + 1) / 2, interpolation='none')
# plt.axis('off')
# plt.title('96.2% horse to 99.7% dog')
# plt.tight_layout()
# plt.savefig("figs/attack-horse.png", dpi=300, bbox_inches='tight')
In [ ]:
# Probability the model assigns to the true class of each test example.
# Vectorized replacement for the original per-row loop (same values); cast to
# float64 to match the dtype of the original np.zeros buffer.
Y_pred_correct = Y_pred[
    np.arange(len(Y_test)), np.asarray(Y_test, dtype=int)
].astype(np.float64)
In [50]:
# NOTE(review): these two cells snapshot Y_pred_correct under different kernel
# states (poisoned-model vs. original-model predictions). Which is which
# depends entirely on manual execution order -- a hidden-state hazard.
poisoned_Y_pred_correct = Y_pred_correct
In [46]:
orig_Y_pred_correct = Y_pred_correct
In [69]:
# Overlaid histograms of true-class confidence: original vs. poisoned model.
sns.distplot(orig_Y_pred_correct, kde=False)
sns.distplot(poisoned_Y_pred_correct, kde=False)
Out[69]:
In [68]:
# NOTE(review): this uses the live Y_pred_correct rather than the
# poisoned_Y_pred_correct snapshot -- looks like a stale duplicate of the
# next cell; confirm before trusting its output.
sns.distplot(orig_Y_pred_correct - Y_pred_correct, kde=False)
Out[68]:
In [64]:
# Histogram of the per-example confidence drop caused by the attack.
sns.distplot(orig_Y_pred_correct - poisoned_Y_pred_correct, kde=False)
Out[64]:
In [65]:
# Show the current target test image with its true label.
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx])
# reverse_preprocess maps [-1, 1] back to [0, 1] for display; np.copy guards
# against reverse_preprocess's in-place mutation of the dataset array.
test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)
In [61]:
# Same display, for test example 1.
test_idx = 1
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx, :])
test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)
In [237]:
test_idx = 15
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx])
# Binary cross-entropy of this example. NOTE(review): this formula assumes a
# scalar (binary) Y_pred per example, but the multiclass cells above produce a
# probability vector -- confirm which model's Y_pred this was run against.
log_loss = -np.log(Y_test[test_idx] * Y_pred[test_idx] + (1 - Y_test[test_idx]) * (1 - Y_pred[test_idx]))
print('Log loss: %s' % log_loss)
test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)
In [241]:
# Same per-example display + binary log loss, for test example 33.
test_idx = 33
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx])
# NOTE(review): binary log-loss formula -- assumes scalar Y_pred; see the
# identical caveat on the previous cell.
log_loss = -np.log(Y_test[test_idx] * Y_pred[test_idx] + (1 - Y_test[test_idx]) * (1 - Y_pred[test_idx]))
print('Log loss: %s' % log_loss)
test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)
In [242]:
# Pick the test example at a given rank of predicted score and display it.
sort_idx = 484
np.sort(Y_pred)[sort_idx]
# NOTE(review): np.argsort on a 2-D Y_pred sorts along the last axis, so this
# cell only makes sense for a 1-D (binary-model) Y_pred -- confirm.
test_idx = np.argsort(Y_pred)[sort_idx]
print(test_idx)
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx])
log_loss = -np.log(Y_test[test_idx] * Y_pred[test_idx] + (1 - Y_test[test_idx]) * (1 - Y_pred[test_idx]))
print('Log loss: %s' % log_loss)
test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)
In [73]:
# Same display, for test example 45.
test_idx = 45
print('True: %s' % Y_test[test_idx])
print('Predicted: %s' % Y_pred[test_idx])
test_image = reverse_preprocess(np.copy(image_data_sets.test.x[test_idx, :]))
test_label = image_data_sets.test.labels[test_idx]
plot_flat_colorimage(test_image, test_label, side=299)
In [78]:
# Scratch inspection cells: values left over from earlier experimentation.
np.min(image_data_sets.train.x)
Out[78]:
In [79]:
# NOTE(review): `centers` here relies on kernel state from a cell defined
# LATER in this notebook (or a deleted one) -- undefined on a fresh run.
centers
Out[79]:
In [80]:
label_indices
Out[80]:
In [81]:
# NOTE(review): `distances` is never assigned anywhere in this notebook --
# hidden state from a deleted cell; this line fails on a fresh run.
distances.shape
Out[81]:
In [16]:
# Compute average l2 distance of each training example to their
# cluster center
# NOTE(review): assumes a binary (two-class) dataset here, unlike the
# 10-class cells above -- confirm which image_data_sets was loaded.
dim = image_data_sets.train.x.shape[1]
centers = np.zeros([2, dim])
avg_dist = np.zeros([2])
for label in [0, 1]:
label_indices = (image_data_sets.train.labels == label)
centers[label, :] = np.mean(image_data_sets.train.x[label_indices, :], axis=0)
dists = image_data_sets.train.x[label_indices, :] - centers[label]
avg_dist[label] = np.mean(np.linalg.norm(dists, axis=1))
avg_dist
Out[16]:
In [17]:
# Compute l2 distance of a perturbed example to the original example,
# assuming every one of the `dim` pixels is shifted by (2.0 / 255).
# Fixed: the original np.sqrt(dim * 2.0 / 255) omitted the square of the
# per-pixel step; the l2 norm of a constant-(2/255) vector of length dim is
# sqrt(dim * (2/255)**2) = sqrt(dim) * (2/255).
perturbed_distance = np.sqrt(dim) * (2.0 / 255)
perturbed_distance
Out[17]:
In [82]:
### Feature space
# Per-class centers and distance statistics in Inception feature space
# (clean training features).
normal_train_dict = np.load('data/%s_features_train.npz' % model_string)
X = normal_train_dict['inception_features_val']
Y = normal_train_dict['labels']
dim = X.shape[1]
centers = np.zeros([2, dim])
avg_dist = np.zeros([2])
for label in [0, 1]:
label_indices = (Y == label)
centers[label, :] = np.mean(X[label_indices, :], axis=0)
dists = X[label_indices, :] - centers[label]
norms = np.linalg.norm(dists, axis=1)
# NOTE(review): recomputes the norms already held in `norms` -- redundant.
avg_dist[label] = np.mean(np.linalg.norm(dists, axis=1))
sns.distplot(norms)
avg_dist
Out[82]:
In [83]:
# Load matched clean / poisoned feature sets for a direct comparison.
normal_train_dict = np.load('data/%s_features_train.npz' % model_string)
X = normal_train_dict['inception_features_val']
Y = normal_train_dict['labels']
# NOTE(review): hard-coded poisoned-file path (test_idx 825, 0.05 step) --
# verify it matches the experiment currently under analysis.
poisoned_train_dict = np.load('data/normal_dog_2000_1000_inception_inception_features_poisoned_train_influence_poison-maxgrad-linf-replace-0.05_testidx_825.npz')
X_poison = poisoned_train_dict['inception_features_val']
Y_poison = poisoned_train_dict['labels']
In [84]:
# Identify which training rows the attack actually modified, and measure the
# average feature-space displacement of those rows.
assert all(Y == Y_poison)
# Indices (integer positions) of rows where any feature differs.
modified_idx = np.where(~np.all(X == X_poison, axis=1))[0]
dists = X_poison[modified_idx, :] - X[modified_idx, :]
np.mean(np.linalg.norm(dists, axis=1))
Out[84]:
In [85]:
# Distances of poisoned training points to the (poisoned) per-class centers,
# split into rows the attack left alone vs. rows it modified.
for label in [0, 1]:
    label_indices = (Y_poison == label)
    centers[label, :] = np.mean(X_poison[label_indices, :], axis=0)
    dists = X_poison[label_indices, :] - centers[label]
    norms = np.linalg.norm(dists, axis=1)
    avg_dist[label] = np.mean(np.linalg.norm(dists, axis=1))

# Fixed: modified_idx holds integer positions (from np.where), so the original
# `~modified_idx` computed the bitwise NOT (-i - 1), i.e. it indexed from the
# END of the array rather than taking the set complement. Build a boolean
# complement mask instead.
unmodified_mask = np.ones(len(Y_poison), dtype=bool)
unmodified_mask[modified_idx] = False
X_unmodified = X_poison[unmodified_mask, :]
Y_unmodified = Y_poison[unmodified_mask]
for label in [0, 1]:
    label_indices = (Y_unmodified == label)
    dists = X_unmodified[label_indices, :] - centers[label]
    avg_dist[label] = np.mean(np.linalg.norm(dists, axis=1))
    norms = np.linalg.norm(dists, axis=1)
    sns.distplot(norms)

X_modified = X_poison[modified_idx, :]
Y_modified = Y_poison[modified_idx]
for label in [0, 1]:
    label_indices = (Y_modified == label)
    dists = X_modified[label_indices, :] - centers[label]
    # NOTE(review): avg_dist is overwritten by each of the three loops above;
    # only the last loop's values survive -- preserved from the original.
    avg_dist[label] = np.mean(np.linalg.norm(dists, axis=1))
    norms = np.linalg.norm(dists, axis=1)
    sns.distplot(norms)
avg_dist
Out[85]:
In [87]:
# Distance between the two class centers in feature space.
np.linalg.norm(centers[0, :] - centers[1, :])
Out[87]:
In [13]:
# Load a saved (train, validation, test) tuple of poisoned DataSets objects.
f = np.load('data/skewed_dog_inception_poisoned_data_sets_testidx_403.npy')
In [19]:
# Unpack the saved (train, validation, test) tuple and rebuild the Datasets.
poisoned_train = f[0]
validation = f[1]
test = f[2]
# Fixed: the original line contained a duplicated
# `poisoned_data_sets = poisoned_data_sets = ...` assignment.
poisoned_data_sets = base.Datasets(train=poisoned_train, validation=validation, test=test)
# Poisoning must leave the test set and all labels untouched...
assert np.all(poisoned_data_sets.test.x == image_data_sets.test.x)
assert all(poisoned_data_sets.test.labels == image_data_sets.test.labels)
assert all(poisoned_data_sets.train.labels == image_data_sets.train.labels)
# ...and perturb each pixel by at most 1 step on the [0, 255] scale
# (pixels are stored in [-1, 1], hence the * 255 / 2 rescaling).
assert (np.max(np.abs(poisoned_data_sets.train.x - image_data_sets.train.x)) * 255 / 2) <= 1.0001
In [18]:
# Range check on the poisoned pixels (stored in [-1, 1]).
np.min(poisoned_data_sets.train.x)
Out[18]:
In [16]:
# Make sure that the poisoned data sets are at most 1 different from the orig data sets in each pixel
np.max(np.abs(poisoned_data_sets.train.x - image_data_sets.train.x)) * 255 / 2
Out[16]:
In [24]:
# Side-by-side: original vs. poisoned version of one training image.
train_idx = 1225
# np.copy guards against reverse_preprocess's in-place mutation.
orig_train_image = reverse_preprocess(np.copy(image_data_sets.train.x[train_idx, :]))
orig_train_label = image_data_sets.train.labels[train_idx]
plot_flat_colorimage(orig_train_image, orig_train_label, side=299)
poisoned_train_image = reverse_preprocess(np.copy(poisoned_data_sets.train.x[train_idx, :]))
poisoned_train_label = poisoned_data_sets.train.labels[train_idx]
plot_flat_colorimage(poisoned_train_image, poisoned_train_label, side=299)
In [34]:
# Inspect the raw perturbation (poisoned minus original) pixel-by-pixel.
diff = poisoned_train_image - orig_train_image
diff < 0
Out[34]:
In [36]:
diff
Out[36]:
In [39]:
poisoned_train_image.shape
Out[39]:
In [54]:
# Visualize only the positive part of the perturbation, amplified 10x.
diff = poisoned_train_image - orig_train_image
diff[diff < 0] = 0
# NOTE(review): reshaped_diff is only used by the commented-out print below.
reshaped_diff = np.reshape(diff, [299, 299, 3])
# print(reshaped_diff[:10, :10, :])
plot_flat_colorgrad(diff * 10, side=299)
In [88]:
# Train normal model
# --- Train the binary top-layer model on clean (normal) features ---
train_dict = np.load('data/skewed_dog_inception_morereg_inception_features_train.npz')
train = DataSet(np.reshape(train_dict['inception_features_val'], [-1, 2048]), train_dict['labels'])
test_dict = np.load('data/skewed_dog_inception_morereg_inception_features_test.npz')
test = DataSet(np.reshape(test_dict['inception_features_val'], [-1, 2048]), test_dict['labels'])
validation = None
data_sets = base.Datasets(train=train, validation=validation, test=test)
assert(all(image_data_sets.train.labels == data_sets.train.labels))
assert(all(image_data_sets.test.labels == data_sets.test.labels))
tf.reset_default_graph()
# NOTE(review): num_classes is still 10 from the multiclass cells above, yet
# this is a Binary model -- confirm the class ignores or overrides it.
model = BinaryLogisticRegressionWithLBFGS(
input_dim=input_dim,
weight_decay=weight_decay,
max_lbfgs_iter=max_lbfgs_iter,
num_classes=num_classes,
batch_size=batch_size,
data_sets=data_sets,
initial_learning_rate=initial_learning_rate,
keep_probs=keep_probs,
decay_epochs=decay_epochs,
mini_batch=False,
train_dir='data',
log_dir='log',
model_name='skewed_dog_inception_onlytop_poisoned')
model.train()
normal_weights = model.sess.run(model.params)[0]
# Train poisoned model
test_idx = 403
train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_influence_testidx_%s.npz' % test_idx)
train = DataSet(np.reshape(train_dict['inception_features_val'], [-1, 2048]), train_dict['labels'])
test_dict = np.load('data/skewed_dog_inception_morereg_inception_features_test.npz')
test = DataSet(np.reshape(test_dict['inception_features_val'], [-1, 2048]), test_dict['labels'])
validation = None
data_sets = base.Datasets(train=train, validation=validation, test=test)
assert(all(image_data_sets.train.labels == data_sets.train.labels))
assert(all(image_data_sets.test.labels == data_sets.test.labels))
tf.reset_default_graph()
model = BinaryLogisticRegressionWithLBFGS(
input_dim=input_dim,
weight_decay=weight_decay,
max_lbfgs_iter=max_lbfgs_iter,
num_classes=num_classes,
batch_size=batch_size,
data_sets=data_sets,
initial_learning_rate=initial_learning_rate,
keep_probs=keep_probs,
decay_epochs=decay_epochs,
mini_batch=False,
train_dir='data',
log_dir='log',
model_name='skewed_dog_inception_onlytop_poisoned')
model.train()
# params[0] is the trained model's weight tensor, fetched as a numpy array.
poisoned_weights = model.sess.run(model.params)[0]
In [89]:
# Scatter of per-feature weights: clean model vs. poisoned model.
plt.scatter(normal_weights, poisoned_weights)
Out[89]:
In [90]:
poisoned_weights.shape
Out[90]:
In [79]:
# Normal
normal_train_dict = np.load('data/skewed_dog_inception_morereg_inception_features_train.npz')
# Poisoned
# NOTE(review): several alternative attack outputs (random / traingrad /
# mirror) are kept commented out below for quick swapping.
test_idx = 10
poisoned_train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_influence_testidx_%s.npz' % test_idx)
# poisoned_train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_random_testidx_None.npz')
# poisoned_train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_traingrad_testidx_None.npz')
# poisoned_train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_mirror_testidx_%s.npz' % test_idx)
# poisoned_train_dict = np.load('data/skewed_dog_inception_inception_features_poisoned_train_testidx_None.npz')
X_train_normal = normal_train_dict['inception_features_val']
X_train_poisoned = poisoned_train_dict['inception_features_val']
Y_train_normal = normal_train_dict['labels']
Y_train_poisoned = poisoned_train_dict['labels']
# The attack must not change any labels.
assert(all(Y_train_normal == Y_train_poisoned))
# np.reshape(train_dict['inception_features_val'], [-1, 2048]
# np.reshape(train_dict['inception_features_val'], [-1, 2048]
In [80]:
# Per-example feature norms: the perturbation, and both feature sets.
diff = X_train_normal - X_train_poisoned
diff_norm = np.linalg.norm(diff, axis=1)
normal_norm = np.linalg.norm(X_train_normal, axis=1)
poisoned_norm = np.linalg.norm(X_train_poisoned, axis=1)
In [87]:
# Scatter of per-example feature norms; equal axes so y = x is the diagonal.
plt.scatter(normal_norm, poisoned_norm)
plt.xlim(15, 55)
plt.ylim(15, 55)
Out[87]:
In [82]:
# Overlaid feature-norm distributions (legend encoded in the comments below).
sns.distplot(diff_norm)
sns.distplot(normal_norm)
sns.distplot(poisoned_norm)
# Blue: Differences
# Green: Original
# Red: Poisoned
Out[82]:
In [ ]: