In [ ]:
%matplotlib inline
import sklearn
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as pltimg
import os
import re
import pickle
In [ ]:
# Map (person_name, photo_number) -> image path relative to the dataset root.
# The photo number is the first 4-digit run in the file name
# (e.g. Mehdi_Ghanimifard_0001.jpg -> 1).
dataset = {}
for person_dir in os.listdir("dataset/lfw/"):
    for image_file in os.listdir("dataset/lfw/" + person_dir):
        photo_no = int(re.findall(r"\d\d\d\d", image_file)[0])
        dataset[(person_dir, photo_no)] = "lfw/%s/%s" % (person_dir, image_file)
# Example entry:
#   dataset[('Mehdi_Ghanimifard', 1)] == 'lfw/Mehdi_Ghanimifard/Mehdi_Ghanimifard_0001.jpg'
In [ ]:
# Load the precomputed (aligned) face embeddings.  encoding='latin1' lets
# Python 3 unpickle byte strings from an older pickle — presumably written
# under Python 2; confirm against the embedding pipeline.
with open('dataset/embeddings/embeddings_align', 'rb') as emb_file:
    embeddings = pickle.load(emb_file, encoding='latin1')
In [ ]:
# Image paths; indexed in parallel with the embedding rows (the loops below
# use embeddings[index] for imgpaths[index]).
with open('dataset/embeddings/imgpaths', 'rb') as paths_file:
    imgpaths = pickle.load(paths_file)
In [ ]:
import cv2

# Collect the embeddings for two identities and display each matching image.
# The two branches had identical bodies; merged into one path — a single
# file path belongs to at most one person directory, so if/elif is safe.
a = []  # embeddings for Abdullah_al-Attiyah
b = []  # embeddings for Nicole_Kidman
for index, path in enumerate(imgpaths):
    if 'Abdullah_al-Attiyah' in path:
        bucket = a
    elif 'Nicole_Kidman' in path:
        bucket = b
    else:
        continue
    bucket.append(embeddings[index])
    # OpenCV loads BGR; convert so matplotlib shows true colors.
    img = cv2.cvtColor(cv2.imread("dataset/" + path), cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.show()
In [ ]:
# Number of images collected for each identity (notebook displays the tuple).
len(a), len(b)
In [ ]:
# Similarity between two embeddings of the SAME person; the dot product acts
# as cosine similarity if embeddings are L2-normalised — TODO confirm.
np.dot(a[0], a[1])
In [ ]:
# Same-person similarity for the second identity.
np.dot(b[0], b[1])
In [ ]:
# Cross-person similarities: each embedding of person `a` against the first
# embedding of person `b`.  The original cell was a SyntaxError (stray
# expression lines and an unmatched ')'); reconstructed as a print loop.
for ai in a:
    print(np.dot(ai, b[0]))
In [ ]:
# Load the other embedding set ("tricky") — presumably the non-aligned
# counterpart of embeddings_align; verify against the embedding pipeline.
with open('dataset/embeddings/embeddings', 'rb') as emb_file:
    embeddings_tricky = pickle.load(emb_file, encoding='latin1')
In [ ]:
# Collect the two identities' "tricky" embeddings and show each image.
# The two branches had identical bodies; merged — a path matches at most
# one of the two names, so if/elif preserves behavior.
a_tricky = []  # Abdullah_al-Attiyah
b_tricky = []  # Nicole_Kidman
for index, path in enumerate(imgpaths):
    if 'Abdullah_al-Attiyah' in path:
        bucket = a_tricky
    elif 'Nicole_Kidman' in path:
        bucket = b_tricky
    else:
        continue
    bucket.append(embeddings_tricky[index])
    # OpenCV loads BGR; convert for correct matplotlib colors.
    img = cv2.cvtColor(cv2.imread("dataset/" + path), cv2.COLOR_BGR2RGB)
    plt.imshow(img)
    plt.show()
In [ ]:
# Same-person similarity using the "tricky" embeddings, for comparison.
np.dot(a_tricky[0], a_tricky[1])
In [ ]:
# L1 norm of one embedding — quick magnitude sanity check.
np.sum(np.abs(a[0]))
In [ ]:
# Sanity check: peek at one dataset entry and its embedding row.
for (name, num), address in dataset.items():
    print((name, num), address)
    print(imgpaths.index(address))
    # NOTE(review): `num` is the photo number, not an embedding row index;
    # embeddings[imgpaths.index(address)] is probably what was meant here.
    print(embeddings[num])
    break

# Read the attribute table: line index 1 holds the column names, data rows
# start at index 2.  dataset_att maps image path -> attribute value vector.
# NOTE(review): names come from items[3:] but values from items[2:]; verify
# that attributes[i] lines up with dataset_att[...][i].
dataset_att = dict()
with open('dataset/lfw_attributes.txt') as att_file:
    for index, line in enumerate(att_file):
        items = line.split('\t')
        if index == 1:
            attributes = items[3:]
        if index > 1:
            # np.float was removed from NumPy (1.24+); builtin float is the
            # documented replacement and is behaviorally identical here.
            dataset_att[dataset[(items[0].replace(' ', '_'), int(items[1]))]] = (
                np.array(items[2:]).astype(float)
            )
In [ ]:
# Show the first (path, attribute-vector) pair and the vector's length.
print(list(dataset_att.items())[0], len(list(dataset_att.items())[0][1]))
In [ ]:
# Attribute column names parsed from lfw_attributes.txt.
attributes
In [ ]:
# Coverage check: images with attribute annotations vs. all images on disk.
len(dataset_att), len(dataset)
In [ ]:
# Images the embedding pipeline skipped, rewritten to the same relative form
# as the dataset values, e.g. lfw/Abdoulaye_Wade/Abdoulaye_Wade_0003.jpg.
# The slice drops a fixed 9-character prefix from each line — TODO confirm
# against the actual format of skippedimg.txt.
with open("dataset/embeddings/skippedimg.txt") as skipped_file:
    skippedimg = ["lfw" + line.strip("\n")[9:] for line in skipped_file]

# Only images that both have attributes AND were not skipped can be paired
# with an embedding row.
usedimg = [
    address
    for _, address in dataset.items()
    if address in dataset_att and address not in skippedimg
]
In [ ]:
# Build the feature matrix X (~13000 embedding rows) and the per-attribute
# label lists Y.  Only images that have attribute annotations AND were not
# skipped by the embedding pipeline are used, keeping X and Y aligned.

# Debug: which dataset addresses were skipped by the embedding pipeline.
# BUG FIX: this used to be assigned to `a`, clobbering the embedding list
# `a` built in an earlier cell; renamed to avoid the collision.
skipped_addresses = sorted(
    address for _, address in dataset.items() if address in skippedimg
)
print(len(embeddings))

# One embedding row per usable image.
X = [
    embeddings[imgpaths.index(address)]
    for _, address in dataset.items()
    if address in dataset_att and address not in skippedimg
]

# One binary label list per attribute (73 classifiers): value > 0 -> +1,
# otherwise -1, over the same filtered image set as X.
# NOTE(review): attribute names came from header items[3:] while values came
# from items[2:]; confirm index i matches att_name before trusting labels.
Y = [
    [
        1 if dataset_att[address][i] > 0 else -1
        for _, address in dataset.items()
        if address in dataset_att and address not in skippedimg
    ]
    for i, att_name in enumerate(attributes)
]
In [ ]:
# Shape check: sample count must match label count; peek at one row/label.
len(X), len(Y[0])
print(X[0])
print(Y[0][0])
In [ ]:
# Sequential 80/10/10 split of X into train / dev / test (no shuffling).
split = int(len(X) * 0.8)
X_train = X[:split]
X_2 = X[split:]
nsplit = int(len(X_2) * 0.5)
X_dev = X_2[:nsplit]
X_test = X_2[nsplit:]
print(len(X_train), len(X_dev), len(X_test))

# Apply the identical cut points to every attribute's label list so labels
# stay aligned with the feature rows.
Y_train = []
Y_dev = []
Y_test = []
for attr_labels in Y:
    Y_train.append(attr_labels[:split])
    held_out = attr_labels[split:]
    Y_dev.append(held_out[:nsplit])
    Y_test.append(held_out[nsplit:])
print(len(Y_train[0]), len(Y_dev[0]), len(Y_test[0]))
In [ ]:
### Classification
In [ ]:
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
In [ ]:
# Training: one linear SVM per attribute, fit on the shared X_train, then
# persist the whole classifier list.
classifier_list = []
for labels in Y_train:  # the unused enumerate/attribute pair was dropped
    classifier = svm.LinearSVC(max_iter=2000)
    classifier.fit(X_train, labels)
    classifier_list.append(classifier)
# BUG FIX: pickle.dump(..., open(...)) leaked the file handle; the with
# statement guarantees it is flushed and closed.
with open("classifiers", "wb") as model_file:
    pickle.dump(classifier_list, model_file)
In [ ]:
# Evaluate each per-attribute SVM on the held-out test split.
with open("classifiers", 'rb') as f:
    classifier_list = pickle.load(f)

for i, classifier in enumerate(classifier_list):
    y_guess = classifier.predict(X_test)
    # Label each score block so the output is readable (was commented out,
    # leaving 73 anonymous score blocks).
    print("Attribute: " + attributes[i])
    # BUG FIX: "\%" printed a literal backslash ("\%" is not an escape);
    # plain "%" matches the other metric lines.
    print("Accuracy:" + str(round(accuracy_score(Y_test[i], y_guess) * 100, 2)) + "%")
    print("F-score:" + str(round(f1_score(Y_test[i], y_guess) * 100, 2)) + "%")
    print("Precision:" + str(round(precision_score(Y_test[i], y_guess) * 100, 2)) + "%")
    print("Recall:" + str(round(recall_score(Y_test[i], y_guess) * 100, 2)) + "%")
    print("\n")
In [ ]:
# Shuffle X and Y together (one shared row permutation), then split off the
# first 10% as test and keep the rest for training.
x_total = np.array(X)
y_total = np.array(Y).T  # rows = samples, cols = attributes
shuffle_indices = np.arange(x_total.shape[0])
np.random.shuffle(shuffle_indices)
x_total_1 = x_total[shuffle_indices, :]
y_total_1 = y_total[shuffle_indices, :]
# BUG FIX: the split previously used the UNshuffled x_total/y_total, so the
# permutation computed above was silently discarded; split the shuffled
# copies instead.
x_test, x_train = np.split(x_total_1, [int(x_total_1.shape[0] * .1)])
y_test, y_train = np.split(y_total_1, [int(y_total_1.shape[0] * .1)])
In [ ]:
# Confirm the train/test array shapes after the split.
x_test.shape, x_train.shape, y_test.shape, y_train.shape
In [ ]:
import tensorflow as tf
In [ ]:
# Two-hidden-layer MLP (TensorFlow 1.x graph mode): maps a 128-d face
# embedding to scores for all attributes at once; tanh output matches the
# {-1, 1} label encoding.
tf.reset_default_graph()
feature_size = 128
attribute_size = len(attributes)
face2embeddings = tf.placeholder(dtype=tf.float32, shape=[None, feature_size])
face2attributes = tf.placeholder(dtype=tf.float32, shape=[None, attribute_size])
# layer one: 128 -> 256
# BUG FIX: `features` was never defined (NameError at graph build time);
# feature_size was clearly intended.
hidden_dim_1 = feature_size * 2
weight_1 = tf.Variable(tf.truncated_normal(shape=[feature_size, hidden_dim_1]))
bias_1 = tf.Variable(tf.constant(0.1, shape=[hidden_dim_1]))
layer_1 = tf.nn.tanh(tf.add(tf.matmul(face2embeddings, weight_1), bias_1))
# layer two: 256 -> 128
hidden_dim_2 = feature_size
weight_2 = tf.Variable(tf.truncated_normal(shape=[hidden_dim_1, hidden_dim_2]))
bias_2 = tf.Variable(tf.constant(0.1, shape=[hidden_dim_2]))
layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1, weight_2), bias_2))
# final layer: 128 -> attribute_size
weight_final = tf.Variable(tf.truncated_normal(shape=[hidden_dim_2, attribute_size]))
bias_final = tf.Variable(tf.constant(0.1, shape=[attribute_size]))
layer_final = tf.nn.tanh(tf.add(tf.matmul(layer_2, weight_final), bias_final))
# Per-attribute squared error summed over the batch, then averaged.
losses = tf.reduce_sum((face2attributes - layer_final) ** 2, axis=0)
loss = tf.reduce_mean(losses)
# BUG FIX: minimize the scalar `loss`, not the vector `losses` — minimizing
# a non-scalar implicitly optimizes its sum, which rescales the gradients
# relative to the `loss` value that is logged during training.
training_step = tf.train.AdamOptimizer().minimize(loss)
In [ ]:
# Open the TF1 session and initialize all graph variables.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
In [ ]:
# Full-batch training: one optimizer step per epoch, logging the train loss
# from the same run and the test loss from a separate evaluation pass.
n_epochs = 2000
for e in range(n_epochs):
    train_loss, _ = sess.run(
        [loss, training_step],
        {face2embeddings: x_train, face2attributes: y_train},
    )
    test_loss = sess.run(loss, {face2embeddings: x_test, face2attributes: y_test})
    print("epoch:", e, "train loss:", train_loss, "test loss:", test_loss)
In [ ]:
# Persist the trained variables to disk under the 'mlp-model' prefix.
saver = tf.train.Saver()
saver.save(sess, 'mlp-model')
In [ ]:
# Raw tanh outputs in [-1, 1] for every test image.
results = sess.run(layer_final, {face2embeddings: x_test})
In [ ]:
# Inspect the raw network outputs.
results
In [ ]:
# Scratch check of the remapping trick: sign(x + 1) maps -1 -> 0 and 1 -> 1.
np.sign(np.array([1,-1])+1)
In [ ]:
# Threshold the tanh outputs to hard predictions in {-1, 0, 1}
# (0 only when an output is exactly zero).
final_results = np.sign(results)
In [ ]:
from sklearn.metrics import classification_report
In [ ]:
# Write a per-attribute classification report for the MLP predictions.
# BUG FIX: the report file was never closed, so the tail of the report could
# stay unflushed; the with statement closes it deterministically.
with open("resultsMLP2.txt", "w") as report_file:
    for i, attribute in enumerate(attributes):
        # sign(x + 1) remaps {-1, 1} labels/predictions to {0, 1} class ids.
        y_pred = np.sign(final_results.T[i] + 1)
        y_true = np.sign(y_test.T[i] + 1)
        classes = ['not %s' % attribute, attribute]
        report_file.write(classification_report(y_true, y_pred, target_names=classes))
In [ ]:
# Parser idea: detect which attributes a text query asserts or negates.
sample = "it isn't a man with no glasses"
# tokenization / normalization
sample = sample.lower()
sample = sample.replace('hasn\'t', 'has not').replace('isn\'t', 'is not')
sample = sample.replace(' a ', ' ').replace(' an ', ' ').replace(' the ', ' ')
# fuse negators onto the following word so they survive the whitespace split
sample = sample.replace('not ', 'not_').replace('no ', 'no_')
sample_tockenized = sample.split()
print(sample_tockenized)
# dictionary: one synonym list and one antonym list per attribute
# BUG FIX: `a.lower().replace()` raised TypeError — str.replace requires
# arguments.  Joining multi-word names with underscores lets them match
# single tokens, which appears to be the intent — TODO confirm.
attributes_normal = [a.lower().replace(' ', '_') for a in attributes]
att_synonym = {a.lower(): [a.lower()] for a in attributes}
att_antonym = {a.lower(): ['not_' + a.lower(), 'no_' + a.lower()] for a in attributes}
# some manual synonym/antonym additions
att_synonym['male'] += ['man']
att_antonym['male'] += ['not_man', 'no_man']
print(att_synonym)
# parser: report every attribute asserted or negated in the sample
for attribute in attributes:
    if len([1 for a in att_synonym[attribute.lower()] if a in sample_tockenized]):
        print("the sample has/is", attribute)
    if len([1 for a in att_antonym[attribute.lower()] if a in sample_tockenized]):
        print("the sample has/is not", attribute)
In [ ]:
from parser_imp import parser
import random

# Demo: pick a random image, look up its embedding, and run every
# per-attribute classifier on it.
datasetlist = list(dataset.items())
# BUG FIX: randint is inclusive at both ends, so len(datasetlist) was a
# possible (out-of-range) draw; randrange excludes the upper bound.
facenr = random.randrange(len(datasetlist))
imgpath = dataset[datasetlist[facenr][0]]
# BUG FIX: `embedd` was used below but its assignment was commented out,
# which raised NameError on every run.
embedd = embeddings[imgpaths.index(imgpath)]
print(datasetlist[facenr][0])
print(imgpath)
# NOTE(review): raises KeyError if the drawn image has no attribute row.
print(dataset_att[imgpath])

with open("classifiersMLP", 'rb') as f:
    classifier_list = pickle.load(f)

result = []
for i, classifier in enumerate(classifier_list):
    # sklearn predict expects a 2-D array: one row per sample.
    y_guess = classifier.predict([embedd])
    result.append((attributes[i], y_guess))
    print(attributes[i])
    print(y_guess)