In [ ]:
%matplotlib inline
import sklearn
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as pltimg
import os
import re
import pickle

Importing and sorting the data


In [ ]:
# Map (person_name, image_number) to the image path relative to dataset/.
dataset = {
    (folder_name, int(re.findall(r"\d{4}", file_name)[0])): "lfw/%s/%s" % (folder_name, file_name)
    for folder_name in os.listdir("dataset/lfw/")
    for file_name in os.listdir("dataset/lfw/" + folder_name)
}
# e.g. dataset[('Mehdi_Ghanimifard', 1)] == 'lfw/Mehdi_Ghanimifard/Mehdi_Ghanimifard_0001.jpg'

In [ ]:
with open('dataset/embeddings/embeddings_align', 'rb') as f:
    embeddings = pickle.load(f, encoding='latin1')

In [ ]:
with open('dataset/embeddings/imgpaths', 'rb') as f:
    imgpaths = pickle.load(f)

Investigating the embeddings and their differences


In [ ]:
import cv2

# Collect embeddings for two identities and show their (aligned) images.
a = []
b = []
for index, path in enumerate(imgpaths):
    if 'Abdullah_al-Attiyah' in path:
        a.append(embeddings[index])
        img = cv2.cvtColor(cv2.imread("dataset/" + path), cv2.COLOR_BGR2RGB)
        plt.imshow(img)
        plt.show()

    if 'Nicole_Kidman' in path:
        b.append(embeddings[index])
        img = cv2.cvtColor(cv2.imread("dataset/" + path), cv2.COLOR_BGR2RGB)
        plt.imshow(img)
        plt.show()

In [ ]:
len(a), len(b)

In [ ]:
np.dot(a[0], a[1])

In [ ]:
np.dot(b[0], b[1])

In [ ]:
# Dot products across identities should be noticeably lower
# than the within-identity values above.
for ai in a:
    print(np.dot(ai, b[0]))
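
If the embeddings are L2-normalised (as FaceNet-style embeddings typically are), the dot product is exactly the cosine similarity, so a full pairwise similarity matrix makes the within/between-identity gap easy to read off. A minimal sketch under that assumption:

In [ ]:
# Sketch: pairwise cosine similarities (assumes unit-norm embeddings).
vecs = np.array(a + b)   # rows: all Abdullah_al-Attiyah, then all Nicole_Kidman
sim = vecs @ vecs.T      # dot product == cosine similarity for unit vectors
print(np.round(sim, 2))  # within-identity blocks should be clearly higher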

Testing the embeddings without alignment (they don't work)


In [ ]:
with open('dataset/embeddings/embeddings', 'rb') as f:
    embeddings_tricky = pickle.load(f, encoding='latin1')

In [ ]:
a_tricky = []
b_tricky = []
for index, path in enumerate(imgpaths):
    if 'Abdullah_al-Attiyah' in path:
        a_tricky.append(embeddings_tricky[index])
        img = cv2.cvtColor(cv2.imread("dataset/" + path), cv2.COLOR_BGR2RGB)
        plt.imshow(img)
        plt.show()

    if 'Nicole_Kidman' in path:
        b_tricky.append(embeddings_tricky[index])
        img = cv2.cvtColor(cv2.imread("dataset/" + path), cv2.COLOR_BGR2RGB)
        plt.imshow(img)
        plt.show()

In [ ]:
np.dot(a_tricky[0], a_tricky[1])

In [ ]:
np.sum(np.abs(a[0]))
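
A quick norm check shows whether the aligned and unaligned embeddings even live on the same scale; FaceNet-style embeddings are usually unit-length, so the L2 norm is the more telling quantity. A small sketch under that assumption:

In [ ]:
# Sketch: compare L2 norms of an aligned vs. an unaligned embedding
# (FaceNet-style embeddings are typically L2-normalised to 1).
print(np.linalg.norm(a[0]), np.linalg.norm(a_tricky[0]))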

Matching training data


In [ ]:
# Sanity check: every dataset entry can be matched to its embedding via imgpaths.
for (name, num), address in dataset.items():
    print((name, num), address)
    print(imgpaths.index(address))
    print(embeddings[imgpaths.index(address)])
    break

# Reading attributes: line 1 of lfw_attributes.txt holds the attribute names,
# the remaining lines hold per-image attribute values.
dataset_att = dict()
with open('dataset/lfw_attributes.txt') as att_file:
    for index, line in enumerate(att_file):
        items = line.split('\t')

        if index == 1:
            attributes = items[3:]
        if index > 1:
            dataset_att[dataset[(items[0].replace(' ', '_'), int(items[1]))]] = np.array(items[2:]).astype(float)

In [ ]:
print(list(dataset_att.items())[0], len(list(dataset_att.items())[0][1]))

In [ ]:
attributes

Removing faces without embeddings or attributes


In [ ]:
len(dataset_att), len(dataset)

In [ ]:
with open("dataset/embeddings/skippedimg.txt") as f:
    skippedimg = ["lfw"+l.strip("\n")[9:] for l in f]

#lfw/Abdoulaye_Wade/Abdoulaye_Wade_0003.jpg     

#If we want to be able to associate image with embedding we need to know which image we use.
usedimg = [address for _,address in dataset.items() if address in dataset_att and address not in skippedimg]
#print(skippedimg)

In [ ]:
# Features for all classifiers: ~13,000 data points,
# one embedding vector per usable image.
# Some images are missing from the attribute file or were skipped by the
# embedding pipeline; both cases are filtered out below.

skipped_addresses = sorted(
    address for _, address in dataset.items() if address in skippedimg
)
print(len(embeddings))

X = [
    embeddings[imgpaths.index(address)]
    for _, address in dataset.items()
    if address in dataset_att and address not in skippedimg
]

# 73 classifiers (each has its own y):
# for each classifier (attribute), each file gets a label in {-1, 1}.
Y = [
    [
        1 if dataset_att[address][i] > 0 else -1
        for _, address in dataset.items()
        if address in dataset_att and address not in skippedimg
    ]
    for i, att_name in enumerate(attributes)
]

In [ ]:
print(len(X), len(Y[0]))
print(X[0])
print(Y[0][0])

In [ ]:
# Make train/dev/test split (80/10/10).
split = int(len(X) * 0.8)
X_train = X[:split]
X_rest = X[split:]
nsplit = int(len(X_rest) * 0.5)
X_dev = X_rest[:nsplit]
X_test = X_rest[nsplit:]

print(len(X_train), len(X_dev), len(X_test))

Y_train = []
Y_dev = []
Y_test = []

for attr in Y:
    Y_train.append(attr[:split])
    attr_rest = attr[split:]
    Y_dev.append(attr_rest[:nsplit])
    Y_test.append(attr_rest[nsplit:])

print(len(Y_train[0]), len(Y_dev[0]), len(Y_test[0]))
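
Note that this slice-based split keeps the original file order (LFW is sorted by name), so train and test may differ systematically. A hedged alternative using scikit-learn's train_test_split, which shuffles before splitting:

In [ ]:
# Sketch: a shuffled 80/10/10 split via scikit-learn (alternative to the slicing above).
from sklearn.model_selection import train_test_split

Y_arr = np.array(Y).T  # shape: (n_images, n_attributes)
X_tr, X_tmp, Y_tr, Y_tmp = train_test_split(np.array(X), Y_arr, test_size=0.2, random_state=0)
X_dv, X_te, Y_dv, Y_te = train_test_split(X_tmp, Y_tmp, test_size=0.5, random_state=0)
print(X_tr.shape, X_dv.shape, X_te.shape)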

Classification


In [ ]:
from sklearn import svm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

In [ ]:
# Training: for each attribute, fit a linear SVM on (X_train, y).

classifier_list = []
for y in Y_train:
    classifier = svm.LinearSVC(max_iter=2000)
    classifier.fit(X_train, y)
    classifier_list.append(classifier)

with open("classifiers", "wb") as f:
    pickle.dump(classifier_list, f)
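
The dev split is not used above; a natural use for it is choosing LinearSVC's regularisation strength C per attribute. A hedged sketch (the C grid and the single-attribute scope are assumptions of this sketch):

In [ ]:
# Sketch: pick C on the dev set for one attribute (index 0 here).
best_c, best_acc = None, -1.0
for c in [0.01, 0.1, 1.0, 10.0]:
    clf = svm.LinearSVC(C=c, max_iter=2000)
    clf.fit(X_train, Y_train[0])
    acc = accuracy_score(Y_dev[0], clf.predict(X_dev))
    if acc > best_acc:
        best_c, best_acc = c, acc
print(best_c, best_acc)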

In [ ]:
with open("classifiers", 'rb') as f:
    classifier_list = pickle.load(f)
s=""
for i,classifier in enumerate(classifier_list):
    clf = classifier
    y_guess =clf.predict(X_test)
    #y_guess = (y_guess+1)/2
    #print(type(Y_test[i]))
    #y_true = [ for result in Y_test[i]]
    #classes=["not"+attributes[i],attributes[i]]
    #print(classification_report(y_true,y_guess,classes))
    #print("Attribute: "+attributes[i])
  
    print("Accuracy:"+str(round(accuracy_score(Y_test[i], y_guess)*100,2))+"\%")
    
    print("F-score:"+str(round(f1_score(Y_test[i], y_guess,)*100,2))+"%")
  
    print("Precision:"+str(round(precision_score(Y_test[i], y_guess)*100,2))+"%")

    print("Recall:"+str(round(recall_score(Y_test[i], y_guess)*100,2))+"%")
   
    print("\n")

Mehdi's classification (this is not used in the game)


In [ ]:
x_total = np.array(X)
y_total = np.array(Y).T

# Shuffle the two arrays together with a shared index permutation.
shuffle_indices = np.arange(x_total.shape[0])
np.random.shuffle(shuffle_indices)

x_total = x_total[shuffle_indices, :]
y_total = y_total[shuffle_indices, :]

# Separate test and train (first 10% is test).
x_test, x_train = np.split(x_total, [int(x_total.shape[0] * .1)])
y_test, y_train = np.split(y_total, [int(y_total.shape[0] * .1)])

In [ ]:
x_test.shape, x_train.shape, y_test.shape, y_train.shape

In [ ]:
import tensorflow as tf

In [ ]:
# TensorFlow 1.x graph-mode API.
tf.reset_default_graph()

feature_size = 128
attribute_size = len(attributes)

face2embeddings = tf.placeholder(dtype=tf.float32, shape=[None, feature_size])
face2attributes = tf.placeholder(dtype=tf.float32, shape=[None, attribute_size])

# Multilayer perceptron (MLP):

# layer one:
hidden_dim_1 = feature_size * 2
weight_1 = tf.Variable(tf.truncated_normal(shape=[feature_size, hidden_dim_1]))
bias_1 = tf.Variable(tf.constant(0.1, shape=[hidden_dim_1]))
layer_1 = tf.nn.tanh(tf.add(tf.matmul(face2embeddings, weight_1), bias_1))

# layer two:
hidden_dim_2 = feature_size
weight_2 = tf.Variable(tf.truncated_normal(shape=[hidden_dim_1, hidden_dim_2]))
bias_2 = tf.Variable(tf.constant(0.1, shape=[hidden_dim_2]))
layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1, weight_2), bias_2))

# final layer: tanh keeps outputs in [-1, 1], matching the label encoding
weight_final = tf.Variable(tf.truncated_normal(shape=[hidden_dim_2, attribute_size]))
bias_final = tf.Variable(tf.constant(0.1, shape=[attribute_size]))
layer_final = tf.nn.tanh(tf.add(tf.matmul(layer_2, weight_final), bias_final))

# squared-error loss, summed over the batch and averaged over attributes
losses = tf.reduce_sum((face2attributes - layer_final)**2, axis=0)
loss = tf.reduce_mean(losses)

training_step = tf.train.AdamOptimizer().minimize(loss)
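
Since each attribute is a binary label, a cross-entropy loss on raw logits is arguably a better fit than tanh plus squared error. A hedged alternative head, reusing the final-layer weights above (the {-1, 1} to {0, 1} relabelling is an assumption of this sketch):

In [ ]:
# Sketch: binary cross-entropy variant of the output layer.
logits = tf.add(tf.matmul(layer_2, weight_final), bias_final)  # raw logits, no tanh
labels01 = (face2attributes + 1.0) / 2.0                       # map {-1, 1} labels to {0, 1}
xent = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels01, logits=logits)
loss_xent = tf.reduce_mean(xent)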

In [ ]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [ ]:
epochs = 2000

for e in range(epochs):
    train_loss, _ = sess.run([loss, training_step], {face2embeddings: x_train, face2attributes: y_train})
    test_loss = sess.run(loss, {face2embeddings: x_test, face2attributes: y_test})

    print("epoch:", e, "train loss:", train_loss, "test loss:", test_loss)

In [ ]:
saver = tf.train.Saver()
saver.save(sess, 'mlp-model')
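
To reuse the trained network later without retraining, the checkpoint can be restored into a fresh session; a minimal sketch, assuming the same graph has been rebuilt first:

In [ ]:
# Sketch: restore the saved MLP weights into a new session.
sess = tf.Session()
saver = tf.train.Saver()
saver.restore(sess, 'mlp-model')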

In [ ]:
results = sess.run(layer_final, {face2embeddings: x_test})

In [ ]:
results

In [ ]:
# sanity check: sign(x + 1) maps {-1, 1} to {0, 1}
np.sign(np.array([1, -1]) + 1)

In [ ]:
final_results = np.sign(results)

In [ ]:
from sklearn.metrics import classification_report

In [ ]:
# Write a per-attribute classification report for the MLP predictions.
with open("resultsMLP2.txt", "w") as report_file:
    for i, attribute in enumerate(attributes):
        y_pred = np.sign(final_results.T[i] + 1)  # map {-1, 1} to {0, 1}
        y_true = np.sign(y_test.T[i] + 1)
        classes = ['not %s' % attribute, attribute]
        report_file.write(classification_report(y_true, y_pred, target_names=classes))

The beginning of the parser


In [ ]:
# parser idea:
sample = "it isn't a man with no glasses"

# tokenization, normalization
sample = sample.lower()
sample = sample.replace('hasn\'t', 'has not').replace('isn\'t', 'is not')
sample = sample.replace(' a ', ' ').replace(' an ', ' ').replace(' the ', ' ')
sample = sample.replace('not ', 'not_').replace('no ', 'no_')
sample_tokenized = sample.split()
print(sample_tokenized)

# dictionary: normalise attribute names the same way as the input
# (multi-word attributes get underscores so they can match single tokens)
attributes_normal = [a.lower().replace(' ', '_') for a in attributes]

att_synonym = {a.lower(): [a.lower()] for a in attributes}
att_antonym = {a.lower(): ['not_' + a.lower(), 'no_' + a.lower()] for a in attributes}
# some manual lists
att_synonym['male'] += ['man']
att_antonym['male'] += ['not_man', 'no_man']
print(att_synonym)

# parser: report every attribute that is asserted or negated in the sample
for attribute in attributes:
    if any(a in sample_tokenized for a in att_synonym[attribute.lower()]):
        print("the sample has/is", attribute)
    if any(a in sample_tokenized for a in att_antonym[attribute.lower()]):
        print("the sample has/is not", attribute)

The beginning of the update function


In [ ]:
from parser_imp import parser
import random

# Pick a random face from the dataset and look up its embedding and attributes.
datasetlist = list(dataset.items())
facenr = random.randrange(len(datasetlist))

imgpath = datasetlist[facenr][1]
embedd = embeddings[imgpaths.index(imgpath)]

print(datasetlist[facenr][0])
print(imgpath)
print(dataset_att[imgpath])

# Predict all attributes for the chosen face.
# (Alternatively, load the SVM classifiers from the "classifiers" file.)
with open("classifiersMLP", 'rb') as f:
    classifier_list = pickle.load(f)

result = []
for i, classifier in enumerate(classifier_list):
    y_guess = classifier.predict([embedd])  # predict expects a 2D array
    result.append((attributes[i], y_guess))
    print(attributes[i])
    print(y_guess)
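
The remaining step of the update function is to use the parsed description to narrow down the candidate faces. A hedged sketch of that filtering step, assuming predictions in {-1, 1} and the (attribute, polarity) parse format from the parser section; the names here are illustrative, not from the original:

In [ ]:
# Sketch: keep only candidates whose predicted attributes match the parsed clues.
def update_candidates(candidates, clues, predictions):
    """candidates: list of image paths; clues: [(attribute, +1/-1)];
    predictions: {path: {attribute: +1/-1}} precomputed from the classifiers."""
    return [
        path for path in candidates
        if all(predictions[path].get(att) == polarity for att, polarity in clues)
    ]

# e.g. update_candidates(usedimg, [('Male', -1)], predictions)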