The goal of this notebook is to train and evaluate HT-risk classification using statistics computed from the images, and from the faces detected in those images, associated with the set of ads provided for CP1 during the MEMEX Winter QPR 2017. Most of the code is adapted from Mayank's repo.
In [ ]:
import os
import json
import codecs
import re
import numpy as np
from random import shuffle
In [ ]:
def convert_string_to_float_list(string):
    return [float(i) for i in re.split(', ', string[1:-1])]
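As a quick sanity check, the parser expects the bracketed, comma-plus-space-separated format used in the stats TSV (the string below is a made-up example):
In [ ]:
# hypothetical feature string in the same format as the TSV feature column
print convert_string_to_float_list('[1.0, 2.5, -3.0]')  # [1.0, 2.5, -3.0]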
In [ ]:
def l2_norm_on_matrix(matrix):
    """
    Takes a np.matrix style object and l2-normalizes it (row-wise).
    :param matrix:
    :return: the normalized matrix
    """
    import warnings
    from sklearn.preprocessing import normalize
    warnings.filterwarnings("ignore")
    return normalize(matrix)
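A minimal sketch of what the helper does: sklearn's normalize l2-normalizes each row by default, so [3, 4] becomes [0.6, 0.8].
In [ ]:
# toy matrix, not real features: every row is scaled to unit l2 norm
print l2_norm_on_matrix(np.matrix([[3.0, 4.0], [1.0, 0.0]]))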
In [ ]:
def sample_and_extend(list_of_vectors, total_samples):
    """
    Oversampling code for balanced training. We do a deep re-sampling, assuming the vectors contain
    atomic values (each re-sampled vector is copied).
    :param list_of_vectors: the list of vectors to be re-sampled (randomly)
    :param total_samples: the total number of vectors we want in the returned list; must be strictly
    greater than the length of list_of_vectors
    :return: the over-sampled list as a numpy array
    """
    if len(list_of_vectors) >= total_samples:
        raise Exception('Check your lengths!')
    indices = range(0, len(list_of_vectors))
    shuffle(indices)
    desired_samples = total_samples - len(list_of_vectors)
    # if we need more samples than we have vectors, keep appending shuffled copies of the index list
    while desired_samples > len(indices):
        new_indices = list(indices)
        shuffle(new_indices)
        indices += new_indices
    new_data = [list(list_of_vectors[i]) for i in indices[0:desired_samples]]
    return np.append(list_of_vectors, new_data, axis=0)
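A small illustration of the oversampling on synthetic vectors: 2 vectors extended to 5, so the 3 extra rows are random repeats of the originals.
In [ ]:
toy = [[1.0, 1.0], [2.0, 2.0]]  # made-up vectors
print sample_and_extend(toy, total_samples=5)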
In [ ]:
def prepare_for_ML_classification(pos_neg_file, normalize=False):
    """
    Read in the feature vectors (embeddings) from a pos/neg file.
    :param pos_neg_file: the file generated in one of the preprocess_filtered_* files
    :param normalize: if True, l2-normalize each class matrix
    :return: a dictionary where a 0/1 label references a list (or, if normalized, a matrix) of feature vectors
    """
    result = dict()
    pos_features = list()
    neg_features = list()
    with codecs.open(pos_neg_file, 'r', 'utf-8') as f:
        for line in f:
            line = line[0:-1]
            cols = re.split('\t', line)
            if int(cols[2]) == 1:
                pos_features.append(convert_string_to_float_list(cols[1]))
            elif int(cols[2]) == 0:
                neg_features.append(convert_string_to_float_list(cols[1]))
            else:
                print 'error; label not recognized'
    if normalize:
        result[0] = l2_norm_on_matrix(np.matrix(neg_features))
        result[1] = l2_norm_on_matrix(np.matrix(pos_features))
    else:
        if len(pos_features) != 0:
            result[1] = pos_features
        if len(neg_features) != 0:
            result[0] = neg_features
    return result
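To make the expected input concrete, here is a made-up two-line file in the <id>\t<[features]>\t<label> format (the /tmp path is illustrative):
In [ ]:
with codecs.open('/tmp/toy_pos_neg.tsv', 'w', 'utf-8') as f:
    f.write(u'clusterA\t[0.1, 0.2]\t1\n')
    f.write(u'clusterB\t[0.3, 0.4]\t0\n')
print prepare_for_ML_classification('/tmp/toy_pos_neg.tsv')  # {0: [[0.3, 0.4]], 1: [[0.1, 0.2]]}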
In [ ]:
def prepare_train_test_data(pos_neg_file, train_percent=0.3, randomize=True, balanced_training=True, data_vectors=None):
    """
    :param pos_neg_file: the tab-separated id/features/label file to read, or None if data_vectors is given
    :param train_percent: the fraction of each class that goes into the training data
    :param randomize: if True, we randomize the data we read in from pos_neg_file. Otherwise, the initial
    train_percent fraction goes into the training data and the rest into the test data
    :param balanced_training: if True, we equalize positive and negative training samples by oversampling
    the smaller class. For example, if we have 4 positive and 7 negative samples, we randomly re-sample
    3 more positive samples from the 4, meaning there will be repetition. Use with caution.
    :param data_vectors: should be set iff pos_neg_file is None. It is mostly for internal use, so
    that we can re-use this function by invoking it from some of the other _prepare_ files.
    :return: dictionary containing training/testing data/labels
    """
    import math
    if pos_neg_file:
        data = prepare_for_ML_classification(pos_neg_file)
    elif data_vectors:
        data = data_vectors
    else:
        raise Exception('Neither pos_neg_file nor data_vectors argument is specified. Exiting.')
    train_pos_num = int(math.ceil(len(data[1])*train_percent))
    train_neg_num = int(math.ceil(len(data[0])*train_percent))
    test_pos_num = len(data[1])-train_pos_num
    test_neg_num = len(data[0])-train_neg_num
    # guarantee at least one test sample per class (the last element is re-used if necessary)
    if test_pos_num == 0:
        test_pos_num = 1
    if test_neg_num == 0:
        test_neg_num = 1
    test_labels_pos = [[1] * test_pos_num]
    test_labels_neg = [[0] * test_neg_num]
    if not randomize:
        train_data_pos = data[1][0:train_pos_num]
        train_data_neg = data[0][0:train_neg_num]
        if train_pos_num < len(data[1]):
            test_data_pos = data[1][train_pos_num:]
        else:
            test_data_pos = [data[1][-1]]
        if train_neg_num < len(data[0]):
            test_data_neg = data[0][train_neg_num:]
        else:
            test_data_neg = [data[0][-1]]
    else:
        all_pos_indices = range(0, len(data[1]))
        all_neg_indices = range(0, len(data[0]))
        shuffle(all_pos_indices)
        shuffle(all_neg_indices)
        train_data_pos = [data[1][i] for i in all_pos_indices[0:train_pos_num]]
        train_data_neg = [data[0][i] for i in all_neg_indices[0:train_neg_num]]
        if train_pos_num < len(data[1]):
            test_data_pos = [data[1][i] for i in all_pos_indices[train_pos_num:]]
        else:
            test_data_pos = [data[1][-1]]
        if train_neg_num < len(data[0]):
            test_data_neg = [data[0][i] for i in all_neg_indices[train_neg_num:]]
        else:
            test_data_neg = [data[0][-1]]
    if balanced_training:
        if train_pos_num < train_neg_num:
            train_labels_pos = [[1] * train_neg_num]
            train_labels_neg = [[0] * train_neg_num]
            train_data_pos = sample_and_extend(train_data_pos, total_samples=train_neg_num)
        elif train_pos_num > train_neg_num:
            train_labels_pos = [[1] * train_pos_num]
            train_labels_neg = [[0] * train_pos_num]
            train_data_neg = sample_and_extend(train_data_neg, total_samples=train_pos_num)
        else:
            train_labels_pos = [[1] * train_pos_num]
            train_labels_neg = [[0] * train_neg_num]
    else:
        train_labels_pos = [[1] * train_pos_num]
        train_labels_neg = [[0] * train_neg_num]
    train_data = np.append(train_data_pos, train_data_neg, axis=0)
    test_data = np.append(test_data_pos, test_data_neg, axis=0)
    train_labels = np.append(train_labels_pos, train_labels_neg)
    test_labels = np.append(test_labels_pos, test_labels_neg)
    results = dict()
    results['train_data'] = train_data
    results['train_labels'] = train_labels
    results['test_data'] = test_data
    results['test_labels'] = test_labels
    return results
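Since data_vectors lets us bypass the file, we can sanity-check the splitting and balancing logic on synthetic data (the class-1/class-0 vectors below are made up):
In [ ]:
toy_vectors = {1: [[1.0, 1.0]] * 4, 0: [[0.0, 0.0]] * 10}
toy_dict = prepare_train_test_data(None, train_percent=0.5, data_vectors=toy_vectors)
# balanced training: the 2 positive training vectors are oversampled to match the 5 negatives
print toy_dict['train_data'].shape, toy_dict['train_labels'].shape  # (10, 2) (10,)
print toy_dict['test_data'].shape, toy_dict['test_labels'].shape    # (7, 2) (7,)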
In [ ]:
def get_pos_neg_ids(pos_neg_file):
    """Return the ids (first column) of all lines in pos_neg_file, in order."""
    result = list()
    with codecs.open(pos_neg_file, 'r', 'utf-8') as f:
        for line in f:
            line = line[0:-1]
            result.append(re.split('\t', line)[0])
    return result
In [ ]:
def prepare_train_test_data_separate_unseen(pos_neg_train_file, pos_neg_test_file, balanced_training=True):
    """
    Like prepare_train_test_data, but trains on one labelled file and tests on a separate, unseen file.
    Only label-0 rows of the test file are used (the unlabelled CP1 test file stores everything under
    label 0), and their ids are returned alongside the data.
    """
    train = prepare_for_ML_classification(pos_neg_train_file)
    test = prepare_for_ML_classification(pos_neg_test_file)
    test_ids = get_pos_neg_ids(pos_neg_test_file)
    train_pos_num = len(train[1])
    train_neg_num = len(train[0])
    train_data_pos = train[1][0:train_pos_num]
    train_data_neg = train[0][0:train_neg_num]
    test_neg_num = len(test[0])
    test_data_neg = test[0][0:test_neg_num]
    test_labels_neg = [[0] * test_neg_num]
    if balanced_training:
        if train_pos_num < train_neg_num:
            train_labels_pos = [[1] * train_neg_num]
            train_labels_neg = [[0] * train_neg_num]
            train_data_pos = sample_and_extend(train_data_pos, total_samples=train_neg_num)
        elif train_pos_num > train_neg_num:
            train_labels_pos = [[1] * train_pos_num]
            train_labels_neg = [[0] * train_pos_num]
            train_data_neg = sample_and_extend(train_data_neg, total_samples=train_pos_num)
        else:
            train_labels_pos = [[1] * train_pos_num]
            train_labels_neg = [[0] * train_neg_num]
    else:
        train_labels_pos = [[1] * train_pos_num]
        train_labels_neg = [[0] * train_neg_num]
    train_data = np.append(train_data_pos, train_data_neg, axis=0)
    train_labels = np.append(train_labels_pos, train_labels_neg)
    results = dict()
    results['train_data'] = train_data
    results['train_labels'] = train_labels
    results['test_data'] = test_data_neg
    results['test_labels'] = test_labels_neg
    results['test_ids'] = test_ids
    return results
In [ ]:
def train_and_test_classifier(train_data, train_labels, test_data, test_labels, classifier_model, test_ids=None):
    """
    Take numpy matrices of train/test data and labels and compute a set of metrics.
    Hyperparameters must be changed manually; we do not take them in as input.
    This method is for BINARY CLASSIFICATION only, although there is some support for regression.
    :param train_data:
    :param train_labels:
    :param test_data:
    :param test_labels:
    :param classifier_model: one of 'random_forest', 'knn', 'logistic_regression', 'linear_regression'
    :param test_ids: if provided, we skip the metrics and return (ids, predicted probabilities) instead
    :return: a list of metrics (or ids/probabilities), and the fitted model
    """
    from sklearn.linear_model import LogisticRegression, LinearRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import neighbors
    from sklearn.metrics import roc_auc_score, accuracy_score, precision_recall_fscore_support
    if classifier_model == 'random_forest':
        model = RandomForestClassifier()
    elif classifier_model == 'knn':
        k = 9
        model = neighbors.KNeighborsClassifier(n_neighbors=k, weights='uniform')
    elif classifier_model == 'logistic_regression':
        model = LogisticRegression()
    elif classifier_model == 'linear_regression':  # this is a regressor; be careful.
        model = LinearRegression()
    model.fit(train_data, train_labels)
    predicted_labels = model.predict(test_data)
    print predicted_labels
    print test_labels
    if test_ids is not None:
        # unseen/unlabelled evaluation: return the cluster ids with their predicted probabilities
        # (predict_proba only exists for the classifiers, not for linear_regression)
        predicted_probabilities = model.predict_proba(test_data)
        final_results = [test_ids, predicted_probabilities]
        return final_results, model
    else:
        # note: AUC on hard 0/1 predictions is conservative; probability scores would give a smoother curve
        print 'AUC (Area Under Curve): ',
        print roc_auc_score(test_labels, predicted_labels)
        if classifier_model not in ['linear_regression']:
            print 'Accuracy: ',
            print accuracy_score(test_labels, predicted_labels)
        prf = ['Precision: ', 'Recall: ', 'F-score: ', 'Support: ']
        print 'Class 0\tClass 1'
        k = precision_recall_fscore_support(test_labels, predicted_labels)
        for i in range(0, len(k)):
            print prf[i],
            print k[i]
        return [k[0][1], k[1][1], k[2][1]], model
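A quick end-to-end check on the synthetic split above (assuming the toy_dict cell was run):
In [ ]:
toy_dict['classifier_model'] = 'logistic_regression'
toy_results, toy_model = train_and_test_classifier(**toy_dict)
print toy_results  # [precision, recall, f-score] for class 1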
In [ ]:
# set some parameters
data_dir = "../data"
prefix = "train"
#prefix = "test"
if prefix == "train":
    input_file = "train_adjusted.json"
else:
    input_file = "test_adjusted_unlabelled.json"
In [ ]:
def print_model_weights(weights, labels="fmimi, fmima, fmime, fmami, fmama, fmame, fmemi, fmema, fmeme, ftmi, ftma, ftme, ftt, avg_ftt, imi, ima, ime, it, avg_it, ads_t"):
    import matplotlib.pyplot as plt
    %matplotlib inline
    fig, ax = plt.subplots()
    ax.bar(range(len(weights)), weights, width=0.5, color='r')
    ax.set_xticks(range(len(weights)))
    ax.set_xticklabels(tuple(labels.split(',')), rotation='vertical')
    plt.show()
In [ ]:
# train models and evaluate
In [ ]:
train_percent = 0.5
pos_neg_file = os.path.join(data_dir, prefix+"_images_faces_stats_mayank.tsv")
data_dict = prepare_train_test_data(pos_neg_file, train_percent=train_percent)
# 'logistic_regression', 'random_forest', 'knn'
data_dict['classifier_model'] = 'logistic_regression'
results, model = train_and_test_classifier(**data_dict)
In [ ]:
print model.coef_[0]
print_model_weights(model.coef_[0])
In [ ]:
data_dict['classifier_model'] = 'random_forest'
results, model = train_and_test_classifier(**data_dict)
In [ ]:
print model.feature_importances_
print_model_weights(model.feature_importances_)
In [ ]:
data_dict['classifier_model'] = 'knn'
results, model = train_and_test_classifier(**data_dict)
In [ ]:
# actual evaluation
train_pos_neg_file = os.path.join(data_dir, "train_images_faces_stats_mayank.tsv")
test_pos_neg_file = os.path.join(data_dir, "test_images_faces_stats_mayank.tsv")
data_dict = prepare_train_test_data_separate_unseen(train_pos_neg_file, test_pos_neg_file)
print len(data_dict['train_data']),len(data_dict['train_labels'])
print len(data_dict['test_data']),len(data_dict['test_labels'])
print data_dict['test_labels']
In [45]:
def normalize_feats(data, norm_values=None):
    """Min-max normalize features column-wise. If norm_values is None, compute the (min, max) pair from
    data and return it too; otherwise re-use the provided values (e.g. training min/max on test data)."""
    data = np.asarray(data)
    print type(data)
    print data.shape
    new_norm_values = False
    if not norm_values:
        new_norm_values = True
        min_data = data.min(axis=0)
        print min_data
        print min_data.shape
        max_data = data.max(axis=0)
        print max_data
        print max_data.shape
        norm_values = [min_data, max_data]
    # note: a constant column (max == min) will produce a divide-by-zero here
    data = (data - norm_values[0])/(norm_values[1] - norm_values[0])
    if new_norm_values:
        return data, norm_values
    else:
        return data
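A minimal check of the min-max scaling on a synthetic matrix; note how the training min/max are re-used for the (hypothetical) test row, which can therefore fall outside [0, 1]:
In [ ]:
toy_train = [[0.0, 10.0], [5.0, 20.0], [10.0, 30.0]]
toy_scaled, toy_norm = normalize_feats(toy_train)
print toy_scaled                                # columns rescaled to [0, 1]
print normalize_feats([[5.0, 40.0]], toy_norm)  # [[0.5, 1.5]]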
In [48]:
# post-submission evaluation, with normalization
train_pos_neg_file = os.path.join(data_dir, "train_images_faces_stats_mayank.tsv")
test_pos_neg_file = os.path.join(data_dir, "test_images_faces_stats_mayank.tsv")
data_dict = prepare_train_test_data_separate_unseen(train_pos_neg_file, test_pos_neg_file)
# normalize feats between 0 and 1
#data_dict['train_data_nonorm'] = data_dict['train_data']
data_dict['train_data'], norm_values = normalize_feats(data_dict['train_data'])
#data_dict['test_data_nonorm'] = data_dict['test_data']
data_dict['test_data'] = normalize_feats(data_dict['test_data'], norm_values)
print len(data_dict['train_data']),len(data_dict['train_labels'])
print len(data_dict['test_data']),len(data_dict['test_labels'])
print data_dict['test_labels']
In [49]:
# 'logistic_regression', 'random_forest', 'knn'
#data_dict['classifier_model'] = 'random_forest'
data_dict['classifier_model'] = 'logistic_regression'
results, model = train_and_test_classifier(**data_dict)
In [ ]:
print results
In [ ]:
def show_model_weights(model, model_type):
    if model_type == 'logistic_regression':
        print model.coef_[0]
        print_model_weights(model.coef_[0])
    elif model_type == 'random_forest':
        print model.feature_importances_
        print_model_weights(model.feature_importances_)
In [ ]:
def prepare_CP1_eval_output(path, results):
    """Write one JSON line per test cluster: its id and the predicted probability of the positive class."""
    print path, results
    with codecs.open(path+'_test_results.jl', 'w', 'utf-8') as out:
        for i in range(0, len(results[0])):
            answer = dict()
            answer['cluster_id'] = str(results[0][i])
            answer['score'] = results[1][i][1]  # probability of class 1
            json.dump(answer, out)
            out.write('\n')
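For reference, each line of the *_test_results.jl file is a standalone JSON object (the values below are made up):
In [ ]:
# one output line in the expected format
print json.dumps({'cluster_id': 'clusterB', 'score': 0.73})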
In [ ]:
prepare_CP1_eval_output(os.path.join(data_dir,'columbia_faceimagestats_'+data_dict['classifier_model']), results)
# we should also save the model that generated these results
import pickle
pickle.dump(model,open(os.path.join(data_dir,'columbia_faceimagestats_'+data_dict['classifier_model']+'.pkl'),'wb'))
In [22]:
comp_classifiers = {}
first_clf = True
import pickle
import os
#for classifier in ['logistic_regression', 'random_forest']:
for classifier in ['logistic_regression']:
    model = pickle.load(open(os.path.join(data_dir, 'columbia_faceimagestats_'+classifier+'.pkl'), 'rb'))
    show_model_weights(model, classifier)
    with open(os.path.join(data_dir, 'columbia_faceimagestats_'+classifier+'_test_results.jl'), 'rt') as res:
        for line in res:
            line_dict = json.loads(line)
            if first_clf:
                comp_classifiers[line_dict['cluster_id']] = [line_dict['score']]
            else:
                comp_classifiers[line_dict['cluster_id']].append(line_dict['score'])
    first_clf = False
In [23]:
print comp_classifiers