The goal of this notebook is to produce ads statistics related to images and faces in the set of ads provided for the CP1 during the MEMEX Winter QPR 2017.
Each line is in format:
key \t values \t label
key is 'cluster_id'
values are the aggregated image/face statistics computed in this notebook
In [95]:
import os

# Where the CP1 QPR-2017 data files live, and which split to process.
input_dir = "../data/"
prefix = "train"
#prefix = "test"

# The test split ships without labels, hence its different file name.
clusters_ads_file = os.path.join(
    input_dir,
    prefix + ("_adjusted_unlabeled.json" if prefix == "test" else "_adjusted.json"))
stats_file = os.path.join(input_dir, prefix + "_images_faces_stats.jl")
out_file = os.path.join(input_dir, prefix + "_images_faces_stats_mayank.tsv")
In [96]:
def parse_stats(stats_file):
    """Parse the per-ad statistics JSON-lines file.

    Each line is a JSON object mapping one (or more) ad id(s) to a dict of
    image/face statistics; all lines are merged into a single dict.

    :param stats_file: path to the ``.jl`` stats file
    :return: dict mapping ad id -> statistics dict
    """
    import json
    all_stats = {}
    with open(stats_file, "rt") as stats_in:
        for line in stats_in:
            # Merge every key of the line's JSON object into the result.
            all_stats.update(json.loads(line))
    # print() call form works in both Python 2 and 3.
    print('Gathered stats for {} ads.'.format(len(all_stats)))
    return all_stats
In [97]:
all_stats = parse_stats(stats_file)
In [98]:
len(all_stats)
Out[98]:
In [99]:
def parse_clusters_ads_file(clusters_ads_file):
    """Parse the clusters/ads JSON-lines file.

    Each line is a JSON object with at least 'cluster_id' and '_id' (the ad
    id), and optionally an 'annotation' label.

    :param clusters_ads_file: path to the JSON-lines clusters file
    :return: tuple ``(clusters_ads_mapping, clusters_labels)`` where
        ``clusters_ads_mapping`` maps cluster_id -> list of ad ids and
        ``clusters_labels`` maps cluster_id -> 1 if the annotation is
        VERY_RELEVANT else 0 (empty for unlabeled input).
    """
    import json
    clusters_ads_mapping = {}
    clusters_labels = {}
    label_key = 'annotation'
    label_true = 'VERY_RELEVANT'
    with open(clusters_ads_file, "rt") as clusters_in:
        for line in clusters_in:
            line_dict = json.loads(line)
            cluster_id = line_dict['cluster_id']
            ad_id = line_dict['_id']
            # Group ads by their cluster id.
            clusters_ads_mapping.setdefault(cluster_id, []).append(ad_id)
            if label_key in line_dict:
                # Binarize: only VERY_RELEVANT counts as a positive label.
                line_true_label = int(line_dict[label_key] == label_true)
                if cluster_id not in clusters_labels:
                    clusters_labels[cluster_id] = line_true_label
                elif clusters_labels[cluster_id] != line_true_label:
                    # All ads of one cluster should share the same label.
                    print('Inconsistent labeling for cluster {}'.format(cluster_id))
    print('Gathered {} clusters.'.format(len(clusters_ads_mapping)))
    return clusters_ads_mapping, clusters_labels
In [100]:
clusters_ads_mapping,clusters_labels = parse_clusters_ads_file(clusters_ads_file)
In [101]:
def to_percent(y, position):
    """Format a fractional tick value *y* as a percentage label.

    Intended for use with matplotlib's FuncFormatter; *position* is
    supplied by the formatter but ignored here.
    """
    import matplotlib
    label = str(100 * y)
    # The percent sign must be escaped when LaTeX text rendering is on.
    usetex = matplotlib.rcParams['text.usetex'] is True
    return label + (r'$\%$' if usetex else '%')
In [102]:
def print_stats(np_img_count, nb_bins=100):
    """Print min/mean/max of the counts and plot a normalized histogram.

    The y-axis is formatted as percentages via ``to_percent``.
    NOTE(review): contains the IPython magic ``%matplotlib inline`` — this
    cell only runs inside a Jupyter/IPython kernel, not as plain Python.

    :param np_img_count: sequence of counts (e.g. ads per cluster)
    :param nb_bins: number of histogram bins
    """
    import matplotlib
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter
    %matplotlib inline
    import numpy as np
    print np.min(np_img_count), np.mean(np_img_count), np.max(np_img_count)
    # Normed histogram seems to be broken,
    # using weights as suggested in http://stackoverflow.com/questions/5498008/pylab-histdata-normed-1-normalization-seems-to-work-incorrect
    # Each sample weighs 1/N so bin heights sum to 1 (a true frequency plot).
    weights = np.ones_like(np_img_count)/float(len(np_img_count))
    res = plt.hist(np_img_count, bins=nb_bins, weights=weights)
    # Sanity check: total of the normalized bin heights should be ~1.0.
    print np.sum(res[0])
    # Create the formatter using the function to_percent. This multiplies all the
    # default labels by 100, making them all percentages
    formatter = FuncFormatter(to_percent)
    # Set the formatter
    plt.gca().yaxis.set_major_formatter(formatter)
    plt.show()
In [103]:
cluster_ads_count = [len(clusters_ads_mapping[x]) for x in clusters_ads_mapping]
In [104]:
print_stats(cluster_ads_count, nb_bins=200)
In [105]:
# Peek at an arbitrary cluster's ad list and an arbitrary ad's stats.
# list(d) replaces the Python-2-only d.keys()[i] indexing (dict views are
# not indexable in Python 3); print as a function works in both versions.
print(clusters_ads_mapping[list(clusters_ads_mapping)[2]])
print(all_stats[list(all_stats)[2]])
In [106]:
# get min, max, median, total stats from count
def get_mmmt(count):
import numpy as np
if not count:
count = [0]
mi = np.min(count)
ma = np.max(count)
me = int(np.median(count))
t = np.sum(count)
return mi,ma,me,t
In [107]:
# produce output
def produce_output_mayank(clusters_ads_mapping, clusters_labels, all_stats, outfile):
with open(outfile,"wt") as out:
for cluster in clusters_ads_mapping:
faces_min = []
faces_max = []
faces_median = []
faces_total = []
images_count = []
for ad_u in clusters_ads_mapping[cluster]:
ad = str(ad_u.strip())
if ad not in all_stats:
#print 'Ad {} not in all_stats'.format(ad)
#print '.',
continue
# compute aggrageted stats
faces_min.append(all_stats[ad]['faces_min'])
faces_max.append(all_stats[ad]['faces_max'])
faces_median.append(all_stats[ad]['faces_median'])
faces_total.append(all_stats[ad]['faces_total'])
images_count.append(all_stats[ad]['images_count'])
fmimi, fmima, fmime, _ = get_mmmt(faces_min)
fmami, fmama, fmame, _ = get_mmmt(faces_max)
fmemi, fmema, fmeme, _ = get_mmmt(faces_median)
ftmi, ftma, ftme, ftt = get_mmmt(faces_total)
imi, ima, ime, it = get_mmmt(images_count)
ads_t = len(clusters_ads_mapping[cluster])
stats = [fmimi, fmima, fmime, fmami, fmama, fmame, fmemi, fmema, fmeme, ftmi, ftma, ftme, ftt, float(ftt)/ads_t, imi, ima, ime, it, float(it)/ads_t, ads_t]
#print stats
#break
out.write("{}\t{}\t{}\n".format(cluster, stats, clusters_labels[cluster]))
In [108]:
# Development-time sanity checks (Python 2 syntax), kept for reference:
#print all_stats.keys()[0]
#print '81951EA0F273BF42552CBC73D01F4D21FCDED6ABE5D97DFA4DFEFD2B6386F418' in all_stats
# Write the final per-cluster aggregated statistics TSV.
produce_output_mayank(clusters_ads_mapping, clusters_labels, all_stats, out_file)
In [ ]: