The goal of this notebook is to produce statistics about the images and faces in the set of ads provided for CP1 during the MEMEX Winter QPR 2017, aggregated per ad cluster.
Each line of the output TSV is in the format:
key \t values \t label
key is the 'cluster_id' of the cluster.
values is the list of 20 aggregated statistics built in 'produce_output_mayank': the min/max/median over the cluster's ads of the per-ad 'faces_min', 'faces_max' and 'faces_median'; the min/max/median/total of 'faces_total' followed by the average number of faces per ad; the min/max/median/total of 'images_count' followed by the average number of images per ad; and finally the number of ads in the cluster.
label is 1 if the cluster is annotated as VERY_RELEVANT, 0 otherwise (clusters without an annotation get -1).
In [95]:
    
import os
# set some path parameters
input_dir = "../data/"
prefix = "train"
#prefix = "test"
# the test clusters file is unlabeled, hence the different filename
if prefix == "test":
    clusters_ads_file = os.path.join(input_dir, prefix+"_adjusted_unlabeled.json")
else:
    clusters_ads_file = os.path.join(input_dir, prefix+"_adjusted.json")
stats_file = os.path.join(input_dir, prefix+"_images_faces_stats.jl")
out_file = os.path.join(input_dir, prefix+"_images_faces_stats_mayank.tsv")
    
In [96]:
    
def parse_stats(stats_file):
    # Each line of the stats file is a JSON dict mapping an ad id to its stats;
    # merge all lines into a single dict keyed by ad id.
    all_stats = {}
    import json
    with open(stats_file, "rt") as stats_in:
        for line in stats_in:
            all_stats.update(json.loads(line))
    print 'Gathered stats for {} ads.'.format(len(all_stats))
    return all_stats
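
To make the expected input concrete, here is a minimal sketch of one line of the stats .jl file, reconstructed from the fields consumed later in this notebook; the ad id and the values are made up for illustration.

In [ ]:

import json
# hypothetical example of one line of the *_images_faces_stats.jl file:
# a single-entry dict mapping an ad id to its per-ad image/face statistics
sample_line = json.dumps({"SOME_AD_ID": {"images_count": 3,
                                         "faces_min": 0,
                                         "faces_max": 2,
                                         "faces_median": 1,
                                         "faces_total": 3}})
print json.loads(sample_line)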
    
In [97]:
    
all_stats = parse_stats(stats_file)
    
    
In [98]:
    
len(all_stats)
    
    Out[98]:
In [99]:
    
def parse_clusters_ads_file(clusters_ads_file):
    # Build the mapping cluster_id -> list of ad ids, and (for labeled data)
    # the binary label of each cluster: 1 if annotated VERY_RELEVANT, 0 otherwise.
    clusters_ads_mapping = {}
    clusters_labels = {}
    import json
    label_key = 'annotation'
    label_true = 'VERY_RELEVANT'
    with open(clusters_ads_file, "rt") as clusters_in:
        for line in clusters_in:
            line_dict = json.loads(line)
            cluster_id = line_dict['cluster_id']
            ad_id = line_dict['_id']
            clusters_ads_mapping.setdefault(cluster_id, []).append(ad_id)
            if label_key in line_dict:
                line_true_label = int(line_dict[label_key] == label_true)
                if cluster_id not in clusters_labels:
                    clusters_labels[cluster_id] = line_true_label
                elif clusters_labels[cluster_id] != line_true_label:
                    print 'Inconsistent labeling for cluster {}'.format(cluster_id)
    print 'Gathered {} clusters.'.format(len(clusters_ads_mapping))
    return clusters_ads_mapping, clusters_labels
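
For reference, a minimal sketch of one line of the clusters file as this parser expects it; the ids are placeholders.

In [ ]:

import json
# hypothetical example of one line of the *_adjusted.json file: each line
# describes one ad ('_id') with its 'cluster_id' and, for labeled data,
# an 'annotation' field
sample_line = json.dumps({"cluster_id": "SOME_CLUSTER_ID",
                          "_id": "SOME_AD_ID",
                          "annotation": "VERY_RELEVANT"})
print json.loads(sample_line)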
    
In [100]:
    
clusters_ads_mapping,clusters_labels = parse_clusters_ads_file(clusters_ads_file)
    
    
In [101]:
    
def to_percent(y, position):
    import matplotlib
    # Ignore the passed-in position; format the tick value y as a percentage.
    s = str(100 * y)
    # The percent symbol needs escaping in latex
    if matplotlib.rcParams['text.usetex'] is True:
        return s + r'$\%$'
    else:
        return s + '%'
    
In [102]:
    
%matplotlib inline

def print_stats(np_img_count, nb_bins=100):
    # print min/mean/max of the counts and plot their normalized histogram
    import matplotlib
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter
    import numpy as np
    print np.min(np_img_count), np.mean(np_img_count), np.max(np_img_count)
    # The normed histogram seems to be broken, so normalize with per-sample weights instead,
    # as suggested in http://stackoverflow.com/questions/5498008/pylab-histdata-normed-1-normalization-seems-to-work-incorrect
    weights = np.ones_like(np_img_count)/float(len(np_img_count))
    res = plt.hist(np_img_count, bins=nb_bins, weights=weights)
    print np.sum(res[0])  # sanity check: the weighted bin counts should sum to 1.0
    # Create the formatter using to_percent. This multiplies the default
    # tick labels by 100, turning them into percentages.
    formatter = FuncFormatter(to_percent)
    # Set the formatter on the y axis
    plt.gca().yaxis.set_major_formatter(formatter)
    plt.show()
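
As a quick illustration of the weights trick used above: weighting every sample by 1/N turns the histogram bin counts into fractions of the data, which sum to 1. A minimal sketch with made-up data:

In [ ]:

import numpy as np
# with weights of 1/N per sample, histogram "counts" become fractions of the data
data = [1, 1, 2, 3, 3, 3]
weights = np.ones_like(data) / float(len(data))
counts, edges = np.histogram(data, bins=3, weights=weights)
print counts          # fractions per bin: 2/6, 1/6 and 3/6
print np.sum(counts)  # sums to 1.0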
    
In [103]:
    
# number of ads in each cluster
cluster_ads_count = [len(clusters_ads_mapping[x]) for x in clusters_ads_mapping]
    
In [104]:
    
print_stats(cluster_ads_count, nb_bins=200)
    
    
    
In [105]:
    
# peek at the ad list of one cluster and at the stats of one ad
print clusters_ads_mapping[clusters_ads_mapping.keys()[2]]
print all_stats[all_stats.keys()[2]]
    
    
In [106]:
    
# get min, max, median, total stats from a list of counts
def get_mmmt(count):
    import numpy as np
    if not count:
        # no stats available for any ad of the cluster: fall back to zeros
        count = [0]
    mi = np.min(count)
    ma = np.max(count)
    me = int(np.median(count))
    t = np.sum(count)
    return mi,ma,me,t
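
For example, after running the cell above:

In [ ]:

print get_mmmt([1, 2, 3, 4])  # (1, 4, 2, 10): the median 2.5 is truncated by int()
print get_mmmt([])            # (0, 0, 0, 0): empty lists are treated as [0]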
    
In [107]:
    
# produce output
def produce_output_mayank(clusters_ads_mapping, clusters_labels, all_stats, outfile):
    with open(outfile,"wt") as out:
        for cluster in clusters_ads_mapping:
            faces_min = []
            faces_max = []
            faces_median = []
            faces_total = []
            images_count = []
            for ad_u in clusters_ads_mapping[cluster]:
                ad = str(ad_u.strip())
                if ad not in all_stats:
                    # skip ads for which no image/face stats were computed
                    continue
                # gather the per-ad stats so they can be aggregated at cluster level
                faces_min.append(all_stats[ad]['faces_min'])
                faces_max.append(all_stats[ad]['faces_max'])
                faces_median.append(all_stats[ad]['faces_median'])
                faces_total.append(all_stats[ad]['faces_total'])
                images_count.append(all_stats[ad]['images_count'])
            # aggregate each per-ad stat into its min, max, median (and total) over the cluster
            fmimi, fmima, fmime, _ = get_mmmt(faces_min)
            fmami, fmama, fmame, _ = get_mmmt(faces_max)
            fmemi, fmema, fmeme, _ = get_mmmt(faces_median)
            ftmi, ftma, ftme, ftt = get_mmmt(faces_total)
            imi, ima, ime, it = get_mmmt(images_count)
            ads_t = len(clusters_ads_mapping[cluster])
            stats = [fmimi, fmima, fmime, fmami, fmama, fmame, fmemi, fmema, fmeme, ftmi, ftma, ftme, ftt, float(ftt)/ads_t, imi, ima, ime, it, float(it)/ads_t, ads_t]
            # unlabeled clusters (e.g. from the test file) get the label -1
            out.write("{}\t{}\t{}\n".format(cluster, stats, clusters_labels.get(cluster, -1)))
    
In [108]:
    
#print all_stats.keys()[0]
#print '81951EA0F273BF42552CBC73D01F4D21FCDED6ABE5D97DFA4DFEFD2B6386F418' in all_stats
produce_output_mayank(clusters_ads_mapping, clusters_labels, all_stats, out_file)
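
As a quick sanity check, the output TSV can be read back with a few lines of Python. This is only a minimal sketch: it assumes the cell above has already written out_file, and relies on the values field being the str() of a Python list, so it is parsed with ast.literal_eval.

In [ ]:

import ast
# peek at the first line of the output TSV: cluster_id \t [20 stats] \t label
with open(out_file, "rt") as tsv_in:
    key, values, label = tsv_in.readline().rstrip("\n").split("\t")
    stats = ast.literal_eval(values)
    print key, len(stats), label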
    
In [ ]: