The goal of this notebook is to produce statistics about the images and faces in the set of ads provided for CP1 during the MEMEX Winter QPR 2017, aggregated per ad cluster.
Each line of the output TSV is in the format:
key \t values \t label
key is the 'cluster_id' of the cluster.
values is the list of 20 aggregated statistics built in 'produce_output_mayank': the min/max/median over the cluster's ads of the per-ad 'faces_min', 'faces_max' and 'faces_median'; the min/max/median/total of 'faces_total' followed by the average number of faces per ad; the min/max/median/total of 'images_count' followed by the average number of images per ad; and finally the number of ads in the cluster.
label is 1 if the cluster is annotated as VERY_RELEVANT, 0 otherwise (clusters without an annotation get -1).
In [95]:
    
import os
# set some path parameters
input_dir = "../data/"
prefix = "train"
#prefix = "test"
# the test clusters file is unlabeled, hence the different filename
if prefix == "test":
    clusters_ads_file = os.path.join(input_dir, prefix+"_adjusted_unlabeled.json")
else:
    clusters_ads_file = os.path.join(input_dir, prefix+"_adjusted.json")
stats_file = os.path.join(input_dir, prefix+"_images_faces_stats.jl")
out_file = os.path.join(input_dir, prefix+"_images_faces_stats_mayank.tsv")
    
In [96]:
    
def parse_stats(stats_file):
    # Each line of the stats file is a JSON dict mapping an ad id to its stats;
    # merge all lines into a single dict keyed by ad id.
    all_stats = {}
    import json
    with open(stats_file, "rt") as stats_in:
        for line in stats_in:
            all_stats.update(json.loads(line))
    print 'Gathered stats for {} ads.'.format(len(all_stats))
    return all_stats
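
To make the expected input concrete, here is a minimal sketch of one line of the stats .jl file, reconstructed from the fields consumed later in this notebook; the ad id and the values are made up for illustration.

In [ ]:

import json
# hypothetical example of one line of the *_images_faces_stats.jl file:
# a single-entry dict mapping an ad id to its per-ad image/face statistics
sample_line = json.dumps({"SOME_AD_ID": {"images_count": 3,
                                         "faces_min": 0,
                                         "faces_max": 2,
                                         "faces_median": 1,
                                         "faces_total": 3}})
print json.loads(sample_line)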
    
In [97]:
    
all_stats = parse_stats(stats_file)
    
    
In [98]:
    
len(all_stats)
    
    Out[98]:
In [99]:
    
def parse_clusters_ads_file(clusters_ads_file):
    # Build the mapping cluster_id -> list of ad ids, and (for labeled data)
    # the binary label of each cluster: 1 if annotated VERY_RELEVANT, 0 otherwise.
    clusters_ads_mapping = {}
    clusters_labels = {}
    import json
    label_key = 'annotation'
    label_true = 'VERY_RELEVANT'
    with open(clusters_ads_file, "rt") as clusters_in:
        for line in clusters_in:
            line_dict = json.loads(line)
            cluster_id = line_dict['cluster_id']
            ad_id = line_dict['_id']
            clusters_ads_mapping.setdefault(cluster_id, []).append(ad_id)
            if label_key in line_dict:
                line_true_label = int(line_dict[label_key] == label_true)
                if cluster_id not in clusters_labels:
                    clusters_labels[cluster_id] = line_true_label
                elif clusters_labels[cluster_id] != line_true_label:
                    print 'Inconsistent labeling for cluster {}'.format(cluster_id)
    print 'Gathered {} clusters.'.format(len(clusters_ads_mapping))
    return clusters_ads_mapping, clusters_labels
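
For reference, a minimal sketch of one line of the clusters file as this parser expects it; the ids are placeholders.

In [ ]:

import json
# hypothetical example of one line of the *_adjusted.json file: each line
# describes one ad ('_id') with its 'cluster_id' and, for labeled data,
# an 'annotation' field
sample_line = json.dumps({"cluster_id": "SOME_CLUSTER_ID",
                          "_id": "SOME_AD_ID",
                          "annotation": "VERY_RELEVANT"})
print json.loads(sample_line)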
    
In [100]:
    
clusters_ads_mapping,clusters_labels = parse_clusters_ads_file(clusters_ads_file)
    
    
In [101]:
    
def to_percent(y, position):
    import matplotlib
    # Ignore the passed-in position; format the tick value y as a percentage.
    s = str(100 * y)
    # The percent symbol needs escaping in latex
    if matplotlib.rcParams['text.usetex'] is True:
        return s + r'$\%$'
    else:
        return s + '%'
    
In [102]:
    
%matplotlib inline

def print_stats(np_img_count, nb_bins=100):
    # print min/mean/max of the counts and plot their normalized histogram
    import matplotlib
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter
    import numpy as np
    print np.min(np_img_count), np.mean(np_img_count), np.max(np_img_count)
    # The normed histogram seems to be broken, so normalize with per-sample weights instead,
    # as suggested in http://stackoverflow.com/questions/5498008/pylab-histdata-normed-1-normalization-seems-to-work-incorrect
    weights = np.ones_like(np_img_count)/float(len(np_img_count))
    res = plt.hist(np_img_count, bins=nb_bins, weights=weights)
    print np.sum(res[0])  # sanity check: the weighted bin counts should sum to 1.0
    # Create the formatter using to_percent. This multiplies the default
    # tick labels by 100, turning them into percentages.
    formatter = FuncFormatter(to_percent)
    # Set the formatter on the y axis
    plt.gca().yaxis.set_major_formatter(formatter)
    plt.show()
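
As a quick illustration of the weights trick used above: weighting every sample by 1/N turns the histogram bin counts into fractions of the data, which sum to 1. A minimal sketch with made-up data:

In [ ]:

import numpy as np
# with weights of 1/N per sample, histogram "counts" become fractions of the data
data = [1, 1, 2, 3, 3, 3]
weights = np.ones_like(data) / float(len(data))
counts, edges = np.histogram(data, bins=3, weights=weights)
print counts          # fractions per bin: 2/6, 1/6 and 3/6
print np.sum(counts)  # sums to 1.0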
    
In [103]:
    
# number of ads in each cluster
cluster_ads_count = [len(clusters_ads_mapping[x]) for x in clusters_ads_mapping]
    
In [104]:
    
print_stats(cluster_ads_count, nb_bins=200)
    
    
    
In [105]:
    
# peek at the ad list of one cluster and at the stats of one ad
print clusters_ads_mapping[clusters_ads_mapping.keys()[2]]
print all_stats[all_stats.keys()[2]]
    
    
In [106]:
    
# get min, max, median, total stats from a list of counts
def get_mmmt(count):
    import numpy as np
    if not count:
        # no stats available for any ad of the cluster: fall back to zeros
        count = [0]
    mi = np.min(count)
    ma = np.max(count)
    me = int(np.median(count))
    t = np.sum(count)
    return mi,ma,me,t
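
For example, after running the cell above:

In [ ]:

print get_mmmt([1, 2, 3, 4])  # (1, 4, 2, 10): the median 2.5 is truncated by int()
print get_mmmt([])            # (0, 0, 0, 0): empty lists are treated as [0]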
    
In [107]:
    
# produce output
def produce_output_mayank(clusters_ads_mapping, clusters_labels, all_stats, outfile):
    with open(outfile,"wt") as out:
        for cluster in clusters_ads_mapping:
            faces_min = []
            faces_max = []
            faces_median = []
            faces_total = []
            images_count = []
            for ad_u in clusters_ads_mapping[cluster]:
                ad = str(ad_u.strip())
                if ad not in all_stats:
                    # skip ads for which no image/face stats were computed
                    continue
                # gather the per-ad stats so they can be aggregated at cluster level
                faces_min.append(all_stats[ad]['faces_min'])
                faces_max.append(all_stats[ad]['faces_max'])
                faces_median.append(all_stats[ad]['faces_median'])
                faces_total.append(all_stats[ad]['faces_total'])
                images_count.append(all_stats[ad]['images_count'])
            # aggregate each per-ad stat into its min, max, median (and total) over the cluster
            fmimi, fmima, fmime, _ = get_mmmt(faces_min)
            fmami, fmama, fmame, _ = get_mmmt(faces_max)
            fmemi, fmema, fmeme, _ = get_mmmt(faces_median)
            ftmi, ftma, ftme, ftt = get_mmmt(faces_total)
            imi, ima, ime, it = get_mmmt(images_count)
            ads_t = len(clusters_ads_mapping[cluster])
            stats = [fmimi, fmima, fmime, fmami, fmama, fmame, fmemi, fmema, fmeme, ftmi, ftma, ftme, ftt, float(ftt)/ads_t, imi, ima, ime, it, float(it)/ads_t, ads_t]
            # unlabeled clusters (e.g. from the test file) get the label -1
            out.write("{}\t{}\t{}\n".format(cluster, stats, clusters_labels.get(cluster, -1)))
    
In [108]:
    
#print all_stats.keys()[0]
#print '81951EA0F273BF42552CBC73D01F4D21FCDED6ABE5D97DFA4DFEFD2B6386F418' in all_stats
produce_output_mayank(clusters_ads_mapping, clusters_labels, all_stats, out_file)
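
As a quick sanity check, the output TSV can be read back with a few lines of Python. This is only a minimal sketch: it assumes the cell above has already written out_file, and relies on the values field being the str() of a Python list, so it is parsed with ast.literal_eval.

In [ ]:

import ast
# peek at the first line of the output TSV: cluster_id \t [20 stats] \t label
with open(out_file, "rt") as tsv_in:
    key, values, label = tsv_in.readline().rstrip("\n").split("\t")
    stats = ast.literal_eval(values)
    print key, len(stats), label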
    
In [ ]: