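The goal of this notebook is to produce per-ad statistics about images and faces for the set of ads provided for CP1 during the MEMEX Winter QPR 2017.
Each output line is a JSON object whose key is the 'ad_id'.
The values are: 'images_count', 'faces_total', 'faces_min', 'faces_max' and 'faces_median'.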
In [1]:
    
import json
import os

import numpy as np
# Path configuration: all inputs and the output live in `input_dir` and
# share the dataset `prefix` in their file names.
input_dir = "../data/"
# Uncomment the next line (and comment out the "train" one) to process the test split.
#prefix = "test"
prefix = "train"
# CSV of image URL / sha1 pairs.
image_url_sha1_file = os.path.join(input_dir, "{}_image_url_sha1.csv".format(prefix))
# JSON-lines file of face detections.
faces_file = os.path.join(input_dir, "{}_faces.jl".format(prefix))
# Image records, read one JSON object per line.
images_file = os.path.join(input_dir, "{}_adjusted_images.json".format(prefix))
# JSON-lines output with the per-ad statistics.
out_file = os.path.join(input_dir, "{}_images_faces_stats.jl".format(prefix))
    
In [2]:
    
# parse image_url_sha1_file
# Builds url_sha1_dict: image URL -> sha1, from lines of the form "<url>,<sha1>".
url_sha1_dict = {}
with open(image_url_sha1_file, "rt") as img_url_sha1:
    for line in img_url_sha1:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            continue
        # Split on the LAST comma only: a URL may itself contain commas,
        # in which case a plain split(',') yields more than two fields and
        # the unpacking raises ValueError.
        url, sha1 = line.rsplit(',', 1)
        # Drop the line terminator so the stored sha1 is directly usable as a key.
        url_sha1_dict[url] = sha1.strip()
    
In [3]:
    
# Number of distinct image URLs loaded.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(len(url_sha1_dict))
    
    
In [4]:
    
# parse faces_file
# Builds faces_dict: image sha1 -> number of entries in that image's face dict.
# Each line is a JSON object keyed by image sha1; the value's entries are
# presumably one per detected face -- TODO confirm against the face detector output.
faces_dict = {}
nb_faces_total = 0      # sum of face counts over all records read
nb_images_wface = 0     # number of records with at least one face
with open(faces_file, "rt") as faces:
    for line in faces:
        if not line.strip():
            # Tolerate blank lines instead of failing in json.loads.
            continue
        one_face_dict = json.loads(line)
        # Iterate over every key rather than taking keys()[0]: that indexing
        # only works on Python 2 and would silently drop any additional
        # images present on the same line.
        for img_sha1, img_faces in one_face_dict.items():
            nb_faces = len(img_faces)
            nb_faces_total += nb_faces
            if nb_faces > 0:
                nb_images_wface += 1
            faces_dict[img_sha1] = nb_faces
    
In [5]:
    
# Sanity checks: distinct image sha1s seen, total faces, images with >= 1 face.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(len(faces_dict))
print(nb_faces_total)
print(nb_images_wface)
    
    
In [6]:
    
# parse images_file
# Builds ads_images_dict: ad_id -> list of stored image URLs.
# The file is read as one JSON object per line.
ads_images_dict = {}
with open(images_file, "rt") as images:
    for line in images:
        if not line.strip():
            # Tolerate blank lines instead of failing in json.loads.
            continue
        one_image_dict = json.loads(line)
        ad_id_list = one_image_dict['obj_parent']
        # The stored URL (not 'obj_original_url') is kept, since it is what
        # gets looked up in url_sha1_dict when producing the output.
        img_url = one_image_dict['obj_stored_url']
        # 'obj_parent' may be a single ad id or a list of them; normalise to a list.
        if not isinstance(ad_id_list, list):
            ad_id_list = [ad_id_list]
        for ad_id in ad_id_list:
            ads_images_dict.setdefault(ad_id, []).append(img_url)
    
In [7]:
    
# Number of distinct ads that have at least one image record.
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(len(ads_images_dict))
    
    
In [ ]:
    
# produce output
# Writes one JSON line per ad to out_file:
#   {ad_id: {"images_count", "faces_total", "faces_min", "faces_max", "faces_median"}}
# An image contributes 0 faces when its URL is empty, when the URL is not in
# url_sha1_dict, or when its sha1 is not in faces_dict.
nb_ads = 0
with open(out_file, "wt") as out:
    for ad in ads_images_dict:
        ad_img_urls = ads_images_dict[ad]
        faces_count = []
        for one_img_url in ad_img_urls:
            url_sha1 = None
            if one_img_url:
                try:
                    url_sha1 = url_sha1_dict[one_img_url].strip()
                except (KeyError, TypeError):
                    # KeyError: URL absent from the url/sha1 table.
                    # TypeError: URL value is not hashable (e.g. a list).
                    # Both stay best-effort and count as 0 faces; any other
                    # error is no longer hidden by a bare except.
                    url_sha1 = None
            if url_sha1 is None:
                faces_count.append(0)
            elif url_sha1 in faces_dict:
                faces_count.append(faces_dict[url_sha1])
            else:
                print('Cannot find url_sha1 {} in faces_dict'.format(url_sha1))
                faces_count.append(0)
        # Built-in sum/min/max keep the values as plain Python ints; the numpy
        # equivalents return numpy scalars, which json.dumps cannot serialise
        # on Python 3.
        one_out = {}
        one_out[ad] = {}
        one_out[ad]["images_count"] = len(ad_img_urls)
        #'faces_total' as the total number of faces detected in all images of this ad
        one_out[ad]["faces_total"] = sum(faces_count)
        #'faces_min' as the minimum number of faces detected in any image of this ad
        one_out[ad]["faces_min"] = min(faces_count)
        #'faces_max' as the maximum number of faces detected in any image of this ad
        one_out[ad]["faces_max"] = max(faces_count)
        #'faces_median' as the median number of faces detected in any image of this ad
        # (int() truncates a fractional median, e.g. 1.5 -> 1)
        one_out[ad]["faces_median"] = int(np.median(faces_count))
        out.write(json.dumps(one_out) + '\n')
        nb_ads += 1