Face detection

The goal of this notebook is to produce ads statistics related to images and faces in the set of ads provided for the CP1 during the MEMEX Winter QPR 2017.

Inputs

  1. image_url_sha1.csv: mapping from images "obj_stored_url" to sha1
  2. faces.jl: faces detected in images, json line file with sha1 as key
  3. images.json: images documents, json line file. Need "obj_stored_url" and "obj_parent" from that file.

Outputs

  1. images_faces_stats.jl

key is 'ad_id'

values are:

  • 'images_count' as the number of image urls of this ad (counted even if the image is not accessible or readable)
  • 'faces_total' as the total number of faces detected in all images of this ad
  • 'faces_min' as the minimum number of faces detected in any image of this ad
  • 'faces_max' as the maximum number of faces detected in any image of this ad
  • 'faces_median' as the median number of faces detected per image of this ad (truncated to an integer)

In [1]:
import os
# Path configuration: every input and output file lives in `input_dir` and is
# named "<prefix>_<suffix>".
input_dir = "../data/"
# alternative dataset prefix: "test"
prefix = "train"
# inputs
image_url_sha1_file = os.path.join(input_dir, "{}_image_url_sha1.csv".format(prefix))
faces_file = os.path.join(input_dir, "{}_faces.jl".format(prefix))
images_file = os.path.join(input_dir, "{}_adjusted_images.json".format(prefix))
# output
out_file = os.path.join(input_dir, "{}_images_faces_stats.jl".format(prefix))

In [2]:
# Parse image_url_sha1_file into url_sha1_dict: image url -> sha1.
# Each non-blank line is expected to be "<url>,<sha1>".
url_sha1_dict = {}
with open(image_url_sha1_file,"rt") as img_url_sha1:
    for line in img_url_sha1:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing empty line)
            continue
        # Split on the LAST comma only: a url may itself contain commas,
        # whereas the sha1 field never does.
        url, sha1 = line.rsplit(',', 1)
        # Drop the trailing newline/whitespace so the value can be used
        # directly as a lookup key.
        url_sha1_dict[url] = sha1.strip()

In [3]:
# number of image urls for which a sha1 is known
print(len(url_sha1_dict))


315501

In [4]:
# Parse faces_file into faces_dict: image sha1 -> number of faces detected.
# Also accumulates the total number of faces and the number of images that
# have at least one face.
import json
faces_dict = {}
nb_faces_total = 0
nb_images_wface = 0
with open(faces_file, "rt") as faces:
    for line in faces:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing empty line)
            continue
        one_face_dict = json.loads(line)
        # Each line is expected to hold a single sha1 key; iterating over
        # every key means an empty object does not raise and no extra key
        # is silently dropped.
        for img_sha1, img_faces in one_face_dict.items():
            # presumably one entry per detected face -- verify against the
            # face detection output format
            nb_faces = len(img_faces)
            nb_faces_total += nb_faces
            if nb_faces > 0:
                nb_images_wface += 1
            faces_dict[img_sha1] = nb_faces

In [5]:
# number of images listed in the faces file
print(len(faces_dict))
# total number of faces over all images
print(nb_faces_total)
# number of images with at least one face
print(nb_images_wface)


103124
52712
45699

In [6]:
# Parse images_file into ads_images_dict: ad id -> list of stored image urls.
ads_images_dict = {}
import json
with open(images_file, "rt") as images_in:
    for raw_line in images_in:
        image_doc = json.loads(raw_line)
        parents = image_doc['obj_parent']
        stored_url = image_doc['obj_stored_url']
        # 'obj_parent' is either a single ad id or a list of ad ids;
        # normalise it to a list so both cases share the same loop.
        if type(parents) is not list:
            parents = [parents]
        for parent_ad in parents:
            # create the url list on first sight of this ad, then append
            ads_images_dict.setdefault(parent_ad, []).append(stored_url)

In [7]:
# number of distinct ads that have at least one image document
print(len(ads_images_dict))


122045

In [ ]:
# Produce the output: one JSON line per ad, keyed by the ad id, holding the
# number of image urls of the ad and statistics on the number of faces
# detected in its images.
import numpy as np
import json
nb_ads = 0
with open(out_file, "wt") as out:
    for ad in ads_images_dict:
        # One face count per image url of this ad. An image counts as 0 faces
        # when its url is empty, has no known sha1, or has no detection entry.
        faces_count = []
        for one_img_url in ads_images_dict[ad]:
            url_sha1 = None
            if one_img_url:
                try:
                    url_sha1 = url_sha1_dict[one_img_url].strip()
                except (KeyError, TypeError):
                    # KeyError: url has no known sha1.
                    # TypeError: url is not hashable (e.g. a list).
                    # Both are treated as "image unavailable"; any other
                    # exception is allowed to propagate.
                    url_sha1 = None
            nb_faces = 0
            if url_sha1 is not None:
                if url_sha1 in faces_dict:
                    nb_faces = faces_dict[url_sha1]
                else:
                    print('Cannot find url_sha1 {} in faces_dict'.format(url_sha1))
            faces_count.append(nb_faces)
        # Builtin sum/min/max are used instead of their numpy counterparts so
        # the values are plain Python ints, which json.dumps can always
        # serialise (numpy scalars are not serialisable everywhere).
        one_out = {}
        one_out[ad] = {}
        one_out[ad]["images_count"] = len(ads_images_dict[ad])
        #'faces_total' as the total number of faces detected in all images of this ad
        one_out[ad]["faces_total"] = sum(faces_count)
        #'faces_min' as the minimum number of faces detected in any image of this ad
        one_out[ad]["faces_min"] = min(faces_count)
        #'faces_max' as the maximum number of faces detected in any image of this ad
        one_out[ad]["faces_max"] = max(faces_count)
        #'faces_median' as the median number of faces per image, truncated to an integer
        one_out[ad]["faces_median"] = int(np.median(faces_count))
        out.write(json.dumps(one_out)+'\n')
        nb_ads += 1