The goal of this notebook is to produce per-ad statistics about the images and detected faces in the set of ads provided for CP1 during the MEMEX Winter QPR 2017.
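Each output line is a JSON object whose key is the 'ad_id' and whose values are:
- 'images_count': the number of images attached to this ad
- 'faces_total': the total number of faces detected in all images of this ad
- 'faces_min': the minimum number of faces detected in any image of this ad
- 'faces_max': the maximum number of faces detected in any image of this ad
- 'faces_median': the median number of faces detected in any image of this ad (truncated to an integer)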
In [1]:
import os
# Path configuration: all inputs and the output live in `input_dir` and share
# the same split prefix.
input_dir = "../data/"
# Split to process; set to "test" to run on the test files instead.
prefix = "train"


def _split_path(suffix):
    """Return the path of the file named ``prefix + suffix`` inside ``input_dir``."""
    return os.path.join(input_dir, prefix + suffix)


image_url_sha1_file = _split_path("_image_url_sha1.csv")
faces_file = _split_path("_faces.jl")
images_file = _split_path("_adjusted_images.json")
out_file = _split_path("_images_faces_stats.jl")
In [2]:
# Parse image_url_sha1_file into a lookup: image URL -> SHA1 of the image.
# Each non-blank line is expected to be "<url>,<sha1>".
url_sha1_dict = {}
with open(image_url_sha1_file, "rt") as img_url_sha1:
    for line in img_url_sha1:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        # Split on the LAST comma only: URLs may legitimately contain commas,
        # which made a plain split(',') fail with "too many values to unpack".
        url, sha1 = line.rsplit(',', 1)
        # Store the SHA1 without the trailing newline so it can be used
        # directly as a key; stripping it again later is harmless.
        url_sha1_dict[url] = sha1.strip()
In [3]:
# Number of distinct image URLs with a known SHA1.
# Parenthesised single-argument form prints the same on Python 2 and 3.
print(len(url_sha1_dict))
In [4]:
# Parse faces_file: one JSON object per line, keyed by an image SHA1 whose
# value is a dict; the size of that dict is taken as the number of faces.
import json

faces_dict = {}        # image SHA1 -> number of faces in that image
nb_faces_total = 0     # faces summed over all parsed images
nb_images_wface = 0    # images with at least one face
with open(faces_file, "rt") as faces:
    for line in faces:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        one_face_dict = json.loads(line)
        # An empty record carries no image SHA1; nothing to count.
        if not one_face_dict:
            continue
        # Only the first key of each record is used, as before.
        # next(iter(...)) works on Python 2 and 3, unlike keys()[0].
        img_sha1 = next(iter(one_face_dict))
        nb_faces = len(one_face_dict[img_sha1])
        nb_faces_total += nb_faces
        if nb_faces > 0:
            nb_images_wface += 1
        # Recorded for every image, including those with zero faces.
        faces_dict[img_sha1] = nb_faces
In [5]:
# Summary of the faces file: images parsed, faces in total, images with >= 1 face.
# Parenthesised single-argument form prints the same on Python 2 and 3.
print(len(faces_dict))
print(nb_faces_total)
print(nb_images_wface)
In [6]:
# Parse images_file (one JSON object per line) and group the stored image
# URLs by the ad they belong to: ad_id -> list of image URLs.
import json

ads_images_dict = {}
with open(images_file, "rt") as images:
    for raw_record in images:
        image_record = json.loads(raw_record)
        parents = image_record['obj_parent']
        # The stored URL identifies the image here ('obj_original_url' was the
        # alternative field considered).
        stored_url = image_record['obj_stored_url']
        # 'obj_parent' is either a single ad id or a list of ad ids.
        parent_ids = parents if type(parents) is list else [parents]
        for parent_id in parent_ids:
            ads_images_dict.setdefault(parent_id, []).append(stored_url)
In [7]:
# Number of distinct ads that have at least one image record.
# Parenthesised single-argument form prints the same on Python 2 and 3.
print(len(ads_images_dict))
In [ ]:
# Produce the output: one JSON line per ad, {ad_id: {<image/face statistics>}}.
import numpy as np
import json

nb_ads = 0  # number of ads written to out_file
with open(out_file, "wt") as out:
    for ad in ads_images_dict:
        # Number of faces for each image of this ad. An image counts as 0
        # faces when its URL is empty, has no known SHA1, or its SHA1 has no
        # entry in faces_dict.
        faces_count = []
        for one_img_url in ads_images_dict[ad]:
            nb_faces_img = 0
            if one_img_url:
                try:
                    url_sha1 = url_sha1_dict[one_img_url].strip()
                except (KeyError, TypeError):
                    # Unknown URL (or an unhashable value): no SHA1 available.
                    url_sha1 = None
                if url_sha1 is not None:
                    try:
                        nb_faces_img = faces_dict[url_sha1]
                    except KeyError:
                        print('Cannot find url_sha1 {} in faces_dict'.format(url_sha1))
            faces_count.append(nb_faces_img)

        # Every value is converted to a plain Python int: numpy scalars (as
        # returned by np.sum/np.min/np.max) are not JSON serializable on
        # Python 3.
        one_out = {}
        one_out[ad] = {}
        one_out[ad]["images_count"] = len(ads_images_dict[ad])
        # 'faces_total' as the total number of faces detected in all images of this ad
        one_out[ad]["faces_total"] = int(sum(faces_count))
        # 'faces_min' as the minimum number of faces detected in any image of this ad
        one_out[ad]["faces_min"] = int(min(faces_count))
        # 'faces_max' as the maximum number of faces detected in any image of this ad
        one_out[ad]["faces_max"] = int(max(faces_count))
        # 'faces_median' as the median number of faces detected in any image of
        # this ad, truncated to an integer
        one_out[ad]["faces_median"] = int(np.median(faces_count))
        out.write(json.dumps(one_out) + '\n')
        nb_ads += 1