The goal of this notebook is to detail how to interact with, and compute statistics on, the images associated with the set of ads provided for CP1 during the MEMEX Winter QPR 2017.
The data is posted on HDFS (see the Wiki page).
Additional data is available on the Wiki.
In [1]:
import csv
import json
import os

import numpy as np
In [2]:
# set some parameters
data_dir = "../data"
prefix = "test"
# Only the "train" prefix has its own ads file; any other value falls back
# to the unlabelled test file.
input_file = "train_adjusted.json" if prefix == "train" else "test_adjusted_unlabelled.json"
In [3]:
# Every input/output path lives in data_dir and is named after the prefix.
images_dir = os.path.join(data_dir, "{}_images".format(prefix))
url_sha1_file = os.path.join(data_dir, "{}_image_url_sha1.csv".format(prefix))
faces_file = os.path.join(data_dir, "{}_faces.jl".format(prefix))
stats_file = os.path.join(data_dir, "{}_images_faces_stats.jl".format(prefix))
images_file = os.path.join(data_dir, "{}_adjusted_images.json".format(prefix))
In [4]:
# parse faces_file
def parse_faces(faces_file):
    """Load the per-image face detections from a JSON-lines file.

    Each non-blank line must be a JSON object mapping an image sha1 to a
    dictionary of detections (one entry per detected face).

    Args:
        faces_file: path of the JSON-lines file to read.

    Returns:
        dict mapping each image sha1 to
        {'count': number of detections, 'detections': the detections as read}.

    Raises:
        ValueError: if a non-blank line is not valid JSON.
    """
    faces_dict = {}
    with open(faces_file, "rt") as faces:
        for line in faces:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            one_face_dict = json.loads(line)
            # Iterate over the items rather than indexing keys()[0]: this also
            # works where keys() is not a list, and no sha1 is silently
            # dropped if a line ever carries more than one.
            for img_sha1, detections in one_face_dict.items():
                faces_dict[img_sha1] = {
                    'count': len(detections),
                    'detections': detections,
                }
    return faces_dict
In [5]:
# Build the image sha1 -> face detections lookup from faces_file.
faces_dict = parse_faces(faces_file)
In [6]:
print len(faces_dict)
i = 3
print faces_dict.keys()[i], faces_dict[faces_dict.keys()[i]]
In [7]:
# parse images_file
def parse_images_file(images_file):
    """Group image URLs by the ad(s) they are attached to.

    The file is read one JSON object per line. In each object 'obj_parent'
    holds either a single ad id or a list of ad ids, and 'obj_stored_url'
    holds the image URL.

    Args:
        images_file: path of the JSON-lines file to read.

    Returns:
        dict mapping each ad id to the list of its image URLs, in file order.
    """
    urls_by_ad = {}
    with open(images_file, "rt") as records:
        for raw_record in records:
            record = json.loads(raw_record)
            parents = record['obj_parent']
            stored_url = record['obj_stored_url']
            # Wrap a lone parent so both shapes go through the same loop.
            if type(parents) is not list:
                parents = [parents]
            for parent in parents:
                urls_by_ad.setdefault(parent, []).append(stored_url)
    return urls_by_ad
In [8]:
# Build the ad id -> list of image URLs lookup from images_file.
ads_images_dict = parse_images_file(images_file)
In [9]:
print len(ads_images_dict)
print ads_images_dict.keys()[0],ads_images_dict[ads_images_dict.keys()[0]]
In [10]:
# parse image_url_sha1_file
def parse_url_sha1_file(url_sha1_file):
    """Load the image URL -> sha1 mapping from a two-column CSV file.

    Each non-blank line is expected to be '<url>,<sha1>'.

    Args:
        url_sha1_file: path of the CSV file to read.

    Returns:
        dict mapping each URL to its sha1. The sha1 is stored exactly as
        read, i.e. it still carries the line terminator; callers strip it.

    Raises:
        ValueError: if a non-blank line contains no comma.
    """
    url_sha1_dict = {}
    with open(url_sha1_file, "rt") as img_url_sha1:
        for line in img_url_sha1:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            # Split on the LAST comma only: a URL may itself contain commas,
            # whereas a hexadecimal sha1 cannot.
            url, sha1 = line.rsplit(',', 1)
            url_sha1_dict[url] = sha1
    return url_sha1_dict
In [11]:
# Build the image URL -> sha1 lookup from url_sha1_file.
url_sha1_dict = parse_url_sha1_file(url_sha1_file)
In [12]:
print len(url_sha1_dict)
print url_sha1_dict.keys()[0],url_sha1_dict[url_sha1_dict.keys()[0]]
In [137]:
import matplotlib
from numpy.random import randn
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
%matplotlib inline
In [136]:
def to_percent(y, position):
    """Format a tick value given as a fraction into a percentage label.

    Meant to be wrapped in a matplotlib FuncFormatter.

    Args:
        y: tick value; it is multiplied by 100 for display.
        position: tick position, required by the formatter signature but unused.

    Returns:
        The label as a string, ending with a percent sign.
    """
    label = str(100 * y)
    # When matplotlib renders text through LaTeX, the percent sign must be escaped.
    latex_enabled = matplotlib.rcParams['text.usetex'] is True
    return label + (r'$\%$' if latex_enabled else '%')
In [145]:
def get_ad_images(ad_id, ads_images_dict, url_sha1_dict, verbose=False):
    """Return the sha1s of the images attached to an ad.

    Args:
        ad_id: key of the ad in ads_images_dict.
        ads_images_dict: dict mapping ad id -> list of image URLs.
        url_sha1_dict: dict mapping image URL -> sha1.
        verbose: when True, report every URL whose sha1 cannot be resolved.

    Returns:
        List of sha1s (surrounding whitespace stripped), in URL order.
        Empty/None URLs and URLs without a known sha1 are skipped.

    Raises:
        KeyError: if ad_id is not in ads_images_dict.
    """
    images_sha1s = []
    for image_url in ads_images_dict[ad_id]:
        # Covers None as well as empty values.
        if not image_url:
            continue
        try:
            images_sha1s.append(url_sha1_dict[image_url.strip()].strip())
        # KeyError: unknown URL. AttributeError: URL or sha1 is not a string.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        except (KeyError, AttributeError):
            if verbose:
                print('Cannot find sha1 for: {}.'.format(image_url))
    return images_sha1s
In [146]:
# Analyze distribution of images in ads_images_dict:
# one entry per ad, counting its images whose sha1 could be resolved.
images_count = [
    len(get_ad_images(ad_id, ads_images_dict, url_sha1_dict))
    for ad_id in ads_images_dict
]
In [151]:
def print_stats(np_img_count):
print np.min(np_img_count), np.mean(np_img_count), np.max(np_img_count)
# Normed histogram seems to be broken,
# using weights as suggested in http://stackoverflow.com/questions/5498008/pylab-histdata-normed-1-normalization-seems-to-work-incorrect
weights = np.ones_like(np_img_count)/float(len(np_img_count))
res = plt.hist(np_img_count, bins=100, weights=weights)
print np.sum(res[0])
# Create the formatter using the function to_percent. This multiplies all the
# default labels by 100, making them all percentages
formatter = FuncFormatter(to_percent)
# Set the formatter
plt.gca().yaxis.set_major_formatter(formatter)
plt.show()
In [152]:
# Distribution, over all ads, of the number of images whose sha1 could be resolved.
print_stats(np.asarray(images_count))
In [149]:
def get_faces_images(images_sha1s, faces_dict):
    """Collect the face detections of a list of images.

    Args:
        images_sha1s: iterable of image sha1s.
        faces_dict: dict mapping sha1 -> {'count': int, 'detections': dict},
            where each detection has a comma-separated 'bbox' string and a 'score'.

    Returns:
        dict mapping each requested sha1 to a list with one entry per
        detection: the bbox values as floats followed by the score as a float.
        Images that are unknown or have no detection map to an empty list.
    """
    faces_out = {}
    for sha1 in images_sha1s:
        # A plain lookup with a default replaces the former bare `except:`,
        # which hid every error and not just a missing image.
        tmp_faces = faces_dict.get(sha1)
        if tmp_faces is None or tmp_faces['count'] == 0:
            faces_out[sha1] = []
            continue
        bboxes = []
        for detection in tmp_faces['detections'].values():
            bbox = [float(x) for x in detection['bbox'].split(',')]
            # The score is appended after the bbox values.
            bbox.append(float(detection['score']))
            bboxes.append(bbox)
        faces_out[sha1] = bboxes
    return faces_out
In [180]:
def show_faces(faces, images_dir):
    """Display every image that has detections, with the detections drawn on it.

    Args:
        faces: dict mapping image sha1 -> list of detections (as accepted by
            draw_face_bbox); images with an empty list are skipped.
        images_dir: root folder passed to open_image.

    Side effects:
        Opens the images through open_image, draws on them through
        draw_face_bbox and renders them with IPython's display. Prints a
        message instead when no image has any detection.
    """
    # The `%matplotlib inline` magic that used to sit here was dropped: it is
    # IPython-only syntax inside a function body, it ran again on every call,
    # and rendering the images with display() does not use matplotlib.
    from IPython.display import display
    imgs = []
    for sha1, bboxes in faces.items():
        if bboxes:
            img = open_image(sha1, images_dir)
            draw_face_bbox(img, bboxes)
            imgs.append(img)
    if not imgs:
        print('No face images')
        # Nothing to render.
        return
    display(*imgs)
In [150]:
# For each ad, compute the fraction of its images with at least one detected face.
faces_in_images_percent = []
for ad_id in ads_images_dict:
    images_sha1s = get_ad_images(ad_id, ads_images_dict, url_sha1_dict)
    faces_images = get_faces_images(images_sha1s, faces_dict)
    # Ads without any resolved image are left out of the distribution.
    if not faces_images:
        continue
    # Despite its name, this counts images that have faces, not faces.
    nb_faces = sum(1 for sha1 in faces_images if faces_images[sha1])
    faces_in_images_percent.append(float(nb_faces)/len(faces_images))
In [154]:
# Distribution, over the ads with at least one resolved image, of the
# fraction of images that have at least one detected face.
np_faces_in_images_percent = np.asarray(faces_in_images_percent)
print_stats(np_faces_in_images_percent)
In [163]:
no_faces = np.where(np_faces_in_images_percent==0.0)
print no_faces[0].shape
print np_faces_in_images_percent.shape
percent_noface = float(no_faces[0].shape[0])/np_faces_in_images_percent.shape[0]
print 1-percent_noface
In [187]:
# get all faces scores from each ad
# faces_scores[k] is the score of all_faces[k]: both lists must stay aligned,
# since indices computed on the scores are later used to index all_faces.
faces_scores = []
all_faces = []
for ad_id in ads_images_dict:
    images_sha1s = get_ad_images(ad_id, ads_images_dict, url_sha1_dict)
    faces_images = get_faces_images(images_sha1s, faces_dict)
    # Empty results simply iterate zero times, so no explicit guard is needed.
    for sha1, bboxes in faces_images.items():
        for one_face in bboxes:
            all_faces.append([sha1, one_face])
            # The score is the fifth element of a detection.
            faces_scores.append(float(one_face[4]))
In [188]:
# Distribution of the detection scores over all collected faces.
np_faces_scores = np.asarray(faces_scores)
# Pass the array that was just built rather than the raw list, as the other
# print_stats calls of this notebook do.
print_stats(np_faces_scores)
In [189]:
low_scores_faces = np.where(np_faces_scores<0.90)[0]
print float(len(low_scores_faces))/len(np_faces_scores)
very_low_scores_faces = np.where(np_faces_scores<0.80)[0]
print float(len(very_low_scores_faces))/len(np_faces_scores)
#all_faces
print len(np_faces_scores)
In [ ]:
nb_faces_to_show = 10
np.random.shuffle(very_low_scores_faces)
faces_to_show = [all_faces[x] for x in very_low_scores_faces[:nb_faces_to_show]]
print faces_to_show
for face_id, face in faces_to_show:
print face_id, face
face_dict = {}
face_dict[face_id] = [face]
show_faces(face_dict, images_dir)
In [57]:
def get_fnt(img, txt):
    """Find the smallest Arial size at which txt spans a set share of the image width.

    Args:
        img: image whose size[0] (width) is the reference.
        txt: text to be measured.

    Returns:
        Tuple (font, text width), where font is the first size, starting
        from 2, at which the text is at least 20% of the image width.
    """
    from PIL import ImageFont
    # portion of image width you want text width to be
    img_fraction = 0.20
    target_width = img_fraction*img.size[0]
    fontsize = 2
    while True:
        font = ImageFont.truetype("arial.ttf", fontsize)
        text_width = font.getsize(txt)[0]
        # Stop at the first size that reaches the target width.
        if not text_width < target_width:
            return font, text_width
        fontsize += 1
In [63]:
def draw_face_bbox(img, bboxes, width=4):
    """Draw each detection on img, in place: a green outline plus its score if any.

    Args:
        img: image handed to PIL's ImageDraw.Draw; it is modified directly.
        bboxes: iterable of detections, each a sequence of 4 coordinates
            optionally followed by a score (presumably left, top, right,
            bottom -- confirm against the face detector output).
        width: number of concentric rectangles drawn for each outline.
    """
    from PIL import ImageDraw
    import numpy as np
    outline_color = (0, 255, 0)
    canvas = ImageDraw.Draw(img)
    for bbox in bboxes:
        half_width = width/2
        # The thick outline is made of `width` rectangles, each one unit
        # larger than the previous one on every side.
        for offset in range(width):
            top_left = (int(np.round(bbox[0] + half_width - offset)),
                        int(np.round(bbox[1] + half_width - offset)))
            bottom_right = (int(np.round(bbox[2] - half_width + offset)),
                            int(np.round(bbox[3] - half_width + offset)))
            canvas.rectangle((top_left, bottom_right), outline=outline_color)
        # print score?
        # A fifth element is the detection score: its first 5 characters are
        # written at the top of the box, horizontally centred.
        if len(bbox) == 5:
            label = str(bbox[4])[:5]
            fnt, text_size = get_fnt(img, label)
            text_x = np.round((bbox[0]+bbox[2])/2-text_size/2)
            text_y = np.round(bbox[1])
            canvas.text((text_x, text_y), label, font=fnt, fill=(255,255,255,64))
In [64]:
def open_image(sha1, images_dir):
    """Open the image identified by sha1 with PIL.

    Args:
        sha1: image sha1, also used as the file name.
        images_dir: root folder of the images.

    Returns:
        The object returned by PIL's Image.open.
    """
    from PIL import Image
    # Images sit in a sub-folder named after the first 3 characters of their sha1.
    image_path = os.path.join(images_dir, sha1[:3], sha1)
    return Image.open(image_path)
In [240]:
#face images of ad '84FC37A4E38F7DE2B9FCAAB902332ED60A344B8DF90893A5A8BE3FC1139FCD5A' are blurred but detected
# image '20893a926fbf50d1a5994f70ec64dbf33dd67e2a' highly pixelated
# male strippers '20E4597A6DA11BC07BB7578FFFCE07027F885AF02265FD663C0911D2699E0A79'
In [ ]:
all_ads_id = range(len(ads_images_dict.keys()))
import numpy as np
np.random.shuffle(all_ads_id)
ad_id = ads_images_dict.keys()[all_ads_id[0]]
print ad_id
images_sha1s = get_ad_images(ad_id, ads_images_dict, url_sha1_dict)
print images_sha1s
faces = get_faces_images(images_sha1s, faces_dict)
print faces
show_faces(faces, images_dir)
In [ ]: