Loading ad clusters and image SHA1s

Loading cluster, ad, and SHA1 relationships from the base file.

Expected files

  • Output CSV from the compute_many_descriptors script, listing the images that were actually processed
  • CSV mapping the [cluster ID, ad ID, image SHA1, label] associations
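
For illustration, a single row of the mapping CSV might look like the following (hypothetical IDs; the fourth column is the 1/0 positive/negative label consumed below):

    102,84321,da39a3ee5e6b4b0d3255bfef95601890afd80709,1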

Filtering based on computed descriptors

Some images were not valid (e.g. they were GIFs or HTML pages) and should not be considered. As a result, some ads may be left with no child images and should likewise not be considered, and the same applies to clusters left with no child ads.

Maps and files saved from this point on have been filtered to what was actually computed.
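
Below is a minimal sketch (not part of this pipeline) of the filtering cascade described above, assuming hypothetical maps ``ad2shas`` (ad ID -> SHA1 set) and ``cluster2ads`` (cluster ID -> ad ID set), plus a ``computed_shas`` set of successfully processed images:

In [ ]:
# Sketch only: cascade invalid-image filtering up through ads and clusters.
def filter_by_computed(cluster2ads, ad2shas, computed_shas):
    # Keep only image SHA1s whose descriptors were actually computed.
    ad2shas = dict((ad, shas & computed_shas)
                   for ad, shas in ad2shas.iteritems())
    # Drop ads left with no child images.
    ad2shas = dict((ad, shas) for ad, shas in ad2shas.iteritems() if shas)
    # Restrict clusters to surviving ads, dropping clusters with no child ads.
    cluster2ads = dict((c, ads & set(ad2shas))
                       for c, ads in cluster2ads.iteritems())
    cluster2ads = dict((c, ads) for c, ads in cluster2ads.iteritems() if ads)
    return cluster2ads, ad2shas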


In [ ]:
__depends__ = ['map.cp1_data.csv']
__dest__ = [
    'positive.cluster2ads.pickle',
    'positive.cluster2shas.pickle',
    'positive.ad2shas.pickle',
    'positive.sha2ads.pickle',
    'negative.cluster2ads.pickle',
    'negative.cluster2shas.pickle',
    'negative.ad2shas.pickle',
    'negative.sha2ads.pickle',
    'train_pos_clusters.pickle',
    'train_pos_ads.pickle',
    'train_pos_shas.pickle',
    'train_neg_clusters.pickle',
    'train_neg_ads.pickle',
    'train_neg_shas.pickle',
    'test_pos_clusters.pickle',
    'test_pos_ads.pickle',
    'test_pos_shas.pickle',
    'test_neg_clusters.pickle',
    'test_neg_ads.pickle',
    'test_neg_shas.pickle',
    'test_eval_gt.jl',
    'test_eval_data.csv',
]

In [ ]:
# cluster/ad/SHA1/label relationship for each image to be considered
CP1_DATA_CSV = 'map.cp1_data.csv'
# Output files
# - positive/negative sub-maps
POS_CLUSTER_ID_TO_AD_IDS = 'positive.cluster2ads.pickle'
POS_CLUSTER_ID_TO_SHAS   = 'positive.cluster2shas.pickle'
POS_AD_ID_TO_SHAS        = 'positive.ad2shas.pickle'
POS_SHA_TO_AD_IDS        = 'positive.sha2ads.pickle'
NEG_CLUSTER_ID_TO_AD_IDS = 'negative.cluster2ads.pickle'
NEG_CLUSTER_ID_TO_SHAS   = 'negative.cluster2shas.pickle'
NEG_AD_ID_TO_SHAS        = 'negative.ad2shas.pickle'
NEG_SHA_TO_AD_IDS        = 'negative.sha2ads.pickle'
# - positive/negative ID sets
TRAIN_POS_CLUSTER_IDS = 'train_pos_clusters.pickle'
TRAIN_POS_AD_IDS      = 'train_pos_ads.pickle'
TRAIN_POS_SHA1S       = 'train_pos_shas.pickle'
TRAIN_NEG_CLUSTER_IDS = 'train_neg_clusters.pickle'
TRAIN_NEG_AD_IDS      = 'train_neg_ads.pickle'
TRAIN_NEG_SHA1S       = 'train_neg_shas.pickle'
TEST_POS_CLUSTER_IDS  = 'test_pos_clusters.pickle'
TEST_POS_AD_IDS       = 'test_pos_ads.pickle'
TEST_POS_SHA1S        = 'test_pos_shas.pickle'
TEST_NEG_CLUSTER_IDS  = 'test_neg_clusters.pickle'
TEST_NEG_AD_IDS       = 'test_neg_ads.pickle'
TEST_NEG_SHA1S        = 'test_neg_shas.pickle'
#
TEST_EVAL_GT_JL = 'test_eval_gt.jl'
TEST_EVAL_DATA_CSV = 'test_eval_data.csv'

In [ ]:
import collections
import csv

import cPickle as pickle

In [ ]:
pos_computed_shas = set()
neg_computed_shas = set()

pos_cluster2ads = collections.defaultdict(set)
pos_cluster2shas = collections.defaultdict(set)
pos_ad2shas = collections.defaultdict(set)
pos_sha2ads = collections.defaultdict(set)

neg_cluster2ads = collections.defaultdict(set)
neg_cluster2shas = collections.defaultdict(set)
neg_ad2shas = collections.defaultdict(set)
neg_sha2ads = collections.defaultdict(set)

# SHA1 values of images actually computable
print "Loading cp1_data csv"
with open(CP1_DATA_CSV) as f:
    reader = csv.reader(f)
    for r in reader:
        c_id, ad_id, sha, label = r
        label = int(label)

        if label == 1:  # positive
            pos_computed_shas.add(sha)
            pos_cluster2ads[c_id].add(ad_id)
            pos_cluster2shas[c_id].add(sha)
            pos_ad2shas[ad_id].add(sha)
            pos_sha2ads[sha].add(ad_id)
        elif label == 0:
            neg_computed_shas.add(sha)
            neg_cluster2ads[c_id].add(ad_id)
            neg_cluster2shas[c_id].add(sha)
            neg_ad2shas[ad_id].add(sha)
            neg_sha2ads[sha].add(ad_id)
        else:
            raise ValueError("Got unexpected truth label: %s" % label)
        
print "Done"

In [ ]:
len(pos_computed_shas), len(neg_computed_shas)

In [ ]:
# Check that the negative example cluster IDs are distinct from positive example cluster IDs
#
# If negative cluster IDs intersect positive cluster IDs,
# re-assign negative cluster IDs by increasing by max(pos_cluster_ids)
pos_cluster_ids = set(pos_cluster2ads)
neg_cluster_ids = set(neg_cluster2ads)
# if there is intersection....
if pos_cluster_ids & neg_cluster_ids:
    print "Reassigning cluster IDs"
    # Cluster IDs were read from the CSV as strings, so offset numerically.
    offset = max(int(c) for c in pos_cluster_ids)
    new_neg_cluster2ads  = collections.defaultdict(set)
    new_neg_cluster2shas = collections.defaultdict(set)
    
    neg_cluster_id_old2new = {}
    
    for cid in sorted(neg_cluster_ids, key=int, reverse=True):
        new_cid = str(int(cid) + offset)
        print "- %s -> %s" % (cid, new_cid)
        neg_cluster_id_old2new[cid] = new_cid
        
        new_neg_cluster2ads[new_cid] = neg_cluster2ads[cid]
        new_neg_cluster2shas[new_cid] = neg_cluster2shas[cid]
    
    neg_cluster_ids = set(new_neg_cluster2ads)
    neg_cluster2ads = new_neg_cluster2ads
    neg_cluster2shas = new_neg_cluster2shas
    del new_neg_cluster2ads, new_neg_cluster2shas
    
    with open('negative.cluster_id_reassignment.old2new.pickle', 'wb') as f:
        print "Saving reassignment mapping"
        pickle.dump(neg_cluster_id_old2new, f, -1)
    print "Done"

In [ ]:
# The SHA1s collected should now be a subset of the SHA1s computed
print len( {s for c, shas in pos_cluster2shas.iteritems() for s in shas}.difference(pos_computed_shas) )
print len( {s for c, shas in neg_cluster2shas.iteritems() for s in shas}.difference(neg_computed_shas) )

# Number of intersecting SHA1s between the positive and negative sets
print len(set(pos_sha2ads) & set(neg_sha2ads))

In [ ]:
# Saving clusters
import json

def convert_dict(a):
    """Convert a dict of sets into a dict of lists (JSON-serializable)."""
    return dict( (k, list(v)) for k, v in a.iteritems() )

def pickle_dump(obj, fp):
    with open(fp, 'wb') as f:
        pickle.dump(obj, f, -1)

json_params = {"indent": 2, "separators": (',', ': '), "sort_keys": True}


# Saving positive info
print "Saving POS cluster->ads"
pickle_dump(pos_cluster2ads, POS_CLUSTER_ID_TO_AD_IDS)

print "Saving POS cluster->image shas"
pickle_dump(pos_cluster2shas, POS_CLUSTER_ID_TO_SHAS)

print "Saving POS ad->image shas"
pickle_dump(pos_ad2shas, POS_AD_ID_TO_SHAS)
    
print "Saving POS SHA1->ads"
pickle_dump(pos_sha2ads, POS_SHA_TO_AD_IDS)


# Saving negative info
print "Saving NEG cluster->ads"
pickle_dump(neg_cluster2ads, NEG_CLUSTER_ID_TO_AD_IDS)

print "Saving NEG cluster->image shas"
pickle_dump(neg_cluster2shas, NEG_CLUSTER_ID_TO_SHAS)

print "Saving NEG ad->image shas"
pickle_dump(neg_ad2shas, NEG_AD_ID_TO_SHAS)

print "Saving NEG SHA1->ads"
pickle_dump(neg_sha2ads, NEG_SHA_TO_AD_IDS)


print "Done"

Creating Train and Test sets

Separating data based on Clusters

Clusters are ordered by their total number of child images, which lets us build train and test sets with approximately the same relative size distributions.

The train set ends up with so many images because one cluster has ~30k child images.

We create a "train" set with ~75% of the images and a "test" set with the remaining ~25%, assigning every fifth cluster in the size ordering to the test set.
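
As a toy illustration of the modulo-based assignment used below (hypothetical cluster IDs):

In [ ]:
# Toy example: with split_int = 5, every fifth cluster in the size-ordered
# list (indices 4, 9, 14, ...) lands in test; the rest land in train.
demo_clusters = ['c%02d' % i for i in range(10)]
print [c for i, c in enumerate(demo_clusters) if i % 5 != 4]  # train (8 of 10)
print [c for i, c in enumerate(demo_clusters) if i % 5 == 4]  # test (2 of 10)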


In [ ]:
pos_clusters_ordered = sorted( pos_cluster2shas,
                               key=lambda c: ( len(pos_cluster2shas[c]), c ),
                               reverse=True )
neg_clusters_ordered = sorted( neg_cluster2shas,
                               key=lambda c: ( len(neg_cluster2shas[c]), c ),
                               reverse=True )

split_int = 5
test_split = split_int - 1

# Image classifier training clusters/ads/shas
train_pos_clusters = { c   for i, c in enumerate(pos_clusters_ordered) if i % split_int != test_split }
train_neg_clusters = { c   for i, c in enumerate(neg_clusters_ordered) if i % split_int != test_split }
train_pos_ads      = { ad  for c in train_pos_clusters for ad  in pos_cluster2ads[c] }
train_neg_ads      = { ad  for c in train_neg_clusters for ad  in neg_cluster2ads[c] }
train_pos_shas     = { sha for c in train_pos_clusters for sha in pos_cluster2shas[c] }
train_neg_shas     = { sha for c in train_neg_clusters for sha in neg_cluster2shas[c] }

# Test/Validation clusters/ads/shas
test_pos_clusters   = { c for i, c in enumerate(pos_clusters_ordered) if i % split_int == test_split }
test_neg_clusters   = { c for i, c in enumerate(neg_clusters_ordered) if i % split_int == test_split }
test_pos_ads        = { ad  for c in test_pos_clusters for ad  in pos_cluster2ads[c] }
test_neg_ads        = { ad  for c in test_neg_clusters for ad  in neg_cluster2ads[c] }
test_pos_shas       = { sha for c in test_pos_clusters for sha in pos_cluster2shas[c] }
test_neg_shas       = { sha for c in test_neg_clusters for sha in neg_cluster2shas[c] }

In [ ]:
print "Train 1 (image)"
print "  (pos)| clusters:", len(train_pos_clusters)
print "       | ads:",      len(train_pos_ads)
print "       | images:",   len(train_pos_shas)
print
print "  (neg)| clusters:", len(train_neg_clusters)
print "       | ads:",      len(train_neg_ads)
print "       | images:",   len(train_neg_shas)
print
print "Test"
print "  (pos)| clusters:", len(test_pos_clusters)
print "       | ads:",      len(test_pos_ads)
print "       | images:",   len(test_pos_shas)
print
print "  (neg)| clusters:", len(test_neg_clusters)
print "       | ads:",      len(test_neg_ads)
print "       | images:",   len(test_neg_shas)

In [ ]:
# Train - for image classifier
pickle_dump(train_pos_clusters, TRAIN_POS_CLUSTER_IDS)
pickle_dump(train_pos_ads, TRAIN_POS_AD_IDS)
pickle_dump(train_pos_shas, TRAIN_POS_SHA1S)

pickle_dump(train_neg_clusters, TRAIN_NEG_CLUSTER_IDS)
pickle_dump(train_neg_ads, TRAIN_NEG_AD_IDS)
pickle_dump(train_neg_shas, TRAIN_NEG_SHA1S)

# Test - for image/ad/cluster classifier validation
pickle_dump(test_pos_clusters, TEST_POS_CLUSTER_IDS)
pickle_dump(test_pos_ads, TEST_POS_AD_IDS)
pickle_dump(test_pos_shas, TEST_POS_SHA1S)

pickle_dump(test_neg_clusters, TEST_NEG_CLUSTER_IDS)
pickle_dump(test_neg_ads, TEST_NEG_AD_IDS)
pickle_dump(test_neg_shas, TEST_NEG_SHA1S)

In [ ]:
# Creating ground-truth json-lines file for the test set, for use with the MEMEX-provided evaluation script
# format: {"cluster_id": "<number>", "class": <int>}
# Class value should be:
# - 1 for positive
# - 0 for negative
with open(TEST_EVAL_GT_JL, 'w') as f:
    for c in sorted(test_pos_clusters):
        f.write( json.dumps({"cluster_id": str(c), "class": 1}) + "\n" )
    for c in sorted(test_neg_clusters):
        f.write( json.dumps({"cluster_id": str(c), "class": 0}) + "\n" )
        
# test-set equivalent of input ``map.cp1_data.csv`` file
test_cluster_ids = test_pos_clusters | test_neg_clusters
with open(TEST_EVAL_DATA_CSV, 'w') as f_out:
    writer = csv.writer(f_out)
    
    with open(CP1_DATA_CSV) as f_in:
        for r in csv.reader(f_in):
            if r[0] in test_cluster_ids:
                writer.writerow(r)

SHA1 Intersection Investigation

Some images have been found to be shared across ads in different clusters. Since clusters are supposed to represent distinctly separate entities or relationships, this shows that the clusters either are not linkable via multimedia alone, were incorrectly clustered, or were actively split up on purpose. For the purposes of our approach (image-based classification), this means that the same images will potentially show up in both or all of the train/test/evaluation data sets.

Traditionally, the presence of the same or similar images in both train and test sets leads to faulty evaluation: the classifier handles data it was trained on more easily, yielding artificially higher scores. That applies here too, in that train/test scores will probably be inflated by images present on both sides. On the other hand, if an image's shared presence is a strong positive indicator of a new HT ad, then its repeated positive recognition is a boon. Still, we may want to measure how the classifier performs on abstract features alone, excluding repeated imagery.


In [ ]:
train_test_intersection = train_pos_shas & test_pos_shas

print len(train_test_intersection)
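
If we want to measure performance driven by abstract features alone, one option (a sketch, not something this notebook currently saves) is to drop the shared imagery from the test side before evaluation:

In [ ]:
# Sketch only: intersection-free test SHA1 sets, so evaluation scores are
# not inflated by imagery also present in the training set.
test_pos_shas_unique = test_pos_shas - train_pos_shas
test_neg_shas_unique = test_neg_shas - train_neg_shas
print len(test_pos_shas_unique), len(test_neg_shas_unique)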