In [ ]:
import copy
import json
from collections import defaultdict
from collections import OrderedDict
import pickle
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.lancaster import LancasterStemmer

# Module-level Lancaster stemmer instance. The import and this assignment
# had been fused onto a single line, which is not valid Python.
st = LancasterStemmer()
import pattern
from pattern.en import singularize
In [ ]:
def isplural(pluralForm):
    """Report whether ``pluralForm`` changes when it is singularized.

    Args:
        pluralForm: The word (or phrase) passed to ``singularize``.

    Returns:
        tuple: ``(plural, singularForm)`` where ``singularForm`` is the value
        returned by ``singularize(pluralForm)`` and ``plural`` is True when
        that value differs from the input.
    """
    singularForm = singularize(pluralForm)
    # Compare by value. The identity test (`is not`) only checks whether both
    # names refer to the same object, not whether the text is the same.
    plural = pluralForm != singularForm
    return plural, singularForm
'''isp, singularForm = isplural(pluralForm)
print pluralForm, singularForm, isp'''
In [ ]:
def bigrams(check):
    """Return the adjacent word pairs of ``check`` as space-joined strings.

    Args:
        check: Text to tokenize on whitespace.

    Returns:
        list: One ``"first second"`` string per pair of neighbouring tokens,
        in order. Empty when ``check`` has fewer than two tokens.
    """
    # Tokenize once; zip() stops at the shorter sequence, so pairing the
    # tokens with the same list shifted by one yields every neighbour pair.
    tokens = check.split()
    return [" ".join(pair) for pair in zip(tokens, tokens[1:])]
In [ ]:
# Parse the multiple-choice question file into `data`.
with open("../Questions/MultipleChoice_mscoco_val2014_questions.json") as f:
    data = json.loads(f.read())
In [ ]:
# One integer image id per question entry (duplicates are kept here).
imgids = [int(each["image_id"]) for each in data['questions']]
In [ ]:
# Drop repeated image ids, then report how many distinct ones remain.
imgids = list(set(imgids))
print(len(imgids))
In [ ]:
# Assign each distinct image id its position in `imgids`.
imgid_index = {imgid: index for index, imgid in enumerate(imgids)}
print(len(imgid_index))
In [ ]:
# Persist the image-id -> index mapping. The context manager guarantees the
# file is flushed and closed, which a bare open() passed to dump() does not.
with open("coco_vectors/imgid_index.p", "wb") as index_file:
    pickle.dump(imgid_index, index_file)
In [ ]:
# For every image id, concatenate the lower-cased text of all its questions
# (with leading/trailing "?" removed) and collect the raw question tokens
# into `vocab`.
questions = defaultdict(str)
vocab = set()
for each in data['questions']:
    i = int(each["image_id"])
    q = each["question"]
    vocab.update(q.split())
    qid = int(each["question_id"])
    cleaned = str(q).lower().strip("?")
    if i in questions:
        questions[i] = str(questions[i]) + " " + cleaned
    else:
        questions[i] = cleaned
print(len(questions))  #82783 number of images
print(questions[108677])
print(len(vocab))
#22225 train vocab
1 'person'} {u'supercategory': u'vehicle', u'id': 2, u'name': u'bicycle'} {u'supercategory': u'vehicle', u'id': 3, u'name': u'car'} {u'supercategory': u'vehicle', u'id': 4, u'name': u'motorcycle'} {u'supercategory': u'vehicle', u'id': 5, u'name': u'airplane'} {u'supercategory': u'vehicle', u'id': 6, u'name': u'bus'} {u'supercategory': u'vehicle', u'id': 7, u'name': u'train'} {u'supercategory': u'vehicle', u'id': 8, u'name': u'truck'} {u'supercategory': u'vehicle', u'id': 9, u'name': u'boat'} {u'supercategory': u'outdoor', u'id': 10, u'name': u'traffic light'} {u'supercategory': u'outdoor', u'id': 11, u'name': u'fire hydrant'} {u'supercategory': u'outdoor', u'id': 13, u'name': u'stop sign'} {u'supercategory': u'outdoor', u'id': 14, u'name': u'parking meter'} {u'supercategory': u'outdoor', u'id': 15, u'name': u'bench'} {u'supercategory': u'animal', u'id': 16, u'name': u'bird'} {u'supercategory': u'animal', u'id': 17, u'name': u'cat'} {u'supercategory': u'animal', u'id': 18, u'name': u'dog'} {u'supercategory': u'animal', u'id': 19, u'name': u'horse'} {u'supercategory': u'animal', u'id': 20, u'name': u'sheep'} {u'supercategory': u'animal', u'id': 21, u'name': u'cow'} {u'supercategory': u'animal', u'id': 22, u'name': u'elephant'} {u'supercategory': u'animal', u'id': 23, u'name': u'bear'} {u'supercategory': u'animal', u'id': 24, u'name': u'zebra'} {u'supercategory': u'animal', u'id': 25, u'name': u'giraffe'} {u'supercategory': u'accessory', u'id': 27, u'name': u'backpack'} {u'supercategory': u'accessory', u'id': 28, u'name': u'umbrella'} {u'supercategory': u'accessory', u'id': 31, u'name': u'handbag'} {u'supercategory': u'accessory', u'id': 32, u'name': u'tie'} {u'supercategory': u'accessory', u'id': 33, u'name': u'suitcase'} {u'supercategory': u'sports', u'id': 34, u'name': u'frisbee'} {u'supercategory': u'sports', u'id': 35, u'name': u'skis'} {u'supercategory': u'sports', u'id': 36, u'name': u'snowboard'} {u'supercategory': u'sports', u'id': 37, u'name': u'sports 
ball'} {u'supercategory': u'sports', u'id': 38, u'name': u'kite'} {u'supercategory': u'sports', u'id': 39, u'name': u'baseball bat'} {u'supercategory': u'sports', u'id': 40, u'name': u'baseball glove'} {u'supercategory': u'sports', u'id': 41, u'name': u'skateboard'} {u'supercategory': u'sports', u'id': 42, u'name': u'surfboard'} {u'supercategory': u'sports', u'id': 43, u'name': u'tennis racket'} {u'supercategory': u'kitchen', u'id': 44, u'name': u'bottle'} {u'supercategory': u'kitchen', u'id': 46, u'name': u'wine glass'} {u'supercategory': u'kitchen', u'id': 47, u'name': u'cup'} {u'supercategory': u'kitchen', u'id': 48, u'name': u'fork'} {u'supercategory': u'kitchen', u'id': 49, u'name': u'knife'} {u'supercategory': u'kitchen', u'id': 50, u'name': u'spoon'} {u'supercategory': u'kitchen', u'id': 51, u'name': u'bowl'} {u'supercategory': u'food', u'id': 52, u'name': u'banana'} {u'supercategory': u'food', u'id': 53, u'name': u'apple'} {u'supercategory': u'food', u'id': 54, u'name': u'sandwich'} {u'supercategory': u'food', u'id': 55, u'name': u'orange'} {u'supercategory': u'food', u'id': 56, u'name': u'broccoli'} {u'supercategory': u'food', u'id': 57, u'name': u'carrot'} {u'supercategory': u'food', u'id': 58, u'name': u'hot dog'} {u'supercategory': u'food', u'id': 59, u'name': u'pizza'} {u'supercategory': u'food', u'id': 60, u'name': u'donut'} {u'supercategory': u'food', u'id': 61, u'name': u'cake'} {u'supercategory': u'furniture', u'id': 62, u'name': u'chair'} {u'supercategory': u'furniture', u'id': 63, u'name': u'couch'} {u'supercategory': u'furniture', u'id': 64, u'name': u'potted plant'} {u'supercategory': u'furniture', u'id': 65, u'name': u'bed'} {u'supercategory': u'furniture', u'id': 67, u'name': u'dining table'} {u'supercategory': u'furniture', u'id': 70, u'name': u'toilet'} {u'supercategory': u'electronic', u'id': 72, u'name': u'tv'} {u'supercategory': u'electronic', u'id': 73, u'name': u'laptop'} {u'supercategory': u'electronic', u'id': 74, u'name': u'mouse'} 
{u'supercategory': u'electronic', u'id': 75, u'name': u'remote'} {u'supercategory': u'electronic', u'id': 76, u'name': u'keyboard'} {u'supercategory': u'electronic', u'id': 77, u'name': u'cell phone'} {u'supercategory': u'appliance', u'id': 78, u'name': u'microwave'} {u'supercategory': u'appliance', u'id': 79, u'name': u'oven'} {u'supercategory': u'appliance', u'id': 80, u'name': u'toaster'} {u'supercategory': u'appliance', u'id': 81, u'name': u'sink'} {u'supercategory': u'appliance', u'id': 82, u'name': u'refrigerator'} {u'supercategory': u'indoor', u'id': 84, u'name': u'book'} {u'supercategory': u'indoor', u'id': 85, u'name': u'clock'} {u'supercategory': u'indoor', u'id': 86, u'name': u'vase'} {u'supercategory': u'indoor', u'id': 87, u'name': u'scissors'} {u'supercategory': u'indoor', u'id': 88, u'name': u'teddy bear'} {u'supercategory': u'indoor', u'id': 89, u'name': u'hair drier'} {u'supercategory': u'indoor', u'id': 90, u'name': u'toothbrush'}
In [ ]:
# Number the vocabulary words in the set's iteration order.
word_index = {word: index for index, word in enumerate(vocab)}
In [ ]:
#coco_categories = ['street sign', 'window', 'door', 'desk', 'hair','brush', 'plate', 'eyeglasses', 'hat', 'mirror', 'blender', 'shoe', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus','train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack','umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite','baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl','banana', 'apple', 'sandwich', 'orange', 'broccoli','carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table','toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven','toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier','toothbrush']
# NOTE(review): no active assignment to `coco_categories` is visible in this
# file (the list just above is commented out) -- confirm where it is defined
# before running this cell.
print(len(coco_categories))
In [ ]:
# For each category, gather the distinct lower-cased names of the first lemma
# of every WordNet synset found for it, with underscores turned into spaces.
# Insertion order follows `coco_categories`.
syn = OrderedDict()
for word in coco_categories:
    names = []
    for synset in wn.synsets(word.strip("\n")):
        lemma_name = str(synset.lemmas()[0].name()).lower().replace("_", " ")
        if lemma_name not in names:
            names.append(lemma_name)
    syn[word] = names
In [ ]:
# Position of each category key in `syn` -> category name.
my_id_category = dict(enumerate(syn))
In [ ]:
# Persist the index -> category mapping. The context manager guarantees the
# file is flushed and closed, which a bare open() passed to dump() does not.
with open("coco_vectors/my_id_category.p", "wb") as category_file:
    pickle.dump(my_id_category, category_file)
In [ ]:
#fine tuning for each category
#syn['person'] += ['he', 'they','them','she','man','woman','person','people','girl','boy']
#syn['motorcycle'] += ['bike','bikes','biking']
#syn['sports ball'] += ['sporting','sporty','sports', 'football', 'basketball', 'baseball', 'dodgeball', 'ball']
#syn['hair drier'] += ['hair dryer', 'blow dry', 'blow drying', 'blow dryer', 'dryer']
#syn['potted plant'] += ['plant', 'flower', 'green']
In [ ]:
# Build a lookup from numeric image id to image file name, using the first
# whitespace-separated field of every line in val_paths.txt.
pathtoimg = {}
# Text mode: the lines are handled with str operations below.
with open('val_paths.txt', 'r') as f:
    val_paths = f.readlines()
for line in val_paths:
    fields = line.split()
    if not fields:
        # A blank line has no path field; indexing it would raise IndexError.
        continue
    p = fields[0].split("/")[-1]
    # The id is the text after the last "_" up to the first ".".
    # str.strip(".jpg") would instead remove any of the characters
    # '.', 'j', 'p', 'g' from both ends, which is not a suffix removal.
    imgid = int(p.split("_")[-1].split(".")[0])
    pathtoimg[imgid] = p
In [ ]:
# Sanity check on the number of distinct image ids parsed from val_paths.txt.
# NOTE(review): 40504 is presumably the expected image count for this split
# -- confirm against the dataset in use.
assert len(pathtoimg)==40504,"ERROR:length mismatch"
In [ ]:
#read all questions
# Collect, in first-seen order, the index (as a string) in `coco_categories`
# of every category that is mentioned by any question token. A token mentions
# a category when the token itself or its singularized form equals the
# category name or appears among the category's synonyms.
qid_vector = []
for imgid, ques in questions.items():
    # Single words first, then two-word phrases, so multi-word category
    # names can match as well.
    for word in ques.split() + list(bigrams(ques)):
        # Singularize once per token; the result does not depend on the
        # category being tested.
        singularForm = isplural(word)[1]
        forms = (word, singularForm)
        for category, synonyms in syn.items():
            if any(form in synonyms or form == str(category) for form in forms):
                s = str(coco_categories.index(category))
                if s not in qid_vector:
                    qid_vector.append(s)
                    #print imgid, category
In [ ]:
# Write the collected entries of qid_vector, one per line.
with open('coco_vectors/remote.txt', 'wb') as write_file:
    joined = '\n'.join(qid_vector)
    write_file.write(joined)
In [ ]:
# Show each category key of `syn` next to its position in coco_categories.
for category in syn:
    print("%s %s" % (category, coco_categories.index(category)))
In [ ]:
# numpy is not imported anywhere else in the visible file, so bring it into
# scope here; without it `np.array` raises NameError.
import numpy as np

# Convert every per-image vector in qid_vector to a numpy array keyed by
# image id, checking that each vector has one entry per category.
# NOTE(review): this cell iterates qid_vector as a mapping of
# image id -> vector, but the visible cell that builds qid_vector produces a
# flat list of category-index strings, for which .items() fails. Confirm
# which structure is intended before running.
train_data = {}
for imgid, ques in qid_vector.items():
    assert len(ques) == len(coco_categories), str(imgid)
    train_data[imgid] = np.array(ques)
In [ ]:
print(len(train_data))
# Persist train_data. The context manager guarantees the file is flushed and
# closed, which a bare open() passed to dump() does not.
with open("val_id.p", "wb") as dump_file:
    pickle.dump(train_data, dump_file)
In [ ]:
# Serialize qid_vector as JSON indented by four spaces.
with open('coco_vectors/val.json', 'wb') as out_file:
    #write_file.write('\n'.join(train_data))
    json.dump(qid_vector, out_file, indent=4)