In [1]:
from pycocotools.coco import COCO
import numpy
import random
import csv
import json

In [2]:
# Parse job_886896.json as one JSON document per line. The context manager
# closes the file handle deterministically instead of leaving it open until
# the interpreter happens to collect it.
with open("job_886896.json") as job_file:
    job = [json.loads(line) for line in job_file]

In [3]:
# Root directory and split name used to build both annotation paths.
dataDir = '/home/gchrupala/repos/coco'
dataType = 'val2014'

# Both annotation files are addressed by the same (directory, split) pair.
location = (dataDir, dataType)
cap = COCO('%s/annotations/captions_%s.json' % location)    # caption annotations
coco = COCO('%s/annotations/instances_%s.json' % location)  # instance annotations


loading annotations into memory...
Done (t=1.02s)
creating index...
index created!
loading annotations into memory...
Done (t=8.15s)
creating index...
index created!

In [4]:
# Index caption annotations by their exact caption text. The same text may
# occur more than once, so each key maps to a list of
# {'id': ..., 'image_id': ...} records.
descriptions = {}
for img in coco.imgs.values():
    annotation_ids = cap.getAnnIds(img['id'])
    for desc in cap.loadAnns(annotation_ids):
        caption = desc['caption']
        entry = {'id': desc['id'], 'image_id': desc['image_id']}
        descriptions.setdefault(caption, []).append(entry)

In [5]:
def indexes(xs, x):
    """Return the positions in `xs` whose element equals `x`.

    Positions are 0-based and listed in ascending order; the result is an
    empty list when nothing in `xs` compares equal to `x`.
    """
    positions = []
    for position, element in enumerate(xs):
        if x == element:
            positions.append(position)
    return positions

def find_metadata(j, records):
    """Find the COCO metadata for one job row.

    Parameters
    ----------
    j : dict
        Job row. ``j['data']`` must provide ``'desc'`` (the caption text) and
        ``'url_1'`` .. ``'url_4'``; the last ``/``-separated segment of each
        URL is parsed as an integer image id.
    records : dict
        Maps caption text to a list of dicts carrying ``'id'`` and
        ``'image_id'``.

    Returns
    -------
    dict
        ``'image_id'``: the first image recorded for the caption that occurs
        exactly once among the four candidates;
        ``'id'``: the ``'id'`` of the first record for that caption and image;
        ``'response'``: 1-based position of that image among the candidates;
        ``'candidates'``: the four candidate image ids, in URL order.

    Raises
    ------
    KeyError
        If an expected key is missing, or the caption is not in ``records``.
    ValueError
        If the last segment of a URL is not an integer.
    IndexError
        If no image recorded for the caption occurs exactly once among the
        candidates.
    """
    desc = j['data']['desc']
    urls = [j['data'][key] for key in ('url_1', 'url_2', 'url_3', 'url_4')]

    # Candidate image ids, in the order the URLs appear in the row.
    ids = []
    for url in urls:
        ids.append(int(url.split("/")[-1]))

    matches = [r['image_id'] for r in records[desc]]

    # Keep only the caption's images that appear exactly once among the
    # candidates, paired with their 1-based candidate position.
    unique_hits = []
    for match in matches:
        positions = indexes(ids, match)
        if len(positions) == 1:
            unique_hits.append((match, positions[0] + 1))
    image_id, resp = unique_hits[0]

    caption_ids = [r['id'] for r in records[desc] if r['image_id'] == image_id]
    return {
        'image_id': image_id,
        'id': caption_ids[0],
        'response': resp,
        'candidates': ids,
    }

In [6]:
# Resolve every job row to its metadata; the result is index-aligned with `job`.
meta = [find_metadata(entry, descriptions) for entry in job]

In [7]:
def merge(a, b):
    """Return a new dict holding the entries of `a` overlaid with those of `b`.

    Neither input is modified; where a key occurs in both, the value from
    `b` wins. The copy is shallow.
    """
    combined = dict(a)
    combined.update(b)
    return combined

# Attach each row's metadata under the 'meta' key, producing new dicts rather
# than modifying the rows in `job`.
merged = [merge(row, {u'meta': info}) for row, info in zip(job, meta)]

In [8]:
# Write the merged rows back out, one JSON document per line.
with open("job_886896_meta.json", "w") as out:
    for record in merged:
        out.write(json.dumps(record) + "\n")

In [9]:
# Number of images indexed from the instances annotation file.
len(coco.imgs)


Out[9]:
40504

In [10]:
# Load the local dataset description. The context manager closes the file
# handle as soon as parsing finishes rather than leaving it open.
with open("/home/gchrupala/repos/reimaginet/data/coco/dataset.json") as dataset_file:
    local = json.load(dataset_file)

In [14]:
# Display the first entry of the 'images' list to inspect the record layout.
local['images'][0]


Out[14]:
{u'cocoid': 391895,
 u'filename': u'COCO_val2014_000000391895.jpg',
 u'filepath': u'val2014',
 u'imgid': 0,
 u'sentences': [{u'imgid': 0,
   u'raw': u'A man with a red helmet on a small moped on a dirt road. ',
   u'sentid': 770337,
   u'tokens': [u'a',
    u'man',
    u'with',
    u'a',
    u'red',
    u'helmet',
    u'on',
    u'a',
    u'small',
    u'moped',
    u'on',
    u'a',
    u'dirt',
    u'road']},
  {u'imgid': 0,
   u'raw': u'Man riding a motor bike on a dirt road on the countryside.',
   u'sentid': 771687,
   u'tokens': [u'man',
    u'riding',
    u'a',
    u'motor',
    u'bike',
    u'on',
    u'a',
    u'dirt',
    u'road',
    u'on',
    u'the',
    u'countryside']},
  {u'imgid': 0,
   u'raw': u'A man riding on the back of a motorcycle.',
   u'sentid': 772707,
   u'tokens': [u'a',
    u'man',
    u'riding',
    u'on',
    u'the',
    u'back',
    u'of',
    u'a',
    u'motorcycle']},
  {u'imgid': 0,
   u'raw': u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ',
   u'sentid': 776154,
   u'tokens': [u'a',
    u'dirt',
    u'path',
    u'with',
    u'a',
    u'young',
    u'person',
    u'on',
    u'a',
    u'motor',
    u'bike',
    u'rests',
    u'to',
    u'the',
    u'foreground',
    u'of',
    u'a',
    u'verdant',
    u'area',
    u'with',
    u'a',
    u'bridge',
    u'and',
    u'a',
    u'background',
    u'of',
    u'cloud',
    u'wreathed',
    u'mountains']},
  {u'imgid': 0,
   u'raw': u'A man in a red shirt and a red hat is on a motorcycle on a hill side.',
   u'sentid': 781998,
   u'tokens': [u'a',
    u'man',
    u'in',
    u'a',
    u'red',
    u'shirt',
    u'and',
    u'a',
    u'red',
    u'hat',
    u'is',
    u'on',
    u'a',
    u'motorcycle',
    u'on',
    u'a',
    u'hill',
    u'side']}],
 u'sentids': [770337, 771687, 772707, 776154, 781998],
 u'split': u'test'}

In [ ]: