In [1]:
import numpy
import json
import gzip
import imaginet.tts as tts


Using gpu device 0: GeForce GTX 980 Ti (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 5110)
/usr/local/lib/python2.7/dist-packages/Theano-0.9.0.dev2-py2.7.egg/theano/sandbox/cuda/__init__.py:601: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.
  warnings.warn(warn)

In [7]:
import base64
import os

# Unpack the base64-encoded audio stored under 'speech' in each JSON-lines
# record and write it to mp3/<sentid>.mp3, one file per record.
if not os.path.isdir("mp3"):
    # open() below does not create missing directories, so make sure the
    # output folder exists before the loop starts.
    os.makedirs("mp3")
with gzip.open("dataset.mp3.jsonl.gz") as f:
    for line in f:
        obj = json.loads(line)
        bi = base64.b64decode(obj['speech'])
        # b64decode returns raw bytes: write in binary mode so no newline
        # translation or text encoding is applied to the audio data.
        with open("mp3/{}.mp3".format(obj['sentid']), 'wb') as o:
            o.write(bi)

In [8]:
# Parse dataset.json into `data`; the with-block guarantees the file handle
# is closed as soon as parsing finishes instead of waiting for garbage collection.
with open("dataset.json") as dataset_file:
    data = json.load(dataset_file)

In [9]:
# Display the top-level keys of the loaded dataset dict.
data.keys()


Out[9]:
[u'images', u'dataset']

In [11]:
# Display the first entry under 'images' to inspect the per-image record layout.
data['images'][0]


Out[11]:
{u'cocoid': 391895,
 u'filename': u'COCO_val2014_000000391895.jpg',
 u'filepath': u'val2014',
 u'imgid': 0,
 u'sentences': [{u'imgid': 0,
   u'raw': u'A man with a red helmet on a small moped on a dirt road. ',
   u'sentid': 770337,
   u'tokens': [u'a',
    u'man',
    u'with',
    u'a',
    u'red',
    u'helmet',
    u'on',
    u'a',
    u'small',
    u'moped',
    u'on',
    u'a',
    u'dirt',
    u'road']},
  {u'imgid': 0,
   u'raw': u'Man riding a motor bike on a dirt road on the countryside.',
   u'sentid': 771687,
   u'tokens': [u'man',
    u'riding',
    u'a',
    u'motor',
    u'bike',
    u'on',
    u'a',
    u'dirt',
    u'road',
    u'on',
    u'the',
    u'countryside']},
  {u'imgid': 0,
   u'raw': u'A man riding on the back of a motorcycle.',
   u'sentid': 772707,
   u'tokens': [u'a',
    u'man',
    u'riding',
    u'on',
    u'the',
    u'back',
    u'of',
    u'a',
    u'motorcycle']},
  {u'imgid': 0,
   u'raw': u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ',
   u'sentid': 776154,
   u'tokens': [u'a',
    u'dirt',
    u'path',
    u'with',
    u'a',
    u'young',
    u'person',
    u'on',
    u'a',
    u'motor',
    u'bike',
    u'rests',
    u'to',
    u'the',
    u'foreground',
    u'of',
    u'a',
    u'verdant',
    u'area',
    u'with',
    u'a',
    u'bridge',
    u'and',
    u'a',
    u'background',
    u'of',
    u'cloud',
    u'wreathed',
    u'mountains']},
  {u'imgid': 0,
   u'raw': u'A man in a red shirt and a red hat is on a motorcycle on a hill side.',
   u'sentid': 781998,
   u'tokens': [u'a',
    u'man',
    u'in',
    u'a',
    u'red',
    u'shirt',
    u'and',
    u'a',
    u'red',
    u'hat',
    u'is',
    u'on',
    u'a',
    u'motorcycle',
    u'on',
    u'a',
    u'hill',
    u'side']}],
 u'sentids': [770337, 771687, 772707, 776154, 781998],
 u'split': u'test'}

In [12]:
# Build a lookup from sentence id to the raw caption string, flattening the
# per-image 'sentences' lists; a repeated sentid keeps the last caption seen.
captions = {
    sent['sentid']: sent['raw']
    for img in data['images']
    for sent in img['sentences']
}

In [13]:
# Spot-check: show the raw caption stored under sentence id 48.
captions[48]


Out[13]:
u'A very clean and well decorated empty bathroom'

In [15]:
# Spot-check: show the raw caption stored under sentence id 3871.
captions[3871]


Out[15]:
u'A group of people playing a game of croquet.'

In [ ]: