In [1]:
%matplotlib inline
from pycocotools.coco import COCO
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import scipy.io as sio
plt.rcParams['figure.figsize'] = (10.0, 8.0)
import os
import pickle
import nltk
from nltk.corpus import wordnet as wn

In [14]:
# training set
# dataType='train2014'
# usingSet='10000coco'

# validation set
dataType='val2014'
usingSet='5000coco'
#===================
dataDir='/media/haoran/DATA/Dataset/COCO/tools'
usingSetDir = '/media/haoran/Data/Dataset/VIPcoco/%s'%usingSet
InsFile='%s/annotations/instances_%s.json'%(dataDir,dataType)
CapFile='%s/annotations/captions_%s.json'%(dataDir,dataType)

Ins_coco=COCO(InsFile)
Cap_coco=COCO(CapFile)


loading annotations into memory...
0:00:05.683164
creating index...
index created!
loading annotations into memory...
0:00:00.588566
creating index...
index created!
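
A quick sanity check (not part of the original run) can confirm what was loaded, using the standard pycocotools accessors; a minimal sketch:
In [ ]:
# Sketch: list the loaded categories and count images / instance
# annotations for this split (getCatIds/loadCats/getImgIds/getAnnIds
# are the standard pycocotools accessors).
cats = Ins_coco.loadCats(Ins_coco.getCatIds())
print len(cats), 'categories, e.g.', [c['name'] for c in cats[:5]]
print len(Ins_coco.getImgIds()), 'images,', len(Ins_coco.getAnnIds()), 'instance annotations'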

In [4]:
# SALICON_filename = os.listdir(usingSetDir)
# SALICON_filename.sort()
# SALICON_id = [int(item[0:-4]) for item in SALICON_filename]

In [24]:
# SALICON = {}
# SALICON['SALICON_filename'] = SALICON_filename
# SALICON['SALICON_id'] = SALICON_id
# pickle.dump(SALICON,open('data/5000coco.p','wb'))

In [3]:
SALICON = pickle.load(open('data/5000coco.p','rb'))
Ins_ID = pickle.load(open('data/Ins_ID_10k.p','rb'))

category = pickle.load(open('data/category.p','rb'))
cat_list = Ins_coco.cats  # official category list: id -> category info dict
category_idx = pickle.load(open('data/cat_dict_idx.p','rb'))  # e.g., person -> 1
category_supercategory_idx = pickle.load(open('data/cat_dict_supercat.p','rb'))  # e.g., person -> human
supercategory_idx = pickle.load(open('data/supercate_id.p','rb'))  # e.g., food -> 1
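
A minimal sketch of the assumed layout of these lookups (plain dicts keyed by category name, as the comments above indicate); the specific keys are only illustrative:
In [ ]:
# Sketch (assumption): the pickled lookups are plain dicts,
# name -> id and name -> supercategory, as commented above.
print category_idx['person'], category_supercategory_idx['person']
print supercategory_idx.keys()[:5]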

In [5]:
def findID(word, im_idd):
    # Map a noun to a COCO category id for image im_idd: exact category
    # names resolve directly; otherwise fall back to WordNet Wu-Palmer
    # similarity and accept a candidate only if its id is actually
    # annotated in the image (Ins_ID).
    if word in category:
        return category_idx[word]
    temp_idlist = {}
    for item in category_idx.keys():
        cat_id = category_idx[item]
        for item1 in wn.synsets(item, wn.NOUN):
            for word1 in wn.synsets(word, wn.NOUN):
                dist = item1.wup_similarity(word1)
                if dist is None:
                    continue
                # keep the best similarity seen for this category
                if cat_id not in temp_idlist or dist > temp_idlist[cat_id]:
                    temp_idlist[cat_id] = dist
    # try the two most similar categories, most similar first
    temp_idlist = sorted(temp_idlist.iteritems(), key=lambda d: d[1], reverse=True)
    for n in temp_idlist[0:2]:
        if n[0] in Ins_ID[im_idd]:
            return n[0]
    return 0
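
findID falls back to WordNet's Wu-Palmer similarity when a caption noun is not literally a COCO category name; a small illustration of that similarity call (the word pair is just an example):
In [ ]:
# Illustration: maximum Wu-Palmer similarity between two nouns,
# the same call findID uses ('puppy' is an example word, not data).
best = None
for s1 in wn.synsets('puppy', wn.NOUN):
    for s2 in wn.synsets('dog', wn.NOUN):
        d = s1.wup_similarity(s2)
        if d is not None and (best is None or d > best):
            best = d
print 'puppy vs dog:', best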

In [8]:
print SALICON['SALICON_filename'][901]
print SALICON.keys()
print SALICON['SALICON_id'][0]
print SALICON['SALICON_filename'][0]


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-8-db1bd0141695> in <module>()
----> 1 print SALICON['SALICON_filename'][901]
      2 print SALICON.keys()
      3 print SALICON['SALICON_id'][0]
      4 print SALICON['SALICON_filename'][0]

IndexError: list index out of range

In [7]:
WORDMAT = sio.loadmat('data/word_mat_april10.mat')
WORDMAT = WORDMAT['word_mat']

In [8]:
WORDMAT_dup = WORDMAT[:,0]
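
scipy returns MATLAB cell arrays as nested numpy object arrays, so a shape check (not in the original run) clarifies what the loop below iterates over:
In [ ]:
# Sketch: inspect the object-array nesting before iterating.
print WORDMAT.shape, WORDMAT.dtype
print WORDMAT_dup.shape, type(WORDMAT_dup[0])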

In [26]:
# For each image, reduce every extracted caption noun to its base form
# (WordNet morphy) and map it to a COCO category id (findID); keep only
# nouns that resolve to a category present in that image.
COCO10k_nounlist = []
COCO10k_nounID = []
for im_id, captions_perim in enumerate(WORDMAT_dup):
    noun_im = []
    nounID_im = []
    for caption in captions_perim:
        noun_perst = []
        nounid_perst = []
        for noun in caption[0]:
            word = (noun.item())[0]
            word = wn.morphy(word, wn.NOUN)  # reduce to base (lemma) form
            if word is None:
                continue
            I_ID = findID(word, im_id)
            if I_ID == 0:  # no matching category annotated in this image
                continue
            noun_perst.append(word)
            nounid_perst.append(I_ID)
        noun_im.append(noun_perst)
        nounID_im.append(nounid_perst)
    COCO10k_nounlist.append(noun_im)
    COCO10k_nounID.append(nounID_im)
    if im_id % 100 == 0:
        print im_id


0
100
200
...
9900
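
The result keeps one list per image and one sub-list per caption, with nouns and their category ids aligned; a quick look (not part of the original run):
In [ ]:
# Sketch: per-image, per-caption resolved nouns and their COCO category ids.
print COCO10k_nounlist[0]
print COCO10k_nounID[0]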

In [35]:
# Note: Cardi_Noun and Seque_Noun are built in the next cell (In [29]),
# which was executed before this dump.
pickle.dump(COCO10k_nounlist, open('data/10000coco_nounlist.p','wb'))
pickle.dump(COCO10k_nounID, open('data/10000coco_nounID.p','wb'))
pickle.dump(Cardi_Noun, open('data/10000coco_Cardi.p','wb'))
pickle.dump(Seque_Noun, open('data/10000coco_Seque.p','wb'))

In [29]:
# Cardinality: for each image, count how many captions mention each
# category id, sorted by frequency.
# Sequence: score each category by how early it tends to appear in the
# captions (earlier mentions get a larger weight).
Cardi_Noun = []
Seque_Noun = []
for group in COCO10k_nounID:
    imdict = {}
    cardi = []
    for item in group:
        if not item:
            continue
        for idx in item:
            cardi.append(idx)
    u_set = list(set(cardi))
    n_obj = len(u_set)
    for uitem in u_set:
        imdict[uitem] = cardi.count(uitem)
    imdict = sorted(imdict.iteritems(), key=lambda d: d[1], reverse=True)
    Cardi_Noun.append(imdict)

    seque = {}
    seq = [0]*n_obj
    for iid, item in enumerate(u_set):
        for imseq in group:
            if not imseq or item not in imseq:
                continue
            wid = imseq.index(item)      # position of first mention in this caption
            seq[iid] += n_obj/(wid + 1)  # earlier position -> larger (integer) weight
        seque[item] = seq[iid]
    seque = sorted(seque.iteritems(), key=lambda d: d[1], reverse=True)
    Seque_Noun.append(seque)
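
A minimal sketch of reading the rankings back, mapping ids to names via the official category list (assumes the first image has at least one resolved noun):
In [ ]:
# Sketch: top-ranked category per image by mention count and by
# earliness score; Ins_coco.cats maps id -> category info dict.
top_cardi = Cardi_Noun[0][0]   # (category_id, mention_count)
top_seque = Seque_Noun[0][0]   # (category_id, earliness_score)
print cat_list[top_cardi[0]]['name'], top_cardi[1]
print cat_list[top_seque[0]]['name'], top_seque[1]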