In [1]:
%matplotlib inline
from pycocotools.coco import COCO
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import scipy.io as sio
plt.rcParams['figure.figsize'] = (10.0, 8.0)
import os
import pickle
import nltk
from nltk.corpus import wordnet as wn  # used below by findID() and wn.morphy(); run nltk.download('wordnet') once

In [14]:
# training set
# dataType='train2014'
# usingSet='10000coco'

# validation set
dataType='val2014'
usingSet='5000coco'
#===================
dataDir='H:/SG_code/Dataset/COCO/tools'
usingSetDir = 'H:/SG_code/VIPcoco/%s'%usingSet
InsFile='%s/annotations/instances_%s.json'%(dataDir,dataType)
CapFile='%s/annotations/captions_%s.json'%(dataDir,dataType)

Ins_coco=COCO(InsFile)
Cap_coco=COCO(CapFile)


loading annotations into memory...
0:00:25.193000
creating index...
index created!
loading annotations into memory...
0:00:06.259000
creating index...
index created!
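
A quick sanity check that the annotations actually loaded (the count is split-dependent; val2014 should hold roughly 40k images):

In [ ]:
# number of images indexed by the instances annotation file
print(len(Ins_coco.getImgIds()))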

In [21]:
# test/val split: filenames are bare '<image_id>.jpg', so the id is everything before the extension
SALICON_filename = os.listdir(usingSetDir)
SALICON_filename.sort()
SALICON_id = [int(item[0:-4]) for item in SALICON_filename]

In [11]:
# train split: filenames follow 'COCO_train2014_<12-digit id>.jpg', so the id starts at character 15
SALICON_filename = os.listdir(usingSetDir)
SALICON_filename.sort()
SALICON_id = [int(item[15:-4]) for item in SALICON_filename]
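
For reference, a quick check of the two slicing schemes on sample filenames (both filenames here are hypothetical):

In [ ]:
# val/test files are bare ids; train files use the COCO 2014 naming scheme,
# where the 12-digit id starts at character 15
print(int('123456.jpg'[0:-4]))                        # -> 123456
print(int('COCO_train2014_000000123456.jpg'[15:-4]))  # -> 123456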

In [12]:
# bundle filenames and ids for the chosen split
SALICON = {}
SALICON['SALICON_filename'] = SALICON_filename
SALICON['SALICON_id'] = SALICON_id
# pickle.dump(SALICON,open('data/5000coco.p','wb'))   # val split

In [19]:
pickle.dump(SALICON,open('data/10000coco.p','wb'))

In [8]:
cat_list = Ins_coco.cats  # official COCO categories: id -> {'id', 'name', 'supercategory'}

In [3]:
# 'category.mat' stores the COCO category names, indexed by category id - 1
cat_data=sio.loadmat('data/category.mat')
cat_data = cat_data['category']
category = []
for i in range(0, cat_data.size):
    # each entry is a MATLAB cell wrapping a unicode string; array2string
    # renders it as "[u'name']", so strip the "[u'" prefix and "']" suffix
    cat_item = cat_data[i].item()
    cat_item = np.array2string(cat_item)
    cat_item = cat_item[3:-2]
    category.append(cat_item)
# collapse multi-word category names to the single head noun used in captions,
# e.g. 'traffic light' -> 'light', 'baseball bat' -> 'bat'
category[12] = 'sign'
category[27] = 'umbrella'
category[9] = 'light'
category[13] = 'meter'
category[38] = 'bat'
category[39] = 'glove'
category[42] = 'racket'
category[45] = 'glass'
category[63] = 'plant'
category[66] = 'table'
category[76] = 'phone'
category[87] = 'teddy'
category[88] = 'drier'
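
The raw names are also available straight from the loaded annotations via pycocotools, which avoids the .mat round-trip; a sketch (note the order follows COCO category ids, which may differ from the .mat ordering):

In [ ]:
# read the raw category names directly from the instances file
cats = Ins_coco.loadCats(Ins_coco.getCatIds())
print([c['name'] for c in cats][0:5])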

In [9]:
# name -> COCO category id, e.g. 'person' -> 1
cat_dict_idx = {}
for idx in cat_list.keys():
    cat_dict_idx[category[idx-1]] = idx
# name -> supercategory, e.g. 'dog' -> 'animal'
cat_dict_supercat = {}
for idx in cat_list.keys():
    cat_dict_supercat[category[idx-1]] = cat_list[idx]['supercategory']

# supercategory -> an arbitrary integer id (order depends on set())
supercate_id = {}
spcat = []
for idx in cat_list.keys():
    spcat.append(cat_list[idx]['supercategory'])
spcat = list(set(spcat))
for idx,item in enumerate(spcat):
    supercate_id[item] = idx

pickle.dump(category,open('data/category.p','wb'))
pickle.dump(cat_dict_idx,open('data/cat_dict_idx.p','wb'))
pickle.dump(cat_dict_supercat,open('data/cat_dict_supercat.p','wb'))
pickle.dump(supercate_id,open('data/supercate_id.p','wb'))
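
A quick look at what the three mappings hold (the values shown in the comments are what the COCO metadata should give; treat them as illustrative):

In [ ]:
print(cat_dict_idx['person'])      # name -> category id, e.g. 1
print(cat_dict_supercat['dog'])    # name -> supercategory, e.g. 'animal'
print(supercate_id)                # supercategory -> arbitrary integer id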

In [13]:
# per image: the set of annotated (ground-truth) category ids
AnID_list = []
for im_id in range(10000):   # 10000coco train split
    annIds = Ins_coco.getAnnIds(SALICON['SALICON_id'][im_id])
    anns = Ins_coco.loadAnns(annIds)
    AnID_list_item = []
    for item in anns:
        AnID_list_item.append(item['category_id'])
    AnID_list.append(list(set(AnID_list_item)))   # de-duplicate
pickle.dump(AnID_list,open('data/Ins_ID_10000coco.p','wb'))

In [3]:
SALICON = pickle.load(open('data/5000coco.p','rb'))
Ins_ID = pickle.load(open('data/Ins_ID_10k.p','rb'))

category = pickle.load(open('data/category.p','rb'))
cat_list = Ins_coco.cats  # official COCO category list
category_idx = pickle.load(open('data/cat_dict_idx.p','rb'))                     # e.g., 'person' -> 1
category_supercategory_idx = pickle.load(open('data/cat_dict_supercat.p','rb'))  # e.g., name -> supercategory
supercategory_idx = pickle.load(open('data/supercate_id.p','rb'))                # e.g., 'food' -> 1

In [5]:
def findID(word, im_idd):
    # Map a caption noun to a COCO category id for image im_idd.
    # Exact name matches use the lookup table; otherwise take the two
    # categories with the highest WordNet Wu-Palmer similarity and keep
    # one that is actually annotated in the image (return 0 if none).
    if word in category:
        return category_idx[word]
    else:
        temp_idlist = {}
        for item in category_idx.keys():
            for item1 in wn.synsets(item, wn.NOUN):
                for word1 in wn.synsets(word, wn.NOUN):
                    dist = item1.wup_similarity(word1)
                    if dist is None:   # no common ancestor, no score
                        continue
                    cid = category_idx[item]
                    # keep the best similarity seen for this category
                    if cid not in temp_idlist or dist > temp_idlist[cid]:
                        temp_idlist[cid] = dist
        # two most similar candidate categories
        temp_idlist = sorted(temp_idlist.iteritems(), key=lambda d: d[1], reverse=True)[0:2]
        for n in temp_idlist:
            if n[0] in Ins_ID[im_idd]:   # accept only annotated categories
                return n[0]
        return 0
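
A minimal usage sketch (image index 0 is arbitrary; the fallback result depends on which categories are annotated in that image):

In [ ]:
print(findID('dog', 0))    # exact category name: straight table lookup
print(findID('puppy', 0))  # not a category name: WordNet fallback, 0 if ungrounded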

In [8]:
print SALICON['SALICON_filename'][901]
print SALICON.keys()
print SALICON['SALICON_id'][0]
print SALICON['SALICON_filename'][0]


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-8-db1bd0141695> in <module>()
----> 1 print SALICON['SALICON_filename'][901]
      2 print SALICON.keys()
      3 print SALICON['SALICON_id'][0]
      4 print SALICON['SALICON_filename'][0]

IndexError: list index out of range

In [7]:
# caption nouns extracted offline into a MATLAB cell array (one row per image)
WORDMAT = sio.loadmat('data/word_mat_april10.mat')
WORDMAT = WORDMAT['word_mat']

In [8]:
WORDMAT_dup = WORDMAT[:,0]   # one entry per image: that image's captions as noun lists

In [26]:
# ground every caption noun of every image to a COCO category id
COCO10k_nounlist = []   # per image, per caption: the nouns that matched
COCO10k_nounID = []     # same nesting, holding the matched category ids
for im_id, captions_perim in enumerate(WORDMAT_dup):
    noun_im = []
    nounID_im = []
    for caption in captions_perim:
        noun_perst = []
        nounid_perst = []
        for noun in caption[0]:
            word = (noun.item())[0]
            word = wn.morphy(word, wn.NOUN)   # lemmatize, e.g. 'dogs' -> 'dog'
            if word is None:
                continue
            I_ID = findID(word, im_id)
            if I_ID == 0:   # noun could not be grounded in this image
                continue
            noun_perst.append(word)
            nounid_perst.append(I_ID)
        noun_im.append(noun_perst)
        nounID_im.append(nounid_perst)
    COCO10k_nounlist.append(noun_im)
    COCO10k_nounID.append(nounID_im)
    if im_id % 100 == 0:   # progress
        print im_id


0
100
200
...
9900
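
The result is nested per image -> per caption -> per noun; a peek at the first image (the contents shown in the comments are purely illustrative):

In [ ]:
print(COCO10k_nounlist[0])  # e.g. [['man', 'table'], ['man'], ...]
print(COCO10k_nounID[0])    # the matching category ids, e.g. [[1, 67], [1], ...]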

In [35]:
# note: Cardi_Noun and Seque_Noun are computed in the next cell (In [29]), which was executed first
pickle.dump(COCO10k_nounlist,open('data/10000coco_nounlist.p','wb'))
pickle.dump(COCO10k_nounID,open('data/10000coco_nounID.p','wb'))
pickle.dump(Cardi_Noun,open('data/10000coco_Cardi.p','wb'))
pickle.dump(Seque_Noun,open('data/10000coco_Seque.p','wb'))

In [29]:
# Cardi_Noun: per image, how often each grounded category id is mentioned
# Seque_Noun: per image, a position score (nouns mentioned earlier weigh more)
Cardi_Noun = []
Seque_Noun = []
for group in COCO10k_nounID:
    # cardinality: count mentions of each category id over all captions
    imdict = {}
    cardi = []
    for item in group:
        if not item:
            continue
        for idx in item:
            cardi.append(idx)
    u_set = list(set(cardi))
    n_obj = len(u_set)
    for uitem in u_set:
        imdict[uitem] = cardi.count(uitem)
    imdict = sorted(imdict.iteritems(), key=lambda d: d[1], reverse=True)
    Cardi_Noun.append(imdict)

    # sequence score: every occurrence at caption position wid adds
    # n_obj/(wid+1) (Python 2 integer division), so early mentions score higher
    seque = {}
    seq = [0]*n_obj
    for iid, item in enumerate(u_set):
        for imseq in group:
            if not imseq or item not in imseq:
                continue
            for wid, occ in enumerate(imseq):
                if occ == item:
                    seq[iid] += n_obj/(wid+1)
        seque[item] = seq[iid]
    seque = sorted(seque.iteritems(), key=lambda d: d[1], reverse=True)
    Seque_Noun.append(seque)
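
A toy walk-through of the two scores on a hypothetical image with two captions (category ids 18 and 1 are placeholders):

In [ ]:
# caption 1 mentions category 18 then 1; caption 2 mentions 18 only
group = [[18, 1], [18]]
# cardinality: 18 is mentioned twice, 1 once                    -> [(18, 2), (1, 1)]
# sequence (n_obj = 2): 18 scores 2/1 + 2/1 = 4, 1 scores 2/2 = 1 -> [(18, 4), (1, 1)]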