In [1]:
import numpy as np
# from data0 import ChunkLoader
import settings
import video
import argparse, os
# parser = argparse.ArgumentParser(description="data processing")
# # Parse the command line arguments
# parser.add_argument('-tm', '--test_mode', action='store_true',
# help='make predictions on test data')
# args = parser.parse_args()
In [195]:
# NOTE(review): hardcoded absolute user path — consider a configurable DATA_DIR
# so the notebook is portable across machines.
repo_dir = '/Users/chen.liu/nfs03/share_data/Intelligence/Scoupon/items/dsb_vids/'
In [243]:
import logging
import numpy as np
import pandas as pd
import os
import video
import settings
import time
class ChunkLoader():
    """Extracts fixed-size cubic chunks from 3D volumes listed in a metadata CSV.

    Two modes:
      * training (test_mode=False): chunks are sampled around labelled points
        from labels.csv (positives) and candidates.csv (extra negatives);
      * test/grid (test_mode=True): a regular grid of
        ``settings.chunks_per_dim ** 3`` chunks is extracted per video.

    After ``next_video(i)`` the parallel lists ``chunks``/``starts``/``targets``
    hold the extracted cubes, their (z, y, x) start coordinates, and the 0/1
    labels for video ``i``.
    """

    def __init__(self, set_name, repo_dir, datum_dtype=np.uint8,
                 test_mode=False):
        # NOTE(review): `datum_dtype` is accepted but currently unused.
        np.random.seed(0)  # fixed seed so random chunk sampling is reproducible
        self.set_name = set_name
        self.repo_dir = repo_dir
        self.is_training = (set_name == 'train')
        self.chunk_size = settings.chunk_size
        self.chunk_shape = (self.chunk_size, self.chunk_size, self.chunk_size)
        self.chunk_volume = np.prod(self.chunk_shape)
        self.metadata = pd.read_csv(
            os.path.join(self.repo_dir, set_name + '-metadata.csv'))
        self.data_size = self.metadata.shape[0]
        # uids of videos flagged positive / negative in the metadata
        self.pos_users = self.metadata[self.metadata['flag'] == 1]['uid']
        self.neg_users = self.metadata[self.metadata['flag'] == 0]['uid']
        self.nvids = self.metadata.shape[0]
        self.chunks_filled = 0
        self.video_idx = 0
        if not test_mode:
            self.labels = pd.read_csv(os.path.join(self.repo_dir, 'labels.csv'))
            self.nega_labels = pd.read_csv(
                os.path.join(self.repo_dir, 'candidates.csv'))
            # number of labelled points belonging to positive users
            self.pos_labels = self.labels[
                self.labels['uid'].isin(self.pos_users)].shape[0]
            self.pos_neg_ratio = 6.0
            # negatives drawn per negative user, sized so the overall
            # neg:pos chunk ratio approaches pos_neg_ratio
            self.chunk_from_neg_users = int(
                self.pos_labels * self.pos_neg_ratio / len(self.neg_users))
        else:
            self.labels = None
        self.chunks_per_vid = settings.chunks_per_dim ** 3
        self.test_mode = test_mode
        self.chunks, self.starts, self.targets = [], [], []
        # state describing the video currently being processed
        self.current_uid = self.current_flag = self.current_meta = None

    def reset(self):
        """Clear accumulated chunks/starts/targets before loading a new video."""
        self.chunks, self.starts, self.targets = [], [], []

    def next_video(self, video_idx):
        """Load the video at metadata row ``video_idx`` and extract its chunks.

        Returns the (chunks, starts, targets) lists for this video.
        """
        self.reset()
        self.current_meta = self.metadata.iloc[video_idx]
        uid = self.current_meta['uid']
        self.current_uid = uid
        self.current_flag = int(self.current_meta['flag'])
        data_filename = os.path.join(self.repo_dir,
                                     uid + '.' + settings.file_ext)
        # volume shape is stored per-video in the metadata as (z, y, x)
        vid_shape = (int(self.current_meta['z_len']),
                     int(self.current_meta['y_len']),
                     int(self.current_meta['x_len']))
        vid_data = video.read_blp(data_filename, vid_shape)
        self.video_idx += 1
        self.extract_chunks(vid_data)
        return self.chunks, self.starts, self.targets

    def slice_chunk(self, start, data):
        """Cut a chunk_size^3 cube out of ``data`` starting at ``start`` (z, y, x)."""
        return data[start[0]:start[0] + self.chunk_size,
                    start[1]:start[1] + self.chunk_size,
                    start[2]:start[2] + self.chunk_size]

    def extract_one(self, data, data_shape, uid_data, idx):
        """Extract one chunk: around labelled point ``idx`` (training) or at
        grid position ``idx`` (test mode). Returns (chunk, start)."""
        if not self.test_mode:
            center = np.array((uid_data['z'].iloc[idx],
                               uid_data['y'].iloc[idx],
                               uid_data['x'].iloc[idx]), dtype=np.int32)
            rad = 0.5 * uid_data['diam'].iloc[idx]
            if rad == 0:
                # Candidate nodules carry no diameter; assign an arbitrary
                # radius (presumably 20mm scaled by resolution — TODO confirm).
                rad = 20 / settings.resolution
            # Start-coordinate range keeping the nodule sphere inside the
            # chunk. `low` may initially be negative; clamped below.
            low = np.int32(center + rad - self.chunk_size)
            high = np.int32(center - rad)
            for j in range(3):
                low[j] = max(0, low[j])
                high[j] = max(low[j] + 1, high[j])
                high[j] = min(data_shape[j] - self.chunk_size, high[j])
                low[j] = min(low[j], high[j] - 1)
            start = [np.random.randint(low=low[i], high=high[i])
                     for i in range(3)]
        else:
            start = self.generate_chunk_start(idx, data_shape)
        chunk = self.slice_chunk(start, data)
        return chunk, start

    def generate_chunk_start(self, chunk_idx, data_shape):
        """Map a flat chunk index onto a regular 3D grid of start coordinates."""
        chunk_spacing = np.int32(
            (np.array(data_shape) - self.chunk_size) / settings.chunks_per_dim)
        per_dim = settings.chunks_per_dim
        # BUGFIX: use floor division `//`; plain `/` only floored under
        # Python 2 and yields float indices under Python 3.
        z_chunk_idx = chunk_idx // per_dim ** 2
        y_chunk_idx = (chunk_idx - z_chunk_idx * per_dim ** 2) // per_dim
        x_chunk_idx = (chunk_idx - z_chunk_idx * per_dim ** 2
                       - y_chunk_idx * per_dim)
        start = [z_chunk_idx * chunk_spacing[0],
                 y_chunk_idx * chunk_spacing[1],
                 x_chunk_idx * chunk_spacing[2]]
        return start

    def extract_chunks(self, data):
        """Fill chunks/starts/targets for the current video according to mode."""
        data_shape = np.array(data.shape, dtype=np.int32)
        if not self.test_mode:
            if self.current_flag:
                # Positive video: one chunk per labelled point, target 1.
                uid_data = self.labels[self.labels['uid'] == self.current_uid]
                for idx in range(uid_data.shape[0]):
                    chunk, start = self.extract_one(data, data_shape,
                                                    uid_data, idx)
                    if chunk is None:
                        continue
                    self.chunks.append(chunk)
                    self.starts.append(start)
                    self.targets.append(1)
            else:
                # Negative video: up to chunk_from_neg_users chunks, target 0.
                uid_data = self.labels[self.labels['uid'] == self.current_uid]
                for idx in range(min(self.chunk_from_neg_users,
                                     uid_data.shape[0])):
                    chunk, start = self.extract_one(data, data_shape,
                                                    uid_data, idx)
                    if chunk is None:
                        continue
                    self.chunks.append(chunk)
                    self.starts.append(start)
                    self.targets.append(0)
                # Not enough negatives in labels.csv: top up from candidates.csv.
                if uid_data.shape[0] < self.chunk_from_neg_users:
                    left_chunk = self.chunk_from_neg_users - uid_data.shape[0]
                    uid_data = self.nega_labels[
                        self.nega_labels['uid'] == self.current_uid]
                    for _ in range(min(left_chunk, uid_data.shape[0])):
                        # random row — may sample the same candidate twice
                        idx = np.random.randint(uid_data.shape[0])
                        chunk, start = self.extract_one(data, data_shape,
                                                        uid_data, idx)
                        if chunk is None:
                            continue
                        self.chunks.append(chunk)
                        self.starts.append(start)
                        self.targets.append(0)
        else:
            # Grid mode. BUGFIX: the original `while` loop `continue`d without
            # incrementing chunk_idx, which would spin forever on a skipped
            # chunk; a `for` loop advances regardless.
            for chunk_idx in range(self.chunks_per_vid):
                chunk, start = self.extract_one(data, data_shape,
                                                None, chunk_idx)
                if chunk is None:
                    continue
                self.chunks.append(chunk)
                self.starts.append(start)
                self.targets.append(0)
In [244]:
# Shared ChunkLoader keyword arguments (grid/test mode extraction).
common = {'datum_dtype': np.uint8, 'repo_dir': repo_dir, 'test_mode': True}
In [245]:
# Build the loader on the TRAIN metadata but with test_mode=True (from
# `common`), so chunks come from the regular grid rather than labels.
test = ChunkLoader(set_name='train', **common)
In [246]:
# Extract all grid chunks for video #4; returns three parallel lists.
chunks,starts,targets = test.next_video(4)
In [224]:
# The three lists should have equal length (one entry per chunk).
len(chunks),len(starts),len(targets)
Out[224]:
In [ ]:
In [221]:
# NOTE(review): hardcoded absolute output path — parameterize for portability.
save_folder = '/Users/chen.liu/nfs03/share_data/Intelligence/Scoupon/items/features/dsb_train_features/'
# Dead helper kept for reference (superseded by pickle.dump below).
# def write_file(path,array):
# file_ = open(path,'w')
# np.save(file_,array)
# file_.close()
In [185]:
# Number of videos (rows) in the loaded metadata table.
test.data_size
Out[185]:
In [251]:
import time
labels_file = open(save_folder+"labels.txt",'w')
for i in range(4,test.data_size):
t1=time.time()
c,s,t = test.next_video(i)
if i%100==0:
print "procedding ",i
print len(c)
t2=time.time()
# print np.array(c).shape,np.array(s).shape,np.array(t).shape
# np.save(save_folder+"locaition_"+test.current_uid,np.array(s))
# np.save(save_folder+"label_"+test.current_uid,np.array(t))
# np.save(save_folder+"chunk_"+test.current_uid,np.array(c))
c_map = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
pickle.dump(np.array(s),open(save_folder+"location_"+test.current_uid, 'w'), protocol=2)
pickle.dump(np.array(c),open(save_folder+"chunk_"+test.current_uid, 'w'), protocol=2)
t3=time.time()
print t3-t2,t2-t1
for i,l in enumerate(t):
print >>labels_file,test.current_uid,i,l
break
labels_file.close()
In [249]:
import cPickle as pickle
In [137]:
Out[137]:
In [133]:
for i in range(275):
if chunk[i].shape!=(64,64,64):
print i
# print chunk[i].shape
In [28]:
# header=0: the first CSV row is the header ('uid', 'flag', ... are accessed
# by name below). BUGFIX: `header=False` only worked because False == 0 in
# old pandas; modern pandas rejects bool values for `header`.
train_metadata = pd.read_csv(repo_dir+"train-metadata.csv", header=0)
In [56]:
# Peek at the metadata schema (expects uid, flag, z_len/y_len/x_len columns).
train_metadata.head()
Out[56]:
In [26]:
# BUGFIX: `header=False` only worked because False == 0 in old pandas;
# modern pandas rejects bool `header`. The file has a header row
# ('uid', 'flag', 'diam' are accessed by name below).
labels = pd.read_csv(repo_dir+"labels.csv", header=0)
In [35]:
# BUGFIX: `header=False` only worked because False == 0 in old pandas;
# modern pandas rejects bool `header`. Explicit header=0 keeps the
# named-column access used elsewhere in this notebook working.
candidates = pd.read_csv(repo_dir+"candidates.csv", header=0)
In [ ]:
In [62]:
# uids of videos flagged positive in the train metadata.
pos_users = train_metadata.loc[train_metadata['flag'] == 1, 'uid']
In [28]:
# Diameter distribution of positive label rows.
labels[labels['flag']==1.0]['diam'].describe()
Out[28]:
In [38]:
# Inspect the large positives (diameter > 20).
labels[(labels['flag']==1.0) & (labels['diam']>20)].head(20)
Out[38]:
In [51]:
# How many labelled uids are missing from the train metadata?
len(set(labels['uid'])-set(train_metadata['uid']))
Out[51]:
In [32]:
candidates.head()
Out[32]:
In [37]:
# NOTE(review): exact duplicate of the previous cell — delete one.
candidates.head()
Out[37]:
In [63]:
# Count of label rows belonging to positive users
# (mirrors ChunkLoader.pos_labels in __init__).
labels[labels['uid'].isin(pos_users)].shape[0]
Out[63]:
In [65]:
# Class balance of the label rows.
labels.flag.value_counts()
Out[65]:
In [ ]: