In [1]:
import numpy as np
# from data0 import ChunkLoader
import settings
import video
import argparse, os
# parser = argparse.ArgumentParser(description="data processing")
# # Parse the command line arguments
# parser.add_argument('-tm', '--test_mode', action='store_true',
#                     help='make predictions on test data')
# args = parser.parse_args()

In [195]:
# NOTE(review): hardcoded absolute local path -- prefer a configurable DATA_DIR.
repo_dir = '/Users/chen.liu/nfs03/share_data/Intelligence/Scoupon/items/dsb_vids/'

In [243]:
import logging
import numpy as np
import pandas as pd
import os
import video
import settings
import time

class ChunkLoader():
    """Extracts fixed-size cubic chunks from 3-D volumes listed in a metadata CSV.

    Training mode: positive chunks are sampled around labelled nodules
    (``labels.csv``) and negative chunks from labels/candidates so that the
    overall negative:positive ratio is roughly ``pos_neg_ratio``.
    Test mode: each volume is tiled on a regular grid of
    ``settings.chunks_per_dim ** 3`` chunks.
    """

    def __init__(self, set_name, repo_dir, datum_dtype=np.uint8,
                 test_mode=False):
        """Read metadata (and, in training mode, label CSVs) for ``set_name``.

        ``datum_dtype`` is currently unused; kept for caller compatibility.
        """
        np.random.seed(0)  # deterministic chunk sampling across runs
        self.set_name = set_name
        self.repo_dir = repo_dir
        self.is_training = (set_name == 'train')
        self.chunk_size = settings.chunk_size
        self.chunk_shape = (self.chunk_size, self.chunk_size, self.chunk_size)
        self.chunk_volume = np.prod(self.chunk_shape)
        self.metadata = pd.read_csv(os.path.join(self.repo_dir, set_name + '-metadata.csv'))
        self.data_size = self.metadata.shape[0]
        self.pos_users = self.metadata[self.metadata['flag'] == 1]['uid']
        self.neg_users = self.metadata[self.metadata['flag'] == 0]['uid']
        self.nvids = self.metadata.shape[0]
        self.chunks_filled = 0
        self.video_idx = 0
        if not test_mode:
            self.labels = pd.read_csv(os.path.join(self.repo_dir, 'labels.csv'))
            self.nega_labels = pd.read_csv(os.path.join(self.repo_dir, 'candidates.csv'))
            # Count of label rows belonging to positive users.
            self.pos_labels = self.labels[self.labels['uid'].isin(self.pos_users)].shape[0]
            self.pos_neg_ratio = 6.0
            # Negatives to draw per negative user so neg:pos ~= pos_neg_ratio.
            self.chunk_from_neg_users = int(self.pos_labels * self.pos_neg_ratio / len(self.neg_users))
        else:
            self.labels = None
            self.chunks_per_vid = settings.chunks_per_dim ** 3

        self.test_mode = test_mode
        # Per-video accumulators, reset by next_video()/reset().
        self.chunks, self.starts, self.targets = [], [], []
        self.current_uid = self.current_flag = self.current_meta = None

    def reset(self):
        """Drop the chunks/starts/targets accumulated for the previous video."""
        self.chunks, self.starts, self.targets = [], [], []

    def next_video(self, video_idx):
        """Load the volume at metadata row ``video_idx`` and extract its chunks.

        Returns the (chunks, starts, targets) lists accumulated for this video.
        """
        self.reset()
        self.current_meta = self.metadata.iloc[video_idx]
        uid = self.current_meta['uid']
        self.current_uid = uid
        self.current_flag = int(self.current_meta['flag'])
        data_filename = os.path.join(self.repo_dir, uid + '.' + settings.file_ext)
        vid_shape = (int(self.current_meta['z_len']),
                     int(self.current_meta['y_len']),
                     int(self.current_meta['x_len']))
        vid_data = video.read_blp(data_filename, vid_shape)
        self.video_idx += 1
        self.extract_chunks(vid_data)
        return self.chunks, self.starts, self.targets

    def slice_chunk(self, start, data):
        """Return the cubic sub-volume of edge ``chunk_size`` starting at ``start``."""
        return data[start[0]:start[0] + self.chunk_size,
                    start[1]:start[1] + self.chunk_size,
                    start[2]:start[2] + self.chunk_size]

    def extract_one(self, data, data_shape, uid_data, idx):
        """Extract a single chunk from ``data``.

        Training mode: pick a random start such that the chunk contains the
        labelled sphere at row ``idx`` of ``uid_data``.
        Test mode: ``uid_data`` is ignored and ``idx`` is a flat grid index.
        Returns (chunk, start).
        """
        if not self.test_mode:
            center = np.array((uid_data['z'].iloc[idx],
                               uid_data['y'].iloc[idx],
                               uid_data['x'].iloc[idx]), dtype=np.int32)
            rad = 0.5 * uid_data['diam'].iloc[idx]
            if rad == 0:
                # Candidate rows carry no diameter; assign an arbitrary radius.
                rad = 20 / settings.resolution
            # A start in [center + rad - chunk_size, center - rad] keeps the
            # whole sphere inside the chunk; low may go negative before clamping.
            low = np.int32(center + rad - self.chunk_size)
            high = np.int32(center - rad)
            for j in range(3):
                # Clamp into the volume while preserving low < high.
                low[j] = max(0, low[j])
                high[j] = max(low[j] + 1, high[j])
                high[j] = min(data_shape[j] - self.chunk_size, high[j])
                low[j] = min(low[j], high[j] - 1)
            start = [np.random.randint(low=low[i], high=high[i]) for i in range(3)]
        else:
            start = self.generate_chunk_start(idx, data_shape)

        chunk = self.slice_chunk(start, data)
        return chunk, start

    def generate_chunk_start(self, chunk_idx, data_shape):
        """Map a flat grid index to a (z, y, x) chunk start position."""
        per_dim = settings.chunks_per_dim
        # Floor division (was Python 2 `/` on ints, which already floored);
        # `//` keeps the indices integral under Python 3 as well.
        chunk_spacing = np.int32((np.array(data_shape) - self.chunk_size) // per_dim)
        z_chunk_idx = chunk_idx // per_dim ** 2
        y_chunk_idx = (chunk_idx - z_chunk_idx * per_dim ** 2) // per_dim
        x_chunk_idx = chunk_idx - z_chunk_idx * per_dim ** 2 \
                      - y_chunk_idx * per_dim

        start = [z_chunk_idx * chunk_spacing[0],
                 y_chunk_idx * chunk_spacing[1],
                 x_chunk_idx * chunk_spacing[2]]
        return start

    def extract_chunks(self, data):
        """Fill self.chunks/starts/targets from one video volume."""
        data_shape = np.array(data.shape, dtype=np.int32)
        if not self.test_mode:
            uid_data = self.labels[self.labels['uid'] == self.current_uid]
            if self.current_flag:
                # Positive video: one chunk per labelled nodule.
                for idx in range(uid_data.shape[0]):
                    chunk, start = self.extract_one(data, data_shape, uid_data, idx)
                    if chunk is None:
                        continue
                    self.chunks.append(chunk)
                    self.starts.append(start)
                    self.targets.append(1)
            else:
                # Negative video: draw up to chunk_from_neg_users chunks.
                for idx in range(min(self.chunk_from_neg_users, uid_data.shape[0])):
                    chunk, start = self.extract_one(data, data_shape, uid_data, idx)
                    if chunk is None:
                        continue
                    self.chunks.append(chunk)
                    self.starts.append(start)
                    self.targets.append(0)
                # Not enough negatives in labels.csv: fall back to candidates
                # (sampled with replacement, as in the original).
                if uid_data.shape[0] < self.chunk_from_neg_users:
                    left_chunk = self.chunk_from_neg_users - uid_data.shape[0]
                    cand_data = self.nega_labels[self.nega_labels['uid'] == self.current_uid]
                    for _ in range(min(left_chunk, cand_data.shape[0])):
                        idx = np.random.randint(cand_data.shape[0])
                        chunk, start = self.extract_one(data, data_shape, cand_data, idx)
                        if chunk is None:
                            continue
                        self.chunks.append(chunk)
                        self.starts.append(start)
                        self.targets.append(0)
        else:
            # FIX: the original `while` loop did not advance chunk_idx on the
            # `chunk is None` skip path, which would spin forever; a for-loop
            # visits each grid index exactly once.
            for chunk_idx in range(self.chunks_per_vid):
                chunk, start = self.extract_one(data, data_shape, None, chunk_idx)
                if chunk is None:
                    continue
                self.chunks.append(chunk)
                self.starts.append(start)
                self.targets.append(0)

In [244]:
# NOTE(review): test_mode=True is combined below with set_name='train', so the
# grid-tiling (test) extraction path runs over the training metadata -- confirm
# this mix is intentional.
common = dict(datum_dtype=np.uint8, repo_dir=repo_dir,test_mode=True)

In [245]:
test = ChunkLoader(set_name='train', **common)

In [246]:
chunks,starts,targets = test.next_video(4)


asdf 0.607808828354
0.0116889476776

In [224]:
len(chunks),len(starts),len(targets)


Out[224]:
(512, 512, 512)

In [ ]:


In [221]:
save_folder = '/Users/chen.liu/nfs03/share_data/Intelligence/Scoupon/items/features/dsb_train_features/'
# def write_file(path,array):
#     file_ = open(path,'w')
#     np.save(file_,array)
#     file_.close()

In [185]:
test.data_size


Out[185]:
175

In [251]:
import time
labels_file = open(save_folder+"labels.txt",'w')
for i in range(4,test.data_size):
    t1=time.time()
    c,s,t = test.next_video(i)
    if i%100==0:
        print "procedding ",i
    print len(c)
    t2=time.time()
#     print np.array(c).shape,np.array(s).shape,np.array(t).shape
#     np.save(save_folder+"locaition_"+test.current_uid,np.array(s))
#     np.save(save_folder+"label_"+test.current_uid,np.array(t))
#     np.save(save_folder+"chunk_"+test.current_uid,np.array(c))
    c_map = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
    pickle.dump(np.array(s),open(save_folder+"location_"+test.current_uid, 'w'), protocol=2)
    pickle.dump(np.array(c),open(save_folder+"chunk_"+test.current_uid, 'w'), protocol=2)
    t3=time.time()
    print t3-t2,t2-t1
    for i,l in enumerate(t):
        print >>labels_file,test.current_uid,i,l
    break
labels_file.close()


asdf 0.508436918259
0.00662612915039
512
13.4059228897 0.517055988312

In [249]:
import cPickle as pickle

In [137]:



Out[137]:
(8, 64, 64, 64)

In [133]:
# NOTE(review): `chunk` is not defined in any visible cell (earlier cells
# produce `chunks`), and 275 is a magic count -- presumably the number of
# chunks from a prior run; confirm before re-running on a fresh kernel.
for i in range(275):
    if chunk[i].shape!=(64,64,64):
        print i
#     print chunk[i].shape

In [28]:
# NOTE(review): header=False is not a documented value for read_csv's `header`
# parameter (use header=0 or header=None); the rendered output shows named
# columns, so header=0 appears to be the intended behavior -- confirm.
train_metadata = pd.read_csv(repo_dir+"train-metadata.csv",header=False)

In [56]:
train_metadata.head()


Out[56]:
uid flag z_len y_len x_len
0 1.3.6.1.4.1.14519.5.2.1.6279.6001.139258777898... 1 264 199 256
1 1.3.6.1.4.1.14519.5.2.1.6279.6001.458525794434... 1 309 250 332
2 1.3.6.1.4.1.14519.5.2.1.6279.6001.296066944953... 0 298 185 263
3 1.3.6.1.4.1.14519.5.2.1.6279.6001.219254430927... 1 313 214 296
4 1.3.6.1.4.1.14519.5.2.1.6279.6001.219281726101... 1 263 266 315

In [26]:
labels = pd.read_csv(repo_dir+"labels.csv",header=False)

In [35]:
candidates = pd.read_csv(repo_dir+"candidates.csv",header=False)

In [ ]:


In [62]:
pos_users = train_metadata[train_metadata['flag']==1]['uid']

In [28]:
labels[labels['flag']==1.0]['diam'].describe()


Out[28]:
count    5739.000000
mean        7.641462
std         4.619960
min         2.032223
25%         4.700212
50%         5.998413
75%         8.793915
max        38.147699
Name: diam, dtype: float64

In [38]:
labels[(labels['flag']==1.0) & (labels['diam']>20)].head(20)


Out[38]:
uid flag z y x diam vol
133 1.3.6.1.4.1.14519.5.2.1.6279.6001.227962600322... 1 218 87 179 20.458553 4483.563937
142 1.3.6.1.4.1.14519.5.2.1.6279.6001.287966244644... 1 184 185 234 32.427328 17853.859336
146 1.3.6.1.4.1.14519.5.2.1.6279.6001.287966244644... 1 184 186 234 32.286268 17621.877172
147 1.3.6.1.4.1.14519.5.2.1.6279.6001.287966244644... 1 99 145 153 31.791741 16824.476720
152 1.3.6.1.4.1.14519.5.2.1.6279.6001.287966244644... 1 184 185 234 32.270030 17595.302588
421 1.3.6.1.4.1.14519.5.2.1.6279.6001.194632613233... 1 190 66 78 20.159984 4290.117137
516 1.3.6.1.4.1.14519.5.2.1.6279.6001.619372068417... 1 211 176 203 23.322219 6642.143256
517 1.3.6.1.4.1.14519.5.2.1.6279.6001.619372068417... 1 211 175 203 24.084649 7315.088670
575 1.3.6.1.4.1.14519.5.2.1.6279.6001.487745546557... 1 248 95 204 23.062884 6423.022503
576 1.3.6.1.4.1.14519.5.2.1.6279.6001.487745546557... 1 248 95 204 24.106843 7335.329651
577 1.3.6.1.4.1.14519.5.2.1.6279.6001.487745546557... 1 248 95 204 24.401001 7607.142207
578 1.3.6.1.4.1.14519.5.2.1.6279.6001.487745546557... 1 248 95 204 22.781527 6190.804882
588 1.3.6.1.4.1.14519.5.2.1.6279.6001.219618492426... 1 125 34 84 25.248733 8427.861613
590 1.3.6.1.4.1.14519.5.2.1.6279.6001.219618492426... 1 125 34 84 25.850155 9044.573365
593 1.3.6.1.4.1.14519.5.2.1.6279.6001.219618492426... 1 125 34 84 25.415405 8595.868038
641 1.3.6.1.4.1.14519.5.2.1.6279.6001.271307051432... 1 57 70 268 20.462999 4486.487590
642 1.3.6.1.4.1.14519.5.2.1.6279.6001.271307051432... 1 57 69 268 20.376842 4430.056377
643 1.3.6.1.4.1.14519.5.2.1.6279.6001.271307051432... 1 57 69 269 21.179910 4974.746704
644 1.3.6.1.4.1.14519.5.2.1.6279.6001.271307051432... 1 57 70 269 20.389225 4438.137841
685 1.3.6.1.4.1.14519.5.2.1.6279.6001.300136985030... 1 69 72 246 21.963757 5547.771081

In [51]:
len(set(labels['uid'])-set(train_metadata['uid']))


Out[51]:
175

In [32]:
candidates.head()


Out[32]:
uid x y z diam
0 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... 68.420000 -74.480000 -288.700000 0
1 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... -95.209361 -91.809406 -377.426350 0
2 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... -24.766755 -120.379294 -273.361539 0
3 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... -63.080000 -65.740000 -344.240000 0
4 1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222... 52.946688 -92.688873 -241.067872 0

In [37]:
candidates.head()


Out[37]:
uid flag z y x diam vol
0 1.3.6.1.4.1.14519.5.2.1.6279.6001.139258777898... 0 99 118 40 0 0
1 1.3.6.1.4.1.14519.5.2.1.6279.6001.139258777898... 0 129 86 88 0 0
2 1.3.6.1.4.1.14519.5.2.1.6279.6001.139258777898... 0 151 132 189 0 0
3 1.3.6.1.4.1.14519.5.2.1.6279.6001.139258777898... 0 97 173 69 0 0
4 1.3.6.1.4.1.14519.5.2.1.6279.6001.139258777898... 0 224 103 67 0 0

In [63]:
labels[labels['uid'].isin(pos_users)].shape[0]


Out[63]:
3674

In [65]:
labels.flag.value_counts()


Out[65]:
1    4580
0    3359
dtype: int64

In [ ]: