In [1]:
import numpy as np
# from data0 import ChunkLoader
import settings
import video
import argparse, os
# parser = argparse.ArgumentParser(description="data processing")
# # Parse the command line arguments
# parser.add_argument('-tm', '--test_mode', action='store_true',
# help='make predictions on test data')
# args = parser.parse_args()
In [195]:
# NOTE(review): hardcoded absolute user path — consider a configurable DATA_DIR
# so the notebook is portable across machines.
repo_dir = '/Users/chen.liu/nfs03/share_data/Intelligence/Scoupon/items/dsb_vids/'
In [243]:
import logging
import numpy as np
import pandas as pd
import os
import video
import settings
import time
class ChunkLoader():
    """Extracts fixed-size cubic chunks from 3D volumes listed in a metadata CSV.

    Two modes:
      * training (test_mode=False): chunks are sampled around labelled points
        from labels.csv (positives) and candidates.csv (extra negatives);
      * test/grid (test_mode=True): a regular grid of
        ``settings.chunks_per_dim ** 3`` chunks is extracted per video.

    After ``next_video(i)`` the parallel lists ``chunks``/``starts``/``targets``
    hold the extracted cubes, their (z, y, x) start coordinates, and the 0/1
    labels for video ``i``.
    """

    def __init__(self, set_name, repo_dir, datum_dtype=np.uint8,
                 test_mode=False):
        # NOTE(review): `datum_dtype` is accepted but currently unused.
        np.random.seed(0)  # fixed seed so random chunk sampling is reproducible
        self.set_name = set_name
        self.repo_dir = repo_dir
        self.is_training = (set_name == 'train')
        self.chunk_size = settings.chunk_size
        self.chunk_shape = (self.chunk_size, self.chunk_size, self.chunk_size)
        self.chunk_volume = np.prod(self.chunk_shape)
        self.metadata = pd.read_csv(
            os.path.join(self.repo_dir, set_name + '-metadata.csv'))
        self.data_size = self.metadata.shape[0]
        # uids of videos flagged positive / negative in the metadata
        self.pos_users = self.metadata[self.metadata['flag'] == 1]['uid']
        self.neg_users = self.metadata[self.metadata['flag'] == 0]['uid']
        self.nvids = self.metadata.shape[0]
        self.chunks_filled = 0
        self.video_idx = 0
        if not test_mode:
            self.labels = pd.read_csv(os.path.join(self.repo_dir, 'labels.csv'))
            self.nega_labels = pd.read_csv(
                os.path.join(self.repo_dir, 'candidates.csv'))
            # number of labelled points belonging to positive users
            self.pos_labels = self.labels[
                self.labels['uid'].isin(self.pos_users)].shape[0]
            self.pos_neg_ratio = 6.0
            # negatives drawn per negative user, sized so the overall
            # neg:pos chunk ratio approaches pos_neg_ratio
            self.chunk_from_neg_users = int(
                self.pos_labels * self.pos_neg_ratio / len(self.neg_users))
        else:
            self.labels = None
        self.chunks_per_vid = settings.chunks_per_dim ** 3
        self.test_mode = test_mode
        self.chunks, self.starts, self.targets = [], [], []
        # state describing the video currently being processed
        self.current_uid = self.current_flag = self.current_meta = None

    def reset(self):
        """Clear accumulated chunks/starts/targets before loading a new video."""
        self.chunks, self.starts, self.targets = [], [], []

    def next_video(self, video_idx):
        """Load the video at metadata row ``video_idx`` and extract its chunks.

        Returns the (chunks, starts, targets) lists for this video.
        """
        self.reset()
        self.current_meta = self.metadata.iloc[video_idx]
        uid = self.current_meta['uid']
        self.current_uid = uid
        self.current_flag = int(self.current_meta['flag'])
        data_filename = os.path.join(self.repo_dir,
                                     uid + '.' + settings.file_ext)
        # volume shape is stored per-video in the metadata as (z, y, x)
        vid_shape = (int(self.current_meta['z_len']),
                     int(self.current_meta['y_len']),
                     int(self.current_meta['x_len']))
        vid_data = video.read_blp(data_filename, vid_shape)
        self.video_idx += 1
        self.extract_chunks(vid_data)
        return self.chunks, self.starts, self.targets

    def slice_chunk(self, start, data):
        """Cut a chunk_size^3 cube out of ``data`` starting at ``start`` (z, y, x)."""
        return data[start[0]:start[0] + self.chunk_size,
                    start[1]:start[1] + self.chunk_size,
                    start[2]:start[2] + self.chunk_size]

    def extract_one(self, data, data_shape, uid_data, idx):
        """Extract one chunk: around labelled point ``idx`` (training) or at
        grid position ``idx`` (test mode). Returns (chunk, start)."""
        if not self.test_mode:
            center = np.array((uid_data['z'].iloc[idx],
                               uid_data['y'].iloc[idx],
                               uid_data['x'].iloc[idx]), dtype=np.int32)
            rad = 0.5 * uid_data['diam'].iloc[idx]
            if rad == 0:
                # Candidate nodules carry no diameter; assign an arbitrary
                # radius (presumably 20mm scaled by resolution — TODO confirm).
                rad = 20 / settings.resolution
            # Start-coordinate range keeping the nodule sphere inside the
            # chunk. `low` may initially be negative; clamped below.
            low = np.int32(center + rad - self.chunk_size)
            high = np.int32(center - rad)
            for j in range(3):
                low[j] = max(0, low[j])
                high[j] = max(low[j] + 1, high[j])
                high[j] = min(data_shape[j] - self.chunk_size, high[j])
                low[j] = min(low[j], high[j] - 1)
            start = [np.random.randint(low=low[i], high=high[i])
                     for i in range(3)]
        else:
            start = self.generate_chunk_start(idx, data_shape)
        chunk = self.slice_chunk(start, data)
        return chunk, start

    def generate_chunk_start(self, chunk_idx, data_shape):
        """Map a flat chunk index onto a regular 3D grid of start coordinates."""
        chunk_spacing = np.int32(
            (np.array(data_shape) - self.chunk_size) / settings.chunks_per_dim)
        per_dim = settings.chunks_per_dim
        # BUGFIX: use floor division `//`; plain `/` only floored under
        # Python 2 and yields float indices under Python 3.
        z_chunk_idx = chunk_idx // per_dim ** 2
        y_chunk_idx = (chunk_idx - z_chunk_idx * per_dim ** 2) // per_dim
        x_chunk_idx = (chunk_idx - z_chunk_idx * per_dim ** 2
                       - y_chunk_idx * per_dim)
        start = [z_chunk_idx * chunk_spacing[0],
                 y_chunk_idx * chunk_spacing[1],
                 x_chunk_idx * chunk_spacing[2]]
        return start

    def extract_chunks(self, data):
        """Fill chunks/starts/targets for the current video according to mode."""
        data_shape = np.array(data.shape, dtype=np.int32)
        if not self.test_mode:
            if self.current_flag:
                # Positive video: one chunk per labelled point, target 1.
                uid_data = self.labels[self.labels['uid'] == self.current_uid]
                for idx in range(uid_data.shape[0]):
                    chunk, start = self.extract_one(data, data_shape,
                                                    uid_data, idx)
                    if chunk is None:
                        continue
                    self.chunks.append(chunk)
                    self.starts.append(start)
                    self.targets.append(1)
            else:
                # Negative video: up to chunk_from_neg_users chunks, target 0.
                uid_data = self.labels[self.labels['uid'] == self.current_uid]
                for idx in range(min(self.chunk_from_neg_users,
                                     uid_data.shape[0])):
                    chunk, start = self.extract_one(data, data_shape,
                                                    uid_data, idx)
                    if chunk is None:
                        continue
                    self.chunks.append(chunk)
                    self.starts.append(start)
                    self.targets.append(0)
                # Not enough negatives in labels.csv: top up from candidates.csv.
                if uid_data.shape[0] < self.chunk_from_neg_users:
                    left_chunk = self.chunk_from_neg_users - uid_data.shape[0]
                    uid_data = self.nega_labels[
                        self.nega_labels['uid'] == self.current_uid]
                    for _ in range(min(left_chunk, uid_data.shape[0])):
                        # random row — may sample the same candidate twice
                        idx = np.random.randint(uid_data.shape[0])
                        chunk, start = self.extract_one(data, data_shape,
                                                        uid_data, idx)
                        if chunk is None:
                            continue
                        self.chunks.append(chunk)
                        self.starts.append(start)
                        self.targets.append(0)
        else:
            # Grid mode. BUGFIX: the original `while` loop `continue`d without
            # incrementing chunk_idx, which would spin forever on a skipped
            # chunk; a `for` loop advances regardless.
            for chunk_idx in range(self.chunks_per_vid):
                chunk, start = self.extract_one(data, data_shape,
                                                None, chunk_idx)
                if chunk is None:
                    continue
                self.chunks.append(chunk)
                self.starts.append(start)
                self.targets.append(0)
In [244]:
# Shared ChunkLoader keyword arguments (grid/test mode extraction).
common = {'datum_dtype': np.uint8, 'repo_dir': repo_dir, 'test_mode': True}
In [245]:
# Build the loader on the TRAIN metadata but with test_mode=True (from
# `common`), so chunks come from the regular grid rather than labels.
test = ChunkLoader(set_name='train', **common)
In [246]:
# Extract all grid chunks for video #4; returns three parallel lists.
chunks,starts,targets = test.next_video(4)
In [224]:
# The three lists should have equal length (one entry per chunk).
len(chunks),len(starts),len(targets)
Out[224]:
In [ ]:
In [221]:
# NOTE(review): hardcoded absolute output path — parameterize for portability.
save_folder = '/Users/chen.liu/nfs03/share_data/Intelligence/Scoupon/items/features/dsb_train_features/'
# Dead helper kept for reference (superseded by pickle.dump below).
# def write_file(path,array):
# file_ = open(path,'w')
# np.save(file_,array)
# file_.close()
In [185]:
# Number of videos (rows) in the loaded metadata table.
test.data_size
Out[185]:
In [251]:
import time
labels_file = open(save_folder+"labels.txt",'w')
for i in range(4,test.data_size):
t1=time.time()
c,s,t = test.next_video(i)
if i%100==0:
print "procedding ",i
print len(c)
t2=time.time()
# print np.array(c).shape,np.array(s).shape,np.array(t).shape
# np.save(save_folder+"locaition_"+test.current_uid,np.array(s))
# np.save(save_folder+"label_"+test.current_uid,np.array(t))
# np.save(save_folder+"chunk_"+test.current_uid,np.array(c))
c_map = np.memmap(filename, dtype='float32', mode='r', shape=(3,4))
pickle.dump(np.array(s),open(save_folder+"location_"+test.current_uid, 'w'), protocol=2)
pickle.dump(np.array(c),open(save_folder+"chunk_"+test.current_uid, 'w'), protocol=2)
t3=time.time()
print t3-t2,t2-t1
for i,l in enumerate(t):
print >>labels_file,test.current_uid,i,l
break
labels_file.close()
In [249]:
import cPickle as pickle
In [137]:
Out[137]:
In [133]:
for i in range(275):
if chunk[i].shape!=(64,64,64):
print i
# print chunk[i].shape
In [28]:
# header=0: the first CSV row is the header ('uid', 'flag', ... are accessed
# by name below). BUGFIX: `header=False` only worked because False == 0 in
# old pandas; modern pandas rejects bool values for `header`.
train_metadata = pd.read_csv(repo_dir+"train-metadata.csv", header=0)
In [56]:
# Peek at the metadata schema (expects uid, flag, z_len/y_len/x_len columns).
train_metadata.head()
Out[56]:
In [26]:
# BUGFIX: `header=False` only worked because False == 0 in old pandas;
# modern pandas rejects bool `header`. The file has a header row
# ('uid', 'flag', 'diam' are accessed by name below).
labels = pd.read_csv(repo_dir+"labels.csv", header=0)
In [35]:
# BUGFIX: `header=False` only worked because False == 0 in old pandas;
# modern pandas rejects bool `header`. Explicit header=0 keeps the
# named-column access used elsewhere in this notebook working.
candidates = pd.read_csv(repo_dir+"candidates.csv", header=0)
In [ ]:
In [62]:
# uids of videos flagged positive in the train metadata.
pos_users = train_metadata.loc[train_metadata['flag'] == 1, 'uid']
In [28]:
# Diameter distribution of positive label rows.
labels[labels['flag']==1.0]['diam'].describe()
Out[28]:
In [38]:
# Inspect the large positives (diameter > 20).
labels[(labels['flag']==1.0) & (labels['diam']>20)].head(20)
Out[38]:
In [51]:
# How many labelled uids are missing from the train metadata?
len(set(labels['uid'])-set(train_metadata['uid']))
Out[51]:
In [32]:
candidates.head()
Out[32]:
In [37]:
# NOTE(review): exact duplicate of the previous cell — delete one.
candidates.head()
Out[37]:
In [63]:
# Count of label rows belonging to positive users
# (mirrors ChunkLoader.pos_labels in __init__).
labels[labels['uid'].isin(pos_users)].shape[0]
Out[63]:
In [65]:
# Class balance of the label rows.
labels.flag.value_counts()
Out[65]:
In [ ]: