In [1]:
# Base name of this notebook; reused below to name the output pickle.
FN = "160306-patient"
In [2]:
import re
import os
import dicom
import numpy as np
from collections import Counter
from itertools import chain, izip
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from joblib import Parallel, delayed
import itertools
In [3]:
from utils import Dataset, params, Nt, Nv, Ns, temp_dir, awscp
# Na is the sum of the three study counts imported from utils.
Na = sum((Nt, Nv, Ns))
# Display the individual counts as the cell output.
(Nt, Nv, Ns)
Out[3]:
In [4]:
def calc(s, prefix='sax'):
    """Load one study and summarise its image array.

    Builds ``Dataset(s, prefix=prefix)``, calls ``load()`` on it and computes
    summary statistics of ``dset.images``.

    Parameters
    ----------
    s : study identifier handed to ``Dataset``.
    prefix : series prefix handed to ``Dataset`` (default ``'sax'``).

    Returns
    -------
    tuple ``(s, imagedata, dset)`` where ``imagedata`` is
    ``(shape, mean, std, max)`` of the loaded images; the three statistics
    are NaN when the image array has length zero. The returned ``dset`` has
    its ``images`` and ``dicoms`` attributes reset to ``None``.
    """
    study = Dataset(s, prefix=prefix)
    study.load()

    pixels = study.images
    if len(pixels) == 0:
        stats = (np.nan, np.nan, np.nan)
    else:
        stats = (pixels.mean(), pixels.std(), pixels.max())
    imagedata = (pixels.shape,) + stats

    # free memory: only the metadata attributes are needed by the caller
    study.images = None
    study.dicoms = None
    return s, imagedata, study
In [5]:
# Load train.csv from the configured training directory, indexed by 'Id'.
patient = pd.read_csv(
    os.path.join(params['TRAIN_DATA_PATH'], 'train.csv'),
    index_col='Id',
)
# The number of rows must match the training count from utils.
assert patient.shape[0] == Nt
In [6]:
# Append the validation labels to `patient` when validate.csv is available.
# Only "file / config entry not found" is treated as "no validation data";
# the sanity assertions live in the `else:` branch so that a row-count
# mismatch surfaces as an AssertionError instead of being reported as a
# missing csv (the previous bare `except:` swallowed it).
try:
    patient_val = pd.read_csv(os.path.join(params['VALID_DATA_PATH'], 'validate.csv'), index_col='Id')
except (IOError, OSError, KeyError):
    # IOError/OSError: the csv could not be opened.
    # KeyError: params has no 'VALID_DATA_PATH' entry.
    # Parenthesised single-argument print behaves the same on Python 2.
    print("No validation csv. Running stage I")
else:
    assert len(patient_val) == Nv
    patient = pd.concat((patient, patient_val))
    assert len(patient) == Nt + Nv
Read information about patients from the DICOM files
In [7]:
import sys
from tqdm import tqdm

# Number of studies handed to joblib per progress-bar step.
BATCH_SIZE = 32


def _float_or_str(value):
    """Return ``float(value)`` when the conversion works, else ``str(value)``.

    Only conversion failures (TypeError / ValueError) trigger the string
    fallback, so KeyboardInterrupt and unrelated errors are not swallowed.
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        return str(value)


# For every series prefix: run calc() over all Na studies in parallel, then
# copy the collected metadata into prefix-suffixed columns of `patient`.
for prefix in ['sax', '2ch', '4ch']:
    print(prefix)  # single-argument form prints identically on Python 2
    res = []
    for start in tqdm(range(0, Na, BATCH_SIZE)):
        end = min(start + BATCH_SIZE, Na)
        # calc receives s + 1, i.e. study ids 1..Na
        res0 = Parallel(n_jobs=-1)(delayed(calc)(s + 1, prefix=prefix) for s in range(start, end))
        res += res0

    p = '_' + prefix  # column-name suffix for this prefix
    for s, imagedata, dset in res:
        # Studies whose PatientAge is None get no columns for this prefix.
        if getattr(dset, 'PatientAge') is None:
            continue

        # Scalar metadata: stored as float when convertible, else as text.
        for t in ['PatientAge', 'PatientSex', 'PatientPosition', 'ImageOrientationPatient',
                  'area', 'dist', 'slice_location_range']:
            x = _float_or_str(getattr(dset, t))
            patient.loc[s, t + p] = x

        # Image statistics computed by calc(); the first four entries of the
        # image shape are stored as slices / times / width / height.
        shape, imagemean, imagestd, imagemax = imagedata
        patient.loc[s, 'slices' + p] = shape[0]
        patient.loc[s, 'times' + p] = shape[1]
        patient.loc[s, 'width' + p] = shape[2]
        patient.loc[s, 'height' + p] = shape[3]
        patient.loc[s, 'imagemean' + p] = imagemean
        patient.loc[s, 'imagestd' + p] = imagestd
        patient.loc[s, 'imagemax' + p] = imagemax

        # Aggregates of the 'mean<t>' / 'var<t>' attributes exposed by dset.
        for t in ['TriggerTime', 'NominalInterval', 'RepetitionTime']:
            meant = getattr(dset, 'mean' + t)
            vart = getattr(dset, 'var' + t)
            patient.loc[s, 'meanmean' + t + p] = meant.mean()
            patient.loc[s, 'stdmean' + t + p] = meant.std()
            patient.loc[s, 'rangemean' + t + p] = meant.max() - meant.min()
            patient.loc[s, 'meanvar' + t + p] = vart.mean()

        # Record the shapes only when dset reports more than one.
        if len(dset.shapes) > 1:
            patient.loc[s, 'shapes' + p] = str(dset.shapes)
In [8]:
# Write the assembled table to '<FN>.pkl' inside temp_dir.
fn = '%s.pkl' % FN
pickle_path = os.path.join(temp_dir, fn)
patient.to_pickle(pickle_path)
In [9]:
# Pass the pickle's file name to the project helper `awscp` with upload=True.
# NOTE(review): presumably copies the file from temp_dir to AWS storage — confirm in utils.
awscp(fn,upload=True)
In [10]:
# Scratch cell: evaluates the literal 1 and changes no state.
1
Out[10]:
In [ ]: