In [1]:
# Base name of this notebook's output: later cells write FN + '.pkl' into
# temp_dir and pass that file name to awscp.
FN = '160306-patient'

In [2]:
import re
import os
import dicom
import numpy as np
from collections import Counter
from itertools import chain, izip
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
from joblib import Parallel, delayed
import itertools

In [3]:
from utils import Dataset, params, Nt, Nv, Ns, temp_dir, awscp
# Total number of studies; the extraction loop below visits study numbers 1..Na.
# Nt and Nv are later checked against the row counts of train.csv and validate.csv.
# NOTE(review): Ns is presumably the size of the remaining (test) set -- confirm in utils.
Na = Nt+Nv+Ns
# Display the three individual counts as the cell output.
Nt, Nv, Ns


Out[3]:
(500, 200, 440)

In [4]:
def calc(s, prefix='sax'):
    """Load one study and summarise its pixel data.

    Parameters
    ----------
    s : study identifier, passed straight through to ``Dataset``.
    prefix : which series to load (``'sax'`` by default); forwarded to
        ``Dataset`` as the ``prefix`` keyword.

    Returns
    -------
    (s, imagedata, dset) where ``imagedata`` is the tuple
    ``(shape, mean, std, max)`` of ``dset.images``. The three statistics are
    NaN when ``dset.images`` has length zero. ``dset`` is the loaded
    ``Dataset`` with its ``images`` and ``dicoms`` attributes set to ``None``.
    """
    dset = Dataset(s, prefix=prefix)
    dset.load()

    pixels = dset.images
    if len(pixels) == 0:
        # Nothing to measure: keep the shape but mark the statistics as missing.
        stats = (np.nan, np.nan, np.nan)
    else:
        stats = (pixels.mean(), pixels.std(), pixels.max())
    imagedata = (pixels.shape,) + stats

    # Drop the bulky payloads to free memory; only the metadata on dset is kept.
    dset.images = dset.dicoms = None
    return s, imagedata, dset

In [5]:
# Load the training labels, indexed by the 'Id' column, and check that the
# row count matches the expected number of training studies.
train_csv = os.path.join(params['TRAIN_DATA_PATH'], 'train.csv')
patient = pd.read_csv(train_csv, index_col='Id')
assert len(patient) == Nt

In [6]:
# Append the validation labels to `patient` when they are available.
# Only a missing path setting or a missing/unreadable file is treated as
# "stage I"; a row-count mismatch or any other error is allowed to surface
# instead of being reported as "no validation csv".
try:
    patient_val = pd.read_csv(os.path.join(params['VALID_DATA_PATH'],'validate.csv'), index_col='Id')
except (IOError, OSError, KeyError):
    print("No validation csv. Running stage I")
else:
    assert len(patient_val) == Nv
    # Build the combined table under a temporary name so that a failed check
    # (e.g. when this cell is re-run and `patient` already contains the
    # validation rows) does not leave `patient` with duplicated rows.
    patient_combined = pd.concat((patient, patient_val))
    assert len(patient_combined) == Nt + Nv
    patient = patient_combined

Read information about patients from the DICOM files


In [7]:
import sys
from tqdm import tqdm

# Number of studies handed to the worker pool per progress-bar step.
batch_size = 32

for prefix in ['sax','2ch','4ch']:
    print(prefix)

    # Run calc over study numbers 1..Na (calc receives s+1) in parallel,
    # one batch at a time so that tqdm can report progress.
    res = []
    for start in tqdm(range(0, Na, batch_size)):
        end = min(start + batch_size, Na)
        res.extend(Parallel(n_jobs=-1)(delayed(calc)(s+1, prefix=prefix) for s in range(start, end)))

    # Every column written for this series type carries the suffix '_<prefix>'.
    p = '_' + prefix
    for s, imagedata, dset in res:
        # Studies whose Dataset reports no PatientAge are skipped entirely.
        # NOTE(review): presumably this means nothing was loaded for this prefix -- confirm in utils.Dataset.
        if getattr(dset, 'PatientAge') is None:
            continue

        # Scalar attributes: stored as float when convertible, otherwise as
        # their string representation.
        for t in ['PatientAge', 'PatientSex', 'PatientPosition', 'ImageOrientationPatient', 'area', 'dist', 'slice_location_range']:
            x = getattr(dset, t)
            try:
                x = float(x)
            except (TypeError, ValueError):
                # Only conversion failures fall back to str; anything else
                # (including KeyboardInterrupt) propagates.
                x = str(x)
            patient.loc[s, t+p] = x

        shape, imagemean, imagestd, imagemax = imagedata

        # The first four axes of the image array are recorded under these names.
        for axis, label in enumerate(['slices', 'times', 'width', 'height']):
            patient.loc[s, label+p] = shape[axis]
        patient.loc[s, 'imagemean'+p] = imagemean
        patient.loc[s, 'imagestd'+p] = imagestd
        patient.loc[s, 'imagemax'+p] = imagemax

        # Timing tags: dset.mean<Tag> and dset.var<Tag> must be array-like
        # (their .mean/.std/.max/.min are used) and are reduced to scalars here.
        for t in ['TriggerTime', 'NominalInterval', 'RepetitionTime']:
            meant = getattr(dset, 'mean' + t)
            vart = getattr(dset, 'var' + t)
            patient.loc[s, 'meanmean'+t+p] = meant.mean()
            patient.loc[s, 'stdmean'+t+p] = meant.std()
            patient.loc[s, 'rangemean'+t+p] = meant.max() - meant.min()
            patient.loc[s, 'meanvar'+t+p] = vart.mean()

        # Record dset.shapes (as a string) only when it has more than one entry.
        if len(dset.shapes) > 1:
            patient.loc[s, 'shapes'+p] = str(dset.shapes)


  0%|          | 0/36 [00:00<?, ?it/s]
sax
  0%|          | 0/36 [00:00<?, ?it/s]
2ch
  0%|          | 0/36 [00:00<?, ?it/s]
4ch


In [8]:
# Write the assembled patient table to <temp_dir>/<FN>.pkl.
# `fn` (the bare file name) is kept as a variable because the next cell uses it.
fn = FN + '.pkl'
pickle_path = os.path.join(temp_dir, fn)
patient.to_pickle(pickle_path)

In [9]:
# Hand the pickle's file name to the project helper in upload mode.
# NOTE(review): presumably copies temp_dir/<fn> to remote (AWS) storage -- confirm in utils.awscp.
awscp(fn,upload=True)

In [10]:
# No-op cell: evaluates the literal 1 and has no other effect.
1


Out[10]:
1

In [ ]: