Jay Narhan
May 2017

Parse the text files that describe the DDSM and MIAS datasets. We then augment that information and create three meta files:
meta_data_all.csv will be of the following format:
| Patient_ID | Image_Name | Orientation | View | Lesion_Type | Detection_Res | Pathology_Res |
|---|---|---|---|---|---|---|
The other two CSVs will be of the following format:
| Patient_ID | Image_Name | View | Detection_Res | Pathology_Res |
|---|---|---|---|---|
In [1]:
__version__ = '0.1.0'
__status__ = 'Development'
__date__ = '2017-May-25'
__author__ = 'Jay Narhan'
In [2]:
import os
import pandas as pd
import numpy as np
from collections import Counter
In [3]:
# Directory that holds the raw metadata files. The trailing '/' is kept so
# that file names can be appended to META_ROOT directly.
META_ROOT = '{}/'.format(os.path.realpath('../../Meta_Data_Files'))
# Source metadata files for the two datasets.
DDSM_META = os.path.join(META_ROOT, 'Ddsm_png.csv')
MIAS_META = os.path.join(META_ROOT, 'MIAS.txt')
In [4]:
# Load the DDSM metadata. Lines 0 and 2 of the file are skipped, so the
# header is taken from the first line that is not skipped.
ddsm = pd.read_csv(DDSM_META, skiprows=[0, 2])
# Missing values become the literal string "None". np.nan is used because
# the np.NAN alias no longer exists in NumPy 2.0.
ddsm = ddsm.replace(np.nan, "None")
ddsm = ddsm.drop(['Type', 'AbType', 'Scanner', 'SubFolder'], axis=1)
# Rows without a pathology entry are treated as NORMAL. The result is
# assigned back instead of using inplace=True on the selected column, which
# may act on a temporary copy and leave the frame unchanged.
ddsm['Pathology'] = ddsm['Pathology'].replace('None', 'NORMAL')
# Two-class detection label derived from the pathology.
ddsm['Detection_Res'] = np.where(ddsm['Pathology'].str.match(r'NORMAL'), 'NORMAL', 'ABNORMAL')
# View and orientation are read from the image name; anything that is not
# CC / LEFT is taken to be MLO / RIGHT.
ddsm['View'] = np.where(ddsm['Name'].str.contains(r'CC'), 'CC', 'MLO')
# The patient id is the part of the image name before the first '.'.
ddsm['Patient_ID'] = ddsm['Name'].str.extract(r'^([^.]*)', expand=False)
ddsm['Orientation'] = np.where(ddsm['Name'].str.contains(r'LEFT'), 'LEFT', 'RIGHT')
In [5]:
# Column layout shared by the DDSM and MIAS tables.
order = ['Patient_ID', 'Image_Name', 'Orientation', 'View', 'Lesion_Type', 'Detection_Res', 'Pathology_Res']
# Rename the raw DDSM columns to that layout and put them in the shared order.
ddsm_column_names = {'Name': 'Image_Name', 'LesionType': 'Lesion_Type', 'Pathology': 'Pathology_Res'}
ddsm = ddsm.rename(columns=ddsm_column_names)[order]
print('Length of DDSM meta information (not necessarily files available): {}'.format(len(ddsm)))
In [6]:
# Preview the first eight DDSM rows.
ddsm.head(8)
Out[6]:
In [7]:
# Sanity-check the DDSM metadata: each patient is expected to have exactly
# four images.
# NOTE(review): the exported source lost its indentation; the two summary
# prints below are assumed to run unconditionally -- confirm against the
# original notebook.
pat_ids = list(ddsm['Patient_ID'])
if len(pat_ids) % 4 != 0:  # this = number of images and should be divisible by 4!
    print('Missing DDSM data!')

print('Number of patients: {}'.format(len(set(pat_ids))))
print('Number of images: {}'.format(len(pat_ids)))

# Missing data: report every patient whose image count is not four.
counts = Counter(pat_ids)
# .items() works on both Python 2 and 3, unlike .iteritems().
for k, v in counts.items():
    if v != 4:
        print('DDSM Patient: {0} only has {1} images'.format(k, v))
In [8]:
# Read the MIAS description file into a list of lines (newlines retained).
with open(MIAS_META) as mias_file:
    content = list(mias_file)
In [9]:
# Start from an empty MIAS table that has the shared column layout.
mias = pd.DataFrame(data=None, columns=order)
In [10]:
# Build the MIAS metadata table: one record per non-blank line of the MIAS
# description file, with consecutive lines paired into one patient.
#
# NOTE(review): the exported source lost its indentation; the patient
# counter is assumed to advance only after the second (RIGHT) image of each
# pair -- confirm against the original notebook.
# NOTE(review): the mini-MIAS README is understood to pair films as
# left = even file numbers, right = odd file numbers, which would be the
# reverse of the labelling below (mdb001 -> LEFT). Confirm before relying
# on the Orientation column for MIAS rows.
mias_patient = 0
mias_rows = []
# Blank lines (e.g. a trailing newline at end of file) carry no record.
mias_lines = [row for row in content if row.strip()]
for i, row in enumerate(mias_lines):
    # Split on any whitespace so the trailing newline never stays attached
    # to the last token (a bare 'B\n' would otherwise not compare equal to 'B').
    line = row.split()
    img_name = 'mdb' + str(i + 1).zfill(3) + '.png'  # text file has error in names i.e. line[0] - do not use

    lesion = line[2]
    if lesion == 'NORM':
        lesion = 'None'
    elif lesion == 'CALC':
        lesion = 'CALCIFICATION'

    # A fourth token, when present, is the severity: 'B' is benign and
    # anything else is treated as malignant. Lines without it are normal.
    if len(line) > 3:
        pathology = 'BENIGN' if line[3] == 'B' else 'MALIGNANT'
    else:
        pathology = 'NORMAL'
    detection = 'NORMAL' if pathology == 'NORMAL' else 'ABNORMAL'

    patient_id = 'MIAS_' + str(mias_patient)
    if i % 2 == 0:
        orientation = 'LEFT'
    else:
        orientation = 'RIGHT'
        mias_patient += 1
    mias_rows.append([patient_id, img_name, orientation, 'MLO', lesion, detection, pathology])

# Build the frame once rather than enlarging it row by row.
mias = pd.DataFrame(mias_rows, columns=order)
In [11]:
# Preview the first eight MIAS rows.
mias.head(8)
Out[11]:
In [12]:
# Report how many MIAS records were parsed.
mias_record_count = len(mias)
print('Length of MIAS meta information (not necessarily files): {}'.format(mias_record_count))
This is one file representing meta info on all raw csv files for DDSM and MIAS.
In [13]:
# Stack the DDSM and MIAS tables into one frame with the shared column
# order. pd.concat replaces DataFrame.append, which was removed in pandas
# 2.0. As before, each table keeps its own row labels.
all_data = pd.concat([ddsm, mias])[order]
In [14]:
# Persist the combined per-image metadata, without the row index.
all_data.to_csv(os.path.join(META_ROOT, 'meta_data_all.csv'), index=False)
In [15]:
# Preview the first five combined rows.
all_data.head(5)
Out[15]:
In [16]:
# List the distinct label values present in the combined table.
detection_values = set(all_data['Detection_Res'])
pathology_values = set(all_data['Pathology_Res'])
print('Unique Detection_Res values: {}'.format(detection_values))
print('Unique Pathology_Res values: {}'.format(pathology_values))
In [17]:
# Split the combined table by breast side.
meta_left = all_data[all_data['Orientation'] == 'LEFT']
meta_right = all_data[all_data['Orientation'] == 'RIGHT']
In [18]:
# Long to wide: pair every LEFT row with the RIGHT row of the same patient
# and view. Left-side columns get the suffix _x, right-side ones _y.
meta = pd.merge(meta_left, meta_right, how='inner', on=['Patient_ID', 'View'])
# The per-side image names are replaced by a single name built from the
# patient id and the view.
meta = meta.drop(['Image_Name_x', 'Image_Name_y'], axis=1)
meta['Image_Name'] = meta['Patient_ID'] + '_' + meta['View'] + '.png'
print('Number of records in meta: {:>10}'.format(meta['Patient_ID'].count()))
In [19]:
# Preview the first five paired rows.
meta.head(5)
Out[19]:
In [20]:
# Put the identifying columns first, then the left-side (_x) fields,
# then the right-side (_y) fields.
wide_columns = ['Patient_ID', 'Image_Name', 'View']
wide_columns += ['Orientation_x', 'Lesion_Type_x', 'Detection_Res_x', 'Pathology_Res_x']
wide_columns += ['Orientation_y', 'Lesion_Type_y', 'Detection_Res_y', 'Pathology_Res_y']
meta = meta.reindex(columns=wide_columns)
meta.head()
Out[20]:
In [21]:
# Classify each left/right pair by the pathology of its two sides. Every
# result is the tuple returned by np.where, i.e. row positions in meta.
left_res = meta['Pathology_Res_x']
right_res = meta['Pathology_Res_y']

left_malignant = left_res == 'MALIGNANT'
right_malignant = right_res == 'MALIGNANT'
left_normal = left_res == 'NORMAL'
right_normal = right_res == 'NORMAL'
# BENIGN and BENIGN_WITHOUT_CALLBACK are both counted as benign.
benign_labels = ['BENIGN', 'BENIGN_WITHOUT_CALLBACK']
left_benign = left_res.isin(benign_labels)
right_benign = right_res.isin(benign_labels)

# Malignant: at least one malignant side, the other malignant or normal.
malignants = np.where((left_malignant & (right_malignant | right_normal)) |
                      (left_normal & right_malignant))
# Benign: at least one benign side, the other benign or normal.
benigns = np.where((left_benign & (right_benign | right_normal)) |
                   (left_normal & right_benign))
# Both: one side benign and the other malignant.
both = np.where((left_benign & right_malignant) |
                (left_malignant & right_benign))
# Normal: both sides normal.
normals = np.where(left_normal & right_normal)
# Unproven: either side unproven.
unproven = np.where((left_res == 'UNPROVEN') | (right_res == 'UNPROVEN'))
In [22]:
def add_diagnosis(row_indxs, df, label):
    """Write ``label`` into ``df['Pathology_Res']`` for the selected rows.

    Args:
        row_indxs: Iterable of positional row selections, typically the
            tuple returned by ``np.where(mask)``.
        df: DataFrame to label. It is modified in place; the
            ``Pathology_Res`` column is created if it does not exist yet.
        label: Value stored for the selected rows.

    Returns:
        None. The caller's frame is changed directly.
    """
    for item in row_indxs:
        # np.where reports row positions, whereas .loc looks rows up by
        # label. Translate positions to labels first so the intended rows
        # are hit even when the index is not the default 0..n-1 range.
        df.loc[df.index[item], 'Pathology_Res'] = label
In [23]:
# Label every pair with its combined pathology, one category at a time.
diagnosis_groups = [
    (malignants, 'MALIGNANT'),
    (benigns, 'BENIGN'),
    (both, 'BENIGN+MALIGNANT'),
    (normals, 'NORMAL'),
    (unproven, 'UNPROVEN'),
]
for group_rows, group_label in diagnosis_groups:
    add_diagnosis(group_rows, meta, group_label)
In [24]:
# Show rows at positions 9 to 14 after labelling.
meta.iloc[9:15]
Out[24]:
In [25]:
# Count the pairs in each combined pathology category.
meta.Pathology_Res.value_counts()
Out[25]:
In [26]:
# Two-class detection label: NORMAL pairs stay NORMAL, everything else is
# ABNORMAL.
is_normal_pair = meta['Pathology_Res'] == 'NORMAL'
meta['Detection_Res'] = 'ABNORMAL'
meta.loc[is_normal_pair, 'Detection_Res'] = 'NORMAL'
meta['Detection_Res'].value_counts()
Out[26]:
In [27]:
# Pairs with an unproven pathology are marked UNPROVEN for detection too;
# all other rows keep their current detection label.
is_unproven_pair = meta['Pathology_Res'] == 'UNPROVEN'
meta.loc[is_unproven_pair, 'Detection_Res'] = 'UNPROVEN'
meta['Detection_Res'].value_counts()
Out[27]:
In [28]:
# Show rows at positions 9 to 14 with the detection label added.
meta.iloc[9:15]
Out[28]:
In [29]:
# Keep only the columns that go into the two per-pair CSV files.
cols = ['Patient_ID', 'Image_Name', 'View', 'Detection_Res', 'Pathology_Res']
meta = meta.loc[:, cols]
In [30]:
# Show rows at positions 9 to 14 in the reduced layout.
meta.iloc[9:15]
Out[30]:
Important: keep metadata only for the differenced images that we actually have on disk.
In [31]:
# Directory that holds the differenced images (machine-specific path).
DIFF_IMG_DIR = '/Users/jnarhan/Projects/CUNY_698/Docker-Shared/Data_Diff_Images/ALL_IMGS/'
# Keep only the records whose differenced image file exists in that directory.
mask = [os.path.isfile(DIFF_IMG_DIR + img_name) for img_name in meta['Image_Name']]
meta = meta[mask]
print('Number of records in meta: {:>10}'.format(meta['Patient_ID'].count()))
Detection Labels: Two-Case Abnormal and Normal
In [32]:
# Detection label counts over all remaining records.
detection_labels = meta['Detection_Res']
print('Number of Abnormals: {:>4}'.format((detection_labels == 'ABNORMAL').sum()))
print('Number of Normals: {:>6}'.format((detection_labels == 'NORMAL').sum()))
print('Number of Unproven: {:>5}'.format((detection_labels == 'UNPROVEN').sum()))
print('Total: {:18}'.format(detection_labels.count()))
Drop the unproven cases, since it is uncertain whether an abnormality exists in them.
In [33]:
# Detection data set: keep only the ABNORMAL and NORMAL records.
mask = meta['Detection_Res'].isin(['ABNORMAL', 'NORMAL'])
detect_meta = meta.loc[mask]
set(detect_meta['Detection_Res'])
Out[33]:
In [34]:
# Label counts for the detection data set after filtering.
kept_detection = detect_meta['Detection_Res']
print('Number of Abnormals: {:>4}'.format((kept_detection == 'ABNORMAL').sum()))
print('Number of Normals: {:>6}'.format((kept_detection == 'NORMAL').sum()))
# The unproven count is taken from the pathology column here.
print('Number of Unproven: {:>5}'.format((detect_meta['Pathology_Res'] == 'UNPROVEN').sum()))
print('Total: {:18}'.format(kept_detection.count()))
In [35]:
# Persist the detection data set, without the row index.
detect_meta.to_csv(os.path.join(META_ROOT, 'meta_data_detection.csv'), index=False)
Diagnosis Labels: Two-Case Benign and Malignant
In [36]:
# Pathology label counts over all remaining records.
pathology_labels = meta['Pathology_Res']
print('Number of Normals: {:>6}'.format((pathology_labels == 'NORMAL').sum()))
print('Number of B&Ms: {:>9}'.format((pathology_labels == 'BENIGN+MALIGNANT').sum()))
print('Number of Bs: {:>11}'.format((pathology_labels == 'BENIGN').sum()))
print('Number of Ms: {:>11}'.format((pathology_labels == 'MALIGNANT').sum()))
print('Number of Unproven: {:>5}'.format((pathology_labels == 'UNPROVEN').sum()))
print('Total: {:18}'.format(pathology_labels.count()))
B&M's (92) and Unproven (16) will need to be removed from the differencing analysis when it is run: for the Unproven cases the pathology cannot be established (in one or both of the breasts), and it is unclear how B&M's should be handled through differencing.
In [37]:
# Diagnosis data set: keep only the BENIGN and MALIGNANT records. NORMAL,
# BENIGN+MALIGNANT and UNPROVEN pairs are left out.
mask = meta['Pathology_Res'].isin(['BENIGN', 'MALIGNANT'])
diagnosis_meta = meta.loc[mask]
set(diagnosis_meta['Pathology_Res'])
Out[37]:
In [38]:
# Label counts for the diagnosis data set.
kept_pathology = diagnosis_meta['Pathology_Res']
print('Number of Benigns: {:>7}'.format((kept_pathology == 'BENIGN').sum()))
print('Number of Malignants: {:}'.format((kept_pathology == 'MALIGNANT').sum()))
print('Total: {:19}'.format(kept_pathology.count()))
In [39]:
# Persist the diagnosis data set, without the row index.
diagnosis_meta.to_csv(os.path.join(META_ROOT, 'meta_data_diagnosis.csv'), index=False)
In [ ]: