In [2]:

    
%matplotlib qt
import pandas as pd
import numpy as np
import mia

Loading and Preprocessing

Loading the hologic and synthetic datasets.



In [3]:

    
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in pandas 1.0.
# read_csv(index_col=0, parse_dates=True) reproduces its defaults: the first
# column becomes the index and date parsing is attempted on that index.
hologic = pd.read_csv("/Volumes/Seagate/mmp_data/2015-04-16/real-lines.csv", index_col=0, parse_dates=True)
hologic_blobs = pd.read_csv("/Volumes/Seagate/mmp_data/2015-04-01/hologic.csv", index_col=0, parse_dates=True)
phantom = pd.read_csv("/Volumes/Seagate/mmp_data/2015-04-16/phantom-lines.csv", index_col=0, parse_dates=True)
phantom_blobs = pd.read_csv("/Volumes/Seagate/mmp_data/2015-04-14/synthetics1-blobs.csv", index_col=0, parse_dates=True)

Loading the metadata for the real (Hologic) and synthetic (phantom) datasets.



In [4]:

    
# Build one metadata table per dataset from the line detections loaded above
# and the corresponding metadata CSV.
hologic_meta = mia.analysis.create_hologic_meta_data(
    hologic,
    "/Volumes/Seagate/mmp_data/meta_data/BIRADS.csv")
phantom_meta = mia.analysis.create_synthetic_meta_data(
    phantom,
    "/Volumes/Seagate/mmp_data/meta_data/synthetic_meta_data_cleaned.csv")

# Name the phantom index 'img_name' (the same name is given to the combined
# label index further down).
phantom_meta.index.name = 'img_name'



In [5]:

    
# Class label per image: the BIRADS column for the hologic images and the
# 'VBD.1' column for the phantoms.
hologic_labels = hologic_meta.drop_duplicates()['BIRADS']
phantom_labels = phantom_meta['VBD.1']

# Stack both label series into one, indexed by image name.
class_labels = pd.concat([hologic_labels, phantom_labels], axis=0)
class_labels.index.name = "img_name"

# remove_duplicate_index returns a frame here; column 0 holds the labels.
# NOTE(review): the column is presumably called 0 because the concatenated
# series is unnamed -- confirm against mia.analysis.remove_duplicate_index.
class_labels_deduplicated = mia.analysis.remove_duplicate_index(class_labels)
labels = class_labels_deduplicated[0]

Create line and blob features for both datasets (missing line-feature values are filled with the column mean).



In [6]:

    
# Line features: missing values are replaced by the per-column mean of the
# same dataset.
hologic_line_features = mia.analysis.features_from_lines(hologic)
hologic_line_features = hologic_line_features.fillna(
    hologic_line_features.mean())

phantom_line_features = mia.analysis.features_from_lines(phantom)
phantom_line_features = phantom_line_features.fillna(
    phantom_line_features.mean())

# Blob features are used as returned (no missing-value filling is applied).
hologic_blob_features = mia.analysis.features_from_blobs(hologic_blobs)
phantom_blob_features = mia.analysis.features_from_blobs(phantom_blobs)



In [7]:

    
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)

# The names are assigned positionally (.tolist() discards the index), so this
# relies on syn_feature_meta and phantom_line_features sharing the same row
# order. It also adds a 'phantom_name' column to phantom_line_features itself.
phantom_names = syn_feature_meta['phantom_name'].tolist()
phantom_line_features['phantom_name'] = phantom_names

# NOTE(review): create_random_subset is presumably unseeded, so the subset
# may differ between runs -- confirm in mia.analysis.
phantom_line_features_subset = mia.analysis.create_random_subset(
    phantom_line_features, 'phantom_name')

# Keep the blob features of exactly the images chosen above.
phantom_blob_features_subset = phantom_blob_features.loc[
    phantom_line_features_subset.index]



In [9]:

    
# Stack the hologic and phantom rows of each feature family.
line_features = pd.concat(
    [hologic_line_features, phantom_line_features], axis=0)
blob_features = pd.concat(
    [hologic_blob_features, phantom_blob_features], axis=0)

# Left-join the line features onto the blob features by index; line-feature
# columns whose name clashes with a blob column get the '_line' suffix.
features = blob_features.join(line_features, how='left', rsuffix='_line')
# assert features.shape[0] == 366
features.head()









    Out[9]:






  
    
      
      blob_count
      avg_radius
      std_radius
      min_radius
      max_radius
      small_radius_count
      med_radius_count
      large_radius_count
      density
      lower_radius_qt
      ...
      75%
      count
      kurtosis
      max
      mean
      min
      phantom_name
      skew
      std
      upper_dist_count_line
    
  
  
    
      p214-010-60001-cl.png
        56
       22.121831
       22.923389
       8
       128.000000
        52
       1
       3
       52.940812
       8
      ...
       137.25
        72
       26.793023
       1744
       161.791667
       1
       NaN
       4.786025
       245.194659
       16
    
    
      p214-010-60001-cr.png
        78
       19.054538
       17.506086
       8
        90.509668
        68
       4
       6
       40.749811
       8
      ...
       152.00
        57
       50.202367
       3494
       176.421053
       1
       NaN
       6.905092
       460.921203
       13
    
    
      p214-010-60001-ml.png
        98
       20.011191
       21.876304
       8
       128.000000
        90
       3
       5
       42.644057
       8
      ...
       210.50
        72
       13.937538
       1662
       200.708333
       1
       NaN
       3.484285
       296.524822
       19
    
    
      p214-010-60001-mr.png
       139
       15.309764
       15.307860
       8
       128.000000
       136
       1
       2
       38.287439
       8
      ...
       192.25
        74
       22.429202
       2522
       203.459459
       1
       NaN
       4.309621
       371.688380
       16
    
    
      p214-010-60005-cl.png
        97
       20.132590
       23.255605
       8
       181.019336
        94
       2
       1
       41.456308
       8
      ...
       170.75
       114
        9.028130
        911
       143.184211
       1
       NaN
       2.667624
       152.914376
       36
    
  

5 rows × 24 columns

t-SNE

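Running a manifold-learning embedding to obtain a two-dimensional representation. Note that the cell below currently calls `mia.analysis.isomap` (Isomap) rather than t-SNE.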



In [16]:

    
# Embed a copy of the blob features (not the joined `features` table) into
# two components.
selected_features = blob_features.copy(deep=True)
# Disabled experiment, kept for reference:
# selected_features.drop(['skew', 'kurtosis', 'min'], inplace=True, axis=1)
mapping = mia.analysis.isomap(
    selected_features,
    n_components=2)



In [52]:

    
# Persist the current embedding to disk.
lines_mapping_path = "/Volumes/Seagate/mmp_data/2015-04-16/lines-mapping.csv"
mapping.to_csv(lines_mapping_path)



In [ ]:

    
# `mapping` was computed from the hologic + phantom blob features, while
# `hologic_line_features` holds hologic rows only. Select the hologic rows of
# the first embedding coordinate explicitly instead of relying on pandas to
# reindex a differently-indexed boolean mask.
first_component = mapping.loc[hologic_line_features.index, 0]

# Split the hologic images by the sign of the first embedding coordinate.
left = hologic_line_features[first_component < 0]
right = hologic_line_features[first_component >= 0]

# Difference between the summary statistics of the two halves.
left.describe() - right.describe()



In [ ]:

    
# Summary statistics of the phantom subset minus those of the hologic images.
phantom_line_features_subset.describe().sub(hologic_line_features.describe())



In [18]:

    
def plot_mapping(m):
    """Scatter-plot a mapping with hologic and phantom images on the same axes.

    The rows of ``m`` are selected with ``.loc`` using the indexes of the
    notebook-level ``hologic_line_features`` and ``phantom_line_features``
    frames, and each point is labelled from the notebook-level ``labels``.
    Hologic points are drawn with size 20; phantom points are drawn on the
    same axes with a '^' marker and size 50.
    """
    hologic_points = m.loc[hologic_line_features.index]
    phantom_points = m.loc[phantom_line_features.index]

    # Look up both label sets before any drawing happens.
    hologic_classes = labels[hologic_points.index]
    phantom_classes = labels[phantom_points.index]

    axes = mia.plotting.plot_scatter_2d(
        hologic_points, labels=hologic_classes, s=20)
    axes = mia.plotting.plot_scatter_2d(
        phantom_points, labels=phantom_classes, ax=axes, marker='^', s=50)

plot_mapping(mapping)



In [ ]:

    
# Write the selected features and their embedding next to each other on disk.
features_csv_path = "/Volumes/Seagate/mmp_data/features.csv"
mapping_csv_path = "/Volumes/Seagate/mmp_data/mapping.csv"

selected_features.to_csv(features_csv_path)
mapping.to_csv(mapping_csv_path)

Analysis



In [ ]:

    
# DataFrame.from_csv was removed in pandas 1.0; read_csv with index_col=0 and
# parse_dates=True reproduces its defaults.
# Note: this rebinds `mapping` to a previously saved embedding.
mapping = pd.read_csv("/Volumes/Seagate/mmp_data/2015-04-01/both_shape_mapping.csv", index_col=0, parse_dates=True)
plot_mapping(mapping)



In [45]:

    
# Attach the class label to each feature row, then drop the columns that
# should not be drawn as radviz anchors. drop() returns a new frame, so
# `selected_features` itself is left untouched.
m = selected_features.copy()
m['class'] = labels
m = m.drop(['25%', 'upper_dist_count'], axis=1)

# pd.tools.plotting was deprecated in pandas 0.20 and later removed; the
# plotting helpers now live in pd.plotting.
pd.plotting.radviz(m, 'class')









    Out[45]:





<matplotlib.axes._subplots.AxesSubplot at 0x11ca64350>

	blob_count	avg_radius	std_radius	min_radius	max_radius	small_radius_count	med_radius_count	large_radius_count	density	lower_radius_qt	...	75%	count	kurtosis	max	mean	min	phantom_name	skew	std	upper_dist_count_line
p214-010-60001-cl.png	56	22.121831	22.923389	8	128.000000	52	1	3	52.940812	8	...	137.25	72	26.793023	1744	161.791667	1	NaN	4.786025	245.194659	16
p214-010-60001-cr.png	78	19.054538	17.506086	8	90.509668	68	4	6	40.749811	8	...	152.00	57	50.202367	3494	176.421053	1	NaN	6.905092	460.921203	13
p214-010-60001-ml.png	98	20.011191	21.876304	8	128.000000	90	3	5	42.644057	8	...	210.50	72	13.937538	1662	200.708333	1	NaN	3.484285	296.524822	19
p214-010-60001-mr.png	139	15.309764	15.307860	8	128.000000	136	1	2	38.287439	8	...	192.25	74	22.429202	2522	203.459459	1	NaN	4.309621	371.688380	16
p214-010-60005-cl.png	97	20.132590	23.255605	8	181.019336	94	2	1	41.456308	8	...	170.75	114	9.028130	911	143.184211	1	NaN	2.667624	152.914376	36