In [14]:
%matplotlib qt
import pandas as pd
import numpy as np
import mia

Loading and Preprocessing

Loading the hologic and synthetic datasets.


In [15]:
hologic = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/hologic.csv")
phantom = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/synthetics1-blobs.csv")

Loading the meta data for the real and synthetic datasets.


In [16]:
hologic_meta = mia.analysis.create_hologic_meta_data(hologic, "/Volumes/Seagate/mmp_data/meta_data/BIRADS.csv")
phantom_meta = mia.analysis.create_synthetic_meta_data(phantom, "/Volumes/Seagate/mmp_data/meta_data/synthetic_meta_data_cleaned.csv")
phantom_meta.index.name = 'img_name'

Load the texture data generated from the real and synthetic blobs


In [17]:
hologic_texture = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/hologic_texture.csv")
phantom_texture = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/synthetics1_texture.csv")
hologic_texture.drop(['x', 'y', 'breast_area'], inplace=True, axis=1)
phantom_texture.drop(['x', 'y', 'breast_area'], inplace=True, axis=1)

Group blobs by radius and image name


In [18]:
hologic_texture_features = mia.analysis.group_by_scale_space(hologic_texture)
phantom_texture_features = mia.analysis.group_by_scale_space(phantom_texture)

Select random subset of the phantoms cases. This is important so that each synthetic case is only represented once.


In [19]:
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)
phantom_texture_features['phantom_name'] = syn_feature_meta.phantom_name.tolist()
phantom_texture_features_subset = mia.analysis.create_random_subset(phantom_texture_features, 'phantom_name')

In [20]:
features = pd.concat([hologic_texture_features, phantom_texture_features_subset])
assert features.shape[0] == 366

In [21]:
hologic_labels = hologic_meta.drop_duplicates().BIRADS
phantom_labels = phantom_meta['VBD.1']

class_labels = pd.concat([hologic_labels, phantom_labels])
class_labels.index.name = "img_name"
labels = mia.analysis.remove_duplicate_index(class_labels)[0]

t-SNE

Running t-SNE to obtain a lower dimensional representation.


In [162]:
selected_features = features
mapping = mia.analysis.tSNE(selected_features, n_components=2, learning_rate=300, perplexity=30, verbose=1)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 0.886677
[t-SNE] Error after 83 iterations with early exaggeration: 13.065503
[t-SNE] Error after 285 iterations: 0.413171

In [163]:
def plot_mapping(m):
    hologic_map = m.loc[hologic_texture_features.index]
    phantom_map = m.loc[phantom_texture_features_subset.index]

    hol_labels = labels[hologic_map.index]
    syn_labels = labels[phantom_map.index]

    ax = mia.plotting.plot_scatter_2d(hologic_map, labels=hol_labels, s=10)
    ax = mia.plotting.plot_scatter_2d(phantom_map, labels=syn_labels, ax=ax, marker='^', s=50)

plot_mapping(mapping)

Analysis


In [164]:
mapping = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/both_texture_mapping.csv")
plot_mapping(mapping)


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-164-47091ca3fa93> in <module>()
      1 mapping = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/both_texture_mapping.csv")
----> 2 plot_mapping(mapping)

<ipython-input-163-b930ee3c2f1f> in plot_mapping(m)
      1 def plot_mapping(m):
      2     hologic_map = m.loc[hologic_texture_features.index]
----> 3     phantom_map = m.loc[phantom_texture_features_subset.index]
      4 
      5     hol_labels = labels[hologic_map.index]

/Users/samuel/git/major-project/lib/python2.7/site-packages/pandas/core/indexing.pyc in __getitem__(self, key)
   1200             return self._getitem_tuple(key)
   1201         else:
-> 1202             return self._getitem_axis(key, axis=0)
   1203 
   1204     def _getitem_axis(self, key, axis=0):

/Users/samuel/git/major-project/lib/python2.7/site-packages/pandas/core/indexing.pyc in _getitem_axis(self, key, axis)
   1333                     raise ValueError('Cannot index with multidimensional key')
   1334 
-> 1335                 return self._getitem_iterable(key, axis=axis)
   1336 
   1337             # nested tuple slicing

/Users/samuel/git/major-project/lib/python2.7/site-packages/pandas/core/indexing.pyc in _getitem_iterable(self, key, axis)
    930     def _getitem_iterable(self, key, axis=0):
    931         if self._should_validate_iterable(axis):
--> 932             self._has_valid_type(key, axis)
    933 
    934         labels = self.obj._get_axis(axis)

/Users/samuel/git/major-project/lib/python2.7/site-packages/pandas/core/indexing.pyc in _has_valid_type(self, key, axis)
   1281 
   1282                 raise KeyError("None of [%s] are in the [%s]" %
-> 1283                                (key, self.obj._get_axis_name(axis)))
   1284 
   1285             return True

KeyError: "None of [Index([u'test_Mix_DPerc0_c_2.dcm', u'test_Mix_DPerc10_c_14.dcm', u'test_Mix_DPerc20_c_4.dcm', u'test_Mix_DPerc35_c_6.dcm', u'test_Mix_DPerc5_c_1.dcm', u'test_Mix_DPerc75_c_0.dcm'], dtype='object')] are in the [index]"

In [165]:
index = hologic_texture_features.columns[:]
phantom_texture_features_subset[index].describe() - hologic_texture_features[index].describe()


Out[165]:
contrast dissimilarity homogeneity energy contrast_1 dissimilarity_1 homogeneity_1 energy_1 contrast_2 dissimilarity_2 ... homogeneity_7 energy_7 contrast_8 dissimilarity_8 homogeneity_8 energy_8 contrast_9 dissimilarity_9 homogeneity_9 energy_9
count -354.000000 -354.000000 -354.000000 -354.000000 -354.000000 -354.000000 -354.000000 -354.000000 -354.000000 -354.000000 ... -354.000000 -354.000000 -354.000000 -354.000000 -354.000000 -354.000000 -354.000000 -354.000000 -354.000000 -354.000000
mean -36.094598 -8.574261 0.820865 0.629094 -12.069407 -8.393536 0.818207 0.598906 0.040391 -8.242244 ... 0.824067 0.480287 -97.507895 -8.322890 0.830725 0.446720 -75.764810 -8.094365 0.835126 0.398989
std 77.573677 -1.029735 -0.003017 0.064853 74.799274 -1.064820 -0.005005 0.060455 123.808647 -0.829207 ... -0.007729 0.053569 -17.650738 -1.144664 -0.002428 0.067979 -27.771133 -1.186786 -0.017358 -0.003905
min -38.452866 -4.813404 0.828990 0.536927 -32.148727 -4.409458 0.831783 0.515243 -40.507932 -4.703870 ... 0.842187 0.387856 -26.728379 -4.474546 0.849976 0.369924 5.245929 -4.287897 0.869067 0.405998
25% -65.875776 -7.832061 0.820324 0.591417 -90.334968 -7.866095 0.822500 0.580275 -85.609579 -7.725292 ... 0.834502 0.467432 -93.503658 -8.115844 0.830353 0.403402 -70.184771 -7.931239 0.845489 0.401188
50% -64.321031 -8.946135 0.827458 0.629251 -25.729937 -8.659722 0.819128 0.584306 -49.876558 -8.713124 ... 0.820628 0.478023 -102.099849 -8.356179 0.823936 0.440101 -77.181437 -8.100156 0.835247 0.398753
75% -81.441826 -9.816628 0.826492 0.668716 58.084608 -9.112851 0.823046 0.625016 -3.506759 -9.216946 ... 0.826872 0.506736 -102.324877 -8.882649 0.842619 0.462243 -91.726046 -8.712936 0.835247 0.398552
max -200.431191 -10.803769 0.757227 0.672196 -288.605375 -13.275121 0.755516 0.673573 -87.352778 -10.808649 ... 0.737696 0.465857 -175.575772 -11.482547 0.729919 0.474071 -159.263549 -11.156297 0.758837 0.363268

8 rows × 40 columns


In [172]:
f = features.copy()
f = mia.analysis.normalize_data_frame(f)
f.columns = features.columns

cols = [features.columns[i::4] for i in range(4)]
cols = [c for l in cols for c in l]
f = f[cols[10*3:4*10]]

f['class'] = np.zeros(f.shape[0])
f['class'].loc[phantom_texture_features_subset.index] = np.ones(phantom_texture_features_subset.shape[0])
pd.tools.plotting.parallel_coordinates(f, 'class')


Out[172]:
<matplotlib.axes._subplots.AxesSubplot at 0x122842ed0>

In [186]:
f = hologic_texture_features[hologic_texture_features.columns[:4]]
f['class'] = hologic_meta.drop_duplicates().BIRADS
f
mia.plotting.plot_scattermatrix(f, 'class')

In [191]:
left = hologic_texture_features[mapping['0'] < -4]
right = hologic_texture_features[mapping['0'] >= -4]
left.describe() - right.describe()


Out[191]:
contrast dissimilarity homogeneity energy contrast_1 dissimilarity_1 homogeneity_1 energy_1 contrast_2 dissimilarity_2 ... homogeneity_7 energy_7 contrast_8 dissimilarity_8 homogeneity_8 energy_8 contrast_9 dissimilarity_9 homogeneity_9 energy_9
count -90.000000 -90.000000 -90.000000 -90.000000 -90.000000 -90.000000 -90.000000 -90.000000 -90.000000 -90.000000 ... -90.000000 -90.000000 -90.000000 -90.000000 -90.000000 -90.000000 -90.000000 -90.000000 -90.000000 -90.000000
mean -86.171773 -2.681908 0.031002 0.002128 -84.660404 -2.660814 0.030519 0.002087 -79.640903 -2.573057 ... 0.021603 -0.002349 -45.565221 -1.656036 0.020533 0.000379 -41.502819 -1.593714 0.021742 0.001495
std -12.739985 0.308036 0.008289 -0.001467 -12.293600 0.251085 0.008582 -0.000359 -10.405573 0.258100 ... -0.002317 -0.010311 3.333136 0.241429 0.002670 -0.003432 1.471672 0.248384 0.009003 0.001225
min -90.872427 -2.913262 0.019346 0.001655 -93.693252 -3.709904 0.023080 0.001237 -45.172599 -2.558345 ... 0.029215 0.002004 -42.770182 -2.237593 0.027019 0.002358 -31.831485 -1.825060 0.024871 0.001643
25% -82.916297 -2.969193 0.022056 0.001775 -82.296937 -2.848831 0.023506 0.001874 -80.332102 -2.876961 ... 0.020339 0.001421 -48.846495 -1.846719 0.019721 0.002247 -47.809307 -1.885748 0.016427 0.000597
50% -80.445848 -2.333760 0.028005 0.002325 -77.190709 -2.439521 0.026628 0.002332 -75.420045 -2.267087 ... 0.021891 0.001118 -30.733926 -0.997947 0.010487 0.000000 -21.352052 -0.836117 0.008916 0.000000
75% -78.784536 -2.368436 0.037234 0.003006 -81.636575 -2.412879 0.037871 0.002339 -75.218499 -2.241973 ... 0.023782 0.000000 -30.346690 -1.202800 0.022948 0.000057 -30.383160 -1.186611 0.025740 0.002352
max -390.430554 -3.107604 0.006784 -0.038958 -372.418003 -5.521467 0.043013 -0.008510 -359.249160 -3.857261 ... -0.021980 -0.073146 -78.501100 -3.162326 0.028471 -0.033165 -86.313251 -2.751284 0.049249 0.017271

8 rows × 40 columns