In [1]:
%matplotlib qt
import pandas as pd
import numpy as np
import mia

Loading and Preprocessing

Loading the hologic and synthetic datasets.


In [2]:
hologic = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/hologic.csv")
phantom = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-14/synthetics1-blobs.csv")

Loading the meta data for the real and synthetic datasets.


In [3]:
hologic_meta = mia.analysis.create_hologic_meta_data(hologic, "/Volumes/Seagate/mmp_data/meta_data/BIRADS.csv")
phantom_meta = mia.analysis.create_synthetic_meta_data(phantom, "/Volumes/Seagate/mmp_data/meta_data/synthetic_meta_data_cleaned.csv")
phantom_meta.index.name = 'img_name'

Load the texture data generated from the real and synthetic blobs


In [4]:
hologic_intensity = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/hologic_intensity.csv")
phantom_intensity = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-14/synthetics1_intensity.csv")
hologic_intensity.drop(['x', 'y', 'breast_area'], inplace=True, axis=1)
phantom_intensity.drop(['x', 'y', 'breast_area'], inplace=True, axis=1)

Group blobs by radius and image name


In [5]:
hologic_intensity_features = mia.analysis.group_by_scale_space(hologic_intensity)
phantom_intensity_features = mia.analysis.group_by_scale_space(phantom_intensity)
hologic_intensity_features.head()


Out[5]:
count mean std min 25% 50% 75% max skew kurtosis ... count_9 mean_9 std_9 min_9 25%_9 50%_9 75%_9 max_9 skew_9 kurtosis_9
p214-010-60001-cl.png 256 0.593224 0.071104 0.407598 0.542218 0.596446 0.646201 0.753186 -0.176690 -0.383312 ... 131044 0.492402 0.101979 0.148553 0.425211 0.496810 0.564164 0.816943 -0.158402 0.190464
p214-010-60001-cr.png 256 0.570726 0.085527 0.345359 0.523889 0.582157 0.632516 0.732288 -0.407291 -0.125079 ... 131044 0.492402 0.101979 0.148553 0.425211 0.496810 0.564164 0.816943 -0.158402 0.190464
p214-010-60001-ml.png 256 0.556979 0.075092 0.351806 0.508643 0.562178 0.612384 0.721259 -0.331462 -0.081281 ... 131044 0.492402 0.101979 0.148553 0.425211 0.496810 0.564164 0.816943 -0.158402 0.190464
p214-010-60001-mr.png 256 0.546299 0.090477 0.306556 0.490564 0.555362 0.610876 0.734252 -0.350162 0.044639 ... 131044 0.492402 0.101979 0.148553 0.425211 0.496810 0.564164 0.816943 -0.158402 0.190464
p214-010-60005-cl.png 256 0.559844 0.065667 0.366782 0.516955 0.565802 0.608189 0.695271 -0.386427 -0.065749 ... 131044 0.621300 0.126696 0.086275 0.545098 0.639216 0.717647 0.898039 -0.848023 0.758542

5 rows × 100 columns

Select random subset of the phantoms cases. This is important so that each synthetic case is only represented once.


In [6]:
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)
phantom_intensity_features['phantom_name'] = syn_feature_meta.phantom_name.tolist()
phantom_intensity_features_subset = mia.analysis.create_random_subset(phantom_intensity_features, 'phantom_name')
phantom_intensity_features_subset


Out[6]:
count mean std min 25% 50% 75% max skew kurtosis ... count_4 mean_4 std_4 min_4 25%_4 50%_4 75%_4 max_4 skew_4 kurtosis_4
test_Mix_DPerc0_c_2.dcm 256 0.375382 0.164984 0.000000 0.379871 0.430250 0.478700 0.547233 -1.827690 3.395689 ... 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977
test_Mix_DPerc10_c_2.dcm 256 0.408695 0.161441 0.089518 0.289818 0.479850 0.516518 0.587736 -1.241404 1.637172 ... 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977
test_Mix_DPerc20_c_10.dcm 256 0.533051 0.048901 0.421600 0.497075 0.543300 0.574775 0.609900 -0.581546 -0.832763 ... 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977
test_Mix_DPerc35_c_6.dcm 256 0.470082 0.181881 0.148860 0.282915 0.549580 0.596560 0.665700 -0.912473 -0.510519 ... 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977
test_Mix_DPerc5_c_10.dcm 256 0.569600 0.292271 0.000000 0.581762 0.683825 0.744962 0.885400 -1.286940 0.054693 ... 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977
test_Mix_DPerc75_c_0.dcm 256 0.366560 0.164703 0.042500 0.307818 0.424571 0.476471 0.549071 -1.102359 0.227993 ... 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977

6 rows × 50 columns


In [16]:
features = pd.concat([hologic_intensity_features, phantom_intensity_features_subset], axis=0)
phantom_intensity_features_subset
from IPython.display import display
pd.options.display.max_columns = None
display(phantom_intensity_features_subset)
# assert features.shape[0] == 366


count mean std min 25% 50% 75% max skew kurtosis count_1 mean_1 std_1 min_1 25%_1 50%_1 75%_1 max_1 skew_1 kurtosis_1 count_2 mean_2 std_2 min_2 25%_2 50%_2 75%_2 max_2 skew_2 kurtosis_2 count_3 mean_3 std_3 min_3 25%_3 50%_3 75%_3 max_3 skew_3 kurtosis_3 count_4 mean_4 std_4 min_4 25%_4 50%_4 75%_4 max_4 skew_4 kurtosis_4
test_Mix_DPerc0_c_2.dcm 256 0.375382 0.164984 0.000000 0.379871 0.430250 0.478700 0.547233 -1.827690 3.395689 484 0.394286 0.109718 0.016506 0.358194 0.409811 0.463932 0.551089 -1.777259 4.835428 1024 0.403053 0.138559 0.000000 0.348271 0.413814 0.501071 0.641971 -1.108034 2.249627 1936 0.439749 0.124994 0 0.345975 0.433050 0.510475 0.749700 -0.045835 0.877839 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977
test_Mix_DPerc10_c_2.dcm 256 0.408695 0.161441 0.089518 0.289818 0.479850 0.516518 0.587736 -1.241404 1.637172 484 0.390645 0.128934 0.000000 0.363114 0.413272 0.469836 0.563178 -1.814805 4.034819 1024 0.419758 0.107221 0.000000 0.359691 0.414481 0.489225 0.663263 -0.529069 2.618749 1936 0.427107 0.155415 0 0.368133 0.437433 0.535833 0.765133 -0.466940 1.611641 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977
test_Mix_DPerc20_c_10.dcm 256 0.533051 0.048901 0.421600 0.497075 0.543300 0.574775 0.609900 -0.581546 -0.832763 484 0.564046 0.163469 0.000000 0.500467 0.572133 0.664050 0.848550 -1.592424 4.931728 1024 0.571977 0.152273 0.104033 0.464225 0.553400 0.663750 0.995667 0.168743 1.058678 1936 0.534488 0.245550 0 0.449937 0.546000 0.698200 0.999300 -0.553463 0.390660 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977
test_Mix_DPerc35_c_6.dcm 256 0.470082 0.181881 0.148860 0.282915 0.549580 0.596560 0.665700 -0.912473 -0.510519 484 0.460870 0.228278 0.000000 0.300275 0.525533 0.642267 0.813300 -1.237789 2.467917 1024 0.490607 0.124931 0.000000 0.420583 0.492183 0.568438 0.761517 -1.024568 3.471358 1936 0.507196 0.154884 0 0.403467 0.503517 0.604483 0.928067 -0.238375 1.239616 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977
test_Mix_DPerc5_c_10.dcm 256 0.569600 0.292271 0.000000 0.581762 0.683825 0.744962 0.885400 -1.286940 0.054693 484 0.503212 0.144616 0.000000 0.456462 0.516375 0.588533 0.728567 -2.009849 5.302331 1024 0.439877 0.185244 0.000000 0.337375 0.468000 0.555650 0.784600 -0.776406 0.539945 1936 0.520494 0.177431 0 0.406783 0.507900 0.642025 0.958700 -0.175832 0.617413 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977
test_Mix_DPerc75_c_0.dcm 256 0.366560 0.164703 0.042500 0.307818 0.424571 0.476471 0.549071 -1.102359 0.227993 484 0.373632 0.153843 0.000000 0.350186 0.410907 0.474875 0.577386 -1.593369 3.400753 1024 0.375871 0.126916 0.034829 0.304050 0.378657 0.463779 0.650329 -0.331226 0.621471 1936 0.358314 0.129889 0 0.276133 0.350900 0.437258 0.676833 -0.284515 0.902454 4096 0.448151 0.242225 0 0.310503 0.443628 0.616847 0.991944 -0.065636 -0.38977

In [8]:
hologic_labels = hologic_meta.drop_duplicates().BIRADS
phantom_labels = phantom_meta['VBD.1']

class_labels = pd.concat([hologic_labels, phantom_labels])
class_labels.index.name = "img_name"
labels = mia.analysis.remove_duplicate_index(class_labels)[0]

In [15]:
features.tail()


Out[15]:
25% 25%_1 25%_2 25%_3 25%_4 25%_5 25%_6 25%_7 25%_8 25%_9 ... std std_1 std_2 std_3 std_4 std_5 std_6 std_7 std_8 std_9
test_Mix_DPerc10_c_4.dcm 0.300066 0.366894 0.350481 0.360617 0.310503 NaN NaN NaN NaN NaN ... 0.188274 0.120729 0.100846 0.139517 0.242225 NaN NaN NaN NaN NaN
test_Mix_DPerc20_c_9.dcm 0.376875 0.491846 0.447031 0.432087 0.310503 NaN NaN NaN NaN NaN ... 0.262732 0.163726 0.168441 0.165256 0.242225 NaN NaN NaN NaN NaN
test_Mix_DPerc35_c_12.dcm 0.426458 0.316425 0.374546 0.396663 0.310503 NaN NaN NaN NaN NaN ... 0.144980 0.255368 0.104600 0.138985 0.242225 NaN NaN NaN NaN NaN
test_Mix_DPerc5_c_6.dcm 0.309700 0.478350 0.413694 0.428363 0.310503 NaN NaN NaN NaN NaN ... 0.333692 0.164321 0.151213 0.178124 0.242225 NaN NaN NaN NaN NaN
test_Mix_DPerc75_c_0.dcm 0.307818 0.350186 0.304050 0.276133 0.310503 NaN NaN NaN NaN NaN ... 0.164703 0.153843 0.126916 0.129889 0.242225 NaN NaN NaN NaN NaN

5 rows × 100 columns

t-SNE

Running t-SNE to obtain a lower dimensional representation.


In [9]:
selected_features = features[[c for c in features.columns if 'count' not in c]]
selected_features.drop([c for c in features.columns if 'skew'in c or 'kurtosis' in c], axis=1, inplace=True)
mapping = mia.analysis.tSNE(selected_features, n_components=2, learning_rate=300, perplexity=30, verbose=1)


-c:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-9-eff593eee55c> in <module>()
      1 selected_features = features[[c for c in features.columns if 'count' not in c]]
      2 selected_features.drop([c for c in features.columns if 'skew'in c or 'kurtosis' in c], axis=1, inplace=True)
----> 3 mapping = mia.analysis.tSNE(selected_features, n_components=2, learning_rate=300, perplexity=30, verbose=1)

/Users/samuel/git/major-project/src/mia/analysis.pyc in inner(feature_matrix, **kwargs)
     21     def inner(feature_matrix, **kwargs):
     22         if isinstance(feature_matrix, pd.DataFrame):
---> 23             fit_output = func(feature_matrix.as_matrix(), **kwargs)
     24             return pd.DataFrame(fit_output, index=feature_matrix.index)
     25         else:

/Users/samuel/git/major-project/src/mia/analysis.pyc in tSNE(feature_matrix, **kwargs)
     35     :returns: 2darray -- lower dimensional mapping of the t-SNE algorithm
     36     """
---> 37     feature_matrix = standard_scaler(feature_matrix)
     38     tSNE = manifold.TSNE(**kwargs)
     39     fit_output = tSNE.fit_transform(feature_matrix)

/Users/samuel/git/major-project/src/mia/analysis.pyc in inner(feature_matrix, **kwargs)
     24             return pd.DataFrame(fit_output, index=feature_matrix.index)
     25         else:
---> 26             return func(feature_matrix)
     27     return inner
     28 

/Users/samuel/git/major-project/src/mia/analysis.pyc in standard_scaler(feature_matrix)
     57 def standard_scaler(feature_matrix):
     58     scalar = preprocessing.StandardScaler()
---> 59     feature_matrix = scalar.fit_transform(feature_matrix)
     60     return feature_matrix
     61 

/Users/samuel/git/major-project/lib/python2.7/site-packages/sklearn/base.pyc in fit_transform(self, X, y, **fit_params)
    424         if y is None:
    425             # fit method of arity 1 (unsupervised transformation)
--> 426             return self.fit(X, **fit_params).transform(X)
    427         else:
    428             # fit method of arity 2 (supervised transformation)

/Users/samuel/git/major-project/lib/python2.7/site-packages/sklearn/preprocessing/data.pyc in fit(self, X, y)
    313             used for later scaling along the features axis.
    314         """
--> 315         X = check_arrays(X, copy=self.copy, sparse_format="csr")[0]
    316         if warn_if_not_float(X, estimator=self):
    317             X = X.astype(np.float)

/Users/samuel/git/major-project/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_arrays(*arrays, **options)
    281                     array = np.asarray(array, dtype=dtype)
    282                 if not allow_nans:
--> 283                     _assert_all_finite(array)
    284 
    285             if not allow_nd and array.ndim >= 3:

/Users/samuel/git/major-project/lib/python2.7/site-packages/sklearn/utils/validation.pyc in _assert_all_finite(X)
     41             and not np.isfinite(X).all()):
     42         raise ValueError("Input contains NaN, infinity"
---> 43                          " or a value too large for %r." % X.dtype)
     44 
     45 

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [14]:
def plot_mapping(m):
    hologic_map = m.loc[hologic_intensity_features.index]
    phantom_map = m.loc[phantom_intensity_features_subset.index]

    hol_labels = labels[hologic_map.index]
    syn_labels = labels[phantom_map.index]

    ax = mia.plotting.plot_scatter_2d(hologic_map, labels=hol_labels, s=10)
    ax = mia.plotting.plot_scatter_2d(phantom_map, labels=syn_labels, ax=ax, marker='^', s=50)

plot_mapping(mapping)

In [42]:
ordered_features = mia.analysis.sort_by_scale_space(selected_features, 9)
# ordered_features['class']
ordered_features.index.name = 'img_name'
ordered_features['class'] = labels
ordered_features.sort('class', inplace=True)
# ordered_features['class'].loc[mapping[1] < 0] = np.ones(mapping[mapping[1] < 0].shape[0]) 
# ordered_features['class'].loc[phantom_intensity_features_subset.index] = np.ones(phantom_intensity_features_subset.shape[0]) + 1
pd.tools.plotting.parallel_coordinates(ordered_features, 'class')

# left = ordered_features[mapping[0] < 0]
# right = ordered_features[mapping[0] >= 0]


Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x1197b82d0>

Analysis


In [13]:
mapping = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/both_intensity_mapping.csv")
plot_mapping(mapping)

In [14]:
mapping.loc[phantom_intensity_features_subset.index]


Out[14]:
0 1
test_Mix_DPerc0_c_5.dcm NaN NaN
test_Mix_DPerc10_c_10.dcm NaN NaN
test_Mix_DPerc20_c_3.dcm NaN NaN
test_Mix_DPerc35_c_14.dcm NaN NaN
test_Mix_DPerc5_c_4.dcm NaN NaN
test_Mix_DPerc75_c_1.dcm -15.441212 -0.499856

In [15]:
f = features.copy()
f = mia.analysis.normalize_data_frame(f)
f.columns = features.columns

cols = [features.columns[i::10] for i in range(10)]
cols = [c for l in cols for c in l]
f = f[cols]

f = f[f.columns[9*10:10*10]]


f['class'] = np.zeros(f.shape[0])
f['class'].loc[phantom_intensity_features_subset.index] = np.ones(phantom_intensity_features_subset.shape[0])
pd.tools.plotting.parallel_coordinates(f, 'class')


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x117038590>

In [16]:
phantom_intensity_features.describe() - hologic_intensity_features.describe()


Out[16]:
count mean std min 25% 50% 75% max skew kurtosis ... count_9 mean_9 std_9 min_9 25%_9 50%_9 75%_9 max_9 skew_9 kurtosis_9
count -280 -280.000000 -280.000000 -280.000000 -280.000000 -280.000000 -280.000000 -280.000000 -280.000000 -280.000000 ... -280 -280.000000 -280.000000 -280.000000 -280.000000 -280.000000 -280.000000 -280.000000 -280.000000 -280.000000
mean 0 0.503442 -0.042213 0.601736 0.546657 0.504846 0.466452 0.371917 -0.209116 0.613187 ... 0 0.483986 -0.005547 -0.148553 0.557345 0.489906 0.425921 0.180490 -9.962433 101.889662
std 0 -0.045391 -0.001060 0.007905 -0.045395 -0.050233 -0.054350 -0.059506 0.058450 1.019439 ... 0 -0.049160 -0.021419 -0.046546 -0.043851 -0.055636 -0.066502 -0.067273 0.173831 8.857408
min 0 0.598642 -0.028113 0.543952 0.638197 0.609318 0.579099 0.510818 -0.238572 0.052755 ... 0 0.605970 0.035818 0.000000 0.664347 0.627443 0.588735 0.377096 -9.594619 63.023480
25% 0 0.538940 -0.042707 0.609742 0.582952 0.545177 0.506770 0.412943 -0.199547 0.138901 ... 0 0.504944 0.009414 -0.133333 0.576674 0.513677 0.458767 0.209687 -9.866476 102.253404
50% 0 0.509653 -0.043021 0.614549 0.552364 0.510969 0.473009 0.379099 -0.177385 0.394877 ... 0 0.483986 -0.005547 -0.148553 0.557345 0.489906 0.425921 0.180490 -9.962433 101.889662
75% 0 0.472427 -0.043378 0.605665 0.514350 0.468776 0.426332 0.327545 -0.176150 0.588639 ... 0 0.477625 -0.017144 -0.168627 0.549223 0.483775 0.411653 0.148413 -10.095130 101.889662
max 0 0.373887 -0.068904 0.563898 0.414178 0.366211 0.314211 0.202341 -0.122859 5.388688 ... 0 0.197564 -0.081762 -0.288235 0.229999 0.155810 0.115831 -0.002457 -8.742101 126.693342

8 rows × 100 columns