In [14]:

    
%matplotlib qt
import pandas as pd
import numpy as np
import mia

Loading and Preprocessing

Loading the hologic and synthetic datasets.



In [15]:

    
hologic = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/hologic.csv")
phantom = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/synthetics1-blobs.csv")

Loading the meta data for the real and synthetic datasets.



In [16]:

    
hologic_meta = mia.analysis.create_hologic_meta_data(hologic, "/Volumes/Seagate/mmp_data/meta_data/BIRADS.csv")
phantom_meta = mia.analysis.create_synthetic_meta_data(phantom, "/Volumes/Seagate/mmp_data/meta_data/synthetic_meta_data_cleaned.csv")
phantom_meta.index.name = 'img_name'

Load the texture data generated from the real and synthetic blobs



In [17]:

    
hologic_texture = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/hologic_texture.csv")
phantom_texture = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/synthetics1_texture.csv")
hologic_texture.drop(['x', 'y', 'breast_area'], inplace=True, axis=1)
phantom_texture.drop(['x', 'y', 'breast_area'], inplace=True, axis=1)

Group blobs by radius and image name



In [18]:

    
hologic_texture_features = mia.analysis.group_by_scale_space(hologic_texture)
phantom_texture_features = mia.analysis.group_by_scale_space(phantom_texture)

Select random subset of the phantoms cases. This is important so that each synthetic case is only represented once.



In [19]:

    
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)
phantom_texture_features['phantom_name'] = syn_feature_meta.phantom_name.tolist()
phantom_texture_features_subset = mia.analysis.create_random_subset(phantom_texture_features, 'phantom_name')



In [20]:

    
features = pd.concat([hologic_texture_features, phantom_texture_features_subset])
assert features.shape[0] == 366



In [21]:

    
hologic_labels = hologic_meta.drop_duplicates().BIRADS
phantom_labels = phantom_meta['VBD.1']

class_labels = pd.concat([hologic_labels, phantom_labels])
class_labels.index.name = "img_name"
labels = mia.analysis.remove_duplicate_index(class_labels)[0]

t-SNE

Running t-SNE to obtain a lower dimensional representation.



In [162]:

    
selected_features = features
mapping = mia.analysis.tSNE(selected_features, n_components=2, learning_rate=300, perplexity=30, verbose=1)









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 0.886677
[t-SNE] Error after 83 iterations with early exaggeration: 13.065503
[t-SNE] Error after 285 iterations: 0.413171



In [163]:

    
def plot_mapping(m):
    hologic_map = m.loc[hologic_texture_features.index]
    phantom_map = m.loc[phantom_texture_features_subset.index]

    hol_labels = labels[hologic_map.index]
    syn_labels = labels[phantom_map.index]

    ax = mia.plotting.plot_scatter_2d(hologic_map, labels=hol_labels, s=10)
    ax = mia.plotting.plot_scatter_2d(phantom_map, labels=syn_labels, ax=ax, marker='^', s=50)

plot_mapping(mapping)

Analysis



In [164]:

    
mapping = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/both_texture_mapping.csv")
plot_mapping(mapping)









    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-164-47091ca3fa93> in <module>()
      1 mapping = pd.DataFrame.from_csv("/Volumes/Seagate/mmp_data/2015-04-01/both_texture_mapping.csv")
----> 2 plot_mapping(mapping)

<ipython-input-163-b930ee3c2f1f> in plot_mapping(m)
      1 def plot_mapping(m):
      2     hologic_map = m.loc[hologic_texture_features.index]
----> 3     phantom_map = m.loc[phantom_texture_features_subset.index]
      4 
      5     hol_labels = labels[hologic_map.index]

/Users/samuel/git/major-project/lib/python2.7/site-packages/pandas/core/indexing.pyc in __getitem__(self, key)
   1200             return self._getitem_tuple(key)
   1201         else:
-> 1202             return self._getitem_axis(key, axis=0)
   1203 
   1204     def _getitem_axis(self, key, axis=0):

/Users/samuel/git/major-project/lib/python2.7/site-packages/pandas/core/indexing.pyc in _getitem_axis(self, key, axis)
   1333                     raise ValueError('Cannot index with multidimensional key')
   1334 
-> 1335                 return self._getitem_iterable(key, axis=axis)
   1336 
   1337             # nested tuple slicing

/Users/samuel/git/major-project/lib/python2.7/site-packages/pandas/core/indexing.pyc in _getitem_iterable(self, key, axis)
    930     def _getitem_iterable(self, key, axis=0):
    931         if self._should_validate_iterable(axis):
--> 932             self._has_valid_type(key, axis)
    933 
    934         labels = self.obj._get_axis(axis)

/Users/samuel/git/major-project/lib/python2.7/site-packages/pandas/core/indexing.pyc in _has_valid_type(self, key, axis)
   1281 
   1282                 raise KeyError("None of [%s] are in the [%s]" %
-> 1283                                (key, self.obj._get_axis_name(axis)))
   1284 
   1285             return True

KeyError: "None of [Index([u'test_Mix_DPerc0_c_2.dcm', u'test_Mix_DPerc10_c_14.dcm', u'test_Mix_DPerc20_c_4.dcm', u'test_Mix_DPerc35_c_6.dcm', u'test_Mix_DPerc5_c_1.dcm', u'test_Mix_DPerc75_c_0.dcm'], dtype='object')] are in the [index]"



In [165]:

    
index = hologic_texture_features.columns[:]
phantom_texture_features_subset[index].describe() - hologic_texture_features[index].describe()









    Out[165]:






  
    
      
      contrast
      dissimilarity
      homogeneity
      energy
      contrast_1
      dissimilarity_1
      homogeneity_1
      energy_1
      contrast_2
      dissimilarity_2
      ...
      homogeneity_7
      energy_7
      contrast_8
      dissimilarity_8
      homogeneity_8
      energy_8
      contrast_9
      dissimilarity_9
      homogeneity_9
      energy_9
    
  
  
    
      count
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      ...
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
      -354.000000
    
    
      mean
       -36.094598
        -8.574261
         0.820865
         0.629094
       -12.069407
        -8.393536
         0.818207
         0.598906
         0.040391
        -8.242244
      ...
         0.824067
         0.480287
       -97.507895
        -8.322890
         0.830725
         0.446720
       -75.764810
        -8.094365
         0.835126
         0.398989
    
    
      std
        77.573677
        -1.029735
        -0.003017
         0.064853
        74.799274
        -1.064820
        -0.005005
         0.060455
       123.808647
        -0.829207
      ...
        -0.007729
         0.053569
       -17.650738
        -1.144664
        -0.002428
         0.067979
       -27.771133
        -1.186786
        -0.017358
        -0.003905
    
    
      min
       -38.452866
        -4.813404
         0.828990
         0.536927
       -32.148727
        -4.409458
         0.831783
         0.515243
       -40.507932
        -4.703870
      ...
         0.842187
         0.387856
       -26.728379
        -4.474546
         0.849976
         0.369924
         5.245929
        -4.287897
         0.869067
         0.405998
    
    
      25%
       -65.875776
        -7.832061
         0.820324
         0.591417
       -90.334968
        -7.866095
         0.822500
         0.580275
       -85.609579
        -7.725292
      ...
         0.834502
         0.467432
       -93.503658
        -8.115844
         0.830353
         0.403402
       -70.184771
        -7.931239
         0.845489
         0.401188
    
    
      50%
       -64.321031
        -8.946135
         0.827458
         0.629251
       -25.729937
        -8.659722
         0.819128
         0.584306
       -49.876558
        -8.713124
      ...
         0.820628
         0.478023
      -102.099849
        -8.356179
         0.823936
         0.440101
       -77.181437
        -8.100156
         0.835247
         0.398753
    
    
      75%
       -81.441826
        -9.816628
         0.826492
         0.668716
        58.084608
        -9.112851
         0.823046
         0.625016
        -3.506759
        -9.216946
      ...
         0.826872
         0.506736
      -102.324877
        -8.882649
         0.842619
         0.462243
       -91.726046
        -8.712936
         0.835247
         0.398552
    
    
      max
      -200.431191
       -10.803769
         0.757227
         0.672196
      -288.605375
       -13.275121
         0.755516
         0.673573
       -87.352778
       -10.808649
      ...
         0.737696
         0.465857
      -175.575772
       -11.482547
         0.729919
         0.474071
      -159.263549
       -11.156297
         0.758837
         0.363268
    
  

8 rows × 40 columns



In [172]:

    
f = features.copy()
f = mia.analysis.normalize_data_frame(f)
f.columns = features.columns

cols = [features.columns[i::4] for i in range(4)]
cols = [c for l in cols for c in l]
f = f[cols[10*3:4*10]]

f['class'] = np.zeros(f.shape[0])
f['class'].loc[phantom_texture_features_subset.index] = np.ones(phantom_texture_features_subset.shape[0])
pd.tools.plotting.parallel_coordinates(f, 'class')









    Out[172]:





<matplotlib.axes._subplots.AxesSubplot at 0x122842ed0>



In [186]:

    
f = hologic_texture_features[hologic_texture_features.columns[:4]]
f['class'] = hologic_meta.drop_duplicates().BIRADS
f
mia.plotting.plot_scattermatrix(f, 'class')



In [191]:

    
left = hologic_texture_features[mapping['0'] < -4]
right = hologic_texture_features[mapping['0'] >= -4]
left.describe() - right.describe()









    Out[191]:






  
    
      
      contrast
      dissimilarity
      homogeneity
      energy
      contrast_1
      dissimilarity_1
      homogeneity_1
      energy_1
      contrast_2
      dissimilarity_2
      ...
      homogeneity_7
      energy_7
      contrast_8
      dissimilarity_8
      homogeneity_8
      energy_8
      contrast_9
      dissimilarity_9
      homogeneity_9
      energy_9
    
  
  
    
      count
       -90.000000
      -90.000000
      -90.000000
      -90.000000
       -90.000000
      -90.000000
      -90.000000
      -90.000000
       -90.000000
      -90.000000
      ...
      -90.000000
      -90.000000
      -90.000000
      -90.000000
      -90.000000
      -90.000000
      -90.000000
      -90.000000
      -90.000000
      -90.000000
    
    
      mean
       -86.171773
       -2.681908
        0.031002
        0.002128
       -84.660404
       -2.660814
        0.030519
        0.002087
       -79.640903
       -2.573057
      ...
        0.021603
       -0.002349
      -45.565221
       -1.656036
        0.020533
        0.000379
      -41.502819
       -1.593714
        0.021742
        0.001495
    
    
      std
       -12.739985
        0.308036
        0.008289
       -0.001467
       -12.293600
        0.251085
        0.008582
       -0.000359
       -10.405573
        0.258100
      ...
       -0.002317
       -0.010311
        3.333136
        0.241429
        0.002670
       -0.003432
        1.471672
        0.248384
        0.009003
        0.001225
    
    
      min
       -90.872427
       -2.913262
        0.019346
        0.001655
       -93.693252
       -3.709904
        0.023080
        0.001237
       -45.172599
       -2.558345
      ...
        0.029215
        0.002004
      -42.770182
       -2.237593
        0.027019
        0.002358
      -31.831485
       -1.825060
        0.024871
        0.001643
    
    
      25%
       -82.916297
       -2.969193
        0.022056
        0.001775
       -82.296937
       -2.848831
        0.023506
        0.001874
       -80.332102
       -2.876961
      ...
        0.020339
        0.001421
      -48.846495
       -1.846719
        0.019721
        0.002247
      -47.809307
       -1.885748
        0.016427
        0.000597
    
    
      50%
       -80.445848
       -2.333760
        0.028005
        0.002325
       -77.190709
       -2.439521
        0.026628
        0.002332
       -75.420045
       -2.267087
      ...
        0.021891
        0.001118
      -30.733926
       -0.997947
        0.010487
        0.000000
      -21.352052
       -0.836117
        0.008916
        0.000000
    
    
      75%
       -78.784536
       -2.368436
        0.037234
        0.003006
       -81.636575
       -2.412879
        0.037871
        0.002339
       -75.218499
       -2.241973
      ...
        0.023782
        0.000000
      -30.346690
       -1.202800
        0.022948
        0.000057
      -30.383160
       -1.186611
        0.025740
        0.002352
    
    
      max
      -390.430554
       -3.107604
        0.006784
       -0.038958
      -372.418003
       -5.521467
        0.043013
       -0.008510
      -359.249160
       -3.857261
      ...
       -0.021980
       -0.073146
      -78.501100
       -3.162326
        0.028471
       -0.033165
      -86.313251
       -2.751284
        0.049249
        0.017271
    
  

8 rows × 40 columns

	contrast	dissimilarity	homogeneity	energy	contrast_1	dissimilarity_1	homogeneity_1	energy_1	contrast_2	dissimilarity_2	...	homogeneity_7	energy_7	contrast_8	dissimilarity_8	homogeneity_8	energy_8	contrast_9	dissimilarity_9	homogeneity_9	energy_9
count	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000	...	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000	-354.000000
mean	-36.094598	-8.574261	0.820865	0.629094	-12.069407	-8.393536	0.818207	0.598906	0.040391	-8.242244	...	0.824067	0.480287	-97.507895	-8.322890	0.830725	0.446720	-75.764810	-8.094365	0.835126	0.398989
std	77.573677	-1.029735	-0.003017	0.064853	74.799274	-1.064820	-0.005005	0.060455	123.808647	-0.829207	...	-0.007729	0.053569	-17.650738	-1.144664	-0.002428	0.067979	-27.771133	-1.186786	-0.017358	-0.003905
min	-38.452866	-4.813404	0.828990	0.536927	-32.148727	-4.409458	0.831783	0.515243	-40.507932	-4.703870	...	0.842187	0.387856	-26.728379	-4.474546	0.849976	0.369924	5.245929	-4.287897	0.869067	0.405998
25%	-65.875776	-7.832061	0.820324	0.591417	-90.334968	-7.866095	0.822500	0.580275	-85.609579	-7.725292	...	0.834502	0.467432	-93.503658	-8.115844	0.830353	0.403402	-70.184771	-7.931239	0.845489	0.401188
50%	-64.321031	-8.946135	0.827458	0.629251	-25.729937	-8.659722	0.819128	0.584306	-49.876558	-8.713124	...	0.820628	0.478023	-102.099849	-8.356179	0.823936	0.440101	-77.181437	-8.100156	0.835247	0.398753
75%	-81.441826	-9.816628	0.826492	0.668716	58.084608	-9.112851	0.823046	0.625016	-3.506759	-9.216946	...	0.826872	0.506736	-102.324877	-8.882649	0.842619	0.462243	-91.726046	-8.712936	0.835247	0.398552
max	-200.431191	-10.803769	0.757227	0.672196	-288.605375	-13.275121	0.755516	0.673573	-87.352778	-10.808649	...	0.737696	0.465857	-175.575772	-11.482547	0.729919	0.474071	-159.263549	-11.156297	0.758837	0.363268

	contrast	dissimilarity	homogeneity	energy	contrast_1	dissimilarity_1	homogeneity_1	energy_1	contrast_2	dissimilarity_2	...	homogeneity_7	energy_7	contrast_8	dissimilarity_8	homogeneity_8	energy_8	contrast_9	dissimilarity_9	homogeneity_9	energy_9
count	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000	...	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000	-90.000000
mean	-86.171773	-2.681908	0.031002	0.002128	-84.660404	-2.660814	0.030519	0.002087	-79.640903	-2.573057	...	0.021603	-0.002349	-45.565221	-1.656036	0.020533	0.000379	-41.502819	-1.593714	0.021742	0.001495
std	-12.739985	0.308036	0.008289	-0.001467	-12.293600	0.251085	0.008582	-0.000359	-10.405573	0.258100	...	-0.002317	-0.010311	3.333136	0.241429	0.002670	-0.003432	1.471672	0.248384	0.009003	0.001225
min	-90.872427	-2.913262	0.019346	0.001655	-93.693252	-3.709904	0.023080	0.001237	-45.172599	-2.558345	...	0.029215	0.002004	-42.770182	-2.237593	0.027019	0.002358	-31.831485	-1.825060	0.024871	0.001643
25%	-82.916297	-2.969193	0.022056	0.001775	-82.296937	-2.848831	0.023506	0.001874	-80.332102	-2.876961	...	0.020339	0.001421	-48.846495	-1.846719	0.019721	0.002247	-47.809307	-1.885748	0.016427	0.000597
50%	-80.445848	-2.333760	0.028005	0.002325	-77.190709	-2.439521	0.026628	0.002332	-75.420045	-2.267087	...	0.021891	0.001118	-30.733926	-0.997947	0.010487	0.000000	-21.352052	-0.836117	0.008916	0.000000
75%	-78.784536	-2.368436	0.037234	0.003006	-81.636575	-2.412879	0.037871	0.002339	-75.218499	-2.241973	...	0.023782	0.000000	-30.346690	-1.202800	0.022948	0.000057	-30.383160	-1.186611	0.025740	0.002352
max	-390.430554	-3.107604	0.006784	-0.038958	-372.418003	-5.521467	0.043013	-0.008510	-359.249160	-3.857261	...	-0.021980	-0.073146	-78.501100	-3.162326	0.028471	-0.033165	-86.313251	-2.751284	0.049249	0.017271