In [1]:

    
%matplotlib inline
import pandas as pd
from pandas.tools import plotting
import mia



In [2]:

    
raw = pd.DataFrame.from_csv('../results/2015-03-05-results.csv')
meta_data = raw[['patient_id', 'class', 'side', 'view']]
raw.index = raw.image_name
raw.describe()









    Out[2]:






  
    
      
      x
      y
      radius
      avg_intensity
      std_intensity
      skew_intensity
      kurtosis_intensity
      patient_id
      class
    
  
  
    
      count
       56496.000000
       56496.000000
       56496.000000
       56496.000000
       56496.000000
       56496.000000
       56496.000000
       5.649600e+04
       56496.000000
    
    
      mean
        1963.719361
        1500.389089
          17.024544
          10.562333
          16.777163
           2.125780
           4.379336
       2.140106e+10
           2.286551
    
    
      std
         610.241223
         881.015395
          19.805117
          46.107491
          71.274800
           0.666986
           5.842874
       6.165381e+02
           0.872022
    
    
      min
         160.000000
         146.000000
           8.000000
           1.000000
           1.049553
          -0.772842
          -1.683268
       2.140106e+10
           1.000000
    
    
      25%
        1528.000000
         696.000000
           8.000000
           1.000000
           2.316719
           1.744069
           2.043031
       2.140106e+10
           2.000000
    
    
      50%
        1966.000000
        1464.000000
          11.313708
           1.890625
           3.747362
           2.163188
           3.948978
       2.140106e+10
           2.000000
    
    
      75%
        2402.000000
        2247.000000
          16.000000
           4.000000
           7.908905
           2.525493
           5.987966
       2.140106e+10
           3.000000
    
    
      max
        3932.000000
        3181.000000
         181.019336
         511.890625
        1328.880730
          14.407756
         219.390003
       2.140106e+10
           4.000000

Shape Based Features

Examining the effect of shape based features and what they contribute to dimensionality reduction.

Blob Detection Features



In [15]:

    
[g['radius'].describe() for i, g in raw.groupby('class')]

r = raw[raw['radius']>30]
mia.plotting.plot_risk_classes(r, 'radius')

Intensity Based Features

Examining the effect of intensity based features and what they contribute to the dimensionality reduction. Firstly, extract all of the intensity features from the dataset. Summary statistics are shown below.

Raw Intensity Data

Examine the raw intensity statistics for each image. There's a really high response in the lower range of the data. This is almost certainly down to a high average intensity in the smaller blobs. Again, as with shape based measures, the risk classes differ in the intensity of the number of responses, rather than shape of the distribution



In [78]:

    
intensity_columns = [c for c in raw.columns if 'intensity'in c]
intensity_stats = raw[intensity_columns]
intensity_stats.describe()









    Out[78]:






  
    
      
      avg_intensity
      std_intensity
      skew_intensity
      kurtosis_intensity
    
  
  
    
      count
       56496.000000
       56496.000000
       56496.000000
       56496.000000
    
    
      mean
          10.562333
          16.777163
           2.125780
           4.379336
    
    
      std
          46.107491
          71.274800
           0.666986
           5.842874
    
    
      min
           1.000000
           1.049553
          -0.772842
          -1.683268
    
    
      25%
           1.000000
           2.316719
           1.744069
           2.043031
    
    
      50%
           1.890625
           3.747362
           2.163188
           3.948978
    
    
      75%
           4.000000
           7.908905
           2.525493
           5.987966
    
    
      max
         511.890625
        1328.880730
          14.407756
         219.390003



In [87]:

    
intensity_stats['class'] = meta_data['class']
mia.plotting.plot_risk_classes_single(intensity_stats, 'avg_intensity')

Intensity Features



In [80]:

    
features = mia.reduction.feature_statistics(raw)
meta_data = features[['class', 'patient_id', 'view', 'side']]
features.describe()









    Out[80]:






  
    
      
      blob_count
      avg_radius
      std_radius
      min_radius
      max_radius
      small_radius_count
      med_radius_count
      large_radius_count
      density
      lower_radius_qt
      ...
      avg_avg_intensity
      avg_std_intensity
      avg_skew_intensity
      avg_kurt_intensity
      std_avg_intensity
      std_std_intensity
      std_skew_intensity
      std_kurt_intensity
      patient_id
      class
    
  
  
    
      count
       360.000000
       360.000000
       360.000000
       360
       360.000000
       360.000000
       360.000000
       360.000000
       360
       360.000000
      ...
       360.000000
       360.000000
       360.000000
       360.000000
       360.000000
       360.000000
       360.000000
       360.000000
       3.600000e+02
       360.000000
    
    
      mean
       156.933333
        18.624276
        21.312272
         8
       151.570838
       151.025000
         3.097222
         2.811111
         0
         8.230119
      ...
        13.646452
        21.408362
         2.149312
         4.530314
        48.577633
        70.135633
         0.639060
         4.595784
       2.140106e+10
         2.533333
    
    
      std
       101.082779
         3.949985
         7.962819
         0
        41.586007
        99.448462
         4.390924
         1.491999
         0
         0.820570
      ...
         8.183294
        14.144367
         0.223716
         1.069942
        27.595639
        46.483148
         0.189975
         4.059613
       5.295273e+02
         0.958178
    
    
      min
        26.000000
        11.251783
         4.942047
         8
        32.000000
        24.000000
         0.000000
         1.000000
         0
         8.000000
      ...
         2.310020
         3.553002
         1.535727
         2.118016
         2.702558
         4.345298
         0.380111
         2.106592
       2.140106e+10
         1.000000
    
    
      25%
        81.000000
        15.828461
        15.449720
         8
       128.000000
        76.750000
         1.000000
         2.000000
         0
         8.000000
      ...
         7.709935
        10.969145
         1.986801
         3.848934
        24.798375
        30.502043
         0.532565
         2.667155
       2.140106e+10
         2.000000
    
    
      50%
       131.000000
        18.024105
        20.845785
         8
       181.019336
       127.000000
         2.000000
         3.000000
         0
         8.000000
      ...
        11.557075
        17.897254
         2.138407
         4.356317
        48.859672
        63.259866
         0.599148
         3.030966
       2.140106e+10
         3.000000
    
    
      75%
       198.250000
        20.307923
        26.569746
         8
       181.019336
       189.250000
         4.000000
         4.000000
         0
         8.000000
      ...
        17.654056
        27.732207
         2.291443
         5.081154
        68.743505
        98.691971
         0.669634
         3.897467
       2.140106e+10
         3.000000
    
    
      max
       648.000000
        33.661574
        46.518068
         8
       181.019336
       620.000000
        37.000000
         8.000000
         0
        11.313708
      ...
        51.369960
        92.965144
         3.073701
         9.312543
       132.856182
       231.725405
         1.625442
        25.687622
       2.140106e+10
         4.000000
    
  

8 rows × 21 columns



In [81]:

    
intensity_columns = [c for c in features.columns if 'intensity'in c]
intensity_features = features[intensity_columns]
intensity_features.describe()









    Out[81]:






  
    
      
      avg_avg_intensity
      avg_std_intensity
      avg_skew_intensity
      avg_kurt_intensity
      std_avg_intensity
      std_std_intensity
      std_skew_intensity
      std_kurt_intensity
    
  
  
    
      count
       360.000000
       360.000000
       360.000000
       360.000000
       360.000000
       360.000000
       360.000000
       360.000000
    
    
      mean
        13.646452
        21.408362
         2.149312
         4.530314
        48.577633
        70.135633
         0.639060
         4.595784
    
    
      std
         8.183294
        14.144367
         0.223716
         1.069942
        27.595639
        46.483148
         0.189975
         4.059613
    
    
      min
         2.310020
         3.553002
         1.535727
         2.118016
         2.702558
         4.345298
         0.380111
         2.106592
    
    
      25%
         7.709935
        10.969145
         1.986801
         3.848934
        24.798375
        30.502043
         0.532565
         2.667155
    
    
      50%
        11.557075
        17.897254
         2.138407
         4.356317
        48.859672
        63.259866
         0.599148
         3.030966
    
    
      75%
        17.654056
        27.732207
         2.291443
         5.081154
        68.743505
        98.691971
         0.669634
         3.897467
    
    
      max
        51.369960
        92.965144
         3.073701
         9.312543
       132.856182
       231.725405
         1.625442
        25.687622

Radviz on Intensity Features

Using radviz on the intensity features shows that the biggest contributors are (in order of importance):

std_std_intensity
std_avg_intensity (most, with some pull on the risk 3 classes to avg_std_intensity)
All classes pulled equally between avg_std_intensity and avg_avg_intensity
Removeing both the aboe leads to avg_kurtosis being the primary measure
After this skew becomes the primary measure for the rest of the features



In [82]:

    
intensity_norm = mia.analysis.normalize_data_frame(intensity_features)
intensity_norm.columns = intensity_columns
intensity_norm['class'] = meta_data['class']



In [83]:

    
columns = [c for c in intensity_norm.columns if c not in ['std_std_intensity', 'std_avg_intensity', 'avg_avg_intensity', 'avg_std_intensity']]
plotting.radviz(intensity_norm[columns], 'class')









    Out[83]:





<matplotlib.axes._subplots.AxesSubplot at 0x1114830d0>

Initial Run With t-SNE

Firstly, examine the strength of the reduction with all features in place. There appears to be some clustering but with extreme high and low risk all mixed up



In [84]:

    
intensity_features = intensity_features.fillna(0)
mapping = mia.analysis.tSNE(intensity_features)
mapping['class'] = meta_data['class']
mia.plotting.plot_scatter_2d(mapping, [0,1], label_name='class')









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 360 / 360
[t-SNE] Mean sigma: 0.826757
[t-SNE] Error after 80 iterations with early exaggeration: 3.367466
[t-SNE] Error after 250 iterations: 0.339204

Taking the most significant elements determined via radviz, the data set appears to form clear bands. Also, risk class 2 appears to be more clustered towards the end of the data.



In [85]:

    
sig_features = intensity_features[['std_std_intensity', 'std_avg_intensity', 'avg_avg_intensity', 'avg_std_intensity']]
sig_mapping = mia.analysis.tSNE(sig_features)
sig_mapping['class'] = meta_data['class']
mia.plotting.plot_scatter_2d(sig_mapping, [0,1], label_name='class')









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 360 / 360
[t-SNE] Mean sigma: 0.353174
[t-SNE] Error after 83 iterations with early exaggeration: 2.998553
[t-SNE] Error after 256 iterations: 0.255339

Plot some the the features in 3D against one another to examine the relationship



In [86]:

    
intensity_features['class'] = meta_data['class']
columns = ['avg_std_intensity', 'std_avg_intensity', 'avg_avg_intensity']
mia.plotting.plot_scatter_3d(intensity_features, columns=columns, labels=meta_data['class'])

	x	y	radius	avg_intensity	std_intensity	skew_intensity	kurtosis_intensity	patient_id	class
count	56496.000000	56496.000000	56496.000000	56496.000000	56496.000000	56496.000000	56496.000000	5.649600e+04	56496.000000
mean	1963.719361	1500.389089	17.024544	10.562333	16.777163	2.125780	4.379336	2.140106e+10	2.286551
std	610.241223	881.015395	19.805117	46.107491	71.274800	0.666986	5.842874	6.165381e+02	0.872022
min	160.000000	146.000000	8.000000	1.000000	1.049553	-0.772842	-1.683268	2.140106e+10	1.000000
25%	1528.000000	696.000000	8.000000	1.000000	2.316719	1.744069	2.043031	2.140106e+10	2.000000
50%	1966.000000	1464.000000	11.313708	1.890625	3.747362	2.163188	3.948978	2.140106e+10	2.000000
75%	2402.000000	2247.000000	16.000000	4.000000	7.908905	2.525493	5.987966	2.140106e+10	3.000000
max	3932.000000	3181.000000	181.019336	511.890625	1328.880730	14.407756	219.390003	2.140106e+10	4.000000

	blob_count	avg_radius	std_radius	min_radius	max_radius	small_radius_count	med_radius_count	large_radius_count	density	lower_radius_qt	...	avg_avg_intensity	avg_std_intensity	avg_skew_intensity	avg_kurt_intensity	std_avg_intensity	std_std_intensity	std_skew_intensity	std_kurt_intensity	patient_id	class
count	360.000000	360.000000	360.000000	360	360.000000	360.000000	360.000000	360.000000	360	360.000000	...	360.000000	360.000000	360.000000	360.000000	360.000000	360.000000	360.000000	360.000000	3.600000e+02	360.000000
mean	156.933333	18.624276	21.312272	8	151.570838	151.025000	3.097222	2.811111	0	8.230119	...	13.646452	21.408362	2.149312	4.530314	48.577633	70.135633	0.639060	4.595784	2.140106e+10	2.533333
std	101.082779	3.949985	7.962819	0	41.586007	99.448462	4.390924	1.491999	0	0.820570	...	8.183294	14.144367	0.223716	1.069942	27.595639	46.483148	0.189975	4.059613	5.295273e+02	0.958178
min	26.000000	11.251783	4.942047	8	32.000000	24.000000	0.000000	1.000000	0	8.000000	...	2.310020	3.553002	1.535727	2.118016	2.702558	4.345298	0.380111	2.106592	2.140106e+10	1.000000
25%	81.000000	15.828461	15.449720	8	128.000000	76.750000	1.000000	2.000000	0	8.000000	...	7.709935	10.969145	1.986801	3.848934	24.798375	30.502043	0.532565	2.667155	2.140106e+10	2.000000
50%	131.000000	18.024105	20.845785	8	181.019336	127.000000	2.000000	3.000000	0	8.000000	...	11.557075	17.897254	2.138407	4.356317	48.859672	63.259866	0.599148	3.030966	2.140106e+10	3.000000
75%	198.250000	20.307923	26.569746	8	181.019336	189.250000	4.000000	4.000000	0	8.000000	...	17.654056	27.732207	2.291443	5.081154	68.743505	98.691971	0.669634	3.897467	2.140106e+10	3.000000
max	648.000000	33.661574	46.518068	8	181.019336	620.000000	37.000000	8.000000	0	11.313708	...	51.369960	92.965144	3.073701	9.312543	132.856182	231.725405	1.625442	25.687622	2.140106e+10	4.000000