In [131]:

    
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import mia

Loading and Preprocessing

Loading the hologic and synthetic datasets.



In [82]:

    
def _load_intensity_csv(path):
    """Read an intensity CSV and drop the columns that are not used as features.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The table indexed by the first CSV column, without its first two
        remaining columns and without the 'breast_area' column.
    """
    # `DataFrame.from_csv` is deprecated (and removed in pandas 1.0); this is
    # the `read_csv` call with the same defaults that `from_csv` applied.
    frame = pd.read_csv(path, index_col=0, parse_dates=True)
    # Drop by label, exactly as the original in-place calls did.
    frame = frame.drop(frame.columns[:2], axis=1)
    return frame.drop('breast_area', axis=1)


hologic = _load_intensity_csv("real_intensity.csv")
phantom = _load_intensity_csv("synthetic_intensity.csv")

Loading the meta data for the real and synthetic datasets.



In [83]:

    
# Build one metadata table per dataset from the loaded frame and its CSV file.
hologic_meta = mia.analysis.create_hologic_meta_data(
    hologic, "meta_data/real_meta.csv")
phantom_meta = mia.analysis.create_synthetic_meta_data(
    phantom, "meta_data/synthetic_meta.csv")

# Name the phantom index 'img_name' (the combined label index is given the
# same name in a later cell).
phantom_meta.index.name = 'img_name'

Prepare the BI-RADS/VBD labels for both datasets.



In [84]:

    
# Class labels: the BIRADS column of the de-duplicated hologic metadata and
# the 'VBD.1' column of the phantom metadata.
hologic_labels = hologic_meta.drop_duplicates()['BIRADS']
phantom_labels = phantom_meta.loc[:, 'VBD.1']

# Stack both label series (real first, then phantom) under one named index.
class_labels = pd.concat([hologic_labels, phantom_labels])
class_labels.index.name = "img_name"

# Keep item 0 of the de-duplicated result as the final labels.
labels = mia.analysis.remove_duplicate_index(class_labels)[0]

Creating Features

Create the intensity features: summary statistics of the intensity values, grouped by scale.



In [85]:

    
# Apply the same scale-space grouping to the real and the phantom data.
hologic_intensity_features, phantom_intensity_features = (
    mia.analysis.group_by_scale_space(dataset)
    for dataset in (hologic, phantom)
)

Take a random subset of the real mammograms. This is important so that no single patient is over-represented.



In [86]:

    
# Attach the patient id to every feature row; this adds a column to
# `hologic_intensity_features` in place.
hologic_intensity_features['patient_id'] = hologic_meta.drop_duplicates().patient_id

# Randomly subset the real features, keyed on the 'patient_id' column.
# NOTE(review): no random seed is set in this notebook, so the subset may
# differ between runs -- confirm how `create_random_subset` draws its sample.
hologic_intensity_features_subset = mia.analysis.create_random_subset(
    hologic_intensity_features, 'patient_id')

Take a random subset of the phantom mammograms. This is important so that no single phantom case is over-represented.



In [87]:

    
# De-duplicate the phantom metadata index before attaching names.
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)

# Assign phantom names positionally (as a plain list); this adds a column to
# `phantom_intensity_features` in place.
phantom_intensity_features['phantom_name'] = syn_feature_meta['phantom_name'].tolist()

# Randomly subset the phantom features, keyed on the 'phantom_name' column.
phantom_intensity_features_subset = mia.analysis.create_random_subset(
    phantom_intensity_features, 'phantom_name')

Combine the features from both datasets.



In [88]:

    
# Stack the real subset on top of the phantom subset.
features = pd.concat(
    [hologic_intensity_features_subset, phantom_intensity_features_subset]
)
# Sanity check: the combined table is expected to hold exactly 96 rows.
assert len(features) == 96
features.head()









    Out[88]:






  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
      skew
      kurtosis
      ...
      count_9
      mean_9
      std_9
      min_9
      25%_9
      50%_9
      75%_9
      max_9
      skew_9
      kurtosis_9
    
  
  
    
      p214-010-60001-cr.png
       256
       0.558904
       0.087279
       0.328763
       0.510450
       0.570363
       0.622144
       0.724731
      -0.406150
      -0.127657
      ...
       131044
       0.541212
       0.118465
       0.141845
       0.462988
       0.546140
       0.624509
       0.921247
      -0.150691
       0.148522
    
    
      p214-010-60005-ml.png
       256
       0.579815
       0.090863
       0.314655
       0.530318
       0.591799
       0.644746
       0.759436
      -0.575184
       0.634554
      ...
       131044
       0.541212
       0.118465
       0.141845
       0.462988
       0.546140
       0.624509
       0.921247
      -0.150691
       0.148522
    
    
      p214-010-60008-cr.png
       256
       0.493326
       0.074813
       0.296470
       0.447373
       0.496014
       0.544188
       0.679322
      -0.100948
       0.021890
      ...
       131044
       0.523970
       0.099598
       0.148544
       0.455340
       0.521359
       0.589320
       0.936893
      -0.017659
       0.898368
    
    
      p214-010-60012-ml.png
       256
       0.469238
       0.081322
       0.254982
       0.415496
       0.473583
       0.527386
       0.660004
      -0.204323
      -0.192516
      ...
       131044
       0.541212
       0.118465
       0.141845
       0.462988
       0.546140
       0.624509
       0.921247
      -0.150691
       0.148522
    
    
      p214-010-60013-mr.png
       256
       0.524458
       0.087562
       0.285158
       0.467286
       0.534160
       0.588593
       0.704526
      -0.407108
       0.041762
      ...
       131044
       0.554975
       0.131461
       0.138075
       0.463040
       0.552301
       0.645746
       0.949791
       0.038416
      -0.374100
    
  

5 rows × 100 columns

Filter some features, such as the min, to remove noise. (Currently no features are removed: the cell below copies the full feature set unchanged.)



In [89]:

    
# Work on an independent (deep) copy so later steps cannot alter `features`.
# No columns are filtered out here: every feature is carried forward.
selected_features = features.copy(deep=True)

Compare Real and Synthetic Features

Compare the distributions of features detected from the real mammograms and the phantoms using the Kolmogorov-Smirnov two sample test.



In [90]:

    
# Two-sample Kolmogorov-Smirnov test per selected feature column.
# Note: this compares the full real/phantom feature tables, not the random
# subsets that were combined into `features`.
ks_stats = []
for col in selected_features.columns:
    ks_result = stats.ks_2samp(hologic_intensity_features[col],
                               phantom_intensity_features[col])
    # Store the result as a plain [statistic, p-value] list.
    ks_stats.append(list(ks_result))

ks_test = pd.DataFrame(ks_stats,
                       index=selected_features.columns,
                       columns=['KS', 'p-value'])
# Export the table as a LaTeX longtable.
ks_test.to_latex("tables/intensity_features_ks.tex", longtable=True)
ks_test









    Out[90]:






  
    
      
      KS
      p-value
    
  
  
    
      count
       0.000000
       1.000000e+00
    
    
      mean
       1.000000
       3.587622e-59
    
    
      std
       0.876389
       1.515491e-45
    
    
      min
       1.000000
       3.587622e-59
    
    
      25%
       1.000000
       3.587622e-59
    
    
      50%
       1.000000
       3.587622e-59
    
    
      75%
       1.000000
       3.587622e-59
    
    
      max
       1.000000
       3.587622e-59
    
    
      skew
       0.891667
       3.923764e-47
    
    
      kurtosis
       0.568056
       2.209941e-19
    
    
      count_1
       0.000000
       1.000000e+00
    
    
      mean_1
       1.000000
       3.587622e-59
    
    
      std_1
       0.981944
       4.539903e-57
    
    
      min_1
       1.000000
       3.587622e-59
    
    
      25%_1
       1.000000
       3.587622e-59
    
    
      50%_1
       1.000000
       3.587622e-59
    
    
      75%_1
       1.000000
       3.587622e-59
    
    
      max_1
       0.997222
       7.598385e-59
    
    
      skew_1
       0.784722
       1.335838e-36
    
    
      kurtosis_1
       0.466667
       3.216681e-13
    
    
      count_2
       0.000000
       1.000000e+00
    
    
      mean_2
       1.000000
       3.587622e-59
    
    
      std_2
       1.000000
       3.587622e-59
    
    
      min_2
       1.000000
       3.587622e-59
    
    
      25%_2
       1.000000
       3.587622e-59
    
    
      50%_2
       1.000000
       3.587622e-59
    
    
      75%_2
       1.000000
       3.587622e-59
    
    
      max_2
       0.994444
       1.605940e-58
    
    
      skew_2
       0.501389
       3.410114e-15
    
    
      kurtosis_2
       0.436111
       1.342503e-11
    
    
      ...
      ...
      ...
    
    
      count_7
       0.000000
       1.000000e+00
    
    
      mean_7
       1.000000
       3.587622e-59
    
    
      std_7
       0.955556
       4.577742e-54
    
    
      min_7
       1.000000
       3.587622e-59
    
    
      25%_7
       1.000000
       3.587622e-59
    
    
      50%_7
       1.000000
       3.587622e-59
    
    
      75%_7
       1.000000
       3.587622e-59
    
    
      max_7
       0.740278
       1.280685e-32
    
    
      skew_7
       0.319444
       2.024353e-06
    
    
      kurtosis_7
       0.304167
       7.344875e-06
    
    
      count_8
       0.000000
       1.000000e+00
    
    
      mean_8
       1.000000
       3.587622e-59
    
    
      std_8
       0.929167
       3.823290e-51
    
    
      min_8
       1.000000
       3.587622e-59
    
    
      25%_8
       1.000000
       3.587622e-59
    
    
      50%_8
       1.000000
       3.587622e-59
    
    
      75%_8
       1.000000
       3.587622e-59
    
    
      max_8
       0.736111
       2.943248e-32
    
    
      skew_8
       0.547222
       5.120902e-18
    
    
      kurtosis_8
       0.395833
       1.248634e-09
    
    
      count_9
       0.000000
       1.000000e+00
    
    
      mean_9
       1.000000
       3.587622e-59
    
    
      std_9
       0.956944
       3.195999e-54
    
    
      min_9
       1.000000
       3.587622e-59
    
    
      25%_9
       1.000000
       3.587622e-59
    
    
      50%_9
       1.000000
       3.587622e-59
    
    
      75%_9
       0.997222
       7.598385e-59
    
    
      max_9
       0.794444
       1.674259e-37
    
    
      skew_9
       0.702778
       1.934059e-29
    
    
      kurtosis_9
       0.891667
       3.923764e-47
    
  

100 rows × 2 columns

Dimensionality Reduction

t-SNE

Running t-SNE to obtain a two dimensional representation.



In [91]:

    
# Row labels of each subset, passed to the mapping plots below together with
# the class labels.
real_index, phantom_index = (
    hologic_intensity_features_subset.index,
    phantom_intensity_features_subset.index,
)



In [92]:

    
# Keyword arguments shared by the 2D and 3D t-SNE runs below.
kwargs = dict(learning_rate=200, perplexity=20, verbose=1)



In [93]:

    
# Embed the selected features into two dimensions with t-SNE.
# NOTE(review): no random seed is set in this notebook; if the embedding is
# stochastic the result may change between runs -- confirm.
SNE_mapping_2d = mia.analysis.tSNE(
    selected_features,
    n_components=2,
    **kwargs
)









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 96 / 96
[t-SNE] Mean sigma: 2.693481
[t-SNE] Error after 65 iterations with early exaggeration: 12.552178
[t-SNE] Error after 136 iterations: 1.190411



In [94]:

    
# Plot the 2D t-SNE mapping, then save the current figure at 300 dpi.
mia.plotting.plot_mapping_2d(
    SNE_mapping_2d, real_index, phantom_index, labels)
plt.savefig(
    'figures/mappings/intensity_SNE_mapping_2d.png',
    dpi=300)

Running t-SNE to obtain a 3 dimensional mapping



In [95]:

    
# Embed the selected features into three dimensions with t-SNE, using the
# same keyword arguments as the 2D run.
SNE_mapping_3d = mia.analysis.tSNE(
    selected_features,
    n_components=3,
    **kwargs
)









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 96 / 96
[t-SNE] Mean sigma: 2.693481
[t-SNE] Error after 100 iterations with early exaggeration: 16.755029
[t-SNE] Error after 314 iterations: 2.633436



In [127]:

    
# Plot the 3D t-SNE mapping; the call is the cell's last expression, so its
# return value is shown as the cell output.
mia.plotting.plot_mapping_3d(
    SNE_mapping_3d, real_index, phantom_index, labels)









    Out[127]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x108bf72d0>

Isomap

Running Isomap to obtain a 2 dimensional mapping



In [97]:

    
# Keyword arguments shared by the 2D and 3D Isomap runs below.
iso_kwargs = dict(n_neighbors=4)



In [98]:

    
# Embed the selected features into two dimensions with Isomap.
iso_mapping_2d = mia.analysis.isomap(
    selected_features,
    n_components=2,
    **iso_kwargs
)



In [99]:

    
# Plot the 2D Isomap mapping, then save the current figure at 300 dpi.
mia.plotting.plot_mapping_2d(
    iso_mapping_2d, real_index, phantom_index, labels)
plt.savefig(
    'figures/mappings/intensity_iso_mapping_2d.png',
    dpi=300)



In [100]:

    
# Embed the selected features into three dimensions with Isomap, using the
# same keyword arguments as the 2D run.
iso_mapping_3d = mia.analysis.isomap(
    selected_features,
    n_components=3,
    **iso_kwargs
)



In [129]:

    
# Plot the 3D Isomap mapping; the call is the cell's last expression, so its
# return value is shown as the cell output.
mia.plotting.plot_mapping_3d(
    iso_mapping_3d, real_index, phantom_index, labels)









    Out[129]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x108ea9090>

Locally Linear Embedding

Running locally linear embedding to obtain 2d mapping



In [102]:

    
# Keyword arguments shared by the 2D and 3D locally linear embedding runs.
lle_kwargs = dict(n_neighbors=4)



In [103]:

    
# Embed the selected features into two dimensions with locally linear
# embedding.
lle_mapping_2d = mia.analysis.lle(
    selected_features,
    n_components=2,
    **lle_kwargs
)



In [104]:

    
# Plot the 2D LLE mapping, then save the current figure at 300 dpi.
mia.plotting.plot_mapping_2d(
    lle_mapping_2d, real_index, phantom_index, labels)
plt.savefig(
    'figures/mappings/intensity_lle_mapping_2d.png',
    dpi=300)



In [105]:

    
# Embed the selected features into three dimensions with locally linear
# embedding, using the same keyword arguments as the 2D run.
lle_mapping_3d = mia.analysis.lle(
    selected_features,
    n_components=3,
    **lle_kwargs
)



In [130]:

    
# Plot the 3D LLE mapping; the call is the cell's last expression, so its
# return value is shown as the cell output.
mia.plotting.plot_mapping_3d(
    lle_mapping_3d, real_index, phantom_index, labels)









    Out[130]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x10a9fe950>

Quality Assessment of Dimensionality Reduction

Assess the quality of each dimensionality reduction (DR) mapping using measures derived from co-ranking matrices. First, create a co-ranking matrix for each of the dimensionality reduction mappings.



In [107]:

    
# Exclusive upper bound on the neighbourhood size k: the quality measures
# below iterate k over range(1, max_k) or range(2, max_k), so k stops at
# max_k - 1.
max_k = 50



In [108]:

    
# One co-ranking matrix per mapping, each computed against the original
# (high-dimensional) selected features. Evaluation order: SNE, Isomap, LLE.
SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm = (
    mia.coranking.coranking_matrix(selected_features, mapping)
    for mapping in (SNE_mapping_2d, iso_mapping_2d, lle_mapping_2d)
)

SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm = (
    mia.coranking.coranking_matrix(selected_features, mapping)
    for mapping in (SNE_mapping_3d, iso_mapping_3d, lle_mapping_3d)
)

2D Mappings



In [109]:

    
# Trustworthiness of each 2D mapping for k = 1 .. max_k - 1.
SNE_trustworthiness_2d, iso_trustworthiness_2d, lle_trustworthiness_2d = (
    [mia.coranking.trustworthiness(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
)



In [110]:

    
# One column per method, one row per neighbourhood size k.
# The values were computed for k = 1, 2, ..., so index the rows by k; with the
# default 0-based index the figure's x-axis was off by one relative to k.
trustworthiness_df = pd.DataFrame(
    {'SNE': SNE_trustworthiness_2d,
     'Isomap': iso_trustworthiness_2d,
     'LLE': lle_trustworthiness_2d},
    index=pd.Index(np.arange(1, len(SNE_trustworthiness_2d) + 1), name='k'),
    columns=['SNE', 'Isomap', 'LLE'])

# Label the axes so the saved figure can be read on its own.
ax = trustworthiness_df.plot()
ax.set_xlabel('Neighbourhood size k')
ax.set_ylabel('Trustworthiness')
plt.savefig('figures/quality_measures/intensity_trustworthiness_2d.png', dpi=300)



In [111]:

    
# Continuity of each 2D mapping for k = 1 .. max_k - 1.
SNE_continuity_2d, iso_continuity_2d, lle_continuity_2d = (
    [mia.coranking.continuity(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
)



In [112]:

    
# One column per method, one row per neighbourhood size k.
# The values were computed for k = 1, 2, ..., so index the rows by k; with the
# default 0-based index the figure's x-axis was off by one relative to k.
continuity_df = pd.DataFrame(
    {'SNE': SNE_continuity_2d,
     'Isomap': iso_continuity_2d,
     'LLE': lle_continuity_2d},
    index=pd.Index(np.arange(1, len(SNE_continuity_2d) + 1), name='k'),
    columns=['SNE', 'Isomap', 'LLE'])

# Label the axes so the saved figure can be read on its own.
ax = continuity_df.plot()
ax.set_xlabel('Neighbourhood size k')
ax.set_ylabel('Continuity')
plt.savefig('figures/quality_measures/intensity_continuity_2d.png', dpi=300)



In [113]:

    
# LCMC of each 2D mapping for k = 2 .. max_k - 1 (this measure starts at
# k = 2, unlike trustworthiness and continuity).
SNE_lcmc_2d, iso_lcmc_2d, lle_lcmc_2d = (
    [mia.coranking.LCMC(cm, k) for k in range(2, max_k)]
    for cm in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
)



In [114]:

    
# One column per method, one row per neighbourhood size k.
# The LCMC values were computed for k = 2, 3, ..., so index the rows by k;
# with the default 0-based index the figure's x-axis was off by two.
lcmc_df = pd.DataFrame(
    {'SNE': SNE_lcmc_2d,
     'Isomap': iso_lcmc_2d,
     'LLE': lle_lcmc_2d},
    index=pd.Index(np.arange(2, len(SNE_lcmc_2d) + 2), name='k'),
    columns=['SNE', 'Isomap', 'LLE'])

# Label the axes so the saved figure can be read on its own.
ax = lcmc_df.plot()
ax.set_xlabel('Neighbourhood size k')
ax.set_ylabel('LCMC')
plt.savefig('figures/quality_measures/intensity_lcmc_2d.png', dpi=300)

3D Mappings



In [115]:

    
# Trustworthiness of each 3D mapping for k = 1 .. max_k - 1.
SNE_trustworthiness_3d, iso_trustworthiness_3d, lle_trustworthiness_3d = (
    [mia.coranking.trustworthiness(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
)



In [116]:

    
# One column per method, one row per neighbourhood size k.
# The values were computed for k = 1, 2, ..., so index the rows by k; with the
# default 0-based index the figure's x-axis was off by one relative to k.
trustworthiness3d_df = pd.DataFrame(
    {'SNE': SNE_trustworthiness_3d,
     'Isomap': iso_trustworthiness_3d,
     'LLE': lle_trustworthiness_3d},
    index=pd.Index(np.arange(1, len(SNE_trustworthiness_3d) + 1), name='k'),
    columns=['SNE', 'Isomap', 'LLE'])

# Label the axes so the saved figure can be read on its own.
ax = trustworthiness3d_df.plot()
ax.set_xlabel('Neighbourhood size k')
ax.set_ylabel('Trustworthiness')
plt.savefig('figures/quality_measures/intensity_trustworthiness_3d.png', dpi=300)



In [117]:

    
# Continuity of each 3D mapping for k = 1 .. max_k - 1.
SNE_continuity_3d, iso_continuity_3d, lle_continuity_3d = (
    [mia.coranking.continuity(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
)



In [118]:

    
# One column per method, one row per neighbourhood size k.
# The values were computed for k = 1, 2, ..., so index the rows by k; with the
# default 0-based index the figure's x-axis was off by one relative to k.
continuity3d_df = pd.DataFrame(
    {'SNE': SNE_continuity_3d,
     'Isomap': iso_continuity_3d,
     'LLE': lle_continuity_3d},
    index=pd.Index(np.arange(1, len(SNE_continuity_3d) + 1), name='k'),
    columns=['SNE', 'Isomap', 'LLE'])

# Label the axes so the saved figure can be read on its own.
ax = continuity3d_df.plot()
ax.set_xlabel('Neighbourhood size k')
ax.set_ylabel('Continuity')
plt.savefig('figures/quality_measures/intensity_continuity_3d.png', dpi=300)



In [119]:

    
# LCMC of each 3D mapping for k = 2 .. max_k - 1 (this measure starts at
# k = 2, unlike trustworthiness and continuity).
SNE_lcmc_3d, iso_lcmc_3d, lle_lcmc_3d = (
    [mia.coranking.LCMC(cm, k) for k in range(2, max_k)]
    for cm in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
)



In [120]:

    
# One column per method, one row per neighbourhood size k.
# The LCMC values were computed for k = 2, 3, ..., so index the rows by k;
# with the default 0-based index the figure's x-axis was off by two.
lcmc3d_df = pd.DataFrame(
    {'SNE': SNE_lcmc_3d,
     'Isomap': iso_lcmc_3d,
     'LLE': lle_lcmc_3d},
    index=pd.Index(np.arange(2, len(SNE_lcmc_3d) + 2), name='k'),
    columns=['SNE', 'Isomap', 'LLE'])

# Label the axes so the saved figure can be read on its own.
ax = lcmc3d_df.plot()
ax.set_xlabel('Neighbourhood size k')
ax.set_ylabel('LCMC')
plt.savefig('figures/quality_measures/intensity_lcmc_3d.png', dpi=300)

	count	mean	std	min	25%	50%	75%	max	skew	kurtosis	...	count_9	mean_9	std_9	min_9	25%_9	50%_9	75%_9	max_9	skew_9	kurtosis_9
p214-010-60001-cr.png	256	0.558904	0.087279	0.328763	0.510450	0.570363	0.622144	0.724731	-0.406150	-0.127657	...	131044	0.541212	0.118465	0.141845	0.462988	0.546140	0.624509	0.921247	-0.150691	0.148522
p214-010-60005-ml.png	256	0.579815	0.090863	0.314655	0.530318	0.591799	0.644746	0.759436	-0.575184	0.634554	...	131044	0.541212	0.118465	0.141845	0.462988	0.546140	0.624509	0.921247	-0.150691	0.148522
p214-010-60008-cr.png	256	0.493326	0.074813	0.296470	0.447373	0.496014	0.544188	0.679322	-0.100948	0.021890	...	131044	0.523970	0.099598	0.148544	0.455340	0.521359	0.589320	0.936893	-0.017659	0.898368
p214-010-60012-ml.png	256	0.469238	0.081322	0.254982	0.415496	0.473583	0.527386	0.660004	-0.204323	-0.192516	...	131044	0.541212	0.118465	0.141845	0.462988	0.546140	0.624509	0.921247	-0.150691	0.148522
p214-010-60013-mr.png	256	0.524458	0.087562	0.285158	0.467286	0.534160	0.588593	0.704526	-0.407108	0.041762	...	131044	0.554975	0.131461	0.138075	0.463040	0.552301	0.645746	0.949791	0.038416	-0.374100

	KS	p-value
count	0.000000	1.000000e+00
mean	1.000000	3.587622e-59
std	0.876389	1.515491e-45
min	1.000000	3.587622e-59
25%	1.000000	3.587622e-59
50%	1.000000	3.587622e-59
75%	1.000000	3.587622e-59
max	1.000000	3.587622e-59
skew	0.891667	3.923764e-47
kurtosis	0.568056	2.209941e-19
count_1	0.000000	1.000000e+00
mean_1	1.000000	3.587622e-59
std_1	0.981944	4.539903e-57
min_1	1.000000	3.587622e-59
25%_1	1.000000	3.587622e-59
50%_1	1.000000	3.587622e-59
75%_1	1.000000	3.587622e-59
max_1	0.997222	7.598385e-59
skew_1	0.784722	1.335838e-36
kurtosis_1	0.466667	3.216681e-13
count_2	0.000000	1.000000e+00
mean_2	1.000000	3.587622e-59
std_2	1.000000	3.587622e-59
min_2	1.000000	3.587622e-59
25%_2	1.000000	3.587622e-59
50%_2	1.000000	3.587622e-59
75%_2	1.000000	3.587622e-59
max_2	0.994444	1.605940e-58
skew_2	0.501389	3.410114e-15
kurtosis_2	0.436111	1.342503e-11
...	...	...
count_7	0.000000	1.000000e+00
mean_7	1.000000	3.587622e-59
std_7	0.955556	4.577742e-54
min_7	1.000000	3.587622e-59
25%_7	1.000000	3.587622e-59
50%_7	1.000000	3.587622e-59
75%_7	1.000000	3.587622e-59
max_7	0.740278	1.280685e-32
skew_7	0.319444	2.024353e-06
kurtosis_7	0.304167	7.344875e-06
count_8	0.000000	1.000000e+00
mean_8	1.000000	3.587622e-59
std_8	0.929167	3.823290e-51
min_8	1.000000	3.587622e-59
25%_8	1.000000	3.587622e-59
50%_8	1.000000	3.587622e-59
75%_8	1.000000	3.587622e-59
max_8	0.736111	2.943248e-32
skew_8	0.547222	5.120902e-18
kurtosis_8	0.395833	1.248634e-09
count_9	0.000000	1.000000e+00
mean_9	1.000000	3.587622e-59
std_9	0.956944	3.195999e-54
min_9	1.000000	3.587622e-59
25%_9	1.000000	3.587622e-59
50%_9	1.000000	3.587622e-59
75%_9	0.997222	7.598385e-59
max_9	0.794444	1.674259e-37
skew_9	0.702778	1.934059e-29
kurtosis_9	0.891667	3.923764e-47