In [159]:

    
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import mia









    



Warning: Cannot change to a different GUI toolkit: qt. Using osx instead.

Loading and Preprocessing

Loading the hologic and synthetic datasets.



In [113]:

    
# Load the line detections for the real (hologic) and synthetic (phantom) images.
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0; read_csv
# with index_col=0 and parse_dates=True reproduces its defaults (first column
# becomes the index and date parsing is attempted on that index).
hologic = pd.read_csv("real-lines.csv", index_col=0, parse_dates=True)
phantom = pd.read_csv("phantom-lines.csv", index_col=0, parse_dates=True)

Loading the meta data for the real and synthetic datasets.



In [114]:

    
# Build the metadata tables for both datasets with the mia helpers; each call
# receives the loaded line frame and the path of a metadata CSV.
# NOTE(review): how the CSV is matched to the frame is defined inside
# mia.analysis and is not visible from this notebook.
hologic_meta = mia.analysis.create_hologic_meta_data(hologic, "meta_data/real_meta.csv")
phantom_meta = mia.analysis.create_synthetic_meta_data(phantom, "meta_data/synthetic_meta.csv")
# Give the phantom metadata index the name 'img_name'.
phantom_meta.index.name = 'img_name'

Prepare the BI-RADS/VBD labels for both datasets.



In [115]:

    
# Real images: drop duplicated metadata rows, then take the BI-RADS column.
hologic_labels = hologic_meta.drop_duplicates()['BIRADS']
# Phantoms: the label is the 'VBD.1' metadata column.
phantom_labels = phantom_meta['VBD.1']

# Stack both label series into a single series whose index is named 'img_name'.
class_labels = pd.concat([hologic_labels, phantom_labels])
class_labels.index.name = "img_name"
# Keep element 0 of whatever remove_duplicate_index returns.
# NOTE(review): presumably this selects the label column of a de-duplicated
# frame — confirm against mia.analysis.remove_duplicate_index.
labels = mia.analysis.remove_duplicate_index(class_labels)[0]

Creating Features

Create line features from the distribution of lines detected in each image



In [116]:

    
# Derive a feature table for each dataset from its line detections using
# mia.analysis.features_from_lines.
# NOTE(review): presumably one row of summary statistics per image (compare the
# features.head() output further down) — confirm against mia.analysis.
hologic_line_features = mia.analysis.features_from_lines(hologic)
phantom_line_features = mia.analysis.features_from_lines(phantom)

Take a random subset of the phantom mammograms. This is important so that no single phantom case is over-represented in the combined feature set.



In [117]:

    
# De-duplicate the phantom metadata so it can be lined up with the feature rows.
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)
# Attach the phantom name to each feature row. The values are assigned as a
# plain list, i.e. by position rather than by index label.
# NOTE(review): this assumes both frames share the same row order — confirm.
phantom_line_features['phantom_name'] = syn_feature_meta['phantom_name'].tolist()
# Draw the random subset, keyed on the 'phantom_name' column.
# NOTE(review): no seed is passed here, so the subset may differ between runs.
phantom_line_features_subset = mia.analysis.create_random_subset(
    phantom_line_features, 'phantom_name')

# Disabled: the equivalent per-patient subsetting for the real images.
# hologic_blob_features['patient_id'] = hologic_meta['patient_id'].drop_duplicates()
# hologic_blob_features_subset = mia.analysis.create_random_subset(hologic_blob_features, 'patient_id')

Combine the features from both datasets.



In [118]:

    
# Stack the real features on top of the phantom subset.
features = pd.concat((hologic_line_features, phantom_line_features_subset))
# Sanity check: the combined set is expected to hold exactly 366 rows.
assert len(features) == 366
features.head()









    Out[118]:






  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
      skew
      kurtosis
      upper_dist_count
    
  
  
    
      p214-010-60001-cl.png
        72
       161.791667
       245.194659
       1
       61.50
       94.5
       137.25
       1744
       4.786025
       26.793023
       16
    
    
      p214-010-60001-cr.png
        57
       176.421053
       460.921203
       1
       59.00
       77.0
       152.00
       3494
       6.905092
       50.202367
       13
    
    
      p214-010-60001-ml.png
        72
       200.708333
       296.524822
       1
       62.00
       95.5
       210.50
       1662
       3.484285
       13.937538
       19
    
    
      p214-010-60001-mr.png
        74
       203.459459
       371.688380
       1
       52.25
       81.0
       192.25
       2522
       4.309621
       22.429202
       16
    
    
      p214-010-60005-cl.png
       114
       143.184211
       152.914376
       1
       58.25
       89.5
       170.75
        911
       2.667624
        9.028130
       36

Drop uninformative features (here `min`, which is 1 in every row shown above) to remove noise, then replace any missing feature values with 0.



In [119]:

    
# Drop the 'min' column and replace missing values with 0 in one chained,
# non-mutating expression (avoids inplace=True, so re-running the cell always
# rebuilds selected_features from `features`).
selected_features = features.drop(['min'], axis=1).fillna(0)

Compare Real and Synthetic Features

Compare the distributions of features detected from the real mammograms and the phantoms using the Kolmogorov-Smirnov two sample test.



In [120]:

    
# Two-sample Kolmogorov-Smirnov test per selected feature, real vs. phantom.
# Note that the samples come from hologic_line_features and the *full*
# phantom_line_features frame (not the random subset), and from the columns
# before missing values were filled.
ks_stats = []
for col in selected_features.columns:
    statistic, p_value = stats.ks_2samp(hologic_line_features[col],
                                        phantom_line_features[col])
    ks_stats.append([statistic, p_value])

# One row per feature: KS statistic and its p-value; also exported as LaTeX.
ks_test = pd.DataFrame(ks_stats,
                       columns=['KS', 'p-value'],
                       index=selected_features.columns)
ks_test.to_latex("tables/line_features_ks.tex")
ks_test









    Out[120]:






  
    
      
      KS
      p-value
    
  
  
    
      count
       0.933333
       1.338265e-51
    
    
      mean
       0.593056
       4.356206e-21
    
    
      std
       0.654167
       1.450514e-25
    
    
      25%
       0.143056
       1.255126e-01
    
    
      50%
       0.304167
       7.344875e-06
    
    
      75%
       0.508333
       1.320817e-15
    
    
      max
       0.737500
       2.231475e-32
    
    
      skew
       0.480556
       5.426886e-14
    
    
      kurtosis
       0.506944
       1.598384e-15
    
    
      upper_dist_count
       0.913889
       1.724254e-49

Dimensionality Reduction

t-SNE

Running t-SNE to obtain a two dimensional representation.



In [121]:

    
# Settings shared by the 2D and 3D t-SNE runs below.
kwargs = dict(learning_rate=300, perplexity=30, verbose=1)



In [122]:

    
# Embed the selected features in 2 dimensions with t-SNE using the shared
# `kwargs`. The call returns a pair; the second element is stored as `error`.
# NOTE(review): no random seed is passed, so the embedding may differ between
# runs — confirm whether mia.analysis.tSNE accepts one.
SNE_mapping_2d, error = mia.analysis.tSNE(selected_features, n_components=2, **kwargs)









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 0.935505
[t-SNE] Error after 83 iterations with early exaggeration: 14.319531
[t-SNE] Error after 292 iterations: 0.654946



In [123]:

    
# Plot the 2D t-SNE mapping, passing the real and phantom-subset indices and
# the class labels, then save the current figure at 300 dpi.
mia.plotting.plot_mapping_2d(SNE_mapping_2d, hologic_line_features.index, phantom_line_features_subset.index, labels)
plt.savefig('figures/mappings/line_SNE_mapping_2d.png', dpi=300)

Running t-SNE to obtain a 3 dimensional mapping



In [124]:

    
# Embed the selected features in 3 dimensions with t-SNE using the shared
# `kwargs`. This overwrites `error` from the 2D run.
# NOTE(review): no random seed is passed, so the embedding may differ between runs.
SNE_mapping_3d, error = mia.analysis.tSNE(selected_features, n_components=3, **kwargs)









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 0.935505
[t-SNE] Error after 92 iterations with early exaggeration: 14.468912
[t-SNE] Error after 333 iterations: 0.664658



In [160]:

    
# Plot the 3D t-SNE mapping. The call is the cell's last expression, so its
# return value is shown as the cell output. Saving the figure is currently disabled.
mia.plotting.plot_mapping_3d(SNE_mapping_3d, hologic_line_features.index, phantom_line_features_subset.index, labels)
# plt.savefig('figures/mappings/line_SNE_mapping_3d.png', dpi=300)









    Out[160]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x110d3dd50>

Isomap

Running Isomap to obtain a 2 dimensional mapping



In [126]:

    
# Settings shared by the 2D and 3D Isomap runs below.
iso_kwargs = dict(n_neighbors=10)



In [127]:

    
# Embed the selected features in 2 dimensions with Isomap using `iso_kwargs`.
# The second returned element is stored as `error`, replacing the earlier value.
iso_mapping_2d, error = mia.analysis.isomap(selected_features, n_components=2, **iso_kwargs)



In [128]:

    
# Plot the 2D Isomap mapping, passing the real and phantom-subset indices and
# the class labels, then save the current figure at 300 dpi.
mia.plotting.plot_mapping_2d(iso_mapping_2d, hologic_line_features.index, phantom_line_features_subset.index, labels)
plt.savefig('figures/mappings/line_iso_mapping_2d.png', dpi=300)



In [129]:

    
# Embed the selected features in 3 dimensions with Isomap using `iso_kwargs`.
# The second returned element is stored as `error`, replacing the earlier value.
iso_mapping_3d, error = mia.analysis.isomap(selected_features, n_components=3, **iso_kwargs)



In [163]:

    
# Plot the 3D Isomap mapping. The call is the cell's last expression, so its
# return value is shown as the cell output. Saving the figure is currently disabled.
mia.plotting.plot_mapping_3d(iso_mapping_3d, hologic_line_features.index, phantom_line_features_subset.index, labels)
# plt.savefig('figures/mappings/line_iso_mapping_3d.png', dpi=300)









    Out[163]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x113f47b90>

Locally Linear Embedding

Running locally linear embedding to obtain 2d mapping



In [131]:

    
# Settings shared by the 2D and 3D locally linear embedding runs below.
lle_kwargs = dict(n_neighbors=10)



In [132]:

    
# Embed the selected features in 2 dimensions with locally linear embedding
# using `lle_kwargs`. The second returned element is stored as `error`,
# replacing the earlier value.
lle_mapping_2d, error = mia.analysis.lle(selected_features, n_components=2, **lle_kwargs)



In [133]:

    
# Plot the 2D LLE mapping, passing the real and phantom-subset indices and the
# class labels, then save the current figure at 300 dpi.
mia.plotting.plot_mapping_2d(lle_mapping_2d, hologic_line_features.index, phantom_line_features_subset.index, labels)
plt.savefig('figures/mappings/line_lle_mapping_2d.png', dpi=300)



In [134]:

    
# Embed the selected features in 3 dimensions with locally linear embedding
# using `lle_kwargs`. The second returned element is stored as `error`,
# replacing the earlier value.
lle_mapping_3d, error = mia.analysis.lle(selected_features, n_components=3, **lle_kwargs)



In [162]:

    
# Plot the 3D LLE mapping. The call is the cell's last expression, so its
# return value is shown as the cell output. Saving the figure is currently disabled.
mia.plotting.plot_mapping_3d(lle_mapping_3d, hologic_line_features.index, phantom_line_features_subset.index, labels)
# plt.savefig('figures/mappings/line_lle_mapping_3d.png', dpi=300)









    Out[162]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x113799550>

Quality Assessment of Dimensionality Reduction

Assess the quality of each dimensionality reduction (DR) mapping using measures derived from co-ranking matrices. First, create a co-ranking matrix for each of the mappings.



In [136]:

    
# Exclusive upper bound on the neighbourhood size k: the quality measures below
# loop over range(1, max_k) or range(2, max_k), so k = max_k itself is never evaluated.
max_k = 50



In [137]:

    
# Co-ranking matrix of the original feature space against each 2D mapping.
SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm = (
    mia.coranking.coranking_matrix(selected_features, mapping)
    for mapping in (SNE_mapping_2d, iso_mapping_2d, lle_mapping_2d)
)

# The same for each 3D mapping.
SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm = (
    mia.coranking.coranking_matrix(selected_features, mapping)
    for mapping in (SNE_mapping_3d, iso_mapping_3d, lle_mapping_3d)
)

2D Mappings



In [138]:

    
# Trustworthiness of each 2D mapping for k = 1 .. max_k - 1, computed in the
# order SNE, Isomap, LLE.
SNE_trustworthiness_2d, iso_trustworthiness_2d, lle_trustworthiness_2d = (
    [mia.coranking.trustworthiness(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
)



In [139]:

    
# Collect the three 2D trustworthiness curves, one column per method.
trustworthiness_df = pd.DataFrame([SNE_trustworthiness_2d,
                                   iso_trustworthiness_2d,
                                   lle_trustworthiness_2d],
                                   index=['SNE', 'Isomap', 'LLE']).T
# The curves were evaluated for k = 1, 2, ...; index the rows by k so the
# x-axis shows the neighbourhood size instead of the 0-based list position.
trustworthiness_df.index = np.arange(1, len(trustworthiness_df) + 1)
# Label the axes so the saved figure stands on its own.
trustworthiness_df.plot().set(xlabel='Neighbourhood size k', ylabel='Trustworthiness')
plt.savefig('figures/quality_measures/line_trustworthiness_2d.png', dpi=300)



In [140]:

    
# Continuity of each 2D mapping for k = 1 .. max_k - 1, computed in the order
# SNE, Isomap, LLE.
SNE_continuity_2d, iso_continuity_2d, lle_continuity_2d = (
    [mia.coranking.continuity(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
)



In [141]:

    
# Collect the three 2D continuity curves, one column per method.
continuity_df = pd.DataFrame([SNE_continuity_2d,
                              iso_continuity_2d,
                              lle_continuity_2d],
                              index=['SNE', 'Isomap', 'LLE']).T
# The curves were evaluated for k = 1, 2, ...; index the rows by k so the
# x-axis shows the neighbourhood size instead of the 0-based list position.
continuity_df.index = np.arange(1, len(continuity_df) + 1)
# Label the axes so the saved figure stands on its own.
continuity_df.plot().set(xlabel='Neighbourhood size k', ylabel='Continuity')
plt.savefig('figures/quality_measures/line_continuity_2d.png', dpi=300)



In [142]:

    
# LCMC of each 2D mapping for k = 2 .. max_k - 1 (note the start at 2),
# computed in the order SNE, Isomap, LLE.
SNE_lcmc_2d, iso_lcmc_2d, lle_lcmc_2d = (
    [mia.coranking.LCMC(cm, k) for k in range(2, max_k)]
    for cm in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
)



In [143]:

    
# Collect the three 2D LCMC curves, one column per method.
lcmc_df = pd.DataFrame([SNE_lcmc_2d,
                        iso_lcmc_2d,
                        lle_lcmc_2d],
                        index=['SNE', 'Isomap', 'LLE']).T
# The LCMC curves were evaluated for k = 2, 3, ...; index the rows by k so the
# x-axis shows the neighbourhood size instead of the 0-based list position.
lcmc_df.index = np.arange(2, len(lcmc_df) + 2)
# Label the axes so the saved figure stands on its own.
lcmc_df.plot().set(xlabel='Neighbourhood size k', ylabel='LCMC')
plt.savefig('figures/quality_measures/line_lcmc_2d.png', dpi=300)

3D Mappings



In [144]:

    
# Trustworthiness of each 3D mapping for k = 1 .. max_k - 1, computed in the
# order SNE, Isomap, LLE.
SNE_trustworthiness_3d, iso_trustworthiness_3d, lle_trustworthiness_3d = (
    [mia.coranking.trustworthiness(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
)



In [145]:

    
# Collect the three 3D trustworthiness curves, one column per method.
trustworthiness3d_df = pd.DataFrame([SNE_trustworthiness_3d,
                                   iso_trustworthiness_3d,
                                   lle_trustworthiness_3d],
                                   index=['SNE', 'Isomap', 'LLE']).T
# The curves were evaluated for k = 1, 2, ...; index the rows by k so the
# x-axis shows the neighbourhood size instead of the 0-based list position.
trustworthiness3d_df.index = np.arange(1, len(trustworthiness3d_df) + 1)
# Label the axes so the saved figure stands on its own.
trustworthiness3d_df.plot().set(xlabel='Neighbourhood size k', ylabel='Trustworthiness')
plt.savefig('figures/quality_measures/line_trustworthiness_3d.png', dpi=300)



In [146]:

    
# Continuity of each 3D mapping for k = 1 .. max_k - 1, computed in the order
# SNE, Isomap, LLE.
SNE_continuity_3d, iso_continuity_3d, lle_continuity_3d = (
    [mia.coranking.continuity(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
)



In [147]:

    
# Collect the three 3D continuity curves, one column per method.
continuity3d_df = pd.DataFrame([SNE_continuity_3d,
                              iso_continuity_3d,
                              lle_continuity_3d],
                              index=['SNE', 'Isomap', 'LLE']).T
# The curves were evaluated for k = 1, 2, ...; index the rows by k so the
# x-axis shows the neighbourhood size instead of the 0-based list position.
continuity3d_df.index = np.arange(1, len(continuity3d_df) + 1)
# Label the axes so the saved figure stands on its own.
continuity3d_df.plot().set(xlabel='Neighbourhood size k', ylabel='Continuity')
plt.savefig('figures/quality_measures/line_continuity_3d.png', dpi=300)



In [148]:

    
# LCMC of each 3D mapping for k = 2 .. max_k - 1 (note the start at 2),
# computed in the order SNE, Isomap, LLE.
SNE_lcmc_3d, iso_lcmc_3d, lle_lcmc_3d = (
    [mia.coranking.LCMC(cm, k) for k in range(2, max_k)]
    for cm in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
)



In [149]:

    
# Collect the three 3D LCMC curves, one column per method.
lcmc3d_df = pd.DataFrame([SNE_lcmc_3d,
                        iso_lcmc_3d,
                        lle_lcmc_3d],
                        index=['SNE', 'Isomap', 'LLE']).T
# The LCMC curves were evaluated for k = 2, 3, ...; index the rows by k so the
# x-axis shows the neighbourhood size instead of the 0-based list position.
lcmc3d_df.index = np.arange(2, len(lcmc3d_df) + 2)
# Label the axes so the saved figure stands on its own.
lcmc3d_df.plot().set(xlabel='Neighbourhood size k', ylabel='LCMC')
plt.savefig('figures/quality_measures/line_lcmc_3d.png', dpi=300)

	count	mean	std	min	25%	50%	75%	max	skew	kurtosis	upper_dist_count
p214-010-60001-cl.png	72	161.791667	245.194659	1	61.50	94.5	137.25	1744	4.786025	26.793023	16
p214-010-60001-cr.png	57	176.421053	460.921203	1	59.00	77.0	152.00	3494	6.905092	50.202367	13
p214-010-60001-ml.png	72	200.708333	296.524822	1	62.00	95.5	210.50	1662	3.484285	13.937538	19
p214-010-60001-mr.png	74	203.459459	371.688380	1	52.25	81.0	192.25	2522	4.309621	22.429202	16
p214-010-60005-cl.png	114	143.184211	152.914376	1	58.25	89.5	170.75	911	2.667624	9.028130	36

	KS	p-value
count	0.933333	1.338265e-51
mean	0.593056	4.356206e-21
std	0.654167	1.450514e-25
25%	0.143056	1.255126e-01
50%	0.304167	7.344875e-06
75%	0.508333	1.320817e-15
max	0.737500	2.231475e-32
skew	0.480556	5.426886e-14
kurtosis	0.506944	1.598384e-15
upper_dist_count	0.913889	1.724254e-49