In [121]:

    
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import mia









    



Warning: Cannot change to a different GUI toolkit: qt. Using osx instead.

Loading and Preprocessing

Loading the hologic and synthetic datasets.



In [83]:

    
hologic = pd.DataFrame.from_csv("hologic_blobs.csv")
phantom = pd.DataFrame.from_csv("synthetic_blobs.csv")

Loading the meta data for the real and synthetic datasets.



In [84]:

    
hologic_meta = mia.analysis.create_hologic_meta_data(hologic, "meta_data/real_meta.csv")
phantom_meta = mia.analysis.create_synthetic_meta_data(phantom, "meta_data/synthetic_meta.csv")
phantom_meta.index.name = 'img_name'

Prepare the BI-RADS/VBD labels for both datasets.



In [85]:

    
hologic_labels = hologic_meta.drop_duplicates().BIRADS
phantom_labels = phantom_meta['VBD.1']

class_labels = pd.concat([hologic_labels, phantom_labels])
class_labels.index.name = "img_name"
labels = mia.analysis.remove_duplicate_index(class_labels)[0]

Creating Features

Create blob features from distribution of blobs



In [86]:

    
hologic_blob_features = mia.analysis.features_from_blobs(hologic)
phantom_blob_features = mia.analysis.features_from_blobs(phantom)

Take a random subset of the phantom mammograms. This is important so that each case is not over represented.



In [87]:

    
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)
phantom_blob_features['phantom_name'] = syn_feature_meta.phantom_name.tolist()
phantom_blob_features_subset = mia.analysis.create_random_subset(phantom_blob_features, 'phantom_name')

# hologic_blob_features['patient_id'] = hologic_meta['patient_id'].drop_duplicates()
# hologic_blob_features_subset = mia.analysis.create_random_subset(hologic_blob_features, 'patient_id')

Combine the features from both datasets.



In [88]:

    
features = pd.concat([hologic_blob_features, phantom_blob_features_subset])
assert features.shape[0] == 366
features.head()









    Out[88]:






  
    
      
      blob_count
      avg_radius
      std_radius
      min_radius
      max_radius
      small_radius_count
      med_radius_count
      large_radius_count
      density
      upper_dist_count
      25%
      50%
      75%
    
  
  
    
      p214-010-60001-cl.png
        56
       22.121831
       22.923389
       8
       128.000000
        52
       1
       3
       52.940812
       21
       8
       11.313708
       22.627417
    
    
      p214-010-60001-cr.png
        78
       19.054538
       17.506086
       8
        90.509668
        68
       4
       6
       40.749811
       22
       8
       11.313708
       22.627417
    
    
      p214-010-60001-ml.png
        98
       20.011191
       21.876304
       8
       128.000000
        90
       3
       5
       42.644057
       27
       8
       11.313708
       22.627417
    
    
      p214-010-60001-mr.png
       139
       15.309764
       15.307860
       8
       128.000000
       136
       1
       2
       38.287439
       40
       8
       11.313708
       16.000000
    
    
      p214-010-60005-cl.png
        97
       20.132590
       23.255605
       8
       181.019336
        94
       2
       1
       41.456308
       27
       8
       11.313708
       22.627417

Filter some features, such as the min, to remove noise.



In [89]:

    
selected_features = features.drop(['min_radius'], axis=1)

Compare Real and Synthetic Features

Compare the distributions of features detected from the real mammograms and the phantoms using the Kolmogorov-Smirnov two sample test.



In [90]:

    
ks_stats = [list(stats.ks_2samp(hologic_blob_features[col], 
                                phantom_blob_features[col]))
                                for col in selected_features.columns]

ks_test = pd.DataFrame(ks_stats, columns=['KS', 'p-value'], index=selected_features.columns)
ks_test.to_latex("tables/blob_features_ks.tex")
ks_test









    Out[90]:






  
    
      
      KS
      p-value
    
  
  
    
      blob_count
       0.341667
       2.774753e-07
    
    
      avg_radius
       0.847222
       1.360953e-42
    
    
      std_radius
       0.711111
       3.929143e-30
    
    
      max_radius
       0.363889
       3.327680e-08
    
    
      small_radius_count
       0.319444
       2.024353e-06
    
    
      med_radius_count
       0.338889
       3.583275e-07
    
    
      large_radius_count
       0.733333
       5.112303e-32
    
    
      density
       0.169444
       4.114705e-02
    
    
      upper_dist_count
       0.345833
       1.883393e-07
    
    
      25%
       0.358333
       5.726005e-08
    
    
      50%
       0.743056
       7.334764e-33
    
    
      75%
       0.777778
       5.796838e-36

Dimensionality Reduction

t-SNE

Running t-SNE to obtain a two dimensional representation.



In [91]:

    
kwargs = {
    'learning_rate': 300,
    'perplexity': 30,
    'verbose': 1
}



In [92]:

    
SNE_mapping_2d, error = mia.analysis.tSNE(selected_features, n_components=2, **kwargs)









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 0.989818
[t-SNE] Error after 83 iterations with early exaggeration: 12.840095
[t-SNE] Error after 284 iterations: 0.384253



In [93]:

    
mia.plotting.plot_mapping_2d(SNE_mapping_2d, hologic_blob_features.index, phantom_blob_features_subset.index, labels)
plt.savefig('figures/mappings/blob_SNE_mapping_2d.png', dpi=300)

Running t-SNE to obtain a 3 dimensional mapping



In [94]:

    
SNE_mapping_3d, error = mia.analysis.tSNE(selected_features, n_components=3, **kwargs)









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 0.989818
[t-SNE] Error after 83 iterations with early exaggeration: 13.395568
[t-SNE] Error after 228 iterations: 0.412479



In [126]:

    
mia.plotting.plot_mapping_3d(SNE_mapping_3d, hologic_blob_features.index, phantom_blob_features_subset.index, labels)









    Out[126]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x11472cdd0>

Isomap

Running Isomap to obtain a 2 dimensional mapping



In [96]:

    
iso_kwargs = {
    'n_neighbors': 10,
}



In [97]:

    
iso_mapping_2d, error = mia.analysis.isomap(selected_features, n_components=2, **iso_kwargs)
print error









    



5.68455464542



In [132]:

    
mia.plotting.plot_mapping_2d(iso_mapping_2d, hologic_blob_features.index, phantom_blob_features_subset.index, labels)
mia.io_tools.dump_mapping_to_json(iso_mapping_2d.loc[hologic_blob_features.index], [0,1], labels, 'data.json')
plt.savefig('figures/mappings/blob_iso_mapping_2d.png', dpi=300)



In [99]:

    
iso_mapping_3d, error = mia.analysis.isomap(selected_features, n_components=3, **iso_kwargs)
print error









    



4.26343423263



In [123]:

    
mia.plotting.plot_mapping_3d(iso_mapping_3d, hologic_blob_features.index, phantom_blob_features_subset.index, labels)









    Out[123]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x11230ecd0>

Locally Linear Embedding

Running locally linear embedding to obtain 2d mapping



In [101]:

    
lle_kwargs = {
    'n_neighbors': 10,
}



In [102]:

    
lle_mapping_2d, error = mia.analysis.lle(selected_features, n_components=2, **lle_kwargs)
print error









    



1.17807316961e-07



In [103]:

    
mia.plotting.plot_mapping_2d(lle_mapping_2d, hologic_blob_features.index, phantom_blob_features_subset.index, labels)
plt.savefig('figures/mappings/blob_lle_mapping_2d.png', dpi=300)



In [104]:

    
lle_mapping_3d, error = mia.analysis.lle(selected_features, n_components=3, **lle_kwargs)



In [124]:

    
mia.plotting.plot_mapping_3d(lle_mapping_3d, hologic_blob_features.index, phantom_blob_features_subset.index, labels)









    Out[124]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x1124fee10>

Quality Assessment of Dimensionality Reduction

Assess the quality of the DR against measurements from the co-ranking matrices. First create co-ranking matrices for each of the dimensionality reduction mappings



In [106]:

    
max_k = 50



In [107]:

    
SNE_mapping_2d_cm = mia.coranking.coranking_matrix(selected_features, SNE_mapping_2d)
iso_mapping_2d_cm = mia.coranking.coranking_matrix(selected_features, iso_mapping_2d)
lle_mapping_2d_cm = mia.coranking.coranking_matrix(selected_features, lle_mapping_2d)

SNE_mapping_3d_cm = mia.coranking.coranking_matrix(selected_features, SNE_mapping_3d)
iso_mapping_3d_cm = mia.coranking.coranking_matrix(selected_features, iso_mapping_3d)
lle_mapping_3d_cm = mia.coranking.coranking_matrix(selected_features, lle_mapping_3d)

2D Mappings



In [108]:

    
SNE_trustworthiness_2d = [mia.coranking.trustworthiness(SNE_mapping_2d_cm, k) for k in range(1, max_k)]
iso_trustworthiness_2d = [mia.coranking.trustworthiness(iso_mapping_2d_cm, k) for k in range(1, max_k)]
lle_trustworthiness_2d = [mia.coranking.trustworthiness(lle_mapping_2d_cm, k) for k in range(1, max_k)]



In [109]:

    
trustworthiness_df = pd.DataFrame([SNE_trustworthiness_2d,
                                   iso_trustworthiness_2d,
                                   lle_trustworthiness_2d], 
                                   index=['SNE', 'Isomap', 'LLE']).T
trustworthiness_df.plot()
plt.savefig('figures/quality_measures/blob_trustworthiness_2d.png', dpi=300)



In [110]:

    
SNE_continuity_2d = [mia.coranking.continuity(SNE_mapping_2d_cm, k) for k in range(1, max_k)]
iso_continuity_2d = [mia.coranking.continuity(iso_mapping_2d_cm, k) for k in range(1, max_k)]
lle_continuity_2d = [mia.coranking.continuity(lle_mapping_2d_cm, k) for k in range(1, max_k)]



In [111]:

    
continuity_df = pd.DataFrame([SNE_continuity_2d,
                              iso_continuity_2d,
                              lle_continuity_2d], 
                              index=['SNE', 'Isomap', 'LLE']).T
continuity_df.plot()
plt.savefig('figures/quality_measures/blob_continuity_2d.png', dpi=300)



In [112]:

    
SNE_lcmc_2d = [mia.coranking.LCMC(SNE_mapping_2d_cm, k) for k in range(2, max_k)]
iso_lcmc_2d = [mia.coranking.LCMC(iso_mapping_2d_cm, k) for k in range(2, max_k)]
lle_lcmc_2d = [mia.coranking.LCMC(lle_mapping_2d_cm, k) for k in range(2, max_k)]



In [113]:

    
lcmc_df = pd.DataFrame([SNE_lcmc_2d,
                        iso_lcmc_2d,
                        lle_lcmc_2d], 
                        index=['SNE', 'Isomap', 'LLE']).T
lcmc_df.plot()
plt.savefig('figures/quality_measures/blob_lcmc_2d.png', dpi=300)

3D Mappings



In [114]:

    
SNE_trustworthiness_3d = [mia.coranking.trustworthiness(SNE_mapping_3d_cm, k) for k in range(1, max_k)]
iso_trustworthiness_3d = [mia.coranking.trustworthiness(iso_mapping_3d_cm, k) for k in range(1, max_k)]
lle_trustworthiness_3d = [mia.coranking.trustworthiness(lle_mapping_3d_cm, k) for k in range(1, max_k)]



In [115]:

    
trustworthiness3d_df = pd.DataFrame([SNE_trustworthiness_3d,
                                   iso_trustworthiness_3d,
                                   lle_trustworthiness_3d], 
                                   index=['SNE', 'Isomap', 'LLE']).T
trustworthiness3d_df.plot()
plt.savefig('figures/quality_measures/blob_trustworthiness_3d.png', dpi=300)



In [116]:

    
SNE_continuity_3d = [mia.coranking.continuity(SNE_mapping_3d_cm, k) for k in range(1, max_k)]
iso_continuity_3d = [mia.coranking.continuity(iso_mapping_3d_cm, k) for k in range(1, max_k)]
lle_continuity_3d = [mia.coranking.continuity(lle_mapping_3d_cm, k) for k in range(1, max_k)]



In [117]:

    
continuity3d_df = pd.DataFrame([SNE_continuity_3d,
                              iso_continuity_3d,
                              lle_continuity_3d], 
                              index=['SNE', 'Isomap', 'LLE']).T
continuity3d_df.plot()
plt.savefig('figures/quality_measures/blob_continuity_3d.png', dpi=300)



In [118]:

    
SNE_lcmc_3d = [mia.coranking.LCMC(SNE_mapping_3d_cm, k) for k in range(2, max_k)]
iso_lcmc_3d = [mia.coranking.LCMC(iso_mapping_3d_cm, k) for k in range(2, max_k)]
lle_lcmc_3d = [mia.coranking.LCMC(lle_mapping_3d_cm, k) for k in range(2, max_k)]



In [119]:

    
lcmc3d_df = pd.DataFrame([SNE_lcmc_3d,
                        iso_lcmc_3d,
                        lle_lcmc_3d], 
                        index=['SNE', 'Isomap', 'LLE']).T
lcmc3d_df.plot()
plt.savefig('figures/quality_measures/blob_lcmc_3d.png', dpi=300)

	blob_count	avg_radius	std_radius	min_radius	max_radius	small_radius_count	med_radius_count	large_radius_count	density	upper_dist_count	25%	50%	75%
p214-010-60001-cl.png	56	22.121831	22.923389	8	128.000000	52	1	3	52.940812	21	8	11.313708	22.627417
p214-010-60001-cr.png	78	19.054538	17.506086	8	90.509668	68	4	6	40.749811	22	8	11.313708	22.627417
p214-010-60001-ml.png	98	20.011191	21.876304	8	128.000000	90	3	5	42.644057	27	8	11.313708	22.627417
p214-010-60001-mr.png	139	15.309764	15.307860	8	128.000000	136	1	2	38.287439	40	8	11.313708	16.000000
p214-010-60005-cl.png	97	20.132590	23.255605	8	181.019336	94	2	1	41.456308	27	8	11.313708	22.627417

	KS	p-value
blob_count	0.341667	2.774753e-07
avg_radius	0.847222	1.360953e-42
std_radius	0.711111	3.929143e-30
max_radius	0.363889	3.327680e-08
small_radius_count	0.319444	2.024353e-06
med_radius_count	0.338889	3.583275e-07
large_radius_count	0.733333	5.112303e-32
density	0.169444	4.114705e-02
upper_dist_count	0.345833	1.883393e-07
25%	0.358333	5.726005e-08
50%	0.743056	7.334764e-33
75%	0.777778	5.796838e-36