In [48]:

    
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import mia

Loading and Preprocessing

Loading the hologic and synthetic datasets.



In [2]:

    
def _load_line_features(csv_path):
    """Read a line-intensity CSV and strip the columns this analysis ignores.

    The first CSV column becomes the index. The first two remaining columns
    and the 'breast_area' column are then dropped.

    `pd.DataFrame.from_csv` has been removed from pandas, so `pd.read_csv` is
    used instead. `from_csv` also attempted to parse the index as dates; that
    is not requested here.
    NOTE(review): assumes the index holds image names rather than dates,
    as the rendered feature table suggests - confirm for the phantom file.
    """
    frame = pd.read_csv(csv_path, index_col=0)
    # Positional drop: the two leading columns are removed whatever their names.
    frame = frame.drop(frame.columns[:2], axis=1)
    return frame.drop('breast_area', axis=1)


hologic = _load_line_features("real_intensity_lines.csv")

phantom = _load_line_features("synthetic_intensity_lines.csv")

Loading the metadata for the real and synthetic datasets.



In [3]:

    
# Build one metadata table per dataset from its metadata CSV.
real_meta_path = "meta_data/real_meta.csv"
synthetic_meta_path = "meta_data/synthetic_meta.csv"

hologic_meta = mia.analysis.create_hologic_meta_data(hologic, real_meta_path)
phantom_meta = mia.analysis.create_synthetic_meta_data(phantom, synthetic_meta_path)

# Name the phantom metadata index explicitly.
phantom_meta.index.name = 'img_name'

Prepare the BI-RADS/VBD labels for both datasets.



In [4]:

    
# Class label per image: the BIRADS column for the real images (after dropping
# duplicated metadata rows) and the 'VBD.1' column for the phantoms.
hologic_labels = hologic_meta.drop_duplicates()['BIRADS']
phantom_labels = phantom_meta['VBD.1']

# Stack both label series into one, indexed by image name.
class_labels = pd.concat((hologic_labels, phantom_labels))
class_labels.index.name = "img_name"

# Keep element 0 of whatever remove_duplicate_index returns for the label series.
labels = mia.analysis.remove_duplicate_index(class_labels)[0]

Creating Features

Create intensity features by averaging the line intensity statistics of each image.



In [55]:

    
# Build one feature row per image: keep the columns from position 4 onwards
# and average all rows that share the same index value.
# `.mean()` replaces `.agg(np.mean)`: passing the NumPy callable to `agg` is
# deprecated in recent pandas (FutureWarning) and resolves to the same
# group-wise mean.
hologic_intensity_features = hologic.iloc[:, 4:].groupby(level=0).mean()
phantom_intensity_features = phantom.iloc[:, 4:].groupby(level=0).mean()

Take a random subset of the phantom mammograms. This is important so that no single phantom case is over-represented.



In [6]:

    
# One metadata row per phantom image.
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)

# Attach the phantom name to every feature row. This mutates
# phantom_intensity_features in place.
# NOTE(review): the values are assigned positionally (as a plain list), so this
# assumes both tables list the images in the same order - confirm.
phantom_names = syn_feature_meta['phantom_name'].tolist()
phantom_intensity_features['phantom_name'] = phantom_names

# Draw the random subset keyed on 'phantom_name'.
# NOTE(review): no random seed is set in this notebook, so the subset may
# differ between runs depending on how create_random_subset samples - confirm.
phantom_intensity_features_subset = mia.analysis.create_random_subset(
    phantom_intensity_features, 'phantom_name'
)

# The real (Hologic) images are not subsampled per patient here.

Combine the features from both datasets.



In [7]:

    
# Stack the real features on top of the phantom subset features.
features = pd.concat((hologic_intensity_features, phantom_intensity_features_subset))

# Sanity check: the combined table is expected to hold exactly 366 images.
assert len(features) == 366
features.head()









    Out[7]:






  
    
      
      mean
      std
      min
      25%
      50%
      75%
      max
      skew
      kurtosis
    
  
  
    
      p214-010-60001-cl.png
       0.362584
       0.099563
       0.176311
       0.290825
       0.350695
       0.425325
       0.651941
       0.554375
       0.564813
    
    
      p214-010-60001-cr.png
       0.326219
       0.102222
       0.153672
       0.250758
       0.317584
       0.391641
       0.590179
       0.510056
       0.047178
    
    
      p214-010-60001-ml.png
       0.394061
       0.102072
       0.194879
       0.324951
       0.387695
       0.456746
       0.673763
       0.354676
       0.266671
    
    
      p214-010-60001-mr.png
       0.346520
       0.092847
       0.181811
       0.286208
       0.339103
       0.398680
       0.605823
       0.455182
       0.255500
    
    
      p214-010-60005-cl.png
       0.406239
       0.110257
       0.187572
       0.328258
       0.400346
       0.478228
       0.672765
       0.219887
      -0.055904

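Select the features to analyse. Some features, such as the min, could be filtered out here to remove noise; currently no filtering is applied and all features are kept.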



In [8]:

    
# No columns are removed at this point: work on an independent (deep) copy of
# the full feature table so later steps cannot alter `features`.
selected_features = features.copy(deep=True)

Compare Real and Synthetic Features

Compare the distributions of features detected from the real mammograms and the phantoms using the Kolmogorov-Smirnov two sample test.



In [56]:

    
# Two-sample Kolmogorov-Smirnov test for every selected feature column.
# Note: the real features are compared against the full phantom feature table
# (phantom_intensity_features), not against the random subset.
ks_stats = []
for feature_name in selected_features.columns:
    real_values = hologic_intensity_features[feature_name]
    phantom_values = phantom_intensity_features[feature_name]
    # Each result unpacks to [statistic, p-value].
    ks_stats.append(list(stats.ks_2samp(real_values, phantom_values)))

# One row per feature; also export the table as LaTeX.
ks_test = pd.DataFrame(
    ks_stats,
    index=selected_features.columns,
    columns=['KS', 'p-value'],
)
ks_test.to_latex("tables/line_intensity_features_ks.tex")
ks_test









    Out[56]:






  
    
      
      KS
      p-value
    
  
  
    
      mean
       1.000000
       3.587622e-59
    
    
      std
       0.966667
       2.546525e-55
    
    
      min
       1.000000
       3.587622e-59
    
    
      25%
       1.000000
       3.587622e-59
    
    
      50%
       1.000000
       3.587622e-59
    
    
      75%
       1.000000
       3.587622e-59
    
    
      max
       1.000000
       3.587622e-59
    
    
      skew
       1.000000
       3.587622e-59
    
    
      kurtosis
       0.213889
       4.106586e-03

Dimensionality Reduction

t-SNE

Running t-SNE to obtain a two dimensional representation.



In [10]:

    
# Keyword arguments shared by the 2D and 3D t-SNE runs below.
# NOTE(review): no random seed is included, so the embeddings may differ
# between runs unless mia.analysis.tSNE seeds internally - confirm.
kwargs = dict(
    learning_rate=300,
    perplexity=30,
    verbose=1,
)



In [11]:

    
# Two-component t-SNE embedding of the selected features. The call returns a
# pair: the mapping and an error value. `error` is reused by later cells.
SNE_mapping_2d, error = mia.analysis.tSNE(
    selected_features,
    n_components=2,
    **kwargs
)









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 0.599228
[t-SNE] Error after 83 iterations with early exaggeration: 13.517177
[t-SNE] Error after 280 iterations: 0.549352



In [12]:

    
# Plot the 2D t-SNE mapping, passing the real and phantom image indices
# separately along with the class labels, then save the figure.
mia.plotting.plot_mapping_2d(
    SNE_mapping_2d,
    hologic_intensity_features.index,
    phantom_intensity_features_subset.index,
    labels,
)
plt.savefig('figures/mappings/line_intensity_SNE_mapping_2d.png', dpi=300)

Running t-SNE to obtain a 3 dimensional mapping



In [13]:

    
# Three-component t-SNE embedding using the same settings as the 2D run.
# `error` is overwritten with the value returned by this run.
SNE_mapping_3d, error = mia.analysis.tSNE(
    selected_features,
    n_components=3,
    **kwargs
)









    



[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 0.599228
[t-SNE] Error after 83 iterations with early exaggeration: 13.324412
[t-SNE] Error after 173 iterations: 0.359292



In [43]:

    
# Plot the 3D t-SNE mapping, passing the real and phantom image indices
# separately along with the class labels.
mia.plotting.plot_mapping_3d(
    SNE_mapping_3d,
    hologic_intensity_features.index,
    phantom_intensity_features_subset.index,
    labels,
)









    Out[43]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x101fef190>

Isomap

Running Isomap to obtain a 2 dimensional mapping



In [15]:

    
# Keyword arguments shared by the 2D and 3D Isomap runs below.
iso_kwargs = dict(n_neighbors=10)



In [16]:

    
# Two-component Isomap embedding of the selected features. The call returns a
# pair: the mapping and an error value (`error` is overwritten here).
iso_mapping_2d, error = mia.analysis.isomap(
    selected_features,
    n_components=2,
    **iso_kwargs
)



In [58]:

    
# Plot the 2D Isomap mapping and save the figure.
mia.plotting.plot_mapping_2d(
    iso_mapping_2d,
    hologic_intensity_features.index,
    phantom_intensity_features_subset.index,
    labels,
)
plt.savefig('figures/mappings/line_intensity_iso_mapping_2d.png', dpi=300)

# Split the images on the sign of mapping column 1 and show how the summary
# statistics of the two groups differ (positive group minus the rest).
positive_side = iso_mapping_2d[1] > 0
non_positive_side = iso_mapping_2d[1] <= 0
features[positive_side].describe() - features[non_positive_side].describe()



In [18]:

    
# Three-component Isomap embedding using the same settings as the 2D run.
# `error` is overwritten with the value returned by this run.
iso_mapping_3d, error = mia.analysis.isomap(
    selected_features,
    n_components=3,
    **iso_kwargs
)



In [45]:

    
# Plot the 3D Isomap mapping, passing the real and phantom image indices
# separately along with the class labels.
mia.plotting.plot_mapping_3d(
    iso_mapping_3d,
    hologic_intensity_features.index,
    phantom_intensity_features_subset.index,
    labels,
)









    Out[45]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x10204ae10>

Locally Linear Embedding

Running locally linear embedding to obtain 2d mapping



In [20]:

    
# Keyword arguments shared by the 2D and 3D locally linear embedding runs below.
lle_kwargs = dict(n_neighbors=10)



In [21]:

    
# Two-component locally linear embedding of the selected features. The call
# returns a pair: the mapping and an error value (`error` is overwritten here).
lle_mapping_2d, error = mia.analysis.lle(
    selected_features,
    n_components=2,
    **lle_kwargs
)



In [22]:

    
# Plot the 2D LLE mapping, passing the real and phantom image indices
# separately along with the class labels, then save the figure.
mia.plotting.plot_mapping_2d(
    lle_mapping_2d,
    hologic_intensity_features.index,
    phantom_intensity_features_subset.index,
    labels,
)
plt.savefig('figures/mappings/line_intensity_lle_mapping_2d.png', dpi=300)



In [23]:

    
# Three-component locally linear embedding using the same settings as the 2D
# run. `error` is overwritten with the value returned by this run.
lle_mapping_3d, error = mia.analysis.lle(
    selected_features,
    n_components=3,
    **lle_kwargs
)



In [46]:

    
# Plot the 3D LLE mapping, passing the real and phantom image indices
# separately along with the class labels.
mia.plotting.plot_mapping_3d(
    lle_mapping_3d,
    hologic_intensity_features.index,
    phantom_intensity_features_subset.index,
    labels,
)









    Out[46]:





<matplotlib.axes._subplots.Axes3DSubplot at 0x108282a50>

Quality Assessment of Dimensionality Reduction

Assess the quality of the dimensionality reduction (DR) using measures derived from the co-ranking matrices. First, create a co-ranking matrix for each of the dimensionality reduction mappings.



In [27]:

    
# Exclusive upper bound on the neighbourhood size k used by the quality
# measures below: they iterate over range(1, max_k) or range(2, max_k),
# so the largest k evaluated is max_k - 1.
max_k = 50



In [28]:

    
# Co-ranking matrix of the selected features against each low-dimensional
# mapping, first for the 2D mappings and then for the 3D ones.
SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm = [
    mia.coranking.coranking_matrix(selected_features, mapping)
    for mapping in (SNE_mapping_2d, iso_mapping_2d, lle_mapping_2d)
]

SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm = [
    mia.coranking.coranking_matrix(selected_features, mapping)
    for mapping in (SNE_mapping_3d, iso_mapping_3d, lle_mapping_3d)
]

2D Mappings



In [29]:

    
# Trustworthiness of each 2D mapping for k = 1 .. max_k - 1.
SNE_trustworthiness_2d, iso_trustworthiness_2d, lle_trustworthiness_2d = [
    [mia.coranking.trustworthiness(coranking, k) for k in range(1, max_k)]
    for coranking in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
]



In [30]:

    
# One column per method, one row per neighbourhood size k.
trustworthiness_df = pd.DataFrame(
    [SNE_trustworthiness_2d, iso_trustworthiness_2d, lle_trustworthiness_2d],
    index=['SNE', 'Isomap', 'LLE'],
).T
# The scores were computed for k = 1, 2, ...; label the rows with k instead of
# the default 0-based position so the x-axis of the plot reads as k.
trustworthiness_df.index = pd.Index(range(1, len(trustworthiness_df) + 1), name='k')

ax = trustworthiness_df.plot()
ax.set_xlabel('k')
ax.set_ylabel('Trustworthiness')
plt.savefig('figures/quality_measures/line_intensity_trustworthiness_2d.png', dpi=300)



In [31]:

    
# Continuity of each 2D mapping for k = 1 .. max_k - 1.
SNE_continuity_2d, iso_continuity_2d, lle_continuity_2d = [
    [mia.coranking.continuity(coranking, k) for k in range(1, max_k)]
    for coranking in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
]



In [32]:

    
# One column per method, one row per neighbourhood size k.
continuity_df = pd.DataFrame(
    [SNE_continuity_2d, iso_continuity_2d, lle_continuity_2d],
    index=['SNE', 'Isomap', 'LLE'],
).T
# The scores were computed for k = 1, 2, ...; label the rows with k instead of
# the default 0-based position so the x-axis of the plot reads as k.
continuity_df.index = pd.Index(range(1, len(continuity_df) + 1), name='k')

ax = continuity_df.plot()
ax.set_xlabel('k')
ax.set_ylabel('Continuity')
plt.savefig('figures/quality_measures/line_intensity_continuity_2d.png', dpi=300)



In [33]:

    
# LCMC of each 2D mapping for k = 2 .. max_k - 1 (note the start at k = 2).
SNE_lcmc_2d, iso_lcmc_2d, lle_lcmc_2d = [
    [mia.coranking.LCMC(coranking, k) for k in range(2, max_k)]
    for coranking in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
]



In [34]:

    
# One column per method, one row per neighbourhood size k.
lcmc_df = pd.DataFrame(
    [SNE_lcmc_2d, iso_lcmc_2d, lle_lcmc_2d],
    index=['SNE', 'Isomap', 'LLE'],
).T
# The LCMC scores were computed for k = 2, 3, ...; label the rows with k
# instead of the default 0-based position so the x-axis of the plot reads as k.
lcmc_df.index = pd.Index(range(2, len(lcmc_df) + 2), name='k')

ax = lcmc_df.plot()
ax.set_xlabel('k')
ax.set_ylabel('LCMC')
plt.savefig('figures/quality_measures/line_intensity_lcmc_2d.png', dpi=300)

3D Mappings



In [35]:

    
# Trustworthiness of each 3D mapping for k = 1 .. max_k - 1.
SNE_trustworthiness_3d, iso_trustworthiness_3d, lle_trustworthiness_3d = [
    [mia.coranking.trustworthiness(coranking, k) for k in range(1, max_k)]
    for coranking in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
]



In [36]:

    
# One column per method, one row per neighbourhood size k.
trustworthiness3d_df = pd.DataFrame(
    [SNE_trustworthiness_3d, iso_trustworthiness_3d, lle_trustworthiness_3d],
    index=['SNE', 'Isomap', 'LLE'],
).T
# The scores were computed for k = 1, 2, ...; label the rows with k instead of
# the default 0-based position so the x-axis of the plot reads as k.
trustworthiness3d_df.index = pd.Index(range(1, len(trustworthiness3d_df) + 1), name='k')

ax = trustworthiness3d_df.plot()
ax.set_xlabel('k')
ax.set_ylabel('Trustworthiness')
plt.savefig('figures/quality_measures/line_intensity_trustworthiness_3d.png', dpi=300)



In [37]:

    
# Continuity of each 3D mapping for k = 1 .. max_k - 1.
SNE_continuity_3d, iso_continuity_3d, lle_continuity_3d = [
    [mia.coranking.continuity(coranking, k) for k in range(1, max_k)]
    for coranking in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
]



In [38]:

    
# One column per method, one row per neighbourhood size k.
continuity3d_df = pd.DataFrame(
    [SNE_continuity_3d, iso_continuity_3d, lle_continuity_3d],
    index=['SNE', 'Isomap', 'LLE'],
).T
# The scores were computed for k = 1, 2, ...; label the rows with k instead of
# the default 0-based position so the x-axis of the plot reads as k.
continuity3d_df.index = pd.Index(range(1, len(continuity3d_df) + 1), name='k')

ax = continuity3d_df.plot()
ax.set_xlabel('k')
ax.set_ylabel('Continuity')
plt.savefig('figures/quality_measures/line_intensity_continuity_3d.png', dpi=300)



In [39]:

    
# LCMC of each 3D mapping for k = 2 .. max_k - 1 (note the start at k = 2).
SNE_lcmc_3d, iso_lcmc_3d, lle_lcmc_3d = [
    [mia.coranking.LCMC(coranking, k) for k in range(2, max_k)]
    for coranking in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
]



In [40]:

    
# One column per method, one row per neighbourhood size k.
lcmc3d_df = pd.DataFrame(
    [SNE_lcmc_3d, iso_lcmc_3d, lle_lcmc_3d],
    index=['SNE', 'Isomap', 'LLE'],
).T
# The LCMC scores were computed for k = 2, 3, ...; label the rows with k
# instead of the default 0-based position so the x-axis of the plot reads as k.
lcmc3d_df.index = pd.Index(range(2, len(lcmc3d_df) + 2), name='k')

ax = lcmc3d_df.plot()
ax.set_xlabel('k')
ax.set_ylabel('LCMC')
plt.savefig('figures/quality_measures/line_intensity_lcmc_3d.png', dpi=300)

	mean	std	min	25%	50%	75%	max	skew	kurtosis
count	-2.000000	-2.000000	-2.000000	-2.000000	-2.000000	-2.000000	-2.000000	-2.000000	-2.000000
mean	-0.003513	-0.021104	0.036702	0.011160	-0.003569	-0.018553	-0.039642	0.120643	0.351995
std	0.061442	0.001805	0.071537	0.066832	0.061887	0.056479	0.040878	0.069032	0.354517
min	-0.006950	-0.069210	0.024354	0.007557	-0.005128	-0.026422	-0.070078	-0.634472	0.076846
25%	-0.021067	-0.018684	0.012954	-0.012430	-0.020751	-0.031740	-0.059877	0.173825	0.253910
50%	-0.025319	-0.018390	0.019640	-0.008693	-0.024878	-0.037709	-0.054216	0.155946	0.251659
75%	-0.014675	-0.021351	0.020545	-0.000035	-0.016425	-0.033499	-0.039873	0.116317	0.313281
max	0.459748	-0.029071	0.634287	0.525722	0.458126	0.396945	0.236608	0.183788	3.838945

	mean	std	min	25%	50%	75%	max	skew	kurtosis
p214-010-60001-cl.png	0.362584	0.099563	0.176311	0.290825	0.350695	0.425325	0.651941	0.554375	0.564813
p214-010-60001-cr.png	0.326219	0.102222	0.153672	0.250758	0.317584	0.391641	0.590179	0.510056	0.047178
p214-010-60001-ml.png	0.394061	0.102072	0.194879	0.324951	0.387695	0.456746	0.673763	0.354676	0.266671
p214-010-60001-mr.png	0.346520	0.092847	0.181811	0.286208	0.339103	0.398680	0.605823	0.455182	0.255500
p214-010-60005-cl.png	0.406239	0.110257	0.187572	0.328258	0.400346	0.478228	0.672765	0.219887	-0.055904

	KS	p-value
mean	1.000000	3.587622e-59
std	0.966667	2.546525e-55
min	1.000000	3.587622e-59
25%	1.000000	3.587622e-59
50%	1.000000	3.587622e-59
75%	1.000000	3.587622e-59
max	1.000000	3.587622e-59
skew	1.000000	3.587622e-59
kurtosis	0.213889	4.106586e-03