In [83]:
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import mia


Warning: Cannot change to a different GUI toolkit: qt. Using osx instead.

Loading and Preprocessing

Loading the hologic and synthetic datasets.


In [42]:
hologic = pd.DataFrame.from_csv("real_intensity_lines.csv")
hologic.drop(hologic.columns[:2], axis=1, inplace=True)
hologic.drop('breast_area', axis=1, inplace=True)

phantom = pd.DataFrame.from_csv("synthetic_intensity_lines.csv")
phantom.drop(phantom.columns[:2], axis=1, inplace=True)
phantom.drop('breast_area', axis=1, inplace=True)

Loading the meta data for the real and synthetic datasets.


In [43]:
hologic_meta = mia.analysis.create_hologic_meta_data(hologic, "meta_data/real_meta.csv")
phantom_meta = mia.analysis.create_synthetic_meta_data(phantom, 
                                                       "meta_data/synthetic_meta.csv")
phantom_meta.index.name = 'img_name'

Prepare the BI-RADS/VBD labels for both datasets.


In [44]:
hologic_labels = hologic_meta.drop_duplicates().BIRADS
phantom_labels = phantom_meta['VBD.1']

class_labels = pd.concat([hologic_labels, phantom_labels])
class_labels.index.name = "img_name"
labels = mia.analysis.remove_duplicate_index(class_labels)[0]

Creating Features

Create blob features from distribution of blobs


In [45]:
hologic_intensity_features = hologic[hologic.columns[4:]]
hologic_intensity_features = hologic_intensity_features.groupby(hologic.index).agg(np.mean)
phantom_intensity_features = phantom[phantom.columns[4:]]
phantom_intensity_features = phantom_intensity_features.groupby(phantom.index).agg(np.mean)

Take a random subset of the real mammograms. This is important so that each patient is not over represented.


In [46]:
hologic_intensity_features['patient_id'] = hologic_meta.drop_duplicates()['patient_id']
hologic_intensity_features_subset = mia.analysis.create_random_subset(hologic_intensity_features, 
                                                                      'patient_id')

Take a random subset of the phantom mammograms. This is important so that each case is not over represented.


In [47]:
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)
phantom_intensity_features['phantom_name'] = syn_feature_meta.phantom_name.tolist()
phantom_intensity_features_subset \
    = mia.analysis.create_random_subset(phantom_intensity_features, 'phantom_name')

Combine the features from both datasets.


In [48]:
features = pd.concat([hologic_intensity_features_subset, phantom_intensity_features_subset])
assert features.shape[0] == 96
features.head()


Out[48]:
mean std min 25% 50% 75% max skew kurtosis
p214-010-60001-cl.png 0.362584 0.099563 0.176311 0.290825 0.350695 0.425325 0.651941 0.554375 0.564813
p214-010-60005-cr.png 0.390749 0.115338 0.178942 0.308598 0.382695 0.464941 0.671299 0.285397 -0.168101
p214-010-60008-mr.png 0.380682 0.068131 0.252233 0.335284 0.374149 0.418809 0.586883 0.533288 0.537177
p214-010-60012-cr.png 0.310309 0.084479 0.157582 0.252724 0.301992 0.359711 0.574846 0.591287 0.876965
p214-010-60013-cl.png 0.328995 0.077626 0.185815 0.275454 0.321237 0.373890 0.553304 0.527032 0.352804

Filter some features, such as the min, to remove noise.


In [49]:
selected_features = features.copy()

Compare Real and Synthetic Features

Compare the distributions of features detected from the real mammograms and the phantoms using the Kolmogorov-Smirnov two sample test.


In [50]:
ks_stats = [list(stats.ks_2samp(hologic_intensity_features[col], 
                                phantom_intensity_features[col]))
                                for col in selected_features.columns]

ks_test = pd.DataFrame(ks_stats, columns=['KS', 'p-value'], index=selected_features.columns)
ks_test.to_latex("tables/line_intensity_features_ks.tex")
ks_test


Out[50]:
KS p-value
mean 1.000000 3.587622e-59
std 0.966667 2.546525e-55
min 1.000000 3.587622e-59
25% 1.000000 3.587622e-59
50% 1.000000 3.587622e-59
75% 1.000000 3.587622e-59
max 1.000000 3.587622e-59
skew 1.000000 3.587622e-59
kurtosis 0.213889 4.106586e-03

Dimensionality Reduction

t-SNE

Running t-SNE to obtain a two dimensional representation.


In [51]:
real_index = hologic_intensity_features_subset.index
phantom_index = phantom_intensity_features_subset.index

In [52]:
kwargs = {
    'learning_rate': 200,
    'perplexity': 20,
    'verbose': 1
}

In [53]:
SNE_mapping_2d = mia.analysis.tSNE(selected_features, n_components=2, **kwargs)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 96 / 96
[t-SNE] Mean sigma: 0.714076
[t-SNE] Error after 65 iterations with early exaggeration: 12.750609
[t-SNE] Error after 130 iterations: 0.743695

In [54]:
mia.plotting.plot_mapping_2d(SNE_mapping_2d, real_index, phantom_index, labels)
plt.savefig('figures/mappings/line_intensity_SNE_mapping_2d.png', dpi=300)


Running t-SNE to obtain a 3 dimensional mapping


In [55]:
SNE_mapping_3d = mia.analysis.tSNE(selected_features, n_components=3, **kwargs)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 96 / 96
[t-SNE] Mean sigma: 0.714076
[t-SNE] Error after 100 iterations with early exaggeration: 16.315305
[t-SNE] Error after 297 iterations: 2.612050

In [84]:
mia.plotting.plot_mapping_3d(SNE_mapping_3d, real_index, phantom_index, labels)


Out[84]:
<matplotlib.axes._subplots.Axes3DSubplot at 0x115f941d0>

Isomap

Running Isomap to obtain a 2 dimensional mapping


In [57]:
iso_kwargs = {
    'n_neighbors': 4,
}

In [58]:
iso_mapping_2d = mia.analysis.isomap(selected_features, n_components=2, **iso_kwargs)

In [59]:
mia.plotting.plot_mapping_2d(iso_mapping_2d, real_index, phantom_index, labels)
plt.savefig('figures/mappings/line_intensity_iso_mapping_2d.png', dpi=300)



In [60]:
iso_mapping_3d = mia.analysis.isomap(selected_features, n_components=3, **iso_kwargs)

In [86]:
mia.plotting.plot_mapping_3d(iso_mapping_3d, real_index, phantom_index, labels)


Out[86]:
<matplotlib.axes._subplots.Axes3DSubplot at 0x117134a90>

Locally Linear Embedding

Running locally linear embedding to obtain 2d mapping


In [62]:
lle_kwargs = {
    'n_neighbors': 4,
}

In [63]:
lle_mapping_2d = mia.analysis.lle(selected_features, n_components=2, **lle_kwargs)

In [64]:
mia.plotting.plot_mapping_2d(lle_mapping_2d, real_index, phantom_index, labels)
plt.savefig('figures/mappings/line_intensity_lle_mapping_2d.png', dpi=300)



In [65]:
lle_mapping_3d = mia.analysis.lle(selected_features, n_components=3, **lle_kwargs)

In [87]:
mia.plotting.plot_mapping_3d(lle_mapping_3d, real_index, phantom_index, labels)


Out[87]:
<matplotlib.axes._subplots.Axes3DSubplot at 0x115c09d90>

Quality Assessment of Dimensionality Reduction

Assess the quality of the DR against measurements from the co-ranking matrices. First create co-ranking matrices for each of the dimensionality reduction mappings


In [67]:
max_k = 50

In [68]:
SNE_mapping_2d_cm = mia.coranking.coranking_matrix(selected_features, 
                                                   SNE_mapping_2d)
iso_mapping_2d_cm = mia.coranking.coranking_matrix(selected_features, 
                                                   iso_mapping_2d)
lle_mapping_2d_cm = mia.coranking.coranking_matrix(selected_features, 
                                                   lle_mapping_2d)

SNE_mapping_3d_cm = mia.coranking.coranking_matrix(selected_features, 
                                                   SNE_mapping_3d)
iso_mapping_3d_cm = mia.coranking.coranking_matrix(selected_features, 
                                                   iso_mapping_3d)
lle_mapping_3d_cm = mia.coranking.coranking_matrix(selected_features, 
                                                   lle_mapping_3d)

2D Mappings


In [69]:
SNE_trustworthiness_2d = [mia.coranking.trustworthiness(SNE_mapping_2d_cm, k) 
                          for k in range(1, max_k)]
iso_trustworthiness_2d = [mia.coranking.trustworthiness(iso_mapping_2d_cm, k) 
                          for k in range(1, max_k)]
lle_trustworthiness_2d = [mia.coranking.trustworthiness(lle_mapping_2d_cm, k) 
                          for k in range(1, max_k)]

In [70]:
trustworthiness_df = pd.DataFrame([SNE_trustworthiness_2d,
                                   iso_trustworthiness_2d,
                                   lle_trustworthiness_2d], 
                                   index=['SNE', 'Isomap', 'LLE']).T
trustworthiness_df.plot()
plt.savefig('figures/quality_measures/line_intensity_trustworthiness_2d.png', dpi=300)



In [71]:
SNE_continuity_2d = [mia.coranking.continuity(SNE_mapping_2d_cm, k) 
                     for k in range(1, max_k)]
iso_continuity_2d = [mia.coranking.continuity(iso_mapping_2d_cm, k) 
                     for k in range(1, max_k)]
lle_continuity_2d = [mia.coranking.continuity(lle_mapping_2d_cm, k) 
                     for k in range(1, max_k)]

In [72]:
continuity_df = pd.DataFrame([SNE_continuity_2d,
                              iso_continuity_2d,
                              lle_continuity_2d], 
                              index=['SNE', 'Isomap', 'LLE']).T
continuity_df.plot()
plt.savefig('figures/quality_measures/line_intensity_continuity_2d.png', dpi=300)



In [73]:
SNE_lcmc_2d = [mia.coranking.LCMC(SNE_mapping_2d_cm, k) 
               for k in range(2, max_k)]
iso_lcmc_2d = [mia.coranking.LCMC(iso_mapping_2d_cm, k) 
               for k in range(2, max_k)]
lle_lcmc_2d = [mia.coranking.LCMC(lle_mapping_2d_cm, k) 
               for k in range(2, max_k)]

In [74]:
lcmc_df = pd.DataFrame([SNE_lcmc_2d,
                        iso_lcmc_2d,
                        lle_lcmc_2d], 
                        index=['SNE', 'Isomap', 'LLE']).T
lcmc_df.plot()
plt.savefig('figures/quality_measures/line_intensity_lcmc_2d.png', dpi=300)


3D Mappings


In [75]:
SNE_trustworthiness_3d = [mia.coranking.trustworthiness(SNE_mapping_3d_cm, k) 
                          for k in range(1, max_k)]
iso_trustworthiness_3d = [mia.coranking.trustworthiness(iso_mapping_3d_cm, k) 
                          for k in range(1, max_k)]
lle_trustworthiness_3d = [mia.coranking.trustworthiness(lle_mapping_3d_cm, k) 
                          for k in range(1, max_k)]

In [76]:
trustworthiness3d_df = pd.DataFrame([SNE_trustworthiness_3d,
                                   iso_trustworthiness_3d,
                                   lle_trustworthiness_3d], 
                                   index=['SNE', 'Isomap', 'LLE']).T
trustworthiness3d_df.plot()
plt.savefig('figures/quality_measures/line_intensity_trustworthiness_3d.png', dpi=300)



In [77]:
SNE_continuity_3d = [mia.coranking.continuity(SNE_mapping_3d_cm, k) 
                     for k in range(1, max_k)]
iso_continuity_3d = [mia.coranking.continuity(iso_mapping_3d_cm, k) 
                     for k in range(1, max_k)]
lle_continuity_3d = [mia.coranking.continuity(lle_mapping_3d_cm, k) 
                     for k in range(1, max_k)]

In [78]:
continuity3d_df = pd.DataFrame([SNE_continuity_3d,
                              iso_continuity_3d,
                              lle_continuity_3d], 
                              index=['SNE', 'Isomap', 'LLE']).T
continuity3d_df.plot()
plt.savefig('figures/quality_measures/line_intensity_continuity_3d.png', dpi=300)



In [79]:
SNE_lcmc_3d = [mia.coranking.LCMC(SNE_mapping_3d_cm, k) 
               for k in range(2, max_k)]
iso_lcmc_3d = [mia.coranking.LCMC(iso_mapping_3d_cm, k) 
               for k in range(2, max_k)]
lle_lcmc_3d = [mia.coranking.LCMC(lle_mapping_3d_cm, k) 
               for k in range(2, max_k)]

In [80]:
lcmc3d_df = pd.DataFrame([SNE_lcmc_3d,
                        iso_lcmc_3d,
                        lle_lcmc_3d], 
                        index=['SNE', 'Isomap', 'LLE']).T
lcmc3d_df.plot()
plt.savefig('figures/quality_measures/line_intensity_lcmc_3d.png', dpi=300)