In [98]:
%matplotlib qt
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import mia


Warning: Cannot change to a different GUI toolkit: qt. Using osx instead.

Loading and Preprocessing

Loading the hologic and synthetic datasets.


In [57]:
hologic = pd.DataFrame.from_csv("real_intensity.csv")
hologic.drop(hologic.columns[:2], axis=1, inplace=True)
hologic.drop('breast_area', axis=1, inplace=True)

phantom = pd.DataFrame.from_csv("synthetic_intensity.csv")
phantom.drop(phantom.columns[:2], axis=1, inplace=True)
phantom.drop('breast_area', axis=1, inplace=True)

Loading the meta data for the real and synthetic datasets.


In [58]:
hologic_meta = mia.analysis.create_hologic_meta_data(hologic, "meta_data/real_meta.csv")
phantom_meta = mia.analysis.create_synthetic_meta_data(phantom, "meta_data/synthetic_meta.csv")
phantom_meta.index.name = 'img_name'

Prepare the BI-RADS/VBD labels for both datasets.


In [59]:
hologic_labels = hologic_meta.drop_duplicates().BIRADS
phantom_labels = phantom_meta['VBD.1']

class_labels = pd.concat([hologic_labels, phantom_labels])
class_labels.index.name = "img_name"
labels = mia.analysis.remove_duplicate_index(class_labels)[0]

Creating Features

Create blob features from distribution of blobs


In [60]:
hologic_intensity_features = mia.analysis.group_by_scale_space(hologic)
phantom_intensity_features = mia.analysis.group_by_scale_space(phantom)

Take a random subset of the phantom mammograms. This is important so that each case is not over represented.


In [61]:
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)
phantom_intensity_features['phantom_name'] = syn_feature_meta.phantom_name.tolist()
phantom_intensity_features_subset = mia.analysis.create_random_subset(phantom_intensity_features, 'phantom_name')

# hologic_texture_features['patient_id'] = hologic_meta['patient_id'].drop_duplicates()
# hologic_texture_features_subset = mia.analysis.create_random_subset(hologic_texture_features, 'patient_id')

Combine the features from both datasets.


In [62]:
features = pd.concat([hologic_intensity_features, phantom_intensity_features_subset])
assert features.shape[0] == 366
features.head()


Out[62]:
count mean std min 25% 50% 75% max skew kurtosis ... count_9 mean_9 std_9 min_9 25%_9 50%_9 75%_9 max_9 skew_9 kurtosis_9
p214-010-60001-cl.png 256 0.610387 0.076183 0.411502 0.555738 0.613839 0.667148 0.781775 -0.176690 -0.383312 ... 131044 0.541212 0.118465 0.141845 0.462988 0.54614 0.624509 0.921247 -0.150691 0.148522
p214-010-60001-cr.png 256 0.558904 0.087279 0.328763 0.510450 0.570363 0.622144 0.724731 -0.406150 -0.127657 ... 131044 0.541212 0.118465 0.141845 0.462988 0.54614 0.624509 0.921247 -0.150691 0.148522
p214-010-60001-ml.png 256 0.566832 0.079640 0.349452 0.515433 0.572314 0.625658 0.741338 -0.329743 -0.087993 ... 131044 0.541212 0.118465 0.141845 0.462988 0.54614 0.624509 0.921247 -0.150691 0.148522
p214-010-60001-mr.png 256 0.534146 0.093002 0.287792 0.476245 0.543159 0.600705 0.728595 -0.344968 0.028718 ... 131044 0.541212 0.118465 0.141845 0.462988 0.54614 0.624509 0.921247 -0.150691 0.148522
p214-010-60005-cl.png 256 0.613984 0.074093 0.396148 0.565591 0.620705 0.668532 0.766788 -0.386427 -0.065749 ... 131044 0.683325 0.142954 0.079646 0.597345 0.70354 0.792035 0.995575 -0.848023 0.758542

5 rows × 100 columns

Filter some features, such as the min, to remove noise.


In [63]:
selected_features = features.copy()

Compare Real and Synthetic Features

Compare the distributions of features detected from the real mammograms and the phantoms using the Kolmogorov-Smirnov two sample test.


In [106]:
ks_stats = [list(stats.ks_2samp(hologic_intensity_features[col], 
                                phantom_intensity_features[col]))
                                for col in selected_features.columns]

ks_test = pd.DataFrame(ks_stats, columns=['KS', 'p-value'], index=selected_features.columns)
ks_test.to_latex("tables/intensity_features_ks.tex", longtable=True)
ks_test


Out[106]:
KS p-value
count 0.000000 1.000000e+00
mean 1.000000 3.587622e-59
std 0.876389 1.515491e-45
min 1.000000 3.587622e-59
25% 1.000000 3.587622e-59
50% 1.000000 3.587622e-59
75% 1.000000 3.587622e-59
max 1.000000 3.587622e-59
skew 0.891667 3.923764e-47
kurtosis 0.568056 2.209941e-19
count_1 0.000000 1.000000e+00
mean_1 1.000000 3.587622e-59
std_1 0.981944 4.539903e-57
min_1 1.000000 3.587622e-59
25%_1 1.000000 3.587622e-59
50%_1 1.000000 3.587622e-59
75%_1 1.000000 3.587622e-59
max_1 0.997222 7.598385e-59
skew_1 0.784722 1.335838e-36
kurtosis_1 0.466667 3.216681e-13
count_2 0.000000 1.000000e+00
mean_2 1.000000 3.587622e-59
std_2 1.000000 3.587622e-59
min_2 1.000000 3.587622e-59
25%_2 1.000000 3.587622e-59
50%_2 1.000000 3.587622e-59
75%_2 1.000000 3.587622e-59
max_2 0.994444 1.605940e-58
skew_2 0.501389 3.410114e-15
kurtosis_2 0.436111 1.342503e-11
... ... ...
count_7 0.000000 1.000000e+00
mean_7 1.000000 3.587622e-59
std_7 0.955556 4.577742e-54
min_7 1.000000 3.587622e-59
25%_7 1.000000 3.587622e-59
50%_7 1.000000 3.587622e-59
75%_7 1.000000 3.587622e-59
max_7 0.740278 1.280685e-32
skew_7 0.319444 2.024353e-06
kurtosis_7 0.304167 7.344875e-06
count_8 0.000000 1.000000e+00
mean_8 1.000000 3.587622e-59
std_8 0.929167 3.823290e-51
min_8 1.000000 3.587622e-59
25%_8 1.000000 3.587622e-59
50%_8 1.000000 3.587622e-59
75%_8 1.000000 3.587622e-59
max_8 0.736111 2.943248e-32
skew_8 0.547222 5.120902e-18
kurtosis_8 0.395833 1.248634e-09
count_9 0.000000 1.000000e+00
mean_9 1.000000 3.587622e-59
std_9 0.956944 3.195999e-54
min_9 1.000000 3.587622e-59
25%_9 1.000000 3.587622e-59
50%_9 1.000000 3.587622e-59
75%_9 0.997222 7.598385e-59
max_9 0.794444 1.674259e-37
skew_9 0.702778 1.934059e-29
kurtosis_9 0.891667 3.923764e-47

100 rows × 2 columns

Dimensionality Reduction

t-SNE

Running t-SNE to obtain a two dimensional representation.


In [65]:
kwargs = {
    'learning_rate': 300,
    'perplexity': 30,
    'verbose': 1
}

In [66]:
SNE_mapping_2d, error = mia.analysis.tSNE(selected_features, n_components=2, **kwargs)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 2.401870
[t-SNE] Error after 83 iterations with early exaggeration: 14.124506
[t-SNE] Error after 276 iterations: 0.673903

In [95]:
mia.plotting.plot_mapping_2d(SNE_mapping_2d, hologic_intensity_features.index, phantom_intensity_features_subset.index, labels)
plt.savefig('figures/mappings/intensity_SNE_mapping_2d.png', dpi=300)


Running t-SNE to obtain a 3 dimensional mapping


In [68]:
SNE_mapping_3d, error = mia.analysis.tSNE(selected_features, n_components=3, **kwargs)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 2.401870
[t-SNE] Error after 100 iterations with early exaggeration: 17.879216
[t-SNE] Error after 346 iterations: 1.711401

In [99]:
mia.plotting.plot_mapping_3d(SNE_mapping_3d, hologic_intensity_features.index, phantom_intensity_features_subset.index, labels)


Out[99]:
<matplotlib.axes._subplots.Axes3DSubplot at 0x10f3a8690>

Isomap

Running Isomap to obtain a 2 dimensional mapping


In [70]:
iso_kwargs = {
    'n_neighbors': 10,
}

In [71]:
iso_mapping_2d, error = mia.analysis.isomap(selected_features, n_components=2, **iso_kwargs)

In [72]:
mia.plotting.plot_mapping_2d(iso_mapping_2d, hologic_intensity_features.index, phantom_intensity_features_subset.index, labels)
plt.savefig('figures/mappings/intensity_iso_mapping_2d.png', dpi=300)



In [73]:
iso_mapping_3d, error = mia.analysis.isomap(selected_features, n_components=3, **iso_kwargs)

In [101]:
mia.plotting.plot_mapping_3d(iso_mapping_3d, hologic_intensity_features.index, phantom_intensity_features_subset.index, labels)


Out[101]:
<matplotlib.axes._subplots.Axes3DSubplot at 0x10e572ed0>

Locally Linear Embedding

Running locally linear embedding to obtain 2d mapping


In [75]:
lle_kwargs = {
    'n_neighbors': 10,
}

In [76]:
lle_mapping_2d, error = mia.analysis.lle(selected_features, n_components=2, **lle_kwargs)

In [77]:
mia.plotting.plot_mapping_2d(lle_mapping_2d, hologic_intensity_features.index, phantom_intensity_features_subset.index, labels)
plt.savefig('figures/mappings/intensity_lle_mapping_2d.png', dpi=300)



In [78]:
lle_mapping_3d, error = mia.analysis.lle(selected_features, n_components=3, **lle_kwargs)

In [102]:
mia.plotting.plot_mapping_3d(lle_mapping_3d, hologic_intensity_features.index, phantom_intensity_features_subset.index, labels)


Out[102]:
<matplotlib.axes._subplots.Axes3DSubplot at 0x107926a50>

Quality Assessment of Dimensionality Reduction

Assess the quality of the DR against measurements from the co-ranking matrices. First create co-ranking matrices for each of the dimensionality reduction mappings


In [80]:
max_k = 50

In [81]:
SNE_mapping_2d_cm = mia.coranking.coranking_matrix(selected_features, SNE_mapping_2d)
iso_mapping_2d_cm = mia.coranking.coranking_matrix(selected_features, iso_mapping_2d)
lle_mapping_2d_cm = mia.coranking.coranking_matrix(selected_features, lle_mapping_2d)

SNE_mapping_3d_cm = mia.coranking.coranking_matrix(selected_features, SNE_mapping_3d)
iso_mapping_3d_cm = mia.coranking.coranking_matrix(selected_features, iso_mapping_3d)
lle_mapping_3d_cm = mia.coranking.coranking_matrix(selected_features, lle_mapping_3d)

2D Mappings


In [82]:
SNE_trustworthiness_2d = [mia.coranking.trustworthiness(SNE_mapping_2d_cm, k) for k in range(1, max_k)]
iso_trustworthiness_2d = [mia.coranking.trustworthiness(iso_mapping_2d_cm, k) for k in range(1, max_k)]
lle_trustworthiness_2d = [mia.coranking.trustworthiness(lle_mapping_2d_cm, k) for k in range(1, max_k)]

In [83]:
trustworthiness_df = pd.DataFrame([SNE_trustworthiness_2d,
                                   iso_trustworthiness_2d,
                                   lle_trustworthiness_2d], 
                                   index=['SNE', 'Isomap', 'LLE']).T
trustworthiness_df.plot()
plt.savefig('figures/quality_measures/intensity_trustworthiness_2d.png', dpi=300)



In [84]:
SNE_continuity_2d = [mia.coranking.continuity(SNE_mapping_2d_cm, k) for k in range(1, max_k)]
iso_continuity_2d = [mia.coranking.continuity(iso_mapping_2d_cm, k) for k in range(1, max_k)]
lle_continuity_2d = [mia.coranking.continuity(lle_mapping_2d_cm, k) for k in range(1, max_k)]

In [85]:
continuity_df = pd.DataFrame([SNE_continuity_2d,
                              iso_continuity_2d,
                              lle_continuity_2d], 
                              index=['SNE', 'Isomap', 'LLE']).T
continuity_df.plot()
plt.savefig('figures/quality_measures/intensity_continuity_2d.png', dpi=300)



In [86]:
SNE_lcmc_2d = [mia.coranking.LCMC(SNE_mapping_2d_cm, k) for k in range(2, max_k)]
iso_lcmc_2d = [mia.coranking.LCMC(iso_mapping_2d_cm, k) for k in range(2, max_k)]
lle_lcmc_2d = [mia.coranking.LCMC(lle_mapping_2d_cm, k) for k in range(2, max_k)]

In [87]:
lcmc_df = pd.DataFrame([SNE_lcmc_2d,
                        iso_lcmc_2d,
                        lle_lcmc_2d], 
                        index=['SNE', 'Isomap', 'LLE']).T
lcmc_df.plot()
plt.savefig('figures/quality_measures/intensity_lcmc_2d.png', dpi=300)


3D Mappings


In [88]:
SNE_trustworthiness_3d = [mia.coranking.trustworthiness(SNE_mapping_3d_cm, k) for k in range(1, max_k)]
iso_trustworthiness_3d = [mia.coranking.trustworthiness(iso_mapping_3d_cm, k) for k in range(1, max_k)]
lle_trustworthiness_3d = [mia.coranking.trustworthiness(lle_mapping_3d_cm, k) for k in range(1, max_k)]

In [89]:
trustworthiness3d_df = pd.DataFrame([SNE_trustworthiness_3d,
                                   iso_trustworthiness_3d,
                                   lle_trustworthiness_3d], 
                                   index=['SNE', 'Isomap', 'LLE']).T
trustworthiness3d_df.plot()
plt.savefig('figures/quality_measures/intensity_trustworthiness_3d.png', dpi=300)



In [90]:
SNE_continuity_3d = [mia.coranking.continuity(SNE_mapping_3d_cm, k) for k in range(1, max_k)]
iso_continuity_3d = [mia.coranking.continuity(iso_mapping_3d_cm, k) for k in range(1, max_k)]
lle_continuity_3d = [mia.coranking.continuity(lle_mapping_3d_cm, k) for k in range(1, max_k)]

In [91]:
continuity3d_df = pd.DataFrame([SNE_continuity_3d,
                              iso_continuity_3d,
                              lle_continuity_3d], 
                              index=['SNE', 'Isomap', 'LLE']).T
continuity3d_df.plot()
plt.savefig('figures/quality_measures/intensity_continuity_3d.png', dpi=300)



In [92]:
SNE_lcmc_3d = [mia.coranking.LCMC(SNE_mapping_3d_cm, k) for k in range(2, max_k)]
iso_lcmc_3d = [mia.coranking.LCMC(iso_mapping_3d_cm, k) for k in range(2, max_k)]
lle_lcmc_3d = [mia.coranking.LCMC(lle_mapping_3d_cm, k) for k in range(2, max_k)]

In [93]:
lcmc3d_df = pd.DataFrame([SNE_lcmc_3d,
                        iso_lcmc_3d,
                        lle_lcmc_3d], 
                        index=['SNE', 'Isomap', 'LLE']).T
lcmc3d_df.plot()
plt.savefig('figures/quality_measures/intensity_lcmc_3d.png', dpi=300)