In [121]:
    
%matplotlib inline
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import mia
    
    
Loading the hologic and synthetic datasets.
In [83]:
    
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0. read_csv with
# index_col=0 and parse_dates=True reproduces from_csv's defaults (first column used
# as the index, dates parsed), so the loaded frames are unchanged.
hologic = pd.read_csv("hologic_blobs.csv", index_col=0, parse_dates=True)
phantom = pd.read_csv("synthetic_blobs.csv", index_col=0, parse_dates=True)
    
Load the metadata for the real and synthetic datasets.
In [84]:
    
# Build the meta data tables for both datasets from the blob frames and the
# corresponding meta data CSV files.
real_meta_csv = "meta_data/real_meta.csv"
synthetic_meta_csv = "meta_data/synthetic_meta.csv"
hologic_meta = mia.analysis.create_hologic_meta_data(hologic, real_meta_csv)
phantom_meta = mia.analysis.create_synthetic_meta_data(phantom, synthetic_meta_csv)
# Give the phantom meta data index the name 'img_name'.
phantom_meta.index.name = 'img_name'
    
Prepare the BI-RADS/VBD labels for both datasets.
In [85]:
    
# Class labels: the BIRADS column for the real images (after dropping duplicate
# meta data rows) and the 'VBD.1' column for the phantoms.
hologic_labels = hologic_meta.drop_duplicates()['BIRADS']
phantom_labels = phantom_meta['VBD.1']
# Stack both label series into one, indexed by 'img_name'.
class_labels = pd.concat([hologic_labels, phantom_labels])
class_labels.index.name = "img_name"
# NOTE(review): `[0]` selects item/column 0 of the helper's result; presumably that
# holds the label values — confirm against mia.analysis.remove_duplicate_index.
deduplicated_labels = mia.analysis.remove_duplicate_index(class_labels)
labels = deduplicated_labels[0]
    
Create blob features from distribution of blobs
In [86]:
    
# Derive one feature table per dataset from its detected blobs (real first, then phantom).
hologic_blob_features, phantom_blob_features = [
    mia.analysis.features_from_blobs(blobs) for blobs in (hologic, phantom)
]
    
Take a random subset of the phantom mammograms, grouped by phantom name. This is important so that no single case is over-represented.
In [87]:
    
# Attach the phantom name from the de-duplicated meta data to every phantom feature row.
# NOTE(review): the names are assigned positionally via .tolist(); this assumes the
# de-duplicated meta data rows are in the same order as phantom_blob_features — confirm.
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)
phantom_blob_features['phantom_name'] = syn_feature_meta.phantom_name.tolist()
# Draw the random subset keyed on the 'phantom_name' column.
# NOTE(review): no random seed is set in the visible cells, so the chosen subset may
# differ between runs unless create_random_subset seeds itself — confirm.
phantom_blob_features_subset = mia.analysis.create_random_subset(phantom_blob_features, 'phantom_name')
# Disabled: the analogous per-patient subsetting of the real (hologic) features.
# hologic_blob_features['patient_id'] = hologic_meta['patient_id'].drop_duplicates()
# hologic_blob_features_subset = mia.analysis.create_random_subset(hologic_blob_features, 'patient_id')
    
Combine the features from both datasets.
In [88]:
    
# Stack the real features on top of the phantom subset.
feature_frames = [hologic_blob_features, phantom_blob_features_subset]
features = pd.concat(feature_frames)
# Sanity check: the combined table is expected to hold exactly 366 rows.
assert len(features) == 366
features.head()
    
    Out[88]:
Filter out features that mostly add noise; here the minimum blob radius (`min_radius`) is dropped.
In [89]:
    
# Keep every feature column except 'min_radius'.
selected_features = features.drop('min_radius', axis=1)
    
Compare the distributions of features detected from the real mammograms and the phantoms using the Kolmogorov-Smirnov two sample test.
In [90]:
    
# Two-sample Kolmogorov-Smirnov test per selected feature: real vs. phantom values.
# NOTE(review): this compares against the full phantom_blob_features table, not
# phantom_blob_features_subset that is used everywhere else — confirm this is intended.
ks_stats = []
for col in selected_features.columns:
    ks_statistic, ks_p_value = stats.ks_2samp(hologic_blob_features[col],
                                              phantom_blob_features[col])
    ks_stats.append([ks_statistic, ks_p_value])
# One row per feature, holding the KS statistic and its p-value.
ks_test = pd.DataFrame(ks_stats,
                       columns=['KS', 'p-value'],
                       index=selected_features.columns)
ks_test.to_latex("tables/blob_features_ks.tex")
ks_test
    
    Out[90]:
In [91]:
    
# Keyword arguments forwarded to mia.analysis.tSNE for both the 2-D and 3-D runs.
kwargs = dict(learning_rate=300, perplexity=30, verbose=1)
    
In [92]:
    
# Run t-SNE on the selected features with 2 output components. The helper returns a
# pair: the mapping and a second value bound to `error`.
tsne_result_2d = mia.analysis.tSNE(selected_features, n_components=2, **kwargs)
SNE_mapping_2d, error = tsne_result_2d
    
    
In [93]:
    
# Plot the 2-D t-SNE mapping, passing the real and phantom row indices and the class
# labels to the plotting helper, then save the figure.
real_index = hologic_blob_features.index
phantom_index = phantom_blob_features_subset.index
mia.plotting.plot_mapping_2d(SNE_mapping_2d, real_index, phantom_index, labels)
plt.savefig('figures/mappings/blob_SNE_mapping_2d.png', dpi=300)
    
    
Run t-SNE to obtain a 3-dimensional mapping.
In [94]:
    
# Run t-SNE on the selected features with 3 output components. The helper returns a
# pair: the mapping and a second value bound to `error`.
tsne_result_3d = mia.analysis.tSNE(selected_features, n_components=3, **kwargs)
SNE_mapping_3d, error = tsne_result_3d
    
    
In [126]:
    
# Plot the 3-D t-SNE mapping; the call stays last so its result is the cell output.
real_index = hologic_blob_features.index
phantom_index = phantom_blob_features_subset.index
mia.plotting.plot_mapping_3d(SNE_mapping_3d, real_index, phantom_index, labels)
    
    Out[126]:
In [96]:
    
# Keyword arguments forwarded to mia.analysis.isomap for both the 2-D and 3-D runs.
iso_kwargs = dict(n_neighbors=10)
    
In [97]:
    
# Isomap embedding of the selected features with 2 output components. The second
# returned value is bound to `error` (presumably the reconstruction error — confirm).
iso_mapping_2d, error = mia.analysis.isomap(selected_features, n_components=2, **iso_kwargs)
# `print error` is a Python 2-only statement; print(error) with a single argument
# produces the same output on Python 2 and is also valid on Python 3.
print(error)
    
    
In [132]:
    
# Plot the 2-D Isomap mapping with the real/phantom row indices and class labels.
real_index = hologic_blob_features.index
phantom_index = phantom_blob_features_subset.index
mia.plotting.plot_mapping_2d(iso_mapping_2d, real_index, phantom_index, labels)
# Export only the rows of the mapping that belong to the real (hologic) images.
# NOTE(review): [0,1] presumably names the two mapping columns to export — confirm.
real_iso_mapping_2d = iso_mapping_2d.loc[real_index]
mia.io_tools.dump_mapping_to_json(real_iso_mapping_2d, [0,1], labels, 'data.json')
plt.savefig('figures/mappings/blob_iso_mapping_2d.png', dpi=300)
    
In [99]:
    
# Isomap embedding of the selected features with 3 output components. The second
# returned value is bound to `error` (presumably the reconstruction error — confirm).
iso_mapping_3d, error = mia.analysis.isomap(selected_features, n_components=3, **iso_kwargs)
# `print error` is a Python 2-only statement; print(error) with a single argument
# produces the same output on Python 2 and is also valid on Python 3.
print(error)
    
    
In [123]:
    
# Plot the 3-D Isomap mapping; the call stays last so its result is the cell output.
real_index = hologic_blob_features.index
phantom_index = phantom_blob_features_subset.index
mia.plotting.plot_mapping_3d(iso_mapping_3d, real_index, phantom_index, labels)
    
    Out[123]:
In [101]:
    
# Keyword arguments forwarded to mia.analysis.lle for both the 2-D and 3-D runs.
lle_kwargs = dict(n_neighbors=10)
    
In [102]:
    
# LLE embedding of the selected features with 2 output components. The second
# returned value is bound to `error` (presumably the reconstruction error — confirm).
lle_mapping_2d, error = mia.analysis.lle(selected_features, n_components=2, **lle_kwargs)
# `print error` is a Python 2-only statement; print(error) with a single argument
# produces the same output on Python 2 and is also valid on Python 3.
print(error)
    
    
In [103]:
    
# Plot the 2-D LLE mapping with the real/phantom row indices and class labels,
# then save the figure.
real_index = hologic_blob_features.index
phantom_index = phantom_blob_features_subset.index
mia.plotting.plot_mapping_2d(lle_mapping_2d, real_index, phantom_index, labels)
plt.savefig('figures/mappings/blob_lle_mapping_2d.png', dpi=300)
    
    
In [104]:
    
# LLE embedding of the selected features with 3 output components. The helper returns
# a pair: the mapping and a second value bound to `error`.
lle_result_3d = mia.analysis.lle(selected_features, n_components=3, **lle_kwargs)
lle_mapping_3d, error = lle_result_3d
    
In [124]:
    
# Plot the 3-D LLE mapping; the call stays last so its result is the cell output.
real_index = hologic_blob_features.index
phantom_index = phantom_blob_features_subset.index
mia.plotting.plot_mapping_3d(lle_mapping_3d, real_index, phantom_index, labels)
    
    Out[124]:
Assess the quality of the dimensionality reduction (DR) using measures derived from co-ranking matrices. First, create a co-ranking matrix for each of the dimensionality reduction mappings.
In [106]:
    
# Exclusive upper bound on the neighbourhood size k: the quality measures in this
# notebook iterate over range(1, max_k) or range(2, max_k), so the largest k
# actually evaluated is max_k - 1.
max_k = 50
    
In [107]:
    
# Co-ranking matrix of every mapping against the selected features, computed in the
# order: 2-D (t-SNE, Isomap, LLE), then 3-D (t-SNE, Isomap, LLE).
(SNE_mapping_2d_cm,
 iso_mapping_2d_cm,
 lle_mapping_2d_cm,
 SNE_mapping_3d_cm,
 iso_mapping_3d_cm,
 lle_mapping_3d_cm) = [
    mia.coranking.coranking_matrix(selected_features, mapping)
    for mapping in (SNE_mapping_2d, iso_mapping_2d, lle_mapping_2d,
                    SNE_mapping_3d, iso_mapping_3d, lle_mapping_3d)
]
    
In [108]:
    
# Trustworthiness of each 2-D mapping for k = 1 .. max_k - 1 (t-SNE, Isomap, LLE).
SNE_trustworthiness_2d, iso_trustworthiness_2d, lle_trustworthiness_2d = [
    [mia.coranking.trustworthiness(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
]
    
In [109]:
    
# One column per method, one row per evaluated neighbourhood size.
trustworthiness_df = pd.DataFrame([SNE_trustworthiness_2d,
                                   iso_trustworthiness_2d,
                                   lle_trustworthiness_2d],
                                  index=['SNE', 'Isomap', 'LLE']).T
# The curves were computed for k in range(1, max_k). Plotting the frame directly puts
# the 0-based row position on the x-axis, which is off by one from k. A copy indexed
# by the actual k values is plotted instead; trustworthiness_df keeps its original index.
trustworthiness_by_k = trustworthiness_df.copy()
trustworthiness_by_k.index = pd.Index(list(range(1, max_k)), name='k')
trustworthiness_by_k.plot()
plt.savefig('figures/quality_measures/blob_trustworthiness_2d.png', dpi=300)
    
    
In [110]:
    
# Continuity of each 2-D mapping for k = 1 .. max_k - 1 (t-SNE, Isomap, LLE).
SNE_continuity_2d, iso_continuity_2d, lle_continuity_2d = [
    [mia.coranking.continuity(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
]
    
In [111]:
    
# One column per method, one row per evaluated neighbourhood size.
continuity_df = pd.DataFrame([SNE_continuity_2d,
                              iso_continuity_2d,
                              lle_continuity_2d],
                             index=['SNE', 'Isomap', 'LLE']).T
# The curves were computed for k in range(1, max_k). Plotting the frame directly puts
# the 0-based row position on the x-axis, which is off by one from k. A copy indexed
# by the actual k values is plotted instead; continuity_df keeps its original index.
continuity_by_k = continuity_df.copy()
continuity_by_k.index = pd.Index(list(range(1, max_k)), name='k')
continuity_by_k.plot()
plt.savefig('figures/quality_measures/blob_continuity_2d.png', dpi=300)
    
    
In [112]:
    
# LCMC of each 2-D mapping for k = 2 .. max_k - 1 (t-SNE, Isomap, LLE).
SNE_lcmc_2d, iso_lcmc_2d, lle_lcmc_2d = [
    [mia.coranking.LCMC(cm, k) for k in range(2, max_k)]
    for cm in (SNE_mapping_2d_cm, iso_mapping_2d_cm, lle_mapping_2d_cm)
]
    
In [113]:
    
# One column per method, one row per evaluated neighbourhood size.
lcmc_df = pd.DataFrame([SNE_lcmc_2d,
                        iso_lcmc_2d,
                        lle_lcmc_2d],
                       index=['SNE', 'Isomap', 'LLE']).T
# The LCMC curves were computed for k in range(2, max_k). Plotting the frame directly
# puts the 0-based row position on the x-axis, which is off by two from k. A copy
# indexed by the actual k values is plotted instead; lcmc_df keeps its original index.
lcmc_by_k = lcmc_df.copy()
lcmc_by_k.index = pd.Index(list(range(2, max_k)), name='k')
lcmc_by_k.plot()
plt.savefig('figures/quality_measures/blob_lcmc_2d.png', dpi=300)
    
    
In [114]:
    
# Trustworthiness of each 3-D mapping for k = 1 .. max_k - 1 (t-SNE, Isomap, LLE).
SNE_trustworthiness_3d, iso_trustworthiness_3d, lle_trustworthiness_3d = [
    [mia.coranking.trustworthiness(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
]
    
In [115]:
    
# One column per method, one row per evaluated neighbourhood size.
trustworthiness3d_df = pd.DataFrame([SNE_trustworthiness_3d,
                                     iso_trustworthiness_3d,
                                     lle_trustworthiness_3d],
                                    index=['SNE', 'Isomap', 'LLE']).T
# The curves were computed for k in range(1, max_k). Plotting the frame directly puts
# the 0-based row position on the x-axis, which is off by one from k. A copy indexed
# by the actual k values is plotted instead; trustworthiness3d_df keeps its original index.
trustworthiness3d_by_k = trustworthiness3d_df.copy()
trustworthiness3d_by_k.index = pd.Index(list(range(1, max_k)), name='k')
trustworthiness3d_by_k.plot()
plt.savefig('figures/quality_measures/blob_trustworthiness_3d.png', dpi=300)
    
    
In [116]:
    
# Continuity of each 3-D mapping for k = 1 .. max_k - 1 (t-SNE, Isomap, LLE).
SNE_continuity_3d, iso_continuity_3d, lle_continuity_3d = [
    [mia.coranking.continuity(cm, k) for k in range(1, max_k)]
    for cm in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
]
    
In [117]:
    
# One column per method, one row per evaluated neighbourhood size.
continuity3d_df = pd.DataFrame([SNE_continuity_3d,
                                iso_continuity_3d,
                                lle_continuity_3d],
                               index=['SNE', 'Isomap', 'LLE']).T
# The curves were computed for k in range(1, max_k). Plotting the frame directly puts
# the 0-based row position on the x-axis, which is off by one from k. A copy indexed
# by the actual k values is plotted instead; continuity3d_df keeps its original index.
continuity3d_by_k = continuity3d_df.copy()
continuity3d_by_k.index = pd.Index(list(range(1, max_k)), name='k')
continuity3d_by_k.plot()
plt.savefig('figures/quality_measures/blob_continuity_3d.png', dpi=300)
    
    
In [118]:
    
# LCMC of each 3-D mapping for k = 2 .. max_k - 1 (t-SNE, Isomap, LLE).
SNE_lcmc_3d, iso_lcmc_3d, lle_lcmc_3d = [
    [mia.coranking.LCMC(cm, k) for k in range(2, max_k)]
    for cm in (SNE_mapping_3d_cm, iso_mapping_3d_cm, lle_mapping_3d_cm)
]
    
In [119]:
    
# One column per method, one row per evaluated neighbourhood size.
lcmc3d_df = pd.DataFrame([SNE_lcmc_3d,
                          iso_lcmc_3d,
                          lle_lcmc_3d],
                         index=['SNE', 'Isomap', 'LLE']).T
# The LCMC curves were computed for k in range(2, max_k). Plotting the frame directly
# puts the 0-based row position on the x-axis, which is off by two from k. A copy
# indexed by the actual k values is plotted instead; lcmc3d_df keeps its original index.
lcmc3d_by_k = lcmc3d_df.copy()
lcmc3d_by_k.index = pd.Index(list(range(2, max_k)), name='k')
lcmc3d_by_k.plot()
plt.savefig('figures/quality_measures/blob_lcmc_3d.png', dpi=300)