%matplotlib inline
import os.path
import re

import numpy as np
import pandas as pd
from sklearn import manifold, decomposition

import mia

Organise the meta data

Build the metadata table from the patient IDs and the BIRADS classes.

# Directories holding the mammogram PNGs and their corresponding masks.
image_directory = '/Volumes/Seagate/MammoData/pngs'
masks_directory = '/Volumes/Seagate/MammoData/masks'

# File names look like "p<3 digits>-<3 digits>-<5 digits>-<2 letters>.png":
# group 1 is the dashed patient number, group 2 is a two-letter code.
name_regex = re.compile(r"p(\d{3}-\d{3}-\d{5})-([a-z]{2})\.png")

# Parallel lists with one entry per image, in directory-iteration order.
img_names, patient_id, views, sides = [], [], [], []
for img_path, msk_path in mia.io_tools.iterate_directory(image_directory, masks_directory):
    name = os.path.basename(img_path)
    m = name_regex.match(name)
    if m is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError("unexpected image file name: %r" % name)

    # NOTE(review): the original loop body was truncated in this file; the
    # lines below are reconstructed from the regex groups and from how the
    # lists are consumed further down -- confirm against the original notebook.
    num = m.group(1).replace('-', '')
    # Presumably the first letter encodes the view and the second the side.
    view, side = m.group(2)

    img_names.append(name)
    patient_id.append(num)
    views.append(view)
    sides.append(side)
# Load the precomputed feature array and label its rows with the image names
# (assumes the rows are stored in the same order as img_names -- TODO confirm).
feature_matrix = np.load('../raw_images.npy')
df = pd.DataFrame(feature_matrix, index=img_names)

# Metadata frame: one row per image, one column per parsed file-name field.
meta_columns = ['patient_id', 'view', 'side']
meta_values = np.array([patient_id, views, sides]).T
md = pd.DataFrame(meta_values, columns=meta_columns, index=img_names)

# Extend the metadata using the BIRADS CSV file.
md = mia.reduction.add_BIRADS_class(md, '../data/BIRADS.csv')

Run dimensionality reduction

First perform PCA to reduce the dimensionality of the data so that t-SNE can handle it. This should remove the least relevant components. Then run t-SNE on the reduced set of components.

# Pull the features out as a plain NumPy array. `DataFrame.as_matrix()` was
# deprecated in pandas 0.23 and removed in 1.0; `.values` returns the same
# array and works on both old and new pandas versions.
X = df.values

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

# Project the features onto the first 10 principal components; the reduced
# array replaces X and is what gets fed to t-SNE.
n_pca_components = 10
pca = decomposition.PCA(n_components=n_pca_components)
X = pca.fit_transform(X)

array([-44.03278639, -20.10827969,   7.64452451,  18.33399668,
        15.12215433,  -1.65562797,   2.37125881,  -1.20035446,
        10.02527258,  -5.16636072])

# t-SNE hyper-parameters, gathered in one place so they are easy to tweak.
tsne_params = dict(
    n_components=2,
    perplexity=50,
    early_exaggeration=2.0,
    verbose=2,
    learning_rate=300.0,
)

# Embed the PCA-reduced data into two dimensions.
tsne = manifold.TSNE(**tsne_params)
y = tsne.fit_transform(X)

%matplotlib qt
# Wrap the 2-D embedding in a frame that shares the metadata's index, then
# scatter-plot both embedding columns labelled by the 'class' metadata column.
embedding_index = md.index.values
mapping = pd.DataFrame(y, index=embedding_index)
mia.plotting.plot_scatter_2d(mapping, columns=[0, 1], labels=md['class'])

# Attach the class labels to the embedding and export it as JSON.
mapping['class'] = md['class']
mia.io_tools.dump_mapping_to_json(
    mapping,
    columns=[0, 1],
    output_file="../mapping_viz/data.json",
)