In [4]:
%matplotlib inline
import os.path
import re

import numpy as np
import pandas as pd
from sklearn import manifold, decomposition

import mia

Organise the metadata

Organise the metadata: extract the patient ID, view and side from each image file name, then attach the BIRADS class for each image.


In [52]:
# Locations of the mammogram PNGs and their masks.
# NOTE(review): these are absolute paths on an external volume; the cell raises
# "ValueError: ... is not a directory" whenever that volume is not mounted.
image_directory = '/Volumes/Seagate/MammoData/pngs'
masks_directory = '/Volumes/Seagate/MammoData/masks'

# Expected file name: "p<3 digits>-<3 digits>-<5 digits>-<2 lowercase letters>.png".
# Group 1 is the dashed patient number, group 2 the two-letter view/side code.
name_regex = re.compile(r"p(\d{3}-\d{3}-\d{5})-([a-z]{2})\.png")

img_names, patient_id, views, sides = [], [], [], []
for img_path, msk_path in mia.io_tools.iterate_directory(image_directory, masks_directory):
    name = os.path.basename(img_path)
    m = name_regex.match(name)
    if m is None:
        # Fail loudly instead of skipping: dropping a file here would silently
        # shift the rows of the metadata relative to the feature matrix.
        raise ValueError("Unexpected image file name: %r" % name)

    # Patient number with the dashes removed, stored as an integer.
    num = m.group(1).replace('-', '')
    # The two-letter code unpacks into one character each for view and side.
    view, side = m.group(2)

    img_names.append(name)
    views.append(view)
    sides.append(side)
    patient_id.append(int(num))

# Pre-computed image features.
# NOTE(review): presumably one row per image, in the same order as the directory
# iteration above -- confirm against whatever wrote raw_images.npy.
feature_matrix = np.load('../raw_images.npy')


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-52-ee707b6db428> in <module>()
      5 
      6 img_names, patient_id, views, sides = [], [], [], []
----> 7 for img_path, msk_path in mia.io_tools.iterate_directory(image_directory, masks_directory):
      8     name = os.path.basename(img_path)
      9     img_names.append(name)

/Users/samuel/git/major-project/src/mia/io_tools.pyc in iterate_directory(directory, mask_directory)
     16     :returns: iterator to the image paths in the directory
     17     """
---> 18     check_is_directory(directory)
     19 
     20     for img_name in os.listdir(directory):

/Users/samuel/git/major-project/src/mia/io_tools.pyc in check_is_directory(directory)
     51     """
     52     if not os.path.isdir(directory):
---> 53         raise ValueError("%s is not a directory" % directory)
     54 
     55 

ValueError: /Volumes/Seagate/MammoData/pngs is not a directory

In [6]:
# Feature table: the loaded feature matrix, indexed by image file name.
df = pd.DataFrame(feature_matrix, index=img_names)

# Metadata table with one row per image. The three lists are stacked into a
# single NumPy array and transposed, so all columns share that array's dtype.
meta_columns = ['patient_id', 'view', 'side']
meta_values = np.array([patient_id, views, sides]).T
md = pd.DataFrame(meta_values, columns=meta_columns, index=img_names)

# Attach the BIRADS information from the CSV file.
# NOTE(review): presumably this adds the 'class' column read by later cells -- confirm.
md = mia.reduction.add_BIRADS_class(md, '../data/BIRADS.csv')

Run dimensionality reduction

First perform PCA to reduce the dimensionality of the data so that t-SNE can handle it. This should remove the least informative components. Then run t-SNE on the reduced set of components.


In [86]:
# Raw feature values as a plain NumPy array.
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; `.values` is available in old and new releases alike.
X = df.values


Out[86]:
array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [87]:
# Project the features onto the first 10 principal components before t-SNE.
# The transform is fitted on the DataFrame's values rather than on `X` itself, so
# re-running this cell never applies PCA to data that has already been reduced.
pca = decomposition.PCA(n_components=10)
X = pca.fit_transform(df.values)


Out[87]:
array([-44.03278639, -20.10827969,   7.64452451,  18.33399668,
        15.12215433,  -1.65562797,   2.37125881,  -1.20035446,
        10.02527258,  -5.16636072])

In [90]:
# Embed the PCA-reduced features in 2-D with t-SNE.
# t-SNE is stochastic; the fixed random_state makes the embedding (and every
# plot or export derived from it) reproducible across kernel restarts.
tsne = manifold.TSNE(
    n_components=2,
    perplexity=50,
    early_exaggeration=2.0,
    learning_rate=300.0,
    random_state=0,
    verbose=2,
)
y = tsne.fit_transform(X)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 360 / 360
[t-SNE] Mean sigma: 23.230741
[t-SNE] Iteration 10: error = 4.6092650, gradient norm = 0.0930863
[t-SNE] Iteration 20: error = 3.5026420, gradient norm = 0.0795287
[t-SNE] Iteration 30: error = 3.6462744, gradient norm = 0.0785828
[t-SNE] Iteration 40: error = 3.5836784, gradient norm = 0.0726549
[t-SNE] Iteration 45: did not make any progress during the last 30 episodes. Finished.
[t-SNE] Iteration 50: error = 3.1309570, gradient norm = 0.0667554
[t-SNE] Iteration 60: error = 3.6121856, gradient norm = 0.0821283
[t-SNE] Iteration 70: error = 3.5110685, gradient norm = 0.0771007
[t-SNE] Iteration 78: did not make any progress during the last 30 episodes. Finished.
[t-SNE] Error after 78 iterations with early exaggeration: 3.563292
[t-SNE] Iteration 80: error = 0.7780572, gradient norm = 0.0213501
[t-SNE] Iteration 90: error = 0.5158594, gradient norm = 0.0085724
[t-SNE] Iteration 100: error = 0.4807104, gradient norm = 0.0027211
[t-SNE] Iteration 110: error = 0.4744094, gradient norm = 0.0008895
[t-SNE] Iteration 120: error = 0.4724821, gradient norm = 0.0005412
[t-SNE] Iteration 130: error = 0.4713123, gradient norm = 0.0005568
[t-SNE] Iteration 140: error = 0.4707005, gradient norm = 0.0004141
[t-SNE] Iteration 150: error = 0.4704620, gradient norm = 0.0003834
[t-SNE] Iteration 160: error = 0.4703361, gradient norm = 0.0003739
[t-SNE] Iteration 170: error = 0.4702641, gradient norm = 0.0003694
[t-SNE] Iteration 180: error = 0.4702221, gradient norm = 0.0003670
[t-SNE] Iteration 190: error = 0.4701973, gradient norm = 0.0003655
[t-SNE] Iteration 200: error = 0.4701825, gradient norm = 0.0003646
[t-SNE] Iteration 210: error = 0.4701738, gradient norm = 0.0003641
[t-SNE] Iteration 220: error = 0.4701685, gradient norm = 0.0003637
[t-SNE] Iteration 230: error = 0.4701654, gradient norm = 0.0003635
[t-SNE] Iteration 240: error = 0.4701635, gradient norm = 0.0003634
[t-SNE] Iteration 248: error difference 0.000000. Finished.
[t-SNE] Error after 248 iterations: 0.470163

In [92]:
%matplotlib qt
# Wrap the 2-D embedding in a DataFrame that shares the metadata's index, then
# draw it as a scatter plot labelled by the 'class' column of the metadata.
mapping = pd.DataFrame(data=y, index=md.index.values)
mia.plotting.plot_scatter_2d(
    mapping,
    columns=[0, 1],
    labels=md['class'],
)

In [440]:
# Copy the class labels onto the embedding (aligned on the shared index) and
# write the two embedding columns out as JSON.
mapping['class'] = md['class']
mia.io_tools.dump_mapping_to_json(
    mapping,
    columns=[0, 1],
    output_file="../mapping_viz/data.json",
)