In [107]:
%matplotlib inline
import pandas as pd
import numpy as np
import mia
import re

In [46]:
# Load the per-image intensity statistics of the real mammograms. The first
# CSV column (the image file name) becomes the index.
# pd.read_csv with index_col=0 replaces DataFrame.from_csv, which is
# deprecated and has been removed from current pandas releases.
hologic = pd.read_csv('../2015-03-28-real-intensity.csv', index_col=0)
hologic.head()


Out[46]:
count mean std min 25% 50% 75% max skew kurtosis
p214-010-60001-cl.png 1033968 0.324531 0.182652 0.023529 0.184314 0.274510 0.435294 0.956863 0.856085 -0.103735
p214-010-60001-cr.png 854832 0.335106 0.189121 0.027451 0.180392 0.294118 0.474510 1.000000 0.605977 -0.475660
p214-010-60001-ml.png 1170384 0.341614 0.184175 0.023529 0.180392 0.305882 0.478431 0.964706 0.577949 -0.618552
p214-010-60001-mr.png 1120256 0.341279 0.179566 0.031373 0.184314 0.321569 0.470588 0.996078 0.509100 -0.599624
p214-010-60005-cl.png 1249856 0.338331 0.207879 0.015686 0.152941 0.301961 0.505882 0.901961 0.412472 -0.994880

In [3]:
# Build the per-image metadata frame (patient id, side, view, BIRADS class, ...)
# for the real images from the BIRADS spreadsheet.
# The feature frame is passed as `hologic`: the previously used name
# `hologic_cluster` is not defined anywhere in the visible notebook, so the
# cell only worked through leftover kernel state.
hologic_meta = mia.analysis.create_hologic_meta_data(hologic, '../data/BIRADS.csv')
hologic_meta.head()


Out[3]:
patient_id side view img_name BIRADS img_number
p214-010-60001-cl.png 21401060001 c l p214-010-60001-cl.png 3 1
p214-010-60001-cr.png 21401060001 c r p214-010-60001-cr.png 3 1
p214-010-60001-ml.png 21401060001 m l p214-010-60001-ml.png 3 1
p214-010-60001-mr.png 21401060001 m r p214-010-60001-mr.png 3 1
p214-010-60005-cl.png 21401060005 c l p214-010-60005-cl.png 4 5

Real Dataset Analysis


In [94]:
# Feature columns shared by the 3D scatter, scatter-matrix and RadViz cells,
# kept in the column order of `hologic`.
# Built as a concrete list because the name is reused by several cells; a lazy
# `filter` object (Python 3) would be exhausted after its first use.
selected_columns = [col for col in hologic.columns if col in ('kurtosis', 'skew', 'mean')]

In [77]:
# Run t-SNE over every column of the real-image feature frame.
# verbose=2 produces the per-iteration progress log shown below the cell.
mapping = mia.analysis.tSNE(
    hologic,
    learning_rate=300,
    verbose=2,
)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 360 / 360
[t-SNE] Mean sigma: 1.138025
[t-SNE] Iteration 10: error = 16.4980988, gradient norm = 0.1710645
[t-SNE] Iteration 20: error = 14.7514618, gradient norm = 0.1522693
[t-SNE] Iteration 30: error = 14.3356695, gradient norm = 0.1490703
[t-SNE] Iteration 40: error = 13.8885170, gradient norm = 0.1476228
[t-SNE] Iteration 50: error = 14.0658323, gradient norm = 0.1423092
[t-SNE] Iteration 60: error = 14.2502065, gradient norm = 0.1525889
[t-SNE] Iteration 70: error = 14.2726163, gradient norm = 0.1458728
[t-SNE] Iteration 80: error = 14.0104198, gradient norm = 0.1405560
[t-SNE] Iteration 83: did not make any progress during the last 30 episodes. Finished.
[t-SNE] Error after 83 iterations with early exaggeration: 14.297692
[t-SNE] Iteration 90: error = 1.0477474, gradient norm = 0.0243286
[t-SNE] Iteration 100: error = 0.7543315, gradient norm = 0.0087214
[t-SNE] Iteration 110: error = 0.7141827, gradient norm = 0.0029759
[t-SNE] Iteration 120: error = 0.7053682, gradient norm = 0.0010927
[t-SNE] Iteration 130: error = 0.7013745, gradient norm = 0.0008200
[t-SNE] Iteration 140: error = 0.6989091, gradient norm = 0.0007137
[t-SNE] Iteration 150: error = 0.6974379, gradient norm = 0.0007452
[t-SNE] Iteration 160: error = 0.6965513, gradient norm = 0.0006670
[t-SNE] Iteration 170: error = 0.6960289, gradient norm = 0.0006553
[t-SNE] Iteration 180: error = 0.6957326, gradient norm = 0.0006424
[t-SNE] Iteration 190: error = 0.6955636, gradient norm = 0.0006310
[t-SNE] Iteration 200: error = 0.6954661, gradient norm = 0.0006250
[t-SNE] Iteration 210: error = 0.6954093, gradient norm = 0.0006217
[t-SNE] Iteration 220: error = 0.6953757, gradient norm = 0.0006197
[t-SNE] Iteration 230: error = 0.6953558, gradient norm = 0.0006185
[t-SNE] Iteration 240: error = 0.6953440, gradient norm = 0.0006178
[t-SNE] Iteration 250: error = 0.6953369, gradient norm = 0.0006174
[t-SNE] Iteration 260: error = 0.6953327, gradient norm = 0.0006171
[t-SNE] Iteration 270: error = 0.6953301, gradient norm = 0.0006170
[t-SNE] Iteration 280: error = 0.6953286, gradient norm = 0.0006169
[t-SNE] Iteration 284: error difference 0.000000. Finished.
[t-SNE] Error after 284 iterations: 0.695328

In [79]:
# Scatter plot of mapping columns 0 and 1; the BIRADS class of each image is
# passed as the labelling argument (presumably used to colour the points).
mia.plotting.plot_scatter_2d(mapping, [0, 1], hologic_meta['BIRADS'])


Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x1353e0510>

In [95]:
# 3D scatter of the raw features restricted to `selected_columns`, again
# passing the BIRADS class of each image as the labelling argument.
mia.plotting.plot_scatter_3d(hologic, selected_columns, hologic_meta['BIRADS'])

In [44]:
# Persist the t-SNE mapping next to the input feature CSV.
mapping_csv_path = '../2015-03-28-real-intensity-mapping.csv'
mapping.to_csv(mapping_csv_path)

Scatter matrix of the properties


In [73]:
# Work on a copy of the selected feature columns so that adding the label
# column does not touch `hologic`.
h = hologic[selected_columns].copy()
# Attach the BIRADS class of each image; pandas aligns it on the shared index.
h['BIRADS'] = hologic_meta['BIRADS']
# 'BIRADS' names the label column handed to the scatter-matrix helper.
mia.plotting.plot_scattermatrix(h, 'BIRADS')

In [72]:
# Tag every mapped point with its BIRADS class, then export mapping columns
# 0 and 1 to '../mapping_viz/data.json'.
# NOTE(review): this adds a 'class' column to `mapping` in place, so cells
# executed afterwards see the extra column.
mapping['class'] = hologic_meta['BIRADS']
mia.io_tools.dump_mapping_to_json(mapping, [0, 1], '../mapping_viz/data.json')

In [104]:
# Split the feature rows by BIRADS class (boolean masks aligned on the image
# file-name index shared by `hologic` and `hologic_meta`).
is_birads_1 = hologic_meta['BIRADS'] == 1
is_birads_4 = hologic_meta['BIRADS'] == 4
class_1 = hologic[is_birads_1]
class_4 = hologic[is_birads_4]

# Element-wise difference of the two summary tables: positive entries mean the
# statistic is larger for BIRADS 1 than for BIRADS 4.
class_1.describe() - class_4.describe()


Out[104]:
count mean std min 25% 50% 75% max skew kurtosis
count -8.000000 -8.000000 -8.000000 -8.000000 -8.000000 -8.000000 -8.000000 -8.000000 -8.000000 -8.000000
mean 2245849.428571 -0.015390 -0.033828 0.007064 0.011432 0.005523 -0.039793 0.004193 -0.206480 0.003884
std 971690.816861 -0.006580 -0.013681 0.006919 -0.010602 -0.013983 -0.017689 0.005092 -0.025865 0.154963
min 773680.000000 0.010058 -0.019379 -0.007843 0.023529 0.039216 -0.003922 0.035294 -0.223580 -0.059259
25% 1218908.000000 -0.014993 -0.024057 0.003922 0.023529 0.011765 -0.031373 0.000980 -0.107588 0.045777
50% 2354928.000000 -0.018932 -0.031975 0.007843 0.000000 -0.005882 -0.027451 -0.013725 -0.136601 -0.074180
75% 3074100.000000 -0.022482 -0.045443 0.011765 0.000000 -0.009804 -0.040196 0.003922 -0.294951 -0.002076
max 3429008.000000 -0.026202 -0.076477 0.019608 -0.031373 0.000000 -0.086275 0.003922 -0.183079 1.875380

In [93]:
# Normalise the feature frame for RadViz and restore the original column
# labels on the normalised result.
hol_norm = mia.analysis.normalize_data_frame(hologic)
hol_norm.columns = hologic.columns
# Take an explicit copy of the column subset before adding the label column
# (the same pattern the scatter-matrix cell uses); assigning into the bare
# selection triggers pandas' SettingWithCopyWarning.
hol_norm = hol_norm[selected_columns].copy()
hol_norm['BIRADS'] = hologic_meta.BIRADS
# NOTE(review): `pd.tools.plotting` was relocated to `pd.plotting` in later
# pandas releases - switch the call when the notebook's pandas is upgraded.
pd.tools.plotting.radviz(hol_norm, 'BIRADS')


Out[93]:
<matplotlib.axes._subplots.AxesSubplot at 0x12e23edd0>

With Phantoms


In [112]:
# Load the per-image intensity statistics of the synthetic phantoms. The first
# CSV column (the DICOM file name) becomes the index.
# pd.read_csv with index_col=0 replaces DataFrame.from_csv, which is
# deprecated and has been removed from current pandas releases.
phantoms = pd.read_csv('../2015-03-28-phantom-intensity.csv', index_col=0)
phantoms.head()


Out[112]:
count mean std min 25% 50% 75% max skew kurtosis
test_Mix_DPerc0_c_0.dcm 421792 0.976549 0.014647 0.867536 0.977173 0.981674 0.984085 0.996689 -2.630465 7.921945
test_Mix_DPerc0_c_1.dcm 421792 0.976739 0.015084 0.859236 0.977493 0.981964 0.984314 0.996689 -2.819386 9.471373
test_Mix_DPerc0_c_10.dcm 421516 0.974648 0.024636 0.485496 0.977661 0.981994 0.984298 0.996704 -5.256216 41.179821
test_Mix_DPerc0_c_11.dcm 421516 0.974302 0.028131 0.104173 0.977813 0.982269 0.984588 0.996658 -6.731141 78.174573
test_Mix_DPerc0_c_12.dcm 421312 0.974203 0.027126 0.237430 0.977508 0.982040 0.984375 0.996658 -6.138902 60.294812

In [113]:
# Load the metadata describing each synthetic phantom image.
# NOTE(review): the metadata path is an absolute, machine-specific location on
# an external volume - confirm it exists before running on another machine.
phantom_meta = mia.analysis.create_synthetic_meta_data(phantoms, '/Volumes/Seagate/2015-03-26/synthetic_meta_data_cleaned.csv')
# Replace the non-specific BIRADS labels with a single numeric class:
# '3 or 4' -> 4 and '2 (<lowercase word>)' -> 2, then cast the column to float.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on an attribute-accessed Series, which does not
# reliably write through to the parent frame.
phantom_meta['BIRADS'] = (
    phantom_meta['BIRADS']
    .replace('3 or 4', 4)
    .replace(re.compile(r'2 \([a-z]+\)'), 2)
    .astype(float)
)
phantom_meta.head()


Out[113]:
Vol CmprTh SkTh LigThCrs LigThFn #cmprts #cmprts.1 Dperc VBD VBD.1 BIRADS min_speed max_speed min_ratio max_ratio phantom_name
test_Mix_DPerc0_c_0.dcm 436 5 0.5 400 200 333 1000 0 24 21 1 0.5 2 0.5 2 test_Mix_DPerc0_c
test_Mix_DPerc0_c_1.dcm 436 5 0.5 400 200 333 1000 0 24 21 1 0.5 2 0.5 2 test_Mix_DPerc0_c
test_Mix_DPerc0_c_10.dcm 436 5 0.5 400 200 333 1000 0 24 21 1 0.5 2 0.5 2 test_Mix_DPerc0_c
test_Mix_DPerc0_c_11.dcm 436 5 0.5 400 200 333 1000 0 24 21 1 0.5 2 0.5 2 test_Mix_DPerc0_c
test_Mix_DPerc0_c_12.dcm 436 5 0.5 400 200 333 1000 0 24 21 1 0.5 2 0.5 2 test_Mix_DPerc0_c

In [114]:
import random
# Group the phantom feature rows by the `phantom_name` metadata column, i.e.
# all images rendered from the same phantom end up in one group.
group = phantoms.groupby(phantom_meta['phantom_name'])

def select_random(x):
    """Return one randomly chosen row of ``x`` as a single-row DataFrame.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to draw from (here: the rows of one groupby group).

    Returns
    -------
    pandas.DataFrame
        A frame containing exactly one row of ``x``, keeping its index label.

    Raises
    ------
    ValueError
        If ``x`` has no rows (raised by ``random.sample``).

    Notes
    -----
    The draw uses the global ``random`` module state; call ``random.seed(...)``
    beforehand to make the selection reproducible.
    """
    # random.sample needs a real sequence on recent Python versions, and a
    # pandas Index does not qualify, so materialise the labels as a list.
    chosen_label = random.sample(list(x.index), 1)
    # Indexing .loc with a list of labels keeps the result a DataFrame.
    # (.ix, used previously, has been removed from pandas.)
    return x.loc[chosen_label]
# Draw one random image per phantom group, then drop index level 0 of the
# applied result so the rows are indexed by image file name only.
random_synthetic_features = (
    group.apply(select_random)
         .reset_index(level=0, drop=True)
)
random_synthetic_features


Out[114]:
count mean std min 25% 50% 75% max skew kurtosis
test_Mix_DPerc0_c_2.dcm 421155 0.976388 0.013857 0.879683 0.976928 0.981292 0.983566 0.996780 -2.463729 6.445373
test_Mix_DPerc10_c_7.dcm 421452 0.977100 0.013874 0.872251 0.976532 0.980652 0.984558 0.997543 -2.300235 6.268264
test_Mix_DPerc20_c_2.dcm 421452 0.979687 0.013570 0.863142 0.977569 0.982406 0.987488 0.997452 -2.916876 12.114483
test_Mix_DPerc35_c_4.dcm 421689 0.980394 0.015766 0.848737 0.977966 0.984146 0.989502 0.997757 -2.833318 10.690941
test_Mix_DPerc5_c_10.dcm 421399 0.976306 0.015404 0.820600 0.976867 0.981079 0.983978 0.997238 -3.149511 13.230049
test_Mix_DPerc75_c_11.dcm 421520 0.983768 0.015330 0.838193 0.979675 0.987243 0.993637 0.997787 -3.298324 16.340042

In [115]:
# Stack the real-image features on top of the randomly selected phantom
# features. The real features are the `hologic` frame: the previously used
# name `hologic_cluster` is not defined anywhere in the visible notebook, so
# the cell depended on leftover kernel state.
features = pd.concat([hologic, random_synthetic_features])

# Build the matching label vector in the same order: real BIRADS classes
# first, then the BIRADS class of each selected phantom image.
class_labels = pd.concat([
    hologic_meta.BIRADS,
    phantom_meta.loc[random_synthetic_features.index].BIRADS,
])
class_labels.shape


Out[115]:
(366,)

In [122]:
# Run t-SNE over the combined real + phantom feature rows, with the same
# settings as the real-only mapping.
joint_mapping = mia.analysis.tSNE(
    features,
    learning_rate=300,
    verbose=2,
)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 0.721936
[t-SNE] Iteration 10: error = 16.1792125, gradient norm = 0.1742892
[t-SNE] Iteration 20: error = 13.4244793, gradient norm = 0.1609672
[t-SNE] Iteration 30: error = 13.6572978, gradient norm = 0.1455846
[t-SNE] Iteration 40: error = 13.8237459, gradient norm = 0.1412720
[t-SNE] Iteration 50: error = 13.9072265, gradient norm = 0.1354474
[t-SNE] Iteration 60: error = 13.3592098, gradient norm = 0.1587049
[t-SNE] Iteration 70: error = 13.5264709, gradient norm = 0.1453783
[t-SNE] Iteration 80: error = 13.4277231, gradient norm = 0.1391117
[t-SNE] Iteration 83: did not make any progress during the last 30 episodes. Finished.
[t-SNE] Error after 83 iterations with early exaggeration: 13.335394
[t-SNE] Iteration 90: error = 0.9379367, gradient norm = 0.0232923
[t-SNE] Iteration 100: error = 0.6464655, gradient norm = 0.0093879
[t-SNE] Iteration 110: error = 0.6006694, gradient norm = 0.0028309
[t-SNE] Iteration 120: error = 0.5909515, gradient norm = 0.0011089
[t-SNE] Iteration 130: error = 0.5869502, gradient norm = 0.0007166
[t-SNE] Iteration 140: error = 0.5849960, gradient norm = 0.0006285
[t-SNE] Iteration 150: error = 0.5839584, gradient norm = 0.0006030
[t-SNE] Iteration 160: error = 0.5833810, gradient norm = 0.0005895
[t-SNE] Iteration 170: error = 0.5830498, gradient norm = 0.0005823
[t-SNE] Iteration 180: error = 0.5828564, gradient norm = 0.0005780
[t-SNE] Iteration 190: error = 0.5827424, gradient norm = 0.0005754
[t-SNE] Iteration 200: error = 0.5826746, gradient norm = 0.0005739
[t-SNE] Iteration 210: error = 0.5826343, gradient norm = 0.0005730
[t-SNE] Iteration 220: error = 0.5826102, gradient norm = 0.0005725
[t-SNE] Iteration 230: error = 0.5825958, gradient norm = 0.0005722
[t-SNE] Iteration 240: error = 0.5825872, gradient norm = 0.0005720
[t-SNE] Iteration 250: error = 0.5825821, gradient norm = 0.0005719
[t-SNE] Iteration 260: error = 0.5825790, gradient norm = 0.0005718
[t-SNE] Iteration 270: error = 0.5825771, gradient norm = 0.0005718
[t-SNE] Iteration 278: error difference 0.000000. Finished.
[t-SNE] Error after 278 iterations: 0.582576

In [123]:
%matplotlib qt
joint_mapping['BIRADS'] = class_labels
hol_map = joint_mapping[:-6]
hol_map.shape

syn_map = joint_mapping[-6:]
syn_map.head()

ax = mia.plotting.plot_scatter_2d(hol_map, [0,1], 'BIRADS')
ax = mia.plotting.plot_scatter_2d(syn_map, [0,1], 'BIRADS', ax=ax, marker='^', s=50)

In [129]:
# Summary statistics (count, mean, std, quartiles, min/max) of every feature
# column across all real-image rows.
hologic.describe()


Out[129]:
count mean std min 25% 50% 75% max skew kurtosis
count 360.000000 360.000000 360.000000 360.000000 360.000000 360.000000 360.000000 360.000000 360.000000 360.000000
mean 2817220.533333 0.300174 0.140723 0.030414 0.183497 0.286013 0.398660 0.901329 0.474393 -0.206034
std 1518976.331463 0.027252 0.029524 0.017896 0.031410 0.038208 0.048177 0.072678 0.374146 0.816320
min 617600.000000 0.228634 0.078955 0.000000 0.094118 0.184314 0.290196 0.662745 -0.506836 -1.377802
25% 1675040.000000 0.283351 0.119196 0.019608 0.160784 0.262745 0.364706 0.850980 0.189885 -0.797907
50% 2304328.000000 0.296968 0.136649 0.027451 0.184314 0.286275 0.396078 0.909804 0.449050 -0.409374
75% 3635468.000000 0.314018 0.161092 0.039216 0.203922 0.305882 0.427451 0.956863 0.765841 0.140670
max 7497904.000000 0.405543 0.228753 0.117647 0.313725 0.411765 0.576471 1.000000 1.696356 3.227792