In [113]:
%matplotlib qt
import pandas as pd
import numpy as np
import re
import mia

In [104]:
# Load the texture features of the real images. DataFrame.from_csv is
# deprecated (and absent from pandas >= 1.0); read_csv with index_col=0 keeps
# the first CSV column (the image file name) as the index. from_csv's implicit
# parse_dates=True is dropped on purpose: the index holds file names, not dates.
hologic_cluster = pd.read_csv('../2015-03-28-real-texture-cluster.csv', index_col=0)
hologic_cluster.head()


Out[104]:
contrast_cluster_1 dissimilarity_cluster_1 homogeneity_cluster_1 energy_cluster_1 contrast_cluster_2 dissimilarity_cluster_2 homogeneity_cluster_2 energy_cluster_2 contrast_cluster_3 dissimilarity_cluster_3 homogeneity_cluster_3 energy_cluster_3 contrast_cluster_4 dissimilarity_cluster_4 homogeneity_cluster_4 energy_cluster_4
p214-010-60001-cl.png 2.931535e+08 7003850.25 8130257.674942 51371.507991 8.260166e+08 12106920.25 8178692.790514 51247.278008 1.189669e+09 10889315.75 8309986.114009 46728.453963 7.767933e+08 5886436.25 8380532.207002 43973.709887
p214-010-60001-cr.png 1.856295e+08 4623518.75 8241054.817176 52008.002383 6.412453e+08 9141035.75 8250194.492365 57204.728899 1.151640e+09 10339648.50 8311361.858274 48833.499865 7.751633e+08 5625592.50 8402698.450330 47068.058470
p214-010-60001-ml.png 2.225075e+08 5867047.50 8121710.629886 53939.606377 8.938307e+08 12497527.25 8178878.867447 60888.448918 1.785641e+09 16247420.50 8210113.855566 39805.116295 1.237097e+09 9180578.50 8338267.205284 40480.385535
p214-010-60001-mr.png 2.280928e+08 5802305.25 8143748.833719 56363.712378 1.047522e+09 13875836.25 8165493.907820 53059.072536 1.850879e+09 16737395.50 8209670.079957 47579.096741 1.073341e+09 7801042.00 8371460.064432 57173.612063
p214-010-60005-cl.png 1.563874e+08 4584042.25 8163100.805695 37606.734376 6.720121e+08 9797939.50 8198951.000702 36979.949572 1.548208e+09 13951816.50 8220116.309021 32055.018330 1.131708e+09 8562726.25 8306607.852877 55253.097523

In [105]:
# Build the per-image metadata table for the real images from the feature frame
# and the BIRADS CSV. As the output below shows, the result is indexed by image
# file name and carries patient_id, side, view, img_name, BIRADS and img_number.
hologic_meta = mia.analysis.create_hologic_meta_data(hologic_cluster, '../data/BIRADS.csv')
hologic_meta.head()


Out[105]:
patient_id side view img_name BIRADS img_number
p214-010-60001-cl.png 21401060001 c l p214-010-60001-cl.png 3 1
p214-010-60001-cr.png 21401060001 c r p214-010-60001-cr.png 3 1
p214-010-60001-ml.png 21401060001 m l p214-010-60001-ml.png 3 1
p214-010-60001-mr.png 21401060001 m r p214-010-60001-mr.png 3 1
p214-010-60005-cl.png 21401060005 c l p214-010-60005-cl.png 4 5

Real Dataset Analysis


In [185]:
# Names of the homogeneity feature columns. A real list is built instead of
# calling filter(): on Python 3 filter() returns a one-shot iterator, which
# cannot be used to index a DataFrame and would be empty on second use.
columns = [name for name in hologic_cluster.columns if 'homogeneity' in name]

Scatter matrix of the clusters. From this it is noted that homogeneity is the major cause of the splitting, as it is bimodal across all clusters. Contrast and dissimilarity show correlations from high to low risk. Energy also shows this trend, but its correlation is weaker.


In [194]:
# Select the columns whose name contains '1' (the *_cluster_1 features), attach
# the BIRADS score and draw the scatter matrix. The column names are collected
# into a list because a Python 3 filter() object cannot index a DataFrame.
hc = hologic_cluster[[name for name in hologic_cluster.columns if '1' in name]].copy()
hc['BIRADS'] = hologic_meta.BIRADS
mia.plotting.plot_scattermatrix(hc, 'BIRADS')


The bimodality of homogeneity can be seen more clearly when it is plotted as a histogram for each cluster:


In [193]:
%matplotlib inline
# Work on a copy of the features so the BIRADS score can be attached under a
# 'class' column without modifying hologic_cluster, then plot the
# homogeneity_cluster_4 feature.
# NOTE(review): plot_risk_classes presumably splits the data on the 'class'
# column - confirm against mia.plotting.
hc = hologic_cluster.copy()
hc['class'] = hologic_meta.BIRADS
mia.plotting.plot_risk_classes(hc, 'homogeneity_cluster_4')



In [177]:
# Embed the selected feature columns into 2 dimensions with t-SNE.
# `columns` must already hold the column names chosen in an earlier cell.
# NOTE(review): t-SNE is stochastic and no seed is set anywhere visible, so the
# embedding (and everything derived from it) may differ between runs - confirm
# whether mia.analysis.tSNE accepts a random-state argument.
mapping = mia.analysis.tSNE(hologic_cluster[columns], n_components=2, verbose=2, learning_rate=300)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 360 / 360
[t-SNE] Mean sigma: 0.574917
[t-SNE] Iteration 10: error = 16.1541876, gradient norm = 0.1719006
[t-SNE] Iteration 20: error = 13.2540182, gradient norm = 0.1536431
[t-SNE] Iteration 30: error = 12.5407164, gradient norm = 0.1586372
[t-SNE] Iteration 40: error = 12.8821844, gradient norm = 0.1474866
[t-SNE] Iteration 50: error = 12.8103744, gradient norm = 0.1423375
[t-SNE] Iteration 60: error = 12.7278322, gradient norm = 0.1492145
[t-SNE] Iteration 70: error = 12.9535844, gradient norm = 0.1467182
[t-SNE] Iteration 80: error = 12.9187282, gradient norm = 0.1383605
[t-SNE] Iteration 83: did not make any progress during the last 30 episodes. Finished.
[t-SNE] Error after 83 iterations with early exaggeration: 12.640844
[t-SNE] Iteration 90: error = 0.8356086, gradient norm = 0.0233477
[t-SNE] Iteration 100: error = 0.5918354, gradient norm = 0.0087953
[t-SNE] Iteration 110: error = 0.5427384, gradient norm = 0.0032808
[t-SNE] Iteration 120: error = 0.5263414, gradient norm = 0.0016312
[t-SNE] Iteration 130: error = 0.5209433, gradient norm = 0.0008191
[t-SNE] Iteration 140: error = 0.5183772, gradient norm = 0.0006766
[t-SNE] Iteration 150: error = 0.5169651, gradient norm = 0.0006389
[t-SNE] Iteration 160: error = 0.5161954, gradient norm = 0.0006213
[t-SNE] Iteration 170: error = 0.5157535, gradient norm = 0.0006135
[t-SNE] Iteration 180: error = 0.5154955, gradient norm = 0.0006089
[t-SNE] Iteration 190: error = 0.5153432, gradient norm = 0.0006062
[t-SNE] Iteration 200: error = 0.5152528, gradient norm = 0.0006047
[t-SNE] Iteration 210: error = 0.5151990, gradient norm = 0.0006037
[t-SNE] Iteration 220: error = 0.5151668, gradient norm = 0.0006032
[t-SNE] Iteration 230: error = 0.5151476, gradient norm = 0.0006028
[t-SNE] Iteration 240: error = 0.5151362, gradient norm = 0.0006026
[t-SNE] Iteration 250: error = 0.5151293, gradient norm = 0.0006025
[t-SNE] Iteration 260: error = 0.5151252, gradient norm = 0.0006025
[t-SNE] Iteration 270: error = 0.5151227, gradient norm = 0.0006024
[t-SNE] Iteration 280: error = 0.5151212, gradient norm = 0.0006024
[t-SNE] Iteration 283: error difference 0.000000. Finished.
[t-SNE] Error after 283 iterations: 0.515121

In [178]:
# Scatter plot of embedding columns 0 and 1, with the BIRADS scores passed as
# the third argument.
# NOTE(review): presumably the third argument colours the points by class - confirm.
mia.plotting.plot_scatter_2d(mapping, [0,1], hologic_meta.BIRADS)


Out[178]:
<matplotlib.axes._subplots.AxesSubplot at 0x133f1bf50>

In [196]:
# Mean of the closeness measure computed from the embedding and the image names.
# NOTE(review): the definition of "closeness" lives in
# mia.analysis.measure_closeness - confirm it before interpreting the value.
mia.analysis.measure_closeness(mapping, hologic_meta.img_name).mean()


Out[196]:
5.3192683969678507

In [106]:
# Split the real images by the sign of the first embedding coordinate.
left_cluster = hologic_cluster[mapping[0] < 0]
right_cluster = hologic_cluster[mapping[0] >= 0]

# Index labels containing '61246'. Built as a list because a Python 3 filter()
# object is not a valid .loc indexer.
mask = [name for name in mapping.index if '61246' in name]

# Difference between the summary statistics of those rows and of all BIRADS 1
# rows. The 'count' row of the result is only a difference in sample sizes.
hologic_cluster.loc[mask].describe() - hologic_cluster[hologic_meta.BIRADS == 1].describe()


Out[106]:
contrast_cluster_1 dissimilarity_cluster_1 homogeneity_cluster_1 energy_cluster_1 contrast_cluster_2 dissimilarity_cluster_2 homogeneity_cluster_2 energy_cluster_2 contrast_cluster_3 dissimilarity_cluster_3 homogeneity_cluster_3 energy_cluster_3 contrast_cluster_4 dissimilarity_cluster_4 homogeneity_cluster_4 energy_cluster_4
count -5.200000e+01 -52.000000 -52.000000 -52.000000 -5.200000e+01 -52.000000 -52.000000 -52.000000 -5.200000e+01 -52.000000 -52.000000 -52.000000 -5.200000e+01 -52.000000 -52.000000 -52.000000
mean 9.160567e+07 74524.982143 -4061161.758205 -18304.689283 -5.051321e+08 -7603719.178571 -4147655.094432 -20643.333663 -3.373641e+09 -36808132.767857 -3666362.056107 -42376.793559 -3.327056e+09 -31426773.834821 -3804590.423466 -29025.715724
std -1.603533e+08 -5406764.273350 -1618317.125137 -19830.220933 -7.159395e+08 -12883083.536642 -1569686.264484 -25174.014670 -1.874185e+09 -22761707.482165 -1522252.920157 -30145.025223 -1.554493e+09 -15425890.091613 -1602577.047234 -19292.470167
min 3.823331e+08 9378632.750000 0.000000 22302.068686 1.215520e+09 20742859.000000 0.000000 22709.714106 0.000000e+00 0.000000 20266.891924 5965.814005 0.000000e+00 0.000000 218008.899588 4024.187474
25% 2.387171e+08 6330557.437500 -4194017.424574 -1511.915397 -5.778795e+06 755407.062500 -4202539.132652 -1039.978464 -1.904371e+09 -18989405.625000 -3815734.235083 -21753.060392 -2.172337e+09 -17971034.187500 -4219773.376121 -15959.627871
50% 8.880562e+07 -190636.000000 -4574191.429087 -18499.407759 -4.143155e+08 -6934196.375000 -4664886.311245 -14429.145909 -3.429784e+09 -33118396.125000 -4180971.192237 -38970.849494 -3.478431e+09 -31447243.125000 -4406834.832800 -30190.599009
75% 2.835360e+07 -3909092.812500 -4986291.515469 -30188.766977 -8.355635e+08 -14812328.625000 -5032462.009876 -32851.642817 -4.576160e+09 -51285987.125000 -4581666.947618 -56118.123489 -4.471933e+09 -42422963.500000 -4554812.161786 -40418.875428
max -5.056405e+08 -15772208.000000 -5508030.187789 -71767.455048 -2.995128e+09 -43548365.000000 -5568131.421848 -109680.219475 -7.532141e+09 -92935277.000000 -4947226.463877 -159337.533306 -6.141837e+09 -66472843.250000 -4989668.123093 -100752.339516

Looking at the most significant features


In [156]:
# Normalise the features, restore the original column names and attach the
# BIRADS score so radviz can use it as the class column.
hc = mia.analysis.normalize_data_frame(hologic_cluster)
hc.columns = hologic_cluster.columns
hc['BIRADS'] = hologic_meta.BIRADS

# The plotting helpers moved from pandas.tools.plotting to pandas.plotting and
# the old namespace was later removed. Prefer the new location and fall back
# to the old one so the cell runs on either kind of installation.
radviz = pd.plotting.radviz if hasattr(pd, 'plotting') else pd.tools.plotting.radviz
radviz(hc, 'BIRADS')


Out[156]:
<matplotlib.axes._subplots.AxesSubplot at 0x129410b90>

In [73]:
# Attach the BIRADS score to the embedding as a 'class' column (this mutates
# `mapping` in place), then export embedding columns 0 and 1 to JSON.
# NOTE(review): the JSON is presumably consumed by a viewer under
# ../mapping_viz - confirm.
mapping['class'] = hologic_meta['BIRADS']
mia.io_tools.dump_mapping_to_json(mapping, [0,1], '../mapping_viz/data.json')

In [49]:
# Persist the t-SNE embedding as CSV in the same directory as the input
# feature file.
mapping.to_csv('../2015-03-28-real-texture-cluster-mapping.csv')

Including the Synthetic Data


In [109]:
# Load the texture features of the synthetic phantom images. DataFrame.from_csv
# is deprecated (and absent from pandas >= 1.0); read_csv with index_col=0 keeps
# the first CSV column (the image file name) as the index. from_csv's implicit
# parse_dates=True is dropped on purpose: the index holds file names, not dates.
phantoms = pd.read_csv('../2015-03-28-phantom-texture-cluster.csv', index_col=0)
phantoms.head()


Out[109]:
contrast_cluster_1 dissimilarity_cluster_1 homogeneity_cluster_1 energy_cluster_1 contrast_cluster_2 dissimilarity_cluster_2 homogeneity_cluster_2 energy_cluster_2 contrast_cluster_3 dissimilarity_cluster_3 homogeneity_cluster_3 energy_cluster_3 contrast_cluster_4 dissimilarity_cluster_4 homogeneity_cluster_4 energy_cluster_4
test_Mix_DPerc0_c_0.dcm 7.805321e+07 337040.75 3664686.896810 49792.877875 2.137532e+08 899534.25 3659636.929234 37305.672427 3.012525e+08 1239525.25 3656372.640303 60874.139738 2.190925e+08 934585.25 3637426.757401 91539.044083
test_Mix_DPerc0_c_1.dcm 1.123977e+08 483231.25 3662776.650960 58883.868513 2.284522e+08 955230.00 3658457.557997 37682.485071 4.226799e+08 1717695.00 3655296.549928 41519.417587 3.340117e+08 1389453.25 3637492.324049 94920.560177
test_Mix_DPerc0_c_10.dcm 1.746862e+07 90670.75 3665781.117732 49048.932066 7.524988e+07 337097.25 3662244.233806 53688.129002 2.188807e+08 914860.75 3656727.883353 31368.812801 1.995978e+08 864599.25 3635457.378337 92797.242762
test_Mix_DPerc0_c_11.dcm 8.080798e+06 49274.25 3666552.061988 46708.531551 5.140643e+07 240891.50 3662899.062626 41445.544031 1.732943e+08 735558.00 3656810.989204 40871.497863 1.718443e+08 756757.25 3634511.683852 101160.576150
test_Mix_DPerc0_c_12.dcm 1.394344e+07 77445.00 3665783.596805 47152.382667 7.345143e+07 331874.00 3661879.744782 41461.886764 2.239596e+08 936619.50 3656330.531212 57281.910172 2.009572e+08 869104.50 3635644.891045 98646.047275

In [117]:
# NOTE(review): this is an absolute path on an external volume; the cell only
# runs on a machine where that volume is mounted.
phantom_meta = mia.analysis.create_synthetic_meta_data(phantoms, '/Volumes/Seagate/2015-03-26/synthetic_meta_data_cleaned.csv')

# Replace the non-specific BIRADS labels with a single numeric class:
# '3 or 4' becomes 4 and labels of the form '2 (<letters>)' become 2.
# The cleaned column is assigned back explicitly instead of calling
# replace(..., inplace=True) on `phantom_meta.BIRADS`: an in-place call on a
# column pulled out of the frame is not guaranteed to update the frame itself.
phantom_meta['BIRADS'] = (
    phantom_meta['BIRADS']
    .replace('3 or 4', 4)
    .replace(re.compile(r'2 \([a-z]+\)'), 2)
    .astype(float)
)
phantom_meta.head()


Out[117]:
Vol CmprTh SkTh LigThCrs LigThFn #cmprts #cmprts.1 Dperc VBD VBD.1 BIRADS min_speed max_speed min_ratio max_ratio phantom_name
test_Mix_DPerc0_c_0.dcm 436 5 0.5 400 200 333 1000 0 24 21 1 0.5 2 0.5 2 test_Mix_DPerc0_c
test_Mix_DPerc0_c_1.dcm 436 5 0.5 400 200 333 1000 0 24 21 1 0.5 2 0.5 2 test_Mix_DPerc0_c
test_Mix_DPerc0_c_10.dcm 436 5 0.5 400 200 333 1000 0 24 21 1 0.5 2 0.5 2 test_Mix_DPerc0_c
test_Mix_DPerc0_c_11.dcm 436 5 0.5 400 200 333 1000 0 24 21 1 0.5 2 0.5 2 test_Mix_DPerc0_c
test_Mix_DPerc0_c_12.dcm 436 5 0.5 400 200 333 1000 0 24 21 1 0.5 2 0.5 2 test_Mix_DPerc0_c

Select a random subset of the phantoms for use with the t-SNE algorithm


In [122]:
import random
# Group the phantom feature rows by the phantom_name column of the metadata,
# so that rows belonging to the same phantom fall into the same group.
group = phantoms.groupby(phantom_meta.phantom_name)

def select_random(x):
    """Return one randomly chosen row of ``x`` as a single-row DataFrame.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame to draw from (here: the rows of one phantom group).

    Returns
    -------
    pandas.DataFrame
        The row(s) of ``x`` carrying the randomly drawn index label; exactly
        one row when the index labels are unique.

    Raises
    ------
    IndexError
        If ``x`` has no rows.
    """
    # random.sample() needs a real sequence on Python 3 and rejects a pandas
    # Index, and the .ix indexer no longer exists in current pandas. Drawing a
    # label with random.choice() and selecting it with .loc works everywhere.
    label = random.choice(list(x.index))
    return x.loc[[label]]
# Draw one random image per phantom, then drop the outer (group key) index
# level so the result is indexed by image file name only.
random_synthetic_features = (
    group
    .apply(select_random)
    .reset_index(level=0, drop=True)
)
random_synthetic_features


Out[122]:
contrast_cluster_1 dissimilarity_cluster_1 homogeneity_cluster_1 energy_cluster_1 contrast_cluster_2 dissimilarity_cluster_2 homogeneity_cluster_2 energy_cluster_2 contrast_cluster_3 dissimilarity_cluster_3 homogeneity_cluster_3 energy_cluster_3 contrast_cluster_4 dissimilarity_cluster_4 homogeneity_cluster_4 energy_cluster_4
test_Mix_DPerc0_c_6.dcm 1.289264e+08 551006.50 3662803.274767 53445.460117 2.306973e+08 962567.50 3659318.739485 42593.927039 3.455569e+08 1411829.00 3657185.743758 57941.727834 2.559077e+08 1080401.25 3637437.591758 76573.368273
test_Mix_DPerc10_c_3.dcm 1.382765e+08 588710.75 3662005.961962 28960.985685 2.301554e+08 956916.00 3658477.753351 39581.689743 1.096317e+09 4418884.25 3639789.584119 55585.581781 9.496640e+08 3818397.00 3642493.255595 83856.953167
test_Mix_DPerc20_c_3.dcm 5.541825e+07 240940.50 3665167.187222 52121.902079 1.394425e+08 584554.75 3660939.875182 41342.833212 8.630619e+08 3480343.00 3643674.748048 97355.501648 7.266742e+08 2927117.25 3644305.585281 75286.592959
test_Mix_DPerc35_c_7.dcm 4.736188e+07 210227.75 3663797.672162 45032.226662 2.066614e+08 854896.25 3658942.095173 43871.778160 6.274249e+08 2529067.00 3647462.072949 92228.524380 4.550624e+08 1828349.75 3652664.802134 79720.458224
test_Mix_DPerc5_c_6.dcm 4.005631e+07 180438.50 3664749.354619 38294.706842 1.566761e+08 660404.50 3660755.434569 42336.236493 3.231703e+08 1323028.00 3656627.025249 38625.867028 2.629593e+08 1100958.50 3641405.868696 103933.893189
test_Mix_DPerc75_c_10.dcm 3.715249e+07 163717.50 3665041.626076 49269.152625 1.884157e+08 777045.25 3660657.189942 44628.508039 4.566236e+08 1843255.50 3650387.653658 96029.157381 3.009384e+08 1202946.25 3658487.012120 75580.943172

In [132]:
# Stack the feature rows: real images first, sampled phantoms after them.
features = pd.concat([hologic_cluster, random_synthetic_features])

# Build the BIRADS labels in the same order; the phantom labels are looked up
# by the index of the sampled phantom rows.
class_labels = pd.concat([
    hologic_meta.BIRADS,
    phantom_meta.loc[random_synthetic_features.index, 'BIRADS'],
])
class_labels.shape


Out[132]:
(366,)

In [213]:
# Names of every feature column except the homogeneity ones. A real list is
# built instead of calling filter(): on Python 3 filter() returns a one-shot
# iterator, which cannot index a DataFrame and would be empty on second use.
columns = [name for name in features.columns if 'homogeneity' not in name]

In [203]:
# t-SNE embedding of the combined real + synthetic features, restricted to the
# columns currently named in `columns`.
# NOTE(review): t-SNE is stochastic and no seed is set anywhere visible, so the
# embedding may differ between runs - confirm whether mia.analysis.tSNE accepts
# a random-state argument.
joint_mapping = mia.analysis.tSNE(features[columns], verbose=2, learning_rate=300)


[t-SNE] Computing pairwise distances...
[t-SNE] Computed conditional probabilities for sample 366 / 366
[t-SNE] Mean sigma: 0.575752
[t-SNE] Iteration 10: error = 16.4174517, gradient norm = 0.1587858
[t-SNE] Iteration 20: error = 13.4212003, gradient norm = 0.1597509
[t-SNE] Iteration 30: error = 13.1282000, gradient norm = 0.1497105
[t-SNE] Iteration 40: error = 12.6614001, gradient norm = 0.1495442
[t-SNE] Iteration 50: error = 12.9267517, gradient norm = 0.1458217
[t-SNE] Iteration 60: error = 12.4563635, gradient norm = 0.1550798
[t-SNE] Iteration 70: error = 12.5049876, gradient norm = 0.1562701
[t-SNE] Iteration 80: error = 12.7761028, gradient norm = 0.1421605
[t-SNE] Iteration 83: did not make any progress during the last 30 episodes. Finished.
[t-SNE] Error after 83 iterations with early exaggeration: 13.093702
[t-SNE] Iteration 90: error = 0.8760613, gradient norm = 0.0257260
[t-SNE] Iteration 100: error = 0.5608583, gradient norm = 0.0093508
[t-SNE] Iteration 110: error = 0.5177783, gradient norm = 0.0030459
[t-SNE] Iteration 120: error = 0.5073383, gradient norm = 0.0012317
[t-SNE] Iteration 130: error = 0.5029114, gradient norm = 0.0007389
[t-SNE] Iteration 140: error = 0.5006705, gradient norm = 0.0006484
[t-SNE] Iteration 150: error = 0.4994553, gradient norm = 0.0006244
[t-SNE] Iteration 160: error = 0.4987725, gradient norm = 0.0006113
[t-SNE] Iteration 170: error = 0.4983793, gradient norm = 0.0006043
[t-SNE] Iteration 180: error = 0.4981494, gradient norm = 0.0006002
[t-SNE] Iteration 190: error = 0.4980136, gradient norm = 0.0005978
[t-SNE] Iteration 200: error = 0.4979330, gradient norm = 0.0005963
[t-SNE] Iteration 210: error = 0.4978849, gradient norm = 0.0005955
[t-SNE] Iteration 220: error = 0.4978563, gradient norm = 0.0005950
[t-SNE] Iteration 230: error = 0.4978391, gradient norm = 0.0005947
[t-SNE] Iteration 240: error = 0.4978289, gradient norm = 0.0005945
[t-SNE] Iteration 250: error = 0.4978227, gradient norm = 0.0005944
[t-SNE] Iteration 260: error = 0.4978191, gradient norm = 0.0005943
[t-SNE] Iteration 270: error = 0.4978169, gradient norm = 0.0005943
[t-SNE] Iteration 280: error = 0.4978155, gradient norm = 0.0005943
[t-SNE] Iteration 281: error difference 0.000000. Finished.
[t-SNE] Error after 281 iterations: 0.497815

Plotting both reals and phantoms together as a single scatter plot:


In [212]:
%matplotlib qt
joint_mapping['BIRADS'] = class_labels

# `features` was built as [real images, sampled phantoms], so the first
# len(hologic_cluster) rows of the embedding are the real images and the rest
# are phantoms. Deriving the split point avoids hard-coding the number of
# sampled phantoms, which changes whenever the phantom set changes.
# NOTE(review): this assumes, as the original fixed slice did, that
# mia.analysis.tSNE keeps the row order of its input - confirm.
n_real = len(hologic_cluster)
hol_map = joint_mapping.iloc[:n_real]
syn_map = joint_mapping.iloc[n_real:]

# Real images first, then the phantoms on the same axes with triangle markers.
ax = mia.plotting.plot_scatter_2d(hol_map, [0,1], 'BIRADS')
ax = mia.plotting.plot_scatter_2d(syn_map, [0,1], 'BIRADS', ax=ax, marker='^', s=50)

In [198]:
# Difference between the summary statistics of the sampled phantoms and those
# of the real images. The 'count' row is only the difference in sample sizes,
# not a feature statistic.
random_synthetic_features.describe() - hologic_cluster.describe()


Out[198]:
contrast_cluster_1 dissimilarity_cluster_1 homogeneity_cluster_1 energy_cluster_1 contrast_cluster_2 dissimilarity_cluster_2 homogeneity_cluster_2 energy_cluster_2 contrast_cluster_3 dissimilarity_cluster_3 homogeneity_cluster_3 energy_cluster_3 contrast_cluster_4 dissimilarity_cluster_4 homogeneity_cluster_4 energy_cluster_4
count -3.540000e+02 -354.000000 -354.000000 -354.000000 -3.540000e+02 -354.000000 -354.000000 -354.000000 -3.540000e+02 -3.540000e+02 -354.000000 -354.000000 -3.540000e+02 -354.000000 -354.000000 -354.000000
mean -3.471572e+08 -10930283.923611 -6402789.438092 -28498.201271 -1.785893e+09 -30031024.609028 -6284383.394894 -40080.248296 -3.131587e+09 -3.897421e+07 -6335356.628016 -5954.835599 -1.915045e+09 -19744053.645139 -6784295.703629 24317.489556
std -2.303295e+08 -6669743.006260 -2302256.699216 -14454.358176 -1.226421e+09 -19925311.553204 -2188673.993117 -34683.839547 -2.211510e+09 -2.823480e+07 -2137806.775196 -13787.374858 -1.582827e+09 -16642677.608562 -2349208.283240 -9909.520978
min -3.780830e+07 -1970660.750000 -3611006.038227 -224.249771 -1.700863e+08 -4646772.750000 -3666852.205031 4265.839270 -3.668226e+08 -6.096884e+06 -3720603.120880 9479.079436 -1.833975e+08 -2659986.250000 -4314395.676846 50184.657298
25% -1.680347e+08 -5644954.187500 -4320432.937471 -15904.433861 -7.481320e+08 -12961745.250000 -4351030.258263 -14096.681674 -1.299226e+09 -1.544254e+07 -4440650.296657 4739.420079 -7.914160e+08 -7837961.125000 -4652986.150354 30268.965165
50% -2.893096e+08 -8969151.625000 -4598166.617793 -19669.673215 -1.447143e+09 -24490327.000000 -4578058.258306 -27598.871381 -2.123659e+09 -2.649811e+07 -4631287.500777 11405.435315 -1.104483e+09 -11165600.875000 -4732512.514484 25266.269413
75% -4.722470e+08 -15021015.187500 -8862648.149257 -35709.537162 -2.474048e+09 -41708742.687500 -8590099.385389 -55294.361379 -4.627798e+09 -5.631630e+07 -8655609.223456 -1732.795363 -2.869220e+09 -28466116.000000 -9350053.244120 18897.068134
max -1.346652e+09 -32273665.250000 -9721225.033498 -92768.273617 -5.757936e+09 -94930068.000000 -9590192.566618 -184062.020468 -1.034807e+10 -1.279552e+08 -9545922.986910 -139384.968335 -8.751343e+09 -82078711.750000 -9820911.412866 -49430.650173

In [207]:
# From the currently selected feature columns keep those whose name contains
# '4' (the *_cluster_4 features), attach the BIRADS labels and draw the scatter
# matrix. The names are collected into a list because a Python 3 filter()
# object cannot index a DataFrame.
f = features[[name for name in features[columns].columns if '4' in name]].copy()
f['BIRADS'] = class_labels
mia.plotting.plot_scattermatrix(f, 'BIRADS')