In [113]:
%matplotlib qt
import pandas as pd
import numpy as np
import re
import mia
In [104]:
# Load the texture-cluster feature table for the real images.
# pd.read_csv(..., index_col=0, parse_dates=True) is the documented equivalent of
# the removed DataFrame.from_csv: the first CSV column becomes the index.
hologic_cluster = pd.read_csv('../2015-03-28-real-texture-cluster.csv',
                              index_col=0, parse_dates=True)
hologic_cluster.head()
Out[104]:
In [105]:
# Build the metadata frame for the real images from the feature frame and the
# BIRADS CSV. Later cells read its `BIRADS` and `img_name` columns.
hologic_meta = mia.analysis.create_hologic_meta_data(hologic_cluster, '../data/BIRADS.csv')
hologic_meta.head()
Out[105]:
In [185]:
# Names of every feature column containing 'homogeneity'.
# Built as a list (not a lazy `filter` object) so it can be used as a DataFrame
# column indexer and reused by later cells on Python 3 as well as Python 2.
columns = [name for name in hologic_cluster.columns if 'homogeneity' in name]
Scatter matrix of the clusters. From this it is noted that homogeneity is the major cause of the splitting, as it is bimodal across all clusters. Contrast and dissimilarity show correlations from high to low risk. Energy also shows this trend, but its correlation is weaker.
In [194]:
# Scatter matrix of the feature columns whose name contains '1', with the BIRADS
# label attached and passed as the class column.
# The column selection is a list comprehension because a lazy `filter` object is
# not a valid DataFrame indexer on Python 3.
hc = hologic_cluster[[name for name in hologic_cluster.columns if '1' in name]].copy()
hc['BIRADS'] = hologic_meta.BIRADS
mia.plotting.plot_scattermatrix(hc, 'BIRADS')
The bimodality of homogeneity can be seen more clearly when it is plotted as a histogram for each cluster:
In [193]:
%matplotlib inline
hc = hologic_cluster.copy()
hc['class'] = hologic_meta.BIRADS
mia.plotting.plot_risk_classes(hc, 'homogeneity_cluster_4')
In [177]:
# Embed the selected feature columns into two dimensions with mia's t-SNE wrapper.
# NOTE(review): t-SNE is normally stochastic and no seed is passed here — confirm
# whether mia.analysis.tSNE fixes one; otherwise each re-run gives a different map.
mapping = mia.analysis.tSNE(hologic_cluster[columns], n_components=2, verbose=2, learning_rate=300)
In [178]:
# Scatter plot of embedding columns 0 and 1, with the BIRADS series as the label.
# NOTE(review): the label is passed as a Series here but as the column name
# 'BIRADS' in the joint real/phantom plot later on — confirm both forms are supported.
mia.plotting.plot_scatter_2d(mapping, [0,1], hologic_meta.BIRADS)
Out[178]:
In [196]:
# Mean of mia's closeness measure over the embedding. `img_name` is passed as
# the second argument — presumably the key that identifies related images
# (TODO confirm against mia.analysis.measure_closeness).
mia.analysis.measure_closeness(mapping, hologic_meta.img_name).mean()
Out[196]:
In [106]:
# Split the real images by the sign of the first embedding coordinate.
left_cluster = hologic_cluster[mapping[0] < 0]
right_cluster = hologic_cluster[mapping[0] >= 0]

# Compare the rows whose index label contains '61246' with the BIRADS 1 rows by
# differencing their describe() summaries.
# `mask` is a concrete list of labels: a lazy `filter` object is not accepted by
# .loc on Python 3.
mask = [label for label in mapping.index if '61246' in label]
hologic_cluster.loc[mask].describe() - hologic_cluster[hologic_meta.BIRADS == 1].describe()
Out[106]:
Looking at the most significant features
In [156]:
# RadViz of the normalised features, grouped by BIRADS class.
hc = mia.analysis.normalize_data_frame(hologic_cluster)
# Restore the original feature names on the normalised frame.
hc.columns = hologic_cluster.columns
hc['BIRADS'] = hologic_meta.BIRADS
# The plotting helpers moved from pandas.tools.plotting to pandas.plotting in
# pandas 0.20; the old module path no longer exists.
pd.plotting.radviz(hc, 'BIRADS')
Out[156]:
In [73]:
# Attach the BIRADS label to the embedding and export it as JSON.
# NOTE(review): this adds a 'class' column to `mapping` in place, so any cell run
# afterwards (e.g. the CSV export of `mapping`) sees the extra column.
mapping['class'] = hologic_meta['BIRADS']
mia.io_tools.dump_mapping_to_json(mapping, [0,1], '../mapping_viz/data.json')
In [49]:
# Save the embedding frame (index included) as CSV in the parent directory.
mapping.to_csv('../2015-03-28-real-texture-cluster-mapping.csv')
In [109]:
# Load the texture-cluster feature table for the phantoms.
# pd.read_csv(..., index_col=0, parse_dates=True) is the documented equivalent of
# the removed DataFrame.from_csv: the first CSV column becomes the index.
phantoms = pd.read_csv('../2015-03-28-phantom-texture-cluster.csv',
                       index_col=0, parse_dates=True)
phantoms.head()
Out[109]:
In [117]:
# Build the metadata frame for the phantoms.
# NOTE(review): absolute path to an external volume — this cell only runs on a
# machine where that drive is mounted; consider a configurable data directory.
phantom_meta = mia.analysis.create_synthetic_meta_data(phantoms, '/Volumes/Seagate/2015-03-26/synthetic_meta_data_cleaned.csv')
# Replace the non-specific BIRADS labels with a single numeric class and make the
# column numeric. The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on `phantom_meta.BIRADS`: that chained in-place call
# may operate on a temporary and leave the frame unchanged in current pandas.
phantom_meta['BIRADS'] = (
    phantom_meta['BIRADS']
    .replace('3 or 4', 4)
    .replace(re.compile(r'2 \([a-z]+\)'), 2)
    .astype(float)
)
phantom_meta.head()
Out[117]:
Select a random subset of the phantoms (one randomly chosen row per phantom name) for use with the t-SNE algorithm.
In [122]:
import random
# Group the phantom feature rows by the `phantom_name` series of the metadata
# frame (an external key, matched to the feature rows by index).
group = phantoms.groupby(phantom_meta.phantom_name)
def select_random(x):
    """Return one randomly chosen row of ``x`` as a single-row DataFrame.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame (e.g. one group of a groupby) to draw a row from.

    Returns
    -------
    pandas.DataFrame
        The row(s) of ``x`` carrying the sampled index label.

    Raises
    ------
    ValueError
        If ``x`` is empty (raised by ``random.sample``).
    """
    # random.sample needs a real sequence; a pandas Index is rejected on Python 3.
    chosen = random.sample(list(x.index), 1)
    # .loc is the label-based replacement for the removed .ix indexer.
    return x.loc[chosen]
# Draw one row per phantom group, then drop index level 0 of the result
# (presumably the group key added by apply) so that the remaining index is what
# later cells use to look up rows in `phantom_meta`.
random_synthetic_features = (
    group
    .apply(select_random)
    .reset_index(level=0, drop=True)
)
random_synthetic_features
Out[122]:
In [132]:
# Stack the real feature rows on top of the sampled phantom rows.
features = pd.concat([hologic_cluster, random_synthetic_features])
# Labels in the same order: real BIRADS first, then the BIRADS of exactly the
# phantom rows that were sampled.
class_labels = pd.concat([
    hologic_meta.BIRADS,
    phantom_meta.loc[random_synthetic_features.index, 'BIRADS'],
])
class_labels.shape
Out[132]:
In [213]:
# Every feature column EXCEPT the homogeneity ones.
# NOTE: this rebinds `columns`, which an earlier cell set to the homogeneity columns.
# Built as a list (not a lazy `filter` object) so it can be used as a DataFrame
# column indexer more than once on Python 3 as well as Python 2.
columns = [name for name in features.columns if 'homogeneity' not in name]
In [203]:
# t-SNE embedding of the real and phantom rows together, using the columns
# selected in `columns`.
# NOTE(review): no seed is passed — confirm whether mia.analysis.tSNE fixes one;
# otherwise the embedding differs between runs.
joint_mapping = mia.analysis.tSNE(features[columns], verbose=2, learning_rate=300)
Plotting both the real images and the phantoms together in a single scatter plot:
In [212]:
%matplotlib qt
joint_mapping['BIRADS'] = class_labels
hol_map = joint_mapping[:-6]
hol_map.shape
syn_map = joint_mapping[-6:]
syn_map.head()
ax = mia.plotting.plot_scatter_2d(hol_map, [0,1], 'BIRADS')
ax = mia.plotting.plot_scatter_2d(syn_map, [0,1], 'BIRADS', ax=ax, marker='^', s=50)
In [198]:
# Difference between the summary statistics (describe()) of the sampled phantom
# features and those of the real features, column by column.
random_synthetic_features.describe() - hologic_cluster.describe()
Out[198]:
In [207]:
# Scatter matrix of the selected (non-homogeneity) columns whose name contains
# '4', for real and phantom rows together, with BIRADS as the class column.
# The selection is a list comprehension because a lazy `filter` object is not a
# valid DataFrame indexer on Python 3.
f = features[[name for name in features[columns].columns if '4' in name]].copy()
f['BIRADS'] = class_labels
mia.plotting.plot_scattermatrix(f, 'BIRADS')