In [2]:
%matplotlib qt
import pandas as pd
import numpy as np
import mia
Loading the hologic and synthetic datasets.
In [3]:
# DataFrame.from_csv was deprecated and later removed from pandas (gone in 1.0).
# read_csv(path, index_col=0, parse_dates=True) is the documented replacement and
# keeps from_csv's defaults: first column becomes the index and pandas attempts
# to parse that index as dates.

# Inputs later passed to features_from_lines / features_from_blobs for the
# Hologic set.
hologic = pd.read_csv("/Volumes/Seagate/mmp_data/2015-04-16/real-lines.csv", index_col=0, parse_dates=True)
hologic_blobs = pd.read_csv("/Volumes/Seagate/mmp_data/2015-04-01/hologic.csv", index_col=0, parse_dates=True)

# The same two inputs for the phantom (synthetic) set.
phantom = pd.read_csv("/Volumes/Seagate/mmp_data/2015-04-16/phantom-lines.csv", index_col=0, parse_dates=True)
phantom_blobs = pd.read_csv("/Volumes/Seagate/mmp_data/2015-04-14/synthetics1-blobs.csv", index_col=0, parse_dates=True)
Loading the metadata for the real and synthetic datasets.
In [4]:
# Metadata CSVs that the project helpers combine with each dataset.
birads_csv = "/Volumes/Seagate/mmp_data/meta_data/BIRADS.csv"
synthetic_meta_csv = "/Volumes/Seagate/mmp_data/meta_data/synthetic_meta_data_cleaned.csv"

# NOTE(review): both helpers live in the project package `mia`; the structure of
# what they return is not visible here and should be confirmed there.
hologic_meta = mia.analysis.create_hologic_meta_data(hologic, birads_csv)
phantom_meta = mia.analysis.create_synthetic_meta_data(phantom, synthetic_meta_csv)

# Name the phantom metadata index 'img_name'.
phantom_meta.index.name = 'img_name'
In [5]:
# Class label per dataset: the BIRADS column for the Hologic metadata and the
# 'VBD.1' column for the phantom metadata.
# NOTE(review): drop_duplicates() compares row *values*, not index labels, so two
# rows with identical metadata collapse into one — confirm that is intended.
hologic_labels = hologic_meta.drop_duplicates()['BIRADS']
phantom_labels = phantom_meta.loc[:, 'VBD.1']

# Stack both label series and give the shared index a name.
class_labels = pd.concat([hologic_labels, phantom_labels], axis=0)
class_labels.index.name = "img_name"

# NOTE(review): remove_duplicate_index is a project helper; the [0] presumably
# selects the single label column of its result — confirm against mia.analysis.
labels = mia.analysis.remove_duplicate_index(class_labels)[0]
Create features from the distributions of lines and blobs in each dataset.
In [6]:
# Line features for each dataset; missing values are filled with the mean of the
# column they appear in.
hologic_line_features = mia.analysis.features_from_lines(hologic)
hologic_line_features = hologic_line_features.fillna(hologic_line_features.mean())

phantom_line_features = mia.analysis.features_from_lines(phantom)
phantom_line_features = phantom_line_features.fillna(phantom_line_features.mean())

# Blob features for each dataset (no NaN filling is applied to these).
hologic_blob_features = mia.analysis.features_from_blobs(hologic_blobs)
phantom_blob_features = mia.analysis.features_from_blobs(phantom_blobs)
In [7]:
# Draw a random subset of the phantom images, grouped by phantom name, and pick
# the matching rows of the blob features.
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)

# Tag a *copy* of the line features with the phantom name. Writing the column
# into phantom_line_features itself would leave a non-numeric 'phantom_name'
# column in every later use of that frame (e.g. when it is concatenated with the
# Hologic line features) and make the result depend on whether this cell ran.
# NOTE(review): the names are assigned positionally (tolist()), which assumes
# syn_feature_meta rows are in the same order as phantom_line_features — confirm.
phantom_line_features_named = phantom_line_features.copy()
phantom_line_features_named['phantom_name'] = syn_feature_meta.phantom_name.tolist()

# NOTE(review): create_random_subset is a project helper; if it samples with an
# unseeded RNG the subset will differ between runs — confirm and seed if needed.
phantom_line_features_subset = mia.analysis.create_random_subset(phantom_line_features_named, 'phantom_name')
phantom_blob_features_subset = phantom_blob_features.loc[phantom_line_features_subset.index]
In [9]:
# Stack the Hologic and phantom rows for each feature family.
line_features = pd.concat([hologic_line_features, phantom_line_features], axis=0)
blob_features = pd.concat([hologic_blob_features, phantom_blob_features], axis=0)

# Left-join the line features onto the blob features by index; line columns whose
# names clash with a blob column get the '_line' suffix.
features = blob_features.join(line_features, how='left', rsuffix='_line')

# Row-count sanity check, left disabled as in the original:
# assert features.shape[0] == 366
features.head()
Out[9]:
In [16]:
# Work on an independent copy of the blob features so the source frame is never
# modified by later cells.
selected_features = blob_features.copy(deep=True)
# Optional pruning, left disabled as in the original:
# selected_features.drop(['skew', 'kurtosis', 'min'], inplace=True, axis=1)

# Embed the selected features into two components with the project's isomap helper.
mapping = mia.analysis.isomap(selected_features, n_components=2)
In [52]:
# Persist the current embedding.
# NOTE(review): the file is named "lines-mapping" although `mapping` is computed
# from the blob features in this notebook — confirm the intended file name.
lines_mapping_csv = "/Volumes/Seagate/mmp_data/2015-04-16/lines-mapping.csv"
mapping.to_csv(lines_mapping_csv)
In [ ]:
# Split the Hologic line features by the sign of the first embedding component
# and compare the summary statistics of the two halves.
#
# `mapping` is computed from the combined Hologic + phantom features, so its
# index presumably contains more rows than hologic_line_features. Indexing a
# frame with a boolean Series whose index differs relies on pandas silently
# reindexing the mask (it warns, and raises if any row cannot be matched).
# Restrict the component to the Hologic rows first so the mask is aligned
# explicitly.
hologic_component = mapping.loc[hologic_line_features.index, 0]

left = hologic_line_features[hologic_component < 0]
right = hologic_line_features[hologic_component >= 0]
left.describe() - right.describe()
In [ ]:
# Difference between the summary statistics of the phantom subset and those of
# the Hologic line features (phantom minus Hologic).
phantom_summary = phantom_line_features_subset.describe()
hologic_summary = hologic_line_features.describe()
phantom_summary - hologic_summary
In [18]:
def plot_mapping(m):
    """Scatter-plot a 2D mapping, drawing both datasets on one set of axes.

    Rows of ``m`` are selected by the indexes of the notebook-level frames
    ``hologic_line_features`` and ``phantom_line_features``; each point's label
    is looked up in the notebook-level ``labels`` series. Hologic rows are
    drawn with size 20, phantom rows as '^' markers with size 50.

    Parameters
    ----------
    m : object supporting ``.loc[...]`` by index label
        The mapping to plot; it must contain the indexes of both feature frames.

    Returns
    -------
    None
    """
    real_points = m.loc[hologic_line_features.index]
    synthetic_points = m.loc[phantom_line_features.index]

    real_labels = labels[real_points.index]
    synthetic_labels = labels[synthetic_points.index]

    # The second call receives the axes from the first so both layers share a plot.
    axes = mia.plotting.plot_scatter_2d(real_points, labels=real_labels, s=20)
    mia.plotting.plot_scatter_2d(synthetic_points, labels=synthetic_labels, ax=axes, marker='^', s=50)


plot_mapping(mapping)
In [ ]:
# Write the selected features and the current embedding to disk.
features_csv = "/Volumes/Seagate/mmp_data/features.csv"
mapping_csv = "/Volumes/Seagate/mmp_data/mapping.csv"

selected_features.to_csv(features_csv)
mapping.to_csv(mapping_csv)
In [ ]:
# Reload a previously stored mapping and plot it.
# DataFrame.from_csv was deprecated and later removed from pandas (gone in 1.0);
# read_csv(path, index_col=0, parse_dates=True) is the documented replacement
# with the same defaults.
# NOTE(review): this rebinds `mapping`, replacing the embedding computed earlier
# in the notebook; column labels read back from CSV are strings, not integers.
mapping = pd.read_csv("/Volumes/Seagate/mmp_data/2015-04-01/both_shape_mapping.csv", index_col=0, parse_dates=True)
plot_mapping(mapping)
In [45]:
# RadViz plot of the selected features, coloured by class label.
# pandas moved its plotting helpers from pandas.tools.plotting to pandas.plotting
# (0.20) and later removed the old location; import from whichever exists so the
# cell runs on both old and current pandas.
try:
    from pandas.plotting import radviz
except ImportError:  # older pandas that only ships pandas.tools.plotting
    from pandas.tools.plotting import radviz

# Work on a copy so selected_features keeps its original columns.
m = selected_features.copy()
# Attach the class label; assignment aligns `labels` on the index.
m['class'] = labels
# Leave these two columns out of the plot (non-inplace drop keeps the cell
# re-runnable without side effects on shared objects).
m = m.drop(['25%', 'upper_dist_count'], axis=1)
radviz(m, 'class')
Out[45]: