In [2]:
%matplotlib qt
import pandas as pd
import numpy as np
import mia

Loading and Preprocessing

Loading the line and blob detections for the Hologic (real) and synthetic (phantom) datasets.


In [3]:
# Load the four detection tables used throughout the notebook.
#
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv(..., index_col=0, parse_dates=True) is the documented equivalent
# (first column becomes the index, date parsing of the index is attempted).
#
# NOTE(review): the paths point at a machine-specific external volume, so this
# cell only runs where that volume is mounted.
hologic = pd.read_csv(
    "/Volumes/Seagate/mmp_data/2015-04-16/real-lines.csv",
    index_col=0, parse_dates=True)
hologic_blobs = pd.read_csv(
    "/Volumes/Seagate/mmp_data/2015-04-01/hologic.csv",
    index_col=0, parse_dates=True)
phantom = pd.read_csv(
    "/Volumes/Seagate/mmp_data/2015-04-16/phantom-lines.csv",
    index_col=0, parse_dates=True)
phantom_blobs = pd.read_csv(
    "/Volumes/Seagate/mmp_data/2015-04-14/synthetics1-blobs.csv",
    index_col=0, parse_dates=True)

Loading the metadata for the real and synthetic datasets.


In [4]:
# Build one metadata table per dataset from its detections frame and the
# matching metadata CSV.
hologic_meta = mia.analysis.create_hologic_meta_data(
    hologic,
    "/Volumes/Seagate/mmp_data/meta_data/BIRADS.csv",
)
phantom_meta = mia.analysis.create_synthetic_meta_data(
    phantom,
    "/Volumes/Seagate/mmp_data/meta_data/synthetic_meta_data_cleaned.csv",
)

# Name the phantom metadata index explicitly.
phantom_meta.index.name = "img_name"

In [5]:
# Per-image class labels: the BIRADS column for the hologic images and the
# 'VBD.1' column for the phantoms.
hologic_labels = hologic_meta.drop_duplicates()["BIRADS"]
phantom_labels = phantom_meta["VBD.1"]

# Stack both label series into one, indexed by image name.
class_labels = pd.concat([hologic_labels, phantom_labels])
class_labels.index.name = "img_name"

# NOTE(review): presumably remove_duplicate_index returns a frame whose
# column 0 holds the label values - confirm against mia.analysis.
labels = mia.analysis.remove_duplicate_index(class_labels)[0]

Create line and blob features from the distributions of detected lines and blobs


In [6]:
# Hologic (real) images: line features with missing values replaced by the
# column mean, plus blob features.
hologic_line_features = mia.analysis.features_from_lines(hologic)
hologic_line_features = hologic_line_features.fillna(hologic_line_features.mean())
hologic_blob_features = mia.analysis.features_from_blobs(hologic_blobs)

# Phantom (synthetic) images: same treatment.
phantom_line_features = mia.analysis.features_from_lines(phantom)
phantom_line_features = phantom_line_features.fillna(phantom_line_features.mean())
phantom_blob_features = mia.analysis.features_from_blobs(phantom_blobs)

In [7]:
# One metadata row per phantom image.
syn_feature_meta = mia.analysis.remove_duplicate_index(phantom_meta)

# Attach the phantom name to every line-feature row. The assignment is
# positional (a plain list), so it assumes syn_feature_meta and
# phantom_line_features have the same length and row order - TODO confirm.
# This also adds a 'phantom_name' column to phantom_line_features itself.
phantom_line_features["phantom_name"] = syn_feature_meta["phantom_name"].tolist()

# NOTE(review): create_random_subset presumably samples per 'phantom_name';
# no random seed is set in the visible cells, so the subset may differ
# between runs - confirm.
phantom_line_features_subset = mia.analysis.create_random_subset(
    phantom_line_features,
    "phantom_name",
)

# Select the blob features for the same images as the line-feature subset.
phantom_blob_features_subset = phantom_blob_features.loc[
    phantom_line_features_subset.index
]

In [9]:
# Stack the hologic and phantom rows for each feature family.
line_features = pd.concat(
    [hologic_line_features, phantom_line_features],
    axis=0,
)
blob_features = pd.concat(
    [hologic_blob_features, phantom_blob_features],
    axis=0,
)

# Join the two families on the index; line-feature columns whose names clash
# with a blob-feature column receive the '_line' suffix.
features = blob_features.join(line_features, rsuffix="_line")

# Disabled sanity check kept for reference: features.shape[0] == 366
features.head(5)


Out[9]:
blob_count avg_radius std_radius min_radius max_radius small_radius_count med_radius_count large_radius_count density lower_radius_qt ... 75% count kurtosis max mean min phantom_name skew std upper_dist_count_line
p214-010-60001-cl.png 56 22.121831 22.923389 8 128.000000 52 1 3 52.940812 8 ... 137.25 72 26.793023 1744 161.791667 1 NaN 4.786025 245.194659 16
p214-010-60001-cr.png 78 19.054538 17.506086 8 90.509668 68 4 6 40.749811 8 ... 152.00 57 50.202367 3494 176.421053 1 NaN 6.905092 460.921203 13
p214-010-60001-ml.png 98 20.011191 21.876304 8 128.000000 90 3 5 42.644057 8 ... 210.50 72 13.937538 1662 200.708333 1 NaN 3.484285 296.524822 19
p214-010-60001-mr.png 139 15.309764 15.307860 8 128.000000 136 1 2 38.287439 8 ... 192.25 74 22.429202 2522 203.459459 1 NaN 4.309621 371.688380 16
p214-010-60005-cl.png 97 20.132590 23.255605 8 181.019336 94 2 1 41.456308 8 ... 170.75 114 9.028130 911 143.184211 1 NaN 2.667624 152.914376 36

5 rows × 24 columns

t-SNE

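Running a manifold embedding to obtain a lower dimensional (two-component) representation. Note: although this section is titled t-SNE, the cell below calls `mia.analysis.isomap`.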


In [16]:
# Embed a copy of the blob features so blob_features itself stays untouched.
selected_features = blob_features.copy(deep=True)

# Optional column filter, currently disabled:
#   selected_features.drop(['skew', 'kurtosis', 'min'], inplace=True, axis=1)

# Two-component embedding of the selected features.
mapping = mia.analysis.isomap(
    selected_features,
    n_components=2,
)

In [52]:
# Persist the current embedding to disk.
# NOTE(review): in the visible cell above, `mapping` is computed from
# blob_features, while this file is named "lines-mapping.csv" - confirm the
# intended contents before relying on the file name.
mapping.to_csv("/Volumes/Seagate/mmp_data/2015-04-16/lines-mapping.csv")

In [ ]:
# Split the hologic line features by the sign of the first embedding
# component and compare the summary statistics of the two halves.
#
# `mapping` covers hologic and phantom rows, whereas hologic_line_features
# covers hologic rows only. Restrict the component to the hologic rows first
# so that the boolean mask shares the frame's index, instead of relying on
# pandas to reindex a mismatched mask implicitly.
hologic_first_component = mapping.loc[hologic_line_features.index, 0]

left = hologic_line_features[hologic_first_component < 0]
right = hologic_line_features[hologic_first_component >= 0]

# Element-wise difference of the describe() tables (left minus right).
left.describe() - right.describe()

In [ ]:
# Difference of summary statistics: phantom subset minus hologic line features.
phantom_line_features_subset.describe().sub(hologic_line_features.describe())

In [18]:
def plot_mapping(m):
    """Scatter-plot a mapping with hologic and phantom rows on shared axes.

    Rows of ``m`` are selected with the indexes of the notebook-level
    ``hologic_line_features`` and ``phantom_line_features`` frames, and each
    group is passed to ``mia.plotting.plot_scatter_2d`` together with the
    matching entries of the notebook-level ``labels``.

    Parameters
    ----------
    m : object supporting ``.loc`` row selection by those indexes
        (presumably the frame returned by ``mia.analysis.isomap`` - confirm).

    Returns
    -------
    None
    """
    real_points = m.loc[hologic_line_features.index]
    synthetic_points = m.loc[phantom_line_features.index]

    real_classes = labels[real_points.index]
    synthetic_classes = labels[synthetic_points.index]

    # Hologic rows first, with small markers.
    ax = mia.plotting.plot_scatter_2d(real_points, labels=real_classes, s=20)
    # Phantom rows go onto the axes returned above, as larger triangles.
    ax = mia.plotting.plot_scatter_2d(
        synthetic_points,
        labels=synthetic_classes,
        ax=ax,
        marker='^',
        s=50,
    )

plot_mapping(mapping)

In [ ]:
# Write the embedded feature table and the embedding itself to disk.
for frame, csv_path in (
    (selected_features, "/Volumes/Seagate/mmp_data/features.csv"),
    (mapping, "/Volumes/Seagate/mmp_data/mapping.csv"),
):
    frame.to_csv(csv_path)

Analysis


In [ ]:
# Replace the in-memory embedding with a previously saved one and plot it.
#
# DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv(..., index_col=0, parse_dates=True) is the documented equivalent.
# Note that this rebinds `mapping`, so later cells see the reloaded frame.
mapping = pd.read_csv(
    "/Volumes/Seagate/mmp_data/2015-04-01/both_shape_mapping.csv",
    index_col=0, parse_dates=True)
plot_mapping(mapping)

In [45]:
# RadViz plot of the selected features, grouped by class label.
#
# pandas.tools.plotting was deprecated in pandas 0.20 and later removed;
# pandas.plotting is its replacement. Fall back to the old location so the
# cell still runs on installations that predate the move.
try:
    from pandas.plotting import radviz
except ImportError:
    from pandas.tools.plotting import radviz

m = selected_features.copy()
# If `labels` is a Series, pandas aligns it on the frame's index here.
m['class'] = labels

# NOTE(review): in the visible cells selected_features is a copy of
# blob_features, while '25%' looks like a line-feature column - confirm both
# labels exist at this point, since drop raises on a missing label.
m = m.drop(['25%', 'upper_dist_count'], axis=1)

radviz(m, 'class')


Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ca64350>