In [37]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from scipy.spatial import distance
import ml.Features as ft
import matplotlib.pyplot as plt

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs  # samples_generator was removed in newer scikit-learn
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition # PCA
from sklearn.metrics import confusion_matrix

In [38]:
MAX_HEIGHT = 203
MAX_WIDTH = 142
SPEED = 3
SAMPLING_RATE = 8

In [39]:
def convert_file_to_data_frame(filename, id):
    # Each line of the input file is a 'KEY=VALUE' reading (UT, UL, UR, ...).
    # A repeated key marks the start of the next reading block, so the block
    # collected so far is turned into one (height, width) sample.
    readings = {}
    data = []
    with open(filename, 'r') as my_file:
        for line in my_file:
            key = line.split('=')[0].rstrip()
            val = line.split('=')[1].rstrip()
            if key in readings:
                # we probably have all of them at this point
                height = MAX_HEIGHT - readings['UT']
                if height < 5:
                    height = np.nan
                width = np.nan
                if 'UL' in readings and 'UR' in readings:
                    if readings['UL'] > 140 or readings['UR'] > 140:
                        width = np.nan
                    else:
                        width = MAX_WIDTH - readings['UL'] - readings['UR']
                data.append([height, width])
                readings = {}
            else:
                readings[key] = float(val)
    frame = DataFrame(data, columns=['height', 'width'])
    frame['id'] = id
    return frame

def get_frame(path):
    # Parse u1.dat .. u20.dat and stack them into a single frame.
    result = []
    for id in range(1, 21):
        filename = path + 'u%d.dat' % id
        frame = convert_file_to_data_frame(filename, id)
        result.append(frame)
    frame = pd.concat(result, ignore_index=True)
    return frame
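
The parser above implies a simple line-oriented format for the u*.dat files: one KEY=VALUE reading per line, where UT is subtracted from MAX_HEIGHT and UL/UR from MAX_WIDTH (presumably top, left and right distance readings), and a repeated key marks the start of the next block. As a quick sanity check, here is a minimal sketch that writes a tiny synthetic file in that assumed format and runs it through the parser; the path and the values are made up purely for illustration.

In [ ]:
# Sketch: exercise the parser on a tiny synthetic log (assumed format, made-up values).
sample = "UT=60\nUL=30\nUR=40\nUT=200\n"
with open('/tmp/u_demo.dat', 'w') as f:
    f.write(sample)
demo = convert_file_to_data_frame('/tmp/u_demo.dat', id=99)
print(demo)  # one record: height = 203 - 60 = 143, width = 142 - 30 - 40 = 72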

In [40]:
frame = get_frame('../../data/')
frame['event'] = float(-1)

In [41]:
event_count = 1
max_id = frame['id'].max() + 1
for id in range(1, 21):
    # Candidate rows for this user: plausible height and width readings.
    res = frame[(frame['height'] > 50) & (frame['id'] == id) & (frame['width'] > 10)]
    prev_index = 0
    for row in res.itertuples():
        # Rows within 3 samples of the previous hit belong to the same event;
        # a larger gap starts a new event.
        if prev_index == 0 or row.Index - prev_index <= 3:
            frame.at[row.Index, 'event'] = event_count
        else:
            event_count += 1
            frame.at[row.Index, 'event'] = event_count
        prev_index = row.Index
    event_count += 1
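
The loop above segments the filtered rows into "events": qualifying samples from the same user whose row indices are at most 3 apart share an event number, and a larger gap (or a new user) starts a new event. Purely as an illustration, the same run-based grouping can be written with pandas primitives; the sketch below uses a hypothetical event_alt column and its numbering is not guaranteed to match the loop exactly.

In [ ]:
# Sketch: vectorized run detection (illustrative only; writes a hypothetical 'event_alt' column).
passes = frame[(frame['height'] > 50) & (frame['width'] > 10)]
# A new run starts when the user changes or the row index jumps by more than 3 samples.
new_run = passes['id'].diff().ne(0) | passes.index.to_series().diff().gt(3)
frame['event_alt'] = float(-1)
frame.loc[passes.index, 'event_alt'] = new_run.cumsum().astype(float)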

In [42]:
first_event = int(frame[frame['event'] > -1]['event'].min())
last_event = int(frame[frame['event'] > -1]['event'].max())
columns = ['mean_height','min_height','max_height','mean_width','min_width','max_width','time','girth','id']
lines = []
index = []
for event_num in range(first_event,last_event + 1):
    data = frame[frame['event'] == event_num]
    line = []
    line.append(ft.extract_mean_height(data))
    line.extend(ft.extract_min_max_height(data))
    line.append(ft.extract_mean_width(data))
    line.extend(ft.extract_min_max_width(data))
    line.append(ft.extract_time(data,sampling_rate=SAMPLING_RATE,speed=SPEED))
    line.append(ft.extract_girth(data,SAMPLING_RATE,SPEED))
    line.append(data['id'].iloc[0])
    index.append(event_num)
    lines.append(line)
features = DataFrame(lines, index=index, columns=columns)
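
The ml.Features module is not shown in this notebook. Purely for orientation, the simpler helpers are presumably per-event aggregations along the following lines (hypothetical sketches, not the actual implementations); extract_time and extract_girth additionally depend on SAMPLING_RATE and SPEED and are not sketched here.

In [ ]:
# Hypothetical sketches of the simpler ml.Features helpers (the real module is not shown).
def extract_mean_height(data):
    return data['height'].mean()

def extract_min_max_height(data):
    return [data['height'].min(), data['height'].max()]

def extract_mean_width(data):
    return data['width'].mean()

def extract_min_max_width(data):
    return [data['width'].min(), data['width'].max()]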

In [91]:
X = features[['mean_height','time','girth']]
labels_true = features['id']
X = StandardScaler().fit_transform(X)
# Compute DBSCAN
db = DBSCAN(eps=0.47, min_samples=1).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
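
Two things are worth noting about this configuration: with min_samples=1 every point is a core sample, so DBSCAN produces no noise points here, and eps=0.47 is presumably hand-tuned for the scaled features. One common way to sanity-check an eps value is to look for a knee in the sorted nearest-neighbour distances; a minimal sketch:

In [ ]:
# Sketch: k-distance plot to sanity-check eps (k tied to min_samples).
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=2).fit(X)   # self + 1 nearest neighbour
distances, _ = nbrs.kneighbors(X)
plt.plot(np.sort(distances[:, -1]))
plt.xlabel('points sorted by distance')
plt.ylabel('distance to nearest neighbour')
plt.show()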

In [101]:
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
      % metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
      % metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, labels))


Estimated number of clusters: 20
Homogeneity: 0.685
Completeness: 0.778
V-measure: 0.728
Adjusted Rand Index: 0.055
Adjusted Mutual Information: 0.090
Silhouette Coefficient: 0.492

In [98]:
# Black is reserved for noise points (label -1).
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 2], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 2], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()



In [94]:
# RandomizedPCA was removed from scikit-learn; PCA(svd_solver='randomized') replaces it.
# The original cell asked for 20 components, but the feature table only has 9 columns.
pca = decomposition.PCA(n_components=9, svd_solver='randomized')
pca.fit(features)
pca.components_.shape


Out[94]:
(9, 9)
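
With all 9 components retained, the explained variance ratios show how much structure each one carries. Note that the fit is on the raw, unscaled feature table (including the id column), so columns with the largest numeric range dominate the leading components.

In [ ]:
# Variance explained by each principal component (the ratios sum to 1 over all 9).
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.cumsum())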

In [95]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(cm.shape[0])  # one tick per class in the matrix
    plt.xticks(tick_marks, rotation=45)
    plt.yticks(tick_marks)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

cm = confusion_matrix(labels_true, labels)  # rows: true user ids, columns: predicted cluster labels
np.set_printoptions(precision=2)
#print('Confusion matrix, without normalization')
#print(cm)
plt.figure()
plot_confusion_matrix(cm)
# Normalize the confusion matrix by row (i.e. by the number of samples
# in each true class).
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#print('Normalized confusion matrix')
#print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')

plt.show()



In [ ]: