In [37]:
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from scipy.spatial import distance
import ml.Features as ft
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs  # formerly sklearn.datasets.samples_generator
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition # PCA
from sklearn.metrics import confusion_matrix
In [38]:
MAX_HEIGHT = 203
MAX_WIDTH = 142
SPEED = 3
SAMPLING_RATE = 8
In [39]:
def convert_file_to_data_frame(filename, id):
    with open(filename, 'r') as my_file:
        lines = my_file.readlines()
    readings = {}
    data = []
    for line in lines:
        key = line.split('=')[0].rstrip()
        val = line.split('=')[1].rstrip()
        if key in readings:
            # a repeated key means we probably have all of them at this point,
            # so turn the accumulated readings into one (height, width) sample
            height = MAX_HEIGHT - readings['UT']
            if height < 5:
                height = np.nan
            width = np.nan
            if 'UL' in readings and 'UR' in readings:
                if readings['UL'] > 140 or readings['UR'] > 140:
                    width = np.nan
                else:
                    width = MAX_WIDTH - readings['UL'] - readings['UR']
            data.append([height, width])
            readings = {}
        else:
            readings[key] = float(val)
    frame = DataFrame(data, columns=['height', 'width'])
    frame['id'] = id
    return frame
def get_frame(path):
    result = []
    for id in range(1, 21):
        filename = path + 'u%d.dat' % id
        frame = convert_file_to_data_frame(filename, id)
        result.append(frame)
    frame = pd.concat(result, ignore_index=True)
    return frame
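The parser above assumes each u%d.dat file is a plain-text stream of "KEY = value" lines, where readings such as UT, UL and UR (presumably top, left and right sensor distances) repeat once per sample and a repeated key marks the start of the next sample. A minimal sketch of that assumed layout and a direct call, with made-up values:
# assumed fragment of a u1.dat file (values are illustrative only):
#   UT = 60.0
#   UL = 50.0
#   UR = 48.0
#   UT = 58.0
#   ...
frame1 = convert_file_to_data_frame('../../data/u1.dat', 1)
frame1.head()  # columns: height, width, id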
In [40]:
frame = get_frame('../../data/')
frame['event'] = float(-1)
In [41]:
event_count = 1
max_id = frame['id'].max() + 1
for id in range(1, 21):
    res = frame[(frame['height'] > 50) & (frame['id'] == id) & (frame['width'] > 10)]
    prev_index = 0
    for row in res.itertuples():
        # rows within 3 samples of each other belong to the same event
        if prev_index == 0 or row.Index - prev_index <= 3:
            frame.at[row.Index, 'event'] = event_count
        else:
            event_count += 1
            frame.at[row.Index, 'event'] = event_count
        prev_index = row.Index
    event_count += 1
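The loop above stitches consecutive above-threshold rows (height > 50, width > 10, gaps of at most 3 samples) into numbered events per person. As a quick sanity check, not part of the original notebook, the number of events found for each id can be inspected like this:
# distinct events assigned to each person (rows left at -1 carry no event)
frame[frame['event'] > -1].groupby('id')['event'].nunique()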
In [42]:
first_event = int(frame[frame['event'] > -1]['event'].min())
last_event = int(frame[frame['event'] > -1]['event'].max())
columns = ['mean_height', 'min_height', 'max_height', 'mean_width', 'min_width', 'max_width', 'time', 'girth', 'id']
lines = []
index = []
# build one feature row per detected event
for event_num in range(first_event, last_event + 1):
    data = frame[frame['event'] == event_num]
    line = []
    line.append(ft.extract_mean_height(data))
    line.extend(ft.extract_min_max_height(data))
    line.append(ft.extract_mean_width(data))
    line.extend(ft.extract_min_max_width(data))
    line.append(ft.extract_time(data, sampling_rate=SAMPLING_RATE, speed=SPEED))
    line.append(ft.extract_girth(data, SAMPLING_RATE, SPEED))
    line.append(data['id'].iloc[0])
    index.append(event_num)
    lines.append(line)
features = DataFrame(lines, index=index, columns=columns)
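ml.Features is a local helper module that is not included in this notebook. Judging only from how it is called above, the simpler helpers are presumably thin wrappers over the per-event DataFrame; a hypothetical sketch follows (not the module's actual code, and the time/girth helpers are deliberately omitted because their use of sampling_rate and speed cannot be inferred here):
def extract_mean_height(data):
    # mean of the height readings within one event
    return data['height'].mean()

def extract_min_max_height(data):
    # returned as a pair so that line.extend(...) above yields two columns
    return [data['height'].min(), data['height'].max()]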
In [91]:
X = features[['mean_height','time','girth']]
labels_true = features['id']
X = StandardScaler().fit_transform(X)
# Compute DBSCAN
db = DBSCAN(eps=0.47, min_samples=1).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_
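The eps value of 0.47 is specific to this standardized three-feature space. One common way to choose it, not shown in the original notebook, is to plot each point's distance to its nearest neighbour and read eps off the elbow of the sorted curve:
from sklearn.neighbors import NearestNeighbors

# distance from every event to its nearest other event in the scaled feature space
nn = NearestNeighbors(n_neighbors=2).fit(X)
distances, _ = nn.kneighbors(X)
plt.plot(np.sort(distances[:, 1]))
plt.ylabel('distance to nearest neighbour')
plt.show()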
In [101]:
# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
print("Adjusted Rand Index: %0.3f"
% metrics.adjusted_rand_score(labels_true, labels))
print("Adjusted Mutual Information: %0.3f"
% metrics.adjusted_mutual_info_score(labels_true, labels))
print("Silhouette Coefficient: %0.3f"
% metrics.silhouette_score(X, labels))
In [98]:
# Black removed and is used for noise instead.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = 'k'

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 2], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 2], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
In [94]:
pca = decomposition.RandomizedPCA(n_components=20)
pca.fit(features)
pca.components_.shape
Out[94]:
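RandomizedPCA has since been removed from scikit-learn; newer versions expose the same randomized solver through PCA(svd_solver='randomized'). Note also that the features frame has only 9 columns, so at most 9 components can actually be extracted. A rough modern equivalent under those assumptions:
pca = decomposition.PCA(n_components=9, svd_solver='randomized')
pca.fit(features)
print(pca.components_.shape)          # (9, 9)
print(pca.explained_variance_ratio_)  # variance captured by each component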
In [95]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(20)
    plt.xticks(tick_marks, rotation=45)
    plt.yticks(tick_marks)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# ground-truth ids first, cluster labels second, so the rows match 'True label'
cm = confusion_matrix(labels_true, labels)
np.set_printoptions(precision=2)
#print('Confusion matrix, without normalization')
#print(cm)
plt.figure()
plot_confusion_matrix(cm)

# Normalize the confusion matrix by row (i.e. by the number of samples
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#print('Normalized confusion matrix')
#print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix')
plt.show()