In [36]:
"""
Perform simple KMeans clustering of time-freq waterfall data
using the mean and standard deviation as simple features in
both time and frequency dimensions and plot the results.
Expects to read numpy arrays from disk which can be prepared
prior to running this notebook from filterbank format data using
prepare_binned_data.ipynb notebook.
Tests here were performed on GBT data.
"""
%matplotlib inline
from __future__ import print_function
import glob
import numpy as np
import pylab as pl
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
In [37]:
# USER PARAMS
# Directory that is searched (non-recursively) for the prepared *.npy
# arrays.  A relative path is resolved against the kernel's current
# working directory.
data_dir = '../../data'
In [38]:
# CALCULATE SOME SIMPLE FEATURES TO BE USED IN THE CLUSTERING
# There is scope here for more complicated features to be added
# later.
#
# Outputs of this cell, used by the cells below:
#   files    - list of the .npy paths that were accepted
#   features - float array of shape (num_files, 2*freq_dim + 2*time_dim);
#              row k was computed from files[k]

# glob returns matches in arbitrary order; sort so that the row order of
# `features` (and the matching order of `files`) is stable between runs.
dir_files = sorted(glob.glob(data_dir+'/*.npy'))

# will only process files with these dimensions
# (go for uniformity in size for now)
time_dim = 277
freq_dim = 64

files = []
feature_rows = []
for f in dir_files:
    # load numpy array from disk (once: the same array is used for the
    # shape check and for the feature calculation)
    d = np.load(f)
    # Compare the full shape tuple so that arrays which are not exactly
    # 2-D (time_dim, freq_dim) are skipped instead of raising IndexError
    # or slipping through with extra trailing dimensions.
    if np.shape(d) != (time_dim, freq_dim):
        continue
    # calculate features over the time axis
    time_mean = d.mean(axis=0)  # produces array of freq_dim values
    time_std = d.std(axis=0)
    # calculate features over the freq axis
    freq_mean = d.mean(axis=1)  # produces array of time_dim values
    freq_std = d.std(axis=1)
    # (FUTURE: add more feature calculations here and add results to line below)
    feature_rows.append(np.concatenate((time_mean, time_std, freq_mean, freq_std)))
    files.append(f)

num_files = len(files)
print("To process", num_files, "files of dim", time_dim, freq_dim)

# Stack the per-file vectors into one float matrix.  The explicit reshape
# keeps the (0, n_features) shape when no file matched.
features = np.array(feature_rows, dtype=float).reshape(
    num_files, time_dim*2 + freq_dim*2)
print("shape of feature array=",features.shape)
In [39]:
# clustering
n_clusters = 5  # don't exceed colour map below for now :)
# Fixed seed: KMeans initialisation is random, and the per-class cells
# further down refer to clusters by label number, so the numbering should
# be reproducible from run to run.
km = KMeans(n_clusters=n_clusters, random_state=0)
km.fit(features)

# plot with dimensionality reduction
pca = PCA(n_components=2)
pca.fit(features)
# Project all feature vectors in a single call.  transform() expects a
# 2-D (n_samples, n_features) array; handing it one row at a time
# (features[i], which is 1-D) is rejected by current scikit-learn.
projected = pca.transform(features)

colours = ['b', 'g','r','c','m','y']
assert n_clusters <= len(colours), "not enough colours for n_clusters"

# One colour per point, chosen by the cluster label KMeans assigned to it.
point_colours = [colours[label] for label in km.labels_]

pl.figure(figsize=(8,8))
pl.scatter(projected[:, 0], projected[:, 1], c=point_colours)
pl.xlabel("PCA component 1")
pl.ylabel("PCA component 2")
pl.title("KMeans clusters in PCA projection of the features")
pl.show()
# Should see some fairly obvious clusters below with some outliers.
# The clusters found by KMeans are identified with the different
# colours.
In [40]:
# Show all the images associated with each class.
# TODO: Probably nicer to do this in a more compact
# way with subplots in future.
# CLASS 0
# `files` and `km.labels_` are index-aligned (row k of the feature matrix
# was computed from files[k]), so they can be walked together without a
# hand-maintained counter.
class_label = 0
for path, label in zip(files, km.labels_):
    if label != class_label:
        continue
    # load numpy array from disk
    d = np.load(path)
    pl.imshow(d, aspect='auto')
    # Rows of the array are the time dimension, columns the frequency one.
    pl.xlabel("frequency index")
    pl.ylabel("time index")
    # Title with the source path so each image can be traced to its file.
    pl.title("class %d: %s" % (class_label, path))
    pl.show()
In [41]:
# CLASS 1
# `files` and `km.labels_` are index-aligned (row k of the feature matrix
# was computed from files[k]), so they can be walked together without a
# hand-maintained counter.
class_label = 1
for path, label in zip(files, km.labels_):
    if label != class_label:
        continue
    # load numpy array from disk
    d = np.load(path)
    pl.imshow(d, aspect='auto')
    # Rows of the array are the time dimension, columns the frequency one.
    pl.xlabel("frequency index")
    pl.ylabel("time index")
    # Title with the source path so each image can be traced to its file.
    pl.title("class %d: %s" % (class_label, path))
    pl.show()
In [42]:
# CLASS 2
# `files` and `km.labels_` are index-aligned (row k of the feature matrix
# was computed from files[k]), so they can be walked together without a
# hand-maintained counter.
class_label = 2
for path, label in zip(files, km.labels_):
    if label != class_label:
        continue
    # load numpy array from disk
    d = np.load(path)
    pl.imshow(d, aspect='auto')
    # Rows of the array are the time dimension, columns the frequency one.
    pl.xlabel("frequency index")
    pl.ylabel("time index")
    # Title with the source path so each image can be traced to its file.
    pl.title("class %d: %s" % (class_label, path))
    pl.show()
In [43]:
# CLASS 3
# `files` and `km.labels_` are index-aligned (row k of the feature matrix
# was computed from files[k]), so they can be walked together without a
# hand-maintained counter.
class_label = 3
for path, label in zip(files, km.labels_):
    if label != class_label:
        continue
    # load numpy array from disk
    d = np.load(path)
    pl.imshow(d, aspect='auto')
    # Rows of the array are the time dimension, columns the frequency one.
    pl.xlabel("frequency index")
    pl.ylabel("time index")
    # Title with the source path so each image can be traced to its file.
    pl.title("class %d: %s" % (class_label, path))
    pl.show()
In [44]:
# CLASS 4
# `files` and `km.labels_` are index-aligned (row k of the feature matrix
# was computed from files[k]), so they can be walked together without a
# hand-maintained counter.
class_label = 4
for path, label in zip(files, km.labels_):
    if label != class_label:
        continue
    # load numpy array from disk
    d = np.load(path)
    pl.imshow(d, aspect='auto')
    # Rows of the array are the time dimension, columns the frequency one.
    pl.xlabel("frequency index")
    pl.ylabel("time index")
    # Title with the source path so each image can be traced to its file.
    pl.title("class %d: %s" % (class_label, path))
    pl.show()
In [ ]: