In [1]:
import os
import glob
import hdf5_getters
import pickle
import itertools
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import euclidean
from sklearn import datasets
import random
random.seed(3222)
np.random.seed(3222)
In [2]:
def get_data(basedir, function, upto=10000, ext='.h5'):
    """Walk the dataset directory and apply `function` to each HDF5 file, up to `upto` songs."""
    data = []
    count = 0
    for root, dirs, files in os.walk(basedir):
        files = glob.glob(os.path.join(root, '*' + ext))
        for f in files:
            if count == upto:
                return data
            if count % 100 == 0:
                print count,
            h5 = hdf5_getters.open_h5_file_read(f)
            data.append(function(h5))
            h5.close()
            count += 1
    return data
# save as pickle to save time in loading on later runs
# (only the first 200 songs of the subset are read here, despite the *_1000 naming)
segments_pitches_1000 = get_data('../MillionSongSubset/data', hdf5_getters.get_segments_pitches, upto=200)
pickle.dump(segments_pitches_1000, open("../MillionSongSubset/pitches1000", "wb"))
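On later runs the pickled segments can be reloaded instead of walking the HDF5 files again; a minimal sketch, assuming the dump above has already been written to the same path:
# reload the previously pickled pitch segments (sketch; assumes the dump above succeeded)
if os.path.exists("../MillionSongSubset/pitches1000"):
    segments_pitches_1000 = pickle.load(open("../MillionSongSubset/pitches1000", "rb"))
    print len(segments_pitches_1000), "songs loaded from pickle"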
In [3]:
# use the first 100 songs to create the vocabulary
vocab_size = 100
def build_set(data):
    """Stack the per-song segment matrices into one big (num_segments x 12) array."""
    num_of_items = 0
    for i in range(len(data)):
        num_of_items += data[i].shape[0]
    print num_of_items
    results = np.ndarray((num_of_items, data[0].shape[1]))
    count = 0
    for i in range(len(data)):
        for j in range(data[i].shape[0]):
            results[count] = data[i][j]
            count += 1
    return results
print "building vocab set"
vocab_pitches = build_set(segments_pitches_1000[:vocab_size])
print vocab_pitches.shape
In [4]:
def assign_points_to_clusters(centroids, points, k):
    # 1 list for each centroid (will contain indices of points)
    clusters = [[] for i in range(k)]
    for i in range(points.shape[0]):
        # find nearest centroid to this point
        best_centroid = 0
        best_distance = euclidean(centroids[best_centroid], points[i])
        for c in range(1, k):
            distance = euclidean(centroids[c], points[i])
            if distance < best_distance:
                best_distance = distance
                best_centroid = c
        clusters[best_centroid].append(i)
    return clusters
def update_centroids(centroids, clusters, points, k):
    for ci in range(k):
        if clusters[ci]:
            sum_points = np.zeros(points.shape[1])
            for point in clusters[ci]:
                sum_points += points[point]
            # move the centroid to the mean of its assigned points
            centroids[ci] = sum_points / len(clusters[ci])
def mykmeans(points, k, max_iter=50, centroids=None):
    n_samples, n_features = points.shape
    if centroids is None:
        # random initialisation: pick k distinct points as starting centroids
        centroids = [points[x] for x in random.sample(range(n_samples), k)]
    clusters = None
    for p in range(max_iter):
        print p,
        clusters = assign_points_to_clusters(centroids, points, k)
        update_centroids(centroids, clusters, points, k)
    return centroids, clusters
def mykmeansplusplus(points, k, max_iter=50):
    n_samples, n_features = points.shape
    # k-means++ seeding: first centroid is uniform, the rest are sampled with
    # probability proportional to the squared distance to the nearest chosen centroid
    centroids = [np.random.randint(0, n_samples)]
    for i in range(k - 1):
        weightProb = []
        leftPoints = [p for p in range(n_samples) if p not in centroids]
        for p in leftPoints:
            best_D = euclidean(points[p], points[centroids[0]])
            for c in range(1, len(centroids)):
                D = euclidean(points[p], points[centroids[c]])
                if D < best_D:
                    best_D = D
            weightProb.append(best_D ** 2)
        cum = sum(weightProb)
        weightProb = [w / cum for w in weightProb]
        centroids.append(np.random.choice(leftPoints, p=weightProb))
    centroids = [points[c] for c in centroids]
    return mykmeans(points, k, max_iter, centroids)
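As a rough sanity check (not part of the original pipeline), the hand-rolled k-means++ can be compared against sklearn.cluster.KMeans on a small synthetic dataset; exact assignments will differ because both initialisations are random, but the cluster sizes should be comparable. The make_blobs data and the toy parameters below are illustrative assumptions.
# sanity-check sketch: compare mykmeansplusplus with sklearn's KMeans on synthetic blobs
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

toy_points, _ = make_blobs(n_samples=300, centers=3, n_features=12, random_state=0)
my_centroids, my_clusters = mykmeansplusplus(toy_points, 3, max_iter=20)
sk_model = KMeans(n_clusters=3, n_init=10, random_state=0).fit(toy_points)
print
print "my cluster sizes:", sorted(len(c) for c in my_clusters)
print "sklearn cluster sizes:", sorted(np.bincount(sk_model.labels_))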
In [5]:
k = 4
def learnvocabulary(vocab_data, num_clusters, max_iter=50):
    return mykmeansplusplus(vocab_data, num_clusters, max_iter)
centroids, clusters = learnvocabulary(vocab_pitches, k, 10)
In [15]:
def getbof(centroids, signal):
    # bag-of-features term frequencies: count how many segments fall in each cluster
    bof_tf = np.zeros(len(centroids))
    predicted_clusters = assign_points_to_clusters(centroids, signal, len(centroids))
    for i in range(len(centroids)):
        bof_tf[i] = len(predicted_clusters[i])
    # normalise so the term frequencies sum to 1
    return bof_tf / sum(bof_tf)
print getbof(centroids, segments_pitches_1000[199])
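A sketch of the natural next step: apply getbof to every loaded song, giving one k-dimensional term-frequency row per song (assumes the 200 songs loaded above).
# sketch: bag-of-features matrix, one row per song
bof_matrix = np.array([getbof(centroids, song) for song in segments_pitches_1000])
print bof_matrix.shape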