In [2]:
%matplotlib inline
In [93]:
import os
import numpy as np
from sklearn.cluster import KMeans
import pandas as pd
from itertools import islice, product, combinations
In [127]:
def array_from_text(text):
    """Parse a dataset string of the form '<meta>$(x1,y1);(x2,y2);...'
    into an (n, 2) float array of points.

    Whitespace and newlines are ignored; the part before '$' is metadata
    and is discarded.
    """
    compact = text.replace(' ', '').replace('\n', '')
    meta, body = compact.split('$')
    coords = body.replace('(', '').replace(')', '').split(';')
    return np.array([[float(value) for value in coord.split(',')]
                     for coord in coords])
def euclidean(p1, p2):
    """Euclidean (L2) distance between two points.

    Generalized from the original hand-rolled 2-D formula to points of
    any (matching) dimension via the vector norm; identical results for
    the 2-D points used throughout this notebook.
    """
    return np.linalg.norm(np.asarray(p1, dtype=float) - np.asarray(p2, dtype=float))
def between_cluster_sum_of_squares(kmeans_result):
    """Sum of squared distances of each cluster center from the grand mean
    of the centers.

    `kmeans_result` only needs a `.cluster_centers_` attribute (array-like,
    shape (k, d)). Vectorized; the original looped and computed
    `sqrt(...) ** 2`, wasting work and adding roundoff for no reason.
    NOTE(review): this is unweighted by cluster sizes — the textbook BCSS
    weights each center by its cluster's population; confirm intent.
    """
    centers = np.asarray(kmeans_result.cluster_centers_, dtype=float)
    grand_mean = centers.mean(axis=0)
    return float(np.sum((centers - grand_mean) ** 2))
def single_cluster_sum_of_squares(points):
    """Within-cluster sum of squares: total squared distance of each point
    in one cluster from the cluster's centroid.

    Vectorized; the original looped in Python and computed
    `sqrt(...) ** 2`, wasting a square-root per point.
    """
    pts = np.asarray(points, dtype=float)
    centroid = pts.mean(axis=0)
    return float(np.sum((pts - centroid) ** 2))
def within_cluster_sum_of_squares(clusters):
    """Total within-cluster sum of squares across all clusters.

    `clusters` is an iterable of point collections; each contributes its
    own single-cluster sum of squares.
    """
    total = 0
    for cluster in clusters:
        total += single_cluster_sum_of_squares(cluster)
    return total
def find_elbow(series):
    """Return the index label of the sharpest bend ("elbow") in a curve.

    Implements the idea noted in the original stub: the elbow is the
    point where the slope changes most sharply, i.e. the point whose
    discrete second difference has the largest magnitude.

    Parameters:
        series: pd.Series of curve values (e.g. inertia per cluster count),
            indexed by the x-coordinate (e.g. k).

    Returns:
        The index label of the elbow point, or None when the series has
        fewer than 3 points (no second difference exists).
    """
    if len(series) < 3:
        return None
    curvature = np.abs(np.diff(np.asarray(series.values, dtype=float), n=2))
    # np.diff(..., n=2)[j] measures the bend AT position j + 1.
    return series.index[int(np.argmax(curvature)) + 1]
def n_grams(a, n):
    """Return an iterator over the successive overlapping n-grams of `a`.

    Each n-gram is an n-tuple of consecutive elements; zipping n shifted
    views of the sequence produces them lazily.
    """
    shifted_views = [islice(a, offset, None) for offset in range(n)]
    return zip(*shifted_views)
def avg_distance_from_center(points):
    """Mean Euclidean distance of a set of points from their centroid.

    Vectorized; the original looped in Python, divided by len(points)
    once per iteration, and was restricted to 2-D points. This version
    handles any dimension.
    """
    pts = np.asarray(points, dtype=float)
    centroid = pts.mean(axis=0)
    distances = np.sqrt(np.sum((pts - centroid) ** 2, axis=1))
    return float(distances.mean())
def avg_distance_from_other_clusters(points):
    """NOTE(review): appears half-written — see the inline notes below.

    Despite the name, this measures distance from each point to THIS
    cluster's own centroid (same as avg_distance_from_center, just with
    duplicate-skipping); nothing about other clusters is computed.
    Confirm the intended semantics before using.
    """
    centroid = np.mean(points, axis=0)
    result = 0
    for p, q in product(points, points):
        # NOTE(review): `q` is otherwise unused — the pairwise product
        # only serves this equality test, which looks like a leftover.
        # NOTE(review): if points are numpy arrays, `p == q` is an
        # elementwise comparison and raising it to a truth value raises
        # ValueError; `p is q` was probably intended. It also skips
        # distinct points that happen to be equal.
        if p == q:
            continue
        # NOTE(review): divides by len(points) although the loop visits
        # len(points)**2 - len(points) pairs — the normalization looks off.
        result += euclidean(p, centroid) / len(points)
    return result
In [146]:
# Open a handle for every file in datasets/ (closed in a later cell).
# NOTE(review): prefer reading each file's contents inside a `with` block
# here instead, so handles cannot leak if a later cell fails.
file_handles = [open('datasets/' + file) for file in os.listdir('datasets')]
In [184]:
# Close every dataset handle opened earlier. A plain for-loop, not a
# list comprehension built only for its side effects (and whose list of
# Nones was the cell's output).
for fh in file_handles:
    fh.close()
Out[184]:
In [183]:
# Parse the first dataset file into an (n, 2) point array.
# NOTE(review): the execution counts show this cell (In[183]) ran BEFORE the
# close-handles cell (In[184]) above it. On a fresh Restart & Run All the
# handle is already closed and .read() raises ValueError — reorder the cells.
arr = array_from_text(file_handles[0].read())
In [87]:
# Fit k-means for every candidate cluster count and collect diagnostics.
# The candidate range was previously repeated three times inline; hoist it
# once so the three series are guaranteed to agree.
# NOTE(review): KMeans is stochastic — pass random_state=<seed> for a
# reproducible sweep.
cluster_counts = range(2, len(arr))
kmeans_raw_objects = [KMeans(n_clusters=n) for n in cluster_counts]
kmeans_results_objects = [kmeans.fit(arr) for kmeans in kmeans_raw_objects]
# inertia_ is sklearn's within-cluster sum of squares.
within_series = pd.Series({n: result.inertia_ for n, result in zip(cluster_counts, kmeans_results_objects)})
between_series = pd.Series(
    {n: between_cluster_sum_of_squares(result) for n, result in zip(cluster_counts, kmeans_results_objects)})
avg_from_center_series = pd.Series(
    {n: avg_distance_from_center(result.cluster_centers_) for n, result in zip(cluster_counts, kmeans_results_objects)})
In [88]:
# Overlay the within- and between-cluster curves; the "elbow" in the
# within-cluster curve suggests a cluster count.
within_series.plot()
between_series.plot()
#(between_series - within_series).plot()
Out[88]:
In [92]:
# Inertia (within-cluster SS) of the second fit; the sweep starts at
# n_clusters=2, so index 1 corresponds to k=3.
kmeans_results_objects[1].inertia_
Out[92]:
In [124]:
# we want all possible 3-partitions of an input, where each partition has at least 1 element
# I think this can be done as a recursive function
def partitions(items):
    """Generate every set partition of `items` as a restricted growth string.

    Knuth, TAOCP Vol 4A, section 7.2.1.5, Algorithm H. Each yielded list
    `a` satisfies a[0] == 0 and a[i] <= 1 + max(a[:i]); a[i] is the index
    of the block that items[i] belongs to. The number of results is the
    Bell number B(len(items)).

    Fixes over the first draft:
    * `raise StopIteration` inside a generator has been a RuntimeError
      since PEP 479 (Python 3.7+); a plain `return` ends the generator.
    * Step H6 requires m = b[j] + [a[j] == b[j]], but the old conditional
      expression parsed as `(b[j] + 1) if ... else 0`, zeroing m and
      silently dropping partitions for n >= 4 (e.g. 0111 and 0112 were
      missing for n == 4).
    * Lengths 0 and 1, which the algorithm does not handle, are special-cased.
    """
    n = len(items)
    if n == 0:
        # No items: nothing to enumerate.
        return
    if n == 1:
        # A single item has exactly one partition.
        yield [0]
        return
    a = [0] * n
    b = [1] * n  # b[j] = 1 + max(a[:j]) for the positions the algorithm touches
    m = 1
    while True:
        # H2: "visit" = yield a copy so callers can't mutate our state.
        yield list(a)
        if a[n - 1] == m:
            # H4: find the largest j < n-1 with a[j] still below b[j].
            j = n - 2
            while a[j] == b[j]:
                j -= 1
            # H5: when only a[0] (always 0) remains, we are done.
            if j == 0:
                return
            a[j] += 1
            # H6: recompute m and reset everything to the right of j.
            m = b[j] + (1 if a[j] == b[j] else 0)
            j += 1
            while j < n - 1:
                a[j] = 0
                b[j] = m
                j += 1
            a[n - 1] = 0
        else:
            # H3: just bump the last coordinate.
            a[n - 1] += 1
# there's an efficient algorithm for doing unique and the partitions at once, but hell if I don't know it
three_class_partitions = filter(lambda urns: len(set(urns)) == 3, partitions(range(5)))
In [125]:
# NOTE(review): `filter` returns a one-shot iterator — this cell exhausts it,
# so re-running the cell (or reusing the variable later) yields an empty list.
list(three_class_partitions)
Out[125]:
In [176]:
def partition_to_points(partition, points):
    """Group `points` into bins according to the block labels in `partition`.

    `partition` is a restricted-growth string (list of block indices, one
    per point); the result is a list with one list of points per block.
    """
    bins = [[] for _ in set(partition)]
    for position, point in enumerate(points):
        bins[partition[position]].append(point)
    return bins
def deep_tuple(nested_list):
    """Recursively convert lists/ndarrays into nested tuples.

    Makes a nested clustering hashable so it can be used as a dict key;
    non-list, non-array leaves are returned unchanged.
    """
    if not isinstance(nested_list, (list, np.ndarray)):
        return nested_list
    return tuple(map(deep_tuple, nested_list))
def deep_array(nested_tuple):
    """Recursively convert nested tuples back into nested lists.

    Inverse of deep_tuple. (Despite the name it produces plain Python
    lists, not numpy arrays.) Non-tuple leaves pass through unchanged.
    """
    if not isinstance(nested_tuple, tuple):
        return nested_tuple
    return [deep_array(item) for item in nested_tuple]
def worst_clustering(points, intended_cluster_number):
    """Exhaustively find the clustering with the LARGEST within-cluster
    sum of squares.

    Brute-forces every set partition of `points` with exactly
    `intended_cluster_number` non-empty blocks, so this is only feasible
    for very small inputs (Bell-number growth).

    Returns a (clustering, score) pair, where the clustering is encoded
    as nested tuples (see deep_tuple; use deep_array to convert back).

    Improvement: the original sorted every scored clustering just to take
    the first element; max() finds the single extremum in one pass.
    """
    possible_partitions = filter(lambda urns: len(set(urns)) == intended_cluster_number,
                                 partitions(range(len(points))))
    # if quadruple nested lists bother you, go get a drink right now
    possible_clusterings = [partition_to_points(partition, points) for partition in possible_partitions]
    # Keying by deep_tuple also deduplicates identical clusterings.
    scores = {deep_tuple(clustering): within_cluster_sum_of_squares(clustering) for clustering in possible_clusterings}
    return max(scores.items(), key=lambda item: item[1])
In [185]:
# Brute-force the worst possible 3-clustering of the dataset.
# NOTE(review): Bell-number growth — only feasible for a very small `arr`.
w = worst_clustering(arr, 3)
In [186]:
# Display the (clustering, score) pair found above.
w
Out[186]:
In [187]:
# Convert the winning clustering back from nested tuples to nested lists
# for readability.
deep_array(w[0])
Out[187]:
In [169]:
# Scratch check of isinstance semantics with a type tuple; safe to delete
# from the final notebook.
isinstance(1, (np.ndarray, int))
Out[169]:
In [ ]: