In [2]:
%matplotlib inline

In [93]:
import os
import numpy as np
from sklearn.cluster import KMeans
import pandas as pd
from itertools import islice, product, combinations

In [127]:
def array_from_text(text):
    # input format: "<meta>$(x1,y1);(x2,y2);..." -- drop whitespace,
    # split off the metadata header, then parse the point pairs
    text_stripped = text.replace(' ', '').replace('\n', '')
    meta, content = text_stripped.split('$')
    text_points = content.replace(')', '').replace('(', '').split(';')
    pairs = []
    for text_point in text_points:
        pair = [float(x) for x in text_point.split(',')]
        pairs.append(pair)
    return np.array(pairs)

def euclidean(p1, p2):
    return np.sqrt((p1[0] - p2[0]) ** 2 + (p1[1] - p2[1]) ** 2)

def between_cluster_sum_of_squares(kmeans_result):
    # sum of squared distances from each cluster center to the grand mean
    # of the centers (unweighted by cluster size)
    total_error = 0
    total_mean = np.mean(kmeans_result.cluster_centers_, axis=0)
    for point in kmeans_result.cluster_centers_:
        total_error += euclidean(point, total_mean) ** 2
    return total_error

def single_cluster_sum_of_squares(points):
    error = 0
    mean = np.mean(points, axis=0)
    for point in points:
        error += euclidean(point, mean) ** 2
    return error

def within_cluster_sum_of_squares(clusters):
    return sum(single_cluster_sum_of_squares(clust) for clust in clusters)

def find_elbow(series):
    # idea: find the point with the sharpest change,
    # i.e. the largest central second difference of the series
    return (series.shift(-1) - 2 * series + series.shift(1)).idxmax()

def n_grams(a, n):
    # sliding windows of length n, e.g. n_grams([1, 2, 3], 2) -> (1, 2), (2, 3)
    z = (islice(a, i, None) for i in range(n))
    return zip(*z)

def avg_distance_from_center(points):
    centroid = np.mean(points, axis=0)
    result = 0
    for p in points:
        result += euclidean(p, centroid) / len(points)
    return result

def avg_distance_from_other_clusters(points):
    # mean distance between distinct points in the cluster; pairs are formed
    # by index because comparing numpy rows with == is ambiguous
    result = 0
    pair_count = 0
    for i, j in combinations(range(len(points)), 2):
        result += euclidean(points[i], points[j])
        pair_count += 1
    return result / pair_count
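
A quick hand-checkable exercise of the helpers above (a sketch added in editing, not part of the original run; the expected values in the comments are computed by hand):

In [ ]:
square = np.array([[0.0, 0.0], [2.0, 0.0], [0.0, 2.0], [2.0, 2.0]])
single_cluster_sum_of_squares(square)                     # centroid (1, 1), so 4 * 2 = 8.0
within_cluster_sum_of_squares([square[:2], square[2:]])   # 2.0 + 2.0 = 4.0
list(n_grams([1, 2, 3, 4], 2))                            # [(1, 2), (2, 3), (3, 4)]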

In [146]:
file_handles = [open(os.path.join('datasets', fname)) for fname in os.listdir('datasets')]

In [183]:
arr = array_from_text(file_handles[0].read())

In [184]:
for fh in file_handles:
    fh.close()
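
The open/close bookkeeping above is easy to get wrong (the handles stay open across several cells); a with-based sketch that reads every dataset up front, assuming the same datasets directory layout:

In [ ]:
# alternative sketch: context managers close each file as soon as it is read
arrays = []
for name in os.listdir('datasets'):
    with open(os.path.join('datasets', name)) as fh:
        arrays.append(array_from_text(fh.read()))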

In [87]:
ns = range(2, len(arr))
kmeans_results_objects = [KMeans(n_clusters=n).fit(arr) for n in ns]
within_series = pd.Series({n: result.inertia_ for n, result in zip(ns, kmeans_results_objects)})
between_series = pd.Series(
    {n: between_cluster_sum_of_squares(result) for n, result in zip(ns, kmeans_results_objects)})
avg_from_center_series = pd.Series(
    {n: avg_distance_from_center(result.cluster_centers_) for n, result in zip(ns, kmeans_results_objects)})

In [88]:
within_series.plot()
between_series.plot()
#(between_series - within_series).plot()


Out[88]:
<matplotlib.axes._subplots.AxesSubplot at 0xb80dc18>
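
The elbow is read off the plot by eye above; find_elbow makes the same call mechanically. A sketch on made-up numbers (the toy values are for illustration only):

In [ ]:
# a curve that levels off after n = 4; the central second difference peaks there
toy = pd.Series({2: 100.0, 3: 50.0, 4: 20.0, 5: 18.0, 6: 17.0})
find_elbow(toy)  # -> 4; find_elbow(within_series) flags the bend the same way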

In [92]:
kmeans_results_objects[1].inertia_


Out[92]:
4.4999999999999929

In [124]:
# we want all possible 3-partitions of an input, where each partition has at least 1 element
# I think this can be done as a recursive function
def partitions(items):
    # Using TAOCP 7.2.1 page 26. Yes, I'm a real computer scientist now.
    """
    ## H1
    n = len(items)
    a_s = [0] * n
    b_s = [1] * n
    m = 1  # b_s[n]
    
    ## H2 - by "visit" I think they imply that we should yield.
    if a_s[n - 1] == m:
        # goto H4
    
    ## H3
    a_s[n - 1] += 1
    # goto H2
    
    ## H4
    j = n - 2
    while a_s[j] == b_s[j]:
        j -= 1
        
    ## H5
    if j == 0:
        return  # terminate
    else:
        a_s[j] += 1
        
    ## H6
    m = b_s[j] + (1 if a_s[j] == b_s[j] else 0)
    j += 1
    while j < n - 1:
        a_s[j] = 0
        b_s[j] = m
        j += 1
    a_s[n - 1] = 0
    # goto H2
    """
    n = len(items)
    a = [0] * n
    b = [1] * n
    m = 1  # b_s[n]
    
    while True:
        yield list(a)
        if a[n - 1] == m:
            j = n - 2
            while a[j] == b[j]:
                j -= 1
            if j == 0:
                return  # done (PEP 479: a generator must return, not raise StopIteration)
            else:
                a[j] += 1
                m = b[j] + (1 if a[j] == b[j] else 0)  # H6: m <- b[j] + [a[j] = b[j]]
                j += 1
                while j < n - 1:
                    a[j] = 0
                    b[j] = m
                    j += 1
                a[n - 1] = 0
        else:
            a[n - 1] += 1
        
# there's an efficient algorithm for doing unique and the partitions at once, but hell if I know it
three_class_partitions = filter(lambda urns: len(set(urns)) == 3, partitions(range(5)))
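
A cheap correctness check on the generator (a sketch, not executed in the original run): it should yield every restricted growth string exactly once, i.e. the Bell number of them, and B(4) = 15.

In [ ]:
# the generator should produce Bell(4) = 15 set partitions for n = 4
len(list(partitions(range(4))))  # -> 15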

In [125]:
list(three_class_partitions)


Out[125]:
[[0, 0, 0, 1, 2],
 [0, 0, 1, 0, 2],
 [0, 0, 1, 1, 2],
 [0, 0, 1, 2, 0],
 [0, 0, 1, 2, 1],
 [0, 0, 1, 2, 2],
 [0, 1, 0, 0, 2],
 [0, 1, 0, 1, 2],
 [0, 1, 0, 2, 0],
 [0, 1, 0, 2, 1],
 [0, 1, 0, 2, 2],
 [0, 1, 1, 0, 2],
 [0, 1, 1, 1, 2],
 [0, 1, 1, 2, 0],
 [0, 1, 1, 2, 1],
 [0, 1, 1, 2, 2],
 [0, 1, 2, 0, 0],
 [0, 1, 2, 0, 1],
 [0, 1, 2, 0, 2],
 [0, 1, 2, 1, 0],
 [0, 1, 2, 1, 1],
 [0, 1, 2, 1, 2],
 [0, 1, 2, 2, 0],
 [0, 1, 2, 2, 1],
 [0, 1, 2, 2, 2]]

In [176]:
def partition_to_points(partition, points):
    # expand a restricted growth string into the actual groups of points it encodes
    bins = [list() for _ in range(len(set(partition)))]
    for ix, point in enumerate(points):
        bins[partition[ix]].append(point)
    return bins

def deep_tuple(nested_list):
    # recursively freeze nested lists/arrays into tuples so they can key a dict
    if isinstance(nested_list, (list, np.ndarray)):
        return tuple(deep_tuple(element) for element in nested_list)
    else:
        return nested_list
    
def deep_array(nested_tuple):
    # inverse of deep_tuple: thaw nested tuples back into nested lists
    if isinstance(nested_tuple, tuple):
        return [deep_array(element) for element in nested_tuple]
    else:
        return nested_tuple

def worst_clustering(points, intended_cluster_number):
    possible_partitions = filter(lambda urns: len(set(urns)) == intended_cluster_number, 
                                 partitions(range(len(points))))
    # if quadruple nested lists bother you, go get a drink right now
    possible_clusterings = [partition_to_points(partition, points) for partition in possible_partitions]
    scores = {deep_tuple(clustering): within_cluster_sum_of_squares(clustering) for clustering in possible_clusterings}
    # the "worst" clustering is the one that maximizes the within-cluster sum of squares
    return max(scores.items(), key=lambda t: t[1])
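
Before brute-forcing it helps to know the size of the search space: the number of partitions of n items into exactly 3 non-empty blocks is the Stirling number S(n, 3) = (3**n - 3 * 2**n + 3) / 6. A sketch of the count worst_clustering scans for 10 points:

In [ ]:
# S(10, 3): candidate 3-clusterings of 10 points -- fine here, hopeless for large n
(3 ** 10 - 3 * 2 ** 10 + 3) // 6  # -> 9330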

In [185]:
w = worst_clustering(arr, 3)

In [186]:
w


Out[186]:
((((1.0, 1.0),
   (2.0, 2.0),
   (3.0, 4.0),
   (-1.0, -4.0),
   (-2.0, -3.0),
   (-3.0, 0.0),
   (-2.0, 1.0),
   (-2.5, -1.0)),
  ((0.0, -2.0),),
  ((-0.5, -0.5),)),
 83.71875)

In [187]:
deep_array(w[0])


Out[187]:
[[[1.0, 1.0],
  [2.0, 2.0],
  [3.0, 4.0],
  [-1.0, -4.0],
  [-2.0, -3.0],
  [-3.0, 0.0],
  [-2.0, 1.0],
  [-2.5, -1.0]],
 [[0.0, -2.0]],
 [[-0.5, -0.5]]]

In [169]:
isinstance(1, (np.ndarray, int))


Out[169]:
True
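
One more consistency check worth having (a sketch; kmeans_results_objects[1] is the 3-cluster fit from In [87]): regrouping the points by the fitted labels and scoring them with within_cluster_sum_of_squares should reproduce sklearn's inertia_, up to floating-point noise, once the fit has converged.

In [ ]:
km = kmeans_results_objects[1]
clusters = partition_to_points(km.labels_, arr)
abs(within_cluster_sum_of_squares(clusters) - km.inertia_)  # expected ~ 0.0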

In [ ]: