In [11]:
#use this notebook to determine the optimal number of K-means groups for the dataset
#there are three quality metrics to choose from: Davies-Bouldin index, quantization error, and silhouette index
#KL 12 April 2016

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
#load the helper classes used below (Data, Clustering, ClusteringQuality) to figure out the optimal number of K-means groups
filename = 'nClustersRequired.py'
%run $filename

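In [ ]:
#quick sanity check (a sketch): the cells below assume that running
#nClustersRequired.py defined the Data, Clustering, and ClusteringQuality classes
for name in ['Data', 'Clustering', 'ClusteringQuality']:
    assert name in globals(), name + ' was not defined by ' + filename
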
In [ ]:
##the data file read into the next cell was created as follows:
##(in the main IPython notebook)
#dataFilename = 'NB_combined_for_kmeans.csv'
#Combined_KO_CO_final.to_csv(dataFilename)

In [7]:
dataFilename = 'NB_combined_for_kmeans.csv'
data = Data(dataFilename)
#pattern_labels are the rows...for us this will be the RInumber
pattern_labels = []
patterns_data, pattern_labels = data.load_data()

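In [ ]:
#quick look at what was loaded (a sketch, assuming load_data returns one pattern
#per row of the csv plus the matching RInumber labels)
print(len(patterns_data), 'patterns and', len(pattern_labels), 'labels loaded')
print('first label:', pattern_labels[0])
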
In [8]:
def forest_run(dimensions, patterns, pattern_labels, metric='qe', fNamePrefix='', k_up=20, k_down=2, simulations=55, iterations=50):
    """
    A method for watching Forrest Gump run
    :param dimensions: the dimensionality of the data
    :param patterns: the data itself
    :param pattern_labels: row labels for the patterns (not used inside this function)
    :param metric: the quality metric ('qe', 'si', or 'db')
    :param fNamePrefix: optional text to prepend to the results file written at the end
    :param k_up: the maximum number of clusters (exclusive)
    :param k_down: the minimum number of clusters
    :param simulations: the number of simulations for each k
    :param iterations: the number of iterations for each k-means pass
    """
    # variable to store the best result
    best_clustering = None
    # the quality of that result ('si' is maximized; 'qe' and 'db' are minimized)
    best_quality = float('-inf') if metric == 'si' else float('inf')
    # write results out to file while simulating
    file_out = fNamePrefix + 'MonteCarloFinalResults' + '_' + metric + '.csv'
    #with open(file_out, 'w', newline='') as f: #newline doesn't work here
    with open(file_out,'w') as f:
        # different k values to test on
        for i in range(k_down, k_up):
            num_clusters = i
            print('working on {} k-means groups'.format(i))

            # number of retries / simulations
            for j in range(simulations):
                # create a clustering solution and apply k-means
                clustering = Clustering(dimensions, num_clusters, patterns, 0.0001)
                clustering.k_means_clustering(iterations)
                # used to compute quality of the solution
                quality = ClusteringQuality(clustering, 0.0001)
                this_quality = 0.0
                if metric == 'qe':
                    this_quality = quality.quantization_error()
                elif metric == 'si':
                    this_quality = quality.average_silhouette_index()
                elif metric == 'db':
                    this_quality = quality.davies_bouldin()
                # update the best clustering (larger is better for 'si', smaller for 'qe' and 'db')
                if metric == 'si':
                    is_better = this_quality > best_quality
                else:
                    is_better = this_quality < best_quality
                if is_better:
                    best_quality = this_quality
                    best_clustering = clustering
                    #print("Updated best clustering") #comment out, clogging up display
                # write result to the file
                result = [num_clusters, this_quality]
                for x in result:
                    f.write(str(x))
                    f.write(",")
                f.write("\n")
                f.flush()

In [9]:
dimensions = 5 #this is a function of the data itself. In the NB data we have five sampling days.
setSimulations = 100
setIterations = 100 #this is the default from the Turing Finance code
setKup = 20
setKdown = 2

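In [ ]:
#echo the settings so the run is documented in the notebook output
#(note that k_up is exclusive in forest_run, so the largest k tested is setKup - 1)
print('testing k from', setKdown, 'to', setKup - 1, 'with',
      setSimulations, 'simulations of', setIterations, 'iterations each')
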
In [17]:
#first method to try is Davies-Bouldin index
prefix = 'newKO_norm2mean_'

#set the next line to True to actually run the simulations (the results file is read in below)
if False:   
    forest_run(dimensions, patterns_data, pattern_labels, metric='db', fNamePrefix = prefix,
           simulations=setSimulations, k_down=setKdown, k_up=setKup, iterations = setIterations)

#read in the results
riScores_db=pd.read_csv((prefix + 'MonteCarloFinalResults_db.csv'),header=None,delimiter=',',
                index_col=False,names=['nClusters', 'score'])

#optimal cluster solution has the smallest Davies-Bouldin index
grouped = riScores_db.groupby('nClusters')
means = grouped.mean().unstack()
errors = grouped.std().unstack()
fig, ax = plt.subplots()
plt.plot(range(setKdown,setKup),means,marker = 'o',color = '#1b9e77')
plt.errorbar(range(setKdown,setKup),means,errors,marker = 'o',color = '#1b9e77')
plt.title('Kmeans, Davies-Bouldin')
ax.set_xlabel('nClusters')
plt.show() 
fig.savefig('DaviesBouldin_newKO.pdf')

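In [ ]:
#report the k with the smallest mean Davies-Bouldin score across simulations
#(a sketch built on riScores_db from the cell above; smaller is better for this index)
db_means = riScores_db.groupby('nClusters')['score'].mean()
print('best k by Davies-Bouldin:', db_means.idxmin(), 'with mean score', db_means.min())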


In [ ]:
#next method is quantization error 
if False:
    prefix = 'newKO_norm2mean_'

    if False:
        forest_run(dimensions, patterns_data, pattern_labels, metric='qe', fNamePrefix = prefix,
               simulations=setSimulations, k_down=setKdown, k_up=setKup, iterations = setIterations)

    #now read in the results
    riScores_qe=pd.read_csv((prefix + 'MonteCarloFinalResults_qe.csv'),header=None,delimiter=',',
                    index_col=False,names=['nClusters', 'score'])

    grouped = riScores_qe.groupby('nClusters')
    means = grouped.mean().unstack()
    errors = grouped.std().unstack()
    fig, ax = plt.subplots()
    plt.plot(range(setKdown,setKup),means,marker = 'o',color = '#1b9e77')
    plt.errorbar(range(setKdown,setKup),means,errors,marker = 'o',color = '#1b9e77')
    plt.title('Kmeans, Quantization error')
    ax.set_xlabel('nClusters')
    plt.show()

    #goal is to minimize quantization error. QE is the distance between a sample
    #and its representation; a lower quantization error indicates a better clustering.

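In [ ]:
#toy illustration of quantization error (a sketch with made-up numbers, not the NB data):
#the idea is the distance between each sample and the centroid that represents it;
#the exact formula in ClusteringQuality.quantization_error may differ in detail
import numpy as np
toy_cluster = np.array([[1.0, 2.0], [1.2, 1.8], [0.8, 2.2]])
toy_centroid = toy_cluster.mean(axis=0)
toy_qe = np.mean(np.linalg.norm(toy_cluster - toy_centroid, axis=1))
print('centroid:', toy_centroid, '| quantization error:', toy_qe)
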
In [ ]:
#final method is the silhouette index
if False: ##silhouette is really slow compared to the other two metrics
    prefix = 'newKO_norm2mean_'

    ##silhouette is quite slow compared to the other two metrics
    if False:
        forest_run(dimensions, patterns_data, pattern_labels, metric='si', fNamePrefix = prefix,
               simulations=setSimulations, k_down=setKdown, k_up=setKup, iterations = setIterations)

    riScores_si=pd.read_csv((prefix + 'MonteCarloFinalResults_si.csv'),header=None,delimiter=',',
                    index_col=False,names=['nClusters', 'score'])


    ##note, want to maximize the silhouette value for each pattern in the dataset
    grouped = riScores_si.groupby('nClusters')
    means = grouped.mean().unstack()
    errors = grouped.std().unstack()
    fig, ax = plt.subplots()
    plt.plot(range(setKdown,setKup),means,marker = 'o',color = '#1b9e77')
    plt.errorbar(range(setKdown,setKup),means,errors,marker = 'o',color = '#1b9e77')
    plt.title('Kmeans, silhouette index')
    ax.set_xlabel('nClusters')
    plt.show()

    #remember, want to maximize this value
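
In [ ]:
#toy illustration of the silhouette index (a sketch, assuming scikit-learn is
#installed; the notebook itself uses ClusteringQuality.average_silhouette_index):
#a split that respects the obvious grouping scores near +1, a bad split goes negative
import numpy as np
from sklearn.metrics import silhouette_score
toy = np.array([[0.0, 0.1], [0.1, 0.0], [5.0, 5.1], [5.1, 5.0]])
print('good split:', silhouette_score(toy, [0, 0, 1, 1]))  #close to +1
print('bad split: ', silhouette_score(toy, [0, 1, 0, 1]))  #negative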