In [14]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

import matplotlib.pyplot as plt
import matplotlib


#
# Parameters to play around with:
PLOT_TYPE_TEXT = False    # Set True to label samples with their row indices
PLOT_VECTORS = True       # Set True to project your original features into P.C.-space


matplotlib.style.use('ggplot') # Look Pretty
c = ['red', 'green', 'blue', 'orange', 'yellow', 'brown']

def drawVectors(transformed_features, components_, columns, plt):
  # This function projects your *original* features (columns)
  # onto your principal component feature-space, so that you can
  # visualize how "important" each one was in the projection
  num_columns = len(columns)

  # Scale each principal component by the max value in
  # the transformed set belonging to that component
  xvector = components_[0] * max(transformed_features[:,0])
  yvector = components_[1] * max(transformed_features[:,1])

  ## Visualize projections

  # Rank each *original* column (not the principal components)
  # by the length of its projected vector
  important_features = { columns[i] : np.sqrt(xvector[i]**2 + yvector[i]**2) for i in range(num_columns) }
  important_features = sorted(zip(important_features.values(), important_features.keys()), reverse=True)
  print("Projected Features by importance:\n", important_features)

  ax = plt.axes()

  for i in range(num_columns):
    # Use an arrow to project each original feature as a
    # labeled vector on your principal component axes
    plt.arrow(0, 0, xvector[i], yvector[i], color='b', width=0.0005, head_width=0.02, alpha=0.75, zorder=600000)
    plt.text(xvector[i]*1.2, yvector[i]*1.2, list(columns)[i], color='b', alpha=0.75, zorder=600000)
  return ax
    

def doPCA(data, dimensions=2):
  from sklearn.decomposition import PCA
  model = PCA(n_components=dimensions, svd_solver='randomized', random_state=7)
  model.fit(data)
  return model


def doKMeans(data, clusters=0):
  # Run K-Means with the requested number of clusters, fit it
  # against the data, then return a tuple containing the cluster
  # centers and the labels.
  #
  # NOTE: the default clusters=0 is only a placeholder; callers
  # must pass a positive cluster count, or k-means++ init will
  # compute int(np.log(0)) and overflow (see the traceback below).
  from sklearn.cluster import KMeans
  model = KMeans(n_clusters=clusters)
  model.fit(data)
  return model.cluster_centers_, model.labels_


#

In [5]:
# Load up the dataset. It may or may not have NaNs in it. Make
# sure you catch them and destroy them by setting them to 0. This is
# valid for this dataset, since if a value is missing, you can assume
# no $ was spent on it.
#
df = pd.read_csv('Datasets/Wholesale customers data.csv')
df = df.fillna(0)
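
As a quick sanity check (a sketch, not part of the original lab), confirm the NaNs really are gone before modeling. And since the spending columns sit on very different scales, the `preprocessing` module imported above leaves the option of standardizing first; whether scaling helps here is one of the things to experiment with:

# Verify that no NaNs survived the cleanup
assert df.isnull().sum().sum() == 0

# Optional (a hypothetical extra step, not required by the lab):
# standardize the features before PCA / K-Means
# df = pd.DataFrame(preprocessing.StandardScaler().fit_transform(df),
#                   columns=df.columns)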

In [9]:
modelPCA = doPCA(df)
modelPCA


Out[9]:
PCA(copy=True, iterated_power='auto', n_components=2, random_state=7,
  svd_solver='randomized', tol=0.0, whiten=False)
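
With the model fitted, here is a minimal sketch (using only names defined above) of how you might project the data into P.C.-space and, since PLOT_VECTORS is True, overlay the original feature axes with drawVectors:

T = modelPCA.transform(df)  # project samples onto the two principal components
print("Explained variance ratio:", modelPCA.explained_variance_ratio_)

if PLOT_VECTORS:
  drawVectors(T, modelPCA.components_, df.columns, plt)

plt.scatter(T[:, 0], T[:, 1], marker='o', alpha=0.75)
plt.show()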

In [15]:
modelkmean = doKMeans(df)
modelkmean


---------------------------------------------------------------------------
OverflowError                             Traceback (most recent call last)
<ipython-input-15-03c9259e3c72> in <module>()
----> 1 modelkmean = doKMeans(df)
      2 modelkmean

<ipython-input-14-ee6747502054> in doKMeans(data, clusters)
     63     from sklearn.cluster import KMeans
     64     model = KMeans(clusters)
---> 65     model.fit(data)
     66 
     67   # Hint: Just like with doPCA above, you will have to create a variable called

//anaconda/lib/python3.5/site-packages/sklearn/cluster/k_means_.py in fit(self, X, y)
    887                 tol=self.tol, random_state=random_state, copy_x=self.copy_x,
    888                 n_jobs=self.n_jobs, algorithm=self.algorithm,
--> 889                 return_n_iter=True)
    890         return self
    891 

//anaconda/lib/python3.5/site-packages/sklearn/cluster/k_means_.py in k_means(X, n_clusters, init, precompute_distances, n_init, max_iter, verbose, tol, random_state, copy_x, n_jobs, algorithm, return_n_iter)
    343                 X, n_clusters, max_iter=max_iter, init=init, verbose=verbose,
    344                 precompute_distances=precompute_distances, tol=tol,
--> 345                 x_squared_norms=x_squared_norms, random_state=random_state)
    346             # determine if these results are the best so far
    347             if best_inertia is None or inertia < best_inertia:

//anaconda/lib/python3.5/site-packages/sklearn/cluster/k_means_.py in _kmeans_single_elkan(X, n_clusters, max_iter, init, verbose, x_squared_norms, random_state, tol, precompute_distances)
    392     # init
    393     centers = _init_centroids(X, n_clusters, init, random_state=random_state,
--> 394                               x_squared_norms=x_squared_norms)
    395     centers = np.ascontiguousarray(centers)
    396     if verbose:

//anaconda/lib/python3.5/site-packages/sklearn/cluster/k_means_.py in _init_centroids(X, k, init, random_state, x_squared_norms, init_size)
    679     if isinstance(init, string_types) and init == 'k-means++':
    680         centers = _k_init(X, k, random_state=random_state,
--> 681                           x_squared_norms=x_squared_norms)
    682     elif isinstance(init, string_types) and init == 'random':
    683         seeds = random_state.permutation(n_samples)[:k]

//anaconda/lib/python3.5/site-packages/sklearn/cluster/k_means_.py in _k_init(X, n_clusters, x_squared_norms, random_state, n_local_trials)
     88         # specific results for other than mentioning in the conclusion
     89         # that it helped.
---> 90         n_local_trials = 2 + int(np.log(n_clusters))
     91 
     92     # Pick first center randomly

OverflowError: cannot convert float infinity to integer
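
The OverflowError comes from the clusters=0 default: doKMeans(df) hands K-Means zero clusters, so the k-means++ initializer evaluates 2 + int(np.log(0)), and int(-inf) cannot be converted to an integer. The fix is simply to pass a positive cluster count; how many clusters this dataset actually warrants is something to experiment with (3 below is just an illustrative choice):

centroids, labels = doKMeans(df, 3)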

In [ ]: