In [1]:
import math

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import missingno
import seaborn as sns

%matplotlib inline

In [2]:
# Load the wholesale customers dataset and drop the two categorical columns
data = pd.read_csv("dataset/Wholesale customers data.csv")
data.drop(['Region', 'Channel'], axis=1, inplace=True)

In [3]:
data.describe()


Out[3]:
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
count 440.000000 440.000000 440.000000 440.000000 440.000000 440.000000
mean 12000.297727 5796.265909 7951.277273 3071.931818 2881.493182 1524.870455
std 12647.328865 7380.377175 9503.162829 4854.673333 4767.854448 2820.105937
min 3.000000 55.000000 3.000000 25.000000 3.000000 3.000000
25% 3127.750000 1533.000000 2153.000000 742.250000 256.750000 408.250000
50% 8504.000000 3627.000000 4755.500000 1526.000000 816.500000 965.500000
75% 16933.750000 7190.250000 10655.750000 3554.250000 3922.000000 1820.250000
max 112151.000000 73498.000000 92780.000000 60869.000000 40827.000000 47943.000000
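
All six columns report a count of 440, so there are no missing values. As a quick sanity check, the missingno library imported above can confirm this visually; a minimal sketch:

In [ ]:
# Visual check for missing data: a fully filled matrix means no gaps
missingno.matrix(data)

# Numeric check: number of missing values per column
data.isnull().sum()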

In [4]:
# Quick look at the data
data.head()


Out[4]:
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
0 12669 9656 7561 214 2674 1338
1 7057 9810 9568 1762 3293 1776
2 6353 8808 7684 2405 3516 7844
3 13265 1196 4221 6404 507 1788
4 22615 5410 7198 3915 1777 5185

In [5]:
# Plot of the distribution of each feature
def plot_distribution(dataset, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure(figsize=(width,height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=wspace, hspace=hspace)
    rows = math.ceil(float(dataset.shape[1]) / cols)
    for i, column in enumerate(dataset.columns):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        if dataset.dtypes[column] == object:
            g = sns.countplot(y=column, data=dataset)
            substrings = [s.get_text()[:18] for s in g.get_yticklabels()]
            g.set(yticklabels=substrings)
            plt.xticks(rotation=25)
        else:
            g = sns.distplot(dataset[column])
            plt.xticks(rotation=25)
    
plot_distribution(data, cols=3, width=20, height=20, hspace=0.45, wspace=0.5)



In [7]:
# Relevance of a feature

# Do we need a particular feature? We can make this determination quite easily
# by training a supervised regression learner on a subset of the data with
# one feature removed, and then scoring how well that model can predict the removed feature.

# The coefficient of determination, R^2, ranges up to 1, with 1 being a perfect fit.
# A negative R^2 implies the model fails to fit the data.

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Make a copy of the DataFrame, using the 'drop' function to drop the given feature
new_data = data.copy().drop('Detergents_Paper', axis=1)

# Split the data into training and testing sets using the given feature as the target
X_train, X_test, y_train, y_test = train_test_split(new_data, data['Detergents_Paper'], test_size=0.25, random_state=0)

# Create a decision tree regressor and fit it to the training set
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

# Calculate the score of the prediction using the testing set
score = regressor.score(X_test, y_test)
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))


Model has a coefficient of determination, R^2, of 0.729.
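
The same check can be repeated for every feature to see which ones are most predictable from the rest (and therefore least informative on their own); a short sketch reusing the estimator and split above:

In [ ]:
# Repeat the relevance check for each feature in turn
for feature in data.columns:
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(feature, axis=1), data[feature], test_size=0.25, random_state=0)
    regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
    print("{:<17} R^2 = {:.3f}".format(feature, regressor.score(X_test, y_test)))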

In [10]:
# Produce a scatter matrix for each pair of features in the data
pd.plotting.scatter_matrix(data, alpha=0.3, figsize=(14,8), diagonal='kde');



In [9]:
# Scale the data using the natural logarithm
log_data = np.log(data.copy())

# Produce a scatter matrix for each pair of newly-transformed features
pd.plotting.scatter_matrix(log_data, alpha=0.3, figsize=(14,8), diagonal='kde');



Detergents_Paper appears somewhat correlated with Milk and strongly correlated with Grocery. This supports the earlier R^2 result: the feature can largely be predicted from the others, so it adds little independent information for identifying customer segments. All features show heavily right-skewed (positively skewed) distributions.
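
The visual impression can be quantified with a correlation matrix; a quick numeric check on the log-transformed features:

In [ ]:
# Pairwise Pearson correlations of the log-transformed features
log_data.corr().round(2)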


In [12]:
# For each feature find the data points with extreme high or low values
for feature in log_data.keys():

    # Calculate Q1 (25th percentile of the data) for the given feature
    Q1 = np.percentile(log_data[feature], 25)

    # Calculate Q3 (75th percentile of the data) for the given feature
    Q3 = np.percentile(log_data[feature], 75)

    # Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
    step = 1.5 * (Q3 - Q1)

    # Display the outliers
    print("Data points considered outliers for the feature '{}':".format(feature))
    display(log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))])


Data points considered outliers for the feature 'Fresh':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
65 4.442651 9.950323 10.732651 3.583519 10.095388 7.260523
66 2.197225 7.335634 8.911530 5.164786 8.151333 3.295837
81 5.389072 9.163249 9.575192 5.645447 8.964184 5.049856
95 1.098612 7.979339 8.740657 6.086775 5.407172 6.563856
96 3.135494 7.869402 9.001839 4.976734 8.262043 5.379897
128 4.941642 9.087834 8.248791 4.955827 6.967909 1.098612
171 5.298317 10.160530 9.894245 6.478510 9.079434 8.740337
193 5.192957 8.156223 9.917982 6.865891 8.633731 6.501290
218 2.890372 8.923191 9.629380 7.158514 8.475746 8.759669
304 5.081404 8.917311 10.117510 6.424869 9.374413 7.787382
305 5.493061 9.468001 9.088399 6.683361 8.271037 5.351858
338 1.098612 5.808142 8.856661 9.655090 2.708050 6.309918
353 4.762174 8.742574 9.961898 5.429346 9.069007 7.013016
355 5.247024 6.588926 7.606885 5.501258 5.214936 4.844187
357 3.610918 7.150701 10.011086 4.919981 8.816853 4.700480
412 4.574711 8.190077 9.425452 4.584967 7.996317 4.127134
Data points considered outliers for the feature 'Milk':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
86 10.039983 11.205013 10.377047 6.894670 9.906981 6.805723
98 6.220590 4.718499 6.656727 6.796824 4.025352 4.882802
154 6.432940 4.007333 4.919981 4.317488 1.945910 2.079442
356 10.029503 4.897840 5.384495 8.057377 2.197225 6.306275
Data points considered outliers for the feature 'Grocery':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
75 9.923192 7.036148 1.098612 8.390949 1.098612 6.882437
154 6.432940 4.007333 4.919981 4.317488 1.945910 2.079442
Data points considered outliers for the feature 'Frozen':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
38 8.431853 9.663261 9.723703 3.496508 8.847360 6.070738
57 8.597297 9.203618 9.257892 3.637586 8.932213 7.156177
65 4.442651 9.950323 10.732651 3.583519 10.095388 7.260523
145 10.000569 9.034080 10.457143 3.737670 9.440738 8.396155
175 7.759187 8.967632 9.382106 3.951244 8.341887 7.436617
264 6.978214 9.177714 9.645041 4.110874 8.696176 7.142827
325 10.395650 9.728181 9.519735 11.016479 7.148346 8.632128
420 8.402007 8.569026 9.490015 3.218876 8.827321 7.239215
429 9.060331 7.467371 8.183118 3.850148 4.430817 7.824446
439 7.932721 7.437206 7.828038 4.174387 6.167516 3.951244
Data points considered outliers for the feature 'Detergents_Paper':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
75 9.923192 7.036148 1.098612 8.390949 1.098612 6.882437
161 9.428190 6.291569 5.645447 6.995766 1.098612 7.711101
Data points considered outliers for the feature 'Delicassen':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
66 2.197225 7.335634 8.911530 5.164786 8.151333 3.295837
109 7.248504 9.724899 10.274568 6.511745 6.728629 1.098612
128 4.941642 9.087834 8.248791 4.955827 6.967909 1.098612
137 8.034955 8.997147 9.021840 6.493754 6.580639 3.583519
142 10.519646 8.875147 9.018332 8.004700 2.995732 1.098612
154 6.432940 4.007333 4.919981 4.317488 1.945910 2.079442
183 10.514529 10.690808 9.911952 10.505999 5.476464 10.777768
184 5.789960 6.822197 8.457443 4.304065 5.811141 2.397895
187 7.798933 8.987447 9.192075 8.743372 8.148735 1.098612
203 6.368187 6.529419 7.703459 6.150603 6.860664 2.890372
233 6.871091 8.513988 8.106515 6.842683 6.013715 1.945910
285 10.602965 6.461468 8.188689 6.948897 6.077642 2.890372
289 10.663966 5.655992 6.154858 7.235619 3.465736 3.091042
343 7.431892 8.848509 10.177932 7.283448 9.646593 3.610918
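
The indices flagged for more than one feature can also be collected programmatically instead of being read off the tables above; a minimal sketch using the same IQR rule (Counter comes from the standard library):

In [ ]:
from collections import Counter

# Count how many features flag each data point as an outlier
outlier_counts = Counter()
for feature in log_data.keys():
    Q1, Q3 = np.percentile(log_data[feature], [25, 75])
    step = 1.5 * (Q3 - Q1)
    flagged = log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))]
    outlier_counts.update(flagged.index)

# Indices that are outliers in two or more features
sorted(idx for idx, count in outlier_counts.items() if count > 1)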

In [13]:
# Considering rows which are outliers in more than one feature
outliers = [65, 66, 75, 128, 154]

# Remove the outliers, if any were specified
good_data = log_data.drop(log_data.index[outliers]).reset_index(drop = True)

In [23]:
# Feature reduction
from sklearn.decomposition import PCA

# Apply PCA by fitting the good data with the same number of dimensions as features
pca = PCA(n_components=6)
pca.fit(good_data)

# Display the explained variance ratio of each component
print(pca.explained_variance_ratio_)


[ 0.44302505  0.26379218  0.1230638   0.10120908  0.04850196  0.02040793]
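
The cumulative sums make the choice of dimensionality easier to read; the first two components together explain roughly 71% of the variance:

In [ ]:
# Cumulative explained variance across the six principal components
print(np.cumsum(pca.explained_variance_ratio_))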

In [27]:
# For simplicity, we choose the first two components, which together explain roughly 0.71 of the variance
# Apply PCA by fitting the good data with only two dimensions

pca = PCA(n_components=2)
pca.fit(good_data)

# Transform the good data using the PCA fit above
reduced_data = pca.transform(good_data)

# Create a DataFrame for the reduced data
reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2'])

In [31]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

best_num_clusters = 0
best_score = 0

for n_ in range(2, 8):
    clusterer = GaussianMixture(n_components=n_, covariance_type='full')
    clusterer.fit(reduced_data)

    # Predict the cluster for each data point
    preds = clusterer.predict(reduced_data)

    # Find the cluster centers
    centers = clusterer.means_

    # Calculate the mean silhouette coefficient for the number of clusters chosen
    score = silhouette_score(reduced_data, preds)
    print("Silhouette coefficient for {} clusters: {:.3f}".format(n_, score))

    # Track the best-scoring number of clusters
    if score > best_score:
        best_score = score
        best_num_clusters = n_


Silhouette coefficient for 2 clusters: 0.422
Silhouette coefficient for 3 clusters: 0.421
Silhouette coefficient for 4 clusters: 0.346
Silhouette coefficient for 5 clusters: 0.284
Silhouette coefficient for 6 clusters: 0.293
Silhouette coefficient for 7 clusters: 0.332

In [43]:
# The best silhouette coefficient is achieved with 2 clusters
clusterer = GaussianMixture(n_components=2, covariance_type='full')
clusterer.fit(reduced_data)

# Predict the cluster for each data point
preds = clusterer.predict(reduced_data)

# Find the cluster centers
centers = clusterer.means_
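
To interpret the two segments in the original spending units, the cluster means can be mapped back through the PCA and log transforms; a sketch (the segment labels are just illustrative):

In [ ]:
# Map the cluster centers from PCA space back to log space
log_centers = pca.inverse_transform(centers)

# Undo the natural-log scaling applied earlier
true_centers = np.exp(log_centers)

# Show the recovered centers in the original units
pd.DataFrame(np.round(true_centers), columns=good_data.keys(),
             index=['Segment 0', 'Segment 1'])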

In [44]:
import matplotlib.cm as cm

def cluster_results(reduced_data, preds, centers):
    predictions = pd.DataFrame(preds, columns = ['Cluster'])
    plot_data = pd.concat([predictions, reduced_data], axis = 1)

    # Generate the cluster plot
    fig, ax = plt.subplots(figsize = (14,8))

    # Color map
    cmap = cm.get_cmap('gist_rainbow')

    # Color the points based on assigned cluster
    for i, cluster in plot_data.groupby('Cluster'):   
        cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \
                     color = cmap((i)*1.0/(len(centers)-1)), label = 'Cluster %i'%(i), s=30);

    # Plot centers with indicators
    for i, c in enumerate(centers):
        ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black', \
                   alpha = 1, linewidth = 2, marker = 'o', s=200);
        ax.scatter(x = c[0], y = c[1], marker='$%d$'%(i), alpha = 1, s=100);

    # Set plot title
    ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number")


cluster_results(reduced_data, preds, centers)



In [ ]: