In [1]:
import math

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import missingno
import seaborn as sns

%matplotlib inline

In [2]:
# Load the wholesale customers dataset and drop the two categorical columns
data = pd.read_csv("dataset/Wholesale customers data.csv")
data.drop(['Region', 'Channel'], axis=1, inplace=True)

In [3]:
data.describe()


Out[3]:
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
count 440.000000 440.000000 440.000000 440.000000 440.000000 440.000000
mean 12000.297727 5796.265909 7951.277273 3071.931818 2881.493182 1524.870455
std 12647.328865 7380.377175 9503.162829 4854.673333 4767.854448 2820.105937
min 3.000000 55.000000 3.000000 25.000000 3.000000 3.000000
25% 3127.750000 1533.000000 2153.000000 742.250000 256.750000 408.250000
50% 8504.000000 3627.000000 4755.500000 1526.000000 816.500000 965.500000
75% 16933.750000 7190.250000 10655.750000 3554.250000 3922.000000 1820.250000
max 112151.000000 73498.000000 92780.000000 60869.000000 40827.000000 47943.000000
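
All six columns report a count of 440, so there are no missing values. As a quick sanity check, the missingno library imported above can confirm this visually; a minimal sketch:

In [ ]:
# Visual check for missing data: a fully filled matrix means no gaps
missingno.matrix(data)

# Numeric check: number of missing values per column
data.isnull().sum()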

In [4]:
# Quick look at the data
data.head()


Out[4]:
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
0 12669 9656 7561 214 2674 1338
1 7057 9810 9568 1762 3293 1776
2 6353 8808 7684 2405 3516 7844
3 13265 1196 4221 6404 507 1788
4 22615 5410 7198 3915 1777 5185

In [5]:
# Plot of the distribution of each feature
def plot_distribution(dataset, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
    plt.style.use('seaborn-whitegrid')
    fig = plt.figure(figsize=(width,height))
    fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=wspace, hspace=hspace)
    rows = math.ceil(float(dataset.shape[1]) / cols)
    for i, column in enumerate(dataset.columns):
        ax = fig.add_subplot(rows, cols, i + 1)
        ax.set_title(column)
        if dataset.dtypes[column] == object:
            g = sns.countplot(y=column, data=dataset)
            substrings = [s.get_text()[:18] for s in g.get_yticklabels()]
            g.set(yticklabels=substrings)
            plt.xticks(rotation=25)
        else:
            g = sns.distplot(dataset[column])
            plt.xticks(rotation=25)
    
plot_distribution(data, cols=3, width=20, height=20, hspace=0.45, wspace=0.5)



In [7]:
# Relevance of a feature

# Do we need a particular feature? We can make this determination quite easily
# by training a supervised regression learner on a subset of the data with
# one feature removed, and then scoring how well that model can predict the removed feature.

# The coefficient of determination, R^2, ranges up to 1, with 1 being a perfect fit.
# A negative R^2 implies the model fails to fit the data.

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Make a copy of the DataFrame, using the 'drop' function to drop the given feature
new_data = data.copy().drop('Detergents_Paper', axis=1)

# Split the data into training and testing sets using the given feature as the target
X_train, X_test, y_train, y_test = train_test_split(new_data, data['Detergents_Paper'], test_size=0.25, random_state=0)

# Create a decision tree regressor and fit it to the training set
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

# Calculate the score of the prediction using the testing set
score = regressor.score(X_test, y_test)
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))


Model has a coefficient of determination, R^2, of 0.729.
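
The same check can be repeated for every feature to see which ones are most predictable from the rest (and therefore least informative on their own); a short sketch reusing the estimator and split above:

In [ ]:
# Repeat the relevance check for each feature in turn
for feature in data.columns:
    X_train, X_test, y_train, y_test = train_test_split(
        data.drop(feature, axis=1), data[feature], test_size=0.25, random_state=0)
    regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
    print("{:<17} R^2 = {:.3f}".format(feature, regressor.score(X_test, y_test)))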

In [10]:
# Produce a scatter matrix for each pair of features in the data
pd.plotting.scatter_matrix(data, alpha=0.3, figsize=(14,8), diagonal='kde');



In [9]:
# Scale the data using the natural logarithm
log_data = np.log(data.copy())

# Produce a scatter matrix for each pair of newly-transformed features
pd.plotting.scatter_matrix(log_data, alpha=0.3, figsize=(14,8), diagonal='kde');



Detergents_Paper appears somewhat correlated with Milk and strongly correlated with Grocery. This supports the earlier R^2 result: the feature can largely be predicted from the others, so it adds little independent information for identifying customer segments. All features show heavily right-skewed (positively skewed) distributions.
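
The visual impression can be quantified with a correlation matrix; a quick numeric check on the log-transformed features:

In [ ]:
# Pairwise Pearson correlations of the log-transformed features
log_data.corr().round(2)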


In [12]:
# For each feature find the data points with extreme high or low values
for feature in log_data.keys():

    # Calculate Q1 (25th percentile of the data) for the given feature
    Q1 = np.percentile(log_data[feature], 25)

    # Calculate Q3 (75th percentile of the data) for the given feature
    Q3 = np.percentile(log_data[feature], 75)

    # Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
    step = 1.5 * (Q3 - Q1)

    # Display the outliers
    print("Data points considered outliers for the feature '{}':".format(feature))
    display(log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))])


Data points considered outliers for the feature 'Fresh':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
65 4.442651 9.950323 10.732651 3.583519 10.095388 7.260523
66 2.197225 7.335634 8.911530 5.164786 8.151333 3.295837
81 5.389072 9.163249 9.575192 5.645447 8.964184 5.049856
95 1.098612 7.979339 8.740657 6.086775 5.407172 6.563856
96 3.135494 7.869402 9.001839 4.976734 8.262043 5.379897
128 4.941642 9.087834 8.248791 4.955827 6.967909 1.098612
171 5.298317 10.160530 9.894245 6.478510 9.079434 8.740337
193 5.192957 8.156223 9.917982 6.865891 8.633731 6.501290
218 2.890372 8.923191 9.629380 7.158514 8.475746 8.759669
304 5.081404 8.917311 10.117510 6.424869 9.374413 7.787382
305 5.493061 9.468001 9.088399 6.683361 8.271037 5.351858
338 1.098612 5.808142 8.856661 9.655090 2.708050 6.309918
353 4.762174 8.742574 9.961898 5.429346 9.069007 7.013016
355 5.247024 6.588926 7.606885 5.501258 5.214936 4.844187
357 3.610918 7.150701 10.011086 4.919981 8.816853 4.700480
412 4.574711 8.190077 9.425452 4.584967 7.996317 4.127134
Data points considered outliers for the feature 'Milk':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
86 10.039983 11.205013 10.377047 6.894670 9.906981 6.805723
98 6.220590 4.718499 6.656727 6.796824 4.025352 4.882802
154 6.432940 4.007333 4.919981 4.317488 1.945910 2.079442
356 10.029503 4.897840 5.384495 8.057377 2.197225 6.306275
Data points considered outliers for the feature 'Grocery':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
75 9.923192 7.036148 1.098612 8.390949 1.098612 6.882437
154 6.432940 4.007333 4.919981 4.317488 1.945910 2.079442
Data points considered outliers for the feature 'Frozen':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
38 8.431853 9.663261 9.723703 3.496508 8.847360 6.070738
57 8.597297 9.203618 9.257892 3.637586 8.932213 7.156177
65 4.442651 9.950323 10.732651 3.583519 10.095388 7.260523
145 10.000569 9.034080 10.457143 3.737670 9.440738 8.396155
175 7.759187 8.967632 9.382106 3.951244 8.341887 7.436617
264 6.978214 9.177714 9.645041 4.110874 8.696176 7.142827
325 10.395650 9.728181 9.519735 11.016479 7.148346 8.632128
420 8.402007 8.569026 9.490015 3.218876 8.827321 7.239215
429 9.060331 7.467371 8.183118 3.850148 4.430817 7.824446
439 7.932721 7.437206 7.828038 4.174387 6.167516 3.951244
Data points considered outliers for the feature 'Detergents_Paper':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
75 9.923192 7.036148 1.098612 8.390949 1.098612 6.882437
161 9.428190 6.291569 5.645447 6.995766 1.098612 7.711101
Data points considered outliers for the feature 'Delicassen':
Fresh Milk Grocery Frozen Detergents_Paper Delicassen
66 2.197225 7.335634 8.911530 5.164786 8.151333 3.295837
109 7.248504 9.724899 10.274568 6.511745 6.728629 1.098612
128 4.941642 9.087834 8.248791 4.955827 6.967909 1.098612
137 8.034955 8.997147 9.021840 6.493754 6.580639 3.583519
142 10.519646 8.875147 9.018332 8.004700 2.995732 1.098612
154 6.432940 4.007333 4.919981 4.317488 1.945910 2.079442
183 10.514529 10.690808 9.911952 10.505999 5.476464 10.777768
184 5.789960 6.822197 8.457443 4.304065 5.811141 2.397895
187 7.798933 8.987447 9.192075 8.743372 8.148735 1.098612
203 6.368187 6.529419 7.703459 6.150603 6.860664 2.890372
233 6.871091 8.513988 8.106515 6.842683 6.013715 1.945910
285 10.602965 6.461468 8.188689 6.948897 6.077642 2.890372
289 10.663966 5.655992 6.154858 7.235619 3.465736 3.091042
343 7.431892 8.848509 10.177932 7.283448 9.646593 3.610918
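
The indices flagged for more than one feature can also be collected programmatically instead of being read off the tables above; a minimal sketch using the same IQR rule (Counter comes from the standard library):

In [ ]:
from collections import Counter

# Count how many features flag each data point as an outlier
outlier_counts = Counter()
for feature in log_data.keys():
    Q1, Q3 = np.percentile(log_data[feature], [25, 75])
    step = 1.5 * (Q3 - Q1)
    flagged = log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))]
    outlier_counts.update(flagged.index)

# Indices that are outliers in two or more features
sorted(idx for idx, count in outlier_counts.items() if count > 1)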

In [13]:
# Considering rows which are outliers in more than one feature
outliers = [65, 66, 75, 128, 154]

# Remove the outliers, if any were specified
good_data = log_data.drop(log_data.index[outliers]).reset_index(drop = True)

In [23]:
# Feature reduction
from sklearn.decomposition import PCA

# Apply PCA by fitting the good data with the same number of dimensions as features
pca = PCA(n_components=6)
pca.fit(good_data)

# Display the explained variance ratio of each component
print(pca.explained_variance_ratio_)


[ 0.44302505  0.26379218  0.1230638   0.10120908  0.04850196  0.02040793]
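
The cumulative sums make the choice of dimensionality easier to read; the first two components together explain roughly 71% of the variance:

In [ ]:
# Cumulative explained variance across the six principal components
print(np.cumsum(pca.explained_variance_ratio_))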

In [27]:
# For simplicity, we choose the first two components, which together explain roughly 0.71 of the variance
# Apply PCA by fitting the good data with only two dimensions

pca = PCA(n_components=2)
pca.fit(good_data)

# Transform the good data using the PCA fit above
reduced_data = pca.transform(good_data)

# Create a DataFrame for the reduced data
reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2'])

In [31]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score

best_num_clusters = 0
best_score = 0

for n_ in range(2, 8):
    clusterer = GaussianMixture(n_components=n_, covariance_type='full')
    clusterer.fit(reduced_data)

    # Predict the cluster for each data point
    preds = clusterer.predict(reduced_data)

    # Find the cluster centers
    centers = clusterer.means_

    # Calculate the mean silhouette coefficient for the number of clusters chosen
    score = silhouette_score(reduced_data, preds)
    print("Silhouette coefficient for {} clusters: {:.3f}".format(n_, score))

    # Track the best-scoring number of clusters
    if score > best_score:
        best_score = score
        best_num_clusters = n_


Silhouette coefficient for 2 clusters: 0.422
Silhouette coefficient for 3 clusters: 0.421
Silhouette coefficient for 4 clusters: 0.346
Silhouette coefficient for 5 clusters: 0.284
Silhouette coefficient for 6 clusters: 0.293
Silhouette coefficient for 7 clusters: 0.332

In [43]:
# The best silhouette coefficient is achieved with 2 clusters
clusterer = GaussianMixture(n_components=2, covariance_type='full')
clusterer.fit(reduced_data)

# Predict the cluster for each data point
preds = clusterer.predict(reduced_data)

# Find the cluster centers
centers = clusterer.means_
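
To interpret the two segments in the original spending units, the cluster means can be mapped back through the PCA and log transforms; a sketch (the segment labels are just illustrative):

In [ ]:
# Map the cluster centers from PCA space back to log space
log_centers = pca.inverse_transform(centers)

# Undo the natural-log scaling applied earlier
true_centers = np.exp(log_centers)

# Show the recovered centers in the original units
pd.DataFrame(np.round(true_centers), columns=good_data.keys(),
             index=['Segment 0', 'Segment 1'])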

In [44]:
import matplotlib.cm as cm

def cluster_results(reduced_data, preds, centers):
    predictions = pd.DataFrame(preds, columns = ['Cluster'])
    plot_data = pd.concat([predictions, reduced_data], axis = 1)

    # Generate the cluster plot
    fig, ax = plt.subplots(figsize = (14,8))

    # Color map
    cmap = cm.get_cmap('gist_rainbow')

    # Color the points based on assigned cluster
    for i, cluster in plot_data.groupby('Cluster'):   
        cluster.plot(ax = ax, kind = 'scatter', x = 'Dimension 1', y = 'Dimension 2', \
                     color = cmap((i)*1.0/(len(centers)-1)), label = 'Cluster %i'%(i), s=30);

    # Plot centers with indicators
    for i, c in enumerate(centers):
        ax.scatter(x = c[0], y = c[1], color = 'white', edgecolors = 'black', \
                   alpha = 1, linewidth = 2, marker = 'o', s=200);
        ax.scatter(x = c[0], y = c[1], marker='$%d$'%(i), alpha = 1, s=100);

    # Set plot title
    ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number")


cluster_results(reduced_data, preds, centers)



In [ ]: