In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
%matplotlib inline
In [2]:
data = pd.read_csv("dataset/Wholesale customers data.csv")
data.drop(['Region', 'Channel'], axis = 1, inplace = True)
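Before any analysis it is worth confirming the dataset is complete; the sketch below simply puts the missingno library imported above to work for a quick visual check (the wholesale customers data is expected to contain no missing values).
In [ ]:
# Quick completeness check; the matrix should appear fully filled and the
# per-column null counts should all be zero.
missingno.matrix(data)
print(data.isnull().sum())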
In [3]:
data.describe()
Out[3]:
In [4]:
# Quick look at the data
data.head()
Out[4]:
In [5]:
# Plot of the distribution of each feature
def plot_distribution(dataset, cols=5, width=20, height=15, hspace=0.2, wspace=0.5):
plt.style.use('seaborn-whitegrid')
fig = plt.figure(figsize=(width,height))
fig.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=wspace, hspace=hspace)
rows = math.ceil(float(dataset.shape[1]) / cols)
for i, column in enumerate(dataset.columns):
ax = fig.add_subplot(rows, cols, i + 1)
ax.set_title(column)
if dataset.dtypes[column] == np.object:
g = sns.countplot(y=column, data=dataset)
substrings = [s.get_text()[:18] for s in g.get_yticklabels()]
g.set(yticklabels=substrings)
plt.xticks(rotation=25)
else:
g = sns.distplot(dataset[column])
plt.xticks(rotation=25)
plot_distribution(data, cols=3, width=20, height=20, hspace=0.45, wspace=0.5)
In [7]:
# Relevance of a feature:
# Do we need a particular feature? We can make this determination quite easily
# by training a supervised regression learner on a subset of the data with
# one feature removed, and then scoring how well that model can predict the removed feature.
# The coefficient of determination, R^2, ranges up to 1, with 1 being a perfect fit;
# a negative R^2 implies the model fails to fit the data.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
# Make a new DataFrame using 'drop' to remove the given feature
new_data = data.drop(columns='Detergents_Paper')
# Split the data into training and testing sets using the given feature as the target
X_train, X_test, y_train, y_test = train_test_split(new_data, data['Detergents_Paper'],
                                                    test_size=0.25, random_state=0)
# Create a decision tree regressor and fit it to the training set
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
# Calculate the score of the prediction using the testing set
score = regressor.score(X_test, y_test)
print("Model has a coefficient of determination, R^2, of {:.3f}.".format(score))
In [10]:
# Produce a scatter matrix for each pair of features in the data
pd.plotting.scatter_matrix(data, alpha = 0.3, figsize = (14,8), diagonal = 'kde');
In [9]:
# Scale the data using the natural logarithm
log_data = np.log(data.copy())
# Produce a scatter matrix for each pair of newly-transformed features
pd.plotting.scatter_matrix(log_data, alpha = 0.3, figsize = (14,8), diagonal = 'kde');
Detergents_Paper appears to be somewhat correlated with Milk and rather highly correlated with Grocery. This confirms the suspicion that it is not especially relevant for identifying a specific customer with our model, since it can largely be predicted from the other features. All features appear to have a heavily right-skewed distribution.
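These visual impressions can be quantified with a correlation heatmap of the log-transformed features; the sketch below uses nothing beyond the pandas and seaborn APIs already imported.
In [ ]:
# Sketch: pairwise correlations of the log-transformed features
corr = log_data.corr()
plt.figure(figsize=(8, 6))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation matrix of log-transformed features');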
In [12]:
# For each feature, find the data points with extreme high or low values
for feature in log_data.keys():
    # Calculate Q1 (25th percentile of the data) for the given feature
    Q1 = np.percentile(log_data[feature], 25)
    # Calculate Q3 (75th percentile of the data) for the given feature
    Q3 = np.percentile(log_data[feature], 75)
    # Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
    step = 1.5 * (Q3 - Q1)
    # Display the outliers
    print("Data points considered outliers for the feature '{}':".format(feature))
    display(log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))])
In [13]:
# Considering rows which are outliers in multiple features
outliers = [65, 66, 75, 128, 154]
# Remove the outliers, if any were specified
good_data = log_data.drop(log_data.index[outliers]).reset_index(drop = True)
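The hard-coded index list above comes from inspecting the per-feature output; as a cross-check, the short sketch below counts, under the same 1.5 × IQR rule, how many features flag each row and keeps those flagged more than once. It should recover the indices used above.
In [ ]:
# Sketch: derive the multi-feature outliers programmatically
from collections import Counter
flag_counts = Counter()
for feature in log_data.keys():
    Q1, Q3 = np.percentile(log_data[feature], [25, 75])
    step = 1.5 * (Q3 - Q1)
    mask = ~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))
    flag_counts.update(log_data.index[mask])
print(sorted(idx for idx, n in flag_counts.items() if n > 1))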
In [23]:
# Feature reduction
from sklearn.decomposition import PCA
# Apply PCA by fitting the good data with the same number of dimensions as features
pca = PCA(n_components=6)
pca.fit(good_data)
# Display the cumulative sums of the explained variance ratios
print(pca.explained_variance_ratio_.cumsum())
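To make the choice of dimensionality below easier to justify, a small optional sketch can plot the cumulative explained variance against the number of components.
In [ ]:
# Sketch: cumulative explained variance curve
cum_var = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(8, 4))
plt.plot(range(1, len(cum_var) + 1), cum_var, marker='o')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance')
plt.ylim(0, 1.05);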
In [27]:
# For simplicity, we choose the first two components, which have a cumulative explained variance of about 0.7
# Apply PCA by fitting the good data with only two dimensions
pca = PCA(n_components=2)
pca.fit(good_data)
# Transform the good data using the PCA fit above
reduced_data = pca.transform(good_data)
# Create a DataFrame for the reduced data
reduced_data = pd.DataFrame(reduced_data, columns = ['Dimension 1', 'Dimension 2'])
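To interpret what the two retained dimensions represent, one optional sketch is to inspect the PCA loadings, i.e. how strongly each original log-scaled feature contributes to each dimension.
In [ ]:
# Sketch: PCA loadings for the two retained dimensions
loadings = pd.DataFrame(pca.components_, columns=good_data.columns,
                        index=['Dimension 1', 'Dimension 2'])
display(loadings.round(3))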
In [31]:
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
best_num_clusters = 0
best_score = -1.0
for n_ in range(2, 8):
    clusterer = GaussianMixture(n_components=n_, covariance_type='full', random_state=0)
    clusterer.fit(reduced_data)
    # Predict the cluster for each data point
    preds = clusterer.predict(reduced_data)
    # Find the cluster centers
    centers = clusterer.means_
    # Calculate the mean silhouette coefficient for the number of clusters chosen
    score = silhouette_score(reduced_data, preds)
    print("Silhouette coefficient for {} clusters: {:.3f}".format(n_, score))
    # Keep track of the best-scoring number of clusters
    if score > best_score:
        best_score, best_num_clusters = score, n_
print("Best number of clusters: {}".format(best_num_clusters))
In [43]:
# The best silhouette coefficient is achieved with 2 clusters
clusterer = GaussianMixture(n_components=2, covariance_type='full', random_state=0)
clusterer.fit(reduced_data)
# Predict the cluster for each data point
preds = clusterer.predict(reduced_data)
# Find the cluster centers
centers = clusterer.means_
In [44]:
def cluster_results(reduced_data, preds, centers):
    predictions = pd.DataFrame(preds, columns=['Cluster'])
    plot_data = pd.concat([predictions, reduced_data], axis=1)
    # Generate the cluster plot
    fig, ax = plt.subplots(figsize=(14, 8))
    # Color map
    cmap = plt.get_cmap('gist_rainbow')
    # Color the points based on assigned cluster
    for i, cluster in plot_data.groupby('Cluster'):
        cluster.plot(ax=ax, kind='scatter', x='Dimension 1', y='Dimension 2',
                     color=cmap(i * 1.0 / (len(centers) - 1)), label='Cluster %i' % (i), s=30)
    # Plot centers with indicators
    for i, c in enumerate(centers):
        ax.scatter(x=c[0], y=c[1], color='white', edgecolors='black',
                   alpha=1, linewidth=2, marker='o', s=200)
        ax.scatter(x=c[0], y=c[1], marker='$%d$' % (i), alpha=1, s=100)
    # Set plot title
    ax.set_title("Cluster Learning on PCA-Reduced Data - Centroids Marked by Number")
cluster_results(reduced_data, preds, centers)
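Since the clustering was performed in log-scaled, PCA-reduced space, a natural follow-up sketch (not part of the cells above) is to map the cluster centers back to the original spending units by inverting the PCA projection and then undoing the log transform.
In [ ]:
# Sketch: recover the cluster centers in the original units
log_centers = pca.inverse_transform(centers)
true_centers = np.exp(log_centers)
true_centers = pd.DataFrame(np.round(true_centers), columns=good_data.columns,
                            index=['Segment 0', 'Segment 1'])
display(true_centers)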
In [ ]: