Suppose we want to cluster our customers according to their buying behavior, e.g. their purchase frequency. The Kmeans function in scikit is one way to do that.
In [2]:
import pandas as pd
import contiamo
import datetime
import seaborn as sns
import numpy as np
from sklearn.cluster import KMeans
from sklearn import datasets
In [3]:
transactions = %contiamo query query:sql:48590597:411:g71GXzJjsx4Uvad11ouKjoYbQUNNPy-qRMKkBNZfyx4
customers = %contiamo query query:sql:48590597:441:MG5W2dMjXzYgsHsgdQYzmhv44dxEQX2Lodu5Uh2Hx_s
applications = %contiamo query query:sql:48590597:442:-gz3nbw1fdmtSXkD4zGNA-cVa7s6sQtRn8upCSn6uys
In [4]:
df = pd.DataFrame({
'customer_id' : transactions['Field customer id'],
'revenue' : pd.to_numeric(transactions['Field app price']),
'n_of_purchases' : [1]*len(transactions)
})
df.head()
Out[4]:
In [5]:
pf = df.groupby('customer_id').agg({
'revenue' : 'sum',
'n_of_purchases' : 'sum'
})
pf['average_transaction_price'] = pf['revenue'] / pf['n_of_purchases']
del pf['revenue']
pf.head()
Out[5]:
In [6]:
%matplotlib inline
sns.set(style="ticks", color_codes=True)
In [7]:
g = sns.pairplot(pf)
In [12]:
estimator = KMeans(n_clusters=4)
estimator.fit(pf)
labels = estimator.labels_
pf["labels"] = labels
In [13]:
g = sns.pairplot(pf, vars=['n_of_purchases', 'average_transaction_price'], hue="labels")
We see that 4 groups have been created:
A new column has been added in our dataframe containing that information. So we could observe each group’s buying behavior even further by uploading the new dataframe into Contiamo and exploring the data filtered by cluster in the Explore section.