In [ ]:
%reset

In [ ]:
# Import the required modules
import pandas as pd
import numpy as np
import scipy as sp

In [ ]:
# Read the day-0 user data file into a DataFrame.
# parse_dates takes a list of columns that are to be parsed as dates.
# NOTE(review): the negative entries presumably address columns counted from the end of the
# file (the last five, four and three) - confirm the installed pandas accepts negative indices.
user_data_raw_0day = pd.read_csv(
    "/home/eyebell/local_bin/janacare/janCC/datasets/user_retention_email-campaign/user_data_binned_post30thApril_0day.csv",
    parse_dates=[-5, -4, -3],
)

In [ ]:
# User data for the first week only: the first-day and first-week bins are read
# separately and stacked into a single frame, in the order listed below.
path = r'/home/eyebell/local_bin/janacare/janCC/datasets/user_retention_email-campaign/'
filelist = [
    'user_data_binned_post30thApril_1st-day.csv',
    'user_data_binned_post30thApril_1st-week.csv',
]

# One DataFrame per file; every file is read with the same options.
dflist = [
    pd.read_csv(path + filename, index_col=None, header=0, parse_dates=[-5, -4, -3])
    for filename in filelist
]

# pd.concat keeps each file's own row labels, so index values can repeat across files.
user_data_raw_1st_week = pd.concat(dflist)

In [ ]:
# User data after the first week: every remaining bin file is read separately and
# stacked into a single frame, in the order listed below.
path = r'/home/eyebell/local_bin/janacare/janCC/datasets/user_retention_email-campaign/'
filelist = [
    'user_data_binned_post30thApril_12thweek-to-6thmonth.csv',
    'user_data_binned_post30thApril_2nd-week.csv',
    'user_data_binned_post30thApril_3rd-week.csv',
    'user_data_binned_post30thApril_4th-to-6th-week.csv',
    'user_data_binned_post30thApril_4th-week.csv',
    'user_data_binned_post30thApril_6thmonth-to-1year.csv',
    'user_data_binned_post30thApril_6th-to-8th-week.csv',
    'user_data_binned_post30thApril_8th-to-12th-week.csv',
    'user_data_binned_post30thApril_beyond-1year.csv',
]

# One DataFrame per file; every file is read with the same options.
dflist = [
    pd.read_csv(path + filename, index_col=None, header=0, parse_dates=[-5, -4, -3])
    for filename in filelist
]

# pd.concat keeps each file's own row labels, so index values can repeat across files.
user_data_raw_rest = pd.concat(dflist)

In [ ]:
# Print a summary of the first-week frame (columns, non-null counts, dtypes)
# as a sanity check that the files were read and concatenated.
user_data_raw_1st_week.info()

In [ ]:
# Print a summary of the day-0 frame (columns, non-null counts, dtypes).
user_data_raw_0day.info()
# Display the row whose index label is 1 as a quick look at a single record.
user_data_raw_0day.loc[1]

In [ ]:
# Print a summary of the post-first-week frame (columns, non-null counts, dtypes)
# as a sanity check that the files were read and concatenated.
user_data_raw_rest.info()

Clustering the three data sets:

We use Mean Shift clustering here. Changing the `quantile` value passed to `estimate_bandwidth` changes the estimated bandwidth, and therefore the number of clusters that are created!

Day 0 clusters


In [ ]:
# Convert the values in age_on_platform from hours into seconds, for all three data sets.
# NOTE: the column is overwritten in place, so running this cell a second time
# multiplies the values by 3600 again.
def convert_hr_to_sec(x):
    """Return the number of seconds in ``x`` hours."""
    return x*3600

for frame in (user_data_raw_0day, user_data_raw_1st_week, user_data_raw_rest):
    frame["age_on_platform"] = frame["age_on_platform"].map(convert_hr_to_sec)

In [ ]:
# Clustering using Mean shift

from sklearn.cluster import MeanShift, estimate_bandwidth

#x = [1,1,5,6,1,5,10,22,23,23,50,51,51,52,100,112,130,500,512,600,12000,12230]
x = pd.Series(user_data_raw_0day['age_on_platform'])

X = np.array(zip(x,np.zeros(len(x))), dtype=np.int)

# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(X, quantile=0.5)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)
for k in range(n_clusters_):
    my_members = labels == k
    print "cluster {0} : lenght = {1}".format(k, len(X[my_members, 0]))
    cluster_sorted = sorted(X[my_members, 0])
    print "cluster {0} : Min = {1} days & Max {2} days".format(k, cluster_sorted[0]*1.15741e-5, cluster_sorted[-1]*1.15741e-5)

# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

%matplotlib inline

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

Week 1 clusters


In [ ]:
# Clustering using Mean shift

from sklearn.cluster import MeanShift, estimate_bandwidth

#x = [1,1,5,6,1,5,10,22,23,23,50,51,51,52,100,112,130,500,512,600,12000,12230]
x = pd.Series(user_data_raw_1st_week['age_on_platform'])

X = np.array(zip(x,np.zeros(len(x))), dtype=np.int)

# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(X, quantile=0.1)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)
for k in range(n_clusters_):
    my_members = labels == k
    print "cluster {0} : lenght = {1}".format(k, len(X[my_members, 0]))
    cluster_sorted = sorted(X[my_members, 0])
    print "cluster {0} : Min = {1} days & Max {2} days".format(k, cluster_sorted[0]*1.15741e-5, cluster_sorted[-1]*1.15741e-5)

# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

%matplotlib inline

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

Post Week 1 clusters


In [ ]:
# Clustering using Mean shift

from sklearn.cluster import MeanShift, estimate_bandwidth

#x = [1,1,5,6,1,5,10,22,23,23,50,51,51,52,100,112,130,500,512,600,12000,12230]
x = pd.Series(user_data_raw_rest['age_on_platform'])

X = np.array(zip(x,np.zeros(len(x))), dtype=np.int)

# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(X, quantile=0.1)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)
for k in range(n_clusters_):
    my_members = labels == k
    print "cluster {0} : lenght = {1}".format(k, len(X[my_members, 0]))
    cluster_sorted = sorted(X[my_members, 0])
    print "cluster {0} : Min = {1} days & Max {2} days".format(k, cluster_sorted[0]*1.15741e-5, cluster_sorted[-1]*1.15741e-5)

# Plot result
import matplotlib.pyplot as plt
from itertools import cycle

%matplotlib inline

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k
    cluster_center = cluster_centers[k]
    plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

In [ ]: