In [ ]:
# IPython magic: clear the interactive namespace before the analysis starts.
%reset
In [ ]:
# Import the required modules
import pandas as pd
import numpy as np
import scipy as sp
In [ ]:
# Read the "0day" user-data export into a DataFrame.
# parse_dates lists the columns pandas should parse as dates; the negative
# positions presumably count back from the last column -- TODO confirm
# against the pandas version in use.
user_data_raw_0day = pd.read_csv(
    "/home/eyebell/local_bin/janacare/janCC/datasets/user_retention_email-campaign/user_data_binned_post30thApril_0day.csv",
    parse_dates=[-5, -4, -3],
)
In [ ]:
# User data for the first week only: read the "1st-day" and "1st-week"
# exports and stack them into a single DataFrame.
path = r'/home/eyebell/local_bin/janacare/janCC/datasets/user_retention_email-campaign/'
filelist = [
    'user_data_binned_post30thApril_1st-day.csv',
    'user_data_binned_post30thApril_1st-week.csv',
]
# Every export is read the same way: the first row is the header, no column
# is used as the index, and the columns at positions -5, -4 and -3 are
# parsed as dates.
dflist = [
    pd.read_csv(path + filename, index_col=None, header=0, parse_dates=[-5, -4, -3])
    for filename in filelist
]
# concat keeps each file's own row labels (ignore_index is not set), so
# index labels repeat across the stacked files.
user_data_raw_1st_week = pd.concat(dflist)
In [ ]:
# User data after the first week: read every later-period export and stack
# them into a single DataFrame, in the order listed here.
path = r'/home/eyebell/local_bin/janacare/janCC/datasets/user_retention_email-campaign/'
filelist = [
    'user_data_binned_post30thApril_12thweek-to-6thmonth.csv',
    'user_data_binned_post30thApril_2nd-week.csv',
    'user_data_binned_post30thApril_3rd-week.csv',
    'user_data_binned_post30thApril_4th-to-6th-week.csv',
    'user_data_binned_post30thApril_4th-week.csv',
    'user_data_binned_post30thApril_6thmonth-to-1year.csv',
    'user_data_binned_post30thApril_6th-to-8th-week.csv',
    'user_data_binned_post30thApril_8th-to-12th-week.csv',
    'user_data_binned_post30thApril_beyond-1year.csv',
]
# Every export is read the same way: the first row is the header, no column
# is used as the index, and the columns at positions -5, -4 and -3 are
# parsed as dates.
dflist = [
    pd.read_csv(path + filename, index_col=None, header=0, parse_dates=[-5, -4, -3])
    for filename in filelist
]
# concat keeps each file's own row labels (ignore_index is not set), so
# index labels repeat across the stacked files.
user_data_raw_rest = pd.concat(dflist)
In [ ]:
# Print a concise summary (columns, dtypes, non-null counts) of the first-week frame.
user_data_raw_1st_week.info()
In [ ]:
# Print a concise summary of the 0-day frame, then display the row whose
# index label is 1 as a quick sanity check of the parsed values.
user_data_raw_0day.info()
user_data_raw_0day.loc[1]
In [ ]:
# Print a concise summary (columns, dtypes, non-null counts) of the post-first-week frame.
user_data_raw_rest.info()
In [ ]:
# Convert the values in age_on_platform into seconds for all three frames.
# The exported column is taken to be in hours -- confirm against the export.
# NOTE: the column is rescaled in place, so re-running this cell multiplies
# by 3600 a second time; reload the CSVs before running it again.
SECONDS_PER_HOUR = 3600
for frame in (user_data_raw_0day, user_data_raw_1st_week, user_data_raw_rest):
    # Vectorized multiply: same result as mapping x -> x * 3600 element-wise.
    frame["age_on_platform"] = frame["age_on_platform"] * SECONDS_PER_HOUR
In [ ]:
# Clustering using Mean shift
from sklearn.cluster import MeanShift, estimate_bandwidth
#x = [1,1,5,6,1,5,10,22,23,23,50,51,51,52,100,112,130,500,512,600,12000,12230]
x = pd.Series(user_data_raw_0day['age_on_platform'])
X = np.array(zip(x,np.zeros(len(x))), dtype=np.int)
# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(X, quantile=0.5)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
for k in range(n_clusters_):
my_members = labels == k
print "cluster {0} : lenght = {1}".format(k, len(X[my_members, 0]))
cluster_sorted = sorted(X[my_members, 0])
print "cluster {0} : Min = {1} days & Max {2} days".format(k, cluster_sorted[0]*1.15741e-5, cluster_sorted[-1]*1.15741e-5)
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle
%matplotlib inline
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
my_members = labels == k
cluster_center = cluster_centers[k]
plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
In [ ]:
# Clustering using Mean shift
from sklearn.cluster import MeanShift, estimate_bandwidth
#x = [1,1,5,6,1,5,10,22,23,23,50,51,51,52,100,112,130,500,512,600,12000,12230]
x = pd.Series(user_data_raw_1st_week['age_on_platform'])
X = np.array(zip(x,np.zeros(len(x))), dtype=np.int)
# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(X, quantile=0.1)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
for k in range(n_clusters_):
my_members = labels == k
print "cluster {0} : lenght = {1}".format(k, len(X[my_members, 0]))
cluster_sorted = sorted(X[my_members, 0])
print "cluster {0} : Min = {1} days & Max {2} days".format(k, cluster_sorted[0]*1.15741e-5, cluster_sorted[-1]*1.15741e-5)
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle
%matplotlib inline
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
my_members = labels == k
cluster_center = cluster_centers[k]
plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
In [ ]:
# Clustering using Mean shift
from sklearn.cluster import MeanShift, estimate_bandwidth
#x = [1,1,5,6,1,5,10,22,23,23,50,51,51,52,100,112,130,500,512,600,12000,12230]
x = pd.Series(user_data_raw_rest['age_on_platform'])
X = np.array(zip(x,np.zeros(len(x))), dtype=np.int)
# The following bandwidth can be automatically detected using
bandwidth = estimate_bandwidth(X, quantile=0.1)
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_
cluster_centers = ms.cluster_centers_
labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)
print("number of estimated clusters : %d" % n_clusters_)
for k in range(n_clusters_):
my_members = labels == k
print "cluster {0} : lenght = {1}".format(k, len(X[my_members, 0]))
cluster_sorted = sorted(X[my_members, 0])
print "cluster {0} : Min = {1} days & Max {2} days".format(k, cluster_sorted[0]*1.15741e-5, cluster_sorted[-1]*1.15741e-5)
# Plot result
import matplotlib.pyplot as plt
from itertools import cycle
%matplotlib inline
plt.figure(1)
plt.clf()
colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
my_members = labels == k
cluster_center = cluster_centers[k]
plt.plot(X[my_members, 0], X[my_members, 1], col + '.')
plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
markeredgecolor='k', markersize=14)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()
In [ ]: