In [1]:
%matplotlib inline
import pandas as pd
import json
from pandas import json_normalize
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
from pylab import plot, show, text
import datetime
import matplotlib.dates as mdates
import pylab
In [2]:
# load as Pandas dataframe
df_users = pd.read_csv('takehome_users.csv', encoding='latin-1')
# look at basic column descriptions
df_users.describe()
Out[2]:
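Before filling them, it helps to see where the missing values actually are; a minimal check (not part of the original run) would be:
# count missing values per column in the raw data
print(df_users.isnull().sum())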
In [3]:
# replace NaN values with zero
df_users = df_users.fillna(0)
In [4]:
# peek at the dataframe
df_users.head(3)
Out[4]:
In [5]:
# which organizations have the most users?
# ANSWER: the organizations with the most users are ids 1 through 10, except id 8
df_users.org_id.value_counts().head(10)
Out[5]:
In [6]:
# which users sent the most invitations?
df_users.invited_by_user_id.value_counts().head(10)
Out[6]:
In [7]:
# who are the users that sent the most invitations?
df_users[df_users['object_id'].isin([10741, 2527, 2308, 1525, 11770])]['name']
Out[7]:
In [8]:
# check whether any email addresses are missing
print(len(df_users[df_users.email.isnull()]))
In [9]:
# how many opted in to mailing list
df_users.opted_in_to_mailing_list.value_counts()
Out[9]:
In [10]:
# Create a pie chart of mailing-list opt-ins
df_mail = df_users.opted_in_to_mailing_list.value_counts()
df_mail = df_mail.reset_index()
# Draw the pie with percentage labels
plt.pie(
    df_mail['opted_in_to_mailing_list'],
    labels=df_mail['index'],
    shadow=False,
    startangle=0,
    autopct='%1.1f%%',
)
# Add title
plt.title('25% of users have opted_in_to_mailing_list')
plt.axis('equal')
# Display plot
plt.tight_layout()
plt.show()
In [11]:
# how many enabled for marketing drip
df_users.enabled_for_marketing_drip.value_counts()
Out[11]:
In [12]:
# Create a pie chart of marketing-drip enablement
df_drip = df_users.enabled_for_marketing_drip.value_counts()
df_drip = df_drip.reset_index()
# Draw the pie with percentage labels
plt.pie(
    df_drip['enabled_for_marketing_drip'],
    labels=df_drip['index'],
    shadow=False,
    startangle=0,
    autopct='%1.1f%%',
)
# Add title
plt.title('15% of users have enabled_for_marketing_drip')
plt.axis('equal')
# Display plot
plt.tight_layout()
plt.show()
In [13]:
#creation source distribution
df_users.creation_source.value_counts()
Out[13]:
In [14]:
# Create a pie chart
# create dataframe for creation source
df_source = df_users.creation_source.value_counts()
df_source = df_source.reset_index()
# Draw the pie with percentage labels
plt.pie(
    df_source['creation_source'],
    labels=df_source['index'],
    shadow=False,
    startangle=0,
    autopct='%1.1f%%',
)
# Add title
plt.title('Percent distribution of creation source')
plt.axis('equal')
# Display plot
plt.tight_layout()
plt.show()
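The three pie charts above repeat the same plotting boilerplate; a small helper along these lines (a sketch, with plot_pie as a hypothetical name, not part of the original run) would remove the duplication:
def plot_pie(series, title):
    # value_counts() gives the counts; its index holds the category labels
    counts = series.value_counts()
    plt.pie(
        counts.values,
        labels=counts.index,
        shadow=False,
        startangle=0,
        autopct='%1.1f%%',
    )
    plt.title(title)
    plt.axis('equal')
    plt.tight_layout()
    plt.show()

# example usage
plot_pie(df_users.creation_source, 'Percent distribution of creation source')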
In [15]:
# read file into dataframe
df_engage = pd.read_csv('takehome_user_engagement.csv')
df_engage.head(3)
Out[15]:
In [16]:
# top 10 most active users (these may or may not be adopted users)
df_engage.user_id.value_counts().head(10)
Out[16]:
In [17]:
# who are these top 3 users?
df_users[df_users['object_id'].isin([3623, 906, 1811])]
Out[17]:
In [18]:
# convert string time stamp into datetime
df_engage['time_stamp'] = pd.to_datetime(df_engage['time_stamp'])
# set time_stamp as the index so pd.Grouper can form the time groups (later on)
df_engage.index = df_engage['time_stamp']
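As a quick sanity check (not part of the original run), the date range covered by the engagement log can be read off the new DatetimeIndex:
# first and last login timestamps in the engagement data
print(df_engage.index.min(), df_engage.index.max())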
In [19]:
# users who have logged into the product on three separate days within at least one seven-day period
# (pd.TimeGrouper is deprecated; pd.Grouper(freq='7D') groups on the DatetimeIndex instead)
df_adoption = (df_engage
               .groupby(['user_id', pd.Grouper(freq='7D')])
               .filter(lambda x: x.index.normalize().nunique() >= 3)
               .groupby('user_id')['visited'].sum())
# reset index so user_id and visited become regular columns
df_adoption = df_adoption.reset_index()
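Note that pd.Grouper(freq='7D') uses fixed seven-day bins, so three logins that straddle a bin boundary are not counted together. A stricter rolling check of the "three separate days within any seven-day period" definition could look like the sketch below (is_adopted is a hypothetical helper; it was not used to produce the results that follow):
def is_adopted(ts):
    # distinct login days for one user, sorted
    days = ts.dt.normalize().drop_duplicates().sort_values().reset_index(drop=True)
    if len(days) < 3:
        return False
    # adopted if some run of 3 distinct days spans at most 7 calendar days
    return ((days.shift(-2) - days).dropna() <= pd.Timedelta(days=6)).any()

adopted_flags = df_engage.groupby('user_id')['time_stamp'].apply(is_adopted)
adopted_ids = adopted_flags[adopted_flags].index  # user_ids meeting the definition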
In [20]:
# peek at some data
df_adoption.head(3)
Out[20]:
In [40]:
# merge users and adopted users dataframe
df = df_users.merge(df_adoption, left_on='object_id', right_on='user_id', how='outer')
# drop column user_id since it is duplicate with object_id
df.drop('user_id', axis=1, inplace=True)
# replace NaN with zero
df = df.fillna(0)
from datetime import datetime
# convert unix timestamp to datetime (users with no recorded session were filled with 0, i.e. the Unix epoch)
df['last_session_creation_time'] = df['last_session_creation_time'].apply(lambda x: datetime.fromtimestamp(int(x)))
df['creation_time'] = df['creation_time'].apply(lambda x: datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S'))
# calculate days between signup and the last recorded session
df['days_since_signup'] = df['last_session_creation_time'] - df['creation_time']
df['days_since_signup'] = df['days_since_signup'].apply(lambda x: abs(x.total_seconds() / 60 / 60 / 24))
#convert creation_source into numeric values
df['creation_source']= df['creation_source'].astype('category')
cat_columns = df.select_dtypes(['category']).columns
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
# flag adopted users: 1 if the user had at least one qualifying visit, else 0
df['adopted_user']=df['visited'].apply(lambda x: int(x > 0))
# column visited is not needed
df.drop('visited', axis=1, inplace=True)
df.head(3)
Out[40]:
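Before modelling, it is worth checking how imbalanced the adopted_user target is, since plain accuracy can look good even for a model that always predicts the majority class; a quick check (not part of the original run):
# share of adopted vs. non-adopted users
print(df['adopted_user'].value_counts(normalize=True))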
In [41]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#construct the dataset X, y
X = df[['creation_source', 'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'days_since_signup']]
y = (df.adopted_user == 1)
# Split the data into a training and test set.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(X.values, y.values, test_size=0.20, random_state=5)
clf = LogisticRegression()
# Fit the model on the training data.
clf.fit(Xlr, ylr)
# Print the accuracy from the testing data.
print("Accuracy score: ", accuracy_score(clf.predict(Xtestlr), ytestlr))
# Print importance of each features
clf.fit(Xlr / np.std(Xlr, 0), ylr)
print("Regression coefficients: ", clf.coef_)
print("Intecept: ", clf.intercept_)
print("Column names: ", (X.columns.values))