In [1]:
%matplotlib inline
import pandas as pd
import json
from pandas import json_normalize
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
from pylab import plot, show, text
import datetime
import matplotlib.dates as mdates
import pylab
In [2]:
# load as Pandas dataframe
df_users = pd.read_csv('takehome_users.csv', encoding='latin-1')
# look at basic column descriptions
df_users.describe()
Out[2]:
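Before filling them, it helps to see where the missing values actually are; a minimal check (not part of the original run) would be:
# count missing values per column in the raw data
print(df_users.isnull().sum())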
In [3]:
# replace NaN values with zero
df_users = df_users.fillna(0)
In [4]:
# peek at the dataframe
df_users.head(3)
Out[4]:
In [5]:
# which organizations have the most users?
# ANSWER: the organizations with the most users are ids 1 through 10, except id 8
df_users.org_id.value_counts().head(10)
Out[5]:
In [6]:
# which users sent the most invitations?
df_users.invited_by_user_id.value_counts().head(10)
Out[6]:
In [7]:
# who are the users that sent the most invitations?
df_users[df_users['object_id'].isin([10741, 2527, 2308, 1525, 11770])]['name']
Out[7]:
In [8]:
# check whether any email addresses are missing
print(len(df_users[df_users.email.isnull()]))
In [9]:
# how many opted in to mailing list
df_users.opted_in_to_mailing_list.value_counts()
Out[9]:
In [10]:
# Create a pie chart of mailing-list opt-ins
df_mail = df_users.opted_in_to_mailing_list.value_counts()
df_mail = df_mail.reset_index()
# Draw the pie with percentage labels
plt.pie(
    df_mail['opted_in_to_mailing_list'],
    labels=df_mail['index'],
    shadow=False,
    startangle=0,
    autopct='%1.1f%%',
)
# Add title
plt.title('25% of users have opted_in_to_mailing_list')
plt.axis('equal')
# Display plot
plt.tight_layout()
plt.show()
In [11]:
# how many enabled for marketing drip
df_users.enabled_for_marketing_drip.value_counts()
Out[11]:
In [12]:
# Create a pie chart of marketing-drip enablement
df_drip = df_users.enabled_for_marketing_drip.value_counts()
df_drip = df_drip.reset_index()
# Draw the pie with percentage labels
plt.pie(
    df_drip['enabled_for_marketing_drip'],
    labels=df_drip['index'],
    shadow=False,
    startangle=0,
    autopct='%1.1f%%',
)
# Add title
plt.title('15% of users have enabled_for_marketing_drip')
plt.axis('equal')
# Display plot
plt.tight_layout()
plt.show()
In [13]:
#creation source distribution
df_users.creation_source.value_counts()
Out[13]:
In [14]:
# Create a pie chart
# create dataframe for creation source
df_source = df_users.creation_source.value_counts()
df_source = df_source.reset_index()
# Draw the pie with percentage labels
plt.pie(
    df_source['creation_source'],
    labels=df_source['index'],
    shadow=False,
    startangle=0,
    autopct='%1.1f%%',
)
# Add title
plt.title('Percent distribution of creation source')
plt.axis('equal')
# Display plot
plt.tight_layout()
plt.show()
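The three pie charts above repeat the same plotting boilerplate; a small helper along these lines (a sketch, with plot_pie as a hypothetical name, not part of the original run) would remove the duplication:
def plot_pie(series, title):
    # value_counts() gives the counts; its index holds the category labels
    counts = series.value_counts()
    plt.pie(
        counts.values,
        labels=counts.index,
        shadow=False,
        startangle=0,
        autopct='%1.1f%%',
    )
    plt.title(title)
    plt.axis('equal')
    plt.tight_layout()
    plt.show()

# example usage
plot_pie(df_users.creation_source, 'Percent distribution of creation source')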
In [15]:
# read file into dataframe
df_engage = pd.read_csv('takehome_user_engagement.csv')
df_engage.head(3)
Out[15]:
In [16]:
# top 10 most active users (these may or may not be adopted users)
df_engage.user_id.value_counts().head(10)
Out[16]:
In [17]:
# who are these top 3 users?
df_users[df_users['object_id'].isin([3623, 906, 1811])]
Out[17]:
In [18]:
# convert string time stamp into datetime
df_engage['time_stamp'] = pd.to_datetime(df_engage['time_stamp'])
# set time_stamp as the index so pd.Grouper can form the time groups (later on)
df_engage.index = df_engage['time_stamp']
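As a quick sanity check (not part of the original run), the date range covered by the engagement log can be read off the new DatetimeIndex:
# first and last login timestamps in the engagement data
print(df_engage.index.min(), df_engage.index.max())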
In [19]:
# users who have logged into the product on three separate days within at least one seven-day period
# (pd.TimeGrouper is deprecated; pd.Grouper(freq='7D') groups on the DatetimeIndex instead)
df_adoption = (df_engage
               .groupby(['user_id', pd.Grouper(freq='7D')])
               .filter(lambda x: x.index.normalize().nunique() >= 3)
               .groupby('user_id')['visited'].sum())
# reset index so user_id and visited become regular columns
df_adoption = df_adoption.reset_index()
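Note that pd.Grouper(freq='7D') uses fixed seven-day bins, so three logins that straddle a bin boundary are not counted together. A stricter rolling check of the "three separate days within any seven-day period" definition could look like the sketch below (is_adopted is a hypothetical helper; it was not used to produce the results that follow):
def is_adopted(ts):
    # distinct login days for one user, sorted
    days = ts.dt.normalize().drop_duplicates().sort_values().reset_index(drop=True)
    if len(days) < 3:
        return False
    # adopted if some run of 3 distinct days spans at most 7 calendar days
    return ((days.shift(-2) - days).dropna() <= pd.Timedelta(days=6)).any()

adopted_flags = df_engage.groupby('user_id')['time_stamp'].apply(is_adopted)
adopted_ids = adopted_flags[adopted_flags].index  # user_ids meeting the definition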
In [20]:
# peek at some data
df_adoption.head(3)
Out[20]:
In [40]:
# merge users and adopted users dataframe
df = df_users.merge(df_adoption, left_on='object_id', right_on='user_id', how='outer')
# drop column user_id since it is duplicate with object_id
df.drop('user_id', axis=1, inplace=True)
# replace NaN with zero
df = df.fillna(0)
from datetime import datetime
# convert unix timestamp to datetime (users with no recorded session were filled with 0, i.e. the Unix epoch)
df['last_session_creation_time'] = df['last_session_creation_time'].apply(lambda x: datetime.fromtimestamp(int(x)))
df['creation_time'] = df['creation_time'].apply(lambda x: datetime.strptime(str(x), '%Y-%m-%d %H:%M:%S'))
# calculate days between signup and the last recorded session
df['days_since_signup'] = df['last_session_creation_time'] - df['creation_time']
df['days_since_signup'] = df['days_since_signup'].apply(lambda x: abs(x.total_seconds() / 60 / 60 / 24))
#convert creation_source into numeric values
df['creation_source']= df['creation_source'].astype('category')
cat_columns = df.select_dtypes(['category']).columns
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
# flag adopted users: 1 if the user had at least one qualifying visit, else 0
df['adopted_user']=df['visited'].apply(lambda x: int(x > 0))
# column visited is not needed
df.drop('visited', axis=1, inplace=True)
df.head(3)
Out[40]:
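Before modelling, it is worth checking how imbalanced the adopted_user target is, since plain accuracy can look good even for a model that always predicts the majority class; a quick check (not part of the original run):
# share of adopted vs. non-adopted users
print(df['adopted_user'].value_counts(normalize=True))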
In [41]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#construct the dataset X, y
X = df[['creation_source', 'opted_in_to_mailing_list', 'enabled_for_marketing_drip', 'days_since_signup']]
y = (df.adopted_user == 1)
# Split the data into a training and test set.
Xlr, Xtestlr, ylr, ytestlr = train_test_split(X.values, y.values, test_size=0.20, random_state=5)
clf = LogisticRegression()
# Fit the model on the training data.
clf.fit(Xlr, ylr)
# Print the accuracy from the testing data.
print("Accuracy score: ", accuracy_score(clf.predict(Xtestlr), ytestlr))
# Print importance of each features
clf.fit(Xlr / np.std(Xlr, 0), ylr)
print("Regression coefficients: ", clf.coef_)
print("Intecept: ", clf.intercept_)
print("Column names: ", (X.columns.values))