In [122]:
%matplotlib inline
import pandas as pd
import json
from pandas.io.json import json_normalize
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(color_codes=True)
from pylab import plot, show, text
import datetime
import matplotlib.dates as mdates
import pylab
In [123]:
# Load the login timestamps and count how many logins fall in each
# 15-minute bucket. The result, df_freq, has two columns:
#   login_time - start of the 15-minute bucket
#   counts     - number of logins recorded in that bucket
df_logins = pd.read_json('logins.json')

# Index by timestamp so the rows can be grouped on a time frequency.
# NOTE(review): unit='m' only takes effect when login_time is numeric; it is
# presumably a no-op here because read_json already parsed the column into
# datetimes - confirm against the raw file.
df_logins.index = pd.to_datetime(df_logins.login_time, unit='m')

# pd.TimeGrouper has been removed from pandas; pd.Grouper(freq=...) is the
# supported equivalent and produces the same 15-minute buckets.
df_freq = (
    df_logins
    .groupby(pd.Grouper(freq='15min'))
    .agg(['count'])
    .rename(columns=dict(count='counts'))
)

# agg(['count']) yields two-level columns (login_time, counts); keep only the
# inner level so the column is simply called 'counts'.
df_freq.columns = df_freq.columns.droplevel(0)

# Move the bucket timestamps out of the index into a regular column.
df_freq = df_freq.reset_index(drop=False)
In [124]:
# do scatterplot and moving window average
def window_lag(size):
    """Return an averaging kernel of length ``size``.

    Every weight equals ``1 / size``, so convolving a series with the
    returned array gives its moving average over ``size`` samples.
    """
    kernel = np.ones(size)
    return kernel / float(size)
# Scatter the raw 15-minute login counts and overlay two moving averages
# (50 and 100 buckets wide) to expose the longer-term trend.
timepart = df_freq.login_time
height = df_freq.counts

fig, ax = plt.subplots()
ax.plot(timepart, height, 'k.', label='logins per 15 min')

# A centred rolling mean with min_periods=1 averages only the samples that
# actually exist inside the window. np.convolve(..., 'same') instead pads the
# series with zeros, which drags the curve down at both ends of the plot and
# returns the wrong length when the series is shorter than the window.
for window, line_style in ((50, 'b'), (100, 'r')):
    smoothed = height.rolling(window=window, center=True, min_periods=1).mean()
    ax.plot(timepart, smoothed, line_style,
            label='moving average ({} buckets)'.format(window))

ax.set_title('Average logins is increasing from 10 to 15 from Jan to Apr 1970')
ax.set_xlabel('login time')
ax.set_ylabel('# of logins per 15 min')
ax.legend()
plt.show()
In [125]:
# Show at which times of day the busiest 15-minute buckets occur.

# Keep only the buckets with 50 or more logins. The mask is built directly
# from df_freq so it is already aligned with the frame; sorting the counts
# first had no effect on the selection and forced pandas to re-align the mask.
df = df_freq[df_freq['counts'] >= 50].reset_index(drop=True)

# Label each bucket by its time of day as 'HH:MM'. Zero-padded strings sort
# chronologically and read unambiguously on the axis (04:30 stays "04:30"
# rather than becoming the float 4.3).
df['_time_'] = df['login_time'].dt.strftime('%H:%M')
df = df.sort_values(by='_time_', ascending=True)

# Draw one outlined bar per time of day; when the same time of day occurs on
# several dates, seaborn aggregates them into a single bar with an error bar.
ax = sns.barplot(x="_time_", y="counts", data=df, linewidth=2.5,
                 facecolor=(1, 1, 1, 0), errcolor=".2", edgecolor=".2")
ax.set(ylabel='# of logins', xlabel='login time')
plt.xticks(rotation=45)
plt.show()
In [126]:
# Load the rider data set and derive the numeric columns used for modelling.
# (json is imported in the notebook's first cell.)
with open('ultimate_data_challenge.json') as f:
    data = json.load(f)

# df_usage is transformed below; df_backup keeps the records as loaded.
df_usage = pd.DataFrame(data)
df_backup = pd.DataFrame(data)

# The city column takes the values Winterfell, Astapor and King's Landing;
# filter on df_usage.city here to restrict the analysis to a single city.

# Parse the date strings so they can be subtracted.
df_usage['last_trip_date'] = pd.to_datetime(df_usage['last_trip_date'])
df_usage['signup_date'] = pd.to_datetime(df_usage['signup_date'])

# Time between signing up and the last recorded trip.
df_usage['num_of_days_active'] = df_usage['last_trip_date'] - df_usage['signup_date']

# 1 if the user's last trip was at least 150 days after signup, else 0.
df_usage['active_user'] = (
    df_usage['num_of_days_active'] >= pd.Timedelta('150 days')
).astype(int)

# 1 if iPhone, 0 otherwise.
# NOTE(review): a missing phone value also compares unequal to 'iPhone' and
# becomes 0, so those rows are not removed by the dropna below - confirm this
# is intended.
df_usage['phone'] = (df_usage['phone'] == 'iPhone').astype(int)

# 1 if True, 0 otherwise.
df_usage['ultimate_black_user'] = (df_usage['ultimate_black_user'] == True).astype(int)

# Drop every row (axis=0) that still contains a NaN. The axis is passed by
# keyword because recent pandas versions no longer accept it positionally.
df_usage = df_usage.dropna(axis=0)
In [127]:
# Summary statistics for df_usage as it stands after the NaN rows were dropped.
df_usage.describe()
Out[127]:
In [128]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Predict active_user from the numeric rider features with logistic
# regression, report hold-out accuracy, then fit a second model on
# std-scaled features so the coefficient magnitudes can be compared.

# Construct the dataset X, y.
X = df_usage[['avg_dist', 'avg_rating_by_driver', 'avg_rating_of_driver', 'avg_surge',
              'phone', 'surge_pct', 'trips_in_first_30_days', 'ultimate_black_user',
              'weekday_pct']]
y = (df_usage.active_user == 1)

# Split the data into a training and test set (80/20, fixed seed).
Xlr, Xtestlr, ylr, ytestlr = train_test_split(X.values, y.values, test_size=0.20, random_state=5)

# Fit the model on the training data.
clf = LogisticRegression()
clf.fit(Xlr, ylr)

# Print the accuracy on the test data; accuracy_score expects (y_true, y_pred).
print("Accuracy score: ", accuracy_score(ytestlr, clf.predict(Xtestlr)))

# Print the importance of each feature. Dividing every column by its standard
# deviation puts the coefficients on a comparable scale. A separate estimator
# is used so that clf stays the model whose accuracy was reported above.
feature_std = np.std(Xlr, 0)
# A constant column has std 0; leave it unscaled instead of dividing by zero.
feature_std[feature_std == 0] = 1.0
clf_scaled = LogisticRegression()
clf_scaled.fit(Xlr / feature_std, ylr)
print("Regression coefficients: ", clf_scaled.coef_)
print("Column names: ", (X.columns.values))
Data description
● city: city this user signed up in
● phone: primary device for this user
● signup_date: date of account registration; in the form ‘YYYY MM DD’
● last_trip_date: the last time this user completed a trip; in the form ‘YYYY MM DD’
● avg_dist: the average distance in miles per trip taken in the first 30 days after signup
● avg_rating_by_driver: the rider’s average rating over all of their trips
● avg_rating_of_driver: the rider’s average rating of their drivers over all of their trips
● surge_pct: the percent of trips taken with surge multiplier > 1
● avg_surge: the average surge multiplier over all of this user’s trips
● trips_in_first_30_days: the number of trips this user took in the first 30 days after signing up
● ultimate_black_user: TRUE if the user took an Ultimate Black in their first 30 days; FALSE otherwise
● weekday_pct: the percent of the user’s trips occurring during a weekday
In [129]:
#END OF REPORT