In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
In [2]:
# Load the users table and preview its first five rows.
users = pd.read_csv(filepath_or_buffer='timeseries_users.csv')
users.head(n=5)
Out[2]:
In [3]:
# Load the events table, then move the parsed 'event_date' column into the
# index (pop() removes the column from the frame and returns it).
events = pd.read_csv('timeseries_events.csv')
events.index = pd.to_datetime(events.pop('event_date'), format='%Y-%m-%d %H:%M:%S')
events.tail()
Out[3]:
In [4]:
# Display summary statistics of the users table.
users.describe()
Out[4]:
Users' ages range from 24 to 63 years, with a mean of 41 years.
In [5]:
# 2. Check for NaNs:
# Print one boolean per table (users first, then events): True when the
# table contains at least one missing value.
for table in (users, events):
    print(table.isnull().values.any())
In [6]:
# 3. Check for duplicated entries
# A boolean mask selects rows directly; comparing it with `== True` is redundant.
users_duplicated = users[users.duplicated()]
print('Users: duplicated entries {}'.format(len(users_duplicated)))
# DataFrame.duplicated() compares column values only and ignores the index.
# The event timestamp was moved into the index when the events were loaded,
# so it is brought back as a column for the comparison. Otherwise two events
# of the same user at different times would be reported as duplicates.
events_duplicated = events[events.reset_index().duplicated().values]
print('Events: duplicated entries {}'.format(len(events_duplicated)))
Many duplicated entries are found in the events dataset.
We could drop them if needed. Here I keep them because I don't know whether duplicates are valid entries in this particular dataset.
In [7]:
# 1. Count all events for each user:
event_counts = events.groupby('user_id').size()

# 2. Keep only users that appear in the users table and are not excluded.
#    A user is excluded when age < 30 or gender == 'f'. The original comment
#    calls this "30+ male users", but the test only rejects gender 'f'.
#    When a user_id has several rows in `users`, the first row decides.
first_profile = users.drop_duplicates(subset='user_id').set_index('user_id')
excluded = (first_profile['age'] < 30) | (first_profile['gender'] == 'f')
eligible_ids = first_profile.index[~excluded]

# One vectorised mask replaces the per-user loop that scanned the users table
# on every iteration and deleted entries from the Series in place.
events_per_user = event_counts[event_counts.index.isin(eligible_ids)]
In [8]:
# Inspect the filtered per-user event counts: print the container type,
# then display the raw count values.
print(type(events_per_user))
events_per_user.values
Out[8]:
In [9]:
sns.set(style="ticks")
# Plot the distribution of the per-user event counts for the filtered users
ax = sns.distplot(events_per_user.values)
ax.set_title('Event per male users of age 30+ old')
ax.set_ylabel('Normalized distribution')
ax.set_xlabel('Counts')
Out[9]:
For each user, compute the list of inter-event intervals in days. An inter-event interval is the period of time between an event and the one directly before it in time for the same user. Once you have a list of all the inter-event intervals across all users, plot a histogram of them below:
In [10]:
def get_inter_events(events_per_user):
    """Return the gaps, in whole days, between consecutive events of one user.

    Args:
        events_per_user: sequence of event timestamps for a single user,
            ordered in time (anything NumPy can convert to
            ``datetime64[ns]``, e.g. the ``.values`` of a DatetimeIndex).

    Returns:
        A list of Python ints with ``len(events_per_user) - 1`` entries; entry
        ``i`` is the time from event ``i`` to event ``i + 1`` in days,
        truncated toward zero. Fewer than two events give an empty list.
    """
    timestamps = np.asarray(events_per_user, dtype='datetime64[ns]')
    if timestamps.size < 2:
        return []
    # Dividing by a one-day timedelta converts the gaps exactly. The previous
    # hand-typed nanosecond constant (1.15741e-14) was slightly too large, so
    # gaps just below a whole number of days were rounded up.
    gaps_in_days = np.diff(timestamps) / np.timedelta64(1, 'D')
    # np.trunc keeps the original int() behaviour of truncating toward zero.
    return np.trunc(gaps_in_days).astype(int).tolist()
# Cycle by user
# Sort the events once and split them by user in a single pass, instead of
# re-scanning the whole events table for every user.
event_times_by_user = {
    uid: user_events.index.values
    for uid, user_events in events.sort_index().groupby('user_id')
}

inter_event_intervals = []
for user_id in users['user_id'].values:
    # The local name is deliberately not `events_per_user`: reusing it here
    # overwrote the per-user count Series built earlier in the notebook.
    user_event_times = event_times_by_user.get(user_id)
    # Users with fewer than two events have no inter-event interval.
    if user_event_times is not None and len(user_event_times) > 1:
        # extend() appends in place; the old list(a) + list(b) rebuilt the
        # whole accumulated list on every iteration.
        inter_event_intervals.extend(get_inter_events(user_event_times))
In [11]:
# Turn the accumulated intervals into a NumPy array and show its type.
inter_event_intervals = np.array(inter_event_intervals)
type(inter_event_intervals)
Out[11]:
In [12]:
# Sanity-check the interval array: element count first, then its shape.
n_intervals = len(inter_event_intervals)
print(n_intervals)
print(inter_event_intervals.shape)
In [13]:
sns.set(style="ticks")
# Plot the distribution of the inter-event intervals across all users
ax = sns.distplot(inter_event_intervals)
# Cap the y-axis at 0.005
ax.set_ylim(0,0.005)
ax.set_title('Inter-event intervals')
ax.set_ylabel('Normalized distribution')
ax.set_xlabel('Inter-event interval (days)')
Out[13]:
In [ ]: