User activity

First, let's load our sample file, data/activity.txt, which contains user activity as JSON entries, one per line.


In [15]:
%matplotlib inline

import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Parse the activity log: one JSON object per line. The `with` block makes sure
# the file handle is closed once every line has been read, and blank lines are
# skipped because json.loads would raise on them.
with open('data/activity.txt') as activity_file:
    activity = [json.loads(line) for line in activity_file if line.strip()]
activity[0]


Out[15]:
{u'action': u'log-in', u'time': u'00:00:01', u'user': 1}

In [16]:
# Build a table with one row per activity record, then preview the first five rows.
frame = DataFrame(data=activity)
frame.head(5)


Out[16]:
action time user
0 log-in 00:00:01 1
1 browsing 00:10:00 1
2 browsing 00:08:00 1
3 browsing 00:05:00 1
4 log-out 00:00:01 1

In [17]:
# Peek at the first two entries of the action column.
frame.action.head(2)


Out[17]:
0      log-in
1    browsing
Name: action, dtype: object

In [18]:
# Tally how many times each distinct action occurs.
action_counts = frame.action.value_counts()
action_counts


Out[18]:
browsing    11
log-out      5
log-in       5
dtype: int64

In [19]:
# Horizontal bar chart of the per-action counts; rot=0 asks for unrotated tick labels.
action_counts.plot(kind='barh', rot=0)


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7799209250>

In [20]:
# Copy the action values into a fresh Series (default integer index, no name),
# then preview the first five entries.
results = Series(list(frame['action']))
results.head(5)


Out[20]:
0      log-in
1    browsing
2    browsing
3    browsing
4     log-out
dtype: object

In [21]:
# Count occurrences of each distinct value in the rebuilt Series.
results.value_counts()


Out[21]:
browsing    11
log-out      5
log-in       5
dtype: int64

In [22]:
# Label every row as 'browsing' or 'no-browsing' based on its action text.
# na=False makes rows with a missing action evaluate to False; without it
# str.contains returns NaN for them, which np.where would treat as truthy and
# mislabel as 'browsing'.
is_browsing = frame['action'].str.contains('browsing', na=False)
browsing = np.where(is_browsing, 'browsing', 'no-browsing')
browsing[:5]


Out[22]:
array(['no-browsing', 'browsing', 'browsing', 'browsing', 'no-browsing'], 
      dtype='|S11')

In [23]:
# Group rows by user and by the browsing/no-browsing label computed above.
by_action_browsing = frame.groupby(['user', browsing])
# Count rows per (user, label) pair, pivot the label level into columns, and
# put 0 where a user has no rows for a label.
rows_per_group = by_action_browsing.size()
browsing_counts = rows_per_group.unstack().fillna(0)
browsing_counts


Out[23]:
browsing no-browsing
user
1 3 2
2 2 2
3 3 4
4 3 2

In [24]:
# Total rows per user (summing across the label columns), then the positional
# order that would sort those totals in ascending order.
totals_per_user = browsing_counts.sum(axis=1)
indexer = totals_per_user.argsort()
indexer


Out[24]:
user
1    1
2    0
3    3
4    2
dtype: int64

In [25]:
# Reorder the rows by ascending total and keep at most the last ten of them,
# i.e. the users with the largest totals.
sorted_counts = browsing_counts.take(indexer)
count_subset = sorted_counts.tail(10)
count_subset


Out[25]:
browsing no-browsing
user
2 2 2
1 3 2
4 3 2
3 3 4

In [26]:
# Stacked horizontal bars: one bar per user, split into the two count columns.
count_subset.plot(kind='barh', stacked=True)


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f779904a690>

In [27]:
# Divide each row by its own total so every user's two shares add up to 1,
# then draw the shares as stacked horizontal bars.
row_totals = count_subset.sum(axis=1)
normed_subset = count_subset.div(row_totals, axis=0)
normed_subset.plot(stacked=True, kind='barh')


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f7798f8d110>