First, let's load our sample file, data/activity.txt, which contains user activity as JSON entries, one per line.
In [15]:
%matplotlib inline
import json
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
# Parse the activity log, which holds one JSON document per line.
# The context manager closes the file handle as soon as parsing finishes
# instead of leaving it open for the lifetime of the kernel.
# Whitespace-only lines (e.g. a trailing blank line) are skipped because
# json.loads would raise on them.
with open('data/activity.txt') as activity_file:
    activity = [json.loads(line) for line in activity_file if line.strip()]
# Show the first parsed record as a sanity check.
activity[0]
Out[15]:
In [16]:
# Build a table from the parsed records: one row per record, one column per key.
frame = pd.DataFrame(activity)
# Preview the first five rows.
frame.head()
Out[16]:
In [17]:
# Peek at the first two values of the 'action' column.
frame['action'].head(2)
Out[17]:
In [18]:
# Count how often each distinct action occurs (most frequent first).
action_counts = frame.action.value_counts()
action_counts
Out[18]:
In [19]:
# Horizontal bar chart of the per-action frequencies; keep tick labels unrotated.
action_counts.plot.barh(rot=0)
Out[19]:
In [20]:
# Copy the action values into a fresh, unnamed Series with a default integer index.
results = Series(list(frame.action))
# Preview the first five entries.
results.head()
Out[20]:
In [21]:
# Frequency table of the copied action values, largest count first.
results.value_counts(sort=True, ascending=False)
Out[21]:
In [22]:
# Label every row by whether its action text mentions 'browsing'.
# na=False matters: for a missing action, str.contains would otherwise return
# NaN, which np.where treats as truthy, so those rows would be mislabelled
# 'browsing'. With na=False they fall into 'no-browsing' instead.
is_browsing = frame['action'].str.contains('browsing', na=False)
browsing = np.where(is_browsing, 'browsing', 'no-browsing')
# Preview the first five labels.
browsing[:5]
Out[22]:
In [23]:
# Group rows by user and by the browsing / no-browsing label computed above.
by_action_browsing = frame.groupby(['user', browsing])
# Count rows per (user, label) pair, pivot the labels into columns, and
# replace the gaps left by absent combinations with 0.
browsing_counts = (
    by_action_browsing
    .size()
    .unstack()
    .fillna(0)
)
browsing_counts
Out[23]:
In [24]:
# Positions that would sort users by their total activity (row sum), ascending.
indexer = browsing_counts.sum(axis=1).argsort()
indexer
Out[24]:
In [25]:
# Reorder rows by ascending total activity, then keep the last ten rows,
# i.e. the ten most active users.
count_subset = browsing_counts.take(indexer).iloc[-10:]
count_subset
Out[25]:
In [26]:
# Stacked horizontal bars: one bar per user, split by browsing label.
count_subset.plot.barh(stacked=True)
Out[26]:
In [27]:
# Convert each user's counts to fractions of that user's total so every
# bar sums to 1 and the browsing share is comparable across users.
row_totals = count_subset.sum(axis=1)
normed_subset = count_subset.div(row_totals, axis=0)
normed_subset.plot.barh(stacked=True)
Out[27]: