In [58]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from matplotlib import pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import json
%run -i nb_analysis # Run instead of import so we can access 'df' defined in notebook
%matplotlib inline
# Participant labels; later cells look them up as users[row_index / 2].
users = np.array(['A', 'B', 'C', 'D', 'E'])
# The same labels with 'C' left out; used together with dfnoc (as u[index / 2]).
noc = np.array(['A', 'B', 'D', 'E'])
In [117]:
# Load the combined run log from combined.json (JSON in 'records' orientation).
df_all = pd.read_json('combined.json', orient='records')
In [111]:
#pd.DataFrame(df, columns=['tabsActivated'])
#df.info() # Column information
# Shared plot colours taken from the active seaborn palette.
colors = sns.color_palette()
c = colors[0]
coff = colors[0]   # used for the curves labelled 'Off' / 'OFF'
con = colors[1]    # used for the curves labelled 'On' / 'ON'
co = 'y'           # used for the 'Open tabs' overlay

df_all.info()

# dfnoc: the full table minus the rows at positions 4 and 5, renumbered from 0.
# reset_index() is called without drop=True, so the old labels are kept as an
# extra 'index' column.
dfnoc = df_all[:].drop(df_all.index[[4, 5]])
dfnoc = dfnoc.reset_index()
dfnoc.index
Out[111]:
In [105]:
#df['name'][2],df['state'][2], df['tabsOpen'][2], df['name'][3], df['state'][3], df['tabsOpen'][3]
#[df['tabsOpen'][index].keys() for index in df.index]
#df['tabsActivated'][0]['index'][23:26], df['tabsActivated'][0]['numOpenTabs'][23:26], df['tabsActivated'][0]['id'][23:26]
Out[105]:
In [ ]:
# URLs visited vs URLs revisited
In [54]:
# Tab switching over time: one panel per run, plotting the active tab index
# at every activation event (red markers joined by a line).
fig = plt.figure(figsize=(16, 8))  # Larger figure
ncols = 2
nrows = len(df.index) // ncols
for index in df.index:
    x, y = nestedxy(index=index, maincol='tabsActivated', xcol='time', ycol='index')
    run_title = 'Run %d (%s)' % (index, df['state'][index])
    plt.subplot(nrows, ncols, index + 1)
    plt.hold(True)  # both plot calls below must land on the same axes
    plt.plot(x, y, 'ro', markersize=3)
    plt.plot(x, y)
    plt.title(run_title)
    plt.xlabel('Time (s)')
    plt.ylabel('Active tab index')
    plt.autoscale(tight='x')
plt.tight_layout()
In [137]:
from scipy.stats import ttest_ind, ttest_rel
# Number of tab switches per second (ON vs OFF)
on, off = [], []
for index in df.index:
state = df['state'][index]
n_switches = len(df['tabsActivated'][index]['index'])
last_switch_time = df['tabsActivated'][index]['time'][-1]
first_switch_time = df['tabsActivated'][index]['time'][0]
#duration = (df['endTime'][index] - df['startTime'][index]) / 1000. # ms -> s
duration = last_switch_time - first_switch_time
duration /= 1000. # ms -> s
switches_per_second = n_switches / duration
if state == 'ON':
on.append(switches_per_second)
else:
off.append(switches_per_second)
on = 60 * np.array(on)
off = 60 * np.array(off)
print on
print off
print "Independent: ", ttest_ind(on, off, equal_var=False)
print "Dependent: ", ttest_rel(on, off)
In [151]:
# Mean number of open tabs per run. Even-numbered rows are collected in `on`,
# odd-numbered rows in `off`, and the two groups are compared with an
# unequal-variance t-test.
on, off = [], []
for index in df.index:
    mean_open = np.array(df['tabsOpen'][index]['numOpenTabs']).mean()
    if index % 2 == 0:
        on.append(mean_open)
    elif index % 2 == 1:
        off.append(mean_open)
ttest_ind(on, off, equal_var=False)
Out[151]:
In [223]:
# Tab switching over time
def tab_diff(y0, y1, n1):
    """Returns the min number of shifts required to go from tab y0 to tab y1.

    The tab strip is treated as circular: moving right past the last tab wraps
    to the first one, so the result is the shorter of the two directions.

    Args:
        y0: Initial tab index
        y1: Final tab index
        n1: Number of open tabs when switch occurs

    Returns:
        Minimum number of switches required. When n1 is not positive there is
        no ring to wrap around, so the plain distance abs(y1 - y0) is returned.
    """
    if n1 <= 0:
        # A zero count would make the modulo below divide by zero and a
        # negative count would produce negative "distances".
        return abs(y1 - y0)
    diff_right = (y1 - y0) % n1
    diff_left = (y0 - y1) % n1
    return min(diff_right, diff_left)
df = dfnoc
fig = plt.figure(figsize=(32, 16))  # Larger figure
ncols = 2
nrows = len(df.index) // ncols
for index in df.index:
    x, y = nestedxy(df=df, index=index, maincol='tabsActivated', xcol='time', ycol='index')
    _, y_open = nestedxy(df=df, index=index, maincol='tabsActivated', xcol='time', ycol='numOpenTabs')
    # Distance moved at every switch, measured with tab_diff against the number
    # of tabs open at the moment of arrival; the first event has no predecessor.
    x_diff = x[1:]
    y_diff = []
    for i in range(1, len(y)):
        y_diff.append(tab_diff(y[i - 1], y[i], y_open[i]))
    run_title = 'Run %d (%s)' % (index, df['state'][index])
    plt.subplot(nrows, ncols, index + 1)
    plt.hold(True)  # all three curves share the panel
    plt.plot(x, y_open, 'g--')
    plt.plot(x_diff, y_diff, 'ro', markersize=3)
    plt.plot(x_diff, y_diff)
    plt.title(run_title)
    plt.xlabel('Time (s)')
    plt.ylabel('Tab index change')
    plt.autoscale(tight='x')
plt.tight_layout()
In [124]:
# Number of open tabs at each tab activation, for runs 0 and 4, overlaid.
open_at_activation = dict(maincol='tabsActivated', xcol='time', ycol='numOpenTabs')
x0, y0 = nestedxy(index=0, **open_at_activation)
x4, y4 = nestedxy(index=4, **open_at_activation)
plt.hold(True)
for xs, ys, line_style in ((x0, y0, 'b'), (x4, y4, 'g')):
    plt.plot(xs, ys, line_style)
# Hold is switched off again here, which is why other plotting cells turn it
# back on before drawing more than one curve.
plt.hold(False)
In [217]:
# Tab switching over time: rows of dfnoc are consumed in pairs
# (index, index + 1) and each pair is drawn in one panel titled with the
# matching entry of `noc`. The figure is saved as switching.png.
df = dfnoc
u = noc
fig = plt.figure(figsize=(12, 12))  # Larger figure
ncols = 2
nrows = len(df.index) // 2
colors = sns.color_palette()
for index in df.index:
    if index % 2 == 1:
        continue  # odd rows are drawn together with the preceding even row
    partner = index + 1
    # Both runs are plotted relative to whichever of the two started first.
    start_time = min(df['startTime'][index], df['startTime'][partner])
    activated = dict(df=df, maincol='tabsActivated', xcol='time', ycol='index', start_time=start_time)
    opened = dict(df=df, maincol='tabsOpen', xcol='time', ycol='numOpenTabs', start_time=start_time)
    # Tabs activated
    x0, y0 = nestedxy(index=index, **activated)
    x1, y1 = nestedxy(index=partner, **activated)
    # Tabs open
    x0_open, y0_open = nestedxy(index=index, **opened)
    x1_open, y1_open = nestedxy(index=partner, **opened)

    plt.subplot(nrows, ncols, index // 2 + 1)
    plt.hold(True)
    #plt.plot(x0, y0, 'ro', markersize=3)
    plt.plot(x0, y0, color=con, label='On')
    plt.plot(x1, y1, color=coff, label='Off')
    # Only the first dashed curve carries a label, so the legend shows
    # 'Open tabs' once.
    plt.plot(x0_open, y0_open, '--', color=co, markersize=3, label='Open tabs')
    plt.plot(x1_open, y1_open, '--', color=co, markersize=3)
    plt.xlabel('Time (s)')
    plt.ylabel('Tab index')
    plt.title('%s' % (u[index // 2]))
    plt.legend()
    plt.autoscale(tight='x')
plt.tight_layout()
plt.savefig('switching.png', bbox_inches='tight')
In [224]:
# Tab marking over time: rows of dfnoc are consumed in pairs (index, index + 1).
# Each panel shows the number of marked tabs of the even row together with the
# number of open tabs of both rows, on a shared time origin.
df = dfnoc
u = noc
fig = plt.figure(figsize=(16, 8)) # Larger figure
#nrows, ncols = len(df.index) / 2, 2
nrows, ncols = 3, 2
for index in df.index:
    if index % 2 == 1:
        continue  # odd rows are drawn together with the preceding even row
    # Both runs are plotted relative to whichever of the two started first.
    start_time = min(df['startTime'][index], df['startTime'][index + 1])
    # Tabs marked
    x0, y0 = nestedxy(df=df, index=index, maincol='tabsMarked', xcol='time', ycol='numMarkedTabs', start_time=start_time)
    x1, y1 = nestedxy(df=df, index=index+1, maincol='tabsMarked', xcol='time', ycol='numMarkedTabs', start_time=start_time)
    # Tabs open
    x0_open, y0_open = nestedxy(df=df, index=index, maincol='tabsOpen', xcol='time', ycol='numOpenTabs', start_time=start_time)
    x1_open, y1_open = nestedxy(df=df, index=index+1, maincol='tabsOpen', xcol='time', ycol='numOpenTabs', start_time=start_time)
    # No separate "tabs moved" series is fetched: nothing in this figure plots
    # one, and the open-tab series above already provide every curve drawn.
    plt.subplot(nrows, ncols, index // 2 + 1)
    plt.hold(True)
    plt.plot(x0, y0, 'ro', markersize=3)
    plt.plot(x0, y0, label='Marked')
    # Redundant - no marked tabs in OFF
    #plt.plot(x1, y1, 'ro', markersize=3)
    #plt.plot(x1, y1)
    plt.plot(x0_open, y0_open, '--', color=con, markersize=3, label='ON')
    plt.plot(x1_open, y1_open, '--', color=coff, markersize=3, label='OFF')
    #plt.plot(x, y1, 'g--', markersize=3)
    plt.xlabel('Time (s)')
    plt.ylabel('Number of marked tabs')
    plt.title('%s' % (u[index // 2]))
    plt.autoscale(tight='x')
plt.tight_layout()
# Called once after the loop, so the legend is attached to the last panel only.
plt.legend()
In [91]:
# Open over time, with and without extension: rows are consumed in pairs
# (index, index + 1) and both runs of a pair share one panel.
fig = plt.figure(figsize=(16, 8)) # Larger figure
nrows, ncols = len(df.index) // 2, 2
for index in df.index:
    if index % 2 == 1:
        continue  # odd rows are drawn together with the preceding even row
    # NOTE(review): nestedxy is called without df= here while the states are
    # read from `df`; confirm both refer to the same table.
    state0 = df['state'][index]
    x0, y0 = nestedxy(index=index, maincol='tabsOpen', xcol='time', ycol='numOpenTabs')
    state1 = df['state'][index+1]
    x1, y1 = nestedxy(index=index+1, maincol='tabsOpen', xcol='time', ycol='numOpenTabs')
    # One panel per pair, filled consecutively. Using index + 1 here would
    # leave every second slot of the grid empty.
    plt.subplot(nrows, ncols, index // 2 + 1)
    plt.hold(True)
    # ON: blue, OFF: red
    color0 = 'b' if state0 == 'ON' else 'r'
    color1 = 'b' if state1 == 'ON' else 'r'
    plt.plot(x0, y0, color0)
    plt.plot(x1, y1, color1)
    #plt.plot(x, y1, 'g--', markersize=3)
    plt.xlabel('Time (s)')
    plt.ylabel('Number of open tabs')
    plt.title('Run %d (%s)' % (index, df['state'][index]))
    plt.autoscale(tight='x')
plt.tight_layout()
In [61]:
# URL revisitation as two line plots with the y values annotated.
# Info is missing in runs where an older version that did not log this was used.
ncols = 2
nrows = len(df.index) // 2
x0, y0 = scalarxy(ycol='urlsVisited')
x1, y1 = scalarxy(ycol='urlsRevisited')
for xs, ys, series_label in ((x0, y0, 'URLs visited'), (x1, y1, 'URLs revisited')):
    plt.plot(xs, ys, label=series_label)
for xs, ys in ((x0, y0), (x1, y1)):
    annotate(x=xs, y=ys)  # Show y values
plt.autoscale(tight='x')
plt.tight_layout()
plt.legend()
Out[61]:
In [133]:
# URL revisitation as grouped bars, saved as revisits.png. Even-positioned
# values form the left bar of each group and odd-positioned values the right
# bar; the x label states Left: ON, Right: OFF.
# Info is missing in runs where an older version that did not log this was used.
df = df_all
ncols = 2
nrows = len(df.index) // 2
x0, y0 = scalarxy(df=df, ycol='urlsVisited')
x1, y1 = scalarxy(df=df, ycol='urlsRevisited')
ind = np.arange(len(x0) // 2)  # one group per pair of rows
width = 0.35
color_revisit = sns.color_palette()[1]
left, right = ind, ind + width

# Visited counts first, then the revisited counts drawn over them. Only one
# bar set of each kind is labelled so the legend has exactly two entries.
plt.bar(left, y0[::2], width=width, hold=True)
plt.bar(right, y0[1::2], width=width, hold=True, label='URLs visited')
plt.bar(left, y1[::2], color=color_revisit, width=width, hold=True)
plt.bar(right, y1[1::2], color=color_revisit, width=width, hold=True, label='URLs revisited')

plt.title('Page revisits')
plt.xlabel('User (Left: ON, Right: OFF)')
plt.ylabel('Visits')
plt.xticks(right, users)
plt.legend()
plt.savefig('revisits.png', bbox_inches='tight')
In [120]:
# Show the current seaborn colour palette as a row of swatches.
sns.palplot(sns.color_palette())
In [120]:
In [132]:
# Tabs moved
df = dfnoc
for index in df.index:
state = df['state'][index]
n_moved = len(df['tabsMoved'][index]['time'])
print "State: %s Moved: %d" % (state, n_moved)
In [136]:
# Scanning vs visualization
vi = np.array([6, 10, 11, 3.4, 3.5])
sc = np.array([7, 12, 12, 5.14, 7])
print sc.mean(), sc.var()
print vi.mean(), vi.var()
In [154]:
# Total number of 'numMarkedTabs' entries logged across all runs in df.
n_marked = 0
for index in df.index:
    n_marked += len(df['tabsMarked'][index]['numMarkedTabs'])
n_marked
Out[154]:
In [168]:
item = 'tabsMarked'
a = np.array([(index, len(df[item][index]['time'])) for index in df.index if index % 2 == 0])
print a[:,1]
print a[:,1].mean(), a[:, 1].std()
In [171]:
item = 'openFrequentFromPopup'
a = np.array([(index, df[item][index]) for index in df.index if index % 2 == 0])
print a[:,1]
print a[:,1].mean(), a[:, 1].std()
In [210]:
only_on = True
for column in df.columns:
try:
if not np.isscalar(df[column].max()):
if only_on:
l = np.array([len(df[column][index]['time']) for index in df.index if index % 2 == 0])
else:
l = np.array([len(df[column][index]['time']) for index in df.index])
print column, " ", l.max(), " ", l.mean(), " ", l.std(), " ", df.name[l.argmax()], " ", users[l.argmax() / 2], " ", users[l.argmin() / 2]
print
continue
col = df[column][::2] if only_on else df[column]
print column, col.max(), col.mean(), col.std(), df.name[col.argmax()], users[col.argmax() / 2], " ", users[col.argmin() / 2]
print
except:
pass
In [166]:
# Print the column summary of whichever table `df` currently refers to.
df.info()
In [201]:
a = np.array([len(df['visualizationOpened'][index]['time']) for index in df.index])
print a.mean(), a.max(), a.std()
In [204]:
# Display the 'markTabFromHotkey' column for every run in df.
df['markTabFromHotkey']
Out[204]:
In [212]:
# Total of the 'urlsVisited' column over all runs in df.
df['urlsVisited'].sum()
Out[212]:
In [213]:
# Total of the 'urlsRevisited' column over all runs in df.
df['urlsRevisited'].sum()
Out[213]:
In [214]:
# Ratio of two hand-typed totals.
# NOTE(review): presumably the urlsRevisited and urlsVisited sums computed in
# the cells just above - confirm against their outputs.
114./496
Out[214]:
In [344]:
# Number of tabs used vs number of open tabs
# Five or ten minute intervals
#
# For every run in dfnoc and every minimum dwell time t_dwell (0, 2, ..., 18),
# keep only the activations that were followed by at least t_dwell before the
# next activation, split them into windows of length dt, and report the mean
# percentage of distinct tab indices / tab ids used per window relative to the
# largest open-tab count seen in that window.
df = dfnoc
dt = 60 * 5 # Time interval to measure over
used_columns = ['name', 'state', 't_dwell', 'index_used', 'ids_used', 'n_dwelt']
# Rows are collected here and turned into a DataFrame once at the end instead
# of re-allocating the frame on every iteration.
records = []
for index in df.index:
    t_outer, ids_outer = nestedxy(df=df, maincol='tabsActivated', xcol='time', ycol='id', index=index)
    t_outer, indices_outer = nestedxy(df=df, maincol='tabsActivated', xcol='time', ycol='index', index=index)
    t_outer, open_tabs_outer = nestedxy(df=df, maincol='tabsActivated', xcol='time', ycol='numOpenTabs', index=index)
    t_all = np.array(t_outer)
    ids_all = np.array(ids_outer)
    indices_all = np.array(indices_outer)
    open_tabs_all = np.array(open_tabs_outer)
    for t_dwell in range(0, 20, 2):
        # Positions of activations that stayed active for at least t_dwell.
        dwelt = [i for i in range(len(t_outer) - 1) if t_outer[i+1] - t_outer[i] >= t_dwell]
        # Without any qualifying activation there is nothing to bin; the
        # percentages stay undefined for this row.
        pct_index_used = pct_ids_used = np.nan
        if dwelt:
            ids = ids_all[dwelt]
            indices = indices_all[dwelt]
            open_tabs = open_tabs_all[dwelt]
            t = t_all[dwelt]
            bins = np.arange(0, t[-1], dt)
            inds = np.digitize(t, bins)  # window number of every kept activation
            n_indices_used_by_bin, n_ids_used_by_bin, n_open_by_bin = [], [], []
            for b in np.unique(inds):
                in_bin = inds == b
                n_indices_used_by_bin.append(len(np.unique(indices[in_bin])))
                n_ids_used_by_bin.append(len(np.unique(ids[in_bin])))
                n_open_by_bin.append(np.max(open_tabs[in_bin]))
            n_open_by_bin = np.array(n_open_by_bin)
            # 100.0 forces true division; integer counts would otherwise be
            # floor-divided and the percentages truncated.
            pct_index_used = np.mean(100.0 * np.array(n_indices_used_by_bin) / n_open_by_bin)
            pct_ids_used = np.mean(100.0 * np.array(n_ids_used_by_bin) / n_open_by_bin)
        records.append({'state': df['state'][index], 't_dwell': t_dwell,
                        'index_used': pct_index_used, 'ids_used': pct_ids_used,
                        'name': df['name'][index], 'n_dwelt': len(dwelt)})
        #print "State: %-3s t_dwell: %ds index_used: %.3f%% ids_used: %.3f%% n_dwelt: %d" % (
        #    df['state'][index], t_dwell, pct_index_used, pct_ids_used, len(dwelt))
    #print "-" * 75
df_used = pd.DataFrame(records, columns=used_columns)
df_used
Out[344]:
In [292]:
In [ ]: