In [30]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from matplotlib import pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import json
%run -i nb_analysis # Run instead of import so we can access 'df' defined in notebook
%matplotlib inline

In [86]:
# Load the combined log file; orient='records' means the file is a JSON array
# holding one object per row of the resulting frame.
df = pd.read_json(path_or_buf='combined.json', orient='records')
# Echo the nested tab-activation column (each cell holds a dict of lists,
# e.g. under the 'index' key).
df.loc[:, 'tabsActivated']


Out[86]:
0    {u'index': [2, 1, 0, 2, 3, 2, 3, 0, 1, 4, 1, 4...
1    {u'index': [16, 5, 6, 5, 6, 7, 7, 6, 7, 8, 9, ...
2    {u'index': [16, 15, 14, 13, 14, 13, 14, 13, 14...
3    {u'index': [1, 0, 1, 2, 3, 2, 3, 6, 11, 6, 4, ...
4    {u'index': [7, 8, 15, 9, 16, 10, 17, 11, 18, 1...
5    {u'index': [3, 2, 3, 2, 3, 2, 3, 3, 4, 3, 2, 1...
6    {u'index': [2, 1, 0, 0, 1, 2, 3, 2, 1, 2, 1, 2...
7    {u'index': [0, 0, 1, 2, 1, 4, 3, 3, 2, 2, 3, 3...
8    {u'index': [2, 1, 2, 0, 2, 1, 1, 0, 1, 2, 3, 1...
9    {u'index': [0, 1, 2, 1, 0, 1, 0, 3, 0, 1, 1, 0...
Name: tabsActivated, dtype: object

In [53]:
# Column overview of the loaded frame: column names, non-null counts and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 30 columns):
endTime                          10 non-null int64
groupBy                          8 non-null object
helpOpenedFromPopup              10 non-null int64
length                           10 non-null int64
markTabFromHotkey                10 non-null int64
markTabFromPanelTree             10 non-null int64
markTabFromPopup                 10 non-null int64
markTabFromVisualization         10 non-null int64
name                             10 non-null object
openFrequentFromPopup            10 non-null int64
openRecentFromPopup              10 non-null int64
popupOpened                      10 non-null object
startTime                        10 non-null int64
state                            10 non-null object
switchToMarkedTabFromHotkey      10 non-null int64
switchToMarkedTabFromPopup       10 non-null int64
switchToTabFromPanelTree         10 non-null int64
switchToTabFromVisualization     10 non-null int64
tabsActivated                    10 non-null object
tabsClosed                       10 non-null object
tabsCreated                      10 non-null object
tabsMarked                       10 non-null object
tabsMoved                        10 non-null object
tabsOpen                         10 non-null object
urlsRevisited                    8 non-null float64
urlsVisited                      8 non-null float64
visualizationOpened              10 non-null object
visualizationOpenedFromHotkey    10 non-null int64
visualizationOpenedFromPopup     10 non-null int64
when                             10 non-null int64
dtypes: float64(2), int64(17), object(11)

In [105]:
#df['name'][2],df['state'][2], df['tabsOpen'][2], df['name'][3], df['state'][3], df['tabsOpen'][3]
#[df['tabsOpen'][index].keys() for index in df.index]
#df['tabsActivated'][0]['index'][23:26], df['tabsActivated'][0]['numOpenTabs'][23:26], df['tabsActivated'][0]['id'][23:26]


Out[105]:
([7, 6, 6], [11, 10, 9], [401, 397, 387])

In [ ]:
# URLs visited vs URLs revisited

In [54]:
# Tab switching over time: one subplot per run, plotting the active tab index
# at each logged tab activation.
fig = plt.figure(figsize=(16, 8)) # Larger figure
# Two-column grid. Round the row count up so an odd number of runs still fits;
# '//' keeps it an integer on Python 3 as well as Python 2.
nrows, ncols = (len(df.index) + 1) // 2, 2
for index in df.index:
    # nestedxy is not defined in this notebook (presumably it comes from
    # nb_analysis via %run); it is assumed to return the 'time' and 'index'
    # series nested under df['tabsActivated'][index] -- TODO confirm.
    x, y = nestedxy(index=index, maincol='tabsActivated', xcol='time', ycol='index')
    # Subplot slots are 1-based, so this relies on df.index being 0..n-1.
    ax = plt.subplot(nrows, ncols, index + 1)
    # No plt.hold(True): successive plot calls draw on the same axes by
    # default, and plt.hold no longer exists as of matplotlib 3.0.
    ax.plot(x, y, 'ro', markersize=3)
    ax.plot(x, y)
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Active tab index')
    ax.set_title('Run %d (%s)' % (index, df['state'][index]))
    # autoscale() only affects one axes, so call it per subplot. The axis is
    # selected with `axis=`; `tight` is a boolean flag, not an axis name.
    ax.autoscale(axis='x', tight=True)
plt.tight_layout()



In [135]:
from scipy.stats import ttest_ind, ttest_rel
# Number of tab switches per second (ON vs OFF).
# For every run: number of logged tab activations divided by the time elapsed
# between the first and the last activation. Rates are collected per condition
# and compared with an independent and a paired t-test.
on, off = [], []
for index in df.index:
    activations = df['tabsActivated'][index]
    times = activations['time']
    n_switches = len(activations['index'])
    # Duration is measured from the first to the last logged activation (not
    # endTime - startTime), then converted ms -> s.
    # NOTE(review): a run with fewer than two distinct activation times would
    # give a zero duration here -- confirm that cannot occur in the logs.
    duration = (times[-1] - times[0]) / 1000.
    switches_per_second = n_switches / duration
    # Every state other than 'ON' is counted as OFF.
    if df['state'][index] == 'ON':
        on.append(switches_per_second)
    else:
        off.append(switches_per_second)

on = np.array(on)
off = np.array(off)
# Single-argument print(...) calls produce the same text on Python 2 and are
# also valid on Python 3 (the bare print statement is a SyntaxError there).
print(on)
print(off)
# equal_var=False selects Welch's t-test (no equal-variance assumption).
print("Independent:  %s" % (ttest_ind(on, off, equal_var=False),))
# The paired test matches the i-th ON run with the i-th OFF run, so it is only
# meaningful when both arrays have equal length and corresponding order.
print("Dependent:  %s" % (ttest_rel(on, off),))


[ 0.05052587  0.09889183  0.34521271  0.10193381  0.02330743]
[ 0.01478591  0.02845534  0.02865828  0.05010162  0.01173913]
Independent:  (array(1.6859886959449246), 0.16512349045744906)
Dependent:  (array(1.7462211656384086), 0.1557019991266764)

In [122]:
# Show each run's condition ('state') next to its 'when' value (presumably the
# order in which that condition was performed -- TODO confirm).
# print(...) with a single argument prints the same text on Python 2 and also
# parses on Python 3, unlike the bare print statement.
print(df['state'])
print(df['when'])


0     ON
1    OFF
2     ON
3    OFF
4     ON
5    OFF
6     ON
7    OFF
8     ON
9    OFF
Name: state, dtype: object
0    1
1    2
2    1
3    2
4    1
5    2
6    2
7    1
8    2
9    1
Name: when, dtype: int64

In [107]:
# Tab index difference over time: for each run, how many tab positions each
# switch moved, drawn together with the number of open tabs (green dashed).
fig = plt.figure(figsize=(16, 8)) # Larger figure
# Two-column grid. Round the row count up so an odd number of runs still fits;
# '//' keeps it an integer on Python 3 as well as Python 2.
nrows, ncols = (len(df.index) + 1) // 2, 2
for index in df.index:
    # nestedxy is not defined in this notebook (presumably it comes from
    # nb_analysis via %run); it is assumed to return the requested x/y series
    # nested under df['tabsActivated'][index] -- TODO confirm.
    x, y = nestedxy(index=index, maincol='tabsActivated', xcol='time', ycol='index')
    _, y_open = nestedxy(index=index, maincol='tabsActivated', xcol='time', ycol='numOpenTabs')
    # A difference needs two activations, so it is plotted at the time of the
    # later one.
    x_diff = x[1:]
    # Difference in tab index between consecutive activations, taken modulo the
    # number of tabs open at the later activation.
    y_diff = [(y[i+1] - y[i]) % (y_open[i+1]) for i in range(len(y) - 1)]

    # Subplot slots are 1-based, so this relies on df.index being 0..n-1.
    ax = plt.subplot(nrows, ncols, index + 1)
    # No plt.hold(True): successive plot calls draw on the same axes by
    # default, and plt.hold no longer exists as of matplotlib 3.0.
    ax.plot(x, y_open, 'g--')
    ax.plot(x_diff, y_diff, 'ro', markersize=3)
    ax.plot(x_diff, y_diff)

    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Tab index difference')
    ax.set_title('Run %d (%s)' % (index, df['state'][index]))
    # autoscale() only affects one axes, so call it per subplot. The axis is
    # selected with `axis=`; `tight` is a boolean flag, not an axis name.
    ax.autoscale(axis='x', tight=True)
plt.tight_layout()



In [124]:
# Number of open tabs over time for runs 0 and 4, drawn on shared axes.
# nestedxy is not defined in this notebook (presumably it comes from
# nb_analysis via %run) -- TODO confirm what it returns.
x0, y0 = nestedxy(index=0, maincol='tabsActivated', xcol='time', ycol='numOpenTabs')
x4, y4 = nestedxy(index=4, maincol='tabsActivated', xcol='time', ycol='numOpenTabs')
# No plt.hold(True)/plt.hold(False) bracket: successive plot calls overplot by
# default, plt.hold no longer exists as of matplotlib 3.0, and hold(False)
# changed a global setting that also affected every cell executed afterwards.
plt.plot(x0, y0, 'b', label='Run %d (%s)' % (0, df['state'][0]))
plt.plot(x4, y4, 'g', label='Run %d (%s)' % (4, df['state'][4]))
plt.xlabel('Time (s)')
plt.ylabel('Number of open tabs')
# Trailing ';' keeps the Legend repr out of the cell output.
plt.legend();



In [97]:
# Tab marking over time: number of marked tabs per run, with the number of
# open tabs (green dashed) for reference. Only even-indexed runs are drawn.
fig = plt.figure(figsize=(16, 8)) # Larger figure
# Two-column grid. Round the row count up so an odd number of runs still fits;
# '//' keeps it an integer on Python 3 as well as Python 2.
nrows, ncols = (len(df.index) + 1) // 2, 2
for index in df.index:
    # Skip odd-indexed runs (in the loaded data the even-indexed runs are the
    # ones whose state is 'ON').
    if index % 2 == 1:
        continue
    # nestedxy is not defined in this notebook (presumably it comes from
    # nb_analysis via %run); it is assumed to return the requested x/y series
    # nested under df['tabsMarked'][index] -- TODO confirm.
    x, y = nestedxy(index=index, maincol='tabsMarked', xcol='time', ycol='numMarkedTabs')
    _, y1 = nestedxy(index=index, maincol='tabsMarked', xcol='time', ycol='numOpenTabs')
    # Slot index + 1 keeps each run in the same grid position as in the
    # all-runs figures; it relies on df.index being 0..n-1.
    ax = plt.subplot(nrows, ncols, index + 1)
    # No plt.hold(True): successive plot calls draw on the same axes by
    # default, and plt.hold no longer exists as of matplotlib 3.0.
    ax.plot(x, y, 'ro', markersize=3)
    ax.plot(x, y)
    ax.plot(x, y1, 'g--', markersize=3)
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Number of marked tabs')
    ax.set_title('Run %d (%s)' % (index, df['state'][index]))
    # autoscale() only affects one axes, so call it per subplot. The axis is
    # selected with `axis=`; `tight` is a boolean flag, not an axis name.
    ax.autoscale(axis='x', tight=True)
plt.tight_layout()



In [91]:
# Open tabs over time, with and without the extension: each subplot overlays
# an even-indexed run and the run that follows it.
fig = plt.figure(figsize=(16, 8)) # Larger figure
# Two-column grid. Round the row count up so an odd number of runs still fits;
# '//' keeps it an integer on Python 3 as well as Python 2.
nrows, ncols = (len(df.index) + 1) // 2, 2
for index in df.index:
    # Each even-indexed run is drawn together with its successor, so the odd
    # ones are handled as the second member of a pair.
    if index % 2 == 1:
        continue
    partner = index + 1
    if partner not in df.index:
        # Trailing run without a partner: nothing to compare it against.
        continue
    state0 = df['state'][index]
    state1 = df['state'][partner]
    # nestedxy is not defined in this notebook (presumably it comes from
    # nb_analysis via %run); it is assumed to return the requested x/y series
    # nested under df['tabsOpen'][...] -- TODO confirm.
    x0, y0 = nestedxy(index=index, maincol='tabsOpen', xcol='time', ycol='numOpenTabs')
    x1, y1 = nestedxy(index=partner, maincol='tabsOpen', xcol='time', ycol='numOpenTabs')
    # Slot index + 1 keeps each pair in the grid position of its first run; it
    # relies on df.index being 0..n-1.
    ax = plt.subplot(nrows, ncols, index + 1)
    # ON: blue, OFF: red
    color0 = 'b' if state0 == 'ON' else 'r'
    color1 = 'b' if state1 == 'ON' else 'r'
    # No plt.hold(True): successive plot calls draw on the same axes by
    # default, and plt.hold no longer exists as of matplotlib 3.0.
    ax.plot(x0, y0, color0, label='Run %d (%s)' % (index, state0))
    ax.plot(x1, y1, color1, label='Run %d (%s)' % (partner, state1))
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Number of open tabs')
    # Both runs are shown, so the title names both; the legend maps each line
    # colour to its run and state.
    ax.set_title('Runs %d and %d' % (index, partner))
    ax.legend()
    # autoscale() only affects one axes, so call it per subplot. The axis is
    # selected with `axis=`; `tight` is a boolean flag, not an axis name.
    ax.autoscale(axis='x', tight=True)
plt.tight_layout()



In [61]:
# URL revisitation; info missing in runs where older version that did not log this was used
# scalarxy and annotate are not defined in this notebook (presumably they come
# from nb_analysis via %run); scalarxy is assumed to return x/y series for a
# scalar column of df -- TODO confirm.
x0, y0 = scalarxy(ycol='urlsVisited')
x1, y1 = scalarxy(ycol='urlsRevisited')
plt.plot(x0, y0, label='URLs visited')
plt.plot(x1, y1, label='URLs revisited')

annotate(x=x0, y=y0) # Show y values
annotate(x=x1, y=y1)
# The axis is selected with `axis=`; `tight` is a boolean flag, not an axis name.
plt.autoscale(axis='x', tight=True)
plt.tight_layout()
# Trailing ';' keeps the Legend repr out of the cell output.
plt.legend();


Out[61]:
<matplotlib.legend.Legend at 0x49eb6d0>