In [282]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
Let's load the data.
In [283]:
# Read the exported CSV (anything following a "#" is treated as a comment),
# then keep every row except the last one.
# NOTE(review): the dropped final row is presumably a totals/summary line -- confirm against the file.
data = pd.read_csv(
    "Analytics All Web Site Data Audience Overview 20150905-20151005.csv",
    comment="#",
).iloc[:-1]
# Summary statistics of the loaded columns, shown as the cell output.
data.describe()
Out[283]:
In [284]:
# First timestamp of the series as a YYYYMMDD string. pd.date_range parses the
# string itself, so no explicit datetime conversion is needed.
hour_start = "20150905"
# One timestamp per row of `data`, spaced one hour apart.
# NOTE(review): assumes the rows are consecutive hours with no gaps -- confirm
# against the 'Hour Index' column.
# Lowercase "h" is the current hourly alias; uppercase "H" is deprecated in recent pandas.
date_index = pd.date_range(hour_start, periods=len(data), freq="h")
Data cleaning
In [285]:
# Clean up: build a frame indexed by timestamp without the 'Hour Index' column.
# drop() returns a new frame instead of deleting the column from `data` in
# place, so this cell can be re-run without a KeyError on the already-removed
# column, and `data` stays exactly as it was loaded.
indexed_data = data.drop("Hour Index", axis=1).set_index(date_index)
# Line plot of every remaining column against the timestamp index.
indexed_data.plot(title="sessions per hour over time period")
Out[285]:
In [286]:
# Scratch code, intentionally left disabled: walks indexed_data row by row,
# keeps the final row, and reads the weekday / hour attributes of that row's
# timestamp label (row.name).
# last_line = None
# for index, row in indexed_data.iterrows():
#     last_line = row
# last_line.name.dayofweek
# last_line.name.hour
In [287]:
# Average the hourly rows by weekday (0 = Monday ... 6 = Sunday) and draw one bar per weekday.
# Each bar is a mean over hourly rows, not a weekly total, so the title states that.
# Grouping by the index's vectorized .dayofweek avoids calling a lambda per row.
indexed_data.groupby(indexed_data.index.dayofweek).mean().plot(
    kind="bar",
    title="mean sessions per hour by day of week (0=Mon)",
)
Out[287]:
In [288]:
# Average every column over all rows that share the same hour of day,
# then draw the result as a bar chart (one bar per hour).
hour_of_day = indexed_data.index.hour
mean_by_hour = indexed_data.groupby(hour_of_day).mean()
mean_by_hour.plot(kind="bar", title="sessions per hour")
Out[288]:
In [289]:
# Mean of each column for every (weekday, hour-of-day) pair. Grouping by two
# keys yields a two-level index directly.
timestamps = indexed_data.index
heat = indexed_data.groupby([timestamps.dayofweek, timestamps.hour]).mean()
# Move the hour level into the columns: rows are weekdays, columns are hours.
unstacked_data = heat.unstack(level=-1)
# Transpose so hours run along the y axis and weekdays along the x axis.
plt.pcolor(unstacked_data.T, cmap=matplotlib.cm.Blues)
# Flip the y axis so the first hour sits at the top of the chart.
plt.gca().invert_yaxis()
plt.title('Sessions heatmap')
plt.xlabel("day of week")
plt.ylabel("hour")
Out[289]:
In [ ]: