In [4]:
import pandas as pd
In [5]:
# Load the tab-separated, UTF-16 encoded file into a DataFrame.
df = pd.read_csv(
    'daphne.csv',
    sep='\t',
    encoding='utf-16',
)
Next, we need to import matplotlib and set up the plotting style
In [3]:
import matplotlib.pyplot as plt
import matplotlib
# Use matplotlib's built-in 'fivethirtyeight' style sheet for the figures below.
plt.style.use('fivethirtyeight')
# IPython magic: draw figures inline in the notebook output.
%matplotlib inline
Let's look at the data file first
In [6]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df.info()
In [5]:
# Preview the first five rows (five is also the default for head()).
df.head(n=5)
Out[5]:
So let's turn Date_3 into an actual date using the strftime/strptime format codes.
First, let's bring the year in from Date_1. We need to clean Date_1 first, though.
In [6]:
def clean(elem):
    """Return the first line of ``elem`` as a whitespace-trimmed string.

    ``elem`` is converted with ``str()`` first, so non-string values are
    accepted. Everything from the first newline onwards is discarded.
    """
    first_line, _, _ = str(elem).partition('\n')
    return first_line.strip()
In [7]:
# Keep only the first line of every Date_1 value, stripped of whitespace.
date_1_cleaned = df['Date_1'].apply(clean)
df['Date_1'] = date_1_cleaned
In [8]:
# Combine Date_3 and the cleaned Date_1 into one space-separated string
# so the pair can be parsed as a single timestamp.
date_3_text = df['Date_3']
date_1_text = df['Date_1']
df['Date'] = date_3_text + " " + date_1_text
In [15]:
# Parse the combined string into timestamps.
# Format: weekday name, day, month name, 12-hour "hour:minute", AM/PM, year.
# The field after the colon is parsed with %M (minutes); the previous %S
# stored it as seconds, leaving every timestamp's minute at zero.
# errors='coerce' turns unparseable rows (such as the 'n.a.' placeholders that
# are only filtered out in a later cell) into NaT instead of raising, so the
# notebook still runs top to bottom on a fresh kernel.
df['Final Date'] = pd.to_datetime(
    df['Date'],
    format='%A, %d %B %I:%M %p %Y',
    errors='coerce',
)
In [18]:
# Drop the rows whose Date_3 is the 'n.a.' placeholder.
# .copy() makes the filtered result an independent frame, so assigning
# columns to it afterwards does not raise pandas' SettingWithCopyWarning.
df = df[df['Date_3'] != 'n.a.'].copy()
In [19]:
# Re-parse the timestamps now that the 'n.a.' rows are gone.
# Format: weekday name, day, month name, 12-hour "hour:minute", AM/PM, year.
# The field after the colon is parsed with %M (minutes); the previous %S
# stored it as seconds. Parsing stays strict here so any remaining malformed
# value raises instead of being silently dropped.
df['Final Date'] = pd.to_datetime(df['Date'], format='%A, %d %B %I:%M %p %Y')
In [20]:
# Print the frame summary again to check the dtype of the new 'Final Date' column.
df.info()
In [22]:
# Index the frame by the parsed timestamps (the column itself is kept);
# the resample() calls below rely on this datetime index.
final_dates = df['Final Date']
df.index = final_dates
Let's just plot everything?
In [32]:
# Count non-null ID_page values per business day ('B') and plot the series.
rows_per_business_day = df.resample('B')['ID_page'].count()
rows_per_business_day.plot()
Out[32]:
Most active day
In [36]:
# Count non-null ID_page values per calendar day, then show the five busiest days.
rows_per_day = df.resample('D')['ID_page'].count()
rows_per_day.sort_values(ascending=False).head()
Out[36]:
By day of the week
In [27]:
# Count non-null ID_page values per weekday (0 = Monday ... 6 = Sunday).
weekday = df['Final Date'].dt.weekday
df.groupby(weekday)['ID_page'].count()
Out[27]:
By week of the year
In [28]:
# Count non-null ID_page values per ISO week of the year.
# Series.dt.week is deprecated and was removed in pandas 2.0;
# dt.isocalendar().week (pandas >= 1.1) yields the same ISO week numbers.
iso_week = df['Final Date'].dt.isocalendar().week
df.groupby(iso_week)['ID_page'].count()
Out[28]:
By hour of day
In [29]:
# Count non-null ID_page values per hour of the day (0-23).
hour_of_day = df['Final Date'].dt.hour
df.groupby(hour_of_day)['ID_page'].count()
Out[29]: