In [93]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import time
from statistics import stdev
import pytz as tz
from datetime import datetime as dt
In [108]:
# Download step, left commented out: it would fetch the CSV and save it under
# the name given to -O (the 'yymmdd' prefix looks like a date placeholder to be
# filled in by hand — confirm before re-enabling).
#!wget https://www.dropbox.com/s/qqq/twitter.csv?dl=0 -O yymmdd_tweet_activity_metrics.csv
# List the files in the working directory whose listing line contains "tweet".
!ls -l | grep tweet
In [95]:
# Load the tweet activity CSV into the working frame. The file name is
# hard-coded and resolved relative to the current working directory.
data = pd.read_csv('140906_tweet_activity_metrics.csv')
#data.head()
In [96]:
# Replace the verbose CSV headers with the short column names used by the
# later cells. rename() leaves headers that are not in the frame untouched.
# NOTE(review): the 'replies ' key carries a trailing space — confirm it
# matches the CSV header exactly, otherwise that column keeps its old name.
data = data.rename(columns=dict([
    ('Tweet id', 'id'),
    ('Tweet permalink', 'link'),
    ('Tweet text', 'text'),
    ('impressions', 'impr'),
    ('engagements', 'eng'),
    ('engagement rate', 'engr'),
    ('retweets', 'rts'),
    ('replies ', 'repls'),
    ('user profile clicks', 'c_pro'),
    ('url clicks', 'c_url'),
    ('hashtag clicks', 'c_ht'),
    ('detail expands', 'c_expd'),
    ('permalink clicks', 'c_prm'),
    ('embedded media clicks', 'c_med'),
    ('email tweet', 'email'),
]))
In [97]:
# Display the frame's column index to check the names now in use.
data.columns
Out[97]:
In [98]:
def convert_time(t):
    """Derive time-of-day and weekday features from one timestamp string.

    The timestamp is parsed with the format "%Y-%m-%d %H:%M %z" and converted
    to Europe/London local time before any field is extracted.

    Parameters
    ----------
    t : str
        Timestamp such as '2014-12-06 07:16 +0000'.

    Returns
    -------
    tuple
        (tod, tod1, todh, wday, swday, wkend) where
        tod   -- local time of day as fractional hours (hour + minute / 60),
        tod1  -- label of the time-of-day bucket ('0-7', '7-9', ..., '21-24'),
        todh  -- local hour as an int,
        wday  -- struct_time weekday number (tm_wday, Monday is 0),
        swday -- abbreviated weekday name from time.strftime("%a"),
        wkend -- 1 when wday > 4, otherwise 0.

    Raises
    ------
    ValueError
        If ``t`` does not match the expected format (raised by strptime).
    """
    # (exclusive upper hour bound, bucket label), in ascending order.
    buckets = (
        (7, '0-7'),
        (9, '7-9'),
        (12, '9-12'),
        (15, '12-15'),
        (18, '15-18'),
        (21, '18-21'),
        (24, '21-24'),
    )

    lon = tz.timezone('Europe/London')
    dto = dt.strptime(t, "%Y-%m-%d %H:%M %z")
    # normalize() lets pytz settle the correct UTC offset after the conversion.
    dto = lon.normalize(dto.astimezone(lon))
    ts = dto.timetuple()

    todh = ts.tm_hour
    tod = float(todh) + ts.tm_min / 60.0
    wday = ts.tm_wday
    swday = time.strftime("%a", ts)
    wkend = 1 if wday > 4 else 0

    # tm_hour is always below 24, so the first matching bucket always exists.
    tod1 = next(label for upper, label in buckets if todh < upper)

    # The original had a second, unreachable `return ts` after this return.
    return (tod, tod1, todh, wday, swday, wkend)
# Manual spot check of the converter, kept for reference:
#convert_time('2014-12-06 07:16 +0000')
# Run the converter over every entry of data['time'] and collect the resulting
# tuples into a helper frame, one row per entry.
data1 = pd.DataFrame(
    [convert_time(stamp) for stamp in data['time']],
    columns=['tod', 'tod1', 'todh', 'wday', 'swday', 'wkend'],
)
#data1.head()
In [99]:
# Copy every derived time column from the helper frame onto the main frame,
# then release the helper frame.
for column in ('tod', 'tod1', 'todh', 'wday', 'swday', 'wkend'):
    data[column] = data1[column]
data1 = None
In [100]:
# Keep a handle on the full frame, then narrow the working frame `data` to the
# rows whose 'rts' count is zero.
# NOTE: re-running this cell on its own rebinds data_all to the already
# filtered frame, because `data` has been reassigned by then.
data_all = data
data_nort = data_all[data_all['rts'] == 0]
data = data_nort

# Notebook-wide switch passed to stats1() and used for the plot title suffix.
normalised = True
In [101]:
# Suffix appended to plot titles so a figure shows whether it was built from
# normalised values.
if normalised:
    nstr = " (normalised)"
else:
    nstr = ""
In [102]:
def stats1(data, item, param, digits=0, normalised=normalised):
    """Summarise column ``item`` for each distinct value of column ``param``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the ``item`` and ``param`` columns.
    item : str
        Name of the column whose values are summarised.
    param : str
        Name of the column whose distinct values define the groups.
    digits : int, optional
        Number of decimals passed to round() for every reported figure.
    normalised : bool, optional
        When true, medians are rescaled so that the mean of the group medians
        is 100, and means are rescaled so that the mean of the group means is
        100. The default is the value the notebook-level ``normalised`` flag
        had when this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``param`` value with columns
        ['item', 'val', 'count', 'median', 'mean']. Row order follows the
        iteration order of ``set(data[param])``.
    """
    # Select each group's values once and reuse them for every statistic.
    groups = [(x, data[data[param] == x][item]) for x in set(data[param])]

    # np.* is used explicitly: the bare mean/median/average names the cell
    # used before are not imported anywhere in this notebook.
    if normalised:
        f_med = 100. / np.mean([np.median(values) for _, values in groups])
        f_avg = 100. / np.mean([np.average(values) for _, values in groups])
    else:
        f_med = 1.0
        f_avg = 1.0

    rows = [
        (
            item,
            x,
            round(len(values), digits),
            round(f_med * np.median(values), digits),
            round(f_avg * np.average(values), digits),
        )
        for x, values in groups
    ]
    return pd.DataFrame(rows, columns=['item', 'val', 'count', 'median', 'mean'])
In [103]:
# Impressions summarised per time-of-day bucket (tod1); the table is the
# cell's output. Uses the notebook-wide `normalised` flag like the other
# summary cells.
s1 = stats1(data, 'impr', 'tod1', normalised=normalised)
s1
Out[103]:
In [104]:
# Median impressions per local hour of day (todh) as a bar chart, followed by
# the summary table as the cell's output.
s1 = stats1(data, 'impr', 'todh', normalised=normalised)
fig, ax = plt.subplots()
ax.bar(s1['val'], s1['median'])
ax.set_title("Impressions vs Time of Day" + nstr)
ax.set_xlabel("Hour of day (Europe/London)")
ax.set_ylabel("Median impressions" + nstr)
s1
Out[104]:
In [105]:
# Median impressions for weekdays versus weekends as a bar chart, followed by
# the summary table as the cell's output.
s1 = stats1(data, 'impr', 'wkend', normalised=normalised)
fig, ax = plt.subplots()
ax.bar(s1['val'], s1['median'])
ax.set_title("Impressions vs Weekend" + nstr)
ax.set_xlabel("Weekend flag (0 = Mon-Fri, 1 = Sat/Sun)")
ax.set_ylabel("Median impressions" + nstr)
s1
Out[105]:
In [106]:
# Median impressions per day of the week as a bar chart, followed by the
# summary table as the cell's output.
s1 = stats1(data, 'impr', 'wday', normalised=normalised)
fig, ax = plt.subplots()
ax.bar(s1['val'], s1['median'])
ax.set_title("Impressions vs Weekday" + nstr)
ax.set_xlabel("Day of week (0 = Monday ... 6 = Sunday)")
ax.set_ylabel("Median impressions" + nstr)
s1
Out[106]:
In [107]:
# Median impressions per retweet count as a bar chart, followed by the summary
# table as the cell's output.
# This uses data_all rather than `data`: the working frame was narrowed to
# rows with rts == 0, so grouping it by 'rts' could only ever give one bar.
s1 = stats1(data_all, 'impr', 'rts', normalised=normalised)
fig, ax = plt.subplots()
ax.bar(s1['val'], s1['median'])
ax.set_title("Impressions vs RT's" + nstr)
ax.set_xlabel("Retweets")
ax.set_ylabel("Median impressions" + nstr)
s1
Out[107]:
In [107]: