Twitter Stats


In [93]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import time
from statistics import stdev
import pytz as tz
from datetime import datetime as dt

In [108]:
#!wget https://www.dropbox.com/s/qqq/twitter.csv?dl=0 -O yymmdd_tweet_activity_metrics.csv
!ls -l | grep tweet


-rw-r--r-- 1 root root 700565 Aug 28 11:12 140828_tweet_activity_metrics.csv
-rw-r--r-- 1 root root 710354 Sep  6 07:27 140906_tweet_activity_metrics.csv

In [95]:
data = pd.read_csv('140906_tweet_activity_metrics.csv')
#data.head()

In [96]:
data = data.rename(columns = {
    'Tweet id':'id', 'Tweet permalink':'link', 'Tweet text':'text', 'impressions':'impr', 'engagements':'eng', 
    'engagement rate':'engr', 'retweets':'rts', 'replies ':'repls', 'user profile clicks':'c_pro', 'url clicks':'c_url',
    'hashtag clicks':'c_ht', 'detail expands':'c_expd', 'permalink clicks':'c_prm', 'embedded media clicks':'c_med',
    'email tweet':'email', 
    })

In [97]:
data.columns


Out[97]:
Index(['id', 'link', 'text', 'time', 'impr', 'eng', 'engr', 'rts', 'replies', 'favorites', 'c_pro', 'c_url', 'c_ht', 'c_expd', 'c_prm', 'c_med', 'app opens', 'app install attempts', 'follows', 'email', 'dial phone', 'promoted impressions', 'promoted engagements', 'promoted engagement rate', 'promoted retweets', 'promoted replies', 'promoted favorites', 'promoted user profile clicks', 'promoted url clicks', 'promoted hashtag clicks', 'promoted detail expands', 'promoted permalink clicks', 'promoted embedded media clicks', 'promoted app opens', 'promoted app install attempts', 'promoted follows', 'promoted email tweet', 'promoted dial phone'], dtype='object')

In [98]:
def convert_time(t):
    lon = tz.timezone('Europe/London')
    dto = dt.strptime(t, "%Y-%m-%d %H:%M %z")
    dto = lon.normalize(dto.astimezone(lon))
    ts = dto.timetuple()
    tod = float(ts.tm_hour) + ts.tm_min / 60
    wday = ts.tm_wday
    swday = time.strftime("%a",ts)
    wkend = 1 if wday > 4 else 0
    if ts.tm_hour < 7: tod1 = '0-7'
    elif ts.tm_hour < 9: tod1 = '7-9'
    elif ts.tm_hour < 12: tod1 = '9-12'
    elif ts.tm_hour < 15: tod1 = '12-15'
    elif ts.tm_hour < 18: tod1 = '15-18'
    elif ts.tm_hour < 21: tod1 = '18-21'
    elif ts.tm_hour < 24: tod1 = '21-24'
    todh = ts.tm_hour
    return (tod, tod1, todh, wday, swday, wkend)
    
    return ts

#convert_time('2014-12-06 07:16 +0000')    
data1 = pd.DataFrame(list(map(convert_time, data['time'])), columns =['tod','tod1','todh', 'wday','swday','wkend'])
#data1.head()

In [99]:
data['tod'] = data1['tod']
data['tod1'] = data1['tod1']
data['todh'] = data1['todh']
data['wday'] = data1['wday']
data['swday'] = data1['swday']
data['wkend'] = data1['wkend']
data1 = None

In [100]:
data_nort = data[data['rts']==0]
data_all = data
data = data_nort
normalised = True

In [101]:
nstr = " (normalised)" if normalised else ""

In [102]:
def stats1 (data, item, param, digits=0, normalised=normalised):
    
    if normalised:
        f_med = 100. / mean([median(data[data[param] == x][item]) for x in set(data[param])])
        f_avg = 100. / mean([average(data[data[param] == x][item]) for x in set(data[param])])
    else:
        f_med = 1.0
        f_avg = 1.0
    
    return pd.DataFrame([
    ( item, x, 
        round(len(data[data[param] == x][item]),digits), 
        round(f_med * median(data[data[param] == x][item]),digits), 
        round(f_avg * average(data[data[param] == x][item]),digits),
     ) for x in set(data[param])], columns=['item', 'val', 'count', 'median', 'mean'])

In [103]:
data
s1 = stats1(data, 'impr','tod1', normalised=True)
s1


Out[103]:
item val count median mean
0 impr 21-24 229 69 82
1 impr 15-18 290 84 72
2 impr 9-12 380 98 83
3 impr 12-15 357 74 67
4 impr 0-7 37 145 80
5 impr 7-9 119 146 246
6 impr 18-21 280 84 70

7 rows × 5 columns


In [104]:
s1 = stats1(data, 'impr','todh', normalised=normalised)
plt.bar(s1['val'], s1['median'])
plt.title("Impressions vs Time of Day"+nstr)
s1


Out[104]:
item val count median mean
0 impr 0 8 142 89
1 impr 1 6 116 77
2 impr 3 1 140 101
3 impr 5 2 7 5
4 impr 6 20 155 108
5 impr 7 49 139 113
6 impr 8 70 145 411
7 impr 9 120 63 79
8 impr 10 126 115 100
9 impr 11 134 134 110
10 impr 12 108 97 84
11 impr 13 148 62 72
12 impr 14 101 68 82
13 impr 15 93 157 96
14 impr 16 104 110 91
15 impr 17 93 48 66
16 impr 18 77 134 85
17 impr 19 89 95 87
18 impr 20 114 68 75
19 impr 21 75 62 93
20 impr 22 110 82 110
21 impr 23 44 60 67

22 rows × 5 columns


In [105]:
s1 = stats1(data, 'impr','wkend', normalised=normalised)
plt.bar(s1['val'], s1['median'])
plt.title("Impressions vs Weekend"+nstr)
s1


Out[105]:
item val count median mean
0 impr 0 1203 102 91
1 impr 1 489 98 109

2 rows × 5 columns


In [106]:
s1 = stats1(data, 'impr','wday', normalised=normalised)
plt.bar(s1['val'], s1['median'])
plt.title("Impressions vs Weekday"+nstr)
s1


Out[106]:
item val count median mean
0 impr 0 244 94 109
1 impr 1 241 136 89
2 impr 2 209 139 105
3 impr 3 252 52 68
4 impr 4 257 89 85
5 impr 5 168 113 168
6 impr 6 321 77 76

7 rows × 5 columns


In [107]:
s1 = stats1(data, 'impr','rts', normalised=normalised)
plt.bar(s1['val'], s1['median'])
plt.title("Impressions vs RT's"+nstr)
s1


Out[107]:
item val count median mean
0 impr 0 1692 100 100

1 rows × 5 columns


In [107]: