Track the level of engagement across users who tweeted 10+ times in August


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
## load only the tweet id and user id columns (file positions 1 and 5) of the August dataset
aug_csv_path = "/home/data/aug_reduced_all.csv"
df = pd.read_csv(aug_csv_path, usecols=[1, 5])

In [4]:
## preview the first rows to check which columns were loaded
df.head()


Out[4]:
id user.id_x
0 4.986011e+17 41712041
1 4.986012e+17 1928268895
2 4.986013e+17 462427721
3 4.986013e+17 1952105298
4 4.986015e+17 355918910

In [5]:
## tally tweets per user: one row per user.id_x, with the number of rows for that user in `count`
counter = df.groupby("user.id_x").size().reset_index(name="count")

In [6]:
## preview the per-user tweet counts
counter.head()


Out[6]:
user.id_x count
0 12 11
1 57 28
2 58 9
3 76 9
4 521 6

The cells below are commented out because kernel crashes forced us to process the November data in a separate session; the procedure for counting tweets per user over that period is the same as the one used above for August.


In [7]:
#%reset_selective df

In [8]:
#df = pd.read_csv("/home/data/nov_reduced.csv")
#df.head()

In [9]:
#counter = pd.DataFrame({'count_nov' : df.groupby( [ "user.id"] ).size()}).reset_index()
#counter.head()

In [10]:
#counter.to_csv("/home/data/nov_tweet_count_byuser", index=False)

In [11]:
## load the per-user November tweet counts that were written out in a separate session
nov_count_path = "/home/data/nov_tweet_count_byuser"
nov_count = pd.read_csv(nov_count_path)

In [12]:
## Relabel the August counts so they can be merged with the November counts on "user.id".
## Renaming by name (rather than assigning `counter.columns` positionally) only touches the
## August columns produced above: if `counter` ever holds something else, nothing is silently
## mislabelled as `count_aug`, and the later `count_aug` lookup fails loudly instead.
## `rename` returns a new frame and is a no-op on re-run, so the cell is idempotent.
counter = counter.rename(columns={"user.id_x": "user.id", "count": "count_aug"})

In [13]:
## left join: keep every August user; users with no row in nov_count get NaN in its columns
tweet_freq = counter.merge(nov_count, how="left", on="user.id")

In [14]:
## "power users": everyone with 10 or more tweets in August
is_power_user = tweet_freq["count_aug"] >= 10
power_aug = tweet_freq[is_power_user]

In [15]:
## number of users with 10+ August tweets
aug_10plus = len(power_aug)

In [16]:
## denominator for the retention percentages below (same value as aug_10plus)
aug_users = len(power_aug)

Our goal is to calculate the percentage of users who tweeted 10 or more times in August and then returned to tweet at least once after the grand jury decision.


In [17]:
### percent of August power users who returned in November to tweet at least once
### (count() only counts non-missing count_nov values, so users left without a
### November count by the left merge are excluded from the numerator)
returned_min1 = power_aug["count_nov"].count()
percent_return_min1 = returned_min1 / float(aug_users)
percent_return_min1 * 100


Out[17]:
81.489976604431391

In [18]:
## August power users who also tweeted 10 or more times in November
is_nov_power_user = power_aug["count_nov"] >= 10
ten_plus_nov = power_aug[is_nov_power_user]

In [19]:
## how many August power users were still power users in November
nov_users = len(ten_plus_nov)

In [20]:
### percent of August power users who returned in November to tweet 10 or more times
percent_return_10plus = float(nov_users) / aug_users
percent_return_10plus * 100


Out[20]:
43.304738749483924

In [21]:
## for each threshold k = 1..10: percent of August power users with at least k November tweets
total_aug = float(aug_users)
percentage_retention_by_tweet = [
    (len(power_aug[power_aug["count_nov"] >= k]) / total_aug) * 100
    for k in range(1, 11)
]

percentage_retention_by_tweet


Out[21]:
[81.48997660443139,
 73.48043488233405,
 67.60401853296023,
 62.603789164640574,
 58.33983210239002,
 54.60342217532914,
 51.160603697417315,
 48.174228175604384,
 45.63970824349741,
 43.304738749483924]

In [ ]:
## bar chart: share of August power users retained at each November tweet threshold (1..10)
x = np.arange(1, 11)
sns.set(style="white", context="talk")

fig, ax = plt.subplots()
# matplotlib's bar() has no `palette` keyword (that is a seaborn argument) and raises on it,
# so the coolwarm colours are generated with seaborn and handed to bar() through `color`.
ax.bar(x, percentage_retention_by_tweet, color=sns.color_palette("coolwarm", len(x)))
ax.set_xticks(x)
ax.set_xlabel("Minimum number of tweets in November")
ax.set_ylabel("% of August 10+ tweeters")
ax.set_title("November retention of users who tweeted 10+ times in August");

In [ ]: