Track the level of engagement across users who tweeted 10+ times in August



In [2]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [3]:

    
## load the August dataset, keeping only the columns at positions 1 and 5
## (the tweet id and the user id)
df = pd.read_csv(
    "/home/data/aug_reduced_all.csv",
    usecols=[1, 5],
)



In [4]:

    
# preview the first five rows of the August frame
df.head(5)









    Out[4]:






  
    
      
      id
      user.id_x
    
  
  
    
      0
       4.986011e+17
         41712041
    
    
      1
       4.986012e+17
       1928268895
    
    
      2
       4.986013e+17
        462427721
    
    
      3
       4.986013e+17
       1952105298
    
    
      4
       4.986015e+17
        355918910



In [5]:

    
## tally tweets per user: one row per `user.id_x`, with the row count in `count`
counter = (
    df.groupby("user.id_x")
    .size()
    .reset_index(name="count")
)



In [6]:

    
# preview the first five per-user counts
counter.head(5)

The cells below are commented out because kernel crashes required us to process the November data separately; the process for counting tweets per user over that period is the same as the one used above for August.



In [7]:

    
#%reset_selective df



In [8]:

    
#df = pd.read_csv("/home/data/nov_reduced.csv")
#df.head()



In [9]:

    
#counter = pd.DataFrame({'count_nov' : df.groupby( [ "user.id"] ).size()}).reset_index()
#counter.head()



In [10]:

    
#counter.to_csv("/home/data/nov_tweet_count_byuser", index=False)



In [11]:

    
## load the per-user November tweet counts from the precomputed CSV
nov_count = pd.read_csv(filepath_or_buffer="/home/data/nov_tweet_count_byuser")



In [12]:

    
## relabel the August columns so the user key is named "user.id", the key used
## for the merge with the November counts; an explicit old -> new mapping is used
## instead of a positional overwrite so a change in column order cannot silently
## swap the labels
counter = counter.rename(columns={"user.id_x": "user.id", "count": "count_aug"})



In [13]:

    
# left join: keep every August user; users with no November row get NaN in the November columns
tweet_freq = counter.merge(nov_count, how="left", on="user.id")



In [14]:

    
# users with 10 or more tweets in August
power_aug = tweet_freq.loc[tweet_freq["count_aug"].ge(10)]



In [15]:

    
# number of users with 10+ tweets in August
aug_10plus = len(power_aug)



In [16]:

    
# row count of power_aug; used as the denominator for the retention percentages
aug_users = len(power_aug.index)

Our goal is to calculate the percentage of users who tweeted 10 or more times in August and returned to tweet at least once after the grand jury decision.



In [17]:

    
### fraction of the August 10+ tweeters that have a non-null November count,
### i.e. users who returned in november to tweet at least once
percent_return_min1 = power_aug["count_nov"].notna().sum() / float(aug_users)
100 * percent_return_min1









    Out[17]:





81.489976604431391



In [18]:

    
# August 10+ tweeters who also tweeted 10 or more times in November (NaN counts compare False)
ten_plus_nov = power_aug.loc[power_aug["count_nov"].ge(10)]



In [19]:

    
# number of users with 10+ tweets in both months
nov_users = len(ten_plus_nov)



In [20]:

    
### share of the August 10+ tweeters who returned in november to tweet 10 or more times
percent_return_10plus = nov_users / float(aug_users)
100 * percent_return_10plus









    Out[20]:





43.304738749483924



In [21]:

    
# For each threshold k = 1..10: percentage of the August 10+ tweeters whose
# November count is at least k (users with no November count never qualify).
percentage_retention_by_tweet = [
    100 * (int((power_aug["count_nov"] >= min_tweets).sum()) / float(aug_users))
    for min_tweets in range(1, 11)
]

percentage_retention_by_tweet









    Out[21]:





[81.48997660443139,
 73.48043488233405,
 67.60401853296023,
 62.603789164640574,
 58.33983210239002,
 54.60342217532914,
 51.160603697417315,
 48.174228175604384,
 45.63970824349741,
 43.304738749483924]



In [ ]:

    
## bar chart of percentage_retention_by_tweet: one bar per November tweet threshold (1..10)
x = np.arange(1,11)
sns.set(style="white", context="talk")

# plt.bar has no `palette` keyword (that is a seaborn argument) and raises on it;
# build the coolwarm colours with seaborn and pass them through `color` instead
bar_colors = sns.color_palette("coolwarm", len(x))
plt.bar(x, percentage_retention_by_tweet, color=bar_colors)
plt.xticks(x)
plt.xlabel("Minimum number of tweets in November")
plt.ylabel("% of users with 10+ tweets in August")
plt.title("November retention of August power users");



In [ ]:

	id	user.id_x
0	4.986011e+17	41712041
1	4.986012e+17	1928268895
2	4.986013e+17	462427721
3	4.986013e+17	1952105298
4	4.986015e+17	355918910