In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
## Load the August dataset, keeping only the columns at positions 1 and 5
## (per the original note: the user.id and tweet.id fields).
df = pd.read_csv(
    "/home/data/aug_reduced_all.csv",
    usecols=[1, 5],
)
In [4]:
# Preview the first five rows of the August frame.
df.head(n=5)
Out[4]:
In [5]:
## Tweets per user in August: the size of each "user.id_x" group is the
## number of rows (tweets) belonging to that user.
tweets_per_user = df.groupby(["user.id_x"]).size()
counter = pd.DataFrame({"count": tweets_per_user}).reset_index()
In [6]:
# Preview the first five rows of the per-user tweet counts.
counter.head(n=5)
Out[6]:
The cells below are commented out because kernel crashes required us to process the November data separately; the process for counting tweets per user over that time period is the same as above.
In [7]:
#%reset_selective df
In [8]:
#df = pd.read_csv("/home/data/nov_reduced.csv")
#df.head()
In [9]:
#counter = pd.DataFrame({'count_nov' : df.groupby( [ "user.id"] ).size()}).reset_index()
#counter.head()
In [10]:
#counter.to_csv("/home/data/nov_tweet_count_byuser", index=False)
In [11]:
## Load the pre-computed November tweet counts per user
## (written out by the separate, commented-out run above).
nov_count = pd.read_csv(
    "/home/data/nov_tweet_count_byuser",
)
In [12]:
# Rename the August columns so the user key matches the November frame ("user.id")
# and the count column is tagged with its month.
august_column_names = ["user.id", "count_aug"]
counter.columns = august_column_names
In [13]:
# Left join: keep every August user; users absent from the November counts
# get NaN in the columns coming from nov_count.
tweet_freq = pd.merge(
    left=counter,
    right=nov_count,
    on="user.id",
    how="left",
)
In [14]:
# Keep only the users who tweeted 10 or more times in August.
is_power_user = tweet_freq["count_aug"] >= 10
power_aug = tweet_freq[is_power_user]
In [15]:
# Number of users with 10+ August tweets (row count of power_aug).
aug_10plus = len(power_aug)
In [16]:
# Denominator for the retention percentages below: row count of power_aug.
aug_users = len(power_aug)
Our goal is to calculate the percentage of users who tweeted 10 or more times (>= 10) in August and who returned to tweet at least once after the grand jury decision.
In [17]:
### fraction of August 10+ users who returned in November to tweet at least once
# count() ignores NaN, so it only counts users that have a November tweet count.
returned_users = power_aug["count_nov"].count()
percent_return_min1 = returned_users / float(aug_users)
### shown as a percentage
percent_return_min1 * 100
Out[17]:
In [18]:
# August 10+ users who also tweeted 10 or more times in November.
is_nov_power_user = power_aug["count_nov"] >= 10
ten_plus_nov = power_aug[is_nov_power_user]
In [19]:
# Number of August 10+ users with 10+ November tweets (row count of ten_plus_nov).
nov_users = len(ten_plus_nov)
In [20]:
### fraction of August 10+ users who returned in November to tweet 10 or more times
percent_return_10plus = float(nov_users) / aug_users
### shown as a percentage
percent_return_10plus * 100
Out[20]:
In [21]:
## Retention curve: for each threshold i = 1..10, the percentage of August 10+
## users who tweeted at least i times in November.
percentage_retention_by_tweet = []
for i in range(1, 11):
    # Rows with NaN in count_nov (no November count) fail the comparison
    # and are therefore excluded at every threshold.
    nov_counter = power_aug[power_aug["count_nov"] >= i]
    # float() keeps this a true division under Python 2 as well.
    pct = (nov_counter.shape[0] / float(aug_users)) * 100
    percentage_retention_by_tweet.append(pct)
percentage_retention_by_tweet
Out[21]:
In [ ]:
## Bar chart of the retention percentages, one bar per November tweet threshold (1..10).
x = np.arange(1, 11)
sns.set(style="white", context="talk")
# plt.bar has no `palette` keyword (that is a seaborn plotting argument), so
# build one colour per bar from the seaborn palette and pass it through `color`.
bar_colors = sns.color_palette("coolwarm", len(x))
plt.bar(x, percentage_retention_by_tweet, color=bar_colors)
plt.xticks(x)
plt.xlabel("Minimum number of tweets in November")
plt.ylabel("% of users with 10+ August tweets")
plt.show()
In [ ]: