In [70]:
from collections import Counter, OrderedDict, defaultdict
from dateutil.parser import parse
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas
import snap
import loader
import tweet_util
import util
In [2]:
michigan_tweets = loader.load_michigan_tweets() # a list of dictionaries
In [3]:
'''
There are three different types of graphs:
Gs: directed single graph
Gu: undirected single graph
G: undirected multi-graph (allows multiple edges)
We use Gu to detect communities because it seems to work best for that,
and Gs to find popular users.
G turned out not to be useful for our analysis.
'''
Gs, nodes2names_s, names2nodes_s, ids2screennames = util.create_graph_from_tweets(michigan_tweets, multi=False)
print 'single graph directed. number of nodes: %d. number of edges: %d' %(Gs.GetNodes(), Gs.GetEdges())
print 'self edges:', snap.CntSelfEdges(Gs)
Gu, nodes2names_u, names2nodes_u, ids2screennames = util.create_graph_from_tweets(michigan_tweets, multi=False, directed=False)
print 'single graph undirected. number of nodes: %d. number of edges: %d' %(Gu.GetNodes(), Gu.GetEdges())
print 'self edges:', snap.CntSelfEdges(Gu)
G, nodes2names, names2nodes, ids2screennames = util.create_graph_from_tweets(michigan_tweets, multi=True)
print 'multi graph. number of nodes: %d. number of edges: %d' %(G.GetNodes(), G.GetEdges())
print 'self edges:', snap.CntSelfEdges(G)
In [78]:
pro_hillary, pro_trump = util.load_hashtags('tags/curated_hillary.tags', 'tags/curated_trump.tags')
print '%d pro Hillary hashtags, %d pro Trump hashtags' %(len(pro_hillary), len(pro_trump))
In [79]:
pro_hillary_orig, pro_trump_orig = util.load_hashtags('tags/orig_hillary.tags', 'tags/orig_trump.tags')
print '%d pro Hillary hashtags, %d pro Trump hashtags' %(len(pro_hillary_orig), len(pro_trump_orig))
In [1]:
pro_hillary_all, pro_trump_all = util.load_hashtags('tags/all_hillary.tags', 'tags/all_trump.tags')
print '%d pro Hillary hashtags, %d pro Trump hashtags' %(len(pro_hillary_all), len(pro_trump_all))
In [80]:
'''
Add the hashtags obtained from the semi-supervised labeling step
'''
new_hillary, new_trump = util.load_hashtags('tags/correct_hillary.tags', 'tags/correct_trump.tags')
pro_hillary_all = pro_hillary | new_hillary
pro_trump_all = pro_trump | new_trump
print '%d pro Hillary hashtags, %d pro Trump hashtags' %(len(pro_hillary_all), len(pro_trump_all))
In [6]:
hashtag2ids, id2hashtags = util.get_unique_hashtags(michigan_tweets, names2nodes)
In [191]:
'''
Modularity scores are negative, which means that, overall, pro-Trump users are
less likely to interact with each other than random chance would suggest.
The same holds for pro-Hillary users.
There are 4834 users in the entire graph who tweet pro-Trump hashtags
and 4270 users who tweet pro-Hillary hashtags,
out of 19595 users who tweet with hashtags and 38936 total users
(together only 23.4% of total users).
'''
util.get_modularity_tag(Gu, hashtag2ids, ['makeamericagreatagain'])
util.get_modularity_tag(Gu, hashtag2ids, ['nevertrump'])
util.get_modularity_tag(Gu, hashtag2ids, pro_trump_all)
util.get_modularity_tag(Gu, hashtag2ids, pro_hillary_all)
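In [ ]:
'''
Hypothetical sketch (not part of the original analysis) of how the user counts
quoted above could be recomputed, assuming hashtag2ids maps each hashtag to the
collection of node ids that used it.
'''
trump_ids, hillary_ids = set(), set()
for tag in pro_trump_all:
    trump_ids |= set(hashtag2ids.get(tag, []))
for tag in pro_hillary_all:
    hillary_ids |= set(hashtag2ids.get(tag, []))
tagged_ids = set()
for tag in hashtag2ids:
    tagged_ids |= set(hashtag2ids[tag])
print '%d pro-Trump users, %d pro-Hillary users' % (len(trump_ids), len(hillary_ids))
print '%d users tweet with hashtags, out of %d total users' % (len(tagged_ids), Gu.GetNodes())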
In [213]:
def find_celebrities(G, k=-1, threshold=100):
    '''
    Find the k people that the most people tweet at (highest in-degrees)
    over the entire graph,
    or, if k == -1, find all nodes with at least `threshold` people tweeting at them
    (in-degree of at least threshold).
    '''
    if k != -1:
        deg_seq = sorted([(node.GetInDeg(), -node.GetId()) for node in G.Nodes()])[::-1]
        celebs = [-x[1] for x in deg_seq[:k]]
    else:
        deg_seq = sorted([(node.GetInDeg(), -node.GetId()) for node in G.Nodes() if node.GetInDeg() >= threshold])[::-1]
        print deg_seq[:10]
        celebs = [-x[1] for x in deg_seq]
        print celebs
    return celebs
In [221]:
celebs = find_celebrities(Gs, threshold=100)
celebs = find_celebrities(Gs, k=100)
for celeb in celebs:
print ids2screennames[nodes2names[celeb]]
In [222]:
for celeb in celebs[:10]:
    print ids2screennames[nodes2names[celeb]]
    nbrs = [v for v in Gs.GetNI(celeb).GetInEdges()]
    print len(nbrs)
    print util.get_modularity(Gu, nbrs)
In [14]:
'''
all_communes is a dict mapping each community_id to the list of node ids in that community.
Note that these are node ids, not Twitter ids; you can translate a node id to a Twitter user id
through the map nodes2names, e.g. if node_id is 2, nodes2names[2] gives the associated Twitter user id.
'''
all_communes, modularity = util.find_communities(Gu)
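In [ ]:
'''
Minimal usage sketch of the id translation described above:
node id -> Twitter user id (via nodes2names) -> screen name (via ids2screennames).
node_id = 2 is just an arbitrary example and is assumed to exist in both maps.
'''
node_id = 2
twitter_id = nodes2names[node_id]
print 'node %d -> Twitter id %s -> screen name %s' % (node_id, twitter_id, ids2screennames[twitter_id])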
In [13]:
# make sure each node is in exactly one community
assert sum([len(all_communes[i]) for i in all_communes]) == Gu.GetNodes()
In [232]:
count_community_size = OrderedDict(sorted(Counter([len(all_communes[i]) for i in all_communes]).items()))
print count_community_size
counts = []
sizes = []
for i in count_community_size:
sizes.append(i)
counts.append(count_community_size[i])
plt.loglog(sizes, counts)
plt.title('Community sizes')
plt.xlabel('size')
plt.ylabel('number of communities of that size')
plt.savefig('community_size.png')
plt.show()
In [15]:
threshold = 20
all_nodes = [node.GetId() for node in Gu.Nodes()]
print 'number of communities:', len(all_communes)
communes = {i: all_communes[i] for i in all_communes if len(all_communes[i]) >= threshold}
print '%d communities with at least %d members:' %(len(communes), threshold)
communes_modularity = []
random_modularity = []
for i in communes:
nodes = snap.TIntV()
for node in communes[i]:
nodes.Add(node)
rand = snap.TIntV()
rand_nodes = random.sample(all_nodes, len(nodes))
for node in rand_nodes:
rand.Add(node)
score = snap.GetModularity(G, nodes)
communes_modularity.append((score, i))
random_modularity.append(snap.GetModularity(G, rand))
print "The modularity of the network is %f" % modularity
In [20]:
print min(random_modularity), max(random_modularity)
In [19]:
communes_modularity = sorted(communes_modularity)[::-1]
In [22]:
for score, i in communes_modularity:
print 'Community %d with %d users, modularity score %f' %(i, len(communes[i]), score)
In [183]:
def label_community(G, communes, pro_trump, pro_hillary):
    diff_threshold = 5
    trump_neutrals = set()
    hillary_neutrals = set()
    trump_users = {}
    hillary_users = {}
    labeled_communes = {'trump': [], 'hillary': [], 'neutral': []}
    all_nodes = [node.GetId() for node in G.Nodes()]
    print "Over the entire network"
    _, _, tag_ratio, user_ratio, diff, trumpers, hillaryers, neutral_users, neutrals \
        = util.homophily(G, id2hashtags, all_nodes, pro_trump, pro_hillary)
    print "\nOver each community"
    for i in communes:
        print '%d. size of community: %d' %(i, len(communes[i]))
        hashtags, others, tag, user, commune_diff, trumpers, hillaryers, neutral_users, neutrals \
            = util.homophily(G, id2hashtags, communes[i], pro_trump, pro_hillary)
        user_diff = abs(len(trumpers) - len(hillaryers))
        if (user >= 2 * user_ratio or user == -1) and user_diff >= diff_threshold:
            labeled_communes['trump'].append(i)
            trump_neutrals = trump_neutrals | set(neutrals)
        elif user <= 0.5 * user_ratio and user_diff >= diff_threshold:
            labeled_communes['hillary'].append(i)
            hillary_neutrals = hillary_neutrals | set(neutrals)
        else:
            labeled_communes['neutral'].append(i)
        trump_users[i] = trumpers
        hillary_users[i] = hillaryers
    print "Labeled communities", labeled_communes
    return labeled_communes, trump_users, hillary_users, trump_neutrals, hillary_neutrals
In [185]:
labeled_communes, trump_users, hillary_users, trump_neutrals, hillary_neutrals = \
label_community(Gu, communes, pro_trump_all, pro_hillary_all)
In [233]:
all_trump_users, all_hillary_users = set(), set()
for c in trump_users:
all_trump_users = all_trump_users | set(trump_users[c])
all_hillary_users = all_hillary_users | set(hillary_users[c])
In [105]:
def eliminate_common_tags(trump_neutrals, hillary_neutrals):
trump_only = trump_neutrals - hillary_neutrals
hillary_only = hillary_neutrals - trump_neutrals
common = trump_neutrals & hillary_neutrals
print len(trump_only), len(hillary_only), len(common)
return trump_only, hillary_only
In [237]:
def write_semi_tags(hashtag2ids, tags, filename, users, threshold=3):
    '''
    Keep the tags used by at least `threshold` of the given users,
    write them to filename (one per line), and return them.
    '''
    pop = []
    for tag in tags:
        count = 0
        for user in hashtag2ids[tag]:
            if user in users:
                count += 1
        if count >= threshold:
            pop.append(tag)
    with open(filename, 'w') as f:
        for tag in pop:
            f.write(tag.encode('utf-8') + '\n')
    return pop
In [239]:
def grow_tags(trump_neutrals, hillary_neutrals, hashtag2ids, trump_users, hillary_users, mode='general', threshold=3):
trump_only, hillary_only = eliminate_common_tags(trump_neutrals, hillary_neutrals)
trump_pop = write_semi_tags(hashtag2ids, trump_only, mode + '_trump_semi.tags', trump_users, threshold)
print 'popular trump:', len(trump_pop)
hillary_pop = write_semi_tags(hashtag2ids, hillary_only, mode + '_hillary_semi.tags', hillary_users, threshold)
print 'popular hillary:', len(hillary_pop)
In [240]:
print len(trump_neutrals), len(hillary_neutrals)
grow_tags(trump_neutrals, hillary_neutrals, hashtag2ids, all_trump_users, all_hillary_users, 'general', threshold=3)
In [241]:
def get_semi_tags(id2hashtags, users):
neutrals = set()
for node in users:
neutrals = neutrals | set(id2hashtags[node])
return neutrals
In [242]:
indi_trump_neutrals = get_semi_tags(id2hashtags, all_trump_users)
indi_hillary_neutrals = get_semi_tags(id2hashtags, all_hillary_users)
In [249]:
print len(indi_trump_neutrals), len(indi_hillary_neutrals)
grow_tags(indi_trump_neutrals, indi_hillary_neutrals, hashtag2ids, all_trump_users, all_hillary_users, mode='indi', threshold=3)
In [244]:
def count_tags(semi, pro_trump, pro_hillary):
trumps = set()
hillarys = set()
neutrals = set()
for tag in semi:
if tag in pro_trump:
if tag in pro_hillary:
neutrals.add(tag)
else:
trumps.add(tag)
elif tag in pro_hillary:
hillarys.add(tag)
else:
neutrals.add(tag)
return trumps, hillarys, neutrals
In [245]:
def evaluate_semi(mode='indi'):
pro_hillary_all, pro_trump_all = util.load_hashtags('tags/all_hillary.tags', 'tags/all_trump.tags')
semi_hillary, semi_trump = util.load_hashtags(mode + '_hillary_semi.tags', mode + '_trump_semi.tags')
trumps, hillarys, trump_neutrals = count_tags(semi_trump, pro_trump_all, pro_hillary_all)
print "%d Trump semi hashtags" %(len(semi_trump))
print "Correct %d. Wrong %d. Neutrals %d\n" %(len(trumps), len(hillarys), len(trump_neutrals))
with open('chosen_semi_trump.tags', 'w') as f:
for tag in trumps:
f.write(tag + '\n')
print hillarys
trumps, hillarys, hillary_neutrals = count_tags(semi_hillary, pro_trump_all, pro_hillary_all)
print "%d Hillary semi hashtags" %(len(semi_hillary))
print "Correct %d. Wrong %d. Neutrals %d\n" %(len(hillarys), len(trumps), len(hillary_neutrals))
with open('chosen_semi_hillary.tags', 'w') as f:
for tag in hillarys:
f.write(tag + '\n')
print trumps
return trump_neutrals, hillary_neutrals
In [251]:
trump_neutrals, hillary_neutrals = evaluate_semi(mode='indi')
In [181]:
print trump_neutrals
top_communes: dict mapping a community's index to the list of nodes in that community, for communities with at least `threshold` nodes
trump_user_communes: dict mapping a community's index to the list of pro-Trump users in that community
hillary_user_communes: dict mapping a community's index to the list of pro-Hillary users in that community
trump_communes: list of indices of pro-Trump communities
hillary_communes: list of indices of pro-Hillary communities
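In [ ]:
'''
Hedged sketch (an assumption, not shown in the original notebook) of how the variables
described above could be assembled from what was computed earlier: `communes` already
holds the communities with at least `threshold` members, and label_community returned
labeled_communes, trump_users, and hillary_users.
'''
top_communes = communes
trump_user_communes = trump_users
hillary_user_communes = hillary_users
trump_communes = labeled_communes['trump']
hillary_communes = labeled_communes['hillary']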
In [ ]:
'''
Pick random users from a given community.
Example: 5 random pro-Trump users from a pro-Trump community
'''
idx = random.sample(trump_communes, 1)[0]
users = random.sample(trump_user_communes[idx], k=5)
print users
In [ ]:
def convert_screenname(node):
return ids2screennames[nodes2names[node]]
In [ ]:
'''
Find people who interact with others a lot ("active community members"),
based on node degree in the undirected graph Gu.
This function can also be used to find people whose posts are popular
but who don't necessarily interact with others a lot ("celebrities"):
just pass in the directed graph Gs instead of Gu.
The list of users that primarily create content ("creators") is
the same as the list of popular users: they create content
and other people retweet them (high in-degree).
return:
    active_all: k most active users in that community
    active_trump: k most active pro_trump users in that community
    active_hillary: k most active pro_hillary users in that community
'''
def commune_active_users(G, communes, trump_users, hillary_users, idx, k):
deg_seq = sorted([(G.GetNI(node).GetInDeg(), -node) for node in communes[idx]])[::-1]
active_all = [-x[1] for x in deg_seq[:k]]
deg_seq = sorted([(G.GetNI(node).GetInDeg(), -node) for node in trump_users[idx]])[::-1]
active_trump = [-x[1] for x in deg_seq[:k]]
deg_seq = sorted([(G.GetNI(node).GetInDeg(), -node) for node in hillary_users[idx]])[::-1]
active_hillary = [-x[1] for x in deg_seq[:k]]
return active_all, active_trump, active_hillary
In [ ]:
'''
Example: find the 5 most active members in a commune.
Might not print all 5 if there aren't enough trump/hillary users in that community.
'''
# idx = random.sample(trump_communes, 1)[0]
idx = 25
k = 5
active_all, active_trump, active_hillary = \
    commune_active_users(Gu, top_communes, trump_user_communes, hillary_user_communes, idx, k)
print active_all, active_trump, active_hillary
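In [ ]:
'''
Optional follow-up (not in the original notebook): translate the node ids above to
screen names with the convert_screenname helper defined earlier, assuming every node
id appears in nodes2names and ids2screennames.
'''
print [convert_screenname(node) for node in active_all]
print [convert_screenname(node) for node in active_trump]
print [convert_screenname(node) for node in active_hillary]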
In [ ]:
'''
Example: find the 5 most popular members in a commune.
Might not print all 5 if there aren't enough trump/hillary users in that community.
'''
# idx = random.sample(trump_communes, 1)[0]
idx = 25
k = 5
popular_all, popular_trump, popular_hillary = \
    commune_active_users(Gs, top_communes, trump_user_communes, hillary_user_communes, idx, k)
print popular_all, popular_trump, popular_hillary
In [ ]:
'''
Return a list of k users that primarily spread content others create ("distributors"),
based on out-degree in the directed graph Gs.
return:
    dist_all: k most active distributors in that community
    dist_trump: k most active pro_trump distributors in that community
    dist_hillary: k most active pro_hillary distributors in that community
'''
def commune_active_distributors(G, communes, trump_users, hillary_users, idx, k):
deg_seq = sorted([(G.GetNI(node).GetOutDeg(), -node) for node in communes[idx]])[::-1]
active_all = [-x[1] for x in deg_seq[:k]]
deg_seq = sorted([(G.GetNI(node).GetOutDeg(), -node) for node in trump_users[idx]])[::-1]
active_trump = [-x[1] for x in deg_seq[:k]]
deg_seq = sorted([(G.GetNI(node).GetOutDeg(), -node) for node in hillary_users[idx]])[::-1]
active_hillary = [-x[1] for x in deg_seq[:k]]
return active_all, active_trump, active_hillary
In [ ]:
'''
Example: find the 5 most active distributors in a commune.
Might not print all 5 if there aren't enough trump/hillary users in that community.
'''
# idx = random.sample(trump_communes, 1)[0]
idx = 25
k = 5
dist_all, dist_trump, dist_hillary = \
    commune_active_distributors(Gs, top_communes, trump_user_communes, hillary_user_communes, idx, k)
print dist_all, dist_trump, dist_hillary
In [ ]:
'''
For most communities, there is no evidence that pro-Trump users within a community
are more likely to tweet at each other than chance would suggest; the same holds for
pro-Hillary users. In some communities, however, there is such evidence.
'''
for i in communes:
print 'community', i
print util.get_modularity(Gu, trump_users[i])
print util.get_modularity(Gu, hillary_users[i])