In [70]:
from collections import Counter, OrderedDict, defaultdict
from dateutil.parser import parse
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas
import snap

import loader
import tweet_util
import util

In [2]:
michigan_tweets = loader.load_michigan_tweets() # a list of dictionaries

In [3]:
'''
There are three different types of graphs:
Gs: directed single graph
Gu: undirected single graph
G: undirected multi-graph (allows multile edges)

We will be using Gu to detect communities because it seems to work best.
We use Gs to find popular users
G is useless.
'''
Gs, nodes2names_s, names2nodes_s, ids2screennames = util.create_graph_from_tweets(michigan_tweets, multi=False)
print 'single graph directed. number of nodes: %d. number of edges: %d' %(Gs.GetNodes(), Gs.GetEdges())
print 'self edge', snap.CntSelfEdges(Gs)

Gu, nodes2names_u, names2nodes_u, ids2screennames = util.create_graph_from_tweets(michigan_tweets, multi=False, directed=False)
print 'single graph undirected. number of nodes: %d. number of edges: %d' %(Gu.GetNodes(), Gu.GetEdges())
print 'self edge', snap.CntSelfEdges(Gu)

G, nodes2names, names2nodes, ids2screennames = util.create_graph_from_tweets(michigan_tweets, multi=True)
print 'multi graph. number of nodes: %d. number of edges: %d' %(G.GetNodes(), G.GetEdges())
print 'self edge', snap.CntSelfEdges(G)


single graph directed. number of nodes: 38936. number of edges: 51732
self edge 0
single graph undirected. number of nodes: 38936. number of edges: 51702
self edge 0
multi graph. number of nodes: 38936. number of edges: 66970
self edge 0

In [78]:
pro_hillary, pro_trump = util.load_hashtags('tags/curated_hillary.tags', 'tags/curated_trump.tags')
print '%d pro Hillary hashtags, %d pro Trump hashtags' %(len(pro_hillary), len(pro_trump))


417 pro Hillary tweets, 647 pro Trump tweets

In [79]:
pro_hillary_orig, pro_trump_orig = util.load_hashtags('tags/orig_hillary.tags', 'tags/orig_trump.tags')
print '%d pro Hillary hashtags, %d pro Trump hashtags' %(len(pro_hillary_orig), len(pro_trump_orig))


20 pro Hillary tweets, 20 pro Trump tweets

In [1]:
pro_hillary_all, pro_trump_all = util.load_hashtags('tags/all_hillary.tags', 'tags/all_trump.tags')
print '%d pro Hillary hashtags, %d pro Trump tweets' %(len(pro_hillary_all), len(pro_trump_all))


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-388d9562b5c9> in <module>()
----> 1 pro_hillary_all, pro_trump_all = util.load_hashtags('tags/all_hillary.tags', 'tags/all_trump.tags')
      2 print '%d pro Hillary hashtags, %d pro Trump tweets' %(len(pro_hillary_all), len(pro_trump_all))

NameError: name 'util' is not defined

In [80]:
'''
Add the tags got from semi-supervised
'''
new_hillary, new_trump = util.load_hashtags('tags/correct_hillary.tags', 'tags/correct_trump.tags')
pro_hillary_all = pro_hillary | new_hillary
pro_trump_all = pro_trump | new_trump
print '%d pro Hillary tweets, %d pro Trump tweets' %(len(pro_hillary_all), len(pro_trump_all))


431 pro Hillary tweets, 682 pro Trump tweets

In [6]:
hashtag2ids, id2hashtags = util.get_unique_hashtags(michigan_tweets, names2nodes)

Find communities based on hashtag and users that they follow

See milestone part 2.1


In [191]:
'''
Modularity scores are negative, which means that overall,
pro Trump users are less likely to interact with each other than just random.
Same for pro Hillary users.

There are 4834 users in the entire graph that tweet pro_Trump hashtags.
There are 4270 users in the entire graph that tweet pro_Trump hashtags.
Out of 19595 users who tweet with hashtags and 38936 total users.
(only 23.4% of total users)
'''
util.get_modularity_tag(Gu, hashtag2ids, ['makeamericagreatagain'])
util.get_modularity_tag(Gu, hashtag2ids, ['nevertrump'])

util.get_modularity_tag(Gu, hashtag2ids, pro_trump_all)
util.get_modularity_tag(Gu, hashtag2ids, pro_hillary_all)


number of nodes: 757
modularity score: -0.031860, compared to -0.000115 random
number of nodes: 533
modularity score: -0.015996, compared to 0.000066 random
number of nodes: 3453
modularity score: -0.108515, compared to 0.000240 random
number of nodes: 3342
modularity score: -0.097418, compared to -0.000324 random

In [213]:
def find_celebrities(G, k=-1, threshold=100):
    '''
    Find k people with the most people tweet to them (with highest in degrees)
    over the entire graph
    Or find all nodes with the at least threshold people tweeting to them 
    (with in degree of at least threshold)
    '''
    if k != -1:
        deg_seq = sorted([(node.GetInDeg(), -node.GetId()) for node in G.Nodes()])[::-1]
        celebs = [-x[1] for x in deg_seq[:k]]
    else:
        deg_seq = sorted([(node.GetInDeg(), -node.GetId()) for node in G.Nodes() if node.GetInDeg() >= threshold])[::-1]
        print deg_seq[:10]
        celebs = [-x[1] for x in deg_seq]
        print celebs
    return celebs

In [221]:
celebs = find_celebrities(Gs, threshold=100)
celebs =  find_celebrities(Gs, k=100) 
for celeb in celebs:
    print ids2screennames[nodes2names[celeb]]


[(1575, -11), (991, -5), (417, -84), (300, -877), (277, -89), (273, -9), (265, -63), (260, -54), (245, -43), (235, -361)]
[11, 5, 84, 877, 89, 9, 63, 54, 43, 361, 36, 42, 290, 71, 170, 339, 358, 489, 88, 146, 2386, 159, 150, 1560, 138, 262, 19, 137, 1450, 45, 194, 771, 115, 1648, 199, 276]
realDonaldTrump
HillaryClinton
wikileaks
DonaldJTrumpJr
FoxNews
DanScavino
bfraser747
mike_pence
LindaSuhler
CNN
LouDobbs
TeamTrump
POTUS
FBI
KellyannePolls
WayneDupreeShow
YouTube
vivelafra
seanhannity
VoteTrumpPics
RealJamesWoods
ChristiChat
Lrihendry
EricTrump
WeNeedTrump
DailyCaller
SandraTXAS
RealJack
BernieSanders
Cernovich
Miami4Trump
mitchellvii
JamesOKeefeIII
TomiLahren
Stevenwhirsch99
GenFlynn
LeahR77
JohnKStahlUSA
TrumpSuperPAC
RealAlexJones
DiamondandSilk
DrMartyFox
ABC
YoungDems4Trump
GOP
Stonewall_77
AP
MIGOP
TheDemocrats
MSNBC
ScottAdamsSays
HumaAbedin
BarackObama
IngrahamAngle
FoxBusiness
ladygaga
IvankaTrump
somaliadev
JohnFromCranber
MELANIATRUMP
CarmineZozzora
peterdaou
oranicuhh
LaraLeaTrump
donnabrazile
KamVTV
PJStrikeForce
ScottWalker
brunelldonald
jojoh888
NadelParis
newtgingrich
Bikers4Trump
Darren32895836
SpeakerRyan
The_Trump_Train
joelpollak
megynkelly
CorrectRecord
WeSupport45
healthandcents
LVNancy
chuckwoolery
MAGA3X
StockMonsterUSA
DBloom451
ConstanceQueen8
NIVIsa4031
pattonoswalt
mikandynothem
carrieksada
Project_Veritas
ChrisJZullo
alaskantexanQCT
TallahForTrump
tedcruz
DrLee4America
jaketapper
politicususa
DrJillStein

In [222]:
for celeb in celebs[:10]:
    print ids2screennames[nodes2names[celeb]]
    nbrs = [v for v in Gs.GetNI(celeb).GetInEdges()]
    print len(nbrs)
    util.get_modularity(Gu, nbrs)


realDonaldTrump
1575
-0.0798304565441 -0.000131314361483
HillaryClinton
991
-0.0615741180805 9.30123441949e-05
wikileaks
417
-0.0262568481976 -4.24900436939e-06
DonaldJTrumpJr
300
-0.016676141389 1.30115003497e-05
FoxNews
277
-0.015529720786 6.45758506545e-06
DanScavino
273
-0.0219186184774 1.5613912649e-06
bfraser747
265
-0.0224424352722 -4.03950952462e-06
mike_pence
260
-0.0177954879632 -2.92762498847e-05
LindaSuhler
245
-0.0222845488277 -1.71913901252e-05
CNN
235
-0.0143243738658 -1.46661355254e-05

Find communities based on graph's structure using CNM algorithm

See milestone part 2.2


In [14]:
'''
communes is a dict mapping from community_id to list of ids in that community
but id is node id, not twitter id. you can translate from node id to twitter ids through map nodes2names
so if node_id is 2, you can get the twitter user id associated with it using nodes2names[2]
'''
all_communes, modularity = util.find_communities(Gu)

In [13]:
# make sure each node is in exactly one community
assert sum([len(all_communes[i]) for i in all_communes]) == Gu.GetNodes()

In [232]:
count_community_size = OrderedDict(sorted(Counter([len(all_communes[i]) for i in all_communes]).items()))
print count_community_size
counts = []
sizes = []
for i in count_community_size:
    sizes.append(i)
    counts.append(count_community_size[i])
plt.loglog(sizes, counts)
plt.title('Community sizes')
plt.xlabel('size')
plt.ylabel('number of communities with that size')
plt.savefig('community_size.png')
plt.show()


OrderedDict([(1, 6209), (2, 5012), (3, 964), (4, 306), (5, 107), (6, 68), (7, 52), (8, 23), (9, 11), (10, 16), (11, 8), (12, 11), (13, 4), (14, 6), (15, 3), (16, 5), (17, 4), (18, 2), (19, 4), (20, 1), (21, 2), (23, 1), (26, 1), (28, 3), (30, 1), (33, 2), (35, 4), (52, 1), (55, 1), (104, 2), (113, 1), (128, 2), (192, 1), (238, 1), (249, 1), (322, 1), (341, 1), (349, 1), (486, 1), (1156, 1), (4881, 1), (6847, 1)])

In [15]:
threshold = 20
all_nodes = [node.GetId() for node in Gu.Nodes()]
print 'number of communities:', len(all_communes)
communes = {i: all_communes[i] for i in all_communes if len(all_communes[i]) >= threshold}
print '%d communities with at least %d members:' %(len(communes), threshold)
communes_modularity = []
random_modularity = []

for i in communes:
    nodes = snap.TIntV()
    for node in communes[i]:
        nodes.Add(node)
    rand = snap.TIntV()
    rand_nodes = random.sample(all_nodes, len(nodes))
    for node in rand_nodes:
        rand.Add(node)
    score = snap.GetModularity(G, nodes)
    communes_modularity.append((score, i))
    random_modularity.append(snap.GetModularity(G, rand))
print "The modularity of the network is %f" % modularity


number of communities: 12847
32 communities with at least 20 members:
The modularity of the network is 0.613922

In [20]:
print min(random_modularity), max(random_modularity)


-6.33051748336e-06 0.00693659944014

In [19]:
communes_modularity = sorted(communes_modularity)[::-1]

In [22]:
for score, i in communes_modularity:
    print 'Community %d with %d users, modularity score %f' %(i, len(communes[i]), score)


Community 0 with 6847 users, modularity score 0.158291
Community 1 with 4881 users, modularity score 0.067584
Community 26 with 1156 users, modularity score 0.012794
Community 56 with 486 users, modularity score 0.005132
Community 312 with 349 users, modularity score 0.002811
Community 31 with 341 users, modularity score 0.002687
Community 25 with 322 users, modularity score 0.002684
Community 38 with 238 users, modularity score 0.002082
Community 634 with 249 users, modularity score 0.001922
Community 41 with 192 users, modularity score 0.001504
Community 146 with 128 users, modularity score 0.001228
Community 685 with 128 users, modularity score 0.000984
Community 66 with 113 users, modularity score 0.000894
Community 24 with 104 users, modularity score 0.000871
Community 247 with 104 users, modularity score 0.000820
Community 368 with 55 users, modularity score 0.000455
Community 18 with 52 users, modularity score 0.000395
Community 789 with 35 users, modularity score 0.000343
Community 1566 with 33 users, modularity score 0.000291
Community 49 with 35 users, modularity score 0.000291
Community 693 with 33 users, modularity score 0.000261
Community 177 with 35 users, modularity score 0.000254
Community 1237 with 35 users, modularity score 0.000254
Community 916 with 28 users, modularity score 0.000239
Community 710 with 30 users, modularity score 0.000224
Community 510 with 28 users, modularity score 0.000216
Community 10897 with 28 users, modularity score 0.000202
Community 1232 with 26 users, modularity score 0.000187
Community 1390 with 21 users, modularity score 0.000172
Community 1862 with 21 users, modularity score 0.000164
Community 746 with 23 users, modularity score 0.000164
Community 262 with 20 users, modularity score 0.000142

In [183]:
def label_community(G, communes, pro_trump, pro_hillary):
    diff_threshold = 5
    trump_neutrals = set()
    hillary_neutrals = set()
    trump_users = {}
    hillary_users = {}
    labeled_communes = {'trump': [], 'hillary': [], 'neutral': []}
    
    all_nodes = [node.GetId() for node in G.Nodes()]
    
    print "Over the entire network"
    _, _, tag_ratio, user_ratio, diff, trumpers, hillaryers, neutral_users, neutrals \
                            = util.homophily(Gu, id2hashtags, all_nodes, pro_trump, pro_hillary)
    
    print "\nOver each community"
    for i in communes:
        print '%d. size of community %d' %(i, len(communes[i]))
        hashtags, others, tag, user, commune_diff, trumpers, hillaryers, neutral_users, neutrals \
            = util.homophily(Gu, id2hashtags, top_communes[i], pro_trump, pro_hillary)
        user_diff = abs(len(trumpers) - len(hillaryers))
        if (user >= 2 * user_ratio or user == -1) and user_diff >= diff_threshold:
            labeled_communes['trump'].append(i)
            trump_neutrals = trump_neutrals | set(neutrals)
        elif user <= 0.5 * user_ratio and user_diff >= diff_threshold: 
            labeled_communes['hillary'].append(i)
            hillary_neutrals = hillary_neutrals | set(neutrals)
        else:
            labeled_communes['neutral'].append(i)

        trump_users[i] = trumpers
        hillary_users[i] = hillaryers
    print "Labeled communities", labeled_communes
    return labeled_communes, trump_users, hillary_users, trump_neutrals, hillary_neutrals

In [185]:
labeled_communes, trump_users, hillary_users, trump_neutrals, hillary_neutrals = \
                                        label_community(Gu, communes, pro_trump_all, pro_hillary_all)


Over the entire network
708 trump tags, 444 hillary tags out of 10318. Trump/Hillary = 1.594595
3073 trump users, 2530 hillary users. 205 users are neutral. Trump/Hillary = 1.214625

Over each community
0. size of community 6847
669 trump tags, 145 hillary tags out of 4227. Trump/Hillary = 4.613793
1101 trump users, 163 hillary users. 31 users are neutral. Trump/Hillary = 6.754601
1. size of community 4881
119 trump tags, 367 hillary tags out of 2421. Trump/Hillary = 0.324251
492 trump users, 662 hillary users. 51 users are neutral. Trump/Hillary = 0.743202
262. size of community 20
0 trump tags, 0 hillary tags out of 8. Trump/Hillary = -1.000000
0 trump users, 0 hillary users. 0 users are neutral. Trump/Hillary = -1.000000
10897. size of community 28
0 trump tags, 0 hillary tags out of 3. Trump/Hillary = -1.000000
0 trump users, 0 hillary users. 0 users are neutral. Trump/Hillary = -1.000000
146. size of community 128
42 trump tags, 12 hillary tags out of 148. Trump/Hillary = 3.500000
11 trump users, 7 hillary users. 1 users are neutral. Trump/Hillary = 1.571429
916. size of community 28
0 trump tags, 0 hillary tags out of 2. Trump/Hillary = -1.000000
0 trump users, 0 hillary users. 0 users are neutral. Trump/Hillary = -1.000000
789. size of community 35
4 trump tags, 9 hillary tags out of 37. Trump/Hillary = 0.444444
0 trump users, 1 hillary users. 0 users are neutral. Trump/Hillary = 0.000000
24. size of community 104
40 trump tags, 15 hillary tags out of 166. Trump/Hillary = 2.666667
11 trump users, 10 hillary users. 2 users are neutral. Trump/Hillary = 1.100000
25. size of community 322
56 trump tags, 13 hillary tags out of 296. Trump/Hillary = 4.307692
86 trump users, 21 hillary users. 2 users are neutral. Trump/Hillary = 4.095238
26. size of community 1156
107 trump tags, 46 hillary tags out of 867. Trump/Hillary = 2.326087
149 trump users, 74 hillary users. 29 users are neutral. Trump/Hillary = 2.013514
1566. size of community 33
0 trump tags, 7 hillary tags out of 30. Trump/Hillary = 0.000000
0 trump users, 11 hillary users. 0 users are neutral. Trump/Hillary = 0.000000
31. size of community 341
115 trump tags, 13 hillary tags out of 341. Trump/Hillary = 8.846154
35 trump users, 5 hillary users. 0 users are neutral. Trump/Hillary = 7.000000
1862. size of community 21
0 trump tags, 0 hillary tags out of 6. Trump/Hillary = -1.000000
0 trump users, 0 hillary users. 0 users are neutral. Trump/Hillary = -1.000000
38. size of community 238
56 trump tags, 11 hillary tags out of 252. Trump/Hillary = 5.090909
27 trump users, 14 hillary users. 0 users are neutral. Trump/Hillary = 1.928571
49. size of community 35
15 trump tags, 4 hillary tags out of 61. Trump/Hillary = 3.750000
11 trump users, 2 hillary users. 1 users are neutral. Trump/Hillary = 5.500000
41. size of community 192
28 trump tags, 15 hillary tags out of 158. Trump/Hillary = 1.866667
11 trump users, 36 hillary users. 3 users are neutral. Trump/Hillary = 0.305556
685. size of community 128
2 trump tags, 5 hillary tags out of 28. Trump/Hillary = 0.400000
4 trump users, 29 hillary users. 0 users are neutral. Trump/Hillary = 0.137931
177. size of community 35
5 trump tags, 1 hillary tags out of 24. Trump/Hillary = 5.000000
11 trump users, 2 hillary users. 0 users are neutral. Trump/Hillary = 5.500000
693. size of community 33
0 trump tags, 3 hillary tags out of 19. Trump/Hillary = 0.000000
0 trump users, 3 hillary users. 0 users are neutral. Trump/Hillary = 0.000000
56. size of community 486
24 trump tags, 13 hillary tags out of 232. Trump/Hillary = 1.846154
24 trump users, 26 hillary users. 3 users are neutral. Trump/Hillary = 0.923077
66. size of community 113
32 trump tags, 12 hillary tags out of 185. Trump/Hillary = 2.666667
17 trump users, 4 hillary users. 0 users are neutral. Trump/Hillary = 4.250000
710. size of community 30
1 trump tags, 3 hillary tags out of 24. Trump/Hillary = 0.333333
1 trump users, 1 hillary users. 0 users are neutral. Trump/Hillary = 1.000000
1232. size of community 26
2 trump tags, 0 hillary tags out of 8. Trump/Hillary = -1.000000
19 trump users, 0 hillary users. 0 users are neutral. Trump/Hillary = -1.000000
312. size of community 349
13 trump tags, 17 hillary tags out of 97. Trump/Hillary = 0.764706
7 trump users, 79 hillary users. 5 users are neutral. Trump/Hillary = 0.088608
1237. size of community 35
2 trump tags, 4 hillary tags out of 14. Trump/Hillary = 0.500000
1 trump users, 24 hillary users. 1 users are neutral. Trump/Hillary = 0.041667
746. size of community 23
17 trump tags, 5 hillary tags out of 48. Trump/Hillary = 3.400000
3 trump users, 1 hillary users. 0 users are neutral. Trump/Hillary = 3.000000
18. size of community 52
6 trump tags, 8 hillary tags out of 51. Trump/Hillary = 0.750000
1 trump users, 13 hillary users. 1 users are neutral. Trump/Hillary = 0.076923
1390. size of community 21
0 trump tags, 2 hillary tags out of 31. Trump/Hillary = 0.000000
0 trump users, 3 hillary users. 0 users are neutral. Trump/Hillary = 0.000000
368. size of community 55
16 trump tags, 2 hillary tags out of 50. Trump/Hillary = 8.000000
18 trump users, 1 hillary users. 1 users are neutral. Trump/Hillary = 18.000000
247. size of community 104
5 trump tags, 10 hillary tags out of 64. Trump/Hillary = 0.500000
0 trump users, 18 hillary users. 4 users are neutral. Trump/Hillary = 0.000000
634. size of community 249
5 trump tags, 8 hillary tags out of 47. Trump/Hillary = 0.625000
2 trump users, 44 hillary users. 1 users are neutral. Trump/Hillary = 0.045455
510. size of community 28
0 trump tags, 4 hillary tags out of 38. Trump/Hillary = 0.000000
0 trump users, 3 hillary users. 0 users are neutral. Trump/Hillary = 0.000000
Labeled communities {'hillary': [1566, 41, 685, 312, 1237, 18, 247, 634], 'neutral': [1, 262, 10897, 146, 916, 789, 24, 26, 1862, 38, 693, 56, 710, 746, 1390, 510], 'trump': [0, 25, 31, 49, 177, 66, 1232, 368]}

In [233]:
all_trump_users, all_hillary_users = set(), set()
for c in trump_users:
    all_trump_users = all_trump_users | set(trump_users[c])
    all_hillary_users = all_hillary_users | set(hillary_users[c])

Semi-supervised way to grow lists of polarized hashtags


In [105]:
def eliminate_common_tags(trump_neutrals, hillary_neutrals):
    trump_only = trump_neutrals - hillary_neutrals
    hillary_only = hillary_neutrals - trump_neutrals
    common = trump_neutrals & hillary_neutrals
    print len(trump_only), len(hillary_only), len(common)
    return trump_only, hillary_only

In [237]:
def write_semi_tags(hashtag2ids, tags, filename, users, threshold=3):
    pop = []
    for tag in tags:
        count = 0
        for user in hashtag2ids[tag]:
            if user in users:
                count += 1
        if count >= threshold:
            pop.append(tag)
    with open(filename, 'w') as f:
        for tag in pop:
            f.write(tag.encode('utf-8') + '\n')
    return pop

In [239]:
def grow_tags(trump_neutrals, hillary_neutrals, hashtag2ids, trump_users, hillary_users, mode='general', threshold=3):
    trump_only, hillary_only = eliminate_common_tags(trump_neutrals, hillary_neutrals)
    trump_pop = write_semi_tags(hashtag2ids, trump_only, mode + '_trump_semi.tags', trump_users, threshold)
    print 'popular trump:', len(trump_pop)
    hillary_pop = write_semi_tags(hashtag2ids, hillary_only, mode + '_hillary_semi.tags', hillary_users, threshold)
    print 'popular hillary:', len(hillary_pop)

In [240]:
print len(trump_neutrals), len(hillary_neutrals)
grow_tags(trump_neutrals, hillary_neutrals, hashtag2ids, all_trump_users, all_hillary_users, 'general', threshold=3)


3683 267
3556 140 127
popular trump: 884
popular hillary: 7

In [241]:
def get_semi_tags(id2hashtags, users):
    neutrals = set()
    for node in users:
        neutrals = neutrals | set(id2hashtags[node])
    return neutrals

In [242]:
indi_trump_neutrals = get_semi_tags(id2hashtags,all_trump_users)
indi_hillary_neutrals = get_semi_tags(id2hashtags, all_hillary_users)

In [249]:
print len(indi_trump_neutrals), len(indi_hillary_neutrals)
grow_tags(indi_trump_neutrals, indi_hillary_neutrals, hashtag2ids, all_trump_users, all_hillary_users, mode='indi', threshold=3)


4807 2285
4058 1536 749
popular trump: 917
popular hillary: 211

In [244]:
def count_tags(semi, pro_trump, pro_hillary):
    trumps = set()
    hillarys = set()
    neutrals = set()
    for tag in semi:
        if tag in pro_trump:
            if tag in pro_hillary:
                neutrals.add(tag)
            else:
                trumps.add(tag)
        elif tag in pro_hillary:
            hillarys.add(tag)
        else:
            neutrals.add(tag)
    return trumps, hillarys, neutrals

In [245]:
def evaluate_semi(mode='indi'):
    pro_hillary_all, pro_trump_all = util.load_hashtags('tags/all_hillary.tags', 'tags/all_trump.tags')
    semi_hillary, semi_trump = util.load_hashtags(mode + '_hillary_semi.tags', mode + '_trump_semi.tags')
    trumps, hillarys, trump_neutrals = count_tags(semi_trump, pro_trump_all, pro_hillary_all)
    print "%d Trump semi hashtags" %(len(semi_trump))
    print "Correct %d. Wrong %d. Neutrals %d\n" %(len(trumps), len(hillarys), len(trump_neutrals)) 
    with open('chosen_semi_trump.tags', 'w') as f:
        for tag in trumps:
            f.write(tag + '\n')
    print hillarys
    trumps, hillarys, hillary_neutrals = count_tags(semi_hillary, pro_trump_all, pro_hillary_all)
    print "%d Hillary semi hashtags" %(len(semi_hillary))
    print "Correct %d. Wrong %d. Neutrals %d\n" %(len(hillarys), len(trumps), len(hillary_neutrals)) 
    with open('chosen_semi_hillary.tags', 'w') as f:
        for tag in hillarys:
            f.write(tag + '\n')
    print trumps
    return trump_neutrals, hillary_neutrals

In [251]:
trump_neutrals, hillary_neutrals = evaluate_semi(mode='indi')


917 Trump semi hashtags
Correct 276. Wrong 16. Neutrals 625

set(['remeberwhentrump', 'thepeoplepresident', 'trumpprorest', 'notfittoserve', 'hillarycare', 'clintoncampaign', 'teamclinton', 'hillarysamerica', 'hillaryinphilly', 'hillarygate', 'vote4hillary', 'hillaryrally', 'lockhimuptoo', 'backtheblue', 'bluebloods', 'hillarysupporters'])
211 Hillary semi hashtags
Correct 95. Wrong 1. Neutrals 115

set(['keepamericagreat'])

In [181]:
print trump_neutrals


set(['yuge', 'stevepieczenik', 'partnerincrime', 'gettysburg', 'orgyisland', '4days', 'mediabias', 'rtw', 'floridafraud', 'manchester', 'mopsecret', 'left', 'oakland', 'wonderwoman', 'nwo', 'truemu', 'children', 'islam', 'johnkasich', 'podcast', 'police', 'hilary', 'corruptdoj', 'la', 'pray', 'lawandorder', 'cmaawards50', 'dtmag', 'paranoia', 'thursday', 'winning', 'rico', 'truepundit', 'julianassange', 'prison', 'criminal', 'cruzcrew', 'momentum', 'mlb', 'butinagoodway', 'grubhub', 'freespeech', 'government', 'americadecides', 'thesimpsons', 'upnews', 'tbt', 'spiritdinner', 'theshitmagnet', 'facebook', 'ferguson', 'rip', 'haiti', 'weinergate', 'jacksonville', 'saginaw', 'waynecounty', 'saveourchildren', 'pbs', 'nfl', 'wilmington', '5days', 'dealwithit', 'freezerdig', 'michiganmatters2016', '2a', 'veritas', 'weiner', 'podernfamily', 'independanceday', 'copolitics', 'emas', 'art', 'cnnsoto', 'people', 'wemadehistory', 'siouxcity', 'lnyhbt', 'writeinbernie', 'debate2016', 'video', 'cleveland', 'nbc', 'nba', 'bringdownthering', 'monstervote', 'pizzagate', 'trending', 'crooked', 'abedin', 'pensacola', 'globalism', 'liberty', 'pedophila', 'waar', 'brendasnipes', 'followthemoney', 'summerzervos', 'dobbs', 'guns', 'epstein', 'riggedmedia', 'november8', 'sarasota', 'specialreport', 'nov8', 'pompanobeach', 'bombshell', 'wv', 'pawnstars', 'china', 'evil', 'conservatives', 'mondaymotivation', 'news', 'tjms', 'militarymonday', 'minesota', 'a1', 'phase3', 'wdshow', 'feminist', 'voteguilty', 'thankobamain4words', 'cmas', 'justice', 'kidsforcash', 'basementdwellers', 'raleigh', 'syria', 'bernie2016', 'or', 'openborders', 'johnmolesta', 'electionday2016', 'enoughisenough', 'muslimbrotherhood', 'win', 'disobey', 'civicduty', 'lorettalynchmob', 'traitors', 'spiritcooker', 'jesus', 'lies', 'rino', 'ronbrown', 'fake', 'stophate', 'cisco', 'tuesday', 'crime', 'ballotselfie', 'themovement', 'goliath', 'rallyforriley', 'notmeus', 'bears', 'sorosriots', 'bigleaguetruth', 'hilliary', 'fl4', 'bcot', 'birddogging', '1a', 'christianity', 'wednesdaymotivation', 'hollywood', 'minnesota', 'africanamericans', 'savethechildren', 'catholics', 'life', 'wallup', 'pay4play', 'dtla', 'themorningafter', 'rsbn', 'jebbush', 'mansfield', 'getoutandvote', 'varneyco', 'treygowdy', 'usss', 'nypd', 'tampa', 'cartoons', 'me', 'sandiego', 'molonlabe', 'politicsnation', 'bill', 'abortion', 'imwithyou', 'sethrich', 'changeyourvote', 'flythew', 'marines', 'libtards', 'mypresident', 'crookedmedia', '2adefenders', 'donnabrazile', 'liberal', 'ad', 'blacklives', 'nascar', 'sad', 'libertarian', 'bias', 'mannequinchallenge', 'cookcounty', 'grandrapidsmi', 'insidepolitics', 'tgdn', 'stein', 'veritasproject', 'ca', 'strongerasone', 'shelies', 'ny', 'sick', 'boycottgrubhub', 'imwithwl', 'establishment', 'snowflake', 'thirdparty', 'ificouldchangehistory', 'lorettalynch', 'quote', 'freejulian', 'kadzik', 'absenteeballot', 'fromwhereistand2016', 'dc', 'walledlake', 'undecidedvoters', 'blackvotes', 'a2', 'germany', 'showusyoursticker', 'dmyourcrushday', 'electionnightchecklist', 'fakevotingfacts', 'infowars', 'teamtomi', 'nm', 'chrisstevens', 'pedophilia', 'paycheck', 'thisisit', 'masa', 'closernation', 'scranton', 'nov8th', 'navy', 'youtube', 'riggedsystem', 'protesters', 'flotus', 'ct', 'ignorant', 'ky', 'sterlingheights', 'conservative', 'lynch'])

Recommend users

Variables to know

top_communes: the dict of communities with at least threshold nodes. it's a dict mapping from community's index to list of nodes in that community

trump_user_communes: dict mapping from community's index to list of pro-Trump users in that community hillary_user_communes: dict mapping from community's index to list of pro-Hillary users in that community trump_communes: list of indices of pro-Trump communities hillary_communes: list of indices of pro-Hillary communities


In [ ]:
''' 
Random list of user from a certain community
Example: 5 random pro_Trump users from a pro_Trump community
'''
idx = random.sample(trump_communes, 1)[0]
users = random.sample(trump_user_communes[idx], k=5)
print users

In [ ]:
def convert_screenname(node):
    return ids2screennames[nodes2names[node]]

In [ ]:
''' 
People who interact with others a lot ("active community members")
Based on the node degree in undirected graph Gu

This function can also be used to find people whose posts are popular 
but don't necessarily interact with others a lot ("celebrities")
To do so, just pass in the directed graph Gs instead of Gu

a list of users that primarily create content ("creators") is 
the same as the list of popular users. they create content
and people retweet them (high in degree)

return:
    active_all: k most active users in that community
    active_trump: k most active pro_trump users in that community
    active_hillary: k most active pro_hillary users in that community
'''
def commune_active_users(G, communes, trump_users, hillary_users, idx, k):
    deg_seq = sorted([(G.GetNI(node).GetInDeg(), -node) for node in communes[idx]])[::-1]
    active_all = [-x[1] for x in deg_seq[:k]]
    deg_seq = sorted([(G.GetNI(node).GetInDeg(), -node) for node in trump_users[idx]])[::-1]
    active_trump = [-x[1] for x in deg_seq[:k]]
    deg_seq = sorted([(G.GetNI(node).GetInDeg(), -node) for node in hillary_users[idx]])[::-1]
    active_hillary = [-x[1] for x in deg_seq[:k]]
    return active_all, active_trump, active_hillary

In [ ]:
'''
Example: find 5 most active members in a commune
Might not print all 5 if there aren't enough trump/hillary users in that community
'''
# idx = random.sample(trump_communes, 1)[0]
idx = 25
active_all, active_trump, active_hillary = \
            commune_active_users(Gu, top_communes, trump_user_communes, hillary_user_communes, idx, k)
print active_all, active_trump, active_hillary

In [ ]:
'''
Example: find 5 most popular members in a commune
Might not print all 5 if there aren't enough trump/hillary users in that community
'''
# idx = random.sample(trump_communes, 1)[0]
idx = 25
popular_all, popular_trump, popular_hillary = \
            commune_active_users(Gs, top_communes, trump_user_communes, hillary_user_communes, idx, k)
print popular_all, popular_trump, popular_hillary

In [ ]:
''' 
return a list of k users that primarily spread content others create ("distributors")
Based on out-degree on directed graph Gs

return:
    dist_all: k most active distributors in that community
    dist_trump: k most active pro_trump distributors in that community
    dist_hillary: k most active pro_hillary distributors in that community
'''
def commune_active_distributors(G, communes, trump_users, hillary_users, idx, k):
    deg_seq = sorted([(G.GetNI(node).GetOutDeg(), -node) for node in communes[idx]])[::-1]
    active_all = [-x[1] for x in deg_seq[:k]]
    deg_seq = sorted([(G.GetNI(node).GetOutDeg(), -node) for node in trump_users[idx]])[::-1]
    active_trump = [-x[1] for x in deg_seq[:k]]
    deg_seq = sorted([(G.GetNI(node).GetOutDeg(), -node) for node in hillary_users[idx]])[::-1]
    active_hillary = [-x[1] for x in deg_seq[:k]]
    return active_all, active_trump, active_hillary

In [ ]:
'''
Example: find 5 most active distributors in a commune
Might not print all 5 if there aren't enough trump/hillary users in that community
'''
# idx = random.sample(trump_communes, 1)[0]
idx = 25
dist_all, dist_trump, dist_hillary = \
            commune_active_distributors(Gs, top_communes, trump_user_communes, hillary_user_communes, idx, k)
print dist_all, dist_trump, dist_hillary

Appendix


In [ ]:
'''
For most communities, 
there's no evidence that Trump users in a community are more likely to tweet 
to each other. same for hillary users
But in some communities, there are.
'''
for i in communes:
    print 'community', i
    print util.get_modularity(Gu, trump_users[i])
    print util.get_modularity(Gu, hillary_users[i])