In [31]:
# A __future__ import has to come before every other statement. The notebook
# cell evidently ran with it further down, but the same code exported as a
# plain .py script is rejected with a SyntaxError, so it goes first.
from __future__ import division

import re
from collections import defaultdict, Counter

from pymongo import MongoClient

# MongoDB database name and the names of the two collections analysed below.
database_name = "beresi-data"
collection_name1 = "roland_garros_2017"
collection_name2 = "brussels"
# Number of most-mentioned users printed by each report cell.
k = 20

# connect to collection
# MongoClient() is called without arguments, i.e. with pymongo's default
# connection settings.
client = MongoClient()
db = client[database_name]
# `collection` starts out bound to the Roland Garros collection.
collection = db[collection_name1]

In [32]:
# find mentions in Roland Garros dataset
# The collection is addressed by name rather than through the shared
# `collection` variable: that variable is rebound to the Brussels collection
# by a later cell, so reading it here would count the wrong dataset whenever
# this cell is re-run out of order.
# NOTE(review): the field key is literally "text\n" (with a trailing newline);
# presumably an artefact of how the data was imported - confirm against the
# stored documents.
mentioned_rg = Counter()
for rec in db[collection_name1].find():
    # Each "@name" token in the text counts as one mention of "name".
    mentions = re.findall(r"@(\w+)", rec["text\n"])
    mentioned_rg.update(mentions)

In [33]:
# Total number of mentions, as a float so the shares below are fractional.
n = float(sum(mentioned_rg.values()))

# Each print() gets a single string, so the text written is the same as with
# the Python 2 print statement.
print("mentioned users: " + str(len(mentioned_rg)))  # distinct mentioned users
print("--------")

# k most mentioned: one line per user with that user's share of all mentions
top_rg = mentioned_rg.most_common(k)
for key, v in top_rg:
    share = str(v / n * 100)
    print(key + " - " + share + "%")

# Mentions covered by the users listed above.
showed_rg = sum(count for _user, count in top_rg)

print("--------")
# shown/all percentage
print("showed/all: " + str(showed_rg / n * 100) + "%")


mentioned users: 24243
--------
rolandgarros - 24.3557140924%
RafaelNadal - 7.2178806382%
stanwawrinka - 2.32809448113%
DjokerNole - 1.50560613069%
KikiMladenovic - 1.23100571015%
andy_murray - 1.16657206973%
ThiemDomi - 1.12678076201%
WTA - 1.07638517178%
Simona_Halep - 1.04649119441%
TennisChannel - 0.947921863607%
ATPWorldTour - 0.828850919947%
GarbiMuguruza - 0.787544714725%
francetvsport - 0.786332796723%
delpotrojuan - 0.746339502669%
CaroGarcia - 0.694126035432%
TimeaOfficial - 0.531830016381%
Gael_Monfils - 0.467598362295%
alizecornet - 0.427605068241%
tsonga7 - 0.41467794289%
KaPliskova - 0.386197869852%
--------
showed/all: 48.0735553432%

In [34]:
# switch to brussels collection
# Rebinds the shared `collection` name: every cell that reads `collection` and
# is executed after this one queries db[collection_name2] instead of the
# collection the name was bound to before.
collection = db[collection_name2]

In [35]:
# find mentions in Brussels terror attack dataset
# The collection is addressed by name rather than through the shared
# `collection` variable, so this cell counts the Brussels data regardless of
# what `collection` happens to be bound to when it is (re-)run.
# NOTE(review): the field key is literally "text\n" (with a trailing newline);
# presumably an artefact of how the data was imported - confirm against the
# stored documents.
mentioned_brussels = Counter()
for rec in db[collection_name2].find():
    # Each "@name" token in the text counts as one mention of "name".
    mentions = re.findall(r"@(\w+)", rec["text\n"])
    mentioned_brussels.update(mentions)

In [36]:
# Total number of mentions, as a float so the shares below are fractional.
n = float(sum(mentioned_brussels.values()))

# Each print() gets a single string, so the text written is the same as with
# the Python 2 print statement.
print("mentioned users: " + str(len(mentioned_brussels))) # mentioned users
print("--------")

# k most mentioned
# The loop variable is `key`, not `k`: unpacking into `k` would replace the
# top-k setting with a user name, and any report cell re-run afterwards would
# pass that string to most_common().
showed_brussels = 0
for key, v in mentioned_brussels.most_common(k):
    print(key + " - " + str(v/n*100) + "%")
    showed_brussels += v

print("--------")
print("showed/all: " + str(showed_brussels/n*100) + "%") # shown/all percentage


mentioned users: 2213
--------
zpz_polbru - 2.07156308851%
CrisiscenterBE - 1.74199623352%
rtlinfo - 0.988700564972%
STIBMIVB - 0.941619585687%
CNN - 0.906308851224%
p_vanostaeyen - 0.706214689266%
POTUS - 0.623822975518%
RTBFinfo - 0.576741996234%
EU_Commission - 0.553201506591%
TRobinsonNewEra - 0.553201506591%
FoxNews - 0.54143126177%
realDonaldTrump - 0.529661016949%
MeherSlimene - 0.517890772128%
DavidAntoine - 0.482580037665%
nytimes - 0.482580037665%
MailOnline - 0.459039548023%
BBCWorld - 0.459039548023%
YouTube - 0.459039548023%
GhadakpourN - 0.447269303202%
BFMTV - 0.400188323917%
--------
showed/all: 14.4420903955%