In [31]:
from pymongo import MongoClient
from collections import defaultdict, Counter
import re
from __future__ import division
# --- configuration ---------------------------------------------------------
# MongoDB database name, and the name of the collection for each dataset.
database_name = "beresi-data"
collection_name1 = "roland_garros_2017"
collection_name2 = "brussels"
# Number of most-mentioned users listed by the report cells.
k = 20

# --- connection ------------------------------------------------------------
# MongoClient() is called without arguments, i.e. with the driver's default
# connection settings.
client = MongoClient()
db = client.get_database(database_name)
# Start on the first collection; a later cell rebinds `collection` to the
# second one.
collection = db.get_collection(collection_name1)
In [32]:
# Tally how many times each user is @-mentioned in the Roland Garros dataset.
# NOTE(review): the record key used below ends with a newline character;
# presumably that is how the field was stored when the data was imported --
# confirm against the collection before "fixing" it.
mention_pattern = re.compile(r"@(\w+)")
mentioned_rg = Counter()
for rec in collection.find():
    # Counter.update adds one to the tally of every handle found in this record.
    mentioned_rg.update(mention_pattern.findall(rec["text\n"]))
In [33]:
# Report for the Roland Garros dataset: number of distinct mentioned users,
# the k most-mentioned users with their share of all mentions, and the share
# of all mentions covered by those k users.
n = float(sum(mentioned_rg.values()))  # total number of mentions, as a float
print("mentioned users: " + str(len(mentioned_rg)))  # distinct mentioned users
print("--------")
# k most mentioned
showed_rg = 0
for key, v in mentioned_rg.most_common(k):
    # The loop only runs when the counter is non-empty, so n > 0 here.
    print(key + " - " + str(v / n * 100) + "%")
    showed_rg += v
print("--------")
# An empty counter gives n == 0.0; report 0% instead of raising
# ZeroDivisionError on the summary line.
showed_share_rg = showed_rg / n * 100 if n else 0.0
print("showed/all: " + str(showed_share_rg) + "%")  # shown/all percentage
In [34]:
# Point `collection` at the Brussels collection for the cells that follow.
# Caution: `collection` is rebound here, so re-running an earlier cell that
# reads it will query this collection instead of the first one.
collection = db.get_collection(collection_name2)
In [35]:
# Tally how many times each user is @-mentioned in the Brussels terror attack
# dataset.
# NOTE(review): the record key used below ends with a newline character;
# presumably that is how the field was stored when the data was imported --
# confirm against the collection before "fixing" it.
mention_pattern = re.compile(r"@(\w+)")
mentioned_brussels = Counter()
for rec in collection.find():
    # Counter.update adds one to the tally of every handle found in this record.
    mentioned_brussels.update(mention_pattern.findall(rec["text\n"]))
In [36]:
# Report for the Brussels dataset: number of distinct mentioned users, the k
# most-mentioned users with their share of all mentions, and the share of all
# mentions covered by those k users.
n = float(sum(mentioned_brussels.values()))  # total number of mentions, as a float
print("mentioned users: " + str(len(mentioned_brussels)))  # distinct mentioned users
print("--------")
# k most mentioned
showed_brussels = 0
# The loop variable is named `key`, not `k`: using `k` here would overwrite the
# module-level constant k with a user name, so any cell run (or re-run)
# afterwards would pass a string to most_common().
for key, v in mentioned_brussels.most_common(k):
    # The loop only runs when the counter is non-empty, so n > 0 here.
    print(key + " - " + str(v / n * 100) + "%")
    showed_brussels += v
print("--------")
# An empty counter gives n == 0.0; report 0% instead of raising
# ZeroDivisionError on the summary line.
showed_share_brussels = showed_brussels / n * 100 if n else 0.0
print("showed/all: " + str(showed_share_brussels) + "%")  # shown/all percentage