In [1]:
import pymongo
# Names of the MongoDB database and collection inspected by this notebook.
DB_NAME = 'eumssi_db'
COLLECTION_NAME = 'content_items'

# MongoClient() is called without arguments, so the driver's default
# connection settings are used.
client = pymongo.MongoClient()
db = client[DB_NAME]
col = db[COLLECTION_NAME]
In [2]:
# Total number of documents in the collection.
# Collection.count() was deprecated in PyMongo 3.7 and removed in 4.0, so use
# count_documents() when the driver provides it. The check is made on the
# class: Collection.__getattr__ returns a sub-collection for unknown names,
# which would make hasattr(col, ...) true even on old drivers.
(col.count_documents({}) if hasattr(type(col), 'count_documents')
 else col.count())
Out[2]:
In [3]:
# Number of items whose source is one of the two Twitter feeds.
tweet_filter = {'source': {'$in': ['Twitter', 'Twitter-DW']}}
# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0; prefer
# count_documents() when the driver class defines it (checked on the class
# because Collection.__getattr__ fabricates sub-collections for unknown names).
(col.count_documents(tweet_filter) if hasattr(type(col), 'count_documents')
 else col.find(tweet_filter).count())
Out[3]:
In [4]:
# Number of items whose source is one of the listed YouTube channels.
youtube_filter = {'source': {'$in': [
    'Youtube-video-GeneralChannel',
    'Youtube-video-dwEnglishChannel',
    'Youtube-video-theguardianChannel',
]}}
# Cursor.count() was deprecated in PyMongo 3.7 and removed in 4.0; prefer
# count_documents() when the driver class defines it (checked on the class
# because Collection.__getattr__ fabricates sub-collections for unknown names).
(col.count_documents(youtube_filter) if hasattr(type(col), 'count_documents')
 else col.find(youtube_filter).count())
Out[4]:
In [5]:
# Hashtag frequencies over all tweets, most frequent first.
hashtag_pipeline = [
    {'$match': {'source': {'$in': ['Twitter', 'Twitter-DW']}}},  # only count tweets
    {'$project': {'meta.original.entities.hashtags.text': 1}},   # only keep hashtags
    # count how often each hashtag group (the per-item list of hashtag texts) occurs
    {'$group': {'_id': "$meta.original.entities.hashtags.text",
                'groupCount': {'$sum': 1}}},
    {'$unwind': "$_id"},                                         # split hashtag groups
    # count individual hashtags case-insensitively, weighting by group frequency
    {'$group': {'_id': {'$toLower': "$_id"},
                'tagCount': {'$sum': '$groupCount'}}},
    {'$sort': {'tagCount': -1}},                                 # top hashtags first
]
agg_result = col.aggregate(hashtag_pipeline)
# PyMongo 2.x returns a dict holding the documents under 'result'; PyMongo 3.0+
# returns a cursor instead. Materialise either form as a list so that
# top_tags stays sliceable.
top_tags = agg_result['result'] if isinstance(agg_result, dict) else list(agg_result)
In [6]:
# Pretty-print the 50 most frequent hashtags as "<count><TAB><hashtag>" lines.
# The single-argument parenthesised print behaves the same on Python 2 and 3.
print('\n'.join(
    '\t'.join((str(tag['tagCount']), tag['_id']))
    for tag in top_tags[:50]
))
In [7]:
# Top 50 hashtags for each of the selected tweet languages.
for lang in ('en', 'es', 'de', 'fr'):
    agg_result = col.aggregate([
        # only count tweets tagged with the current language
        {'$match': {'source': {'$in': ['Twitter', 'Twitter-DW']},
                    'meta.source.inLanguage': lang}},
        {'$project': {'meta.original.entities.hashtags.text': 1}},  # only keep hashtags
        # count how often each hashtag group (per-item list of hashtag texts) occurs
        {'$group': {'_id': "$meta.original.entities.hashtags.text",
                    'groupCount': {'$sum': 1}}},
        {'$unwind': "$_id"},                                        # split hashtag groups
        # count individual hashtags case-insensitively, weighting by group frequency
        {'$group': {'_id': {'$toLower': "$_id"},
                    'tagCount': {'$sum': '$groupCount'}}},
        {'$sort': {'tagCount': -1}},                                # top hashtags first
    ])
    # PyMongo 2.x returns a dict with the documents under 'result'; PyMongo 3.0+
    # returns a cursor. Normalise to a list so slicing works either way.
    top_tags = agg_result['result'] if isinstance(agg_result, dict) else list(agg_result)
    print('== ' + lang + ' ==')
    # one "<count><TAB><hashtag>" line per tag
    print('\n'.join(
        '\t'.join((str(tag['tagCount']), tag['_id']))
        for tag in top_tags[:50]
    ))
    # Blank separator line; a bare `print` would be a no-op on Python 3.
    print('')
In [8]:
# Number of tweets per language, most common language first.
language_pipeline = [
    {'$match': {'source': {'$in': ['Twitter', 'Twitter-DW']}}},  # only count tweets
    {'$project': {'meta.source.inLanguage': 1}},                 # only keep language field
    {'$group': {'_id': "$meta.source.inLanguage",
                'langCount': {'$sum': 1}}},                      # count tweets per language
    {'$sort': {'langCount': -1}},                                # top languages first
]
agg_result = col.aggregate(language_pipeline)
# PyMongo 2.x returns a dict holding the documents under 'result'; PyMongo 3.0+
# returns a cursor instead. Materialise either form as a list.
langs = agg_result['result'] if isinstance(agg_result, dict) else list(agg_result)
In [9]:
# Pretty-print every language as a "<count><TAB><language>" line.
# str() is applied to the group key as well as the count, as in the original.
# The single-argument parenthesised print behaves the same on Python 2 and 3.
print('\n'.join(
    '\t'.join((str(entry['langCount']), str(entry['_id'])))
    for entry in langs
))