Setup


In [1]:
import pymongo
# MongoClient() is called without arguments, so pymongo's default host and
# port are used.
client = pymongo.MongoClient()
# Attribute access is pymongo's shorthand for client['eumssi_db'] and
# db['content_items'].
db = client.eumssi_db
col = db.content_items

Analysis

number of content items


In [2]:
# Size of the whole collection: an unfiltered find() matches every document.
col.find().count()


Out[2]:
1303900

number of tweets


In [3]:
# Tweets are stored under two source labels; a document counts if it has either.
tweet_query = {'source': {'$in': ['Twitter', 'Twitter-DW']}}
col.find(tweet_query).count()


Out[3]:
543734

number of YouTube videos


In [4]:
# Videos are stored under three channel-specific source labels.
youtube_sources = [
    'Youtube-video-GeneralChannel',
    'Youtube-video-dwEnglishChannel',
    'Youtube-video-theguardianChannel',
]
col.find({'source': {'$in': youtube_sources}}).count()


Out[4]:
7826

top 50 hashtags across all tweets


In [5]:
# Hashtag frequencies over all tweets, most frequent first.

# Restrict to tweets and keep nothing but the hashtag texts.
tweets_only = {'$match': {'source': {'$in': ['Twitter', 'Twitter-DW']}}}
hashtags_only = {'$project': {'meta.original.entities.hashtags.text': 1}}

# Identical hashtag lists are counted first, so the following $unwind works on
# one document per distinct list rather than one per tweet.
count_tag_lists = {'$group': {'_id': "$meta.original.entities.hashtags.text",
                              'groupCount': {'$sum': 1}}}
split_tag_lists = {'$unwind': "$_id"}

# Lowercasing folds case variants of a tag together; every list contributes
# the number of tweets that carried it.
count_tags = {'$group': {'_id': {'$toLower': "$_id"},
                         'tagCount': {'$sum': '$groupCount'}}}
most_frequent_first = {'$sort': {'tagCount': -1}}

# NOTE(review): indexing the reply with ['result'] assumes a pymongo version
# whose aggregate() returns the raw command reply rather than a cursor.
top_tags = col.aggregate([
    tweets_only,
    hashtags_only,
    count_tag_lists,
    split_tag_lists,
    count_tags,
    most_frequent_first,
])['result']

In [6]:
# Show the 50 most frequent hashtags, one "<count>\t<tag>" line each.
tag_rows = []
for entry in top_tags[:50]:
    tag_rows.append('\t'.join((str(entry['tagCount']), entry['_id'])))
# A single parenthesised argument prints the same text as the print statement.
print('\n'.join(tag_rows))


280422	fracking
126855	climate
59787	environment
39700	sustainability
37733	nuclear
30295	cop20
14314	energy
14166	shale
12079	climatechange
11746	oilandgas
11099	oil
10254	green
9965	auspol
9615	thorium
9297	natgas
8620	economy
8312	cdnpoli
8120	health
6255	iran
6066	water
5630	gas
5132	usa
5025	gop
4771	frackoff
4715	solar
4596	shalegas
4336	ttip
4331	passabill
4311	anfa
4304	uslatino
4239	workbalance
4009	globalwarming
3992	csr
3855	csg
3525	texas
3512	science
3422	earth
3353	p2
3263	cir
3246	immigration
3242	vets
3225	lgtb
3197	bartonmoss
3192	lima
3072	coal
2982	uranium
2964	eco
2853	nature
2548	tarsands
2510	renewables

top 50 hashtags per language (en, es, de, fr)


In [7]:
# Top 50 hashtags separately for English, Spanish, German and French tweets.
#
# The per-language result is kept in its own variable (lang_top_tags) so this
# cell does not overwrite the global `top_tags` computed over all tweets;
# re-running the cell that prints `top_tags` therefore still shows the
# all-language ranking.

# Stages that do not depend on the language are built once, outside the loop.
hashtag_count_stages = [
    # only keep hashtags
    {'$project': {'meta.original.entities.hashtags.text': 1}},
    # count identical hashtag lists
    {'$group': {'_id': "$meta.original.entities.hashtags.text",
                'groupCount': {'$sum': 1}}},
    # split hashtag lists into single hashtags
    {'$unwind': "$_id"},
    # count individual hashtags, ignoring case
    {'$group': {'_id': {'$toLower': "$_id"},
                'tagCount': {'$sum': '$groupCount'}}},
    # top hashtags first
    {'$sort': {'tagCount': -1}},
]

for lang in ('en', 'es', 'de', 'fr'):
    # only count tweets whose language field equals `lang`
    lang_match = {'$match': {'source': {'$in': ['Twitter', 'Twitter-DW']},
                             'meta.source.inLanguage': lang}}
    # NOTE(review): ['result'] assumes a pymongo version whose aggregate()
    # returns the raw command reply rather than a cursor.
    lang_top_tags = col.aggregate([lang_match] + hashtag_count_stages)['result']

    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('==   ' + lang + '   ==')
    print('\n'.join('\t'.join((str(tag['tagCount']), tag['_id']))
                    for tag in lang_top_tags[:50]))  # pretty print top tags
    print('')  # blank line between languages


==   en   ==
242818	fracking
123546	climate
56756	environment
38254	sustainability
34429	nuclear
29836	cop20
13948	energy
13444	shale
11804	climatechange
10769	oilandgas
10761	oil
9665	green
9445	auspol
9297	thorium
9133	natgas
8572	economy
8151	cdnpoli
7973	health
6136	iran
5891	water
5221	gas
5013	gop
4473	frackoff
4449	usa
4366	solar
4331	passabill
4309	anfa
4304	uslatino
4239	workbalance
3908	shalegas
3889	csr
3792	csg
3604	globalwarming
3385	texas
3309	p2
3290	earth
3263	cir
3255	science
3244	immigration
3240	vets
3225	lgtb
3140	bartonmoss
3119	lima
3017	coal
2956	uranium
2558	nature
2473	tarsands
2436	renewables
2367	irantalks
2331	planet

==   es   ==
19638	fracking
1705	nuclear
810	medioambiente
708	frackingno
446	ucrania
389	climate
346	environment
323	shale
319	cantabria
302	méxico
299	shapoporose
259	oilandgas
258	eeuu
251	reformaenergética
241	Últimahoratve
239	mexico
238	burgos
237	shalegas
220	sustainability
215	renovables
208	españa
201	tamaulipas
201	marcaespaña
197	coahuila
167	reformaenergetica
154	pp
150	agua
149	science
148	nofracking
145	gas
143	argentina
142	nuevayork
138	pemex
136	bbc
135	cop20
133	nl
130	thorium
129	unasur
129	integración
129	eeuusanciones
128	auspol
125	merindades
119	vacamuerta
115	falso
113	ttip
110	prospecciones
102	frackingez
100	energía
95	petróleo
92	nuevolaredo

==   de   ==
13050	fracking
2931	ttip
2458	gentechnik
2410	acta
531	schiefergas
478	wm2014
437	erdgas
332	nofracking
287	eu
285	100000haende
279	usa
258	climate
247	energiewende
231	ewendemo
213	umwelt
201	gasbohren
191	oilandgas
179	piraten
175	spd
165	gas
153	cdu
152	groko
146	ukraine
144	energie
135	shalegas
120	nrw
115	shale
110	gabriel
94	deutschland
93	russland
90	nuclear
88	bigoil
87	engagingindeception
85	nato
83	eid
77	auspol
76	energy
73	ceta
72	natgas
70	oil
69	environment
68	grüne
68	exxon
63	sockpuppet
63	propaganda
62	nokxl
60	co2
58	kohle
57	umweltschutz
56	sustainability

==   fr   ==
1058	fracking
443	climate
424	environment
379	gazdeschiste
134	nuclear
124	schiste
115	cop20
109	oilandgas
103	sustainability
63	jobs
53	pétrole
51	climat
47	cofrentes17
47	solaridad
46	energy
42	shalegas
40	gaz
40	change
38	shale
35	australia
34	metals
34	occupychevron
34	sweden
33	environnement
31	pollution
29	canada
28	polqc
28	québec
28	chevron
27	toxic
27	polcan
27	green
26	climatechange
26	svpol
26	ericgarner
26	investors
26	icantbreathe
25	agriculture
25	gettheffout
24	total
24	holyfieldholywar
22	tafta
22	oil
22	job
22	cdnpoli
22	usa
21	camac
21	bartonmoss
21	france
21	europe

tweet counts per language


In [8]:
# Number of tweets per language, largest language first.
tweets_only = {'$match': {'source': {'$in': ['Twitter', 'Twitter-DW']}}}
language_only = {'$project': {'meta.source.inLanguage': 1}}
count_per_language = {'$group': {'_id': "$meta.source.inLanguage",
                                 'langCount': {'$sum': 1}}}
largest_first = {'$sort': {'langCount': -1}}

# NOTE(review): indexing the reply with ['result'] assumes a pymongo version
# whose aggregate() returns the raw command reply rather than a cursor.
langs = col.aggregate([
    tweets_only,
    language_only,
    count_per_language,
    largest_first,
])['result']

In [9]:
# One "<count>\t<language>" line per language, in the order stored in langs.
# str() is applied to the language as well, so a missing value prints as 'None'.
lang_rows = []
for entry in langs:
    lang_rows.append('\t'.join((str(entry['langCount']), str(entry['_id']))))
# A single parenthesised argument prints the same text as the print statement.
print('\n'.join(lang_rows))


494107	en
22895	es
13412	de
3448	und
2256	fr
1389	it
1015	nl
948	ja
711	pt
513	in
343	ro
332	sk
271	tl
208	ar
195	pl
189	da
186	et
163	sv
126	sl
107	cy
104	no
97	ht
91	tr
77	bs
70	vi
68	fi
51	el
47	ru
44	hu
40	lt
38	hr
35	fa
30	hi
26	is
26	id
16	bg
15	zh
14	ko
12	lv
9	ta
9	uk
9	th
8	None
2	iw
2	bn
1	ne
1	ur
1	sr