In [115]:
import os, ConfigParser, mediacloud, datetime
import pandas as pd
In [2]:
# load mediacloud API key and topic_id from the [mediacloud] section of app.config
config = ConfigParser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed, so an empty result means app.config was not found.
if not config.read('app.config'):
    raise IOError(
        "Could not read 'app.config'. Create it in the notebook's working "
        "directory with a [mediacloud] section containing 'key' and 'topic_id'."
    )
key = config.get('mediacloud','key')
topic_id = config.get('mediacloud', 'topic_id')
In [42]:
# instantiate mediacloud api clients, both authenticated with the key loaded from app.config
# mc: standard client, used by the cells below for the topic queries
mc = mediacloud.api.MediaCloud(key)
# mca: admin client (not used by any of the cells visible in this notebook section)
mca = mediacloud.api.AdminMediaCloud(key)
We're looking at the US Presidential Election topic in Media Cloud. That's topic ID #1404. This is a set of stories published between Apr 30, 2015 and Nov 7, 2016, queried on the names of the major presidential candidates. The topic is queried from the following media source sets:
The seed query is:
+( fiorina ( scott and walker ) ( ben and carson ) trump ( cruz and -victor ) kasich rubio (jeb and bush) clinton sanders ) AND (+publish_date:[2016-09-30T00:00:00Z TO 2016-11-08T23:59:59Z]) AND ((tags_id_media:9139487 OR tags_id_media:9139458 OR tags_id_media:2453107 OR tags_id_stories:9139487 OR tags_id_stories:9139458 OR tags_id_stories:2453107))
I think this is the same dataset used for this CJR report, "Breitbart-led right-wing media ecosystem altered broader media agenda", but I'm not totally sure.
In [122]:
# this api call takes a minute or two, so the result is cached in network.gexf.
# delete that file to force a fresh download (e.g. after changing topic_id).
if os.path.exists('network.gexf'):
    # read back in binary mode to mirror the 'wb' write below
    with open('network.gexf', 'rb') as f:
        network = f.read()
else:
    network = mc.topicMediaMap(topic_id)
    with open('network.gexf', 'wb') as f:
        f.write(network)
In [123]:
# reload a previously saved network.gexf from disk instead of calling the API again
with open('network.gexf', 'r') as gexf_file:
    network = gexf_file.read()
In [ ]:
In [61]:
# this is the query we're interested in. put the term(s) you want to search for here.
# it is passed to fetch_all_stories() below as the story search query.
query = '( "alt-right" OR "alt right" OR "alternative right" )'
In [119]:
# define function to fetch stories from topic, based on query
def fetch_all_stories(query, topic_id, api=None):
    """Fetch every story in a topic that matches ``query``, following pagination.

    Parameters
    ----------
    query : str
        Search query, passed to ``topicStoryList`` as ``q``.
    topic_id : int or str
        Topic identifier, passed to ``topicStoryList`` as its first argument.
    api : object, optional
        Client exposing ``topicStoryList(topic_id, **kwargs)``. Defaults to the
        notebook-level ``mc`` client, so existing two-argument calls are unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per story, with the columns ``stories_id``, ``media_id``,
        ``media_name``, ``publish_date``, ``media_inlink_count``,
        ``outlink_count``, ``title`` and ``url``. Empty (but with all eight
        columns) when nothing matches.

    Raises
    ------
    KeyError
        If a response has no ``'stories'`` entry or a story lacks one of the
        fields listed above.
    """
    if api is None:
        api = mc  # fall back to the client created at the top of the notebook

    fields = [
        'stories_id',
        'media_id',
        'media_name',
        'publish_date',
        'media_inlink_count',
        'outlink_count',
        'title',
        'url',
    ]
    columns = dict((field, []) for field in fields)

    link_id = None  # None means "first page": no link_id is sent to the API
    while True:
        params = {'q': query}
        if link_id is not None:
            params['link_id'] = link_id
        page = api.topicStoryList(topic_id, **params)

        page_stories = page['stories']
        for field in fields:
            columns[field].extend(story[field] for story in page_stories)

        # A single-page result has no 'next' link. Reading it unconditionally
        # raised KeyError before; treat a missing link as the end of the results.
        link_id = page.get('link_ids', {}).get('next')
        if not page_stories or link_id is None:
            break

    return pd.DataFrame(columns)
In [120]:
# fetch all topic stories matching whatever `query` currently holds (set in the cell above);
# the result is a DataFrame with one row per story
stories = fetch_all_stories(query, topic_id)
In [124]:
# write the fetched stories to a UTF-8 encoded csv file
stories.to_csv('stories_mentioning_altright.csv', encoding='utf-8')
We can get the same data for some other terms...
In [125]:
# same fetch-and-save for the "nasty woman" terms. the query gets its own name so the
# alt-right `query` defined earlier is not overwritten if earlier cells are re-run.
query_nastywomen = '( "nasty woman" OR "nasty women" OR "nastywomen" OR "nastywoman" )'
stories_nastywomen = fetch_all_stories(query_nastywomen, topic_id)
stories_nastywomen.to_csv('stories_mentioning_nastywomen.csv', encoding='utf-8')
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: