In [1]:
import cPickle
import os.path

# Load the Media Cloud API key persisted by the cell below.
# NOTE(review): unpickling executes arbitrary code -- only ever load this
# file if you wrote it yourself (the companion dump cell does).
# Fixes vs original: open in binary mode ('rb') to match the 'wb' used when
# dumping (text mode can corrupt the stream on Windows), and use a context
# manager so the file descriptor is closed instead of leaked.
_key_path = os.path.expanduser('~/mediacloud_api_key.pickle')
with open(_key_path, 'rb') as _key_file:
    api_key = cPickle.load(_key_file)
In [3]:
import cPickle
import os.path

# Persist the API key so later sessions can reload it (see the load cell).
# Fix vs original: the write handle was never closed, which risks a
# truncated pickle if the process exits before the buffer is flushed;
# a context manager guarantees flush + close.
_key_path = os.path.expanduser('~/mediacloud_api_key.pickle')
with open(_key_path, 'wb') as _key_file:
    cPickle.dump(api_key, _key_file)
In [2]:
# Instantiate the Media Cloud API client using the key loaded above.
import json
import mediacloud

mc = mediacloud.api.MediaCloud(api_key)
In [3]:
import sys

# Make the parent directory importable (solr_reimport lives there),
# then echo the search path so the change is visible in the output.
sys.path += ['../']
sys.path
Out[3]:
In [4]:
import solr_reimport
import psycopg2
In [5]:
# 'import psycopg2' alone does NOT expose the .extras submodule, so the
# original raised AttributeError on a fresh kernel; import it explicitly.
import psycopg2.extras

# Open a database connection (connection details live in solr_reimport)
# and a cursor that returns rows as dict-like objects.
conn = solr_reimport.connect_to_database()
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
In [6]:
# Media Cloud media_id (as a string key) -> human-readable outlet name.
MEDIA = {
    '1':     "new york times",
    '2':     "washington post",
    '6':     "la times",
    '7':     "new york post",
    '1150':  "wall street journal",
    '1757':  "salon",
    '1707':  "daily beast",
    '1750':  "telegraph",
    '314':   "huffington post",
    '27502': "huffington post",  # assuming these are the same outlet for now
}
In [7]:
# One-off SQL: monthly counts of first downloads (sequence=1) for stories
# from media_id=314 (Huffington Post) published in [2008-01-01, 2014-08-01),
# excluding downloads that arrived via spider-named feeds.
# NOTE(review): the story filter uses media_id=314 but the spider-feed
# exclusion subquery filters feeds where media_id=1 -- this looks like a
# copy/paste slip; confirm which media_id the feeds filter should use.
query = "WITH medium_stories as ( SELECT stories_id from stories where media_id=314 and date_trunc( 'day', publish_date) >= '2008-01-01' and date_trunc( 'day', publish_date) < '2014-08-01' ), medium_story_downloads as ( select * from downloads, medium_stories where downloads.stories_id = medium_stories.stories_id) SELECT date_trunc('month', download_time), count(*) from medium_story_downloads where feeds_id not in (select feeds_id from feeds where media_id=1 and name like '%Spider%' ) and sequence=1 group by date_trunc( 'month', download_time) order by date_trunc( 'month', download_time) "
query
Out[7]:
In [ ]:
In [8]:
#media_id = 314
#cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
#cursor.execute("SELECT media_id, date_trunc('month', publish_date), count(*) from stories where media_id = %(media_id)s group by media_id, date_trunc('month', publish_date) ",
# { 'media_id': media_id } )
In [9]:
def get_counts_for_media( conn, media_id ):
    """Return per-month story counts for one media source, excluding spidered feeds.

    Counts distinct stories for `media_id` that appear in at least one of the
    source's feeds whose name does not contain 'Spider', grouped by month of
    publish_date.

    :param conn: open psycopg2 connection
    :param media_id: Media Cloud media_id (numeric, or a numeric string --
        Postgres coerces the quoted literal in the comparison)
    :return: list of dicts with keys media_id, month, count, sorted by month
    """
    sql = """
WITH related_feeds as ( SELECT feeds_id from feeds where media_id = %(media_id)s and name not like '%%Spider%%' ),
non_spidered_stories as (SELECT distinct(stories_id) from feeds_stories_map, related_feeds
where related_feeds.feeds_id=feeds_stories_map.feeds_id)
SELECT media_id, date_trunc('month', publish_date) as month, count(*) from stories, non_spidered_stories
where media_id = %(media_id)s and stories.stories_id = non_spidered_stories.stories_id
group by media_id, date_trunc('month', publish_date)
"""
    cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    try:
        # Only %(media_id)s is referenced in the SQL; the original also
        # passed an unused 'media_id1' key, dropped here.
        cursor.execute( sql, { 'media_id': media_id } )
        fetched = cursor.fetchall()
    finally:
        # The original leaked the cursor (close was commented out); release
        # it so server-side resources are freed even on query errors.
        cursor.close()
    return [ dict(result) for result in sorted(fetched, key=lambda r: r['month']) ]
In [10]:
from operator import itemgetter, attrgetter
import pandas
media_ids = [ 1757, 314 ]
results = []
for media_id in MEDIA.keys():
print MEDIA[media_id], media_id
results.extend ( get_counts_for_media( conn, media_id ) )
df = pandas.DataFrame( results )
df.to_csv( '/tmp/counts.csv')
df
Out[10]:
In [14]:
# NOTE(review): `datetime` was used but never imported anywhere in the
# notebook, so this cell raised NameError on a fresh kernel.
import datetime

# Restrict results to the analysis window [2008-01-01, 2014-08-01)
# and write them out. (The original also had a bare mid-cell
# `results_valid_dates` expression that displayed nothing; removed.)
WINDOW_START = datetime.datetime(2008, 1, 1)
WINDOW_END = datetime.datetime(2014, 8, 1)
results_valid_dates = [ result for result in results
                        if WINDOW_START <= result['month'] < WINDOW_END ]
df = pandas.DataFrame( results_valid_dates )
df.to_csv( '/tmp/publication_counts.csv')
df
Out[14]:
In [5]:
all_media = []
last_media_id = 0
while True:
media = mc.mediaList( last_media_id=last_media_id, rows=1000)
print last_media_id, len( media ), len( all_media )
if len(media) == 0:
break
last_media_id = media[-1]['media_id']
last_media_id
all_media.extend(media)
len(all_media)
Out[5]:
# NOTE(review): these lines were a garbled duplicate of the client-setup
# cell, with two statements fused onto single lines (a SyntaxError as
# written). Restored to the intended form, once.
import mediacloud, json
mc = mediacloud.api.MediaCloud(api_key)