mediacloud_source_count



In [1]:
import cPickle
import os.path

api_key = cPickle.load( file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'r' ) )

In [3]:
import cPickle
import os.path

cPickle.dump( api_key, file( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) )

In [2]:
import mediacloud, json
mc = mediacloud.api.MediaCloud(api_key)

In [3]:
import sys

sys.path.append('../')
sys.path


Out[3]:
['',
 '/usr/local/lib/python2.7/dist-packages/supervisor-3.0-py2.7.egg',
 '/usr/local/lib/python2.7/dist-packages/meld3-0.6.10-py2.7.egg',
 '/usr/lib/python2.7',
 '/usr/lib/python2.7/plat-x86_64-linux-gnu',
 '/usr/lib/python2.7/lib-tk',
 '/usr/lib/python2.7/lib-old',
 '/usr/lib/python2.7/lib-dynload',
 '/usr/local/lib/python2.7/dist-packages',
 '/usr/lib/python2.7/dist-packages',
 '/usr/lib/python2.7/dist-packages/PILcompat',
 '/usr/lib/python2.7/dist-packages/gtk-2.0',
 '/usr/lib/pymodules/python2.7',
 '/usr/lib/python2.7/dist-packages/ubuntu-sso-client',
 '/usr/lib/python2.7/dist-packages/ubuntuone-client',
 '/usr/lib/python2.7/dist-packages/ubuntuone-control-panel',
 '/usr/lib/python2.7/dist-packages/ubuntuone-storage-protocol',
 '/usr/local/lib/python2.7/dist-packages/IPython/extensions',
 '../']

In [4]:
import solr_reimport
import psycopg2

In [5]:
conn = solr_reimport.connect_to_database()
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

In [6]:
MEDIA= {
  '1': "new york times",
  '2': "washington post",
  '6':"la times",
  '7': "new york post",
  '1150': "wall street journal",
  '1757': "salon",
  '1707': "daily beast",
  '1750': "telegraph",
  '314' : "huffington post",
"27502":"huffington post" #assuming these are the same for now
}

In [7]:
query = "WITH medium_stories as ( SELECT stories_id from stories where media_id=314  and date_trunc( 'day', publish_date) >= '2008-01-01'  and date_trunc( 'day', publish_date) < '2014-08-01'  ), medium_story_downloads as ( select * from downloads, medium_stories where downloads.stories_id = medium_stories.stories_id) SELECT date_trunc('month', download_time), count(*) from medium_story_downloads where feeds_id not in (select feeds_id from feeds where media_id=1 and name like '%Spider%'  ) and sequence=1 group by date_trunc( 'month', download_time)  order by  date_trunc( 'month', download_time) "
query


Out[7]:
"WITH medium_stories as ( SELECT stories_id from stories where media_id=314  and date_trunc( 'day', publish_date) >= '2008-01-01'  and date_trunc( 'day', publish_date) < '2014-08-01'  ), medium_story_downloads as ( select * from downloads, medium_stories where downloads.stories_id = medium_stories.stories_id) SELECT date_trunc('month', download_time), count(*) from medium_story_downloads where feeds_id not in (select feeds_id from feeds where media_id=1 and name like '%Spider%'  ) and sequence=1 group by date_trunc( 'month', download_time)  order by  date_trunc( 'month', download_time) "

In [ ]:


In [8]:
#media_id = 314
#cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
#cursor.execute("SELECT media_id, date_trunc('month', publish_date), count(*) from stories where media_id = %(media_id)s group by media_id, date_trunc('month', publish_date) ",
#               { 'media_id': media_id } )

In [9]:
def get_counts_for_media( conn, media_id ):

    sql = """ 
WITH related_feeds as ( SELECT feeds_id from feeds where media_id = %(media_id)s and name not like '%%Spider%%' ),
     non_spidered_stories as (SELECT distinct(stories_id) from feeds_stories_map, related_feeds 
     where related_feeds.feeds_id=feeds_stories_map.feeds_id)
SELECT media_id, date_trunc('month', publish_date) as month, count(*) from stories, non_spidered_stories 
where media_id = %(media_id)s and stories.stories_id = non_spidered_stories.stories_id 
group by media_id, date_trunc('month', publish_date)
"""

    #print sql
    cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    cursor.execute( sql , { 'media_id': media_id, 'media_id1': media_id } )
#cursor.close()

    fetched = cursor.fetchall()
    
    return [ dict(result) for result in sorted(fetched, key=lambda r: r['month']  ) ]

In [10]:
from operator import itemgetter, attrgetter
import pandas

media_ids = [ 1757, 314 ]
results = []
for media_id in MEDIA.keys():
    print MEDIA[media_id], media_id
    results.extend ( get_counts_for_media( conn, media_id ) )
    
    
df = pandas.DataFrame( results )    

df.to_csv( '/tmp/counts.csv')
df


new york times 1
huffington post 27502
washington post 2
new york post 7
la times 6
salon 1757
daily beast 1707
huffington post 314
wall street journal 1150
telegraph 1750
/usr/local/lib/python2.7/dist-packages/pandas/io/gbq.py:9: UserWarning: Module dap was already imported from None, but /usr/lib/python2.7/dist-packages is being added to sys.path
  import pkg_resources
Out[10]:
count media_id month
0 2 1 1888-06-01
1 9 1 1967-01-01
2 2 1 1970-01-01
3 37 1 1999-11-01
4 1 1 2001-01-01
5 1 1 2005-08-01
6 1 1 2005-10-01
7 24 1 2006-01-01
8 24 1 2006-02-01
9 15 1 2006-03-01
10 2 1 2006-05-01
11 2 1 2006-06-01
12 2 1 2006-07-01
13 6 1 2006-08-01
14 2 1 2006-09-01
15 3 1 2006-10-01
16 2 1 2006-11-01
17 8 1 2006-12-01
18 8 1 2007-01-01
19 7 1 2007-02-01
20 4 1 2007-03-01
21 3 1 2007-04-01
22 12 1 2007-05-01
23 7 1 2007-06-01
24 39 1 2007-07-01
25 4 1 2007-08-01
26 9 1 2007-09-01
27 15 1 2007-10-01
28 32 1 2007-11-01
29 35 1 2007-12-01
... ... ... ...
909 9762 1750 2012-03-01
910 9476 1750 2012-04-01
911 10433 1750 2012-05-01
912 10056 1750 2012-06-01
913 9454 1750 2012-07-01
914 10023 1750 2012-08-01
915 10887 1750 2012-09-01
916 12130 1750 2012-10-01
917 12194 1750 2012-11-01
918 10437 1750 2012-12-01
919 12183 1750 2013-01-01
920 10997 1750 2013-02-01
921 11621 1750 2013-03-01
922 11090 1750 2013-04-01
923 10991 1750 2013-05-01
924 11070 1750 2013-06-01
925 11625 1750 2013-07-01
926 11002 1750 2013-08-01
927 11375 1750 2013-09-01
928 12735 1750 2013-10-01
929 12239 1750 2013-11-01
930 10786 1750 2013-12-01
931 11180 1750 2014-01-01
932 10005 1750 2014-02-01
933 10830 1750 2014-03-01
934 10970 1750 2014-04-01
935 11308 1750 2014-05-01
936 11524 1750 2014-06-01
937 11884 1750 2014-07-01
938 6583 1750 2014-08-01

939 rows × 3 columns


In [14]:
results_valid_dates = [ result for result in results if result['month'] >= datetime.datetime(2008, 1, 1, 0, 0) and 
                       result['month'] < datetime.datetime(2014, 8, 1, 0, 0) ]
results_valid_dates
df = pandas.DataFrame( results_valid_dates )    

df.to_csv( '/tmp/publication_counts.csv')
df


Out[14]:
count media_id month
0 14 1 2008-01-01
1 16 1 2008-02-01
2 96 1 2008-03-01
3 3752 1 2008-04-01
4 8431 1 2008-05-01
5 5394 1 2008-06-01
6 6763 1 2008-07-01
7 8158 1 2008-08-01
8 8009 1 2008-09-01
9 10040 1 2008-10-01
10 9398 1 2008-11-01
11 8915 1 2008-12-01
12 9040 1 2009-01-01
13 9421 1 2009-02-01
14 10804 1 2009-03-01
15 10827 1 2009-04-01
16 10397 1 2009-05-01
17 11202 1 2009-06-01
18 8099 1 2009-07-01
19 7129 1 2009-08-01
20 8101 1 2009-09-01
21 11543 1 2009-10-01
22 9300 1 2009-11-01
23 9106 1 2009-12-01
24 12378 1 2010-01-01
25 9122 1 2010-02-01
26 12578 1 2010-03-01
27 11773 1 2010-04-01
28 11373 1 2010-05-01
29 12565 1 2010-06-01
... ... ... ...
636 9341 1750 2012-02-01
637 9762 1750 2012-03-01
638 9476 1750 2012-04-01
639 10433 1750 2012-05-01
640 10056 1750 2012-06-01
641 9454 1750 2012-07-01
642 10023 1750 2012-08-01
643 10887 1750 2012-09-01
644 12130 1750 2012-10-01
645 12194 1750 2012-11-01
646 10437 1750 2012-12-01
647 12183 1750 2013-01-01
648 10997 1750 2013-02-01
649 11621 1750 2013-03-01
650 11090 1750 2013-04-01
651 10991 1750 2013-05-01
652 11070 1750 2013-06-01
653 11625 1750 2013-07-01
654 11002 1750 2013-08-01
655 11375 1750 2013-09-01
656 12735 1750 2013-10-01
657 12239 1750 2013-11-01
658 10786 1750 2013-12-01
659 11180 1750 2014-01-01
660 10005 1750 2014-02-01
661 10830 1750 2014-03-01
662 10970 1750 2014-04-01
663 11308 1750 2014-05-01
664 11524 1750 2014-06-01
665 11884 1750 2014-07-01

666 rows × 3 columns


In [5]:
all_media = []

last_media_id = 0

while True:
    media = mc.mediaList( last_media_id=last_media_id, rows=1000)
    print last_media_id, len( media ), len( all_media )

    if len(media) == 0:
        break
        
    last_media_id = media[-1]['media_id']
    last_media_id
    

        
    all_media.extend(media)

len(all_media)


0 1000 0
1253 1000 1000
2333 1000 2000
3358 1000 3000
4364 1000 4000
5378 1000 5000
6392 1000 6000
7418 1000 7000
8419 1000 8000
9419 1000 9000
10419 1000 10000
11419 1000 11000
12420 1000 12000
13422 1000 13000
14424 1000 14000
15424 1000 15000
16425 1000 16000
17425 1000 17000
18655 1000 18000
19655 1000 19000
20657 1000 20000
21658 1000 21000
22658 1000 22000
23660 1000 23000
24811 1000 24000
25811 1000 25000
26811 1000 26000
27820 1000 27000
29019 1000 28000
30019 1000 29000
31019 1000 30000
32019 1000 31000
33019 1000 32000
34019 1000 33000
35019 1000 34000
39076 1000 35000
40076 1000 36000
41076 1000 37000
42077 1000 38000
43077 1000 39000
44077 1000 40000
45077 1000 41000
46077 1000 42000
47077 1000 43000
48077 1000 44000
49077 1000 45000
50077 1000 46000
51077 1000 47000
52078 1000 48000
53103 1000 49000
54103 1000 50000
55880 1000 51000
56880 1000 52000
57880 1000 53000
58880 1000 54000
59880 1000 55000
60880 1000 56000
61880 1000 57000
62880 1000 58000
63880 1000 59000
64880 1000 60000
65880 1000 61000
67013 1000 62000
68014 1000 63000
69015 1000 64000
70015 1000 65000
71016 1000 66000
72017 1000 67000
73017 1000 68000
74017 1000 69000
75018 1000 70000
76018 1000 71000
77018 1000 72000
78018 1000 73000
79019 1000 74000
80019 1000 75000
81019 1000 76000
82020 1000 77000
83021 1000 78000
84063 1000 79000
85072 1000 80000
86072 1000 81000
87072 1000 82000
88072 1000 83000
89072 1000 84000
90072 1000 85000
91089 1000 86000
92089 1000 87000
93089 1000 88000
94089 1000 89000
95089 1000 90000
96089 1000 91000
97091 1000 92000
98091 1000 93000
99091 1000 94000
100091 1000 95000
101091 1000 96000
102268 1000 97000
103306 1000 98000
104306 551 99000
104857 0 99551
Out[5]:
99551

import mediacloud, json mc = mediacloud.api.MediaCloud(api_key)

import mediacloud, json mc = mediacloud.api.MediaCloud(api_key)