notebook.community

Edit and run



In [1]:

    
import cPickle
import os.path

# Read the Media Cloud API key from a pickle stored in the user's home directory.
# The file is opened in binary mode ('rb'), which is what pickle data requires,
# and through a context manager so the handle is closed as soon as the key is read.
# NOTE(review): unpickling can execute arbitrary code if the file has been
# tampered with -- only load a key file you created yourself.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'rb' ) as api_key_file:
    api_key = cPickle.load( api_key_file )



In [3]:

    
import cPickle
import os.path

# Persist the API key to a pickle in the user's home directory.
# The context manager guarantees the file is flushed and closed even if
# pickling raises, instead of relying on garbage collection of the handle.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) as api_key_file:
    cPickle.dump( api_key, api_key_file )



In [2]:

    
import mediacloud, json
# Media Cloud API client constructed with api_key; a later cell pages through
# mc.mediaList() with it.
mc = mediacloud.api.MediaCloud(api_key)



In [3]:

    
import sys

# Make the parent directory importable (presumably so the solr_reimport module
# imported in the next cell can be found -- confirm).
# Guarded so that re-running this cell does not keep appending duplicate entries.
# NOTE(review): '../' is resolved against the kernel's working directory.
if '../' not in sys.path:
    sys.path.append('../')
sys.path









    Out[3]:





['',
 '/usr/local/lib/python2.7/dist-packages/supervisor-3.0-py2.7.egg',
 '/usr/local/lib/python2.7/dist-packages/meld3-0.6.10-py2.7.egg',
 '/usr/lib/python2.7',
 '/usr/lib/python2.7/plat-x86_64-linux-gnu',
 '/usr/lib/python2.7/lib-tk',
 '/usr/lib/python2.7/lib-old',
 '/usr/lib/python2.7/lib-dynload',
 '/usr/local/lib/python2.7/dist-packages',
 '/usr/lib/python2.7/dist-packages',
 '/usr/lib/python2.7/dist-packages/PILcompat',
 '/usr/lib/python2.7/dist-packages/gtk-2.0',
 '/usr/lib/pymodules/python2.7',
 '/usr/lib/python2.7/dist-packages/ubuntu-sso-client',
 '/usr/lib/python2.7/dist-packages/ubuntuone-client',
 '/usr/lib/python2.7/dist-packages/ubuntuone-control-panel',
 '/usr/lib/python2.7/dist-packages/ubuntuone-storage-protocol',
 '/usr/local/lib/python2.7/dist-packages/IPython/extensions',
 '../']



In [4]:

    
import solr_reimport
import psycopg2



In [5]:

    
# psycopg2.extras is a separate submodule: `import psycopg2` alone does not
# guarantee it is loaded, so import it explicitly before touching DictCursor
# rather than relying on some other module having imported it first.
import psycopg2.extras

# Open the database connection through the project helper.
conn = solr_reimport.connect_to_database()
# DictCursor rows can be indexed by column name as well as by position.
cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)



In [6]:

    
# Media Cloud media_id (kept as a string) -> human-readable outlet name.
MEDIA = {
    '1':     'new york times',
    '2':     'washington post',
    '6':     'la times',
    '7':     'new york post',
    '1150':  'wall street journal',
    '1757':  'salon',
    '1707':  'daily beast',
    '1750':  'telegraph',
    '314':   'huffington post',
    '27502': 'huffington post',  # assumed to be the same outlet as 314 for now
}



In [7]:

    
# Monthly count of first-sequence downloads for stories of media 314 published
# in [2008-01-01, 2014-08-01), skipping feeds whose name contains 'Spider'.
# The pieces below concatenate to exactly the original one-line statement
# (including its doubled spaces and trailing space).
# NOTE(review): stories are selected with media_id=314 but the Spider-feed
# exclusion looks up feeds with media_id=1 -- confirm this is intended.
query = (
    "WITH medium_stories as ( SELECT stories_id from stories where media_id=314 "
    " and date_trunc( 'day', publish_date) >= '2008-01-01' "
    " and date_trunc( 'day', publish_date) < '2014-08-01' "
    " ), medium_story_downloads as ( select * from downloads, medium_stories"
    " where downloads.stories_id = medium_stories.stories_id)"
    " SELECT date_trunc('month', download_time), count(*) from medium_story_downloads"
    " where feeds_id not in (select feeds_id from feeds where media_id=1 and name like '%Spider%'  )"
    " and sequence=1 group by date_trunc( 'month', download_time) "
    " order by  date_trunc( 'month', download_time) "
)
query









    Out[7]:





"WITH medium_stories as ( SELECT stories_id from stories where media_id=314  and date_trunc( 'day', publish_date) >= '2008-01-01'  and date_trunc( 'day', publish_date) < '2014-08-01'  ), medium_story_downloads as ( select * from downloads, medium_stories where downloads.stories_id = medium_stories.stories_id) SELECT date_trunc('month', download_time), count(*) from medium_story_downloads where feeds_id not in (select feeds_id from feeds where media_id=1 and name like '%Spider%'  ) and sequence=1 group by date_trunc( 'month', download_time)  order by  date_trunc( 'month', download_time) "



In [ ]:



In [8]:

    
#media_id = 314
#cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
#cursor.execute("SELECT media_id, date_trunc('month', publish_date), count(*) from stories where media_id = %(media_id)s group by media_id, date_trunc('month', publish_date) ",
#               { 'media_id': media_id } )



In [9]:

    
def get_counts_for_media( conn, media_id ):
    """Return per-month story counts for one media source, ignoring spidered feeds.

    Only stories linked (via feeds_stories_map) to a feed of this media source
    whose name does not contain 'Spider' are counted.

    Parameters:
        conn: an open database connection whose cursor() accepts a
            cursor_factory argument (a psycopg2 connection in this notebook).
        media_id: the media source id bound to the %(media_id)s placeholders.

    Returns:
        A list of plain dicts, one per month, with the keys 'media_id',
        'month' and 'count', sorted ascending by 'month'.
    """
    # psycopg2.extras is a submodule that `import psycopg2` alone does not load;
    # import it here so the function does not depend on another module doing so.
    import psycopg2.extras

    # '%%' is a literal '%' once psycopg2 has substituted the bind parameters.
    sql = """ 
WITH related_feeds as ( SELECT feeds_id from feeds where media_id = %(media_id)s and name not like '%%Spider%%' ),
     non_spidered_stories as (SELECT distinct(stories_id) from feeds_stories_map, related_feeds 
     where related_feeds.feeds_id=feeds_stories_map.feeds_id)
SELECT media_id, date_trunc('month', publish_date) as month, count(*) from stories, non_spidered_stories 
where media_id = %(media_id)s and stories.stories_id = non_spidered_stories.stories_id 
group by media_id, date_trunc('month', publish_date)
"""

    cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    try:
        # Only media_id is referenced by the query, so it is the only bind value.
        cursor.execute( sql, { 'media_id': media_id } )
        fetched = cursor.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cursor.close()

    # Convert each row to a plain dict so the result is independent of the cursor.
    return [ dict(row) for row in sorted(fetched, key=lambda r: r['month']) ]



In [10]:

    
from operator import itemgetter, attrgetter
import pandas

# NOTE(review): media_ids is not referenced by the loop below, which walks
# every key of MEDIA instead.
media_ids = [ 1757, 314 ]

# Gather the per-month rows of every outlet into one flat list of dicts.
results = []
for media_id in MEDIA:
    # Progress line: outlet name followed by its id.
    print("%s %s" % (MEDIA[media_id], media_id))
    results += get_counts_for_media( conn, media_id )

# One row per (media_id, month); also saved as CSV for use outside the notebook.
df = pandas.DataFrame( results )
df.to_csv( '/tmp/counts.csv')
df









    



new york times 1
huffington post 27502
washington post 2
new york post 7
la times 6
salon 1757
daily beast 1707
huffington post 314
wall street journal 1150
telegraph 1750






    



/usr/local/lib/python2.7/dist-packages/pandas/io/gbq.py:9: UserWarning: Module dap was already imported from None, but /usr/lib/python2.7/dist-packages is being added to sys.path
  import pkg_resources






    Out[10]:






  
    
      
      count
      media_id
      month
    
  
  
    
      0  
           2
          1
      1888-06-01
    
    
      1  
           9
          1
      1967-01-01
    
    
      2  
           2
          1
      1970-01-01
    
    
      3  
          37
          1
      1999-11-01
    
    
      4  
           1
          1
      2001-01-01
    
    
      5  
           1
          1
      2005-08-01
    
    
      6  
           1
          1
      2005-10-01
    
    
      7  
          24
          1
      2006-01-01
    
    
      8  
          24
          1
      2006-02-01
    
    
      9  
          15
          1
      2006-03-01
    
    
      10 
           2
          1
      2006-05-01
    
    
      11 
           2
          1
      2006-06-01
    
    
      12 
           2
          1
      2006-07-01
    
    
      13 
           6
          1
      2006-08-01
    
    
      14 
           2
          1
      2006-09-01
    
    
      15 
           3
          1
      2006-10-01
    
    
      16 
           2
          1
      2006-11-01
    
    
      17 
           8
          1
      2006-12-01
    
    
      18 
           8
          1
      2007-01-01
    
    
      19 
           7
          1
      2007-02-01
    
    
      20 
           4
          1
      2007-03-01
    
    
      21 
           3
          1
      2007-04-01
    
    
      22 
          12
          1
      2007-05-01
    
    
      23 
           7
          1
      2007-06-01
    
    
      24 
          39
          1
      2007-07-01
    
    
      25 
           4
          1
      2007-08-01
    
    
      26 
           9
          1
      2007-09-01
    
    
      27 
          15
          1
      2007-10-01
    
    
      28 
          32
          1
      2007-11-01
    
    
      29 
          35
          1
      2007-12-01
    
    
      ...
      ...
      ...
      ...
    
    
      909
        9762
       1750
      2012-03-01
    
    
      910
        9476
       1750
      2012-04-01
    
    
      911
       10433
       1750
      2012-05-01
    
    
      912
       10056
       1750
      2012-06-01
    
    
      913
        9454
       1750
      2012-07-01
    
    
      914
       10023
       1750
      2012-08-01
    
    
      915
       10887
       1750
      2012-09-01
    
    
      916
       12130
       1750
      2012-10-01
    
    
      917
       12194
       1750
      2012-11-01
    
    
      918
       10437
       1750
      2012-12-01
    
    
      919
       12183
       1750
      2013-01-01
    
    
      920
       10997
       1750
      2013-02-01
    
    
      921
       11621
       1750
      2013-03-01
    
    
      922
       11090
       1750
      2013-04-01
    
    
      923
       10991
       1750
      2013-05-01
    
    
      924
       11070
       1750
      2013-06-01
    
    
      925
       11625
       1750
      2013-07-01
    
    
      926
       11002
       1750
      2013-08-01
    
    
      927
       11375
       1750
      2013-09-01
    
    
      928
       12735
       1750
      2013-10-01
    
    
      929
       12239
       1750
      2013-11-01
    
    
      930
       10786
       1750
      2013-12-01
    
    
      931
       11180
       1750
      2014-01-01
    
    
      932
       10005
       1750
      2014-02-01
    
    
      933
       10830
       1750
      2014-03-01
    
    
      934
       10970
       1750
      2014-04-01
    
    
      935
       11308
       1750
      2014-05-01
    
    
      936
       11524
       1750
      2014-06-01
    
    
      937
       11884
       1750
      2014-07-01
    
    
      938
        6583
       1750
      2014-08-01
    
  

939 rows × 3 columns



In [14]:

    
# datetime is not imported in any visible earlier cell, so import it here;
# without this the cell only runs when the kernel happens to have it loaded.
import datetime

# Keep only the months inside the half-open window [2008-01-01, 2014-08-01).
window_start = datetime.datetime(2008, 1, 1, 0, 0)
window_end = datetime.datetime(2014, 8, 1, 0, 0)
results_valid_dates = [ result for result in results
                        if window_start <= result['month'] < window_end ]

df = pandas.DataFrame( results_valid_dates )

df.to_csv( '/tmp/publication_counts.csv')
df









    Out[14]:






  
    
      
      count
      media_id
      month
    
  
  
    
      0  
          14
          1
      2008-01-01
    
    
      1  
          16
          1
      2008-02-01
    
    
      2  
          96
          1
      2008-03-01
    
    
      3  
        3752
          1
      2008-04-01
    
    
      4  
        8431
          1
      2008-05-01
    
    
      5  
        5394
          1
      2008-06-01
    
    
      6  
        6763
          1
      2008-07-01
    
    
      7  
        8158
          1
      2008-08-01
    
    
      8  
        8009
          1
      2008-09-01
    
    
      9  
       10040
          1
      2008-10-01
    
    
      10 
        9398
          1
      2008-11-01
    
    
      11 
        8915
          1
      2008-12-01
    
    
      12 
        9040
          1
      2009-01-01
    
    
      13 
        9421
          1
      2009-02-01
    
    
      14 
       10804
          1
      2009-03-01
    
    
      15 
       10827
          1
      2009-04-01
    
    
      16 
       10397
          1
      2009-05-01
    
    
      17 
       11202
          1
      2009-06-01
    
    
      18 
        8099
          1
      2009-07-01
    
    
      19 
        7129
          1
      2009-08-01
    
    
      20 
        8101
          1
      2009-09-01
    
    
      21 
       11543
          1
      2009-10-01
    
    
      22 
        9300
          1
      2009-11-01
    
    
      23 
        9106
          1
      2009-12-01
    
    
      24 
       12378
          1
      2010-01-01
    
    
      25 
        9122
          1
      2010-02-01
    
    
      26 
       12578
          1
      2010-03-01
    
    
      27 
       11773
          1
      2010-04-01
    
    
      28 
       11373
          1
      2010-05-01
    
    
      29 
       12565
          1
      2010-06-01
    
    
      ...
      ...
      ...
      ...
    
    
      636
        9341
       1750
      2012-02-01
    
    
      637
        9762
       1750
      2012-03-01
    
    
      638
        9476
       1750
      2012-04-01
    
    
      639
       10433
       1750
      2012-05-01
    
    
      640
       10056
       1750
      2012-06-01
    
    
      641
        9454
       1750
      2012-07-01
    
    
      642
       10023
       1750
      2012-08-01
    
    
      643
       10887
       1750
      2012-09-01
    
    
      644
       12130
       1750
      2012-10-01
    
    
      645
       12194
       1750
      2012-11-01
    
    
      646
       10437
       1750
      2012-12-01
    
    
      647
       12183
       1750
      2013-01-01
    
    
      648
       10997
       1750
      2013-02-01
    
    
      649
       11621
       1750
      2013-03-01
    
    
      650
       11090
       1750
      2013-04-01
    
    
      651
       10991
       1750
      2013-05-01
    
    
      652
       11070
       1750
      2013-06-01
    
    
      653
       11625
       1750
      2013-07-01
    
    
      654
       11002
       1750
      2013-08-01
    
    
      655
       11375
       1750
      2013-09-01
    
    
      656
       12735
       1750
      2013-10-01
    
    
      657
       12239
       1750
      2013-11-01
    
    
      658
       10786
       1750
      2013-12-01
    
    
      659
       11180
       1750
      2014-01-01
    
    
      660
       10005
       1750
      2014-02-01
    
    
      661
       10830
       1750
      2014-03-01
    
    
      662
       10970
       1750
      2014-04-01
    
    
      663
       11308
       1750
      2014-05-01
    
    
      664
       11524
       1750
      2014-06-01
    
    
      665
       11884
       1750
      2014-07-01
    
  

666 rows × 3 columns



In [5]:

    
# Page through the full media list, 1000 entries per request. Each request asks
# for entries after the last media_id already seen; an empty page ends the loop.
all_media = []
last_media_id = 0

while True:
    media = mc.mediaList( last_media_id=last_media_id, rows=1000)
    # Progress line: cursor position, size of this page, total collected so far.
    print("%s %s %s" % (last_media_id, len( media ), len( all_media )))

    if not media:
        break

    all_media.extend(media)
    # Resume the next request after the last entry of this page.
    last_media_id = media[-1]['media_id']

len(all_media)









    



0 1000 0
1253 1000 1000
2333 1000 2000
3358 1000 3000
4364 1000 4000
5378 1000 5000
6392 1000 6000
7418 1000 7000
8419 1000 8000
9419 1000 9000
10419 1000 10000
11419 1000 11000
12420 1000 12000
13422 1000 13000
14424 1000 14000
15424 1000 15000
16425 1000 16000
17425 1000 17000
18655 1000 18000
19655 1000 19000
20657 1000 20000
21658 1000 21000
22658 1000 22000
23660 1000 23000
24811 1000 24000
25811 1000 25000
26811 1000 26000
27820 1000 27000
29019 1000 28000
30019 1000 29000
31019 1000 30000
32019 1000 31000
33019 1000 32000
34019 1000 33000
35019 1000 34000
39076 1000 35000
40076 1000 36000
41076 1000 37000
42077 1000 38000
43077 1000 39000
44077 1000 40000
45077 1000 41000
46077 1000 42000
47077 1000 43000
48077 1000 44000
49077 1000 45000
50077 1000 46000
51077 1000 47000
52078 1000 48000
53103 1000 49000
54103 1000 50000
55880 1000 51000
56880 1000 52000
57880 1000 53000
58880 1000 54000
59880 1000 55000
60880 1000 56000
61880 1000 57000
62880 1000 58000
63880 1000 59000
64880 1000 60000
65880 1000 61000
67013 1000 62000
68014 1000 63000
69015 1000 64000
70015 1000 65000
71016 1000 66000
72017 1000 67000
73017 1000 68000
74017 1000 69000
75018 1000 70000
76018 1000 71000
77018 1000 72000
78018 1000 73000
79019 1000 74000
80019 1000 75000
81019 1000 76000
82020 1000 77000
83021 1000 78000
84063 1000 79000
85072 1000 80000
86072 1000 81000
87072 1000 82000
88072 1000 83000
89072 1000 84000
90072 1000 85000
91089 1000 86000
92089 1000 87000
93089 1000 88000
94089 1000 89000
95089 1000 90000
96089 1000 91000
97091 1000 92000
98091 1000 93000
99091 1000 94000
100091 1000 95000
101091 1000 96000
102268 1000 97000
103306 1000 98000
104306 551 99000
104857 0 99551






    Out[5]:





99551

import mediacloud, json
mc = mediacloud.api.MediaCloud(api_key)

	count	media_id	month
0	2	1	1888-06-01
1	9	1	1967-01-01
2	2	1	1970-01-01
3	37	1	1999-11-01
4	1	1	2001-01-01
5	1	1	2005-08-01
6	1	1	2005-10-01
7	24	1	2006-01-01
8	24	1	2006-02-01
9	15	1	2006-03-01
10	2	1	2006-05-01
11	2	1	2006-06-01
12	2	1	2006-07-01
13	6	1	2006-08-01
14	2	1	2006-09-01
15	3	1	2006-10-01
16	2	1	2006-11-01
17	8	1	2006-12-01
18	8	1	2007-01-01
19	7	1	2007-02-01
20	4	1	2007-03-01
21	3	1	2007-04-01
22	12	1	2007-05-01
23	7	1	2007-06-01
24	39	1	2007-07-01
25	4	1	2007-08-01
26	9	1	2007-09-01
27	15	1	2007-10-01
28	32	1	2007-11-01
29	35	1	2007-12-01
...	...	...	...
909	9762	1750	2012-03-01
910	9476	1750	2012-04-01
911	10433	1750	2012-05-01
912	10056	1750	2012-06-01
913	9454	1750	2012-07-01
914	10023	1750	2012-08-01
915	10887	1750	2012-09-01
916	12130	1750	2012-10-01
917	12194	1750	2012-11-01
918	10437	1750	2012-12-01
919	12183	1750	2013-01-01
920	10997	1750	2013-02-01
921	11621	1750	2013-03-01
922	11090	1750	2013-04-01
923	10991	1750	2013-05-01
924	11070	1750	2013-06-01
925	11625	1750	2013-07-01
926	11002	1750	2013-08-01
927	11375	1750	2013-09-01
928	12735	1750	2013-10-01
929	12239	1750	2013-11-01
930	10786	1750	2013-12-01
931	11180	1750	2014-01-01
932	10005	1750	2014-02-01
933	10830	1750	2014-03-01
934	10970	1750	2014-04-01
935	11308	1750	2014-05-01
936	11524	1750	2014-06-01
937	11884	1750	2014-07-01
938	6583	1750	2014-08-01

	count	media_id	month
0	14	1	2008-01-01
1	16	1	2008-02-01
2	96	1	2008-03-01
3	3752	1	2008-04-01
4	8431	1	2008-05-01
5	5394	1	2008-06-01
6	6763	1	2008-07-01
7	8158	1	2008-08-01
8	8009	1	2008-09-01
9	10040	1	2008-10-01
10	9398	1	2008-11-01
11	8915	1	2008-12-01
12	9040	1	2009-01-01
13	9421	1	2009-02-01
14	10804	1	2009-03-01
15	10827	1	2009-04-01
16	10397	1	2009-05-01
17	11202	1	2009-06-01
18	8099	1	2009-07-01
19	7129	1	2009-08-01
20	8101	1	2009-09-01
21	11543	1	2009-10-01
22	9300	1	2009-11-01
23	9106	1	2009-12-01
24	12378	1	2010-01-01
25	9122	1	2010-02-01
26	12578	1	2010-03-01
27	11773	1	2010-04-01
28	11373	1	2010-05-01
29	12565	1	2010-06-01
...	...	...	...
636	9341	1750	2012-02-01
637	9762	1750	2012-03-01
638	9476	1750	2012-04-01
639	10433	1750	2012-05-01
640	10056	1750	2012-06-01
641	9454	1750	2012-07-01
642	10023	1750	2012-08-01
643	10887	1750	2012-09-01
644	12130	1750	2012-10-01
645	12194	1750	2012-11-01
646	10437	1750	2012-12-01
647	12183	1750	2013-01-01
648	10997	1750	2013-02-01
649	11621	1750	2013-03-01
650	11090	1750	2013-04-01
651	10991	1750	2013-05-01
652	11070	1750	2013-06-01
653	11625	1750	2013-07-01
654	11002	1750	2013-08-01
655	11375	1750	2013-09-01
656	12735	1750	2013-10-01
657	12239	1750	2013-11-01
658	10786	1750	2013-12-01
659	11180	1750	2014-01-01
660	10005	1750	2014-02-01
661	10830	1750	2014-03-01
662	10970	1750	2014-04-01
663	11308	1750	2014-05-01
664	11524	1750	2014-06-01
665	11884	1750	2014-07-01