fetch_extractor_training_stories



In [1]:
import cPickle
import os.path

# Load the MediaCloud API key from the pickle file in the user's home directory.
# The file is written in binary mode ('wb'), so it is read back in binary mode,
# and the `with` block guarantees the handle is closed after loading.
# NOTE: unpickling can execute arbitrary code -- only load a file you wrote yourself.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'rb' ) as api_key_file:
    api_key = cPickle.load( api_key_file )

In [2]:
import cPickle
import os.path

# Persist the API key to a pickle file in the user's home directory.
# The `with` block flushes and closes the file deterministically instead of
# relying on garbage collection of an anonymous file object.
with open( os.path.expanduser( '~/mediacloud_api_key.pickle' ), 'wb' ) as api_key_file:
    cPickle.dump( api_key, api_key_file )

In [3]:
import mediacloud, json
# MediaCloud API client built from the loaded api_key; the cells below use it
# for sentenceList() and story() calls.
mc = mediacloud.api.MediaCloud(api_key)

In [4]:
def fetch_1000_stories_id_from_query( solr_query ):
    """Return the stories_id of up to 1000 randomly sorted first sentences.

    The given Solr query is combined with ``+sentence_number:0`` so that only
    one sentence (sentence number 0) per story is requested, and the request
    is made through the module-level ``mc`` client with a random sort.

    Args:
        solr_query: Solr query fragment ANDed onto the sentence filter.

    Returns:
        A list with the ``stories_id`` of every document in the response.
    """
    first_sentence_query = "+sentence_number:0 AND {} ".format( solr_query )

    result = mc.sentenceList( solr_query=first_sentence_query,
                              sort=mc.SORT_RANDOM,
                              rows=1000 )

    stories_ids = []
    for sentence in result['response']['docs']:
        stories_ids.append( sentence['stories_id'] )
    return stories_ids

In [5]:
import sys

# Add the parent directory to the import path -- presumably so that the
# extractor_utils module imported next can be found there (TODO confirm).
sys.path.append('../')

import extractor_utils

# Database connection shared by downloads_id_from_stories_id() below.
conn = extractor_utils.get_chloe_db_connection()

import psycopg2
import psycopg2.extras

def downloads_id_from_stories_id( stories_id ):
    """Look up the downloads_id of a story's successful content download.

    Runs a parameterized query on the module-level ``conn`` against the
    ``downloads`` table, keeping rows for ``stories_id`` whose type is
    'content', sequence is 1 and state is 'success'.

    Args:
        stories_id: value bound to the ``stories_id`` query parameter.

    Returns:
        The ``downloads_id`` column of the first row returned by the query.

    Raises:
        IndexError: if the query returns no rows for ``stories_id``.
    """
    cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    try:
        cursor.execute( "SELECT * from downloads where stories_id = %(stories_id)s and type='content' and sequence = 1 AND state='success'" ,
                       { 'stories_id': stories_id } )
        fetched = cursor.fetchall()
    finally:
        # This function is called once per story, so always release the
        # cursor -- even when the query fails -- rather than leaking it.
        cursor.close()

    if not fetched:
        # Same exception type as indexing an empty result, but with context
        # that identifies which story had no matching download.
        raise IndexError( "no matching download row for stories_id {0}".format( stories_id ) )

    return fetched[0]['downloads_id']

In [6]:
# Training-set definitions. Each entry has a 'name' plus either a 'media_tag'
# or a 'story_tag' id, which the loop below turns into a Solr tag filter and
# into part of the output file name.
#
# Every assignment replaces the previous one, so only the LAST list is
# actually processed; the earlier lists are kept as alternative configurations.
training_sets = [
    { 'name': 'egypt_composite_dalia_20140425', 'media_tag': 8878255 },
    { 'name': 'russian', 'media_tag': 7796878 },
]

training_sets = [
    { 'name': 'pew_knight_study', 'media_tag': 2453107 },
    { 'name': 'ap_english_us_top25_20100110', 'media_tag': 8875027 },
]

training_sets = [
    { 'name': 'spidered', 'story_tag': 8875452 },
]


for training_set in training_sets:
    
    print 'training_set', training_set['name']
    
    query = '+publish_date:[2014-01-01T00:00:00Z TO 2014-10-01T00:00:00Z}'
    if 'media_tag' in training_set:
        query += ' AND tags_id_media:{0} '.format( training_set['media_tag'] )
        
    if 'story_tag' in training_set:
        query += ' AND tags_id_stories:{0} '.format( training_set['story_tag'] )
       
    #print query
    
    stories_ids = fetch_1000_stories_id_from_query( query )
    
    download_ids = [ downloads_id_from_stories_id( stories_id ) for stories_id in stories_ids ]
    
    output_file_name = training_set['name']
    #print download_ids
    if 'media_tag' in training_set:
        output_file_name += '_media_tag_' + str(training_set['media_tag'])
    
    if 'story_tag' in training_set:
        output_file_name += '_story_tag_' + str(training_set['story_tag'])
        
    output_file_name += '_downloads_ids.txt'
    with open ( output_file_name , 'wb' ) as f:
        f.write( "\n".join([ str( d) for d in download_ids ] ) )


training_set spidered

In [7]:
# Spot check: fetch one story by id through the API client and print its URL.
print mc.story( 237688812 )['url']


http://lenta.ru/news/2014/06/01/quatar/

In [8]:
# Re-run the stories_id -> downloads_id lookup for the stories_ids left over
# from the last training set processed above, so the notebook displays the list.
[ downloads_id_from_stories_id( stories_id ) for stories_id in stories_ids ]


Out[8]:
[565774715,
 560908068,
 529645874,
 536621628,
 562541326,
 577226126,
 539614415,
 562253854,
 562360768,
 562328219,
 540246359,
 562376891,
 560751009,
 562157715,
 540448548,
 539639827,
 561605351,
 540356681,
 566726127,
 561915362,
 581963231,
 562427761,
 562384605,
 560707859,
 560460180,
 561202964,
 529985569,
 539868781,
 562373238,
 539913903,
 536587708,
 562329093,
 529645941,
 561915198,
 562483340,
 561593245,
 561827756,
 591998702,
 561947033,
 562360417,
 540063014,
 561033274,
 562736854,
 539947916,
 562456052,
 577076453,
 540356543,
 539784484,
 562338351,
 560535896,
 561903669,
 536720884,
 561863129,
 560908980,
 566175982,
 576905622,
 578167861,
 540167253,
 540199207,
 539535265,
 576838787,
 561144075,
 561086005,
 561045286,
 560883171,
 579387061,
 562366755,
 561215491,
 576906221,
 561278968,
 560814110,
 529897403,
 560936732,
 577226213,
 560871962,
 562715534,
 578636827,
 566839874,
 562408980,
 539709451,
 540337668,
 578156372,
 501420884,
 561405794,
 502622175,
 560937153,
 567387189,
 578544900,
 560768548,
 562229729,
 567149912,
 561488129,
 562704554,
 561174738,
 560752485,
 561748711,
 539444757,
 562043238,
 561836635,
 562191476,
 579044624,
 540643125,
 578124694,
 562445122,
 562378280,
 561387810,
 562513447,
 579604030,
 561931708,
 578294304,
 562691838,
 540148991,
 561837570,
 562052838,
 562354183,
 561936046,
 560249487,
 561854134,
 566220098,
 562482843,
 577070880,
 562392314,
 561776507,
 539393524,
 577268987,
 539483121,
 560262829,
 539444342,
 561114264,
 562393410,
 501442658,
 578153622,
 567132910,
 536622410,
 560920655,
 561123137,
 579012991,
 561604881,
 582858415,
 529698279,
 536689603,
 562742684,
 540628443,
 566592645,
 539482866,
 565620132,
 510954456,
 539445540,
 560707952,
 537518222,
 566202312,
 562148076,
 537501183,
 560470732,
 501421014,
 536631908,
 501409722,
 576826346,
 561076711,
 540612478,
 562657028,
 560459981,
 561094507,
 578156115,
 539415012,
 561903643,
 540167577,
 540327412,
 561738299,
 540412336,
 576800758,
 536697475,
 540373172,
 562601181,
 560872366,
 560497277,
 561828237,
 591968468,
 501431149,
 561344413,
 561913662,
 475080737,
 562457089,
 578589537,
 562484269,
 562621732,
 536586967,
 540509089,
 562529616,
 502653177,
 560369874,
 578156412,
 560798923,
 561010875,
 561011867,
 561747963,
 536621815,
 561894012,
 529792413,
 566090458,
 539464526,
 540475808,
 561143242,
 560828952,
 536737002,
 579140983,
 560881968,
 562653139,
 562116085,
 562667650,
 562002311,
 536721260,
 540126133,
 561827781,
 540513494,
 540223853,
 561094385,
 501453175,
 561827718,
 561108611,
 562418674,
 539411169,
 567150914,
 529860654,
 539699310,
 561143865,
 578103835,
 561827188,
 561085739,
 540246614,
 561177213,
 560689769,
 539392922,
 560398068,
 562147036,
 568333199,
 562602349,
 536596439,
 562040913,
 591998982,
 540207521,
 539535898,
 580343812,
 529985616,
 562011772,
 562353324,
 561661897,
 539855645,
 536817371,
 536721715,
 578210949,
 540627170,
 578653839,
 567068388,
 561243907,
 539693458,
 536828101,
 536679410,
 566194155,
 565606755,
 578210309,
 540412658,
 560378287,
 562042942,
 577076222,
 536628714,
 560909291,
 560538800,
 529751725,
 539382819,
 561775839,
 566546448,
 529897845,
 561094139,
 501466598,
 529645707,
 562435523,
 562201798,
 582057991,
 536622427,
 540124644,
 576800952,
 561870308,
 562229573,
 562360883,
 561883891,
 560529663,
 578974914,
 536670142,
 561903150,
 561346917,
 562445940,
 562228938,
 560339123,
 561792237,
 536833731,
 536858912,
 568314417,
 562137382,
 562321836,
 561801847,
 562692557,
 560351631,
 562384964,
 578371687,
 529632328,
 560857192,
 536826380,
 561214633,
 561143933,
 529878837,
 539453644,
 562633132,
 578364597,
 578713987,
 562610857,
 567594230,
 562033354,
 539423034,
 562574480,
 560620542,
 562640243,
 501471473,
 580296869,
 560310961,
 560417790,
 510955330,
 562622275,
 578520886,
 560416972,
 562325607,
 560339085,
 562328128,
 540350826,
 501420351,
 540124060,
 562704238,
 561619112,
 561203104,
 561163706,
 562393677,
 561122957,
 576713845,
 568333436,
 539387198,
 560842330,
 536842096,
 579065561,
 560653306,
 540305913,
 562436740,
 576579465,
 562691052,
 561174292,
 502622309,
 562581075,
 562210174,
 560535479,
 536670272,
 562743788,
 561163052,
 536641485,
 540674115,
 539482885,
 539473930,
 562512906,
 539403473,
 560426639,
 560814425,
 562097159,
 560826819,
 560689759,
 560350680,
 561101624,
 536632038,
 540347256,
 560408127,
 502637688,
 561894011,
 578156626,
 561162443,
 567194473,
 529898090,
 540192578,
 475092335,
 539383227,
 562076705,
 562691986,
 565922453,
 560884475,
 501438090,
 561229544,
 562679569,
 565679095,
 475169348,
 501461634,
 562068222,
 539483281,
 539722663,
 563071685,
 561092664,
 501409836,
 562513263,
 560614820,
 562385231,
 560330583,
 578397830,
 561164039,
 561867598,
 560417835,
 561132270,
 562359797,
 560267678,
 560856537,
 560767061,
 562167906,
 540412270,
 561098547,
 579482065,
 536670723,
 579140678,
 560641453,
 562580676,
 561737203,
 561758784,
 540356715,
 501471289,
 578415831,
 561166962,
 567048241,
 561884117,
 529916191,
 561739997,
 562613296,
 578519940,
 562086750,
 540519892,
 537525792,
 560496735,
 561748260,
 560516842,
 576852877,
 578384194,
 578750469,
 561369909,
 561864433,
 560694773,
 565919211,
 529766606,
 561917988,
 561999444,
 565708468,
 560378481,
 562178505,
 539739890,
 560814508,
 576839108,
 577270709,
 562505359,
 537510646,
 579575085,
 560378579,
 561837783,
 539382997,
 560398276,
 562646940,
 568305795,
 561775448,
 562810126,
 562647698,
 566750707,
 539895440,
 540159583,
 562043570,
 561014022,
 561592407,
 578195479,
 539883968,
 562228484,
 539930943,
 560847159,
 562401825,
 561775645,
 561045554,
 536818812,
 540469335,
 578113764,
 560334908,
 501483116,
 536648306,
 561174393,
 560751651,
 577015317,
 562591210,
 529897837,
 561045774,
 577212531,
 576758842,
 560871337,
 560835572,
 580215177,
 561633233,
 536712575,
 562242904,
 560484851,
 540067290,
 501421368,
 578639479,
 560998656,
 561131779,
 561769554,
 560301141,
 560302191,
 561054498,
 540674373,
 565867333,
 517283991,
 501462091,
 560905804,
 562087468,
 560397790,
 562328588,
 578357590,
 537446836,
 461191679,
 561076092,
 529712856,
 561278217,
 562147837,
 529879374,
 576927233,
 578945322,
 501460997,
 560942312,
 561820286,
 536826778,
 474791650,
 565707294,
 561633382,
 536833304,
 560708163,
 561215516,
 539403554,
 560380093,
 540536680,
 540400806,
 561837843,
 576750993,
 474790974,
 536848910,
 561894425,
 560216078,
 475099887,
 539403845,
 567133388,
 560201407,
 561739173,
 562401462,
 577005671,
 561854132,
 502679023,
 565722606,
 561244347,
 562454001,
 561476957,
 502622825,
 529632233,
 529792769,
 560611885,
 529537992,
 560483447,
 562053576,
 539869367,
 561075998,
 577260703,
 562021263,
 562645910,
 561848018,
 560517599,
 577093162,
 562076240,
 562201433,
 562393526,
 560517282,
 561649177,
 579638116,
 560271592,
 529698767,
 561132757,
 579007504,
 562425924,
 540092680,
 536850410,
 540007122,
 562514187,
 562342135,
 560909926,
 501409368,
 475118616,
 577075976,
 566330875,
 537518216,
 561992084,
 562148296,
 565722522,
 562457010,
 529698778,
 560426270,
 561818904,
 561447983,
 561101223,
 501482317,
 501431048,
 566318763,
 540205728,
 561151310,
 562242749,
 474773305,
 562218704,
 501409542,
 541544969,
 539900195,
 561843536,
 561792144,
 562342387,
 568343659,
 529632008,
 562514102,
 540399129,
 578550671,
 562410371,
 539473380,
 562334248,
 560417622,
 536728856,
 560497088,
 540214440,
 529715778,
 565755323,
 562116336,
 561278856,
 539415299,
 540457439,
 562328483,
 561649079,
 539535990,
 536858682,
 560216524,
 566135689,
 577270324,
 562418948,
 536697066,
 561132406,
 510955198,
 536820231,
 562565057,
 578520424,
 576995747,
 529672297,
 560302154,
 529698570,
 560282370,
 578968054,
 561818954,
 576812313,
 529712349,
 561055743,
 539535278,
 561914600,
 577076398,
 539930750,
 579718986,
 539807795,
 517285209,
 501453530,
 561936391,
 567064843,
 540628364,
 501483933,
 536834349,
 578240678,
 539463481,
 567572931,
 539415632,
 536662274,
 501401836,
 562496168,
 536995029,
 529792694,
 583549041,
 560296551,
 540192346,
 560311013,
 562410630,
 561659482,
 560535538,
 561048463,
 560857945,
 583197375,
 562670037,
 562401313,
 565620168,
 560398704,
 561229794,
 539898320,
 562704306,
 560497361,
 539382675,
 561275886,
 561158114,
 561738504,
 562484382,
 562631880,
 560745204,
 529897659,
 560408101,
 539393133,
 561489651,
 560611016,
 561189788,
 562654529,
 560302143,
 562201638,
 561810960,
 562116201,
 577092868,
 561101044,
 560345550,
 565678843,
 562607188,
 562253932,
 560752487,
 536661098,
 561114574,
 567420906,
 562644702,
 578698382,
 536697153,
 562329234,
 561863990,
 501442017,
 562450101,
 561369955,
 540356670,
 561883816,
 562409555,
 561648738,
 562320170,
 560378008,
 561728649,
 560360970,
 560357861,
 560689105,
 536617541,
 562393765,
 561785806,
 560923959,
 561837220,
 561347010,
 517276137,
 561943685,
 536713039,
 561504556,
 561230134,
 566090965,
 561591779,
 567526011,
 578104061,
 529712767,
 561617865,
 561903229,
 560389814,
 561817407,
 561698387,
 560205514,
 536676528,
 562033876,
 576572180,
 560517168,
 529684678,
 517276031,
 540390748,
 529712037,
 562484260,
 562677230,
 562655637,
 561957206,
 539473760,
 560301281,
 565892311,
 567014737,
 576925445,
 591999062,
 562584411,
 536621567,
 536850411,
 541544838,
 578507004,
 561578359,
 562254660,
 501409616,
 536994942,
 529665233,
 562168079,
 536995057,
 529684526,
 576852573,
 567512530,
 562456440,
 539432761,
 578210899,
 561388191,
 561759111,
 561758275,
 541545204,
 565907043,
 501409600,
 539394328,
 562621139,
 562087039,
 529773243,
 568333501,
 540235310,
 578397965,
 565594939,
 561730040,
 539483552,
 578103696,
 561863927,
 540149367,
 562715391,
 562647501,
 539473624,
 540167715,
 502664241,
 562377886,
 561758737,
 560199463,
 578210733,
 578124769,
 577075903,
 561489941,
 562179724,
 562633772,
 561646055,
 537462803,
 580136934,
 536622696,
 562528917,
 562327304,
 561617826,
 562656183,
 536689555,
 561123885,
 562495782,
 579614611,
 560497484,
 562691937,
 576905703,
 561424022,
 568321401,
 578520936,
 539440186,
 561768313,
 562255517,
 577035233,
 529670452,
 540148905,
 539807901,
 540356206,
 560942088,
 561094503,
 576927180,
 562410755,
 561123835,
 562248105,
 576768083,
 536670859,
 539067758,
 561801408,
 562335031,
 536631834,
 560842567,
 539433701,
 561629198,
 561933086,
 561740084,
 562436733,
 576926797,
 561133074,
 561819620,
 529707689,
 536820940,
 562378214,
 560301910,
 579542686,
 539675328,
 561904683,
 576759465,
 562305344,
 562249705,
 529658020,
 561279362,
 536689696,
 536858251,
 561177052,
 560836274,
 539623057,
 529879483,
 536712989,
 561085170,
 540030854,
 565853948,
 560360752,
 561931787,
 501409746,
 539510450,
 560311029,
 562343827,
 560299162,
 562427753,
 562334820,
 560198968,
 561489891,
 502633736,
 562611214,
 578210182,
 501467141,
 577050823,
 561490021,
 562329038,
 561151828,
 561529225,
 560413693,
 540458587,
 562529781,
 539722425,
 529637175,
 567136650,
 577183363,
 561707411,
 561112741,
 578803866,
 501453770,
 536841399,
 562542070,
 562734006,
 561244117,
 562326614,
 540235664,
 561843883,
 560826896,
 562763324,
 566853742,
 561932258,
 560350619,
 501453625,
 560360372,
 561143145,
 529984337,
 576768569,
 561113557,
 540347849,
 560821564,
 565582523,
 539415462,
 536680360,
 539790517,
 562191466,
 536688906,
 540730197,
 529791104,
 562353766,
 561854378,
 536849972,
 540448184,
 561863466,
 537538294,
 560330791,
 562033801,
 562122683,
 565558117,
 562704143,
 579614861,
 536679441,
 501451338,
 561769657,
 579388155,
 577196644,
 537432670,
 540479656,
 560767590,
 560407567,
 562514086,
 561894501,
 562231407,
 578114266,
 539424422,
 539623191,
 540083562]