In [1]:
# Import the project-local helper module, then reload it so that edits made to
# its source after the kernel first imported it take effect in this session.
import query_mediacloud_fulltext
reload( query_mediacloud_fulltext)
# Disabled alternative kept for reference -- presumably IPython's deep reload,
# which would also reload the modules this one imports (TODO confirm).
#dreload( query_mediacloud_fulltext)


Out[1]:
<module 'query_mediacloud_fulltext' from 'query_mediacloud_fulltext.pyc'>

In [2]:
import yaml
import mediacloud
import base_meta
# Reload so that edits to base_meta made after the first import are picked up.
reload( base_meta)

# Read the MediaCloud API key from the local config file.
# - The file is opened in a `with` block so the handle is closed as soon as
#   parsing finishes instead of being left to the garbage collector.
# - `yaml.safe_load` is used instead of `yaml.load`: the config only needs
#   plain mappings/strings, and `yaml.load` without an explicit Loader can
#   construct arbitrary Python objects from the file.
with open('config.yaml') as config_file:
    config = yaml.safe_load(config_file)
api_key = config['mediacloud']['api_key']

# API client used by the query cells in this notebook.
mc = mediacloud.api.MediaCloud(api_key)

In [3]:
# Media source to sample from (the response below reports it as
# 'Daily Telegraph').
media_id = 1750

# Filter query: a fixed publish-date window combined with the media source.
query = ''.join([
    '+publish_date:[2010-01-01T00:00:00Z TO 2014-07-26T00:00:00Z] AND +media_id:(',
    str(media_id),
    ')',
])

# Fetch two randomly ordered first sentences that match the filter. In the
# response header the first argument shows up as `q` and `query` as `fq`.
res = mc.sentenceList(
    'sentence_number:1',
    query,
    start=0,
    rows=2,
    sort='random',
)

res


Out[3]:
{u'response': {u'docs': [{u'_version_': 1471981003576705025,
    u'media_id': 1750,
    u'medium_name': u'Daily Telegraph',
    u'processed_stories_id': 128265178,
    u'publish_date': u'2013-05-24 10:35:37',
    u'sentence': u'The tourism agency launched the poll to find a home-grown \u201c8th Wonder of the World\u201d which featured 12 candidates, including well-known attractions such as the Forth Bridge, Edinburgh Castle, and St Andrews golf course.',
    u'sentence_number': 1,
    u'solr_id': u'120305485!1403330022',
    u'stories_id': 120305485,
    u'story_sentences_id': u'1403330022',
    u'url': u'http://telegraph.feedsportal.com/c/32726/f/564430/s/2c593f80/l/0L0Stelegraph0O0Ctravel0Ctravelnews0C10A0A782670CTourist0Eboard0Eaccused0Eover0Epublic0Evote0Bhtml/story01.htm'},
   {u'_version_': 1471952751636250625,
    u'media_id': 1750,
    u'medium_name': u'Daily Telegraph',
    u'processed_stories_id': 170979511,
    u'publish_date': u'2013-12-01 11:30:28',
    u'sentence': u'Affordable party wines to stock up on for Christmas?',
    u'sentence_number': 1,
    u'solr_id': u'181435494!2036599153',
    u'stories_id': 181435494,
    u'story_sentences_id': u'2036599153',
    u'url': u'http://telegraph.feedsportal.com/c/32726/f/568391/s/344b0ae7/sc/26/l/0L0Stelegraph0O0Cfoodanddrink0Cseasonal0Efood0Eand0Edrink0C10A4760A320CThe0Ebest0Ewhite0Eand0Ered0Ewines0Efor0EChristmas0Eparties0Bhtml/story01.htm'}],
  u'numFound': 379557,
  u'start': 0},
 u'responseHeader': {u'QTime': 1770,
  u'params': {u'df': u'sentence',
   u'fq': u'+publish_date:[2010-01-01T00:00:00Z TO 2014-07-26T00:00:00Z] AND +media_id:(1750)',
   u'q': u'sentence_number:1',
   u'rows': u'2',
   u'sort': u'random_1 asc',
   u'start': u'0',
   u'wt': u'json'},
  u'status': 0}}

In [4]:
def extract_metadata(extractor, url=None, file_name=None, stories_id=None, import_full_text=False):
    """Load one story into ``extractor`` and return its metadata.

    The keyword arguments are forwarded unchanged to ``extractor.make_soup``;
    the extractor's accessors are then queried in a fixed order.

    Args:
        extractor: object providing ``make_soup``, ``get_byline``,
            ``get_section`` and ``is_opinion``.
        url: passed through to ``make_soup`` as ``url``.
        file_name: passed through to ``make_soup`` as ``file_name``.
        stories_id: passed through to ``make_soup`` as ``stories_id``.
        import_full_text: passed through to ``make_soup`` as
            ``import_full_text``.

    Returns:
        A 3-tuple ``(byline, section, is_opinion)`` holding the results of
        ``get_byline()``, ``get_section()`` and ``is_opinion()``.
    """
    soup_source = dict(
        url=url,
        file_name=file_name,
        stories_id=stories_id,
        import_full_text=import_full_text,
    )
    extractor.make_soup(**soup_source)

    # Query the accessors in the same order as the returned tuple.
    byline = extractor.get_byline()
    section = extractor.get_section()
    opinion_flag = extractor.is_opinion()
    return byline, section, opinion_flag

In [5]:
import bs4
from bs4 import BeautifulSoup

# Work with the first sentence record returned by the sample query.
s = res[u'response'][u'docs'][0]

# Look up the extractor from the story's own media_id rather than a hard-coded
# '1750', so this cell stays correct if the query is pointed at another
# outlet. MEDIA_BASEMETA_DICTIONARY appears to be keyed by the media id as a
# string, hence the str() conversion.
extractor = query_mediacloud_fulltext.MEDIA_BASEMETA_DICTIONARY[str(s[u'media_id'])]

# Show which article is being processed, then display the
# (byline, section, is_opinion) tuple as the cell output.
print(s['url'])
extract_metadata(extractor, stories_id=s[u'stories_id'], import_full_text=True)


http://telegraph.feedsportal.com/c/32726/f/564430/s/2c593f80/l/0L0Stelegraph0O0Ctravel0Ctravelnews0C10A0A782670CTourist0Eboard0Eaccused0Eover0Epublic0Evote0Bhtml/story01.htm
found
<meta content="Oliver Smith" name="DCSext.author"/>
['meta[name="DCSext.author"]', 'content']
Oliver Smith
Oliver Smith
Out[5]:
('Oliver Smith', 'travel', False)

In [6]:
# Re-run the author selector by hand against the parsed page to check what
# the extractor matched.
author_selector = 'meta[name="DCSext.author"]'
found = extractor.soup.select(author_selector)
print(found)

# The cell output is the `content` attribute of the first matching tag.
first_found = found[0]
first_found['content']


[<meta content="Oliver Smith" name="DCSext.author"/>]
Out[6]:
'Oliver Smith'

In [7]:
# Show the (selector, attribute) pairs the extractor tries for the section,
# then list every <meta> tag on the page so they can be compared by eye.
print(extractor.section_tags)
extractor.soup.select('meta')


[['meta[name="DCSext.cmsSect"]', 'content'], ['meta[name="DCSext.Channel"]', 'content']]
Out[7]:
[<meta content="IE=EmulateIE8" http-equiv="X-UA-Compatible"/>,
 <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>,
 <meta content="VisitScotland has been criticised over its &amp;ldquo;unfair&amp;rdquo; handling of a vote to find 
  Scotland&amp;rsquo;s most popular historical attraction." name="description"/>,
 <meta content="Travel News,Travel" name="keywords"/>,
 <meta content="travel.travelnews" name="tmgads.zone"/>,
 <meta content="travel" name="tmgads.channel"/>,
 <meta content="travel-travelnews" name="tmgads.section"/>,
 <meta content="10078267" name="tmgads.articleid"/>,
 <meta content="story" name="tmgads.pagetype"/>,
 <meta content="3" name="tmgads.level"/>,
 <meta content="" name="tmgads.otherdata"/>,
 <meta content="US" name="tmgads.geo"/>,
 <meta content="2329" name="section-id"/>,
 <meta content="2013-05-24" name="last-modified"/>,
 <meta content="10078267" name="article-id"/>,
 <meta content="Tourist board accused over public vote" name="title"/>,
 <meta content="travel/travelnews" name="GSAMLC"/>,
 <meta content="travel" name="GSAChannel"/>,
 <meta content="Travel" name="GSAChannelName"/>,
 <meta content="travelnews" name="GSACategory"/>,
 <meta content="travel-travel_news" name="GSASectionUniqueName"/>,
 <meta content="Story" name="GSAArticleType"/>,
 <meta content="2013-05-24" name="DC.date.issued"/>,
 <meta content="noarchive,noodp" name="robots"/>,
 <meta content="120118784736295" property="fb:app_id"/>,
 <meta content="686953094,531239902,100002344351237" property="fb:admins"/>,
 <meta content="VisitScotland has been criticised over its &amp;ldquo;unfair&amp;rdquo; handling of a vote to find   Scotland&amp;rsquo;s most popular historical attraction." property="og:description"/>,
 <meta content="Telegraph.co.uk" property="og:site_name"/>,
 <meta content="Tourist board accused over public vote - Telegraph" property="og:title"/>,
 <meta content="http://www.telegraph.co.uk/travel/travelnews/10078267/Tourist-board-accused-over-public-vote.html" property="og:url"/>,
 <meta content="http://i.telegraph.co.uk/multimedia/archive/02572/Dunnottar-Castle_2572156k.jpg" property="og:image"/>,
 <meta content="article" property="og:type"/>,
 <meta content="summary" name="twitter:card"/>,
 <meta content="@Telegraph" name="twitter:site"/>,
 <meta content="@urban_achiever" name="twitter:creator"/>,
 <meta content="/travel/travelnews" name="DCSext.MLC"/>,
 <meta content="travelnews" name="DCSext.Category"/>,
 <meta content="travelnews" name="WT.cg_s"/>,
 <meta content="travel" name="DCSext.Channel"/>,
 <meta content="travel" name="WT.cg_n"/>,
 <meta content="Story" name="DCSext.Content_Type"/>,
 <meta content="3" name="DCSext.Level"/>,
 <meta content="Oliver Smith" name="DCSext.author"/>,
 <meta content="2013-05-24" name="DCSext.articleFirstPublished"/>,
 <meta content="10078267" name="DCSext.articleId"/>,
 <meta content="0" name="DCSext.cf"/>,
 <meta content="0" name="DCSext.cn"/>,
 <meta content="0" name="DCSext.cd"/>]