In [5]:
import contextlib
import json
import urllib
import urllib2

from IPython.display import Image

In [6]:
class TopicSummarizer(object):
    """
    Our stepwise processor that uses Wikipedia to summarize topics.

    Just instantiate with the topic name, call .process(), and then .get_results()

    The topic is percent-encoded before it is placed into the API URLs, so
    titles containing spaces, ``&``, ``#`` or non-ASCII characters are sent
    as a single, well-formed ``titles`` value.
    """

    def __init__(self, topic):
        """Remember *topic* (coerced with ``unicode()``) as the title to look up."""
        self.topic = unicode(topic)

    def process(self):
        """Fetch the summary text, then the thumbnail URL, and return ``self``.

        Errors from the HTTP request or from JSON decoding propagate
        unchanged. ``KeyError`` is raised when a response has no
        ``'extract'`` or ``'thumbnail'`` entry for the page.
        """
        self._fetch_text()
        self._fetch_thumbnail()
        return self

    def get_results(self, as_text=False):
        """Return what .process() gathered.

        With ``as_text=True`` the result is a single string of the form
        ``'<topic> summary: <text>'``; otherwise it is a ``TopicSummary``.
        Must be called after .process(): the attributes read here are only
        created by the fetch steps.
        """
        if as_text:
            return self.topic + ' summary: ' + self._text
        return TopicSummary(self.topic, self._thumb_url, self._text)

    def _build_url(self, template):
        """Fill the ``{title}`` slot of *template* with the percent-encoded topic."""
        # quote() operates on byte strings in Python 2, so encode to UTF-8
        # first. safe='' escapes '/' as well, keeping the whole title inside
        # one query-string value.
        quoted_title = urllib.quote(self.topic.encode('utf-8'), safe='')
        return template.format(title=quoted_title)

    @staticmethod
    def _first_page(resp):
        """Return the first entry of ``resp['query']['pages']``."""
        # 'pages' is a dict; a single-title query is expected to yield one
        # entry, so the first value is taken.
        return resp['query']['pages'].values()[0]

    def _fetch_text(self):
        """Request the extract for the topic and store it in ``self._text``."""
        self._text_api_url = self._build_url(TEXT_URL_TMPL)
        self._text_resp = self._get_url_json(self._text_api_url)
        self._text = self._first_page(self._text_resp)['extract']

    def _fetch_thumbnail(self):
        """Request the page image and store its URL in ``self._thumb_url``."""
        self._thumb_api_url = self._build_url(THUMB_URL_TMPL)
        self._thumb_resp = self._get_url_json(self._thumb_api_url)
        self._thumb_url = self._first_page(self._thumb_resp)['thumbnail']['source']

    def _get_url_json(self, url):
        """GET *url* and return the response body decoded from JSON."""
        # closing() releases the connection even when read() raises.
        with contextlib.closing(urllib2.urlopen(url)) as resp:
            resp_body = resp.read()
        return json.loads(resp_body)
    
    
class TopicSummary(object):
    """Simple container for the pieces gathered about one topic.

    Holds the topic title, the URL of its thumbnail image and the summary
    text exactly as they were passed in.
    """

    def __init__(self, topic, thumb_url, text):
        self.topic, self.thumb_url, self.text = topic, thumb_url, text

    def __repr__(self):
        # The runtime class name is used so subclasses show their own name.
        fields = (self.topic, self.thumb_url, self.text)
        return '{0}({1})'.format(type(self).__name__, ', '.join(map(repr, fields)))
    

# Wikipedia API query templates; `{title}` is filled in per topic.
# One parameter per line; adjacent literals are concatenated at compile time.

# Extract query: `exsentences=2` with `exintro`/`explaintext` presumably
# requests a two-sentence plain-text intro (see the MediaWiki TextExtracts docs).
TEXT_URL_TMPL = (
    'https://en.wikipedia.org/w/api.php'
    '?action=query'
    '&prop=extracts'
    '&exsentences=2'
    '&titles={title}'
    '&explaintext=1'
    '&exintro=1'
    '&format=json'
)

# Page-image query, used to obtain the topic's thumbnail.
THUMB_URL_TMPL = (
    'https://en.wikipedia.org/w/api.php'
    '?action=query'
    '&prop=pageimages'
    '&titles={title}'
    '&format=json'
)

In [7]:
# Build a summarizer for the 'Coffee' topic and run both Wikipedia lookups.
summarizer = TopicSummarizer('Coffee')
# .process() returns the summarizer itself, so this cell displays its repr.
summarizer.process()


Out[7]:
<__main__.TopicSummarizer at 0x7fd8d01fa1d0>

In [8]:
# Package the fetched topic, thumbnail URL and text into a TopicSummary.
coffee_summary = summarizer.get_results()

In [9]:
# Print the summary text; the Image is the last expression, so it becomes the cell's output.
print(coffee_summary.text)
Image(url=coffee_summary.thumb_url)


Coffee is a brewed drink prepared from roasted coffee beans, which are the seeds of berries from the Coffea plant. The plant is native to subtropical Africa and some islands in southern Asia.
Out[9]:

TODO

  • Better URL composition
  • __repr__ for the TopicSummarizer