In [1]:
# Abstract Extraction:
# 1. Retrieve Text (download and parse the article HTML with BeautifulSoup)
# 2. Preprocess Text (tokenize the text and remove stopwords)
# 3. Extract Sentences (rank words by frequency and score sentences with those frequencies)
In [5]:
# Auto-summarizing Text and Scraping Websites
!pip install beautifulsoup4 lxml nltk
import urllib.request
from bs4 import BeautifulSoup
In [3]:
articleURL = "https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/"
In [4]:
page = urllib.request.urlopen(articleURL).read().decode('utf8', 'ignore')
soup = BeautifulSoup(page, "lxml")
soup  # this object holds the parsed HTML of the page
In [ ]:
soup.find('article')  # returns the first <article> element
In [ ]:
soup.find('article').text  # text content of the first <article> element
In [ ]:
text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
text  # concatenated text of every <article> element on the page
In [ ]:
text.encode('ascii', errors='replace').decode('ascii').replace("?", " ")  # non-ASCII characters such as curly quotes become '?', which we then replace with spaces
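In [ ]:
# Quick check of the ASCII clean-up on a made-up string containing a curly
# apostrophe (illustrative data, not taken from the article):
sample = "The Pentagon\u2019s telescope"
sample.encode('ascii', errors='replace').decode('ascii').replace("?", " ")  # -> 'The Pentagon s telescope'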
In [ ]:
def getTextWaPo(url):  # the steps above wrapped into a single function
    page = urllib.request.urlopen(url).read().decode('utf8', 'ignore')
    soup = BeautifulSoup(page, "lxml")
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    return text.encode('ascii', errors='replace').decode('ascii').replace("?", " ")
In [ ]:
text = getTextWaPo(articleURL)
In [ ]:
#Preprocess Article Text
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from string import punctuation
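In [ ]:
# The NLTK tokenizers and stopword list rely on data packages that are not
# bundled with the library; if they are missing, the calls below raise a
# LookupError. A minimal one-time download (sketch):
import nltk
nltk.download('punkt')      # data for sent_tokenize / word_tokenize
nltk.download('stopwords')  # data for stopwords.words('english')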
In [ ]:
sents = sent_tokenize(text)
sents  # note: "hello.bye" stays a single token because there is no space after the period
In [ ]:
word_sent = word_tokenize(text.lower())
word_sent
In [ ]:
_stopwords = set(stopwords.words('english') + list(punctuation))
_stopwords  # our stopword set: English stopwords plus punctuation characters
In [ ]:
word_sent = [word for word in word_sent if word not in _stopwords]  # drop stopwords and punctuation tokens
In [ ]:
word_sent
In [ ]:
#3 Extracting a Summary
from nltk.probability import FreqDist
freq = FreqDist(word_sent)
freq  # a frequency distribution: maps each word to its count
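In [ ]:
# FreqDist is a subclass of collections.Counter, so most_common() gives a
# quick sanity check of the top words (the same information as the nlargest
# call below):
freq.most_common(10)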
In [ ]:
from heapq import nlargest
# nlargest returns the n largest items of any iterable (list, dict keys, ...) according to a key function
In [ ]:
nlargest(10, freq, key=freq.get)  # top 10 words in freq, ranked by their frequency values
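In [ ]:
# Toy illustration of nlargest with key=dict.get: the key function maps each
# dict key to its value, so the result is the keys with the largest values
# (made-up example data):
toy = {'space': 5, 'telescope': 3, 'the': 9, 'junk': 2}
nlargest(2, toy, key=toy.get)  # -> ['the', 'space']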
In [ ]:
from collections import defaultdict  # a missing key is created with a default value (0 for int) instead of raising KeyError
ranking = defaultdict(int)
# enumerate converts [a, b, c] to [(0, a), (1, b), (2, c)]
for i, sent in enumerate(sents):           # i is the index of the sentence
    for w in word_tokenize(sent.lower()):  # each sentence is in turn tokenized into words
        if w in freq:                      # freq is the frequency distribution built above
            ranking[i] += freq[w]          # a sentence's score is the sum of its words' frequencies
ranking
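In [ ]:
# Small illustration of defaultdict(int): using += on a missing key silently
# creates it starting from 0, which is what lets the loop above accumulate
# scores without initializing every sentence index first:
demo = defaultdict(int)
demo['new_key'] += 5  # no KeyError
demo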
In [ ]:
sents_idx = nlargest(4, ranking, key=ranking.get)  # indices of the top 4 sentences, ranked by their scores in the ranking dict
sents_idx
In [ ]:
[sents[j] for j in sorted(sents_idx)]  # reassemble the top sentences in their original order
In [ ]:
def summarize(text, n):
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = word_tokenize(text.lower())
    _stopwords = set(stopwords.words('english') + list(punctuation))
    word_sent = [word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i] += freq[w]
    sents_idx = nlargest(n, ranking, key=ranking.get)
    return [sents[j] for j in sorted(sents_idx)]
In [ ]:
summarize(text,3)
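In [ ]:
# Note on the scoring design: because each sentence's score is a plain sum of
# word frequencies, longer sentences tend to win. A hedged variant (not part of
# the original notebook) that averages the score over sentence length:
def summarize_normalized(text, n):
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = word_tokenize(text.lower())
    _stopwords = set(stopwords.words('english') + list(punctuation))
    word_sent = [word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    ranking = defaultdict(float)
    for i, sent in enumerate(sents):
        words = word_tokenize(sent.lower())
        for w in words:
            if w in freq:
                ranking[i] += freq[w]
        if words:
            ranking[i] /= len(words)  # average frequency instead of raw sum
    sents_idx = nlargest(n, ranking, key=ranking.get)
    return [sents[j] for j in sorted(sents_idx)]

summarize_normalized(text, 3)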