In [1]:
# Abstract Extraction:
# 1. Retrieve Text (download and parse the article HTML with BeautifulSoup)
# 2. Preprocess Text (tokenize the text and remove stopwords)
# 3. Extract Sentences (rank words by frequency and score sentences with those frequencies)
In [5]:
# Auto-summarizing Text and Scraping Websites
!pip install beautifulsoup4 lxml nltk
import urllib.request
from bs4 import BeautifulSoup
In [3]:
articleURL = "https://www.washingtonpost.com/news/the-switch/wp/2016/10/18/the-pentagons-massive-new-telescope-is-designed-to-track-space-junk-and-watch-out-for-killer-asteroids/"
In [4]:
page = urllib.request.urlopen(articleURL).read().decode('utf8', 'ignore')
soup = BeautifulSoup(page, "lxml")
soup  # this object holds the parsed HTML of the page
In [ ]:
soup.find('article')  # returns the first <article> element
In [ ]:
soup.find('article').text  # text content of the first <article> element
In [ ]:
text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
text  # concatenated text of every <article> element on the page
In [ ]:
text.encode('ascii', errors='replace').decode('ascii').replace("?", " ")  # non-ASCII characters such as curly quotes become '?', which we then replace with spaces
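In [ ]:
# Quick check of the ASCII clean-up on a made-up string containing a curly
# apostrophe (illustrative data, not taken from the article):
sample = "The Pentagon\u2019s telescope"
sample.encode('ascii', errors='replace').decode('ascii').replace("?", " ")  # -> 'The Pentagon s telescope'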
In [ ]:
def getTextWaPo(url):  # the steps above wrapped into a single function
    page = urllib.request.urlopen(url).read().decode('utf8', 'ignore')
    soup = BeautifulSoup(page, "lxml")
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    return text.encode('ascii', errors='replace').decode('ascii').replace("?", " ")
In [ ]:
text = getTextWaPo(articleURL)
In [ ]:
#Preprocess Article Text
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from string import punctuation
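In [ ]:
# The NLTK tokenizers and stopword list rely on data packages that are not
# bundled with the library; if they are missing, the calls below raise a
# LookupError. A minimal one-time download (sketch):
import nltk
nltk.download('punkt')      # data for sent_tokenize / word_tokenize
nltk.download('stopwords')  # data for stopwords.words('english')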
In [ ]:
sents = sent_tokenize(text)
sents  # note: "hello.bye" stays a single token because there is no space after the period
In [ ]:
word_sent = word_tokenize(text.lower())
word_sent
In [ ]:
_stopwords = set(stopwords.words('english') + list(punctuation))
_stopwords  # our stopword set: English stopwords plus punctuation characters
In [ ]:
word_sent = [word for word in word_sent if word not in _stopwords]  # drop stopwords and punctuation tokens
In [ ]:
word_sent
In [ ]:
#3 Extracting a Summary
from nltk.probability import FreqDist
freq = FreqDist(word_sent)
freq  # a frequency distribution: maps each word to its count
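In [ ]:
# FreqDist is a subclass of collections.Counter, so most_common() gives a
# quick sanity check of the top words (the same information as the nlargest
# call below):
freq.most_common(10)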
In [ ]:
from heapq import nlargest
# nlargest returns the n largest items of any iterable (list, dict keys, ...) according to a key function
In [ ]:
nlargest(10, freq, key=freq.get)  # top 10 words in freq, ranked by their frequency values
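In [ ]:
# Toy illustration of nlargest with key=dict.get: the key function maps each
# dict key to its value, so the result is the keys with the largest values
# (made-up example data):
toy = {'space': 5, 'telescope': 3, 'the': 9, 'junk': 2}
nlargest(2, toy, key=toy.get)  # -> ['the', 'space']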
In [ ]:
from collections import defaultdict  # a missing key is created with a default value (0 for int) instead of raising KeyError
ranking = defaultdict(int)
# enumerate converts [a, b, c] to [(0, a), (1, b), (2, c)]
for i, sent in enumerate(sents):           # i is the index of the sentence
    for w in word_tokenize(sent.lower()):  # each sentence is in turn tokenized into words
        if w in freq:                      # freq is the frequency distribution built above
            ranking[i] += freq[w]          # a sentence's score is the sum of its words' frequencies
ranking
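In [ ]:
# Small illustration of defaultdict(int): using += on a missing key silently
# creates it starting from 0, which is what lets the loop above accumulate
# scores without initializing every sentence index first:
demo = defaultdict(int)
demo['new_key'] += 5  # no KeyError
demo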
In [ ]:
sents_idx = nlargest(4, ranking, key=ranking.get)  # indices of the top 4 sentences, ranked by their scores in the ranking dict
sents_idx
In [ ]:
[sents[j] for j in sorted(sents_idx)]  # reassemble the top sentences in their original order
In [ ]:
def summarize(text, n):
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = word_tokenize(text.lower())
    _stopwords = set(stopwords.words('english') + list(punctuation))
    word_sent = [word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(sents):
        for w in word_tokenize(sent.lower()):
            if w in freq:
                ranking[i] += freq[w]
    sents_idx = nlargest(n, ranking, key=ranking.get)
    return [sents[j] for j in sorted(sents_idx)]
In [ ]:
summarize(text,3)
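In [ ]:
# Note on the scoring design: because each sentence's score is a plain sum of
# word frequencies, longer sentences tend to win. A hedged variant (not part of
# the original notebook) that averages the score over sentence length:
def summarize_normalized(text, n):
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = word_tokenize(text.lower())
    _stopwords = set(stopwords.words('english') + list(punctuation))
    word_sent = [word for word in word_sent if word not in _stopwords]
    freq = FreqDist(word_sent)
    ranking = defaultdict(float)
    for i, sent in enumerate(sents):
        words = word_tokenize(sent.lower())
        for w in words:
            if w in freq:
                ranking[i] += freq[w]
        if words:
            ranking[i] /= len(words)  # average frequency instead of raw sum
    sents_idx = nlargest(n, ranking, key=ranking.get)
    return [sents[j] for j in sorted(sents_idx)]

summarize_normalized(text, 3)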