In [1]:
#scraping
import time
import requests
from pprint import pprint
from bs4 import BeautifulSoup
#nltk
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
#nltk.download('popular', halt_on_error=False)
#textblob
from textblob import TextBlob, Word
In [2]:
def get_quotes(url):
    # request the page and extract the text of every quote on it
    response = requests.get(url)
    page = response.content
    page = BeautifulSoup(page, "html.parser")
    quotes = [i.get_text() for i in page.find_all("span", class_="text")]
    return quotes
In [3]:
#Method 1
base_url = "http://quotes.toscrape.com/page/"
urls = [base_url+str(i)+"/" for i in range(1,11)] #generate all page urls
all_quotes = []
for i in urls:
    all_quotes.append(get_quotes(i))
    time.sleep(5)  # make the crawler wait 5 seconds between requests
In [4]:
#Method 2
base = "http://quotes.toscrape.com"
all_quotes=[]
url = base
while True:  # keep following "next" links until the last page
    response = requests.get(url)
    page = response.content
    page = BeautifulSoup(page, "html.parser")
    quotes = [i.get_text() for i in page.find_all("span", class_="text")]
    all_quotes.append(quotes)
    our_li = page.find_all("li", class_="next")
    if len(our_li) == 0:
        break  # stop the loop if no "next" page button is found
    url = base + our_li[0].findChild("a").get("href")  # navigate to the child <a> tag and follow its href
    time.sleep(3)
In [7]:
quotes_list = get_quotes(base)
quotes_blob = [TextBlob(i) for i in quotes_list]
In [8]:
#sentence sentiment analysis (opinion polarity and subjectivity)
for i in quotes_blob:
    print(i)
    print(i.sentiment)
    print("\n")  # print a separator line between quotes
In [9]:
#tokenization
first_quote = quotes_blob[0]
print(first_quote,"\n")
print(first_quote.sentences,"\n")
print(first_quote.words,"\n")
In [10]:
#Inflection
word1 = Word("bought")
word2 = Word("animal")
word3 = Word("birds")
print(word2, "->", word2.pluralize())
print(word3, "->", word3.singularize())
In [11]:
#lemmatization / v-verb, n-noun, a-adjective, r-adverb
print(word1, "->", word1.lemmatize("v"))
print(word2, "->", word2.lemmatize("n"))
print(word3, "->", word3.lemmatize("n"))
In [12]:
#tagging / NN - noun, JJ - adjective, IN - preposition, VB_ - verb
print(first_quote.tags)
In [13]:
#definitions
for i in word1, word2, word3:
    print(i, "->", i.definitions, "\n")
In [14]:
#spellcheck and correction
print(Word("bougt").spellcheck())
print("\n")
print(Word("bougt").correct())
In [15]:
# language detection and translation
# note: detect_language() and translate() rely on the Google Translate API and are
# deprecated/removed in recent TextBlob releases; this cell needs an older version
word4 = Word("մարդ")
print(word4.detect_language())
print(word4.translate(from_lang="hy", to="en"))
In [16]:
#comparison
word1>word2
Out[16]:
In [17]:
#3-grams
first_quote.ngrams(n=3)[0:3]
Out[17]:
In [18]:
#synonyms
word3.synsets[0]
Out[18]:
In [19]:
#get top 5 most popular words
freq = nltk.FreqDist(first_quote.words)
freq.plot(5, cumulative=False)
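If a plot is not needed, the same counts can be read directly: FreqDist.most_common(n) returns the top n words as (word, count) pairs.
print(freq.most_common(5))  # top 5 words with their counts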
In [20]:
#stemming
stemmer = PorterStemmer()
word_roots = [stemmer.stem(i) for i in first_quote.words]
print(word_roots)
In [21]:
#stopwords and their removal
sw = stopwords.words('english')
first_quote_clean = [i for i in first_quote.words if i not in sw]
print(first_quote.words)
print("\n")
print(first_quote_clean)
In [22]:
#synonyms, definitions, and examples
syn = wordnet.synsets("honey")
print(syn[0].definition())
print(syn[0].examples())
Our goal is to scrape the newsfeed from the English version of the website and find the 10 most frequent words in it. To do that, we request the page, find the newsfeed paragraphs, and extract their text. The list of headline strings is then concatenated into a single string and converted to a TextBlob. We lowercase the whole string and remove stopwords. In general, one should probably stem or lemmatize as well, so that words like "go" and "going" are counted as the same word (not done here, but a sketch follows the final cell). Finally, we compute the frequency distribution of all words in the text and plot the top 10.
In [23]:
url = "http://www.tert.am/en"
response = requests.get(url)
page = response.content
page = BeautifulSoup(page,"html.parser")
paragraphs = page.find_all("p",class_="today-title")
headlines = [i.get_text() for i in paragraphs]
headline =""
for i in headlines:
    headline = headline + " " + i
blob = TextBlob(headline).lower()
clean_headlines = [i for i in blob.words if i not in sw]
freq = nltk.FreqDist(clean_headlines)
freq.plot(10,cumulative=False)
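As noted above, stemming or lemmatizing before counting would collapse inflected forms like "go" and "going" into one entry. A minimal sketch of that extra step, using the WordNetLemmatizer already imported at the top (assumes the WordNet corpus has been downloaded, e.g. via nltk.download('wordnet')):
lemmatizer = WordNetLemmatizer()
lemmatized_headlines = [lemmatizer.lemmatize(i) for i in clean_headlines]  # reduce each word to its lemma
freq = nltk.FreqDist(lemmatized_headlines)
freq.plot(10, cumulative=False)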