In [1]:
# NLP
from pattern.web import Twitter
from textblob import TextBlob
import nltk.data
from nltk.tokenize import word_tokenize, sent_tokenize
# NLTK resource downloading
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
# Parser
from newspaper import Article
import newspaper
# Set up the punkt sentence tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
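The punkt model loaded above is what analyseArticle (defined below) uses to split article text into sentences. As a quick sanity check (the sample text is ours, not from the original notebook), it handles abbreviations that a naive split on "." would break:
In [ ]:
# Hypothetical sample text; punkt should keep "Dr." attached to its sentence
sample = "Dr. Smith wrote this. It has two sentences."
print(tokenizer.tokenize(sample))
# Expected: ['Dr. Smith wrote this.', 'It has two sentences.']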
In [17]:
def analyseArticle(url):
    # Download and parse the article
    a = Article(url, language='en')  # force English parsing to match the punkt model
    a.download()
    a.parse()
    a.nlp()  # required before a.summary and a.keywords are populated
    # Overall sentiment trackers: running [polarity, subjectivity] totals
    count = 0
    overallScore = [0.0, 0.0]
    # Split the article into sentences and analyse the sentiment of each
    for sentence in tokenizer.tokenize(a.text):
        analysis = TextBlob(sentence)
        # analysis.correct()  # Correct misspelt words -- slow if activated
        # Accumulate the overall scores, skipping sentences scored as
        # completely neutral ([0.0, 0.0] polarity and subjectivity)
        if analysis.sentiment.polarity != 0.0 or analysis.sentiment.subjectivity != 0.0:
            count += 1
            overallScore[0] += analysis.sentiment.polarity
            overallScore[1] += analysis.sentiment.subjectivity
    # Guard against division by zero when no sentence was scored
    if count == 0:
        count = 1
    # Store the extracted attributes
    title = a.title
    authors = a.authors
    date = a.publish_date
    summary = a.summary
    polarity = overallScore[0] / count
    subjectivity = overallScore[1] / count
    keywords = a.keywords
    images = a.top_image
    videos = a.movies
    text = a.text
    language = a.meta_lang
    # Return the results as a dictionary
    return {"title": title, "url": url, "authors": authors, "date": date,
            "summary": summary, "polarity": polarity, "subjectivity": subjectivity,
            "keywords": keywords, "images": images, "videos": videos,
            "text": text, "language": language}
In [ ]:
paper = newspaper.build("http://www.thehearttruths.com", memoize_articles=False)
analyseArticle("http://www.thehearttruths.com")
In [ ]: