Description: Sentence-tokenises the article at the given URL and runs NLP analysis on it to extract the following parameters:
{"title", "url", "authors", "date", "summary", "polarity", "subjectivity", "keywords", "images", "videos", "text", "language"}
To use, insert %run 'analyseArticle.ipynb'
after your import statement, then use
result = analyseArticle(URL)
result["PARAMETER"]
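to get the parameter you want.
e.g. result["title"] #outputs the title
Note: instead of a dictionary, analyseArticle returns the string "FETCH_ERROR" when downloading or parsing the article fails, and "ZERO_SENTIMENT_ERROR" when the averaged polarity or subjectivity is exactly 0.0, so check the result before indexing it.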
In [2]:
#NLP
from pattern.web import Twitter
from textblob import TextBlob
import nltk.data
from nltk.tokenize import word_tokenize, sent_tokenize
# NLTK resource check: look for the Punkt sentence-tokenizer data in the local
# NLTK data path, and download it only when the lookup fails.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
#PARSER
from newspaper import Article
import newspaper
# Load the Punkt sentence tokenizer from the 'english.pickle' resource once,
# at module level; analyseArticle() calls tokenizer.tokenize() on the article
# text to split it into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
In [3]:
def _mean_sentence_sentiment(raw_sentences):
    """Average TextBlob polarity and subjectivity over the scored sentences.

    A sentence whose polarity or subjectivity is exactly 0.0 is not scored on
    its own: its text is prepended (separated by a single space) to the next
    sentence and the two are scored together. Unscored text left over after
    the last sentence is discarded.

    Args:
        raw_sentences: iterable of sentence strings.

    Returns:
        A ``(polarity, subjectivity)`` tuple averaged over the scored
        sentences, or ``(0.0, 0.0)`` when no sentence was scored.
    """
    polarity_total = 0.0
    subjectivity_total = 0.0
    scored = 0
    carried = None  # text of unscored sentence(s) waiting to be merged forward

    for raw in raw_sentences:
        text = raw if carried is None else carried + " " + raw
        sentiment = TextBlob(text).sentiment
        if sentiment.polarity != 0.0 and sentiment.subjectivity != 0.0:
            scored += 1
            polarity_total += sentiment.polarity
            subjectivity_total += sentiment.subjectivity
            carried = None
        else:
            # Neutral on at least one axis: fold it into the next sentence
            # instead of letting it count towards the average.
            carried = text

    if scored == 0:
        return 0.0, 0.0
    return polarity_total / scored, subjectivity_total / scored


def analyseArticle(url):
    """Download the article at ``url`` and return its extracted parameters.

    The article is fetched, parsed and NLP-processed with newspaper's
    ``Article``; its text is split into sentences with the module-level
    ``tokenizer`` and each sentence is scored with TextBlob.

    Args:
        url: address of the article; converted with ``str()`` before use.

    Returns:
        A dict with the keys "title", "url", "authors", "date", "summary",
        "polarity", "subjectivity", "keywords", "images", "videos", "text"
        and "language" on success. "url" is the argument exactly as passed in,
        "images" holds the article's ``top_image`` attribute and "videos" its
        ``movies`` attribute.

        The string "FETCH_ERROR" if downloading, parsing or NLP processing
        raises an exception.

        The string "ZERO_SENTIMENT_ERROR" if the averaged polarity or
        subjectivity is exactly 0.0 (this includes the case where no sentence
        could be scored).
    """
    article = Article(str(url))
    try:
        article.download()
        article.parse()
        article.nlp()
    except Exception:
        # Any failure in the fetch/parse pipeline is reported as an error
        # code; KeyboardInterrupt and SystemExit are allowed to propagate.
        return "FETCH_ERROR"

    # Tokenise the text once; the helper handles merging of neutral sentences.
    polarity, subjectivity = _mean_sentence_sentiment(
        tokenizer.tokenize(article.text)
    )
    if polarity == 0.0 or subjectivity == 0.0:
        return "ZERO_SENTIMENT_ERROR"

    return {
        "title": article.title,
        "url": url,
        "authors": article.authors,
        "date": article.publish_date,
        "summary": article.summary,
        "polarity": polarity,
        "subjectivity": subjectivity,
        "keywords": article.keywords,
        "images": article.top_image,
        "videos": article.movies,
        "text": article.text,
        "language": article.meta_lang,
    }
In [ ]: