# -*- coding: utf-8 -*-
Author: methylDragon (methylDragon.com)
"Raa."

Description: Sentence-tokenises an article from an input URL and conducts NLP analysis to draw out and PRINT important parameters.

To use, insert %run 'examineArticle.ipynb' after your import statements, then call examineArticle("URL").
URL MUST BE A STRING.

This prints
{"title", "url", "authors", "date", "summary", "polarity", "subjectivity", "keywords", "images", "videos"}
AS WELL AS A SENTENCE-BY-SENTENCE BREAKDOWN OF SENTIMENT!
In [2]:
#NLP
from pattern.web import Twitter #(imported but not used in this notebook)
from textblob import TextBlob
import nltk.data
from nltk.tokenize import word_tokenize, sent_tokenize

#NLTK RESOURCE DOWNLOADING
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

#PARSER
from newspaper import Article
import newspaper

#Set tokeniser model
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
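For reference, here is a quick sanity check of the two building blocks examineArticle relies on: punkt splits raw text into sentences, and TextBlob scores each sentence for polarity and subjectivity. The sample text below is purely illustrative.
#Minimal sketch: sentence tokenisation + per-sentence sentiment (sample text is illustrative)
sample = "The subsidy halt was a disaster. Costs rose. Analysts were unsurprised."
for sent in tokenizer.tokenize(sample):
    blob = TextBlob(sent)
    print(sent)
    print("Polarity:", blob.sentiment.polarity, "| Subjectivity:", blob.sentiment.subjectivity)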
In [3]:
def examineArticle(LINK):
    #Download and parse the article, then run newspaper's NLP pass (keywords, summary)
    a = Article(str(LINK), language='en') # Set language as English
    a.download()
    a.parse()
    a.nlp()

    #Quality-of-life bold delimiters (ANSI escape codes)
    b = "\033[1m"
    endb = "\033[0;0m"
    #START OF ARTICLE
    print(b + "START OF ARTICLE - START OF ARTICLE - START OF ARTICLE - START OF ARTICLE - START OF ARTICLE" + endb)

    #Overall sentiment trackers
    count = 0
    overallScore = [0.0, 0.0]

    #Print metadata
    print("\n-----\n" + b + "METADATA" + endb + "\n-----")
    print(b + "Title: " + endb, end="")
    print(a.title)
    print(b + "Language: " + endb, end="")
    print(a.meta_lang)
    print(b + "Author(s): " + endb, end="")
    print(a.authors)
    print(b + "Keywords: " + endb, end="")
    print(a.keywords)
    print(b + "Date: " + endb, end="")
    print(a.publish_date)
    print(b + "Top Image: " + endb, end="")
    print(a.top_image)
    print(b + "Videos: " + endb, end="")
    print(a.movies)

    #Print summary
    print("\n-----\n" + b + "SUMMARY" + endb + "\n-----")
    print(a.summary)
    #Split the article into sentences and wrap each one in a TextBlob
    sentences = [TextBlob(sentence) for sentence in tokenizer.tokenize(a.text)]

    #Analyse each sentence in turn
    print("\n-----\n" + b + "ANALYSIS" + endb + "\n-----")
    for index in range(len(sentences)):
        analysis = sentences[index]
        #Accumulate the overall sentiment trackers; sentences scoring 0.0 on
        #either axis are merged into the next sentence rather than scored alone
        if analysis.sentiment.polarity != 0.0 and analysis.sentiment.subjectivity != 0.0:
            count += 1
            overallScore[0] += analysis.sentiment.polarity
            overallScore[1] += analysis.sentiment.subjectivity
            # analysis = analysis.correct() #Correct misspelt words !!! IF YOU ACTIVATE THIS IT'LL BE SLOW
            print(analysis + b)
            #Report per-sentence sentiment
            print("Polarity: " + "{0:.5f}".format(analysis.sentiment.polarity), end=" ")
            print("Subjectivity: " + "{0:.5f}".format(analysis.sentiment.subjectivity) + endb)
            print(endb + "-----")
        else:
            try:
                sentences[index + 1] = sentences[index] + " " + sentences[index + 1]
            except IndexError: #The last sentence has no successor to merge into, so score it as-is
                print("LAST SENTENCE")
                print(analysis)
                print("Polarity: " + "{0:.5f}".format(analysis.sentiment.polarity), end=" ")
                print("Subjectivity: " + "{0:.5f}".format(analysis.sentiment.subjectivity) + endb)
            continue
    #Guard against division by zero when no sentence was scored
    if count == 0:
        count = 1

    #Print overall sentiment (mean over the scored sentences)
    print("\n-----\n" + b + "OVERALL SENTIMENT" + endb + "\n-----")
    #print(TextBlob(a.text).sentiment)
    print(b + "Polarity: " + endb, end="")
    print("{0:.5f}".format(overallScore[0] / count), end=" | ")
    print(b + "Subjectivity: " + endb, end="")
    print("{0:.5f}".format(overallScore[1] / count), end="")
    print(endb + "\n")

    print(b + "END OF ARTICLE - END OF ARTICLE - END OF ARTICLE - END OF ARTICLE - END OF ARTICLE" + endb)
    print("--------------------------------------------------------")
%run 'Experiments/methylSwag.ipynb'
methylSwag()
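The header lists the fields this notebook surfaces, but examineArticle only prints them. If the same fields are wanted as data, a dict-returning variant is easy to sketch. This is a minimal, hypothetical helper (articleData is not part of the notebook) built from the same newspaper and TextBlob calls used above:
def articleData(LINK):
    #Hypothetical helper: collect the header's fields into a dict instead of printing
    a = Article(str(LINK), language='en')
    a.download()
    a.parse()
    a.nlp() #Needed for a.keywords and a.summary
    blob = TextBlob(a.text)
    return {"title": a.title, "url": a.url, "authors": a.authors,
            "date": a.publish_date, "summary": a.summary,
            "polarity": blob.sentiment.polarity,
            "subjectivity": blob.sentiment.subjectivity,
            "keywords": a.keywords, "images": a.top_image, "videos": a.movies}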
In [4]:
examineArticle("http://www.npr.org/sections/health-shots/2017/10/13/557541856/halt-in-subsidies-for-health-insurers-expected-to-drive-up-costs-for-middle-clas")