analyseArticle

Author: methylDragon (methylDragon.com)

Description: Sentence-tokenises the article at the given URL and runs NLP analysis to extract key parameters: {"title", "url", "authors", "date", "summary", "polarity", "subjectivity", "keywords", "images", "videos", "text", "language"}

To use, insert %run 'analyseArticle.ipynb' after your import statements.

Then use

result = analyseArticle(URL)
result["PARAMETER"]

to get the parameter you want.

e.g. result["title"] # outputs the article title
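
Note that if fetching/parsing fails or no sentiment can be scored at all, the function returns the string "FETCH_ERROR" or "ZERO_SENTIMENT_ERROR" instead of a dictionary, so check for those before indexing:

result = analyseArticle(URL)
if result not in ("FETCH_ERROR", "ZERO_SENTIMENT_ERROR"):
    print(result["title"])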

Initialise


In [2]:
#NLP
from pattern.web import Twitter
from textblob import TextBlob
import nltk.data
from nltk.tokenize import word_tokenize, sent_tokenize

#NLTK RESOURCE DOWNLOADING
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

#PARSER
from newspaper import Article
import newspaper

#Load the pre-trained English Punkt sentence tokenizer model
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
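
As a quick sanity check of the tokenizer and TextBlob scoring used below (the sample text here is made up):

In [ ]:
sample = "The launch was a great success. Officials announced the results on Monday."
for sentence in tokenizer.tokenize(sample):
    blob = TextBlob(sentence)
    print(blob.sentiment.polarity, blob.sentiment.subjectivity, "-", sentence)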

Function Definition


In [3]:
def analyseArticle(url):
    #Download and parse the article, then run newspaper's built-in NLP (keywords, summary)
    a = Article(str(url))
    try:
        a.download()
        a.parse()
        a.nlp()
    except Exception:
        return "FETCH_ERROR"
    
    #Overall sentiment trackers
    count = 0
    overallScore = [0.0, 0.0]

    #Split the article into sentences, wrapping each one in a TextBlob
    sentences = [TextBlob(sentence) for sentence in tokenizer.tokenize(a.text)]
    
    #Analyse the sentiment of each sentence
    for index, analysis in enumerate(sentences):
        # analysis.correct() #Correct misspelt words !!! IF YOU ACTIVATE THIS IT'LL BE SLOW

        #Accumulate the overall scores, IGNORING sentences that score exactly [0.0, 0.0]
        if analysis.sentiment.polarity != 0.0 and analysis.sentiment.subjectivity != 0.0:
            count += 1
            overallScore[0] += analysis.sentiment.polarity
            overallScore[1] += analysis.sentiment.subjectivity
        else:
            #Merge a zero-scoring sentence into the next one so its words still count
            if index + 1 < len(sentences):
                sentences[index + 1] = sentences[index] + " " + sentences[index + 1]

    #Guard against division by zero
    if count == 0:
        count = 1

    #Store the extracted fields
    title = a.title
    authors = a.authors
    date = a.publish_date
    summary = a.summary
    polarity = overallScore[0] / count
    subjectivity = overallScore[1] / count
    if polarity == 0.0 or subjectivity == 0.0:
        return "ZERO_SENTIMENT_ERROR"
    keywords = a.keywords
    images = a.top_image
    videos = a.movies
    text = a.text
    language = a.meta_lang

    #Output the parameters in a dictionary as the function output
    return {"title": title, "url": url, "authors": authors, "date": date,
            "summary": summary, "polarity": polarity, "subjectivity": subjectivity,
            "keywords": keywords, "images": images, "videos": videos,
            "text": text, "language": language}
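
A quick end-to-end check (the URL below is just a placeholder; substitute any live article URL):

In [ ]:
result = analyseArticle("https://www.example.com/some-news-story") #placeholder URL
if isinstance(result, dict):
    print(result["title"])
    print(result["date"])
    print(result["keywords"])
    print("Polarity:", result["polarity"], "| Subjectivity:", result["subjectivity"])
else:
    print("Analysis failed:", result)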

In [ ]: