In [1]:
# NLP
from pattern.web import Twitter
from textblob import TextBlob
import nltk.data
from nltk.tokenize import word_tokenize, sent_tokenize
# NLTK resource downloading
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
# Parser
from newspaper import Article
import newspaper
# Set up the punkt sentence tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
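The punkt model loaded above is what analyseArticle (defined below) uses to split article text into sentences. As a quick sanity check (the sample text is ours, not from the original notebook), it handles abbreviations that a naive split on "." would break:
In [ ]:
# Hypothetical sample text; punkt should keep "Dr." attached to its sentence
sample = "Dr. Smith wrote this. It has two sentences."
print(tokenizer.tokenize(sample))
# Expected: ['Dr. Smith wrote this.', 'It has two sentences.']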
In [17]:
def analyseArticle(url):
    # Download and parse the article
    a = Article(url, language='en')  # force English parsing to match the punkt model
    a.download()
    a.parse()
    a.nlp()  # required before a.summary and a.keywords are populated
    # Overall sentiment trackers: running [polarity, subjectivity] totals
    count = 0
    overallScore = [0.0, 0.0]
    # Split the article into sentences and analyse the sentiment of each
    for sentence in tokenizer.tokenize(a.text):
        analysis = TextBlob(sentence)
        # analysis.correct()  # Correct misspelt words -- slow if activated
        # Accumulate the overall scores, skipping sentences scored as
        # completely neutral ([0.0, 0.0] polarity and subjectivity)
        if analysis.sentiment.polarity != 0.0 or analysis.sentiment.subjectivity != 0.0:
            count += 1
            overallScore[0] += analysis.sentiment.polarity
            overallScore[1] += analysis.sentiment.subjectivity
    # Guard against division by zero when no sentence was scored
    if count == 0:
        count = 1
    # Store the extracted attributes
    title = a.title
    authors = a.authors
    date = a.publish_date
    summary = a.summary
    polarity = overallScore[0] / count
    subjectivity = overallScore[1] / count
    keywords = a.keywords
    images = a.top_image
    videos = a.movies
    text = a.text
    language = a.meta_lang
    # Return the results as a dictionary
    return {"title": title, "url": url, "authors": authors, "date": date,
            "summary": summary, "polarity": polarity, "subjectivity": subjectivity,
            "keywords": keywords, "images": images, "videos": videos,
            "text": text, "language": language}
In [ ]:
paper = newspaper.build("http://www.thehearttruths.com", memoize_articles=False)
analyseArticle("http://www.thehearttruths.com")
In [ ]: