Description: Crawls and analyses articles from the stated URLs (and Mothership, because it's special/troublesome), churns out parameters via analyseArticle, and pushes them to Firebase.
{"title", "url", "authors", "date", "summary", "polarity", "subjectivity", "keywords", "images", "videos", "text"}
In [3]:
# --- Initialisation cell ---------------------------------------------------
# Loads sibling notebooks (banner, analyseArticle, firebasePush), imports the
# crawl dependencies, opens the crawl log, and defines the URL work lists.
%run 'Experiments/methylSwag.ipynb'
methylSwag()
print("\nINITIALISING MODULES\n.")
# analyseArticle() and firebasePush()/firebaseRefresh() are defined in these
# notebooks and used by the crawl cells below.
%run 'analyseArticle.ipynb'
%run 'firebasePush.ipynb'
import traceback
import newspaper
import requests
import time
from bs4 import BeautifulSoup
from timeit import default_timer as timer
start = timer()  # wall-clock reference for the elapsed-time summaries
print("OPENING LOGS\n.")
# NOTE(review): opened in "w" mode, so each run truncates the previous log;
# closed at the very end of the notebook.
log = open("CRAWL_LOG.txt", "w")
print("LOADING URL LISTS\n.")
# COMPLETED/QUEUE are declared but not used in the visible cells — presumably
# reserved for a resumable-crawl feature; verify before removing.
COMPLETED = []
QUEUE = []
# The string literals below are retired URL lists kept for reference only;
# they are never assigned to newsURLs (they are bare string expressions).
#newsURLs = '''["kementah.blogspot.sg", "blog.wan-ifra.org/","www.straitstimes.com","www.todayonline.com","www.channelnewsasia.com","www.businessinsider.sg",'''
"""newsURLs = ["www.businesstimes.com.sg",
"alvinology.com","www.asiaone.com/singapore","sg.news.yahoo.com","www.gov.sg/news",
"www.theindependent.sg","www.tnp.sg","telegraph.co.uk/news/worldnews/asia/singapore/",
"themiddleground.sg","www.allsingaporestuff.com","www.theonlinecitizen.com","statestimesreview.com",
"www.tremeritus.com","thehearttruths.com","therealsingapore.com","mustsharenews.com",
"berthahenson.wordpress.com","yawningbread.wordpress.com","singaporedaily.net",
"www.msn.com/en-sg/news/",
newsURLs= ["asiancorrespondent.com/section/singapore/#fOzMdSd453Zgkgvy.97",
newsURLs= ["www.mrbrownshow.com/",
"www.buzzfeed.com/tag/singapore","stomp.straitstimes.com/",
"www.straitstimes.com/forum","fintechnews.sg/","www.techinasia.com/tag/singapore",
"www.theedgesingapore.com/latest-news",
"""
'''newsURLs = ["www.mfa.gov.sg/content/mfa/overseasmission/vientiane/News.html",
"www.google.com.sg/search?q=singapore+news&client=ubuntu&hs=HVO&channel=fs&dcr=0&source=lnms&tbm=nws&sa=X&ved=0ahUKEwj-tcLb6u3WAhWDs48KHUDqBzg4HhD8BQgKKAE&biw=1855&bih=981"]'''
# Active crawl targets (domains only; "http://" is prefixed at build time).
newsURLs = ["www.channelnewsasia.com","statestimesreview.com"]
# Mothership category pages, scraped directly via requests/BeautifulSoup in
# the (currently disabled) Mothership cell because newspaper.build does not
# handle that site well.
mothershipURLs = ["mothership.sg/category/news","mothership.sg/category/perspectives",
"mothership.sg/category/community","mothership.sg/category/almost-famous",
"mothership.sg/category/mps-in-the-house","mothership.sg/category/humour"]
print(".\n.\nINITIALISED FIREBASEPOPULATE")
In [4]:
# NOTE(review): the entire Mothership crawler below is DISABLED — it is
# wrapped in one triple-quoted string literal, so nothing in it executes.
# It scrapes each mothershipURLs category page with requests + BeautifulSoup,
# follows "ind-article" links, runs analyseArticle on each, and pushes the
# results to Firebase — mirroring the live URL module in the next cell.
# Kept verbatim so it can be re-enabled by removing the quotes.
'''mcount = 0
mnoteng = 0
mfailed = 0
mtooshort = 0
mfetcherror = 0
print("RUN MOTHERSHIP MODULE\n")
for URL in mothershipURLs:
print("Retrieving URL...\n")
try:
sourceCode = requests.get("http://" + str(URL))
soup = BeautifulSoup(sourceCode.content, "lxml")
print("Target URL: " + str(URL))
for div in soup.find_all("div", class_="ind-article"):
for a in div.find_all("a"):
if "mothership.sg" in a.get("href"):
try:
print(str(mcount + mnoteng + mfailed + mtooshort + mfetcherror + 1)+": ", end="")
parameters = analyseArticle(a.get("href")) #for getting link
if parameters == "ZERO_SENTIMENT_ERROR": #Check for zero sentiment, means article is too short or redirected
mtooshort += 1
print("SKIPPING: ZERO_SENTIMENT_ERROR, NO SENTIMENT DETECTED!", end=" #")
print(str(mtooshort))
continue
if parameters == "FETCH_ERROR": #Check for zero sentiment, means article is too short or redirected
mfetcherror += 1
print("SKIPPING: FETCH_ERROR, COULD NOT DOWNLOAD ARTICLE!", end=" #")
print(str(mfetcherror))
continue
if str(parameters["language"]) != "en": #Check if article is in English, if it isn't skip
mnoteng += 1
print("SKIPPING: LANG_ERROR, ARTICLE NOT IN ENGLISH!", end=" #")
print(str(mnoteng) + " (" + str(parameters["language"]) + ")")
continue
title = str(parameters["title"])
url = str(parameters["url"])
authors = parameters["authors"]
date = str(parameters["date"])
summary = str(parameters["summary"])
polarity = str(parameters["polarity"])
subjectivity = str(parameters["subjectivity"])
keywords = parameters["keywords"]
images = str(parameters["images"])
videos = str(parameters["videos"])
text = str(parameters["text"])
firebasePush(title, url, authors, date, summary, polarity, subjectivity, keywords, images, videos, text)
mcount += 1
print("Processed article #", end="")
print(mcount)
except Exception as ex:
mfailed += 1
print("FAILED article: #", end=" | ")
print(ex)
print(mfailed,end=" | Moving on...\n")
log.write("\n\n-----------------------------------------------")
log.write("\n\nMOTHERSHIP MODULE UNKNOWN ERROR DUMP | Fetch #")
log.write(str(mcount + mnoteng + mfailed + mtooshort + mfetcherror))
log.write(": \n\n")
log.write("ERROR:")
log.write(str(traceback.format_exc())) #FOR DEBUGGING
log.write("\n\n")
log.write("Data:")
log.write(str(parameters)) #FOR DEBUGGING
except Exception as ex:
print("Failed URL", end=" | ")
print(ex)
print("\n--------------!!-rawr-=rAwR=*RAWR*=rAwR=-rawr-!!--------------")
string = "FINISHED: " + str(URL)
print(string.center(63))
log.write("PROCESSED: ")
log.write(str(URL))
log.write("\n")
log.flush()
print("--------------!!-rawr-=rAwR=*RAWR*=rAwR=-rawr-!!--------------\n")
methylHalf()
print("\n ---!!--!!-raa--rawr-=rAwR=*RAAWR*=rAwR=-rawr--raa-!!--!!---")
print(" FINISHED PROCESSING MOTHERSHIP")
log.write("FINISHED PROCESSING: ")
log.write("MOTHERSHIP")
log.write("\n\n")
print(" ---!!--!!-raa--rawr-=rAwR=*RAAWR*=rAwR=-rawr--raa-!!--!!---\n")
print("SUMMARY:")
print("Elapsed time: ",end="")
checkpoint = timer()
print(checkpoint - start,end="")
print(" seconds\n")
log.write("Elapsed Time: " + str(checkpoint - start))
log.write("\n\n")
log.flush
print(str(mcount + mnoteng + mfailed + mtooshort + mfetcherror) + " Total Articles Accessed")
print(str(mcount) + " Processed Articles\n")
print(str(mnoteng) + " LANG_ERRORs (Article not in English)")
print(str(mtooshort) + " ZERO_SENTIMENT_ERRORs (No sentiment detected)")
print(str(mfetcherror) + " FETCH_ERRORs (Failed to fetch article)")
print(str(mfailed) + " Failed Articles\n")
firebaseRefresh()
time.sleep(1)
print(" ---!!--!!-raa--rawr-=rAwR=*RAAWR*=rAwR=-rawr--raa-!!--!!---\n")'''
Out[4]:
In [5]:
# --- URL crawl module ------------------------------------------------------
# For each domain in newsURLs: build a newspaper source, analyse every
# discovered article with analyseArticle(), and push English articles with
# usable sentiment to Firebase via firebasePush().  Skips and failures are
# counted separately and dumped to CRAWL_LOG.txt.
# (Indentation reconstructed — the notebook export had flattened it.)
count = 0       # articles successfully analysed and pushed
noteng = 0      # skipped: article not in English
failed = 0      # unexpected per-article failures
tooshort = 0    # skipped: ZERO_SENTIMENT_ERROR (too short / redirected)
fetcherror = 0  # skipped: FETCH_ERROR (download failed)
print("RUN URL MODULE\n")
for URL in newsURLs:
    print("Building domain...\n")
    try:
        # memoize_articles=False forces a full re-discovery on every run.
        paper = newspaper.build("http://" + str(URL), memoize_articles=False)
        print("Domain building complete for: " + str(URL))
    except Exception as ex:
        print("Failed DOMAIN", end=" | ")
        print(ex, end=" | moving on...\n")
        # BUG FIX: without this `continue` the article loop below ran with
        # `paper` undefined (first URL -> NameError) or stale (re-crawling
        # the previous domain's articles).
        continue
    for article in paper.articles:
        # BUG FIX: pre-bind so the error dump in `except` cannot raise a
        # NameError when analyseArticle() itself throws on the first article.
        parameters = None
        try:
            # Running fetch counter across all outcome categories.
            print(str(count + noteng + failed + tooshort + fetcherror + 1) + ": ", end="")
            parameters = analyseArticle(article.url)
            # analyseArticle() signals skip conditions with sentinel strings.
            if parameters == "ZERO_SENTIMENT_ERROR":  # too short or redirected
                tooshort += 1
                print("SKIPPING: ZERO_SENTIMENT_ERROR, NO SENTIMENT DETECTED!", end=" #")
                print(str(tooshort))
                print(article.url)
                continue
            if parameters == "FETCH_ERROR":  # article could not be downloaded
                fetcherror += 1
                print("SKIPPING: FETCH_ERROR, COULD NOT DOWNLOAD ARTICLE!", end=" #")
                print(str(fetcherror))
                continue
            if str(parameters["language"]) != "en":  # only push English articles
                noteng += 1
                print("SKIPPING: LANG_ERROR, ARTICLE NOT IN ENGLISH!", end=" #")
                print(str(noteng) + " (" + str(parameters["language"]) + ")")
                print(article.url)
                continue
            # Unpack the analysed parameters and push to Firebase.
            title = parameters["title"]
            url = str(article.url)
            authors = parameters["authors"]
            date = str(parameters["date"])
            summary = str(parameters["summary"])
            polarity = str(parameters["polarity"])
            subjectivity = str(parameters["subjectivity"])
            keywords = parameters["keywords"]
            images = str(parameters["images"])
            videos = str(parameters["videos"])
            text = str(parameters["text"])
            firebasePush(title, url, authors, date, summary, polarity, subjectivity, keywords, images, videos, text)
            count += 1
            print("Processed article #", end="")
            print(count)
        except Exception as ex:
            # Best-effort crawl: log the full traceback and data, then move on.
            failed += 1
            print("FAILED article: #", end="")
            print(failed, end=" | ")
            print(ex, end=" | Moving on...\n")
            log.write("\n\n-----------------------------------------------")
            log.write("\n\nURL MODULE UNKNOWN ERROR DUMP | Fetch #")
            log.write(str(count + noteng + failed + tooshort + fetcherror))
            log.write(": \n\n")
            log.write("ERROR:")
            log.write(str(traceback.format_exc()))  # FOR DEBUGGING
            log.write("\n\n")
            log.write("DATA:\n")
            log.write(str(parameters))  # FOR DEBUGGING
    # Per-domain wrap-up: banner, log entry, and a running summary.
    print("\n--------------!!-rawr-=rAwR=*RAWR*=rAwR=-rawr-!!--------------")
    string = "FINISHED: " + str(URL)
    print(string.center(63))
    log.write("PROCESSED: ")
    log.write(str(URL))
    log.write("\n")
    log.flush()
    print("--------------!!-rawr-=rAwR=*RAWR*=rAwR=-rawr-!!--------------\n")
    print("RUNNING SUMMARY:")
    print("Elapsed time: ", end="")
    checkpoint = timer()
    print(checkpoint - start, end="")
    print(" seconds\n")
    log.write("Elapsed Time: " + str(checkpoint - start))
    log.write("\n\n")
    log.flush()  # BUG FIX: was `log.flush` (bare attribute access — a no-op)
    print(str(count + noteng + failed + tooshort + fetcherror) + " Total Articles Fetched")
    print(str(count) + " Processed Articles\n")
    print(str(noteng) + " LANG_ERRORs (Article not in English)")
    print(str(tooshort) + " ZERO_SENTIMENT_ERRORs (No sentiment detected)")
    print(str(fetcherror) + " FETCH_ERRORs (Failed to fetch article)")
    print(str(failed) + " Failed Articles\n")
    firebaseRefresh()
    time.sleep(1)  # brief pause between domains after the Firebase refresh
    print("--------------!!-rawr-=rAwR=*RAWR*=rAwR=-rawr-!!--------------\n")
# --- Final summary & shutdown ----------------------------------------------
# Prints overall counters accumulated by the URL module, records completion
# and elapsed time in the log, and closes it.
methylHalf()
print("\n ---!!--!!-raa--rawr-=rAwR=*RAAWR*=rAwR=-rawr--raa-!!--!!---")
print(" FINISHED PROCESSING URLS!")
log.write("FINISHED PROCESSING: ")
log.write("URLS")
log.write("\n\n")
print(" ---!!--!!-raa--rawr-=rAwR=*RAAWR*=rAwR=-rawr--raa-!!--!!---\n")
print("SUMMARY:")
print(str(count + noteng + failed + tooshort + fetcherror) + " Total Articles Accessed")
print(str(count) + " Processed Articles\n")
print(str(noteng) + " LANG_ERRORs (Article not in English)")
print(str(tooshort) + " ZERO_SENTIMENT_ERRORs (No sentiment detected)")
print(str(fetcherror) + " FETCH_ERRORs (Failed to fetch article)")
print(str(failed) + " Failed Articles\n")
print(" ---!!--!!-raa--rawr-=rAwR=*RAAWR*=rAwR=-rawr--raa-!!--!!---\n")
print("Elapsed time: ", end="")
checkpoint = timer()
print(checkpoint - start, end="")
print(" seconds\n")
print("SHUTTING DOWN")
log.write("Elapsed Time: " + str(checkpoint - start))
log.write("\n\n")
log.write("SHUTTING DOWN")
log.flush()  # BUG FIX: was `log.flush` (bare attribute access — a no-op)
log.close()
In [ ]: