Description: Crawls and analyses articles from the stated URLs (and from Mothership, which gets its own scraper because it's special/troublesome), churns out parameters via analyseArticle, and pushes them to Firebase.
The parameters are:
{"title", "url", "authors", "date", "summary", "polarity", "subjectivity", "keywords", "images", "videos", "text"}
In [7]:
# Announce start-up, then run the helper notebooks so that the names they
# define become available to the cells below.
# NOTE(review): the later cells call analyseArticle(), firebasePush(),
# firebaseRefresh() and methylHalf(); presumably these come from the two
# notebooks run here -- confirm which notebook defines which helper.
print("\nINITIALISING MODULES\n.")
%run 'analyseArticle.ipynb'
%run 'firebasePush.ipynb'
import traceback
import newspaper
import requests
import time
from bs4 import BeautifulSoup
from timeit import default_timer as timer
# Wall-clock reference point for the "Elapsed time" figures printed and
# logged by the crawl cells.
start = timer()

print("OPENING LOGS\n.")
# The crawl cells dump scraped article data (arbitrary Unicode text) into this
# log from inside their error handlers. Pin the encoding to UTF-8 so a locale
# with a narrower default encoding cannot raise UnicodeEncodeError there.
# The handle stays open across cells; the last crawl cell closes it.
log = open("CRAWL_LOG.txt", "w", encoding="utf-8")

print("LOADING URL LISTS\n.\n")
# Not referenced by the cells visible here; kept because other code may read them.
COMPLETED = []
QUEUE = []

# Domains crawled through newspaper.build() (scheme is prepended at use site).
newsURLs = [
    "www.straitstimes.com",
    "www.allsingaporestuff.com",
]

# Mothership category pages, scraped directly with requests + BeautifulSoup.
mothershipURLs = [
    "mothership.sg/category/news",
    "mothership.sg/category/perspectives",
    "mothership.sg/category/community",
    "mothership.sg/category/almost-famous",
    "mothership.sg/category/mps-in-the-house",
    "mothership.sg/category/humour",
]

print("\nINITIALISED FIREBASEPOPULATE")
In [ ]:
# Mothership module: fetch each Mothership category page, collect the article
# links inside its "ind-article" divs, analyse every link with
# analyseArticle() and push the usable results with firebasePush().
#
# Depends on names created by earlier cells: mothershipURLs, log, start, timer
# and the helpers analyseArticle, firebasePush, firebaseRefresh, methylHalf.

# Per-outcome counters; their sum is the number of articles attempted so far.
mcount = 0  # analysed and pushed to Firebase
mnoteng = 0  # skipped: reported language is not "en"
mfailed = 0  # unexpected exception while analysing or pushing
mtooshort = 0  # skipped: analyseArticle returned "ZERO_SENTIMENT_ERROR"
mfetcherror = 0  # skipped: analyseArticle returned "FETCH_ERROR"

print("RUN MOTHERSHIP MODULE\n")
for URL in mothershipURLs:
    print("Retrieving URL...\n")
    try:
        # A timeout keeps one unresponsive page from hanging the whole crawl;
        # a timeout error is reported by the "Failed URL" handler below.
        sourceCode = requests.get("http://" + str(URL), timeout=30)
        soup = BeautifulSoup(sourceCode.content, "lxml")
        print("Target URL: " + str(URL))
        for div in soup.find_all("div", class_="ind-article"):
            for a in div.find_all("a"):
                href = a.get("href")
                # Anchors without an href yield None; testing `in None` would
                # raise TypeError and abandon the rest of this category page.
                if not href or "mothership.sg" not in href:
                    continue

                # Reset per article so the error dump below never reports the
                # previous article's data (or hits an unbound name).
                parameters = None
                try:
                    print(str(mcount + mnoteng + mfailed + mtooshort + mfetcherror + 1) + ": ", end="")
                    parameters = analyseArticle(href)

                    # Zero sentiment: article is too short or was redirected.
                    if parameters == "ZERO_SENTIMENT_ERROR":
                        mtooshort += 1
                        print("SKIPPING: ZERO_SENTIMENT_ERROR, NO SENTIMENT DETECTED! #" + str(mtooshort))
                        continue

                    # The article could not be downloaded.
                    if parameters == "FETCH_ERROR":
                        mfetcherror += 1
                        print("SKIPPING: FETCH_ERROR, COULD NOT DOWNLOAD ARTICLE! #" + str(mfetcherror))
                        continue

                    # Only English articles are pushed.
                    if str(parameters["language"]) != "en":
                        mnoteng += 1
                        print("SKIPPING: LANG_ERROR, ARTICLE NOT IN ENGLISH! #"
                              + str(mnoteng) + " (" + str(parameters["language"]) + ")")
                        continue

                    title = str(parameters["title"])
                    url = str(parameters["url"])
                    authors = parameters["authors"]
                    date = str(parameters["date"])
                    summary = str(parameters["summary"])
                    polarity = str(parameters["polarity"])
                    subjectivity = str(parameters["subjectivity"])
                    keywords = parameters["keywords"]
                    images = str(parameters["images"])
                    videos = str(parameters["videos"])
                    text = str(parameters["text"])
                    firebasePush(title, url, authors, date, summary, polarity,
                                 subjectivity, keywords, images, videos, text)
                    mcount += 1
                    print("Processed article #" + str(mcount))
                except Exception as ex:
                    # Best effort: record the failure and carry on with the
                    # next link. Message format matches the URL module.
                    mfailed += 1
                    print("FAILED article: #" + str(mfailed) + " | " + str(ex) + " | Moving on...")
                    log.write("\n\n ------------------------ ")
                    log.write("\n\nMOTHERSHIP MODULE UNKNOWN ERROR DUMP | Fetch #")
                    log.write(str(mcount + mnoteng + mfailed + mtooshort + mfetcherror))
                    log.write(": \n\n")
                    log.write("ERROR:")
                    log.write(str(traceback.format_exc()))  # FOR DEBUGGING
                    log.write("\n\n")
                    log.write("Data:")
                    log.write(str(parameters))  # FOR DEBUGGING
    except Exception as ex:
        # Fetching or parsing the category page itself failed.
        print("Failed URL", end=" | ")
        print(ex)

    print("\n ------------------------ ")
    string = "FINISHED: " + str(URL)
    print(string.center(63))
    log.write("PROCESSED: ")
    log.write(str(URL))
    log.write("\n")
    log.flush()
    print(" ------------------------ \n")

methylHalf()

print("\n ------------------------ ")
print(" FINISHED PROCESSING MOTHERSHIP")
log.write("FINISHED PROCESSING: ")
log.write("MOTHERSHIP")
log.write("\n\n")
print(" ------------------------ \n")

print("SUMMARY:")
checkpoint = timer()
print("Elapsed time: " + str(checkpoint - start) + " seconds\n")
log.write("Elapsed Time: " + str(checkpoint - start))
log.write("\n\n")
log.flush()  # was `log.flush` (attribute access only, never flushed)

print(str(mcount + mnoteng + mfailed + mtooshort + mfetcherror) + " Total Articles Accessed")
print(str(mcount) + " Processed Articles\n")
print(str(mnoteng) + " LANG_ERRORs (Article not in English)")
print(str(mtooshort) + " ZERO_SENTIMENT_ERRORs (No sentiment detected)")
print(str(mfetcherror) + " FETCH_ERRORs (Failed to fetch article)")
print(str(mfailed) + " Failed Articles\n")

firebaseRefresh()
time.sleep(1)
print(" ------------------------ ")
In [ ]:
# URL module: build each news domain with newspaper, analyse every article it
# lists with analyseArticle() and push the usable results with firebasePush().
# Prints a running summary after each domain and a final summary at the end,
# then closes the crawl log.
#
# Depends on names created by earlier cells: newsURLs, log, start, timer and
# the helpers analyseArticle, firebasePush, firebaseRefresh, methylHalf.

# Per-outcome counters; their sum is the number of articles attempted so far.
count = 0  # analysed and pushed to Firebase
noteng = 0  # skipped: reported language is not "en"
failed = 0  # unexpected exception while analysing or pushing
tooshort = 0  # skipped: analyseArticle returned "ZERO_SENTIMENT_ERROR"
fetcherror = 0  # skipped: analyseArticle returned "FETCH_ERROR"

print("RUN URL MODULE\n")
for URL in newsURLs:
    print("Building domain...\n")
    try:
        paper = newspaper.build("http://" + str(URL), memoize_articles=False)
        print("Domain building complete for: " + str(URL))
    except Exception as ex:
        print("Failed DOMAIN | " + str(ex) + " | moving on...")
        # Really move on: falling through would read `paper` while it is
        # unbound (first domain) or still holds the previous domain's articles.
        continue

    for article in paper.articles:
        # Reset per article so the error dump below never reports the
        # previous article's data (or hits an unbound name).
        parameters = None
        try:
            print(str(count + noteng + failed + tooshort + fetcherror + 1) + ": ", end="")
            parameters = analyseArticle(article.url)

            # Zero sentiment: article is too short or was redirected.
            if parameters == "ZERO_SENTIMENT_ERROR":
                tooshort += 1
                print("SKIPPING: ZERO_SENTIMENT_ERROR, NO SENTIMENT DETECTED! #" + str(tooshort))
                print(article.url)
                continue

            # The article could not be downloaded.
            if parameters == "FETCH_ERROR":
                fetcherror += 1
                print("SKIPPING: FETCH_ERROR, COULD NOT DOWNLOAD ARTICLE! #" + str(fetcherror))
                continue

            # Only English articles are pushed.
            if str(parameters["language"]) != "en":
                noteng += 1
                print("SKIPPING: LANG_ERROR, ARTICLE NOT IN ENGLISH! #"
                      + str(noteng) + " (" + str(parameters["language"]) + ")")
                print(article.url)
                continue

            title = parameters["title"]
            url = str(article.url)
            authors = parameters["authors"]
            date = str(parameters["date"])
            summary = str(parameters["summary"])
            polarity = str(parameters["polarity"])
            subjectivity = str(parameters["subjectivity"])
            keywords = parameters["keywords"]
            images = str(parameters["images"])
            videos = str(parameters["videos"])
            text = str(parameters["text"])
            firebasePush(title, url, authors, date, summary, polarity,
                         subjectivity, keywords, images, videos, text)
            count += 1
            print("Processed article #" + str(count))
        except Exception as ex:
            # Best effort: record the failure and carry on with the next article.
            failed += 1
            print("FAILED article: #" + str(failed) + " | " + str(ex) + " | Moving on...")
            log.write("\n\n ------------------------ ")
            log.write("\n\nURL MODULE UNKNOWN ERROR DUMP | Fetch #")
            log.write(str(count + noteng + failed + tooshort + fetcherror))
            log.write(": \n\n")
            log.write("ERROR:")
            log.write(str(traceback.format_exc()))  # FOR DEBUGGING
            log.write("\n\n")
            log.write("DATA:\n")
            log.write(str(parameters))  # FOR DEBUGGING

    print("\n ------------------------ ")
    string = "FINISHED: " + str(URL)
    print(string.center(63))
    log.write("PROCESSED: ")
    log.write(str(URL))
    log.write("\n")
    log.flush()
    print(" ------------------------ ")

    # Totals so far, reported after every domain.
    print("RUNNING SUMMARY:")
    checkpoint = timer()
    print("Elapsed time: " + str(checkpoint - start) + " seconds\n")
    log.write("Elapsed Time: " + str(checkpoint - start))
    log.write("\n\n")
    log.flush()  # was `log.flush` (attribute access only, never flushed)
    print(str(count + noteng + failed + tooshort + fetcherror) + " Total Articles Fetched")
    print(str(count) + " Processed Articles\n")
    print(str(noteng) + " LANG_ERRORs (Article not in English)")
    print(str(tooshort) + " ZERO_SENTIMENT_ERRORs (No sentiment detected)")
    print(str(fetcherror) + " FETCH_ERRORs (Failed to fetch article)")
    print(str(failed) + " Failed Articles\n")
    firebaseRefresh()
    time.sleep(1)
    print(" ------------------------ ")

methylHalf()

print("\n ------------------------ ")
print(" FINISHED PROCESSING URLS!")
log.write("FINISHED PROCESSING: ")
log.write("URLS")
log.write("\n\n")
print(" ------------------------ \n")

print("SUMMARY:")
print(str(count + noteng + failed + tooshort + fetcherror) + " Total Articles Accessed")
print(str(count) + " Processed Articles\n")
print(str(noteng) + " LANG_ERRORs (Article not in English)")
print(str(tooshort) + " ZERO_SENTIMENT_ERRORs (No sentiment detected)")
print(str(fetcherror) + " FETCH_ERRORs (Failed to fetch article)")
print(str(failed) + " Failed Articles\n")
print(" ------------------------ \n")

checkpoint = timer()
print("Elapsed time: " + str(checkpoint - start) + " seconds\n")
print("SHUTTING DOWN")
log.write("Elapsed Time: " + str(checkpoint - start))
log.write("\n\n")
log.write("SHUTTING DOWN")
log.flush()  # was `log.flush` (attribute access only, never flushed)
log.close()
In [ ]: