This notebook provides a simple example of using Python to extract information from a website. Note that this is part 1 of 4. Participants are not expected to have any experience with Python or any background in web scraping; however, some understanding of HTML will be useful.
This draws inspiration from http://web.stanford.edu/~zlotnick/TextAsData/Web_Scraping_with_Beautiful_Soup.html
In [58]:
# Import dependencies (i.e. packages that extend the standard language to provide specific [advanced] functionality)
import urllib
import urllib2
from datetime import datetime, date, timedelta
from bs4 import BeautifulSoup
From analysing the Newsday archive website we see that the URL follows a parsable convention: http://www.newsday.co.tt/archives/YYYY-M-DD.html. So our general approach will be as follows:
1. Generate dates in the expected format between a starting and an ending date (a quick illustration of the URL format follows this list).
2. Test the generated dates to ensure they are valid (refine step 1 based on the results).
3. Read the content and process it based on our goal for scraping the page.
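As a quick illustrative sketch (the date value here is an arbitrary example), the cell below shows how a single date maps to an archive URL. Note that the `%-m` directive (month without a leading zero) is a platform-specific strftime extension available on Linux/macOS; on Windows the equivalent is `%#m`.
In [ ]:
# Illustrative sketch: format one example date into the archive URL convention
from datetime import date
sample_date = date(2017, 2, 2)  # arbitrary example date
print("http://www.newsday.co.tt/archives/" + sample_date.strftime('%Y-%-m-%d'))
# -> http://www.newsday.co.tt/archives/2017-2-2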
In [ ]:
# Step 1 - create a function that generates a list (array) of dates
def genDatesNewsDay(start_date=date.today(), num_days=3):
    # date_list = [start_date - timedelta(days=x) for x in range(0, num_days)]  # one-line equivalent
    # We expand that one-liner below for beginners' understanding
    date_list = []
    for d in range(0, num_days):
        temp = start_date - timedelta(days=d)
        date_list.append(temp.strftime('%Y-%-m-%d'))  # http://strftime.org/ used as a reference
    return date_list
In [ ]:
# Step 2 - Test the generated URLs to ensure they point to valid pages
def traverseDatesNewsDay(func, start_date=date.today(), num_days=3):
    base_url = "http://www.newsday.co.tt/archives/"
    dates_str_list = genDatesNewsDay(start_date, num_days)
    for date_str in dates_str_list:  # avoid shadowing the imported `date`
        url = base_url + date_str
        func(url)

def printDate(url):
    print(url)

traverseDatesNewsDay(printDate)
In [20]:
from dateutil.relativedelta import relativedelta
# Example archive URL: http://www.guardian.co.tt/archive/2017-02?page=3
base_url = "http://www.guardian.co.tt/archive/"
# print date.today().strftime("%Y-%-m")
dates_str_list = []
page_content_list = []
for i in range(0, 12):
    d = date.today() - relativedelta(months=+i)
    page_url = base_url + d.strftime("%Y-%-m")
    dates_str_list.append(page_url)
    try:
        page_content_list.append(urllib.urlopen(page_url).read())
    except IOError:
        print "Unable to find content for {0}".format(page_url)
In [22]:
print "Generated {0} urls and retrieved {1} pages".format(len(dates_str_list), len(page_content_list))
In [41]:
url = dates_str_list[0]
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3004.3 Safari/537.36"
accept_language = "en-GB,en-US;q=0.8,en;q=0.6"
request = urllib2.Request("http://www.guardian.co.tt/archive/2017-2")
request.add_header('User-Agent', user_agent)
request.add_header('Accept-Language', accept_language)
content = urllib2.build_opener().open(request).read()
In [59]:
def fetch_content(url):
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3004.3 Safari/537.36"
    accept_language = "en-GB,en-US;q=0.8,en;q=0.6"
    request = urllib2.Request(url)
    request.add_header('User-Agent', user_agent)
    request.add_header('Accept-Language', accept_language)
    content = urllib2.build_opener().open(request).read()
    return content
In [55]:
beau = BeautifulSoup(content, "html5lib")
main_block = beau.find(id="block-system-main")
links = main_block.find_all("div", class_="view-content")[0].find_all('a')
last = main_block.find("li", class_="pager-last last")
max_pages = int(last.find("a")['href'].split("=")[1])
pages_list = range(1, max_pages+1)
# len(links)
In [60]:
# url = "http://www.guardian.co.tt/archive/2017-2"
# page = pages_list[0]
# url = "{0}?page={1}".format(url, page)
# content = fetch_content(url)
In [67]:
base_url = "http://www.guardian.co.tt/"
stories = []
stories_links = []
for pg in links:
url = base_url + pg['href']
stories_links.append(url)
stories.append( fetch_content(url) )
In [83]:
first = True
emo_count = {
    "anger": 0,
    "disgust": 0,
    "fear": 0,
    "joy": 0,
    "sadness": 0
}
socio_count = {
    "openness_big5": 0,
    "conscientiousness_big5": 0,
    "extraversion_big5": 0,
    "agreeableness_big5": 0,
    "emotional_range_big5": 0
}
In [84]:
for story in stories:
    beau = BeautifulSoup(story, "html5lib")
    # main_block = beau.find("h1", class_="title")
    paragraphs = beau.find(id="block-system-main").find_all("p")
    page_text = ""
    for p in paragraphs:
        page_text += p.get_text()
    tone_analyzer = getAnalyser()
    res = tone_analyzer.tone(page_text)
    tone = res['document_tone']['tone_categories']
    emo = tone[0]['tones']   # we want the emotional tone
    soci = tone[2]['tones']  # we also want the social tone
    e_res = processTone(emo)
    emo_count[e_res['tone_id']] += 1
    s_res = processTone(soci)
    socio_count[s_res['tone_id']] += 1
In [85]:
for e in emo_count:
    print("{0} articles were classified with the emotion {1}".format(emo_count[e], e))
for s in socio_count:
    print("{0} articles were classified as {1}".format(socio_count[s], s))
In [64]:
# Step 3 - Read content and process page
def processPage(page_url):
    print("Attempting to read content from {0}".format(page_url))
    page_content = urllib.urlopen(page_url).read()
    beau = BeautifulSoup(page_content, "html5lib")
    tables = beau.find_all("table")  # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all
    for i in range(0, len(tables)):  # iterate over however many tables the page has
        named_sec = tables[i].h3
        if named_sec:
            print("i {0} produced {1}".format(i, named_sec))
    article_links = beau.find_all("a", 'title')
    print("Found {0} tables and {1} articles".format(len(tables), len(article_links)))
In [11]:
# traverseDatesNewsDay(processPage,num_days = 1)
Our main purpose in developing this exercise was to test the claim that the majority of news published is negative. To do this we need to capture the sentiment of the information extracted from each link. While we could develop sentiment-analysis tools in Python ourselves, the process of training and validating a model is too much work at this time. Therefore, we utilise the IBM Watson Tone Analyzer API. We selected this API because it provides a greater amount of detail than a binary positive-or-negative result.
To use the Watson API from Python:
pip install --upgrade watson-developer-cloud
We created an account (free for 30 days) at https://tone-analyzer-demo.mybluemix.net/
Use the API reference to build the application: http://www.ibm.com/watson/developercloud/tone-analyzer/api/v3/?python#
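As a minimal sketch of a single call (assuming credentials are stored in the same `local_settings` module used in the cells below; the sample sentence is the one from the API demo further down):
In [ ]:
# Minimal end-to-end call to the Tone Analyzer (illustrative only)
import json
from watson_developer_cloud import ToneAnalyzerV3
from local_settings import WATSON_CREDS  # assumes a dict with 'username' and 'password'

analyzer = ToneAnalyzerV3(
    username=WATSON_CREDS['username'],
    password=WATSON_CREDS['password'],
    version='2016-05-19')
res = analyzer.tone(text='A word is dead when it is said, some say. Emily Dickinson')
print(json.dumps(res, indent=2))  # inspect the full response structure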
In [12]:
# Integrating IBM Watson
import json
from watson_developer_cloud import ToneAnalyzerV3
from local_settings import *
In [13]:
def getAnalyser():
    tone_analyzer = ToneAnalyzerV3(
        username=WATSON_CREDS['username'],
        password=WATSON_CREDS['password'],
        version='2016-05-19')
    return tone_analyzer
In [14]:
# tone_analyzer = getAnalyser()
# tone_analyzer.tone(text='A word is dead when it is said, some say. Emily Dickinson')
In [15]:
def analysePage(page_url):
    page_content = urllib.urlopen(page_url).read()
    beau = BeautifulSoup(page_content, "html5lib")
    tables = beau.find_all("table")  # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all
    article_links = beau.find_all("a", 'title')
    print("Found {0} tables and {1} articles".format(len(tables), len(article_links)))
    for i in article_links:
        print i
In [16]:
# traverseDatesNewsDay(analysePage,num_days = 1)
In [17]:
page_content = urllib.urlopen("http://www.newsday.co.tt/archives/2017-2-2").read()
beau = BeautifulSoup(page_content, "html5lib")
tables = beau.find_all("table") #https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all
article_links = beau.find_all("a", 'title')
print("Found {0} tables and {1} articles".format(len(tables), len(article_links)))
In [18]:
def processTone(tone):
    # Return the tone entry with the highest score
    large = tone[0]['score']
    large_i = 0
    for i in range(1, len(tone)):
        if tone[i]['score'] > large:
            large = tone[i]['score']
            large_i = i
    return tone[large_i]
The structure of the response is described in the API reference:
https://www.ibm.com/watson/developercloud/tone-analyzer/api/v3/?python#post-tone
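As a rough illustration (the field names follow the response shape the code above already consumes; the scores here are made-up example values), each category's `tones` list looks like the one below, and processTone simply returns its highest-scoring entry:
In [ ]:
# Illustrative shape of one entry's 'tones' list from
# res['document_tone']['tone_categories'] (scores are made-up examples)
sample_tones = [
    {"tone_id": "anger", "tone_name": "Anger", "score": 0.1},
    {"tone_id": "joy", "tone_name": "Joy", "score": 0.7},
    {"tone_id": "sadness", "tone_name": "Sadness", "score": 0.2}
]
print(processTone(sample_tones))  # -> the 'joy' entry (highest score)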
In [19]:
first = True
emo_count = {
    "anger": 0,
    "disgust": 0,
    "fear": 0,
    "joy": 0,
    "sadness": 0
}
socio_count = {
    "openness_big5": 0,
    "conscientiousness_big5": 0,
    "extraversion_big5": 0,
    "agreeableness_big5": 0,
    "emotional_range_big5": 0
}
tone_analyzer = getAnalyser()  # needed here since the earlier call was commented out
for i in article_links:
    res = tone_analyzer.tone(i['title'])
    tone = res['document_tone']['tone_categories']
    emo = tone[0]['tones']   # we want the emotional tone
    soci = tone[2]['tones']  # we also want the social tone
    e_res = processTone(emo)
    emo_count[e_res['tone_id']] += 1
    s_res = processTone(soci)
    socio_count[s_res['tone_id']] += 1
In [ ]:
for e in emo_count:
    print("{0} articles were classified with the emotion {1}".format(emo_count[e], e))
In [ ]:
for s in socio_count:
    print("{0} articles were classified as {1}".format(socio_count[s], s))