In [2]:
# NOTE(review): `Image` is imported only in a later cell (In [4], see the
# `from IPython.display import Image` line) — on Restart & Run All this cell
# raises NameError. Move the imports cell to the top of the notebook.
Image("img/init.png")
Out[2]:
In [3]:
# NOTE(review): as with the previous cell, `Image` is imported only in the
# later imports cell (In [4]) — this fails on a fresh kernel run.
Image("img/target_result.png")
Out[3]:
In [4]:
# FOR WEB SCRAPING
from lxml import html
import requests
# FOR FUNCTIONAL PROGRAMMING
import cytoolz # pipe
# FOR DATA WRANGLING
import pandas as pd # use of R like dataframes
import re #re for regular expressions
# TO INSERT IMAGES
from IPython.display import Image
In [44]:
### Target URL: WHO Disease Outbreak News archive for Zika virus infection
outbreakNewsURL = "http://www.who.int/csr/don/archive/disease/zika-virus-infection/en/"
# Fetch the archive page. The timeout keeps the cell from hanging forever on
# a dead connection, and raise_for_status() fails loudly on HTTP errors
# instead of silently parsing an error page.
page = requests.get(outbreakNewsURL, timeout=30)
page.raise_for_status()
tree = html.fromstring(page.content)
# Each news entry on the archive page is rendered as an <li> element.
newsXPath = '//li'
zikaNews = tree.xpath(newsXPath)
In [21]:
### Flatten each matched <li> node into its raw text content
# (dates and headline text run together; cleaned up in later cells).
zikaNews_dirty = []
for node in zikaNews:
    zikaNews_dirty.append(node.text_content())
In [22]:
# Preview the scraped entries: slice [1:20] skips the first element and
# yields elements 1-19 (19 items — not 20 as the original comment claimed).
zikaNews_dirty[1:20] # omitting first element
Out[22]:
In [9]:
# Illustration of the flattened tree data (img/flatten_tree_data.png).
Image("img/flatten_tree_data.png")
Out[9]:
In [23]:
# Keep only the entries whose text contains the headline marker
# "Zika virus infection " (trailing space deliberate — it matches the
# "Zika virus infection – Country" form shown in the sample below).
#sample= '\n22 April 2016\n\t\t\tZika virus infection – Papua New Guinea - USA\n'
keywdEN = "Zika virus infection "
# Plain substring membership replaces re.search: the keyword contains no
# regex metacharacters, so `in` is equivalent, clearer, and faster.
zikaNews_content = [s for s in zikaNews_dirty if keywdEN in s]
In [24]:
zikaNews_content[0:10] # first 10 elements (slice 0:10 excludes index 10 — not 11 as originally stated)
Out[24]:
In [25]:
#### Small text-cleaning helpers.
# `def` replaces the original lambda assignments (PEP 8 E731: a named
# function should be a def, which also gives useful tracebacks and docstrings).
def substitudeUnicodeDash(s):
    """Replace each Unicode en dash (–) with '@'.

    NOTE(review): defined but not used by the cleaning pipe in the next
    cell — confirm whether it can be removed.
    """
    return re.sub(u'–', "@", s)

def substituteNonUnicode(s):
    """Collapse every whitespace character (regex \\s: space, tab, newline)
    into a single space.

    NOTE(review): the name is misleading — it substitutes whitespace,
    not non-Unicode characters.
    """
    return re.sub(r"\s", " ", s)

def removeSpace(s):
    """Strip leading and trailing whitespace from ``s``."""
    return s.strip()
In [27]:
# Chain the cleaning helpers with cytoolz.pipe (left-to-right function
# application, analogous to R dplyr's %>%): strip the outer whitespace
# first, then collapse any remaining whitespace runs to single spaces.
zikaNews_dirty = [
    cytoolz.pipe(entry, removeSpace, substituteNonUnicode)
    for entry in zikaNews_content
]
In [28]:
# Split each cleaned entry around the headline marker, producing
# [date_part, locations_part] pairs for the DataFrame below.
marker = "Zika virus infection"
zikaNews_dirty = [entry.split(marker) for entry in zikaNews_dirty]
In [31]:
# Each entry is now a [date_part, locations_part] pair after the split.
zikaNews_dirty[0:10]
Out[31]:
In [71]:
# Promote the [date, locations] pairs into a labelled pandas DataFrame.
zika = pd.DataFrame(data=zikaNews_dirty, columns=["Date", "Locations"])
In [72]:
# First look at the structured data: 20 rows of Date / Locations.
zika.head(20)
Out[72]:
In [73]:
### Remove the leading dash token from zika["Locations"]
# Vectorised with the pandas `.str` accessor instead of the original
# per-row `pd.Series(generator)` over `.iloc` (same result, clearer, and
# index-safe — generator-built Series align by position only when the
# index is the default RangeIndex):
#   1. tokenize on whitespace            -> .str.split()
#   2. drop the first token (the dash)   -> .str[1:]
#   3. re-join the remaining tokens      -> .str.join(" ")
#   4. split on "-" to separate country from territory
zika["Split_Locations"] = (
    zika["Locations"]
    .str.split()
    .str[1:]
    .str.join(" ")
    .str.split("-")
)
# Tokenize the date string, e.g. "21 January 2016" -> ["21", "January", "2016"]
zika["Split_Date"] = zika["Date"].str.split()
In [74]:
# Peek at the first ten rows of the wrangled frame.
zika.head(10)
Out[74]:
In [75]:
### Extract Day / Month / Year from Split_Date; each row is of the form ["21", "January", "2016"]
# `.str[i]` indexes into each row's token list — a vectorised replacement
# for the original per-row generator over `.iloc`.
zika["Day"] = zika["Split_Date"].str[0]
zika["Month"] = zika["Split_Date"].str[1]
zika["Year"] = zika["Split_Date"].str[2]
In [76]:
# Confirm the new Day / Month / Year columns.
zika.head(10)
Out[76]:
In [77]:
# Extract Country (first "-"-separated token) and Territory (last token).
# `.str[0]` / `.str[-1]` replace the verbose per-row `len(...) - 1`
# indexing of the original and are vectorised. Rows whose location list has
# a single token end up with Country == Territory (handled in a later cell).
zika["Country"] = zika["Split_Locations"].str[0]
zika["Territory"] = zika["Split_Locations"].str[-1]
In [78]:
# Side-by-side check of the raw token list against the extracted fields.
cols_of_interest = ['Split_Locations', 'Country', 'Territory']
zika[cols_of_interest].head(20)
Out[78]:
In [83]:
# Blank out Territory (set to " ") when it merely repeats Country, i.e. for
# single-token locations. `Series.where` keeps values where the condition
# holds and substitutes the fallback elsewhere — a vectorised, index-safe
# replacement for the original per-row generator with chained [] indexing.
zika["Territory"] = zika["Territory"].where(zika["Territory"] != zika["Country"], " ")
In [84]:
# Re-check after the cleanup: Territory is blank where it duplicated Country.
cols_of_interest = ['Split_Locations', 'Country', 'Territory']
zika[cols_of_interest].head(20)
Out[84]: