In [11]:
# - name
# - taxonomic classification
# - countries of occurrence,
# - endangered status,
# - ecology
########- generation length (not listed often enough?)
# - population trend (decreasing, increasing, unknown),
########- subspecies list (not gonna work)
# - listed on CITES or not (conservation actions)
# - threats
# try:
# # code to process download here
# except Exception as e: # most generic exception you can catch
# logf.write("Failed to download {0}: {1}\n".format(str(download), str(e)))
# # optional: delete local version of failed download
# finally:
from bs4 import BeautifulSoup
import pandas as pd
def getRedlistFeaturesDF(htmlSiteSoup):
    """Scrape one IUCN Red List species page into a one-row DataFrame.

    Parameters
    ----------
    htmlSiteSoup : bs4.BeautifulSoup
        Parsed HTML of a single Red List species page.

    Returns
    -------
    pandas.DataFrame
        One row with the scraped features; any feature that could not be
        extracted from the page is recorded as the string 'NA'.
    """
    # BUG FIX: the original body read the *global* `soup` instead of the
    # `htmlSiteSoup` parameter, so the function only worked by accident when
    # a global of that name happened to exist.
    featureList = {}
    ### animal name ###
    try:
        # Page <title> holds the species name; some titles also include a
        # nickname — process that out in a later cleaning stage, if desired.
        featureList['animalName'] = htmlSiteSoup.title.string
    except Exception:
        featureList['animalName'] = 'NA'
    ### taxonomy ###
    try:
        # First "tab_data" table lists the classification from kingdom down
        # to family (genus and species are not in this table).
        taxonomyCells = htmlSiteSoup.body.find_all('table', {'class': "tab_data"})[0].find_all('td')
        taxonomyStrList = [str(cell) for cell in taxonomyCells]
        # [4:-5] strips the literal '<td>' / '</td>' wrappers.
        KINGDOM = taxonomyStrList[0][4:-5]
        PHYLUM = taxonomyStrList[1][4:-5]
        CLASS = taxonomyStrList[2][4:-5]
        ORDER = taxonomyStrList[3][4:-5]
        FAMILY = taxonomyStrList[4][4:-5]
        featureList['KINGDOM'] = KINGDOM
        featureList['PHYLUM'] = PHYLUM
        featureList['CLASS'] = CLASS
        featureList['ORDER'] = ORDER
        featureList['FAMILY'] = FAMILY
    except Exception:
        # Any missing cell invalidates the whole classification.
        featureList['KINGDOM'] = 'NA'
        featureList['PHYLUM'] = 'NA'
        featureList['CLASS'] = 'NA'
        featureList['ORDER'] = 'NA'
        featureList['FAMILY'] = 'NA'
    ### countries of occurrence ###
    try:
        # Each "group" div is one occurrence category (e.g. Possibly extinct,
        # Regionally extinct, Native): first child is the group name, the
        # second holds the countries.
        # TO-DO: further country list parsing
        rangeList = htmlSiteSoup.body.find_all('div', {'class': "group"})
        fullListCountries = ''
        for group in rangeList:
            groupCountries = group.contents[1]
            # Prepend, so groups end up in reverse document order.
            fullListCountries = str(groupCountries) + "; " + fullListCountries
        featureList['COUNTRIES'] = fullListCountries
    except Exception:
        featureList['COUNTRIES'] = 'NA'
    ### current population trend ###
    try:
        popTrend = htmlSiteSoup.body.find_all('td', {'id': "popTrend"})
        featureList['POP_TREND'] = popTrend[0].find_all('span')[0].string
    except Exception:
        featureList['POP_TREND'] = 'NA'
    ### status ###
    try:
        statusInfo = htmlSiteSoup.body.find_all('table', {'class': "tab_data"})
        # Third tab_data table, second cell; still needs cleaning later
        # (grab the first string between the first two newlines).
        featureList['STATUS'] = statusInfo[2].find_all('td')[1].contents[0]
    except Exception:
        featureList['STATUS'] = 'NA'
    ### ecology ###
    try:
        # The habitat text sits next to the <strong>Systems:</strong> label.
        for label in htmlSiteSoup.body.find_all('strong'):
            if label.contents == ['Systems:']:
                # [4:-5] strips the surrounding markup tags.
                ECOLOGY = str(label.parent.next_sibling)[4:-5]
        # If no label matched, ECOLOGY is unbound and the NameError lands in
        # the except branch — same 'NA' fallback as the other sections.
        featureList['ECOLOGY'] = ECOLOGY
    except Exception:
        featureList['ECOLOGY'] = 'NA'
    ### threat list ###
    try:
        for label in htmlSiteSoup.body.find_all('strong'):
            if label.contents == ['Major Threat(s):']:
                THREAT_PARAGRAPH = label.parent.parent.find_all('td')[1].contents
        # Unbound THREAT_PARAGRAPH (no matching label) falls through to 'NA'.
        featureList['THREAT_PARAGRAPH'] = THREAT_PARAGRAPH
    except Exception:
        featureList['THREAT_PARAGRAPH'] = 'NA'
    ### listed on CITES or not (conservation actions) ###
    try:
        # BUG FIX: the original reused `threatInfo` from the previous try
        # block, so a failure there silently broke this section too; look the
        # labels up independently instead.
        for label in htmlSiteSoup.body.find_all('strong'):
            if label.contents == ['Conservation Actions:']:
                CONSERVATION_PARAGRAPH = label.parent.parent.find_all('td')[1].contents
        featureList['CONSERVATION_PARAGRAPH'] = str(CONSERVATION_PARAGRAPH)
    except Exception:
        featureList['CONSERVATION_PARAGRAPH'] = 'NA'
    # NOTE(review): THREAT_PARAGRAPH is stored as a raw contents *list*; if
    # that list ever has length != 1, this single-row DataFrame construction
    # raises (and callers treat the whole page as failed) — confirm that is
    # the intended behaviour before changing it.
    df = pd.DataFrame(featureList, dtype=None, index=[0])
    return df
# Initialize the output CSV with just the header row; the scraping loops
# below append one data row per page with header=False.
columns = ['htmlPage', 'animalName', 'KINGDOM', 'PHYLUM', 'CLASS', 'ORDER',
           'FAMILY', 'ECOLOGY', 'COUNTRIES', 'THREAT_PARAGRAPH',
           'CONSERVATION_PARAGRAPH', 'POP_TREND', 'STATUS']
featureList = pd.DataFrame(columns=columns)
# BUG FIX: the original wrote `newFeatureList`, which is undefined at this
# point; the freshly created (empty) `featureList` carries the header.
with open('redlistTable.csv', 'w') as f:
    featureList.to_csv(f, header=True)
## Collect the saved endangered-species HTML pages to scrape
import traceback
from os import listdir
from os.path import isfile, join

mypath = "/Users/tammi/Desktop/cs638project/redlist-endangered-html/"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

# Per-page failures are recorded in a log file instead of stopping the run.
import logging
logger = logging.getLogger('getFeatures')
hdlr = logging.FileHandler('//Users/tammi/Desktop/errorLog_endangered.txt')
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

# Scrape each saved page and append its single row to the shared CSV.
for index, file in enumerate(onlyfiles):
    try:
        # BUG FIX: use a context manager so the HTML file handle is always
        # closed (the original leaked one open handle per page).
        with open(join(mypath, file)) as page:
            soup = BeautifulSoup(page, "lxml")
        newFeatureList = getRedlistFeaturesDF(soup)
        newFeatureList['htmlPage'] = str(file)
        columns = ['animalName', 'htmlPage', 'KINGDOM', 'PHYLUM', 'CLASS',
                   'ORDER', 'FAMILY', 'ECOLOGY', 'COUNTRIES',
                   'THREAT_PARAGRAPH', 'CONSERVATION_PARAGRAPH',
                   'POP_TREND', 'STATUS']
        newFeatureList = newFeatureList[columns]
        with open('redlistTable.csv', 'a') as f:
            newFeatureList.to_csv(f, header=False)
    except Exception:
        # BUG FIX: the original swallowed every error with `x = 1`; record
        # which page failed via the logger configured above instead.
        logger.exception("failed to scrape %s", file)
###### NEXT SET OF HTML FILES ######
## Same pipeline for the critically-endangered pages; rows are appended to
## the same redlistTable.csv.
mypath = "/Users/tammi/Desktop/cs638project/redlist-critically-endangered-html/"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

# NOTE(review): this attaches a second FileHandler to the same named logger,
# so failures also land in any previously configured log file — confirm
# that duplication is intended.
import logging
logger = logging.getLogger('getFeatures')
hdlr = logging.FileHandler('//Users/tammi/Desktop/errorLog_critEndangered.txt')
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

# Scrape each saved page and append its single row to the shared CSV.
for index, file in enumerate(onlyfiles):
    try:
        # BUG FIX: use a context manager so the HTML file handle is always
        # closed (the original leaked one open handle per page).
        with open(join(mypath, file)) as page:
            soup = BeautifulSoup(page, "lxml")
        newFeatureList = getRedlistFeaturesDF(soup)
        newFeatureList['htmlPage'] = str(file)
        columns = ['animalName', 'htmlPage', 'KINGDOM', 'PHYLUM', 'CLASS',
                   'ORDER', 'FAMILY', 'ECOLOGY', 'COUNTRIES',
                   'THREAT_PARAGRAPH', 'CONSERVATION_PARAGRAPH',
                   'POP_TREND', 'STATUS']
        newFeatureList = newFeatureList[columns]
        with open('redlistTable.csv', 'a') as f:
            newFeatureList.to_csv(f, header=False)
    except Exception:
        # BUG FIX: log the failing page instead of the original `x = 1` no-op.
        logger.exception("failed to scrape %s", file)
# columns =['animalName','htmlPage','KINGDOM','PHYLUM','CLASS','ORDER','FAMILY','ECOLOGY','COUNTRIES','THREAT_PARAGRAPH','CONSERVATION_PARAGRAPH','POP_TREND','STATUS']
# featureList = featureList[columns]
# featureList.to_csv('redlistTable.csv')
# featureList.to_csv('critEndangeredTable.csv')
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [43]:
# - name
# - taxonomic classification
# - countries of occurrence,
# - endangered status,
# - ecology
########- generation length (not listed often enough?)
# - population trend (decreasing, increasing, unknown),
########- subspecies list (not gonna work)
# - listed on CITES or not (conservation actions)
# - threats
# try:
# # code to process download here
# except Exception as e: # most generic exception you can catch
# logf.write("Failed to download {0}: {1}\n".format(str(download), str(e)))
# # optional: delete local version of failed download
# finally:
# Scratch cell: build an empty results frame, pick a single saved HTML page,
# and configure error logging before the trial scrape below.
from bs4 import BeautifulSoup
import pandas as pd

# Empty accumulator frame, one column per scraped feature.
columns =['animalName','KINGDOM','PHYLUM','CLASS','ORDER','FAMILY','ECOLOGY','COUNTRIES','THREAT_PARAGRAPH','CONSERVATION_PARAGRAPH','POP_TREND','STATUS']
featureList = pd.DataFrame(columns=columns)

## Collect the saved page filenames
import traceback
from os import listdir
from os.path import isfile, join

mypath = "/Users/tammi/Desktop/cs638project/redlist-endangered-html/"
onlyFiles = [name for name in listdir(mypath) if isfile(join(mypath, name))]
# Keep a single page for this trial run.
onlyFiles = onlyFiles[1:2]

# Route per-page errors to a dedicated log file.
import logging
logger = logging.getLogger('getFeatures')
hdlr = logging.FileHandler('//Users/tammi/Desktop/getFeatures_errorLog.txt')
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)
# Trial scrape of one page (hard-coded to 1087.html), with verbose printing;
# this is the exploratory version of the getRedlistFeaturesDF logic above.
for index, file in enumerate(onlyFiles):
    try:
        print(file)
        # Debug override: always parse this known page.
        file = '1087.html'
        # BUG FIX: use a context manager so the HTML file handle is closed
        # (the original passed open(...) straight to BeautifulSoup and
        # leaked the handle).
        with open(mypath + file) as page:
            soup = BeautifulSoup(page, "lxml")
        featureList = {}
        ### animal name ###
        # Some titles include a nickname; process that out in a later
        # cleaning stage, if desired.
        animalName = soup.title.string
        featureList['animalName'] = animalName
        ### taxonomy ###
        # First "tab_data" table: kingdom..family cells (genus/species absent).
        taxonomyList = soup.body.find_all('table', {'class': "tab_data"})[0].find_all('td')
        taxonomyStrList = []
        for index, item in enumerate(taxonomyList):
            taxonomyStrList.append(str(item))
        # [4:-5] strips the '<td>' / '</td>' wrappers.
        KINGDOM = taxonomyStrList[0][4:-5]
        PHYLUM = taxonomyStrList[1][4:-5]
        CLASS = taxonomyStrList[2][4:-5]
        ORDER = taxonomyStrList[3][4:-5]
        FAMILY = taxonomyStrList[4][4:-5]
        featureList['KINGDOM'] = KINGDOM
        featureList['PHYLUM'] = PHYLUM
        featureList['CLASS'] = CLASS
        featureList['ORDER'] = ORDER
        featureList['FAMILY'] = FAMILY
        print(KINGDOM)
        print(PHYLUM)
        print(CLASS)
        print(ORDER)
        print(FAMILY)
        ### countries of occurrence ###
        # Each "group" div is one occurrence category (e.g. Possibly extinct,
        # Regionally extinct, Native); first child is the group name, the
        # rest are countries.
        rangeList = soup.body.find_all('div', {'class': "group"})
        countriesPerGroupType = []
        for index, item in enumerate(rangeList):
            listCountries = item.contents
            countriesPerGroupType.append(listCountries)
        # TO-DO: further country list parsing
        print()
        fullListCountries = ''
        for groupNum, group in enumerate(countriesPerGroupType):
            groupName = group[0]
            groupCountries = group[1]
            # BUG FIX: groupCountries is a bs4 node, not a str; wrap it in
            # str() (as the function version above does) so the
            # concatenation cannot raise TypeError.
            fullListCountries = str(groupCountries) + "; " + fullListCountries
        COUNTRIES = fullListCountries
        featureList['COUNTRIES'] = COUNTRIES
        ### current population trend ###
        threatInfo = soup.body.find_all('strong')
        for index, item in enumerate(threatInfo):
            if item.contents == ['Current Population Trend:']:
                threatText = item.parent.next_sibling.parent.find_all('p')
        popTrend = soup.body.find_all('td', {'id': "popTrend"})
        POP_TREND = popTrend[0].find_all('span')[0].string
        featureList['POP_TREND'] = POP_TREND
        ### threat list ###
        # Exploratory: just print the node next to the Major Threat(s) label.
        threatInfo = soup.body.find_all('strong')
        for index, item in enumerate(threatInfo):
            if item.contents == ['Major Threat(s):']:
                threatText = item.parent.next_sibling
                print(threatText)
                print(threatText)
        ### listed on CITES or not (conservation actions) ###
        for index, item in enumerate(threatInfo):
            if item.contents == ['Conservation Actions:']:
                CONSERVATION_PARAGRAPH = item.parent.parent.find_all('td')[1].find_all('p')
                # Concatenate the paragraph pieces (prepending, so the order
                # ends up reversed).
                fullString = ''
                for index, item in enumerate(CONSERVATION_PARAGRAPH):
                    fullString = str(item.contents) + fullString
                CONSERVATION_PARAGRAPH = fullString
                featureList['CONSERVATION_PARAGRAPH'] = CONSERVATION_PARAGRAPH
    except Exception as e:
        # Print the traceback for the notebook session and also log it.
        traceback.print_exc()
        logger.exception("oops!")
In [16]:
### status ###
# NOTE(review): this cell relies on `soup`, `featureList`, and `threatInfo`
# left over from the previous cell's run — it is not standalone.
statusInfo = soup.body.find_all('table',{'class':"tab_data"})
# again, need to clean this bit by grabbing first string between first two new lines
STATUS = statusInfo[2].find_all('td')[1].contents[0]
print(STATUS)
featureList['STATUS'] = STATUS
### ecology ###
# Find the <strong>Systems:</strong> label; its parent's sibling holds the
# habitat text.
ecologyInfo = soup.body.find_all('strong')
for index, item in enumerate(ecologyInfo):
if item.contents == ['Systems:']:
ecology = str(item.parent.next_sibling)
# [4:-5] strips the surrounding markup tags.
ECOLOGY = ecology[4:-5]
ECOLOGY
featureList['ECOLOGY'] = ECOLOGY
### listed on CITES or not (conservation actions) ###
for index, item in enumerate(threatInfo):
# print(item.contents)
if item.contents == ['Conservation Actions:']:
# print(item)
CONSERVATION_PARAGRAPH = item.parent.parent.find_all('strong')[1].contents
# print(threatText.contents)
fullString = ''
# Concatenate the paragraph pieces (prepending, so order ends up reversed).
for index, item in enumerate(CONSERVATION_PARAGRAPH):
# print(item)
# print()
fullString = str(item) + fullString
CONSERVATION_PARAGRAPH = fullString
# print(CONSERVATION_PARAGRAPH)
featureList['CONSERVATION_PARAGRAPH'] = CONSERVATION_PARAGRAPH
# featureList = {}
# featureList = getRedlistFeatures(soup)
# print(featureList)
# print(featureList.keys())
columns =['animalName','KINGDOM','PHYLUM','CLASS','ORDER','FAMILY','ECOLOGY','COUNTRIES','THREAT_PARAGRAPH','CONSERVATION_PARAGRAPH','POP_TREND','STATUS']
# print(len(columns))
# print(featureList.items())
# df = pd.DataFrame.from_items(featureList.items())#, columns=['Date', 'DateValue'])
df = pd.DataFrame(featureList,dtype=None,index=[0])
# print(df)
newFeatureList = df
# NOTE(review): `featureList` was rebound to a plain dict earlier in this
# notebook, and dicts have no .append — presumably this line ran against an
# earlier cell state where featureList was still a DataFrame; verify. Also
# DataFrame.append was removed in pandas 2.0 (use pd.concat instead).
featureList = featureList.append(newFeatureList, ignore_index=True)
featureList.to_csv('example.csv')
In [ ]: