In [11]:
# - name
# - taxonomic classification
# - countries of occurrence, 
# - endangered status,
# - ecology
########- generation length (not listed often enough?)
# - population trend (decreasing, increasing, unkown),
########- subspecies list (not gonna work)
# - listed on CITES or not (conservation actions)
# - threats

#     try:
#         # code to process download here
#     except Exception as e:     # most generic exception you can catch
#         logf.write("Failed to download {0}: {1}\n".format(str(download), str(e)))
#         # optional: delete local version of failed download
#     finally:


from bs4 import BeautifulSoup
import pandas as pd


def getRedlistFeaturesDF( htmlSiteSoup ):

    featureList = {}

    ### animal name ###
    try:
        animalName = soup.title.string 
        animalName
        # seems that some include nickname, see how many later after table is made
        # process out nickname in cleaning stage, if desired
        featureList['animalName'] = animalName
    except Exception as e:     # most generic exception you can catch
        featureList['animalName'] = 'NA'


    ### taxonomy ###
    try:
        taxonomyList = soup.body.find_all('table',{'class':"tab_data"})[0].find_all('td') # returns list of taxonomic categorization from kindgom to family (skips genus and species)
        taxonomyStrList = []

        for index, item in enumerate(taxonomyList):
            taxonomyStrList.append(str(item)) #= str()
        KINGDOM = taxonomyStrList[0][4:-5]
        PHYLUM = taxonomyStrList[1][4:-5]
        CLASS = taxonomyStrList[2][4:-5]
        ORDER = taxonomyStrList[3][4:-5]
        FAMILY = taxonomyStrList[4][4:-5]
        featureList['KINGDOM'] = KINGDOM
        featureList['PHYLUM'] = PHYLUM
        featureList['CLASS'] = CLASS
        featureList['ORDER'] = ORDER
        featureList['FAMILY'] = FAMILY
    except Exception as e:     # most generic exception you can catch
        featureList['KINGDOM'] = 'NA'
        featureList['PHYLUM'] = 'NA'
        featureList['CLASS'] = 'NA'
        featureList['ORDER'] = 'NA'
        featureList['FAMILY'] = 'NA'
        
        
        

    ### countries of occurence ###
    try:
        rangeList = soup.body.find_all('div',{'class':"group"})

        countriesPerGroupType = []

        # loop through groups (e.g. Possibly exting, Regionally extinct, Native)
        for index, item in enumerate(rangeList):
            listCountries = item.contents # list with first item as group name, and rest as countries
            countriesPerGroupType.append(listCountries)

        # TO-DO: further country list parsing
        fullListCountries = ''
        for groupNum, group in enumerate(countriesPerGroupType):
            groupName = group[0]
            groupCountries = group[1]
            fullListCountries = str(groupCountries) +  "; " + fullListCountries

        COUNTRIES = fullListCountries
        COUNTRIES
        featureList['COUNTRIES'] = COUNTRIES
    except Exception as e:     # most generic exception you can catch
        featureList['COUNTRIES'] = 'NA'
        

    ### current population trend ###
    try:
        popTrend = soup.body.find_all('td',{'id':"popTrend"})
        POP_TREND = popTrend[0].find_all('span')[0].string
        POP_TREND
        featureList['POP_TREND'] = POP_TREND
    except Exception as e:     # most generic exception you can catch
        featureList['POP_TREND'] = 'NA'


    ### status ### 
    try:
        statusInfo = soup.body.find_all('table',{'class':"tab_data"})
        # again, need to clean this bit by grabbing first string between first two new lines
        STATUS = statusInfo[2].find_all('td')[1].contents[0]
        featureList['STATUS'] = STATUS
    except Exception as e:     # most generic exception you can catch
        featureList['STATUS'] = 'NA'


    ### ecology ###
    try:
        ecologyInfo = soup.body.find_all('strong')
        for index, item in enumerate(ecologyInfo):
            if item.contents == ['Systems:']:
                ecology = str(item.parent.next_sibling)

        ECOLOGY = ecology[4:-5]
        ECOLOGY
        featureList['ECOLOGY'] = ECOLOGY
    except Exception as e:    
        featureList['ECOLOGY']  = 'NA'


    ### threat list ###
    try:   
        threatInfo = soup.body.find_all('strong')
        for index, item in enumerate(threatInfo):
            if item.contents == ['Major Threat(s):']:
#                 threatText = item.parent.next_sibling.parent.find_all('p')
                THREAT_PARAGRAPH = item.parent.parent.find_all('td')[1].contents
#         THREAT_PARAGRAPH = threatText[1].contents[0]
        THREAT_PARAGRAPH
        featureList['THREAT_PARAGRAPH'] = THREAT_PARAGRAPH
    
    except Exception as e:     # most generic exception you can catch
        featureList['THREAT_PARAGRAPH'] = 'NA'

    
#     threatInfo = soup.body.find_all('strong')
#     for index, item in enumerate(threatInfo):
#         if item.contents == ['Major Threat(s):']:
#             threatText = item.parent.next_sibling.parent.find_all('p')

#     THREAT_PARAGRAPH = threatText[1].contents[0]
#     THREAT_PARAGRAPH
#     featureList['THREAT_PARAGRAPH'] = THREAT_PARAGRAPH



    ### listed on CITES or not (conservation actions) ###
    try: 
        for index, item in enumerate(threatInfo):
        #     print(item.contents)
            if item.contents == ['Conservation Actions:']:
    #                 print(item.parent.parent.find_all('td'))
                CONSERVATION_PARAGRAPH = item.parent.parent.find_all('td')[1].contents

    #             print(CONSERVATION_PARAGRAPH.contents)
    #         fullString = ''
    #         for index, item in enumerate(CONSERVATION_PARAGRAPH):
    #             fullString = str(item.contents) + fullString

    #         CONSERVATION_PARAGRAPH = fullString
    #         print('CONSERVATION_PARAGRAPH' + CONSERVATION_PARAGRAPH) 

        featureList['CONSERVATION_PARAGRAPH'] = str(CONSERVATION_PARAGRAPH)

    except Exception as e:     # most generic exception you can catch
        featureList['CONSERVATION_PARAGRAPH'] = 'NA'

    
    
    # featureList = {}
    # featureList = getRedlistFeatures(soup)
#     print(featureList)
#     print(featureList.keys())
    columns =['htmlPage','animalName','KINGDOM','PHYLUM','CLASS','ORDER','FAMILY','ECOLOGY','COUNTRIES','THREAT_PARAGRAPH','CONSERVATION_PARAGRAPH','POP_TREND','STATUS']
#     print(len(columns))
#     print(featureList.items())
#     df = pd.DataFrame.from_items(featureList.items())#, columns=['Date', 'DateValue'])
    df = pd.DataFrame(featureList,dtype=None,index=[0])
#     print(df)
    return df


# Initialize df to save to csv    
columns =['htmlPage','animalName','KINGDOM','PHYLUM','CLASS','ORDER','FAMILY','ECOLOGY','COUNTRIES','THREAT_PARAGRAPH','CONSERVATION_PARAGRAPH','POP_TREND','STATUS']
featureList = pd.DataFrame(columns=columns)
with open('redlistTable.csv', 'w') as f:
    newFeatureList.to_csv(f, header=True)

## Get list of links
import traceback
from os import listdir
from os.path import isfile, join
mypath = "/Users/tammi/Desktop/cs638project/redlist-endangered-html/"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# onlyfiles = onlyfiles[0:100]

# Get ability to log errors
import logging

logger = logging.getLogger('getFeatures')
hdlr = logging.FileHandler('//Users/tammi/Desktop/errorLog_endangered.txt')
logger.addHandler(hdlr) 
logger.setLevel(logging.DEBUG)
# logging.basicConfig( filename=,
#                      filemode='w',
#                      level=logging.DEBUG)


# For loop of links, run program
for index, file in enumerate(onlyfiles):
    try:
#         print(index)
        soup = BeautifulSoup(open(mypath + file),"lxml")
        newFeatureList = getRedlistFeaturesDF(soup)
        newFeatureList['htmlPage'] = str(file)
        columns =['animalName','htmlPage','KINGDOM','PHYLUM','CLASS','ORDER','FAMILY','ECOLOGY','COUNTRIES','THREAT_PARAGRAPH','CONSERVATION_PARAGRAPH','POP_TREND','STATUS']
        newFeatureList = newFeatureList[columns]
        
        with open('redlistTable.csv', 'a') as f:
            newFeatureList.to_csv(f, header=False)
        
#         featureList = featureList.append(newFeatureList, ignore_index=True)
    except Exception as e:     # most generic exception you can catch
#         traceback.print_exc()   
#         logger.exception("oops!")
        x=1



###### NEXT SET OF HTML FILES ######
# # Initialize df to save to csv    
# columns =['htmlPage','animalName','KINGDOM','PHYLUM','CLASS','ORDER','FAMILY','ECOLOGY','COUNTRIES','THREAT_PARAGRAPH','CONSERVATION_PARAGRAPH','POP_TREND','STATUS']
# featureList = pd.DataFrame(columns=columns)

## Get list of links
mypath = "/Users/tammi/Desktop/cs638project/redlist-critically-endangered-html/"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# onlyfiles = onlyfiles[0:100]


# Get ability to log errors
import logging
logger = logging.getLogger('getFeatures')
hdlr = logging.FileHandler('//Users/tammi/Desktop/errorLog_critEndangered.txt')
logger.addHandler(hdlr) 
logger.setLevel(logging.DEBUG)


# For loop of links, run program
for index, file in enumerate(onlyfiles):
    try:
#         print(file)
#         print("sec set: " + str(index))
        soup = BeautifulSoup(open(mypath + file),"lxml")
#         soup = BeautifulSoup(open("/Users/tammi/Desktop/cs638project/redlist-endangered-html/3334.html"),"lxml")
        
        newFeatureList = getRedlistFeaturesDF(soup)
        newFeatureList['htmlPage'] = str(file)
#         print(newFeatureList)
#         print(newFeatureList)
#         featureList = featureList.append(newFeatureList, ignore_index=True)
        columns =['animalName','htmlPage','KINGDOM','PHYLUM','CLASS','ORDER','FAMILY','ECOLOGY','COUNTRIES','THREAT_PARAGRAPH','CONSERVATION_PARAGRAPH','POP_TREND','STATUS']
        newFeatureList = newFeatureList[columns]
    
        with open('redlistTable.csv', 'a') as f:
            newFeatureList.to_csv(f, header=False)
        
    except Exception as e:     # most generic exception you can catch
#         traceback.print_exc()   
#         logger.exception("oops!")
#         print(file)
        x=1


# columns =['animalName','htmlPage','KINGDOM','PHYLUM','CLASS','ORDER','FAMILY','ECOLOGY','COUNTRIES','THREAT_PARAGRAPH','CONSERVATION_PARAGRAPH','POP_TREND','STATUS']
# featureList = featureList[columns]
# featureList.to_csv('redlistTable.csv')
# featureList.to_csv('critEndangeredTable.csv')

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [43]:
# - name
# - taxonomic classification
# - countries of occurrence, 
# - endangered status,
# - ecology
########- generation length (not listed often enough?)
# - population trend (decreasing, increasing, unkown),
########- subspecies list (not gonna work)
# - listed on CITES or not (conservation actions)
# - threats

#     try:
#         # code to process download here
#     except Exception as e:     # most generic exception you can catch
#         logf.write("Failed to download {0}: {1}\n".format(str(download), str(e)))
#         # optional: delete local version of failed download
#     finally:


from bs4 import BeautifulSoup
import pandas as pd

# Initialize df to save to csv    
columns =['animalName','KINGDOM','PHYLUM','CLASS','ORDER','FAMILY','ECOLOGY','COUNTRIES','THREAT_PARAGRAPH','CONSERVATION_PARAGRAPH','POP_TREND','STATUS']
featureList = pd.DataFrame(columns=columns)

## Get list of links
import traceback
from os import listdir
from os.path import isfile, join
mypath = "/Users/tammi/Desktop/cs638project/redlist-endangered-html/"
onlyFiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
# print(onlyfiles)
# print(onlyFiles[1:5])

onlyFiles = onlyFiles[1:2]
# print(onlyFiles)


# Get ability to log errors
import logging

logger = logging.getLogger('getFeatures')
hdlr = logging.FileHandler('//Users/tammi/Desktop/getFeatures_errorLog.txt')
logger.addHandler(hdlr) 
logger.setLevel(logging.DEBUG)
# logging.basicConfig( filename=,
#                      filemode='w',
#                      level=logging.DEBUG)


# For loop of links, run program
for index, file in enumerate(onlyFiles):
    try:
        print(file)
        file = '1087.html'
        soup = BeautifulSoup(open(mypath + file),"lxml")
#         soup = BeautifulSoup(open("/Users/tammi/Desktop/cs638project/redlist-endangered-html/3334.html"),"lxml")

        featureList = {}

        ### animal name ###
        animalName = soup.title.string 
        animalName
        # seems that some include nickname, see how many later after table is made
        # process out nickname in cleaning stage, if desired
        featureList['animalName'] = animalName


        ### taxonomy ###
        taxonomyList = soup.body.find_all('table',{'class':"tab_data"})[0].find_all('td') # returns list of taxonomic categorization from kindgom to family (skips genus and species)
        #print(taxonomyList)
        taxonomyStrList = []

        for index, item in enumerate(taxonomyList):
            #print(index)
            taxonomyStrList.append(str(item)) #= str()
            #print(taxonomyStrList)
            #print(item)

        KINGDOM = taxonomyStrList[0][4:-5]
        PHYLUM = taxonomyStrList[1][4:-5]
        CLASS = taxonomyStrList[2][4:-5]
        ORDER = taxonomyStrList[3][4:-5]
        FAMILY = taxonomyStrList[4][4:-5]
        featureList['KINGDOM'] = KINGDOM
        featureList['PHYLUM'] = PHYLUM
        featureList['CLASS'] = CLASS
        featureList['ORDER'] = ORDER
        featureList['FAMILY'] = FAMILY

        print(KINGDOM)
        print(PHYLUM)
        print(CLASS)
        print(ORDER)
        print(FAMILY)

        ### countries of occurence ###
        rangeList = soup.body.find_all('div',{'class':"group"})

        countriesPerGroupType = []

        # loop through groups (e.g. Possibly exting, Regionally extinct, Native)
        for index, item in enumerate(rangeList):
            listCountries = item.contents # list with first item as group name, and rest as countries
            countriesPerGroupType.append(listCountries)

        # TO-DO: further country list parsing
        print()
        fullListCountries = ''
        for groupNum, group in enumerate(countriesPerGroupType):
            groupName = group[0]
            groupCountries = group[1]
            fullListCountries = groupCountries +  "; " + fullListCountries
            #print(groupCountries)

        COUNTRIES = fullListCountries
        COUNTRIES
        featureList['COUNTRIES'] = COUNTRIES



        ### current population trend ###
        threatInfo = soup.body.find_all('strong')
        for index, item in enumerate(threatInfo):
#             print(item)
            if item.contents == ['Current Population Trend:']:
#                 print(item)
                threatText = item.parent.next_sibling.parent.find_all('p')
        

        
        
        popTrend = soup.body.find_all('td',{'id':"popTrend"})
#         print(popTrend)
        POP_TREND = popTrend[0].find_all('span')[0].string
        POP_TREND
        featureList['POP_TREND'] = POP_TREND
        
        ### threat list ###
        threatInfo = soup.body.find_all('strong')
        for index, item in enumerate(threatInfo):
            if item.contents == ['Major Threat(s):']:
#                 print(item)
                threatText = item.parent.next_sibling
                print(threatText)
#                 print(item.parent.parent.find_all('td')[1].contents)


#                 threatText = item.parent.next_sibling.parent.find_all('p')
        print(threatText)
#         THREAT_PARAGRAPH = threatText[1].contents[0]
#         THREAT_PARAGRAPH
#         featureList['THREAT_PARAGRAPH'] = THREAT_PARAGRAPH


        ### listed on CITES or not (conservation actions) ###
        for index, item in enumerate(threatInfo):
        #     print(item.contents)
            if item.contents == ['Conservation Actions:']:
                CONSERVATION_PARAGRAPH = item.parent.parent.find_all('td')[1].find_all('p')

        fullString = ''

        for index, item in enumerate(CONSERVATION_PARAGRAPH):
            fullString = str(item.contents) + fullString

        CONSERVATION_PARAGRAPH = fullString
        
        featureList['CONSERVATION_PARAGRAPH'] = CONSERVATION_PARAGRAPH


    except Exception as e:     # most generic exception you can catch
        traceback.print_exc()   
        logger.exception("oops!")


1001.html
Animalia
Chordata
Mammalia
Afrosoricida
Chrysochloridae





['Protected in the De Hoek, New Agatha and Woodbush Forest Reserves. Research needed to document most aspects of natural history, ecology, evolutionary relationships, phylogeography and population genetics of this species. In the former Transvaal Province (South Africa), its was given the highest regional priority score for mammals (Freitag and van Jaarsveld 1997). It currently ranks among the top 100 mammalian species (no. 73) of the EDGE of Existence Programme (Zoological Society of London), which aims to conserve the world’s Evolutionary Distinct and Globally Endangered species (', <span class="citation"><em>Mammals on the EDGE: Conservation Priorities Based on Threat and Phylogeny, </em>Isaac<em> et al. </em>2007 and subsequent updates). The species is not receiving dedicated conservation attention at present.</span>]

In [16]:
### status ### 
        statusInfo = soup.body.find_all('table',{'class':"tab_data"})
        # again, need to clean this bit by grabbing first string between first two new lines
        STATUS = statusInfo[2].find_all('td')[1].contents[0]
        print(STATUS)
        featureList['STATUS'] = STATUS



        ### ecology ###
        ecologyInfo = soup.body.find_all('strong')
        for index, item in enumerate(ecologyInfo):
            if item.contents == ['Systems:']:
                ecology = str(item.parent.next_sibling)

        ECOLOGY = ecology[4:-5]
        ECOLOGY
        featureList['ECOLOGY'] = ECOLOGY



 



        ### listed on CITES or not (conservation actions) ###
        for index, item in enumerate(threatInfo):
        #     print(item.contents)
            if item.contents == ['Conservation Actions:']:
        #         print(item)
                CONSERVATION_PARAGRAPH = item.parent.parent.find_all('strong')[1].contents
        #         print(threatText.contents)
        fullString = ''

        for index, item in enumerate(CONSERVATION_PARAGRAPH):
    #         print(item)
        #     print()
            fullString = str(item) + fullString

        CONSERVATION_PARAGRAPH = fullString
    #     print(CONSERVATION_PARAGRAPH)
        featureList['CONSERVATION_PARAGRAPH'] = CONSERVATION_PARAGRAPH

        # featureList = {}
        # featureList = getRedlistFeatures(soup)
    #     print(featureList)
    #     print(featureList.keys())
        columns =['animalName','KINGDOM','PHYLUM','CLASS','ORDER','FAMILY','ECOLOGY','COUNTRIES','THREAT_PARAGRAPH','CONSERVATION_PARAGRAPH','POP_TREND','STATUS']
    #     print(len(columns))
    #     print(featureList.items())
    #     df = pd.DataFrame.from_items(featureList.items())#, columns=['Date', 'DateValue'])
        df = pd.DataFrame(featureList,dtype=None,index=[0])
    #     print(df)

        newFeatureList = df
        featureList = featureList.append(newFeatureList, ignore_index=True)
        featureList.to_csv('example.csv')


  File "<ipython-input-16-fe57ed1e4e1a>", line 3
    statusInfo = soup.body.find_all('table',{'class':"tab_data"})
    ^
IndentationError: unexpected indent

In [ ]: