In [1]:
#Read in Redlist

import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
style.use('ggplot')

# Read in csv as dataframe
rl = em.read_csv_metadata("tenFeaturesRedlist.csv", encoding="ISO-8859-1", key='id')
# glance at first few rows
rl.head()


Metadata file is not present in the given path; proceeding to read the csv file.
Out[1]:
id name genus family ecology countries threat_paragraph conservation_paragraph pop_trend status country_count scientific_name
0 0 Ochlockonee Moccasinshell Medionidus Unionidae Freshwater United States (Florida, Georgia); NaN NaN NaN \r\n Critically Endangered\r\n\r\n\r\n A1ce\r\n\r\n 1 Medionidus simpsonianus
1 1 Nelson's Spiny Pocket Mouse Heteromys Heteromyidae Terrestrial Guatemala; Mexico (Chiapas); \r\n The main threat to this species is the significant forest loss in its habitat. Habitat f... ['\n This species occurs in a newly-named national park, Tacana, in Mexico.\n\n \n '] Decreasing \r\n Endangered\r\n\r\n\r\n B1ab(i,ii,iii,v)\r\n\r\n 2 Heteromys nelsoni
2 2 Comoro Friar Amauris Nymphalidae NaN Comoros; NaN NaN NaN \r\n Endangered\r\n\r\n\r\n B1+2c, C2b\r\n\r\n 1 Amauris comorana
3 3 Atlantic Halibut, Halibut Hippoglossus Pleuronectidae NaN <div>Atlantic – northeast; Atlantic – northwest</div>; Canada (Newfoundland I); Denmark; Far... NaN NaN NaN \r\n Endangered\r\n\r\n\r\n A1d\r\n\r\n 15 Hippoglossus hippoglossus
4 4 NaN Hirasea Endodontidae NaN Japan (Ogasawara-shoto); NaN NaN NaN \r\n Endangered\r\n\r\n\r\n A1d\r\n\r\n 1 Hirasea acutissima

In [ ]:
# CME
rl.columns  # get key

# Note: the id column of the redlist csv doesn't actually look like the object created here (rl).  In the csv,
# id numbers jump around because they are the row numbers of the original scraped-data csv.  Apparently
# this library can detect what we want and enumerate rows in order if you give it a key column like this.
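
A quick way to confirm what the library actually recorded (sketch, using py_entitymatching's metadata accessor; the expected value follows from key='id' above):

In [ ]:
# Sanity check (sketch): read_csv_metadata stores the key as dataframe metadata.
print(em.get_key(rl))   # should print 'id'
rl['id'].is_unique      # the key column should hold unique values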

In [ ]:
#Read in ARKIVE
# Read in csv as dataframe
ar = em.read_csv_metadata("finalArkives.csv", encoding="ISO-8859-1", key="id")

# glance at first few rows
ar.head()

In [ ]:
# Remove parentheticals from arkive common names
# author: CME
# Next, actually remove the scientific name and replace it with the common name(s)
#ar = arBeforeRemovingNames # in case the script is now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0  # count rows without a parenthetical, to make sure there are no wonky cases

# remove parenthesis contents
for index, row in ar.iterrows():
    name = str(row['name'])

    pstart = name.find("(")
    if pstart != -1:
        # keep only the part before the opening parenthesis
        noparens = name[0:pstart]
        ar.loc[index, 'name'] = noparens
    else:
        error = error + 1

print(error)
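
The same stripping can be done without an explicit loop (sketch, plain pandas recent enough to support the regex= keyword; unlike the loop above, the trailing .str.strip() also drops whitespace left in front of the removed parenthesis):

In [ ]:
# Sketch: vectorized equivalent of the loop above -- drop everything from the first "(" onward.
ar['name'] = ar['name'].astype(str).str.replace(r'\(.*$', '', regex=True).str.strip()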

In [ ]:
# Concatenate nickname and common_name fields in arkive
# author: @andrewedstrom

# build a combined name column from scientific_name and common_name,
# stripping punctuation and lowercasing along the way
nameColumn = []
for index, row in ar.iterrows():
    nickname = str(row['scientific_name'])  # look here if errors start happening
    nickname = re.sub(r'[^A-Za-z0-9;,\s]+', '', nickname).lower()
    common = str(row['common_name'])
    common = re.sub(r'[^A-Za-z0-9;,\s]+', '', common).lower()
    if common in nickname:
        nameColumn.append(nickname)
    else:
        nameColumn.append(nickname + "; " + common)
ar['name'] = nameColumn
ar.head()
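
The same clean-and-lowercase step recurs in several cells, so it could live in one helper (sketch; `normalize` is a name introduced here, not something defined elsewhere in the notebook):

In [ ]:
def normalize(s):
    """Keep letters, digits, ';', ',' and whitespace; drop everything else; lowercase."""
    return re.sub(r'[^A-Za-z0-9;,\s]+', '', str(s)).lower()

# e.g. normalize("Nelson's Spiny Pocket Mouse") -> "nelsons spiny pocket mouse"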

In [2]:
# Lowercase redlist name field
# author: @andrewedstrom
# strip punctuation and lowercase each name
for index, row in rl.iterrows():
    n = str(row['name'])
    n = re.sub(r'[^A-Za-z0-9;,\s]+', '', n).lower()

    rl.loc[index, 'name'] = n

rl.head()


Out[2]:
id name genus family ecology countries threat_paragraph conservation_paragraph pop_trend status country_count scientific_name
0 0 ochlockonee moccasinshell Medionidus Unionidae Freshwater United States (Florida, Georgia); NaN NaN NaN \r\n Critically Endangered\r\n\r\n\r\n A1ce\r\n\r\n 1 Medionidus simpsonianus
1 1 nelsons spiny pocket mouse Heteromys Heteromyidae Terrestrial Guatemala; Mexico (Chiapas); \r\n The main threat to this species is the significant forest loss in its habitat. Habitat f... ['\n This species occurs in a newly-named national park, Tacana, in Mexico.\n\n \n '] Decreasing \r\n Endangered\r\n\r\n\r\n B1ab(i,ii,iii,v)\r\n\r\n 2 Heteromys nelsoni
2 2 comoro friar Amauris Nymphalidae NaN Comoros; NaN NaN NaN \r\n Endangered\r\n\r\n\r\n B1+2c, C2b\r\n\r\n 1 Amauris comorana
3 3 atlantic halibut, halibut Hippoglossus Pleuronectidae NaN <div>Atlantic – northeast; Atlantic – northwest</div>; Canada (Newfoundland I); Denmark; Far... NaN NaN NaN \r\n Endangered\r\n\r\n\r\n A1d\r\n\r\n 15 Hippoglossus hippoglossus
4 4 nan Hirasea Endodontidae NaN Japan (Ogasawara-shoto); NaN NaN NaN \r\n Endangered\r\n\r\n\r\n A1d\r\n\r\n 1 Hirasea acutissima
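
With the `normalize` helper sketched above, this cell (and the two ARKive cleanup loops) would reduce to a single vectorized line each (sketch):

In [ ]:
# Sketch: same cleanup as the loop above, using the hypothetical normalize() helper.
rl['name'] = rl['name'].map(normalize)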

In [ ]:


In [ ]:
# Remove parentheticals from arkive genus
# author: @andrewedstrom
error = 0  # count rows without a parenthetical, to make sure there are no wonky cases

# remove parenthesis contents
for index, row in ar.iterrows():
    genus = str(row['genus'])

    pstart = genus.find("(")
    if pstart != -1:
        # keep only the part before the opening parenthesis
        noparens = genus[0:pstart]
        ar.loc[index, 'genus'] = noparens
    else:
        error = error + 1

print(error)

In [3]:
rl.to_csv('finalRedlist.csv')
#ar.to_csv('finalArkives.csv')

In [ ]:
# Rule-based blocking over input tables
# first get features that can be used
feature_table = em.get_features_for_blocking(rl, ar)

In [ ]:
#len(feature_table)
feature_table
#em.get_attr_corres(rl, ar)['ltable']
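
The comment above mentions rule-based blocking, but the cells that follow only use attribute-equivalence and overlap blockers. For reference, a rule-based blocker built on this feature table might look like the sketch below (the feature name in the rule string is illustrative -- the actual auto-generated names should be looked up in feature_table['feature_name'], and the 0.4 threshold is arbitrary):

In [ ]:
# Sketch: drop pairs whose name similarity falls below a threshold.
rb = em.RuleBasedBlocker()
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], feature_table)
D0 = rb.block_tables(rl, ar, l_output_attrs=['name'], r_output_attrs=['name'])
D0.head()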

In [ ]:


In [ ]:
# CME -- I'm realizing now that we may want to start with the attribute equivalence
# blocker for names and see where that leaves us.  On my TO-DO.
equivB = em.AttrEquivalenceBlocker()
C0 = equivB.block_tables(rl, ar, 'name', 'name',
                    l_output_attrs=['name', 'genus'],
                    r_output_attrs=['name', 'genus'])
C0

# Surprisingly, this reveals zero matches -- exact string equality on name is too strict,
# most likely because the two tables format names differently (case, punctuation, concatenated nicknames).
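
A quick plain-pandas check (sketch) would show how many exact name strings the two tables actually share, which is useful before loosening the blocker:

In [ ]:
# Sketch: count name strings common to both tables.
shared_names = set(rl['name'].astype(str)) & set(ar['name'].astype(str))
len(shared_names)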

In [ ]:
#Do overlap blocking on name to get candidate set
#Perform some kind of blocking to find candidate pairs
ob = em.OverlapBlocker()
C0 = ob.block_tables(rl, ar,'name', 'name', 
                     l_output_attrs=['name', 'genus', 'family'], 
                     r_output_attrs=['name', 'genus', 'family'], overlap_size=2)
C0
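
It can also help to spot-check whether an expected pair survived blocking (sketch; 'halibut' is just a name visible in rl.head() above, and the ltable_name column assumes the blocker's default 'ltable_'/'rtable_' prefixes):

In [ ]:
# Sketch: candidate-set size, plus a spot check on one species we expect to see.
print(len(C0))
C0[C0['ltable_name'].str.contains('halibut', na=False)].head()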

In [ ]:
#Only keep candidate pairs whose genus values overlap (token overlap is more forgiving than exact equality)
ab = em.AttrEquivalenceBlocker()  # stricter exact-equality alternative, used only in the commented-out block below
C1 = ob.block_candset(C0, 'genus', 'genus')
C1

#Do overlap blocking on name to get candidate set
#Perform some kind of blocking to find candidate pairs
#C1 = ab.block_tables(rl, ar,'genus', 'genus', 
#                     l_output_attrs=['name', 'genus'], 
#                     r_output_attrs=['name', 'genus'])

In [ ]:
C2 = equivB.block_tables(rl, ar, 'family', 'family',
                    l_output_attrs=['name', 'genus' ,'family'], 
                    r_output_attrs=['name', 'genus', 'family'])

C3 = ob.block_candset(C2, 'genus', 'genus')
C3

In [ ]:
C = em.combine_blocker_outputs_via_union([C1, C3])
C
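
Before committing to this candidate set, py_entitymatching's blocker debugger can surface pairs that look similar but were dropped by the blocking (sketch; output_size=50 is arbitrary):

In [ ]:
# Sketch: list likely matches that the blockers excluded.
dbg = em.debug_blocker(C, rl, ar, output_size=50)
dbg.head()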

In [ ]:
em.to_csv_metadata(C, './candidate_set.csv')

In [ ]:
C = em.read_csv_metadata('candidate_set.csv', ltable=rl, rtable=ar)
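
This works because to_csv_metadata writes a side-car metadata file next to the csv. If that file is ever missing, the candidate set can still be reloaded by spelling out the metadata explicitly (sketch; '_id', 'ltable_id' and 'rtable_id' assume the blockers' default column naming):

In [ ]:
# Sketch: explicit metadata, in case candidate_set.csv's metadata file is absent.
C = em.read_csv_metadata('candidate_set.csv', ltable=rl, rtable=ar,
                         key='_id', fk_ltable='ltable_id', fk_rtable='rtable_id')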

In [ ]:
C

In [ ]: