In [1]:
#Read in Redlist

import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
style.use('ggplot')

# Read in csv as dataframe
rl = em.read_csv_metadata("tenFeaturesRedlist.csv", encoding="ISO-8859-1", key='id')
# glance at first few rows
rl.head()


Metadata file is not present in the given path; proceeding to read the csv file.
Out[1]:
id name genus family ecology countries threat_paragraph conservation_paragraph pop_trend status country_count scientific_name
0 0 Ochlockonee Moccasinshell Medionidus Unionidae Freshwater United States (Florida, Georgia); NaN NaN NaN \r\n Critically Endangered\r\n\r\n\r\n A1ce\r\n\r\n 1 Medionidus simpsonianus
1 1 Nelson's Spiny Pocket Mouse Heteromys Heteromyidae Terrestrial Guatemala; Mexico (Chiapas); \r\n The main threat to this species is the significant forest loss in its habitat. Habitat f... ['\n This species occurs in a newly-named national park, Tacana, in Mexico.\n\n \n '] Decreasing \r\n Endangered\r\n\r\n\r\n B1ab(i,ii,iii,v)\r\n\r\n 2 Heteromys nelsoni
2 2 Comoro Friar Amauris Nymphalidae NaN Comoros; NaN NaN NaN \r\n Endangered\r\n\r\n\r\n B1+2c, C2b\r\n\r\n 1 Amauris comorana
3 3 Atlantic Halibut, Halibut Hippoglossus Pleuronectidae NaN <div>Atlantic – northeast; Atlantic – northwest</div>; Canada (Newfoundland I); Denmark; Far... NaN NaN NaN \r\n Endangered\r\n\r\n\r\n A1d\r\n\r\n 15 Hippoglossus hippoglossus
4 4 NaN Hirasea Endodontidae NaN Japan (Ogasawara-shoto); NaN NaN NaN \r\n Endangered\r\n\r\n\r\n A1d\r\n\r\n 1 Hirasea acutissima

In [ ]:
# CME
rl.columns  # get key

# Note: the id column of the redlist csv doesn't actually look like the object created here (rl).  In the csv,
# id numbers jump around because they are the row numbers of the original scraped-data csv.  Apparently
# this library can detect what we want and enumerate rows in order if you give it a key column like this.
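
A quick way to confirm what the library actually recorded (sketch, using py_entitymatching's metadata accessor; the expected value follows from key='id' above):

In [ ]:
# Sanity check (sketch): read_csv_metadata stores the key as dataframe metadata.
print(em.get_key(rl))   # should print 'id'
rl['id'].is_unique      # the key column should hold unique values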

In [ ]:
#Read in ARKIVE
# Read in csv as dataframe
ar = em.read_csv_metadata("finalArkives.csv", encoding="ISO-8859-1", key="id")

# glance at first few rows
ar.head()

In [ ]:
# Remove parentheticals from arkive common names
# author: CME
# Next, actually remove the scientific name and replace it with the common name(s)
#ar = arBeforeRemovingNames # in case the script is now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0  # count rows without a parenthetical, to make sure there are no wonky cases

# remove parenthesis contents
for index, row in ar.iterrows():
    name = str(row['name'])

    pstart = name.find("(")
    if pstart != -1:
        # keep only the part before the opening parenthesis
        noparens = name[0:pstart]
        ar.loc[index, 'name'] = noparens
    else:
        error = error + 1

print(error)
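
The same stripping can be done without an explicit loop (sketch, plain pandas recent enough to support the regex= keyword; unlike the loop above, the trailing .str.strip() also drops whitespace left in front of the removed parenthesis):

In [ ]:
# Sketch: vectorized equivalent of the loop above -- drop everything from the first "(" onward.
ar['name'] = ar['name'].astype(str).str.replace(r'\(.*$', '', regex=True).str.strip()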

In [ ]:
# Concatenate nickname and common_name fields in arkive
# author: @andrewedstrom

# build a combined name column from scientific_name and common_name,
# stripping punctuation and lowercasing along the way
nameColumn = []
for index, row in ar.iterrows():
    nickname = str(row['scientific_name'])  # look here if errors start happening
    nickname = re.sub(r'[^A-Za-z0-9;,\s]+', '', nickname).lower()
    common = str(row['common_name'])
    common = re.sub(r'[^A-Za-z0-9;,\s]+', '', common).lower()
    if common in nickname:
        nameColumn.append(nickname)
    else:
        nameColumn.append(nickname + "; " + common)
ar['name'] = nameColumn
ar.head()
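
The same clean-and-lowercase step recurs in several cells, so it could live in one helper (sketch; `normalize` is a name introduced here, not something defined elsewhere in the notebook):

In [ ]:
def normalize(s):
    """Keep letters, digits, ';', ',' and whitespace; drop everything else; lowercase."""
    return re.sub(r'[^A-Za-z0-9;,\s]+', '', str(s)).lower()

# e.g. normalize("Nelson's Spiny Pocket Mouse") -> "nelsons spiny pocket mouse"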

In [2]:
# Lowercase redlist name field
# author: @andrewedstrom
# strip punctuation and lowercase each name
for index, row in rl.iterrows():
    n = str(row['name'])
    n = re.sub(r'[^A-Za-z0-9;,\s]+', '', n).lower()

    rl.loc[index, 'name'] = n

rl.head()


Out[2]:
id name genus family ecology countries threat_paragraph conservation_paragraph pop_trend status country_count scientific_name
0 0 ochlockonee moccasinshell Medionidus Unionidae Freshwater United States (Florida, Georgia); NaN NaN NaN \r\n Critically Endangered\r\n\r\n\r\n A1ce\r\n\r\n 1 Medionidus simpsonianus
1 1 nelsons spiny pocket mouse Heteromys Heteromyidae Terrestrial Guatemala; Mexico (Chiapas); \r\n The main threat to this species is the significant forest loss in its habitat. Habitat f... ['\n This species occurs in a newly-named national park, Tacana, in Mexico.\n\n \n '] Decreasing \r\n Endangered\r\n\r\n\r\n B1ab(i,ii,iii,v)\r\n\r\n 2 Heteromys nelsoni
2 2 comoro friar Amauris Nymphalidae NaN Comoros; NaN NaN NaN \r\n Endangered\r\n\r\n\r\n B1+2c, C2b\r\n\r\n 1 Amauris comorana
3 3 atlantic halibut, halibut Hippoglossus Pleuronectidae NaN <div>Atlantic – northeast; Atlantic – northwest</div>; Canada (Newfoundland I); Denmark; Far... NaN NaN NaN \r\n Endangered\r\n\r\n\r\n A1d\r\n\r\n 15 Hippoglossus hippoglossus
4 4 nan Hirasea Endodontidae NaN Japan (Ogasawara-shoto); NaN NaN NaN \r\n Endangered\r\n\r\n\r\n A1d\r\n\r\n 1 Hirasea acutissima
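
With the `normalize` helper sketched above, this cell (and the two ARKive cleanup loops) would reduce to a single vectorized line each (sketch):

In [ ]:
# Sketch: same cleanup as the loop above, using the hypothetical normalize() helper.
rl['name'] = rl['name'].map(normalize)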

In [ ]:


In [ ]:
# Remove parentheticals from arkive genus
# author: @andrewedstrom
error = 0  # count rows without a parenthetical, to make sure there are no wonky cases

# remove parenthesis contents
for index, row in ar.iterrows():
    genus = str(row['genus'])

    pstart = genus.find("(")
    if pstart != -1:
        # keep only the part before the opening parenthesis
        noparens = genus[0:pstart]
        ar.loc[index, 'genus'] = noparens
    else:
        error = error + 1

print(error)

In [3]:
rl.to_csv('finalRedlist.csv')
#ar.to_csv('finalArkives.csv')

In [ ]:
# Rule-based blocking over input tables
# first get features that can be used
feature_table = em.get_features_for_blocking(rl, ar)

In [ ]:
#len(feature_table)
feature_table
#em.get_attr_corres(rl, ar)['ltable']
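
The comment above mentions rule-based blocking, but the cells that follow only use attribute-equivalence and overlap blockers. For reference, a rule-based blocker built on this feature table might look like the sketch below (the feature name in the rule string is illustrative -- the actual auto-generated names should be looked up in feature_table['feature_name'], and the 0.4 threshold is arbitrary):

In [ ]:
# Sketch: drop pairs whose name similarity falls below a threshold.
rb = em.RuleBasedBlocker()
rb.add_rule(['name_name_lev_sim(ltuple, rtuple) < 0.4'], feature_table)
D0 = rb.block_tables(rl, ar, l_output_attrs=['name'], r_output_attrs=['name'])
D0.head()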

In [ ]:


In [ ]:
# CME -- I'm realizing now that we may want to start with the attribute equivalence
# blocker for names and see where that leaves us.  On my TO-DO.
equivB = em.AttrEquivalenceBlocker()
C0 = equivB.block_tables(rl, ar, 'name', 'name',
                    l_output_attrs=['name', 'genus'],
                    r_output_attrs=['name', 'genus'])
C0

# Surprisingly, this reveals zero matches -- exact string equality on name is too strict,
# most likely because the two tables format names differently (case, punctuation, concatenated nicknames).
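
A quick plain-pandas check (sketch) would show how many exact name strings the two tables actually share, which is useful before loosening the blocker:

In [ ]:
# Sketch: count name strings common to both tables.
shared_names = set(rl['name'].astype(str)) & set(ar['name'].astype(str))
len(shared_names)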

In [ ]:
#Do overlap blocking on name to get candidate set
#Perform some kind of blocking to find candidate pairs
ob = em.OverlapBlocker()
C0 = ob.block_tables(rl, ar,'name', 'name', 
                     l_output_attrs=['name', 'genus', 'family'], 
                     r_output_attrs=['name', 'genus', 'family'], overlap_size=2)
C0
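
It can also help to spot-check whether an expected pair survived blocking (sketch; 'halibut' is just a name visible in rl.head() above, and the ltable_name column assumes the blocker's default 'ltable_'/'rtable_' prefixes):

In [ ]:
# Sketch: candidate-set size, plus a spot check on one species we expect to see.
print(len(C0))
C0[C0['ltable_name'].str.contains('halibut', na=False)].head()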

In [ ]:
#Only keep candidate pairs whose genus values overlap (token overlap is more forgiving than exact equality)
ab = em.AttrEquivalenceBlocker()  # stricter exact-equality alternative, used only in the commented-out block below
C1 = ob.block_candset(C0, 'genus', 'genus')
C1

#Do overlap blocking on name to get candidate set
#Perform some kind of blocking to find candidate pairs
#C1 = ab.block_tables(rl, ar,'genus', 'genus', 
#                     l_output_attrs=['name', 'genus'], 
#                     r_output_attrs=['name', 'genus'])

In [ ]:
C2 = equivB.block_tables(rl, ar, 'family', 'family',
                    l_output_attrs=['name', 'genus' ,'family'], 
                    r_output_attrs=['name', 'genus', 'family'])

C3 = ob.block_candset(C2, 'genus', 'genus')
C3

In [ ]:
C = em.combine_blocker_outputs_via_union([C1, C3])
C
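
Before committing to this candidate set, py_entitymatching's blocker debugger can surface pairs that look similar but were dropped by the blocking (sketch; output_size=50 is arbitrary):

In [ ]:
# Sketch: list likely matches that the blockers excluded.
dbg = em.debug_blocker(C, rl, ar, output_size=50)
dbg.head()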

In [ ]:
em.to_csv_metadata(C, './candidate_set.csv')

In [ ]:
C = em.read_csv_metadata('candidate_set.csv', ltable=rl, rtable=ar)
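
This works because to_csv_metadata writes a side-car metadata file next to the csv. If that file is ever missing, the candidate set can still be reloaded by spelling out the metadata explicitly (sketch; '_id', 'ltable_id' and 'rtable_id' assume the blockers' default column naming):

In [ ]:
# Sketch: explicit metadata, in case candidate_set.csv's metadata file is absent.
C = em.read_csv_metadata('candidate_set.csv', ltable=rl, rtable=ar,
                         key='_id', fk_ltable='ltable_id', fk_rtable='rtable_id')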

In [ ]:
C

In [ ]: