In [1]:
#Read in Redlist
import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
style.use('ggplot')
# Load the Redlist csv into a DataFrame; 'id' is registered as its key column
rl = em.read_csv_metadata("tenFeaturesRedlist.csv", encoding="ISO-8859-1", key='id')
# Glance at the first few rows
rl.head()
Out[1]:
In [ ]:
# CME
rl.columns  # list the column names of the Redlist table (the key column 'id' should be among them)
# Note: the 'id' values in the Redlist csv are not consecutive -- they are the row numbers of the
# original scraped-data csv -- so they do not look like the consecutive numbers displayed for `rl`.
# The original author attributed the consecutive numbering to the library's handling of the key.
# NOTE(review): it is presumably just the default pandas row index shown next to the data -- confirm.
In [ ]:
# Read in ARKIVE
# Load the Arkive csv into a DataFrame; 'id' is registered as its key column
ar = em.read_csv_metadata("finalArkives.csv", encoding="ISO-8859-1", key="id")
# Glance at the first few rows
ar.head()
In [ ]:
# Remove parentheticals from arkive common names
# author: CME
# Each 'name' is cut at its first "(" (e.g. "Lion (Panthera leo)" -> "Lion").
# Values without a "(" -- including missing values -- are left untouched.
raw_names = ar['name'].tolist()
has_paren = ["(" in str(value) for value in raw_names]
# One column assignment replaces the per-row ar.loc writes inside iterrows().
# rstrip() drops the blank that precedes "(", which the plain slice used to keep.
ar['name'] = [
    str(value).partition("(")[0].rstrip() if flagged else value
    for value, flagged in zip(raw_names, has_paren)
]
# `error` is the number of rows that had no parenthetical to remove.
error = has_paren.count(False)
print(error)
In [ ]:
# Build the arkive 'name' field from 'scientific_name' and 'common_name'
# author: @andrewedstrom
# Both fields are lowercased and reduced to letters, digits, ';', ',' and whitespace.
# They are then joined as "<scientific>; <common>", unless the common text is already
# contained in the scientific text, in which case the scientific text is used alone.
# Raw string: '\s' inside a plain string literal is an invalid escape sequence.
name_junk = re.compile(r'[^A-Za-z0-9;,\s]+')


def _clean_name_part(value):
    """Return `value` lowercased with disallowed characters removed ('' when missing)."""
    # A missing cell would otherwise end up as the literal text "nan" inside the name.
    if pd.isnull(value):
        return ''
    return name_junk.sub('', str(value)).lower()


scientific_parts = ar['scientific_name'].map(_clean_name_part)  # look here if errors start happening
common_parts = ar['common_name'].map(_clean_name_part)

nameColumn = []
for scientific, common in zip(scientific_parts, common_parts):
    if common in scientific:
        # Also covers an empty (missing) common name.
        nameColumn.append(scientific)
    elif not scientific:
        # Nothing to prefix: avoid a dangling "; " in front of the common name.
        nameColumn.append(common)
    else:
        nameColumn.append(scientific + "; " + common)
ar['name'] = nameColumn
ar.head()
In [2]:
# Lowercase redlist name field
# author: @andrewedstrom
# Names are lowercased and reduced to letters, digits, ';', ',' and whitespace.
error = 0  # not read in this cell; the reset of the notebook-level counter is kept as before
# Raw string: '\s' inside a plain string literal is an invalid escape sequence.
name_junk = re.compile(r'[^A-Za-z0-9;,\s]+')
# One column assignment replaces the per-row rl.loc writes inside iterrows().
# Missing names stay missing instead of being overwritten with the text "nan".
rl['name'] = [
    value if pd.isnull(value) else name_junk.sub('', str(value)).lower()
    for value in rl['name'].tolist()
]
rl.head()
Out[2]:
In [ ]:
In [ ]:
# Remove parentheticals from arkive genus
# author: @andrewedstrom
# Each 'genus' is cut at its first "("; values without a "(" -- including missing
# values -- are left untouched.
raw_genera = ar['genus'].tolist()
has_paren = ["(" in str(value) for value in raw_genera]
# One column assignment replaces the per-row ar.loc writes inside iterrows().
# rstrip() drops the blank that precedes "(": a trailing blank would make an
# otherwise identical genus string compare as different.
ar['genus'] = [
    str(value).partition("(")[0].rstrip() if flagged else value
    for value, flagged in zip(raw_genera, has_paren)
]
# `error` is the number of rows that had no parenthetical to remove.
error = has_paren.count(False)
print(error)
In [3]:
# Write the cleaned Redlist table to disk.
# NOTE(review): with default arguments to_csv also writes the DataFrame index as an extra
# unnamed first column -- confirm that readers of this file expect it (otherwise index=False).
rl.to_csv('finalRedlist.csv')
# Arkive export is disabled; 'finalArkives.csv' is the file this notebook reads `ar` from.
#ar.to_csv('finalArkives.csv')
In [ ]:
# Rule-based blocking over input tables
# First get the features that can be used for blocking between rl and ar
feature_table = em.get_features_for_blocking(rl, ar)
In [ ]:
# Display the blocking feature table built from rl and ar
#len(feature_table)
feature_table
#em.get_attr_corres(rl, ar)['ltable']
In [ ]:
In [ ]:
# CME -- Idea (still on the TO-DO list): start with the attribute-equivalence
# blocker on the name fields and see where that leaves us.
equivB = em.AttrEquivalenceBlocker()
C0 = equivB.block_tables(
    rl,
    ar,
    l_block_attr='name',
    r_block_attr='name',
    l_output_attrs=['name', 'genus'],
    r_output_attrs=['name', 'genus'],
)
C0
# Observed: this produced zero candidate pairs.
# NOTE(review): presumably because the blocker needs the two 'name' strings to be
# identical -- confirm.
In [ ]:
# Candidate generation: overlap blocking on the 'name' fields of rl and ar.
# overlap_size=2 is the minimum overlap a pair needs to be kept
# (presumably counted in shared words with the library defaults -- confirm).
ob = em.OverlapBlocker()
C0 = ob.block_tables(
    rl,
    ar,
    l_overlap_attr='name',
    r_overlap_attr='name',
    overlap_size=2,
    l_output_attrs=['name', 'genus', 'family'],
    r_output_attrs=['name', 'genus', 'family'],
)
C0
In [ ]:
# Narrow the name-overlap candidates (C0) to pairs that also agree on genus.
# NOTE(review): `ab` (attribute-equivalence blocker) is created here, but the
# filtering below is done with `ob`, the overlap blocker from the previous cell.
# Confirm which blocker was intended before reading "share genus" as exact equality.
ab = em.AttrEquivalenceBlocker()
C1 = ob.block_candset(C0, l_overlap_attr='genus', r_overlap_attr='genus')
C1
# Earlier alternative, kept for reference: block the full tables on genus with `ab`
#C1 = ab.block_tables(rl, ar,'genus', 'genus',
#                     l_output_attrs=['name', 'genus'],
#                     r_output_attrs=['name', 'genus'])
In [ ]:
# Second candidate path: pair rl/ar rows on the 'family' attribute with the
# equivalence blocker, then filter those pairs on genus with the overlap blocker.
# Relies on `equivB` and `ob` created in earlier cells.
C2 = equivB.block_tables(
    rl,
    ar,
    l_block_attr='family',
    r_block_attr='family',
    l_output_attrs=['name', 'genus', 'family'],
    r_output_attrs=['name', 'genus', 'family'],
)
C3 = ob.block_candset(C2, l_overlap_attr='genus', r_overlap_attr='genus')
C3
In [ ]:
# Union of the two candidate sets: C1 (name overlap, then genus) and C3 (family, then genus)
C = em.combine_blocker_outputs_via_union([C1, C3])
C
In [ ]:
# Save the combined candidate set C to disk
em.to_csv_metadata(C, './candidate_set.csv')
In [ ]:
# Reload the saved candidate set, linking it to rl (left table) and ar (right table)
C = em.read_csv_metadata('candidate_set.csv', ltable=rl, rtable=ar)
In [ ]:
# Display the candidate set
C
In [ ]: