In [1]:
import pandas as pd
import py_stringmatching as sm
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
import operator
style.use('ggplot')
# Load the candidate pairs from the blocking directory. The file is decoded as
# ISO-8859-1 and the `_id` column becomes the row index.
c = pd.read_csv("../blocking/candidate_set.csv", encoding="ISO-8859-1", index_col='_id')
# Show the column labels (a notebook cell renders only its final expression).
c.columns
Out[1]:
In [2]:
# Column labels as a plain Python list
list(c.columns)
Out[2]:
In [3]:
# Preview the first rows of the left-table status column
c.loc[:, 'ltable_status'].head()
Out[3]:
In [4]:
# Normalise the ltable_status column
# author: @andrewedstrom
# Every status string is collapsed to one of two labels: anything containing
# 'Critically' becomes 'Critically Endangered', everything else 'Endangered'.
# Series.map rewrites the column in a single pass instead of issuing one
# label-based c.loc write per row inside iterrows(); a per-label write would
# also hit every row sharing that `_id` if the index ever held duplicates.
# A non-string status (e.g. NaN) still raises TypeError, exactly as before.
c['ltable_status'] = c['ltable_status'].map(
    lambda status: 'Critically Endangered' if 'Critically' in status else 'Endangered'
)
# Preview the right-table status column
c['rtable_status'].head()
Out[4]:
In [5]:
# Build the status_match feature: 1 when both sides agree on containing
# "critically" (case-insensitive substring test), 0 when they disagree.
import operator, re

statusMatchColumn = [
    1 if ('critically' in left.lower()) == ('critically' in right.lower()) else 0
    for left, right in zip(c['ltable_status'], c['rtable_status'])
]
# Tally agreeing / disagreeing pairs for a quick sanity check
matches = statusMatchColumn.count(1)
nonmatches = statusMatchColumn.count(0)
print('matches', matches)
print('nonmatches', nonmatches)
c['status_match'] = statusMatchColumn
c.head()
Out[5]:
In [6]:
# Preview the right-table country strings; bounded with head() like the
# status preview above instead of rendering the whole column.
c['rtable_countries'].head()
Out[6]:
In [7]:
# Set-up for the country-overlap feature
# author: @andrewedstrom
import operator, re, string

# Overlap coefficient: the similarity measure applied to each pair of token lists
oc = sm.OverlapCoefficient()
# One tokenizer per delimiter: ', ' and '; '. Both are created with
# return_set=True.
comma_tok = sm.DelimiterTokenizer(delim_set=[', '], return_set=True)
semi_tok = sm.DelimiterTokenizer(delim_set=['; '], return_set=True)
# Per-row scores are collected here and attached to the frame afterwards
country_match_column = []
def clean_tokens(tok_list):
    """Normalise a sequence of country tokens for set comparison.

    Each token is lower-cased, has every parenthetical removed, and is then
    stripped of leading/trailing whitespace followed by leading/trailing
    punctuation (``string.punctuation``).

    Parenthetical handling:
      * the closing ')' is searched for *after* the opening '(' so a stray
        ')' earlier in the token can no longer splice the text incorrectly;
      * all parentheticals in a token are removed, not only the first;
      * an unclosed '(' drops everything from the '(' to the end of the token.

    Args:
        tok_list: iterable of string tokens.

    Returns:
        A new list with one cleaned string per input token, in input order.
        A token that consists solely of a parenthetical cleans to '' and is
        still returned, so the output length always equals the input length.
    """
    new_list = []
    for s in tok_list:
        s = s.lower()
        # parentheticals: keep cutting until no '(' is left in the token
        pstart = s.find('(')
        while pstart != -1:
            # look for the matching ')' only to the right of the '('
            pend = s.find(')', pstart)
            if pend == -1:
                # unclosed parenthetical: discard the tail of the token
                s = s[:pstart]
            else:
                s = s[:pstart] + s[pend + 1:]
            pstart = s.find('(')
        # leading/trailing whitespace
        s = s.strip()
        # leading/trailing punctuation
        s = s.strip(string.punctuation)
        new_list.append(s)
    return new_list
# Score every candidate pair: tokenize each side's country string (left on
# '; ', right on ', '), clean the tokens, and record the overlap coefficient.
for left_countries, right_countries in zip(c['ltable_countries'], c['rtable_countries']):
    ltok = clean_tokens(semi_tok.tokenize(left_countries))
    rtok = clean_tokens(comma_tok.tokenize(right_countries))
    country_match_column.append(oc.get_raw_score(ltok, rtok))
c['country_overlap'] = country_match_column
c.head()
Out[7]:
In [8]:
# Country-count similarity feature
# Author @Jabroni McBroniFace
# Inputs: 'ltable_country_count' and 'rtable_country_count'.
# country_count_sim is 1 when the two counts differ by at most one, else 0.
import operator, re

countryMatchColumn = [
    1 if abs(int(left_n) - int(right_n)) <= 1 else 0
    for left_n, right_n in zip(c['ltable_country_count'], c['rtable_country_count'])
]
c['country_count_sim'] = countryMatchColumn
c.head()
Out[8]:
In [9]:
# Persist the augmented candidate set back to the file it was loaded from.
# The file is read with encoding="ISO-8859-1", so it is written with the same
# encoding: to_csv defaults to UTF-8, which would make any non-ASCII
# character decode incorrectly the next time this notebook reads the file.
c.to_csv('../blocking/candidate_set.csv', encoding="ISO-8859-1")
In [ ]: