In [1]:
import pandas as pd
import py_stringmatching as sm
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
import operator
style.use('ggplot')

# read in csv as dataframe
c = pd.read_csv("../blocking/candidate_set.csv", encoding="ISO-8859-1", index_col='_id')
c.head()
c.columns


Out[1]:
Index(['ltable_id', 'rtable_id', 'ltable_name', 'ltable_genus',
       'ltable_family', 'ltable_ecology', 'ltable_countries',
       'ltable_threat_paragraph', 'ltable_conservation_paragraph',
       'ltable_pop_trend', 'ltable_status', 'ltable_country_count',
       'ltable_scientific_name', 'rtable_scientific_name', 'rtable_name',
       'rtable_kingdom', 'rtable_phylum', 'rtable_class', 'rtable_order',
       'rtable_family', 'rtable_genus', 'rtable_size', 'rtable_threats',
       'rtable_conservation', 'rtable_threat_keywords',
       'rtable_conservation_keywords', 'rtable_status', 'rtable_countries',
       'rtable_country_count', 'rtable_tCount', '_id.1', 'ltable_id.1',
       'rtable_id.1', 'name_name_jac_qgm_3_qgm_3',
       'name_name_jac_dlm_dc0_dlm_dc0', 'countries_countries_jac_qgm_3_qgm_3',
       'countries_countries_cos_dlm_dc0_dlm_dc0',
       'countries_countries_jac_dlm_dc0_dlm_dc0', 'countries_countries_mel',
       'countries_countries_lev_dist', 'countries_countries_lev_sim',
       'countries_countries_nmw', 'countries_countries_sw',
       'country_count_country_count_exm', 'country_count_country_count_anm',
       'country_count_country_count_lev_dist',
       'country_count_country_count_lev_sim'],
      dtype='object')

In [2]:
#list col names
list(c.columns.values)


Out[2]:
['ltable_id',
 'rtable_id',
 'ltable_name',
 'ltable_genus',
 'ltable_family',
 'ltable_ecology',
 'ltable_countries',
 'ltable_threat_paragraph',
 'ltable_conservation_paragraph',
 'ltable_pop_trend',
 'ltable_status',
 'ltable_country_count',
 'ltable_scientific_name',
 'rtable_scientific_name',
 'rtable_name',
 'rtable_kingdom',
 'rtable_phylum',
 'rtable_class',
 'rtable_order',
 'rtable_family',
 'rtable_genus',
 'rtable_size',
 'rtable_threats',
 'rtable_conservation',
 'rtable_threat_keywords',
 'rtable_conservation_keywords',
 'rtable_status',
 'rtable_countries',
 'rtable_country_count',
 'rtable_tCount',
 '_id.1',
 'ltable_id.1',
 'rtable_id.1',
 'name_name_jac_qgm_3_qgm_3',
 'name_name_jac_dlm_dc0_dlm_dc0',
 'countries_countries_jac_qgm_3_qgm_3',
 'countries_countries_cos_dlm_dc0_dlm_dc0',
 'countries_countries_jac_dlm_dc0_dlm_dc0',
 'countries_countries_mel',
 'countries_countries_lev_dist',
 'countries_countries_lev_sim',
 'countries_countries_nmw',
 'countries_countries_sw',
 'country_count_country_count_exm',
 'country_count_country_count_anm',
 'country_count_country_count_lev_dist',
 'country_count_country_count_lev_sim']

In [3]:
c['ltable_status'].head()


Out[3]:
_id
25215    \r\r\r\n    Endangered\r\r\r\n\r\r\r\n\r\r\r\n...
75598    \r\r\r\n    Endangered\r\r\r\n\r\r\r\n\r\r\r\n...
75600    \r\r\r\n    Endangered\r\r\r\n\r\r\r\n\r\r\r\n...
78778    \r\r\r\n    Critically Endangered\r\r\r\n\r\r\...
83513    \r\r\r\n    Endangered\r\r\r\n\r\r\r\n\r\r\r\n...
Name: ltable_status, dtype: object

In [4]:
# clean up ltable_status column
# author: @andrewedstrom

# remove parenthesis contents
for index, row in c.iterrows():
    status = row['ltable_status']
    if 'Critically' in status:
        status = 'Critically Endangered'
    else:
        status = 'Endangered'
    c.loc[index, 'ltable_status'] = status
    
c['rtable_status'].head()


Out[4]:
_id
25215    Endangered
75598    Endangered
75600    Endangered
78778    Endangered
83513    Endangered
Name: rtable_status, dtype: object

In [5]:
#Create status match col
import operator, re

statusMatchColumn = []
matches = 0
nonmatches = 0
for index, row in c.iterrows():
    ls = row['ltable_status']
    rs = row['rtable_status']
    
    if ('critically' in ls.lower())  == ('critically' in rs.lower()):
        statusMatchColumn.append(1)
        matches = matches + 1
    else:
        statusMatchColumn.append(0)
        nonmatches = nonmatches + 1

print('matches', matches)
print('nonmatches', nonmatches)
c['status_match'] = statusMatchColumn
c.head()


matches 1226
nonmatches 1352
Out[5]:
ltable_id rtable_id ltable_name ltable_genus ltable_family ltable_ecology ltable_countries ltable_threat_paragraph ltable_conservation_paragraph ltable_pop_trend ... countries_countries_mel countries_countries_lev_dist countries_countries_lev_sim countries_countries_nmw countries_countries_sw country_count_country_count_exm country_count_country_count_anm country_count_country_count_lev_dist country_count_country_count_lev_sim status_match
_id
25215 1470 3 adelaide pigmy bluetongue skink, pygmy bluetongue Tiliqua Scincidae NaN Australia; NaN NaN NaN ... 0.511818 21 0.16 -10.0 3.0 0 0.5 1.0 0.0 1
75598 4363 9 albanian water frog Pelophylax Ranidae Terrestrial; Freshwater Albania; Montenegro; \r\r\r\n The major threat is drainage of we... ["\n It is listed on Appendix III of the Be... Decreasing ... 0.531032 21 0.16 -2.0 10.0 1 1.0 0.0 1.0 1
75600 4365 9 NaN Pelophylax Ranidae Terrestrial; Freshwater China; \r\r\r\n The major threat is over-collectin... ['\n The Shapotou location is within a nati... Decreasing ... 0.455238 23 0.08 -16.0 1.0 0 0.5 1.0 0.0 1
78778 7543 9 karpathos frog Pelophylax Ranidae Terrestrial; Freshwater Greece; \r\r\r\n The major threat is habitat loss a... ['\n It is listed on Appendix III of the Be... Decreasing ... 0.470000 22 0.12 -14.0 2.0 0 0.5 1.0 0.0 0
83513 4363 10 albanian water frog Pelophylax Ranidae Terrestrial; Freshwater Albania; Montenegro; \r\r\r\n The major threat is drainage of we... ["\n It is listed on Appendix III of the Be... Decreasing ... 0.531032 21 0.16 -2.0 10.0 1 1.0 0.0 1.0 1

5 rows × 48 columns


In [6]:
c['rtable_countries']


Out[6]:
_id
25215                              ['Croatia', 'Montenegro']
75598                              ['Montenegro', 'Albania']
75600                              ['Montenegro', 'Albania']
78778                              ['Montenegro', 'Albania']
83513                              ['Montenegro', 'Albania']
83515                              ['Montenegro', 'Albania']
86693                              ['Montenegro', 'Albania']
105744                                             ['Japan']
105745                                             ['Japan']
105746                                             ['Japan']
105747                                             ['Japan']
109757                                             ['Japan']
109758                                             ['Japan']
109759                                             ['Japan']
109760                                             ['Japan']
109761                                             ['Japan']
109762                                             ['Japan']
109763                                             ['Japan']
109764                                             ['Japan']
109765                                             ['Japan']
109766                                             ['Japan']
109767                                             ['Japan']
109768                                             ['Japan']
109769                                             ['Japan']
166554                      ['Oman', 'United Arab Emirates']
174469                                   ['France', 'Spain']
182690                                            ['Mexico']
182694                                            ['Mexico']
182697                                            ['Mexico']
182700                                            ['Mexico']
                                 ...                        
9097111                               ['Bolivia', 'Uruguay']
9100153                               ['Bolivia', 'Uruguay']
9105025                                          ['Ecuador']
9105026                                          ['Ecuador']
9108068                                          ['Ecuador']
9118003    ['Angola', 'Equatorial Guinea', 'Gabon', 'Cent...
9125918    ['Angola', 'Equatorial Guinea', 'Gabon', 'Cent...
9133833                                   ['United Kingdom']
9137159                                   ['United Kingdom']
9141009                                   ['United Kingdom']
9141010                                   ['United Kingdom']
9145074                                   ['United Kingdom']
9148924                                   ['United Kingdom']
9148925                                   ['United Kingdom']
9157066                                            ['India']
9164981                                      ['Philippines']
9169927                                      ['Philippines']
9169928                                      ['Philippines']
9171058                                      ['Philippines']
9171059                                      ['Philippines']
9173116                                      ['Philippines']
9173120                                      ['Philippines']
9173121                                      ['Philippines']
9177842                          ['Philippines', 'Malaysia']
9177843                          ['Philippines', 'Malaysia']
9178973                          ['Philippines', 'Malaysia']
9178974                          ['Philippines', 'Malaysia']
9181031                          ['Philippines', 'Malaysia']
9181035                          ['Philippines', 'Malaysia']
9181036                          ['Philippines', 'Malaysia']
Name: rtable_countries, dtype: object

In [7]:
# add country match column
# author: @andrewedstrom
import operator, re, string
country_match_column = []

#create delimiter tokenizer using semicolon as delimiter
semi_tok = sm.DelimiterTokenizer(delim_set=['; '], return_set=True)
comma_tok = sm.DelimiterTokenizer(delim_set=[', '], return_set=True)

#create overlap coefficient similarity measure
oc = sm.OverlapCoefficient()

def clean_tokens(tok_list):
    new_list = []
    for s in tok_list:
        s = s.lower()
        
        #parentheticals
        if '(' in s:
            pstart = s.find('(')
            pend = s.find(')')
            if pend < len(s) - 1 and pend > 0:
                s = s[0:pstart] + s[pend+1:]
            else:
                s = s[0:pstart]
                
        #leading/trailing whitespace
        s = s.strip()
        
        #leading/trailing punctuation
        s = s.strip(string.punctuation)
        new_list.append(s)
    return new_list

for index, row in c.iterrows():
    lc = row['ltable_countries']
    rc = row['rtable_countries']
    
    ltok = semi_tok.tokenize(lc)
    rtok = comma_tok.tokenize(rc)
    ltok = clean_tokens(ltok)
    rtok = clean_tokens(rtok)
    
    overlap = oc.get_raw_score(ltok, rtok)
    
    country_match_column.append(overlap)
    
    
c['country_overlap'] = country_match_column
c.head()


Out[7]:
ltable_id rtable_id ltable_name ltable_genus ltable_family ltable_ecology ltable_countries ltable_threat_paragraph ltable_conservation_paragraph ltable_pop_trend ... countries_countries_lev_dist countries_countries_lev_sim countries_countries_nmw countries_countries_sw country_count_country_count_exm country_count_country_count_anm country_count_country_count_lev_dist country_count_country_count_lev_sim status_match country_overlap
_id
25215 1470 3 adelaide pigmy bluetongue skink, pygmy bluetongue Tiliqua Scincidae NaN Australia; NaN NaN NaN ... 21 0.16 -10.0 3.0 0 0.5 1.0 0.0 1 0.0
75598 4363 9 albanian water frog Pelophylax Ranidae Terrestrial; Freshwater Albania; Montenegro; \r\r\r\n The major threat is drainage of we... ["\n It is listed on Appendix III of the Be... Decreasing ... 21 0.16 -2.0 10.0 1 1.0 0.0 1.0 1 1.0
75600 4365 9 NaN Pelophylax Ranidae Terrestrial; Freshwater China; \r\r\r\n The major threat is over-collectin... ['\n The Shapotou location is within a nati... Decreasing ... 23 0.08 -16.0 1.0 0 0.5 1.0 0.0 1 0.0
78778 7543 9 karpathos frog Pelophylax Ranidae Terrestrial; Freshwater Greece; \r\r\r\n The major threat is habitat loss a... ['\n It is listed on Appendix III of the Be... Decreasing ... 22 0.12 -14.0 2.0 0 0.5 1.0 0.0 0 0.0
83513 4363 10 albanian water frog Pelophylax Ranidae Terrestrial; Freshwater Albania; Montenegro; \r\r\r\n The major threat is drainage of we... ["\n It is listed on Appendix III of the Be... Decreasing ... 21 0.16 -2.0 10.0 1 1.0 0.0 1.0 1 1.0

5 rows × 49 columns


In [8]:
# Code to add country similarity as feature

# Author @Jabroni McBroniFace

#'ltable_country_count'
#'rtable_cCount'

# Add column for countryCountSimilarity (similar if within same count by one)
import operator, re

countryMatchColumn = []
for index, row in c.iterrows():
    lcCount = int(row['ltable_country_count'])
    rcCount = int(row['rtable_country_count'])
    
    if (lcCount-1) <= rcCount <= (lcCount+1):
        countryMatchColumn.append(1)
    else:
        countryMatchColumn.append(0)

c['country_count_sim'] = countryMatchColumn
c.head()


Out[8]:
ltable_id rtable_id ltable_name ltable_genus ltable_family ltable_ecology ltable_countries ltable_threat_paragraph ltable_conservation_paragraph ltable_pop_trend ... countries_countries_lev_sim countries_countries_nmw countries_countries_sw country_count_country_count_exm country_count_country_count_anm country_count_country_count_lev_dist country_count_country_count_lev_sim status_match country_overlap country_count_sim
_id
25215 1470 3 adelaide pigmy bluetongue skink, pygmy bluetongue Tiliqua Scincidae NaN Australia; NaN NaN NaN ... 0.16 -10.0 3.0 0 0.5 1.0 0.0 1 0.0 1
75598 4363 9 albanian water frog Pelophylax Ranidae Terrestrial; Freshwater Albania; Montenegro; \r\r\r\n The major threat is drainage of we... ["\n It is listed on Appendix III of the Be... Decreasing ... 0.16 -2.0 10.0 1 1.0 0.0 1.0 1 1.0 1
75600 4365 9 NaN Pelophylax Ranidae Terrestrial; Freshwater China; \r\r\r\n The major threat is over-collectin... ['\n The Shapotou location is within a nati... Decreasing ... 0.08 -16.0 1.0 0 0.5 1.0 0.0 1 0.0 1
78778 7543 9 karpathos frog Pelophylax Ranidae Terrestrial; Freshwater Greece; \r\r\r\n The major threat is habitat loss a... ['\n It is listed on Appendix III of the Be... Decreasing ... 0.12 -14.0 2.0 0 0.5 1.0 0.0 0 0.0 1
83513 4363 10 albanian water frog Pelophylax Ranidae Terrestrial; Freshwater Albania; Montenegro; \r\r\r\n The major threat is drainage of we... ["\n It is listed on Appendix III of the Be... Decreasing ... 0.16 -2.0 10.0 1 1.0 0.0 1.0 1 1.0 1

5 rows × 50 columns


In [9]:
# c.to_csv('../blocking/candidate_set.csv')
c.to_csv('../blocking/candidate_set.csv')

In [ ]: