notebook.community

Edit and run



In [1]:

    
import pandas as pd
import py_stringmatching as sm
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
import operator
style.use('ggplot')

# read in csv as dataframe
c = pd.read_csv("../blocking/candidate_set.csv", encoding="ISO-8859-1", index_col='_id')
c.head()
c.columns









    Out[1]:





Index(['ltable_id', 'rtable_id', 'ltable_name', 'ltable_genus',
       'ltable_family', 'ltable_ecology', 'ltable_countries',
       'ltable_threat_paragraph', 'ltable_conservation_paragraph',
       'ltable_pop_trend', 'ltable_status', 'ltable_country_count',
       'ltable_scientific_name', 'rtable_scientific_name', 'rtable_name',
       'rtable_kingdom', 'rtable_phylum', 'rtable_class', 'rtable_order',
       'rtable_family', 'rtable_genus', 'rtable_size', 'rtable_threats',
       'rtable_conservation', 'rtable_threat_keywords',
       'rtable_conservation_keywords', 'rtable_status', 'rtable_countries',
       'rtable_country_count', 'rtable_tCount', '_id.1', 'ltable_id.1',
       'rtable_id.1', 'name_name_jac_qgm_3_qgm_3',
       'name_name_jac_dlm_dc0_dlm_dc0', 'countries_countries_jac_qgm_3_qgm_3',
       'countries_countries_cos_dlm_dc0_dlm_dc0',
       'countries_countries_jac_dlm_dc0_dlm_dc0', 'countries_countries_mel',
       'countries_countries_lev_dist', 'countries_countries_lev_sim',
       'countries_countries_nmw', 'countries_countries_sw',
       'country_count_country_count_exm', 'country_count_country_count_anm',
       'country_count_country_count_lev_dist',
       'country_count_country_count_lev_sim'],
      dtype='object')



In [2]:

    
#list col names
list(c.columns.values)









    Out[2]:





['ltable_id',
 'rtable_id',
 'ltable_name',
 'ltable_genus',
 'ltable_family',
 'ltable_ecology',
 'ltable_countries',
 'ltable_threat_paragraph',
 'ltable_conservation_paragraph',
 'ltable_pop_trend',
 'ltable_status',
 'ltable_country_count',
 'ltable_scientific_name',
 'rtable_scientific_name',
 'rtable_name',
 'rtable_kingdom',
 'rtable_phylum',
 'rtable_class',
 'rtable_order',
 'rtable_family',
 'rtable_genus',
 'rtable_size',
 'rtable_threats',
 'rtable_conservation',
 'rtable_threat_keywords',
 'rtable_conservation_keywords',
 'rtable_status',
 'rtable_countries',
 'rtable_country_count',
 'rtable_tCount',
 '_id.1',
 'ltable_id.1',
 'rtable_id.1',
 'name_name_jac_qgm_3_qgm_3',
 'name_name_jac_dlm_dc0_dlm_dc0',
 'countries_countries_jac_qgm_3_qgm_3',
 'countries_countries_cos_dlm_dc0_dlm_dc0',
 'countries_countries_jac_dlm_dc0_dlm_dc0',
 'countries_countries_mel',
 'countries_countries_lev_dist',
 'countries_countries_lev_sim',
 'countries_countries_nmw',
 'countries_countries_sw',
 'country_count_country_count_exm',
 'country_count_country_count_anm',
 'country_count_country_count_lev_dist',
 'country_count_country_count_lev_sim']



In [3]:

    
c['ltable_status'].head()









    Out[3]:





_id
25215    \r\r\r\n    Endangered\r\r\r\n\r\r\r\n\r\r\r\n...
75598    \r\r\r\n    Endangered\r\r\r\n\r\r\r\n\r\r\r\n...
75600    \r\r\r\n    Endangered\r\r\r\n\r\r\r\n\r\r\r\n...
78778    \r\r\r\n    Critically Endangered\r\r\r\n\r\r\...
83513    \r\r\r\n    Endangered\r\r\r\n\r\r\r\n\r\r\r\n...
Name: ltable_status, dtype: object



In [4]:

    
# clean up ltable_status column
# author: @andrewedstrom

# remove parenthesis contents
for index, row in c.iterrows():
    status = row['ltable_status']
    if 'Critically' in status:
        status = 'Critically Endangered'
    else:
        status = 'Endangered'
    c.loc[index, 'ltable_status'] = status
    
c['rtable_status'].head()









    Out[4]:





_id
25215    Endangered
75598    Endangered
75600    Endangered
78778    Endangered
83513    Endangered
Name: rtable_status, dtype: object



In [5]:

    
#Create status match col
import operator, re

statusMatchColumn = []
matches = 0
nonmatches = 0
for index, row in c.iterrows():
    ls = row['ltable_status']
    rs = row['rtable_status']
    
    if ('critically' in ls.lower())  == ('critically' in rs.lower()):
        statusMatchColumn.append(1)
        matches = matches + 1
    else:
        statusMatchColumn.append(0)
        nonmatches = nonmatches + 1

print('matches', matches)
print('nonmatches', nonmatches)
c['status_match'] = statusMatchColumn
c.head()









    



matches 1226
nonmatches 1352






    Out[5]:






  
    
      
      ltable_id
      rtable_id
      ltable_name
      ltable_genus
      ltable_family
      ltable_ecology
      ltable_countries
      ltable_threat_paragraph
      ltable_conservation_paragraph
      ltable_pop_trend
      ...
      countries_countries_mel
      countries_countries_lev_dist
      countries_countries_lev_sim
      countries_countries_nmw
      countries_countries_sw
      country_count_country_count_exm
      country_count_country_count_anm
      country_count_country_count_lev_dist
      country_count_country_count_lev_sim
      status_match
    
    
      _id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      25215
      1470
      3
      adelaide pigmy bluetongue skink, pygmy bluetongue
      Tiliqua
      Scincidae
      NaN
      Australia;
      NaN
      NaN
      NaN
      ...
      0.511818
      21
      0.16
      -10.0
      3.0
      0
      0.5
      1.0
      0.0
      1
    
    
      75598
      4363
      9
      albanian water frog
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      Albania; Montenegro;
      \r\r\r\n    The major threat is drainage of we...
      ["\n    It is listed on Appendix III of the Be...
      Decreasing
      ...
      0.531032
      21
      0.16
      -2.0
      10.0
      1
      1.0
      0.0
      1.0
      1
    
    
      75600
      4365
      9
      NaN
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      China;
      \r\r\r\n    The major threat is over-collectin...
      ['\n    The Shapotou location is within a nati...
      Decreasing
      ...
      0.455238
      23
      0.08
      -16.0
      1.0
      0
      0.5
      1.0
      0.0
      1
    
    
      78778
      7543
      9
      karpathos frog
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      Greece;
      \r\r\r\n    The major threat is habitat loss a...
      ['\n    It is listed on Appendix III of the Be...
      Decreasing
      ...
      0.470000
      22
      0.12
      -14.0
      2.0
      0
      0.5
      1.0
      0.0
      0
    
    
      83513
      4363
      10
      albanian water frog
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      Albania; Montenegro;
      \r\r\r\n    The major threat is drainage of we...
      ["\n    It is listed on Appendix III of the Be...
      Decreasing
      ...
      0.531032
      21
      0.16
      -2.0
      10.0
      1
      1.0
      0.0
      1.0
      1
    
  

5 rows × 48 columns



In [6]:

    
c['rtable_countries']









    Out[6]:





_id
25215                              ['Croatia', 'Montenegro']
75598                              ['Montenegro', 'Albania']
75600                              ['Montenegro', 'Albania']
78778                              ['Montenegro', 'Albania']
83513                              ['Montenegro', 'Albania']
83515                              ['Montenegro', 'Albania']
86693                              ['Montenegro', 'Albania']
105744                                             ['Japan']
105745                                             ['Japan']
105746                                             ['Japan']
105747                                             ['Japan']
109757                                             ['Japan']
109758                                             ['Japan']
109759                                             ['Japan']
109760                                             ['Japan']
109761                                             ['Japan']
109762                                             ['Japan']
109763                                             ['Japan']
109764                                             ['Japan']
109765                                             ['Japan']
109766                                             ['Japan']
109767                                             ['Japan']
109768                                             ['Japan']
109769                                             ['Japan']
166554                      ['Oman', 'United Arab Emirates']
174469                                   ['France', 'Spain']
182690                                            ['Mexico']
182694                                            ['Mexico']
182697                                            ['Mexico']
182700                                            ['Mexico']
                                 ...                        
9097111                               ['Bolivia', 'Uruguay']
9100153                               ['Bolivia', 'Uruguay']
9105025                                          ['Ecuador']
9105026                                          ['Ecuador']
9108068                                          ['Ecuador']
9118003    ['Angola', 'Equatorial Guinea', 'Gabon', 'Cent...
9125918    ['Angola', 'Equatorial Guinea', 'Gabon', 'Cent...
9133833                                   ['United Kingdom']
9137159                                   ['United Kingdom']
9141009                                   ['United Kingdom']
9141010                                   ['United Kingdom']
9145074                                   ['United Kingdom']
9148924                                   ['United Kingdom']
9148925                                   ['United Kingdom']
9157066                                            ['India']
9164981                                      ['Philippines']
9169927                                      ['Philippines']
9169928                                      ['Philippines']
9171058                                      ['Philippines']
9171059                                      ['Philippines']
9173116                                      ['Philippines']
9173120                                      ['Philippines']
9173121                                      ['Philippines']
9177842                          ['Philippines', 'Malaysia']
9177843                          ['Philippines', 'Malaysia']
9178973                          ['Philippines', 'Malaysia']
9178974                          ['Philippines', 'Malaysia']
9181031                          ['Philippines', 'Malaysia']
9181035                          ['Philippines', 'Malaysia']
9181036                          ['Philippines', 'Malaysia']
Name: rtable_countries, dtype: object



In [7]:

    
# add country match column
# author: @andrewedstrom
import operator, re, string
country_match_column = []

#create delimiter tokenizer using semicolon as delimiter
semi_tok = sm.DelimiterTokenizer(delim_set=['; '], return_set=True)
comma_tok = sm.DelimiterTokenizer(delim_set=[', '], return_set=True)

#create overlap coefficient similarity measure
oc = sm.OverlapCoefficient()

def clean_tokens(tok_list):
    new_list = []
    for s in tok_list:
        s = s.lower()
        
        #parentheticals
        if '(' in s:
            pstart = s.find('(')
            pend = s.find(')')
            if pend < len(s) - 1 and pend > 0:
                s = s[0:pstart] + s[pend+1:]
            else:
                s = s[0:pstart]
                
        #leading/trailing whitespace
        s = s.strip()
        
        #leading/trailing punctuation
        s = s.strip(string.punctuation)
        new_list.append(s)
    return new_list

for index, row in c.iterrows():
    lc = row['ltable_countries']
    rc = row['rtable_countries']
    
    ltok = semi_tok.tokenize(lc)
    rtok = comma_tok.tokenize(rc)
    ltok = clean_tokens(ltok)
    rtok = clean_tokens(rtok)
    
    overlap = oc.get_raw_score(ltok, rtok)
    
    country_match_column.append(overlap)
    
    
c['country_overlap'] = country_match_column
c.head()









    Out[7]:






  
    
      
      ltable_id
      rtable_id
      ltable_name
      ltable_genus
      ltable_family
      ltable_ecology
      ltable_countries
      ltable_threat_paragraph
      ltable_conservation_paragraph
      ltable_pop_trend
      ...
      countries_countries_lev_dist
      countries_countries_lev_sim
      countries_countries_nmw
      countries_countries_sw
      country_count_country_count_exm
      country_count_country_count_anm
      country_count_country_count_lev_dist
      country_count_country_count_lev_sim
      status_match
      country_overlap
    
    
      _id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      25215
      1470
      3
      adelaide pigmy bluetongue skink, pygmy bluetongue
      Tiliqua
      Scincidae
      NaN
      Australia;
      NaN
      NaN
      NaN
      ...
      21
      0.16
      -10.0
      3.0
      0
      0.5
      1.0
      0.0
      1
      0.0
    
    
      75598
      4363
      9
      albanian water frog
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      Albania; Montenegro;
      \r\r\r\n    The major threat is drainage of we...
      ["\n    It is listed on Appendix III of the Be...
      Decreasing
      ...
      21
      0.16
      -2.0
      10.0
      1
      1.0
      0.0
      1.0
      1
      1.0
    
    
      75600
      4365
      9
      NaN
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      China;
      \r\r\r\n    The major threat is over-collectin...
      ['\n    The Shapotou location is within a nati...
      Decreasing
      ...
      23
      0.08
      -16.0
      1.0
      0
      0.5
      1.0
      0.0
      1
      0.0
    
    
      78778
      7543
      9
      karpathos frog
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      Greece;
      \r\r\r\n    The major threat is habitat loss a...
      ['\n    It is listed on Appendix III of the Be...
      Decreasing
      ...
      22
      0.12
      -14.0
      2.0
      0
      0.5
      1.0
      0.0
      0
      0.0
    
    
      83513
      4363
      10
      albanian water frog
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      Albania; Montenegro;
      \r\r\r\n    The major threat is drainage of we...
      ["\n    It is listed on Appendix III of the Be...
      Decreasing
      ...
      21
      0.16
      -2.0
      10.0
      1
      1.0
      0.0
      1.0
      1
      1.0
    
  

5 rows × 49 columns



In [8]:

    
# Code to add country similarity as feature

# Author @Jabroni McBroniFace

#'ltable_country_count'
#'rtable_cCount'

# Add column for countryCountSimilarity (similar if within same count by one)
import operator, re

countryMatchColumn = []
for index, row in c.iterrows():
    lcCount = int(row['ltable_country_count'])
    rcCount = int(row['rtable_country_count'])
    
    if (lcCount-1) <= rcCount <= (lcCount+1):
        countryMatchColumn.append(1)
    else:
        countryMatchColumn.append(0)

c['country_count_sim'] = countryMatchColumn
c.head()









    Out[8]:






  
    
      
      ltable_id
      rtable_id
      ltable_name
      ltable_genus
      ltable_family
      ltable_ecology
      ltable_countries
      ltable_threat_paragraph
      ltable_conservation_paragraph
      ltable_pop_trend
      ...
      countries_countries_lev_sim
      countries_countries_nmw
      countries_countries_sw
      country_count_country_count_exm
      country_count_country_count_anm
      country_count_country_count_lev_dist
      country_count_country_count_lev_sim
      status_match
      country_overlap
      country_count_sim
    
    
      _id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      25215
      1470
      3
      adelaide pigmy bluetongue skink, pygmy bluetongue
      Tiliqua
      Scincidae
      NaN
      Australia;
      NaN
      NaN
      NaN
      ...
      0.16
      -10.0
      3.0
      0
      0.5
      1.0
      0.0
      1
      0.0
      1
    
    
      75598
      4363
      9
      albanian water frog
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      Albania; Montenegro;
      \r\r\r\n    The major threat is drainage of we...
      ["\n    It is listed on Appendix III of the Be...
      Decreasing
      ...
      0.16
      -2.0
      10.0
      1
      1.0
      0.0
      1.0
      1
      1.0
      1
    
    
      75600
      4365
      9
      NaN
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      China;
      \r\r\r\n    The major threat is over-collectin...
      ['\n    The Shapotou location is within a nati...
      Decreasing
      ...
      0.08
      -16.0
      1.0
      0
      0.5
      1.0
      0.0
      1
      0.0
      1
    
    
      78778
      7543
      9
      karpathos frog
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      Greece;
      \r\r\r\n    The major threat is habitat loss a...
      ['\n    It is listed on Appendix III of the Be...
      Decreasing
      ...
      0.12
      -14.0
      2.0
      0
      0.5
      1.0
      0.0
      0
      0.0
      1
    
    
      83513
      4363
      10
      albanian water frog
      Pelophylax
      Ranidae
      Terrestrial; Freshwater
      Albania; Montenegro;
      \r\r\r\n    The major threat is drainage of we...
      ["\n    It is listed on Appendix III of the Be...
      Decreasing
      ...
      0.16
      -2.0
      10.0
      1
      1.0
      0.0
      1.0
      1
      1.0
      1
    
  

5 rows × 50 columns



In [9]:

    
# c.to_csv('../blocking/candidate_set.csv')
c.to_csv('../blocking/candidate_set.csv')



In [ ]:

	ltable_id	rtable_id	ltable_name	ltable_genus	ltable_family	ltable_ecology	ltable_countries	ltable_threat_paragraph	ltable_conservation_paragraph	ltable_pop_trend	...	countries_countries_mel	countries_countries_lev_dist	countries_countries_lev_sim	countries_countries_nmw	countries_countries_sw	country_count_country_count_exm	country_count_country_count_anm	country_count_country_count_lev_dist	country_count_country_count_lev_sim	status_match
_id
25215	1470	3	adelaide pigmy bluetongue skink, pygmy bluetongue	Tiliqua	Scincidae	NaN	Australia;	NaN	NaN	NaN	...	0.511818	21	0.16	-10.0	3.0	0	0.5	1.0	0.0	1
75598	4363	9	albanian water frog	Pelophylax	Ranidae	Terrestrial; Freshwater	Albania; Montenegro;	\r\r\r\n The major threat is drainage of we...	["\n It is listed on Appendix III of the Be...	Decreasing	...	0.531032	21	0.16	-2.0	10.0	1	1.0	0.0	1.0	1
75600	4365	9	NaN	Pelophylax	Ranidae	Terrestrial; Freshwater	China;	\r\r\r\n The major threat is over-collectin...	['\n The Shapotou location is within a nati...	Decreasing	...	0.455238	23	0.08	-16.0	1.0	0	0.5	1.0	0.0	1
78778	7543	9	karpathos frog	Pelophylax	Ranidae	Terrestrial; Freshwater	Greece;	\r\r\r\n The major threat is habitat loss a...	['\n It is listed on Appendix III of the Be...	Decreasing	...	0.470000	22	0.12	-14.0	2.0	0	0.5	1.0	0.0	0
83513	4363	10	albanian water frog	Pelophylax	Ranidae	Terrestrial; Freshwater	Albania; Montenegro;	\r\r\r\n The major threat is drainage of we...	["\n It is listed on Appendix III of the Be...	Decreasing	...	0.531032	21	0.16	-2.0	10.0	1	1.0	0.0	1.0	1