In [2]:
#Read in Redlist

import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
style.use('ggplot')

# Read in csv as dataframe
pd.read_csv('countryInfo.txt', sep='\t', lineterminator='\r')


Out[2]:
#ISO ISO3 ISO-Numeric fips Country Capital Area(in sq km) Population Continent tld CurrencyCode CurrencyName Phone Postal Code Format Postal Code Regex Languages geonameid neighbours EquivalentFipsCode
0 \nAD AND 20 AN Andorra Andorra la Vella 468.00 84000 EU .ad EUR Euro 376 AD### ^(?:AD)*(\d{3})$ ca 3041565.0 ES,FR NaN
1 \nAE ARE 784 AE United Arab Emirates Abu Dhabi 82880.00 4975593 AS .ae AED Dirham 971 NaN NaN ar-AE,fa,en,hi,ur 290557.0 SA,OM NaN
2 \nAF AFG 4 AF Afghanistan Kabul 647500.00 29121286 AS .af AFN Afghani 93 NaN NaN fa-AF,ps,uz-AF,tk 1149361.0 TM,CN,IR,TJ,PK,UZ NaN
3 \nAG ATG 28 AC Antigua and Barbuda St. John's 443.00 86754 NaN .ag XCD Dollar +1-268 NaN NaN en-AG 3576396.0 NaN NaN
4 \nAI AIA 660 AV Anguilla The Valley 102.00 13254 NaN .ai XCD Dollar +1-264 NaN NaN en-AI 3573511.0 NaN NaN
5 \nAL ALB 8 AL Albania Tirana 28748.00 2986952 EU .al ALL Lek 355 NaN NaN sq,el 783754.0 MK,GR,ME,RS,XK NaN
6 \nAM ARM 51 AM Armenia Yerevan 29800.00 2968000 AS .am AMD Dram 374 ###### ^(\d{6})$ hy 174982.0 GE,IR,AZ,TR NaN
7 \nAO AGO 24 AO Angola Luanda 1246700.00 13068161 AF .ao AOA Kwanza 244 NaN NaN pt-AO 3351879.0 CD,NA,ZM,CG NaN
8 \nAQ ATA 10 AY Antarctica NaN 14000000.00 0 AN .aq NaN NaN NaN NaN NaN NaN 6697173.0 NaN NaN
9 \nAR ARG 32 AR Argentina Buenos Aires 2766890.00 41343201 SA .ar ARS Peso 54 @####@@@ ^([A-Z]\d{4}[A-Z]{3})$ es-AR,en,it,de,fr,gn 3865483.0 CL,BO,UY,PY,BR NaN
10 \nAS ASM 16 AQ American Samoa Pago Pago 199.00 57881 OC .as USD Dollar +1-684 NaN NaN en-AS,sm,to 5880801.0 NaN NaN
11 \nAT AUT 40 AU Austria Vienna 83858.00 8205000 EU .at EUR Euro 43 #### ^(\d{4})$ de-AT,hr,hu,sl 2782113.0 CH,DE,HU,SK,CZ,IT,SI,LI NaN
12 \nAU AUS 36 AS Australia Canberra 7686850.00 21515754 OC .au AUD Dollar 61 #### ^(\d{4})$ en-AU 2077456.0 NaN NaN
13 \nAW ABW 533 AA Aruba Oranjestad 193.00 71566 NaN .aw AWG Guilder 297 NaN NaN nl-AW,es,en 3577279.0 NaN NaN
14 \nAX ALA 248 NaN Aland Islands Mariehamn NaN 26711 EU .ax EUR Euro +358-18 ##### ^(?:FI)*(\d{5})$ sv-AX 661882.0 NaN FI
15 \nAZ AZE 31 AJ Azerbaijan Baku 86600.00 8303512 AS .az AZN Manat 994 AZ #### ^(?:AZ)*(\d{4})$ az,ru,hy 587116.0 GE,IR,AM,TR,RU NaN
16 \nBA BIH 70 BK Bosnia and Herzegovina Sarajevo 51129.00 4590000 EU .ba BAM Marka 387 ##### ^(\d{5})$ bs,hr-BA,sr-BA 3277605.0 HR,ME,RS NaN
17 \nBB BRB 52 BB Barbados Bridgetown 431.00 285653 NaN .bb BBD Dollar +1-246 BB##### ^(?:BB)*(\d{5})$ en-BB 3374084.0 NaN NaN
18 \nBD BGD 50 BG Bangladesh Dhaka 144000.00 156118464 AS .bd BDT Taka 880 #### ^(\d{4})$ bn-BD,en 1210997.0 MM,IN NaN
19 \nBE BEL 56 BE Belgium Brussels 30510.00 10403000 EU .be EUR Euro 32 #### ^(\d{4})$ nl-BE,fr-BE,de-BE 2802361.0 DE,NL,LU,FR NaN
20 \nBF BFA 854 UV Burkina Faso Ouagadougou 274200.00 16241811 AF .bf XOF Franc 226 NaN NaN fr-BF 2361809.0 NE,BJ,GH,CI,TG,ML NaN
21 \nBG BGR 100 BU Bulgaria Sofia 110910.00 7148785 EU .bg BGN Lev 359 #### ^(\d{4})$ bg,tr-BG 732800.0 MK,GR,RO,TR,RS NaN
22 \nBH BHR 48 BA Bahrain Manama 665.00 738004 AS .bh BHD Dinar 973 ####|### ^(\d{3}\d?)$ ar-BH,en,fa,ur 290291.0 NaN NaN
23 \nBI BDI 108 BY Burundi Bujumbura 27830.00 9863117 AF .bi BIF Franc 257 NaN NaN fr-BI,rn 433561.0 TZ,CD,RW NaN
24 \nBJ BEN 204 BN Benin Porto-Novo 112620.00 9056010 AF .bj XOF Franc 229 NaN NaN fr-BJ 2395170.0 NE,TG,BF,NG NaN
25 \nBL BLM 652 TB Saint Barthelemy Gustavia 21.00 8450 NaN .gp EUR Euro 590 ### ### NaN fr 3578476.0 NaN NaN
26 \nBM BMU 60 BD Bermuda Hamilton 53.00 65365 NaN .bm BMD Dollar +1-441 @@ ## ^([A-Z]{2}\d{2})$ en-BM,pt 3573345.0 NaN NaN
27 \nBN BRN 96 BX Brunei Bandar Seri Begawan 5770.00 395027 AS .bn BND Dollar 673 @@#### ^([A-Z]{2}\d{4})$ ms-BN,en-BN 1820814.0 MY NaN
28 \nBO BOL 68 BL Bolivia Sucre 1098580.00 9947418 SA .bo BOB Boliviano 591 NaN NaN es-BO,qu,ay 3923057.0 PE,CL,PY,BR,AR NaN
29 \nBQ BES 535 NaN Bonaire, Saint Eustatius and Saba NaN NaN 18012 NaN .bq USD Dollar 599 NaN NaN nl,pap,en 7626844.0 NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
222 \nTM TKM 795 TX Turkmenistan Ashgabat 488100.00 4940916 AS .tm TMT Manat 993 ###### ^(\d{6})$ tk,ru,uz 1218197.0 AF,IR,UZ,KZ NaN
223 \nTN TUN 788 TS Tunisia Tunis 163610.00 10589025 AF .tn TND Dinar 216 #### ^(\d{4})$ ar-TN,fr 2464461.0 DZ,LY NaN
224 \nTO TON 776 TN Tonga Nuku'alofa 748.00 122580 OC .to TOP Pa'anga 676 NaN NaN to,en-TO 4032283.0 NaN NaN
225 \nTR TUR 792 TU Turkey Ankara 780580.00 77804122 AS .tr TRY Lira 90 ##### ^(\d{5})$ tr-TR,ku,diq,az,av 298795.0 SY,GE,IQ,IR,GR,AM,AZ,BG NaN
226 \nTT TTO 780 TD Trinidad and Tobago Port of Spain 5128.00 1228691 NaN .tt TTD Dollar +1-868 NaN NaN en-TT,hns,fr,es,zh 3573591.0 NaN NaN
227 \nTV TUV 798 TV Tuvalu Funafuti 26.00 10472 OC .tv AUD Dollar 688 NaN NaN tvl,en,sm,gil 2110297.0 NaN NaN
228 \nTW TWN 158 TW Taiwan Taipei 35980.00 22894384 AS .tw TWD Dollar 886 ##### ^(\d{5})$ zh-TW,zh,nan,hak 1668284.0 NaN NaN
229 \nTZ TZA 834 TZ Tanzania Dodoma 945087.00 41892895 AF .tz TZS Shilling 255 NaN NaN sw-TZ,en,ar 149590.0 MZ,KE,CD,RW,ZM,BI,UG,MW NaN
230 \nUA UKR 804 UP Ukraine Kiev 603700.00 45415596 EU .ua UAH Hryvnia 380 ##### ^(\d{5})$ uk,ru-UA,rom,pl,hu 690791.0 PL,MD,HU,SK,BY,RO,RU NaN
231 \nUG UGA 800 UG Uganda Kampala 236040.00 33398682 AF .ug UGX Shilling 256 NaN NaN en-UG,lg,sw,ar 226074.0 TZ,KE,SS,CD,RW NaN
232 \nUM UMI 581 NaN United States Minor Outlying Islands NaN 0.00 0 OC .um USD Dollar 1 NaN NaN en-UM 5854968.0 NaN NaN
233 \nUS USA 840 US United States Washington 9629091.00 310232863 NaN .us USD Dollar 1 #####-#### ^\d{5}(-\d{4})?$ en-US,es-US,haw,fr 6252001.0 CA,MX,CU NaN
234 \nUY URY 858 UY Uruguay Montevideo 176220.00 3477000 SA .uy UYU Peso 598 ##### ^(\d{5})$ es-UY 3439705.0 BR,AR NaN
235 \nUZ UZB 860 UZ Uzbekistan Tashkent 447400.00 27865738 AS .uz UZS Som 998 ###### ^(\d{6})$ uz,ru,tg 1512440.0 TM,AF,KG,TJ,KZ NaN
236 \nVA VAT 336 VT Vatican Vatican City 0.44 921 EU .va EUR Euro 379 ##### ^(\d{5})$ la,it,fr 3164670.0 IT NaN
237 \nVC VCT 670 VC Saint Vincent and the Grenadines Kingstown 389.00 104217 NaN .vc XCD Dollar +1-784 NaN NaN en-VC,fr 3577815.0 NaN NaN
238 \nVE VEN 862 VE Venezuela Caracas 912050.00 27223228 SA .ve VEF Bolivar 58 #### ^(\d{4})$ es-VE 3625428.0 GY,BR,CO NaN
239 \nVG VGB 92 VI British Virgin Islands Road Town 153.00 21730 NaN .vg USD Dollar +1-284 NaN NaN en-VG 3577718.0 NaN NaN
240 \nVI VIR 850 VQ U.S. Virgin Islands Charlotte Amalie 352.00 108708 NaN .vi USD Dollar +1-340 #####-#### ^\d{5}(-\d{4})?$ en-VI 4796775.0 NaN NaN
241 \nVN VNM 704 VM Vietnam Hanoi 329560.00 89571130 AS .vn VND Dong 84 ###### ^(\d{6})$ vi,en,fr,zh,km 1562822.0 CN,LA,KH NaN
242 \nVU VUT 548 NH Vanuatu Port Vila 12200.00 221552 OC .vu VUV Vatu 678 NaN NaN bi,en-VU,fr-VU 2134431.0 NaN NaN
243 \nWF WLF 876 WF Wallis and Futuna Mata Utu 274.00 16025 OC .wf XPF Franc 681 ##### ^(986\d{2})$ wls,fud,fr-WF 4034749.0 NaN NaN
244 \nWS WSM 882 WS Samoa Apia 2944.00 192001 OC .ws WST Tala 685 NaN NaN sm,en-WS 4034894.0 NaN NaN
245 \nYE YEM 887 YM Yemen Sanaa 527970.00 23495361 AS .ye YER Rial 967 NaN NaN ar-YE 69543.0 SA,OM NaN
246 \nYT MYT 175 MF Mayotte Mamoudzou 374.00 159042 AF .yt EUR Euro 262 ##### ^(\d{5})$ fr-YT 1024031.0 NaN NaN
247 \nZA ZAF 710 SF South Africa Pretoria 1219912.00 49000000 AF .za ZAR Rand 27 #### ^(\d{4})$ zu,xh,af,nso,en-ZA,tn,st,ts,ss,ve,nr 953987.0 ZW,SZ,MZ,BW,NA,LS NaN
248 \nZM ZMB 894 ZA Zambia Lusaka 752614.00 13460305 AF .zm ZMW Kwacha 260 ##### ^(\d{5})$ en-ZM,bem,loz,lun,lue,ny,toi 895949.0 ZW,TZ,MZ,CD,NA,MW,AO NaN
249 \nZW ZWE 716 ZI Zimbabwe Harare 390580.00 11651858 AF .zw ZWL Dollar 263 NaN NaN en-ZW,sn,nr,nd 878675.0 ZA,MZ,BW,ZM NaN
250 \nCS SCG 891 YI Serbia and Montenegro Belgrade 102350.00 10829175 EU .cs RSD Dinar 381 ##### ^(\d{5})$ cu,hu,sq,sr NaN AL,HU,MK,RO,HR,BA,BG NaN
251 \nAN ANT 530 NT Netherlands Antilles Willemstad 960.00 136197 NaN .an ANG Guilder 599 NaN NaN nl-AN,en,es NaN GP NaN

252 rows × 19 columns


In [ ]:
# CME
rl.columns# get key

# Note: The id column of the  redlist csv doesn't actually look like the object created here (rl).  In the csv,
# id numbers jump around since they represent original row number of the original scraped data csv.  Apparently
# this library can detect what we want and enumerate things in order if you give it a key feature like this.

In [ ]:
#Read in ARKIVE
# Read in csv as dataframe
ar = em.read_csv_metadata("finalArkives.csv", encoding="ISO-8859-1", key="id")

# glance at first few rows
ar.head()

In [ ]:
# Remove parentheticals from arkive common names
# author: CME
# Next, actually remove scientific name and replace with common name(s)
#ar = arBeforeRemovingNames # in case script now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0 # make sure there's no wonky cases

# remove parenthesis contents
for index, row in ar.iterrows():
    genus = str(row['name'])

    pstart = genus.find("(")
    if pstart !=-1:        
        # extract full common/nick name
        noparens = genus[0:pstart]
        ar.loc[index, 'name'] = noparens
    else:
        error = error + 1

print(error)

In [ ]:
# Concatenate nickname and common_name fields in arkive
# author: @andrewedstrom

# remove parenthesis contents
nameColumn = []
for index, row in ar.iterrows():
    nickname = str(row['scientific_name']) #look here if errors start happening
    nickname = re.sub('[^A-Za-z0-9;,\s]+', '', nickname).lower()
    common = str(row['common_name'])
    common = re.sub('[^A-Za-z0-9;,\s]+', '', common).lower()
    if common in nickname: 
        nameColumn.append(nickname)
    else:
        nameColumn.append((nickname + "; " + common))
ar['name'] = nameColumn
ar.head()

In [2]:
# Lowercase redlist name field
# author: @andrewedstrom
error = 0 # make sure there's no wonky cases

# remove parenthesis contents
for index, row in rl.iterrows():
    n = str(row['name'])
    n = re.sub('[^A-Za-z0-9;,\s]+', '', n).lower()

    rl.loc[index, 'name'] = n.lower()

rl.head()


Out[2]:
id name genus family ecology countries threat_paragraph conservation_paragraph pop_trend status country_count scientific_name
0 0 ochlockonee moccasinshell Medionidus Unionidae Freshwater United States (Florida, Georgia); NaN NaN NaN \r\n Critically Endangered\r\n\r\n\r\n A1ce\r\n\r\n 1 Medionidus simpsonianus
1 1 nelsons spiny pocket mouse Heteromys Heteromyidae Terrestrial Guatemala; Mexico (Chiapas); \r\n The main threat to this species is the significant forest loss in its habitat. Habitat f... ['\n This species occurs in a newly-named national park, Tacana, in Mexico.\n\n \n '] Decreasing \r\n Endangered\r\n\r\n\r\n B1ab(i,ii,iii,v)\r\n\r\n 2 Heteromys nelsoni
2 2 comoro friar Amauris Nymphalidae NaN Comoros; NaN NaN NaN \r\n Endangered\r\n\r\n\r\n B1+2c, C2b\r\n\r\n 1 Amauris comorana
3 3 atlantic halibut, halibut Hippoglossus Pleuronectidae NaN <div>Atlantic – northeast; Atlantic – northwest</div>; Canada (Newfoundland I); Denmark; Far... NaN NaN NaN \r\n Endangered\r\n\r\n\r\n A1d\r\n\r\n 15 Hippoglossus hippoglossus
4 4 nan Hirasea Endodontidae NaN Japan (Ogasawara-shoto); NaN NaN NaN \r\n Endangered\r\n\r\n\r\n A1d\r\n\r\n 1 Hirasea acutissima

In [ ]:


In [ ]:
# Remove parentheticals from arkive genus
# author: @andrewedstrom
# Next, actually remove scientific name and replace with common name(s)
#ar = arBeforeRemovingNames # in case script now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0 # make sure there's no wonky cases

# remove parenthesis contents
for index, row in ar.iterrows():
    genus = str(row['genus'])

    pstart = genus.find("(")
    if pstart !=-1:        
        # extract full common/nick name
        noparens = genus[0:pstart]
        ar.loc[index, 'genus'] = noparens
    else:
        error = error + 1

print(error)

In [3]:
rl.to_csv('finalRedlist.csv')
#ar.to_csv('finalArkives.csv')

In [ ]:
# Rule-based blocking over input tables
# first get features that can be used
feature_table = em.get_features_for_blocking(rl, ar)

In [ ]:
#len(feature_table)
feature_table
#em.get_attr_corres(rl, ar)['ltable']

In [ ]:


In [ ]:
# CME -- I'm realizing now that we may want to start with the attribute equivalence
# blocker for names and see where that leaves us.  On my TO-DO.
equivB = em.AttrEquivalenceBlocker()
C0 = equivB.block_tables(rl, ar, 'name', 'name',
                    l_output_attrs=['name', 'genus'], 
                    r_output_attrs=['name', 'genus'])
C0

# lolwat this reveals zero matches?

In [ ]:
#Do overlap blocking on name to get candidate set
#Perform some kind of blocking to find candidate pairs
ob = em.OverlapBlocker()
C0 = ob.block_tables(rl, ar,'name', 'name', 
                     l_output_attrs=['name', 'genus', 'family'], 
                     r_output_attrs=['name', 'genus', 'family'], overlap_size=2)
C0

In [ ]:
#Only keep candidate pairs which share genus
ab = em.AttrEquivalenceBlocker()
C1 = ob.block_candset(C0, 'genus', 'genus')
C1

#Do overlap blocking on name to get candidate set
#Perform some kind of blocking to find candidate pairs
#C1 = ab.block_tables(rl, ar,'genus', 'genus', 
#                     l_output_attrs=['name', 'genus'], 
#                     r_output_attrs=['name', 'genus'])

In [ ]:
C2 = equivB.block_tables(rl, ar, 'family', 'family',
                    l_output_attrs=['name', 'genus' ,'family'], 
                    r_output_attrs=['name', 'genus', 'family'])

C3 = ob.block_candset(C2, 'genus', 'genus')
C3

In [ ]:
C = em.combine_blocker_outputs_via_union([C1, C3])
C

In [ ]:
em.to_csv_metadata(C, './candidate_set.csv')

In [ ]:
C = em.read_csv_metadata('candidate_set.csv', ltable=rl, rtable=ar)

In [ ]:
C

In [ ]: