notebook.community

Edit and run



In [2]:

    
#Read in Redlist

import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
style.use('ggplot')

# Read in csv as dataframe
CI = em.read_csv_metadata('countryInfo.txt', sep='\t', lineterminator='\r')
ar = em.read_csv_metadata("../finalArkives.csv", encoding="ISO-8859-1", key="id")
CI1 = CI[['#ISO','Country']]
CI1.rename(columns={'#ISO':'id'}, inplace=True)
CI1.head()









    



Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.
c:\users\aparn\anaconda\envs\my_first_env\lib\site-packages\pandas\core\frame.py:2816: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)






    Out[2]:






  
    
      
      id
      Country
    
  
  
    
      0
      \nAD
      Andorra
    
    
      1
      \nAE
      United Arab Emirates
    
    
      2
      \nAF
      Afghanistan
    
    
      3
      \nAG
      Antigua and Barbuda
    
    
      4
      \nAI
      Anguilla



In [ ]:

    
names = []
for index, row in CI1.iterrows():
    name = str(row['id'])
    name = name.replace('\n','')
    names.append(name)
    
CI1['id'] = names

CI1 = CI1.set_index(['id'])
CI1.head()



In [ ]:

    
#Read in ARKIVE
# Read in csv as dataframe
#CI1.loc[-1] = ['UK', 'United Kingdom']
# glance at first few rows
CI1.head()



In [ ]:

    
ar.columns



In [ ]:

    
# Concatenate nickname and common_name fields in arkive
# author: @andrewedstrom

# remove parenthesis contents
countries = []
for index, row in ar.iterrows():
    nickname = str(row[' countries']) #look here if errors start happening
    country =  str(re.findall(r"'(.*?)'", nickname, re.DOTALL))
    #print(country)
    country = country.replace(',', '')
    countries.append(country)
ar['country'] = countries
ar.columns
#ar.head()



In [ ]:

    
#ar.drop([' countries'], axis = 1, inplace=True)
ar.columns



In [ ]:

    
ar.head()



In [ ]:

    
# Remove parentheticals from arkive genus
# author: @andrewedstrom
# Next, actually remove scientific name and replace with common name(s)
#ar = arBeforeRemovingNames # in case script now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0 # make sure there's no wonky cases

# remove parenthesis contents
finalC = []
for index, row in ar.iterrows():
    list_country = str(row['country'])
    country = re.findall(r"'(.*?)'", list_country, re.DOTALL)
    cout = []
    for n in country:
        ct = str(CI1.loc[n])
        ct = ct.split('Name')[0]
        ct = ct.replace('Country','')
        ct = ct.lstrip().rstrip()
        cout.append(ct)
    cout = str(cout)
    finalC.append(cout)
ar['countries'] = finalC



In [ ]:

    
#ar.head()
#ar.drop(['country'], axis = 1, inplace=True)
#ar = em.read_csv_metadata("countryKeys.csv", encoding="ISO-8859-1", key="id")
#ar.to_csv('finalArkives.csv')
ar.head()



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

	id	Country
0	\nAD	Andorra
1	\nAE	United Arab Emirates
2	\nAF	Afghanistan
3	\nAG	Antigua and Barbuda
4	\nAI	Anguilla