In [2]:
#Read in Redlist

import pandas as pd
import py_entitymatching as em
import math
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import style
import re
style.use('ggplot')

# Read in csv as dataframe
CI = em.read_csv_metadata('countryInfo.txt', sep='\t', lineterminator='\r')
ar = em.read_csv_metadata("../finalArkives.csv", encoding="ISO-8859-1", key="id")
CI1 = CI[['#ISO','Country']]
CI1.rename(columns={'#ISO':'id'}, inplace=True)
CI1.head()


Metadata file is not present in the given path; proceeding to read the csv file.
Metadata file is not present in the given path; proceeding to read the csv file.
c:\users\aparn\anaconda\envs\my_first_env\lib\site-packages\pandas\core\frame.py:2816: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)
Out[2]:
id Country
0 \nAD Andorra
1 \nAE United Arab Emirates
2 \nAF Afghanistan
3 \nAG Antigua and Barbuda
4 \nAI Anguilla

In [ ]:
names = []
for index, row in CI1.iterrows():
    name = str(row['id'])
    name = name.replace('\n','')
    names.append(name)
    
CI1['id'] = names

CI1 = CI1.set_index(['id'])
CI1.head()

In [ ]:
#Read in ARKIVE
# Read in csv as dataframe
#CI1.loc[-1] = ['UK', 'United Kingdom']
# glance at first few rows
CI1.head()

In [ ]:
ar.columns

In [ ]:
# Concatenate nickname and common_name fields in arkive
# author: @andrewedstrom

# remove parenthesis contents
countries = []
for index, row in ar.iterrows():
    nickname = str(row[' countries']) #look here if errors start happening
    country =  str(re.findall(r"'(.*?)'", nickname, re.DOTALL))
    #print(country)
    country = country.replace(',', '')
    countries.append(country)
ar['country'] = countries
ar.columns
#ar.head()

In [ ]:
#ar.drop([' countries'], axis = 1, inplace=True)
ar.columns

In [ ]:
ar.head()

In [ ]:
# Remove parentheticals from arkive genus
# author: @andrewedstrom
# Next, actually remove scientific name and replace with common name(s)
#ar = arBeforeRemovingNames # in case script now run completely in order
#ar = ar[ar.animalName.str.contains("\(") == True] # update df to contain only tuples that contain common/nick names
error = 0 # make sure there's no wonky cases

# remove parenthesis contents
finalC = []
for index, row in ar.iterrows():
    list_country = str(row['country'])
    country = re.findall(r"'(.*?)'", list_country, re.DOTALL)
    cout = []
    for n in country:
        ct = str(CI1.loc[n])
        ct = ct.split('Name')[0]
        ct = ct.replace('Country','')
        ct = ct.lstrip().rstrip()
        cout.append(ct)
    cout = str(cout)
    finalC.append(cout)
ar['countries'] = finalC

In [ ]:
#ar.head()
#ar.drop(['country'], axis = 1, inplace=True)
#ar = em.read_csv_metadata("countryKeys.csv", encoding="ISO-8859-1", key="id")
#ar.to_csv('finalArkives.csv')
ar.head()

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: