In [1]:
import pandas as pd
import numpy as np
import os
In [2]:
michi = pd.read_csv('C:\\users\\jlandman\\Desktop\\michi.csv', encoding='latin-1', sep=',')
lecl = pd.read_csv('C:\\users\\jlandman\\Desktop\\lecl_extra.csv', encoding='latin-1')
world_links = pd.read_csv('C:\\Users\\jlandman\\Desktop\\Manual_links_WGMS_to_RGI_GlaThiDa_Leclercq_WORLD_20160212.csv', encoding='latin-1')
alps_links = pd.read_csv('C:\\Users\\jlandman\\Desktop\\ALPS_LINKS_FROM_GH.csv', encoding='latin-1')
sgi_links = pd.read_csv('C:\\Users\\jlandman\\Documents\\github\\glaciers-cci5\\glaciers_cci5\\sgi2010_rgi_links.csv', encoding='latin-1')
#ak_links = pd.read_csv('C:\\Users\\jlandman\\Desktop\\LeBris_Paul_to_FG\\Links_GLIMS_FoG_AK_final_v2.csv', encoding='latin-1')
#fog_2016_a = pd.read_csv('C:\\Users\\jlandman\\Desktop\\DOI-WGMS-FoG-2016-08\\WGMS-FoG-2016-08-A-GLACIER.csv', index_col='WGMS_ID', encoding='latin-1')
In [3]:
sgi_links = sgi_links.rename(columns={'POLTITICAL_UNIT':'POLITICAL_UNIT'})
In [4]:
# Select status = True in the manually checked files
lecl_true = lecl[lecl['status'] == True]
world_true = world_links[world_links['status'] == True]
alps_true = alps_links[alps_links['status'] == True]
In [5]:
# make WGMS_ID Integers
lecl_true['WGMS_ID'] = lecl_true['WGMS_ID'].astype(int)
world_true['WGMS_ID'] = world_true['WGMS_ID'].astype(int)
alps_true['WGMS_ID'] = alps_true['WGMS_ID'].astype(int)
sgi_links['WGMS_ID'] = sgi_links['WGMS_ID'].astype(int)
#ak_links['WGMS_ID'] = ak_links['WGMS_ID'].astype(int)
In [6]:
# set WGMS_ID as index
michi_ind = michi.set_index('WGMS_ID')
lecl_ind = lecl_true.set_index('WGMS_ID')
world_ind = world_true.set_index('WGMS_ID')
alps_ind = alps_true.set_index('WGMS_ID')
sgi_ind = sgi_links.set_index('WGMS_ID')
#ak_ind = ak_links.set_index('WGMS_ID')
In [7]:
lecl_ind.rename(columns={'RGI_ID': 'RGI_ID_LEC'}, inplace=True)
world_ind.rename(columns={'RGI_ID': 'RGI_ID_WORLD'}, inplace=True)
alps_ind.rename(columns={'RGI_ID': 'RGI_ID_ALPS'}, inplace=True)
sgi_ind.rename(columns={'RGI_ID': 'RGI_ID_SGI'}, inplace=True)
#ak_ind.rename(columns={'GLIMS_ID': 'GLIMS_ID_AK'}, inplace=True)
In [8]:
sgi_ind.head()
Out[8]:
In [9]:
len(sgi_ind)
Out[9]:
In [10]:
print(len(np.unique([i for i in michi.RGI_ID.values if isinstance(i, str)])))
print(len([i for i in michi.RGI_ID.values if isinstance(i, str)]))
print(len(set([i for i in michi.RGI_ID.values if isinstance(i,str)])))
dup_RGI = np.unique([x for x in michi.RGI_ID.values.tolist() if michi.RGI_ID.values.tolist().count(x) > 1])
list(dup_RGI).remove('nan')
In [11]:
michi[michi.RGI_ID.isin(dup_RGI)].sort_values(by='RGI_ID')
Out[11]:
In [12]:
print(len([i for i in michi_ind.RGI_ID.values if isinstance(i, str)]))
In [13]:
michi_update = michi_ind.copy()
In [14]:
# Outer-join all four candidate RGI-link columns onto the FoG table; 'outer'
# also keeps WGMS IDs that appear only in one of the link sources.
michi_update = michi_update.join([lecl_ind.RGI_ID_LEC, world_ind.RGI_ID_WORLD, alps_ind.RGI_ID_ALPS, sgi_ind.RGI_ID_SGI], how='outer')
In [15]:
# Fill gaps (e.g. POLITICAL_UNIT/NAME of rows that only exist in the SGI
# list) from sgi_ind; overwrite=False leaves existing FoG values untouched.
michi_update.update(sgi_ind, overwrite=False)
In [16]:
michi_update[pd.isnull(michi_update.POLITICAL_UNIT)]
Out[16]:
In [17]:
for i in range(len(michi_update.RGI_ID_LEC.values)):
if isinstance(michi_update.RGI_ID_LEC.values[i], str):
print(michi_update.RGI_ID_LEC.values[i])
In [18]:
michi_update[michi_update[['RGI_ID_LEC', 'RGI_ID_WORLD', 'RGI_ID_ALPS', 'RGI_ID_SGI']].isnull().sum(axis=1) <3][:20]
Out[18]:
In [19]:
michi_update[~pd.isnull(michi_update.RGI_ID.values)][0:25]# & michi_update[['RGI_ID_LEC', 'RGI_ID_WORLD', 'RGI_ID_ALPS', 'RGI_ID_SGI']].isnull().sum(axis=1) <3]
Out[19]:
In [20]:
# Column inventory after the join.
michi_update.columns.values
Out[20]:
In [21]:
# Cast the five RGI columns to str: NaN becomes the literal string 'nan',
# which the comparison cells below rely on as a sentinel (it is converted
# back to np.nan in cell 30).
michi_update[['RGI_ID', 'RGI_ID_LEC', 'RGI_ID_WORLD', 'RGI_ID_ALPS', 'RGI_ID_SGI']] = michi_update[['RGI_ID', 'RGI_ID_LEC', 'RGI_ID_WORLD', 'RGI_ID_ALPS', 'RGI_ID_SGI']].astype(str)
In [22]:
for i in range(len(michi_update.RGI_ID_LEC.values)):
if isinstance(michi_update.RGI_ID_LEC.values[i], float):
print(michi_update.RGI_ID_LEC.values[i])
In [23]:
michi_update.loc[(michi_update.RGI_ID != michi_update.RGI_ID_LEC)
& (michi_update.RGI_ID_LEC != 'nan')
& (michi_update.RGI_ID != 'nan')]
Out[23]:
In [24]:
michi_update.loc[(michi_update.RGI_ID != michi_update.RGI_ID_WORLD)
& (michi_update.RGI_ID_WORLD != 'nan')
& (michi_update.RGI_ID != 'nan')]
Out[24]:
In [25]:
michi_update.loc[(michi_update.RGI_ID != michi_update.RGI_ID_ALPS)
& (michi_update.RGI_ID_ALPS != 'nan')
& (michi_update.RGI_ID != 'nan')]
Out[25]:
In [26]:
michi_update.loc[(michi_update.RGI_ID != michi_update.RGI_ID_SGI)
& (michi_update.RGI_ID_SGI != 'nan')
& (michi_update.RGI_ID != 'nan')]
Out[26]:
In [27]:
# This is Chorabari glacier -> no RGI equivalent, would be confusing
michi_update = michi_update.drop(3640)
In [28]:
michi_update[pd.isnull(michi_update['NAME'])]
Out[28]:
In [29]:
michi_update[pd.isnull(michi_update['POLITICAL_UNIT'])]
Out[29]:
In [30]:
# Replace back 'nan' to np.nan
michi_update = michi_update.replace('nan', np.nan)
In [31]:
# What remark values exist before the column is reset below?
np.unique(michi_update.REMARKS.values)
Out[31]:
In [32]:
michi_update.REMARKS = np.nan
In [33]:
# This is necessary to make the remarks extendable in the next step (convert np.nan to empty string)
michi_update.REMARKS = michi_update.REMARKS.fillna('')
In [34]:
# ATTENTION, ORDER!
# we FIRST fill the df with the manual links and then only the np.nan values with the automatic links so that
# manual links (potentially "more correct") are not overwritten by potentially false geometric links
# Priority: Leclercq > WORLD > ALPS (all manual) > SGI (automated).
michi_update['RGI_ID_new'] = np.nan
# Seed with the Leclercq links (highest priority); everything else stays NaN.
michi_update['RGI_ID_new'] = np.where(~pd.isnull(michi_update.RGI_ID_LEC), michi_update.RGI_ID_LEC, np.nan)
# Fill remaining gaps from the WORLD and ALPS manual link lists.
michi_update['RGI_ID_new'] = np.where(pd.isnull(michi_update.RGI_ID_new), michi_update.RGI_ID_WORLD, michi_update.RGI_ID_new)
michi_update['RGI_ID_new'] = np.where(pd.isnull(michi_update.RGI_ID_new), michi_update.RGI_ID_ALPS, michi_update.RGI_ID_new)
# now it's time to fill the remarks column with "RGI linked checked manually", where there is an entry in RGI_ID_new
michi_update.loc[~pd.isnull(michi_update['RGI_ID_new']), 'REMARKS'] = michi_update['REMARKS'] + 'RGI link checked manually'
print(len(michi_update[michi_update.REMARKS == 'RGI link checked manually']))
# Only now add the automated SGI links, and tag rows whose REMARKS is still
# empty (i.e. rows that were filled automatically, not manually).
michi_update['RGI_ID_new'] = np.where(pd.isnull(michi_update.RGI_ID_new), michi_update.RGI_ID_SGI, michi_update.RGI_ID_new)
michi_update.loc[(~pd.isnull(michi_update['RGI_ID_new'])) & (michi_update['REMARKS'].str.len() == 0.), 'REMARKS'] = michi_update['REMARKS'] + 'RGI link automated'
# The manual count must be unchanged by the automated pass.
print(len(michi_update[michi_update.REMARKS == 'RGI link checked manually']))
In [35]:
# Final column inventory before the subset below.
michi_update.columns.values
Out[35]:
In [36]:
michi_update = michi_update[['POLITICAL_UNIT', 'NAME', 'PSFG_ID', 'WGI_ID', 'GLIMS_ID', 'RGI_ID_new', 'REMARKS']]
michi_update
Out[36]:
In [37]:
print(len(np.unique([i for i in michi_update.RGI_ID_new.values if isinstance(i, str)])))
print(len([i for i in michi_update.RGI_ID_new.values if isinstance(i, str)]))
print(len(set([i for i in michi_update.RGI_ID_new.values if isinstance(i,str)])))
dup_RGI_update = np.unique([x for x in michi_update.RGI_ID_new.values.tolist() if michi_update.RGI_ID_new.values.tolist().count(x) > 1])
list(dup_RGI_update).remove('nan')
michi_update.loc[michi_update.RGI_ID_new.isin(dup_RGI_update), ['POLITICAL_UNIT','NAME','RGI_ID_new'] ].sort_values(by='RGI_ID_new')#.to_csv('c:\\users\\jlandman\\Desktop\\Wrong_links_FoG_RGI_fuer_Fabi.csv')
Out[37]:
In [38]:
# According to Horst, they shouldn't be deleted
# this indice should be deleted
# NOTE(review): the two comments above contradict each other, and `delete`
# is built here but never applied anywhere in this notebook — confirm with
# Horst whether these WGMS IDs should actually be removed.
delete = [
5862, # shares a polygon with Jamtalferner in RGI (ice connected), but would be major error
4966, # Titlis and Unter Rotegg are separated in FoG
4959,
]
# OBERS ISCHMEER/FIESCHER/OCHS-N?
In [39]:
# Resolution notes for individual duplicate glaciers:
# Gebroulaz: two RGI Polygons, but only one is captured
#Fenga: RGI50-11.00723 OK
#Horn K. (Ziller): RGI50-11.00459 OK
In [40]:
michi_update = michi_update.rename({'RGI_ID_new':'RGI_ID'})
michi_update.to_csv('c:\\users\\jlandman\\Desktop\\WGMS-FoG-2016-08-AA-GLACIER-ID-LUT_updated.csv', encoding='latin-1')
In [41]:
website = pd.read_csv('C:\\Users\\jlandman\\Desktop\\00_rgi50_links\\00_rgi50_links.csv', skiprows=2, encoding='latin-1')
In [42]:
website = website.set_index('FoGId')
In [43]:
all_ind = np.hstack((website.index.values, michi_update.index.values))
In [44]:
all_ind.shape
Out[44]:
In [45]:
dupl_ind = np.unique([i for i in all_ind if all_ind.tolist().count(i) > 1])
In [46]:
len(dupl_ind)
Out[46]:
In [47]:
for i in dupl_ind:
if not (pd.isnull(website.loc[i, 'GLIMSId']) or pd.isnull(michi_update.loc[i, 'GLIMS_ID'])):
if website.loc[i, 'GLIMSId'] != michi_update.loc[i, 'GLIMS_ID']:
print(i, website.loc[i, 'Name'],website.loc[i, 'GLIMSId'], michi_update.loc[i, 'GLIMS_ID'])
if not (pd.isnull(website.loc[i, 'RGIId']) or pd.isnull(michi_update.loc[i, 'RGI_ID_new'])):
if (website.loc[i, 'RGIId'] != michi_update.loc[i, 'RGI_ID_new']):
print(i, website.loc[i, 'Name'],website.loc[i, 'RGIId'], michi_update.loc[i, 'RGI_ID_new'])
In [ ]: