Modules


In [1]:
import pandas as pd
from glob import glob
import os
import numpy as np
import matplotlib.pyplot as plt
import warnings
% matplotlib inline

Point thinning functions


In [2]:
def dist_pyth(lon1, lat1, lon2, lat2):
    """Calculate the distance between two points """
    return np.sqrt((lat2 - lat1)**2 + (lon2 - lon1)**2)


def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between one point 
    on the earth and an array of points (specified in decimal degrees)
    """
    
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in meters
    return c * r


def p_dist(lon1, lat1, lon2, lat2, units=None):
    """
    Calculate the distance between two *CLOSE* points in meters using Pythagoras
    """
    if units == 'm':
        dist = dist_pyth(lon1, lat1, lon2, lat2)
    elif units == 'deg':
        dist = haversine(lon1, lat1, lon2, lat2)
    else:
        raise ValueError('Units must be meters (m) or degrees (deg).')
        
    return dist


def p_thin(df, xcol='x', ycol='y', datacols='data', radius=10, method='nanmean', units=None):
    """
    Thin a pandas point series based on distance of the points.
    
    Parameters
    ----------
    pseries: Series of points
    xcol: name of column with x coordinates
    ycol: name of columns with y coordinates
    datacol: name fo columns with data
    radius: search radius for point distance
    method: calculation method (a numpy method). Default: 'nanmean'
    
    Returns
    -------
    A thinned dataframe
    
    """
    
    assert units!=None,'Units must be meters (m) or degrees (deg).'
    
    thinned = pd.DataFrame([],columns=df.columns.values)
    
    print(len(df))
    ct=0
    while len(df)>0:
        df['DIST'] = p_dist(df[xcol].iloc[0], df[ycol].iloc[0], df[xcol], df[ycol], units=units)
        df = df.sort_values('DIST')

        subset = df[df.DIST <= radius]
        
        if not subset.empty:
            subset = subset.reset_index()
            fill_dict = {}
            fill_dict[xcol] = subset.loc[int(len(subset)/2), xcol]
            fill_dict[ycol] = subset.loc[int(len(subset)/2), ycol]
            for data in datacols:
                try:
                    fill_dict[data] = getattr(np,method)(subset[data].values)
                except TypeError:
                    fill_dict[data] = np.nan
            thinned.loc[len(thinned)] = pd.Series(fill_dict)
            ct+=1

            if ct%100==0:
                print(ct, len(df))
        # delete the used points from the data
        df = df.drop(df[df.DIST <= radius].index.values)
        
    print(len(thinned))
    return thinned

Files


In [3]:
files = glob('C:\\Users\\jlandman\\Desktop\\newData\\CL_Schaefer\\GlaThiDa2-0_MS*_Johannes.xls')
print(files)


['C:\\Users\\jlandman\\Desktop\\newData\\CL_Schaefer\\GlaThiDa2-0_MS2_Johannes.xls', 'C:\\Users\\jlandman\\Desktop\\newData\\CL_Schaefer\\GlaThiDa2-0_MS_Johannes.xls']

In [4]:
# no longer needed
"""
thin_dict = {'Del Potro':8,
             'Tronquitos':6,
             'Tapado':12,
             'Juncal Norte': 20,   
             'Rio Blanco':24,
             'Tupungatito':22,
             'Marmolejo':26    
}
"""


Out[4]:
"\nthin_dict = {'Del Potro':8,\n             'Tronquitos':6,\n             'Tapado':12,\n             'Juncal Norte': 20,   \n             'Rio Blanco':24,\n             'Tupungatito':22,\n             'Marmolejo':26    \n}\n"

In [5]:
search_dist = 10 # forward search distance for point thinning

In [6]:
data_new = pd.DataFrame([])

for file in files:
    fname = os.path.basename(file)
    data = pd.read_excel(file, sheetname=1)
    data = data.apply(lambda x: pd.to_numeric(x, errors='ignore'))

    for glacier in np.unique(data.GLACIER_NAME.values):
        temp = data[data.GLACIER_NAME==glacier]

        gtd_id = temp[temp.GLACIER_NAME==glacier].GlaThiDa_ID.values[0]
        pol_u = temp[temp.GLACIER_NAME==glacier].POLITICAL_UNIT.values[0]
        surv_date = temp[temp.GLACIER_NAME==glacier].SURVEY_DATE.values[0]
        remarks = temp[temp.GLACIER_NAME==glacier].REMARKS.iloc[0] + ' Point data have been thinned (mean) within a search distance of %s m.' % search_dist
        gname = glacier
        print(gname)
        #former data thinning
        #temp = temp.groupby(temp.index // thin_dict[glacier]).mean() # average over every x values as specified above

        try:
            temp['ELEVATION'] = temp['ELEVATION']
        except KeyError:
            temp['ELEVATION'] = np.nan
        except ValueError:
            temp['ELEVATION'] = np.nan
        
        # thin the data and filter "mean on empyt slice" warnings (in case there is no elevation)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            temp = p_thin(temp, xcol='POINT_LON', ycol='POINT_LAT', datacols=['THICKNESS', 'ELEVATION'], radius=search_dist, method='nanmean', units='deg')

        temp['POLITICAL_UNIT'] = pol_u
        temp['GLACIER_NAME'] = gname.upper()
        temp['REMARKS'] = remarks
        temp['THICKNESS_UNCERTAINTY'] = np.nan
        temp['DATA_FLAG'] = np.nan
        temp['GlaThiDa_ID'] = gtd_id
        temp['SURVEY_DATE'] = surv_date
        temp['POINT_ID'] = range(1,len(temp)+1)

        data_new = data_new.append(temp, ignore_index=True)
   
data_new['THICKNESS'] = data_new['THICKNESS'].round()
data_new['ELEVATION'] = data_new['ELEVATION'].round()
data_new['POINT_LAT'] = data_new['POINT_LAT'].round(8)
data_new['POINT_LON'] = data_new['POINT_LON'].round(9)
data_new = data_new[['GlaThiDa_ID','POLITICAL_UNIT', 'GLACIER_NAME', 'SURVEY_DATE', 'POINT_ID',  'POINT_LAT',  'POINT_LON',  'ELEVATION', 
                     'THICKNESS', 'THICKNESS_UNCERTAINTY', 'DATA_FLAG', 'REMARKS']]

data_new = data_new.sort_values(by=['GlaThiDa_ID','POINT_ID'])
data_new.to_excel(os.path.join(os.path.dirname(file),fname.split('.')[0]+'_thinned.xlsx'), index=False)


Marmolejo
20549
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:64: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100 19527
200 18304
300 16511
400 14387
500 12587
600 10917
700 9491
800 8279
900 6866
1000 5564
1100 4267
1200 3289
1300 2298
1400 1374
1500 411
1544
Rio Blanco
23603
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:64: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100 21893
200 20753
300 19626
400 18342
500 16685
600 15626
700 14610
800 13322
900 10994
1000 9836
1100 8926
1200 7811
1300 6695
1400 5506
1500 4551
1600 3555
1700 2223
1800 910
1871
Tupungatito
21344
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:64: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100 19848
200 18123
300 17082
400 15840
500 14668
600 13689
700 12549
800 11180
900 10022
1000 8860
1100 7761
1200 6629
1300 5729
1400 4298
1500 2767
1600 1254
1685
Del Potro
7909
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:64: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100 6263
200 5193
300 3732
400 2429
500 1109
542
Juncal Norte
20416
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:64: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100 18995
200 17499
300 16129
400 14554
500 12848
600 11551
700 9928
800 8729
900 7322
1000 6052
1100 4463
1200 3134
1300 2036
1400 936
1469
Tapado
14435
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:64: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100 11970
200 10065
300 8366
400 6499
500 4270
600 2683
700 1358
800 339
836
Tronquitos
6666
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
C:\Users\jlandman\Anaconda3\lib\site-packages\ipykernel\__main__.py:64: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
100 4882
200 3756
300 2670
400 1686
500 302
516