In [2]:
import pandas as pd
from glob import glob
import os
import numpy as np
import warnings

In [3]:
def dist_pyth(lon1, lat1, lon2, lat2):
    """Calculate the distance between two points """
    return np.sqrt((lat2 - lat1)**2 + (lon2 - lon1)**2)


def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between one point 
    on the earth and an array of points (specified in decimal degrees)
    """
    
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in meters
    return c * r


def p_dist(lon1, lat1, lon2, lat2, units=None):
    """
    Calculate the distance between two *CLOSE* points in meters using Pythagoras
    """
    if units == 'm':
        dist = dist_pyth(lon1, lat1, lon2, lat2)
    elif units == 'deg':
        dist = haversine(lon1, lat1, lon2, lat2)
    else:
        raise ValueError('Units must be meters (m) or degrees (deg).')
        
    return dist
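
# Quick sanity check of the distance functions (an added illustration,
# not in the original notebook): one degree of latitude corresponds to
# ~111.2 km on a sphere with r = 6371 km, so a 1-degree step should
# return roughly 111195 m.
assert abs(p_dist(0., 0., 0., 1., units='deg') - 111194.93) < 1.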


def p_thin(df, xcol='x', ycol='y', datacols=('data',), radius=10, method='nanmean', units=None):
    """
    Thin a pandas DataFrame of points based on point distance. The first point in the DataFrame is taken, all points
    within the search radius are found, and the given method is applied to their data columns. The used points are then
    deleted from the frame and the procedure is repeated. The aggregated data are written to the point whose distance
    from the initial point is closest to half the search radius.

    Parameters
    ----------
    df: DataFrame of points
    xcol: name of the column with x coordinates
    ycol: name of the column with y coordinates
    datacols: names of the columns with data (list/tuple of strings)
    radius: search radius for the point distance (meters)
    method: aggregation method (a numpy function name). Default: 'nanmean'
    units: units of x/y, either 'm' (meters) or 'deg' (decimal degrees)

    Returns
    -------
    A thinned DataFrame

    """

    assert units is not None, 'Units of x/y must be meters (m) or degrees (deg).'
    
    thinned = pd.DataFrame([], columns=df.columns.values)

    print(len(df))
    ct = 0
    while len(df) > 0:
        df['DIST'] = p_dist(df[xcol].iloc[0], df[ycol].iloc[0], df[xcol], df[ycol], units=units)
        df = df.sort_values('DIST')

        subset = df[df.DIST <= radius]

        if not subset.empty:
            subset = subset.reset_index()
            # find the index of the point which is closest to half of the search distance
            # (the new data will be assigned to this point)
            assign_id = (subset.DIST - (radius / 2.)).abs().idxmin()
            
            fill_dict = {}
            fill_dict[xcol] = subset.loc[assign_id, xcol]
            fill_dict[ycol] = subset.loc[assign_id, ycol]
            for data in datacols:
                try:
                    fill_dict[data] = getattr(np,method)(subset[data].values)
                except TypeError:
                    fill_dict[data] = np.nan
            thinned.loc[len(thinned)] = pd.Series(fill_dict)
            ct += 1

            if ct % 100 == 0:
                print(ct, len(df))
                
        # delete the used points from the data
        df = df.drop(df[df.DIST <= radius].index.values)
        
    print(len(thinned))
    return thinned
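
A minimal usage sketch (an added illustration, not part of the original run; the tiny demo frame and its values are hypothetical): three synthetic points in projected meter coordinates are thinned with a 10 m radius.

In [ ]:
demo = pd.DataFrame({'x': [0., 5., 100.],
                     'y': [0., 0., 0.],
                     'data': [1., 3., 5.]})
# the first two points are 5 m apart (inside the 10 m radius) and are
# merged into one row with data = nanmean([1, 3]) = 2.0; the third point
# is 100 m away and survives as its own row (a copy is passed because
# p_thin adds a helper 'DIST' column to its input)
p_thin(demo.copy(), xcol='x', ycol='y', datacols=['data'],
       radius=10, method='nanmean', units='m')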

In [4]:
files = glob('C:\\Users\\jlandman\\Desktop\\newData\\CH_Huss-Fischer\\GlaThiDa2.0_UFR\\results\\*_TTT.xlsx')

In [5]:
"""
thin_dict = {'tortin': 6,
             'tsanfleuron': 6,
    
}

date_dict = {'tortin':20130399,
            'tsanfleuron':20100399
}
"""
search_dist = 10  # search radius for thinning (m)

In [6]:
for file in files:
    fname = os.path.basename(file)
    data = pd.read_excel(file)
    punit = data['POLITICAL_UNIT'].values[0]
    gname = data['GLACIER_NAME'].values[0]
    sdate = data['SURVEY_DATE'].values[0]
    
    # old thinning
    # data = data.groupby(data.index // thin_dict[fname.split('_')[0]]).mean()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        data = p_thin(data, xcol='POINT_LON', ycol='POINT_LAT', datacols=['ELEVATION', 'THICKNESS', 'THICKNESS_UNCERTAINTY'], 
                      radius=search_dist, method='nanmean', units='deg')
    
    data['ELEVATION'] = data['ELEVATION'].round()
    data['THICKNESS'] = data['THICKNESS'].round()
    data['POLITICAL_UNIT'] = punit
    data['GLACIER_NAME'] = gname
    data['POINT_ID'] = range(1,len(data)+1)
    data['SURVEY_DATE'] = sdate
    if not pd.isnull(data['REMARKS']).any():
        data['REMARKS'] = data['REMARKS'].astype(str) + ' Point data have been thinned (mean) within a search distance of %s m.' % search_dist
    else:
        data['REMARKS'] = 'Point data have been thinned (mean) within a search distance of %s m.' % search_dist
    
    
    data = data[['GlaThiDa_ID', 'POLITICAL_UNIT', 'GLACIER_NAME', 'SURVEY_DATE', 'POINT_ID', 'POINT_LAT', 'POINT_LON', 'ELEVATION',
                 'THICKNESS', 'THICKNESS_UNCERTAINTY', 'DATA_FLAG', 'REMARKS']]

    data.to_excel(os.path.join(os.path.dirname(file), fname.split('.')[0] + '_thinned.xlsx'), index=False)


1747
95
5007
100 3709
200 2254
300 1014
386
2014
100 934
186
2753
100 1654
200 557
251
2266
100 1137
200 62
206
995
90
1263
90
1356
100 153
116
3238
100 1908
200 737
267
3250
100 1984
200 885
278
2754
100 1660
200 507
237
6227
100 5043
200 3906
300 2763
400 1708
500 595
555
6299
100 5716
200 5269
300 4945
400 4394
500 3998
600 3481
700 2933
800 2335
900 1136
1000 507
1064
308
100 128
166
