In [14]:
import pandas as pd
from glob import glob
import os
import numpy as np

In [15]:
def dist_pyth(lon1, lat1, lon2, lat2):
    """Calculate the distance between two points """
    return np.sqrt((lat2 - lat1)**2 + (lon2 - lon1)**2)


def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between one point 
    on the earth and an array of points (specified in decimal degrees)
    """
    
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in meters
    return c * r


def p_dist(lon1, lat1, lon2, lat2, units=None):
    """
    Calculate the distance between two *CLOSE* points in meters using Pythagoras
    """
    if units == 'm':
        dist = dist_pyth(lon1, lat1, lon2, lat2)
    elif units == 'deg':
        dist = haversine(lon1, lat1, lon2, lat2)
    else:
        raise ValueError('Units must be meters (m) or degrees (deg).')
        
    return dist


def p_thin(df, xcol='x', ycol='y', datacols=('data',), radius=10, method='nanmean', units=None):
    """
    Thin a pandas point series based on distance of the points. The first point in the DataFrame is taken and all points 
    within the search distance are found and the given method is applied to the data columns. Used data are deleted from 
    the frame and the procedure is repeated. Data are written to the point which is closest to the half of the search
    distance away from the initial point.
    
    Parameters
    ----------
    pseries: Series of points
    xcol: name of column with x coordinates
    ycol: name of columns with y coordinates
    datacol: name of columns with data (list/string)
    radius: search radius for point distance (meters)
    method: calculation method (a numpy method). Default: 'nanmean'
    
    Returns
    -------
    A thinned dataframe
    
    """
    
    assert units is not None, 'Units of x/y must be meters (m) or degrees (deg).'
    
    thinned = pd.DataFrame([],columns=df.columns.values)
    
    print(len(df))
    ct=0
    while len(df)>0:
        df['DIST'] = p_dist(df[xcol].iloc[0], df[ycol].iloc[0], df[xcol], df[ycol], units=units)
        df = df.sort_values('DIST')

        subset = df[df.DIST <= radius]

        if not subset.empty:
            subset = subset.reset_index()
            # find the point ID which is closest to half of the search distance 
            #(to this point the new data will be assigned)
            assign_id = (subset.DIST - (radius/2.)).abs().idxmin()
            
            fill_dict = {}
            fill_dict[xcol] = subset.loc[assign_id, xcol]
            fill_dict[ycol] = subset.loc[assign_id, ycol]
            for data in datacols:
                try:
                    fill_dict[data] = getattr(np,method)(subset[data].values)
                except TypeError:
                    fill_dict[data] = np.nan
            thinned.loc[len(thinned)] = pd.Series(fill_dict)
            ct+=1

            if ct%100==0:
                print(ct,len(df))
                
        # delete the used points from the data
        df = df.drop(df[df.DIST <= radius].index.values)
        
    print(len(thinned))
    return thinned
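
As a quick, hypothetical sanity check of the distance helpers (made-up coordinates, not survey data): 0.001 degrees of latitude should come out at roughly 111 m on a sphere of radius 6371000 m, and the planar variant should reproduce a 3-4-5 triangle.

In [ ]:
# hypothetical check values, not survey data
print(p_dist(0.0, 0.0, 0.0, 0.001, units='deg'))  # ~111.2 m via haversine (decimal degrees)
print(p_dist(0.0, 0.0, 3.0, 4.0, units='m'))      # 5.0 m via Pythagoras (projected meters)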

In [16]:
files = glob('C:\\Users\\jlandman\\Desktop\\newData\\SJ_AQ_Navarro\\final\\*_Johannes.xlsx')
print(files)


['C:\\Users\\jlandman\\Desktop\\newData\\SJ_AQ_Navarro\\final\\GlaThiDa_Navarro1_Johannes.xlsx', 'C:\\Users\\jlandman\\Desktop\\newData\\SJ_AQ_Navarro\\final\\GlaThiDa_Navarro2_Johannes.xlsx']

In [17]:
"""
thin_dict = {"ELFENBEINBREEN": 14,
             "SVEIGBREEN": 4,
}
"""

search_dist = 10
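
Before running on the survey files, the thinning can be exercised on a tiny made-up point set (hypothetical coordinates: two pairs of nearby points, the pairs roughly 110 m apart in latitude). With the 10 m search radius each pair should collapse into a single averaged row.

In [ ]:
# hypothetical mini-example, not survey data
demo = pd.DataFrame({'POINT_LON': [0.0, 0.00001, 0.0, 0.00001],
                     'POINT_LAT': [0.0, 0.00001, 0.001, 0.00101],
                     'THICKNESS': [100., 110., 200., 220.]})
demo_thinned = p_thin(demo.copy(), xcol='POINT_LON', ycol='POINT_LAT',
                      datacols=['THICKNESS'], radius=search_dist,
                      method='nanmean', units='deg')
print(demo_thinned)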

In [18]:
data_new = pd.DataFrame([])

for file in files:
    data = pd.read_excel(file, sheet_name='TTT - GL. THICKNESS POINT DATA')
    for gid in np.unique(data.GlaThiDa_ID.values):
        temp = data[data.GlaThiDa_ID==gid].copy()  # copy so p_thin can add a DIST column without touching 'data'

        gname = temp.GLACIER_NAME.values[0]
        pol_u = temp.POLITICAL_UNIT.values[0]
        surv_date = temp.SURVEY_DATE.values[0]
        remarks = temp.REMARKS.iloc[0]
        if not pd.isnull(remarks):
            remarks = remarks + ' Point data have been thinned (mean) within a search distance of %s m.' % search_dist
        else:
            remarks = 'Point data have been thinned (mean) within a search distance of %s m.' % search_dist
        gtd_id = gid
        
        # old point thinning
        # temp = temp.groupby(temp.index // thin_dict[gname]).mean() # average over every x values as specified above
        temp = p_thin(temp, xcol='POINT_LON', ycol='POINT_LAT', datacols=['ELEVATION', 'THICKNESS', 'THICKNESS_UNCERTAINTY'], 
                      radius=search_dist, method='nanmean', units='deg')

        temp['POLITICAL_UNIT'] = pol_u
        temp['GLACIER_NAME'] = gname
        temp['REMARKS'] = remarks
        temp['DATA_FLAG'] = np.nan
        temp['GlaThiDa_ID'] = gtd_id
        temp['SURVEY_DATE'] = surv_date
        temp['POINT_ID'] = range(1,len(temp)+1)

        data_new = pd.concat([data_new, temp], ignore_index=True)

data_new['THICKNESS_UNCERTAINTY'] = data_new['THICKNESS_UNCERTAINTY'].round()
data_new['THICKNESS'] = data_new['THICKNESS'].round()
data_new['ELEVATION'] = data_new['ELEVATION'].round()

data_new = data_new[['GlaThiDa_ID','POLITICAL_UNIT', 'GLACIER_NAME', 'SURVEY_DATE', 'POINT_ID',  'POINT_LAT',  'POINT_LON', 
                     'ELEVATION', 'THICKNESS', 'THICKNESS_UNCERTAINTY', 'DATA_FLAG', 'REMARKS']]

data_new = data_new.sort_values(by=['GlaThiDa_ID','POINT_ID'])
data_new.to_excel(os.path.splitext(file)[0] + '_thinned.xlsx', index=False)


14624
100 14377
200 14156
300 13952
400 13733
500 13512
600 13302
700 13098
800 12883
900 12665
1000 12463
1100 12233
1200 12031
1300 11827
1400 11619
1500 11396
1600 11170
1700 10909
1800 10684
1900 10482
2000 10269
2100 10066
2200 9855
2300 9635
2400 9433
2500 9230
2600 9028
2700 8828
2800 8530
2900 8257
3000 8043
3100 7838
3200 7638
3300 7429
3400 7229
3500 7014
3600 6804
3700 6603
3800 6357
3900 6152
4000 5949
4100 5747
4200 5544
4300 5347
4400 5150
4500 4948
4600 4748
4700 4545
4800 4341
4900 4127
5000 3924
5100 3718
5200 3514
5300 3311
5400 3112
5500 2890
5600 2687
5700 2465
5800 2247
5900 2038
6000 1821
6100 1612
6200 1404
6300 1200
6400 995
6500 787
6600 575
6700 287
6800 71
6835
4241
100 4064
200 3893
300 3711
400 3531
500 3335
600 3174
700 2984
800 2824
900 2652
1000 2488
1100 2330
1200 2208
1300 2032
1400 1768
1500 1453
1600 1241
1700 1036
1800 845
1900 665
2000 475
2100 277
2200 81
2246

In [ ]: