In [11]:
import pandas as pd
from glob import glob
import os
import numpy as np

In [12]:
def dist_pyth(lon1, lat1, lon2, lat2):
    """Calculate the distance between two points """
    return np.sqrt((lat2 - lat1)**2 + (lon2 - lon1)**2)


def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between one point 
    on the earth and an array of points (specified in decimal degrees)
    """
    
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in meters
    return c * r


def p_dist(lon1, lat1, lon2, lat2, units=None):
    """
    Calculate the distance between two *CLOSE* points in meters using Pythagoras
    """
    if units == 'm':
        dist = dist_pyth(lon1, lat1, lon2, lat2)
    elif units == 'deg':
        dist = haversine(lon1, lat1, lon2, lat2)
    else:
        raise ValueError('Units must be meters (m) or degrees (deg).')
        
    return dist


def p_thin(df, xcol='x', ycol='y', datacols='data', radius=10, method='nanmean', units=None):
    """
    Thin a pandas point series based on distance of the points. The first point in the DataFrame is taken and all points 
    within the search distance are found and the given method is applied to the data columns. Used data are deleted from 
    the frame and the procedure is repeated. Data are written to the point which is closest to the half of the search
    distance away from the initial point.
    
    Parameters
    ----------
    pseries: Series of points
    xcol: name of column with x coordinates
    ycol: name of columns with y coordinates
    datacol: name of columns with data (list/string)
    radius: search radius for point distance (meters)
    method: calculation method (a numpy method). Default: 'nanmean'
    
    Returns
    -------
    A thinned dataframe
    
    """
    
    assert units is not None, 'Units of x/y must be meters (m) or degrees (deg).'
    
    # work on a copy so the (possibly sliced) input frame is not modified in place
    df = df.copy()
    
    thinned = pd.DataFrame([], columns=df.columns.values)
    
    print(len(df))
    ct=0
    while len(df)>0:
        df['DIST'] = p_dist(df[xcol].iloc[0], df[ycol].iloc[0], df[xcol], df[ycol], units=units)
        df = df.sort_values('DIST')

        subset = df[df.DIST <= radius]

        if not subset.empty:
            subset = subset.reset_index()
            # find the point whose distance is closest to half of the search radius
            # (the aggregated data will be assigned to this point)
            assign_id = (subset.DIST - (radius / 2.)).abs().idxmin()
            
            fill_dict = {}
            fill_dict[xcol] = subset.loc[assign_id, xcol]
            fill_dict[ycol] = subset.loc[assign_id, ycol]
            for data in datacols:
                try:
                    fill_dict[data] = getattr(np,method)(subset[data].values)
                except TypeError:
                    fill_dict[data] = np.nan
            thinned.loc[len(thinned)] = pd.Series(fill_dict)
            ct+=1

            if ct%100==0:
                print(ct,len(df))
                
        # delete the used points from the data
        df = df.drop(df[df.DIST <= radius].index.values)
        
    print(len(thinned))
    return thinned
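
A minimal, illustrative sketch of how these helpers behave (the coordinates and thickness values below are made up and not part of the GlaThiDa data): one degree of longitude along the equator is roughly 111 km in 'deg' mode, 'm' mode is plain Pythagoras on projected coordinates, and p_thin collapses each cluster of points within the search radius into a single averaged point.

In [ ]:
# distance sanity checks (illustrative coordinates)
print(p_dist(0.0, 0.0, 1.0, 0.0, units='deg'))    # ~111195 m: one degree of longitude at the equator
print(p_dist(0.0, 0.0, 300.0, 400.0, units='m'))  # 500.0 m: Pythagoras on projected coordinates

# thin a tiny synthetic point set: two lon/lat clusters, the first one only a few meters across
demo = pd.DataFrame({'POINT_LON': [10.00000, 10.00002, 10.01000],
                     'POINT_LAT': [45.00000, 45.00001, 45.01000],
                     'THICKNESS': [100., 110., 200.]})
demo_thinned = p_thin(demo, xcol='POINT_LON', ycol='POINT_LAT', datacols=['THICKNESS'],
                      radius=10, method='nanmean', units='deg')
print(demo_thinned)  # two rows, one per cluster, with THICKNESS averaged to 105 and 200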

In [13]:
file = 'C:\\Users\\jlandman\\Desktop\\newData\\RU_Kutuzov\\GlaThiDa_2-0_Kutuzov_Johannes.xls'
print(file)


C:\Users\jlandman\Desktop\newData\RU_Kutuzov\GlaThiDa_2-0_Kutuzov_Johannes.xls

In [14]:
"""
thin_dict = {(2072, 'DJANKUAT'): 2,
             (2073, 'DJANKUAT'): 2,
             (2074, 'MARUKH'): 2,
}
"""

search_dist = 10
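
For scale, a quick back-of-the-envelope check (illustrative only): a 10 m search radius corresponds to roughly 9e-5 degrees of latitude.

In [ ]:
# rough scale of the search radius in degrees of latitude (illustrative)
deg_per_m = 1.0 / (6371000 * np.pi / 180.)  # ~9.0e-06 degrees per meter along a meridian
print(search_dist * deg_per_m)              # ~9.0e-05 degrees for a 10 m radius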

In [15]:
data_new = pd.DataFrame([])

data = pd.read_excel(file, sheet_name='TTT - GL. THICKNESS POINT DATA')


for gid in np.unique(data.GlaThiDa_ID.values):
    temp = data[data.GlaThiDa_ID==gid]

    gname = temp.GLACIER_NAME.values[0]
    pol_u = temp.POLITICAL_UNIT.values[0]
    surv_date = temp.SURVEY_DATE.values[0]
    remarks = temp.REMARKS.iloc[0] + ' Point data have been thinned (mean) within a search distance of %s m.' % search_dist
    gtd_id = gid
    
    #old point thinning
    # temp = temp.groupby(temp.index // thin_dict[(gid, gname)]).mean() # average over every x values as specified above
    temp = p_thin(temp, xcol='POINT_LON', ycol='POINT_LAT', datacols=['ELEVATION', 'THICKNESS', 'THICKNESS_UNCERTAINTY'],
                  radius=search_dist, method='nanmean', units='deg')

    temp['POLITICAL_UNIT'] = pol_u
    temp['GLACIER_NAME'] = gname
    temp['REMARKS'] = remarks
    temp['DATA_FLAG'] = np.nan
    temp['GlaThiDa_ID'] = gtd_id
    temp['SURVEY_DATE'] = surv_date
    temp['POINT_ID'] = range(1,len(temp)+1)


    data_new = pd.concat([data_new, temp], ignore_index=True)

    
data_new['THICKNESS'] = data_new['THICKNESS'].round()
data_new['ELEVATION'] = data_new['ELEVATION'].round()
data_new['THICKNESS_UNCERTAINTY'] = data_new['THICKNESS_UNCERTAINTY'].round(2)

data_new = data_new[['GlaThiDa_ID','POLITICAL_UNIT', 'GLACIER_NAME', 'SURVEY_DATE', 'POINT_ID',  'POINT_LAT',  'POINT_LON',  
                     'ELEVATION', 'THICKNESS', 'THICKNESS_UNCERTAINTY', 'DATA_FLAG', 'REMARKS']]

data_new = data_new.sort_values(by=['GlaThiDa_ID','POINT_ID'])
data_new.to_excel(os.path.splitext(file)[0] + '_thinned.xlsx', index=False)


2035
100 1708
200 1332
300 1011
400 740
500 428
600 71
624
1983
100 1643
200 1366
300 1094
400 783
500 506
600 217
666
2458
100 2160
200 1887
300 1656
400 1411
500 1223
600 1055
700 872
800 565
900 255
988
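
As a hypothetical follow-up check (using the output path constructed above), the thinned file could be read back and the number of points per glacier compared against the counts printed during the run (624, 666 and 988).

In [ ]:
# hypothetical check: reload the thinned file and count points per glacier
check = pd.read_excel(os.path.splitext(file)[0] + '_thinned.xlsx')
print(check.groupby('GlaThiDa_ID')['POINT_ID'].count())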
