In [1]:
import pandas as pd
from glob import glob
from simpledbf import Dbf5
import os
import pyproj
import numpy as np
import warnings

In [2]:
def dist_pyth(lon1, lat1, lon2, lat2):
    """Calculate the distance between two points """
    return np.sqrt((lat2 - lat1)**2 + (lon2 - lon1)**2)


def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between one point 
    on the earth and an array of points (specified in decimal degrees)
    """
    
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula 
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a)) 
    r = 6371000 # Radius of earth in meters
    return c * r


def p_dist(lon1, lat1, lon2, lat2, units=None):
    """
    Calculate the distance in meters between two *CLOSE* points: plain Pythagoras if the coordinates
    are given in meters ('m'), the haversine formula if they are given in decimal degrees ('deg').
    """
    if units == 'm':
        dist = dist_pyth(lon1, lat1, lon2, lat2)
    elif units == 'deg':
        dist = haversine(lon1, lat1, lon2, lat2)
    else:
        raise ValueError('Units must be meters (m) or degrees (deg).')
        
    return dist


def p_thin(df, xcol='x', ycol='y', datacols='data', radius=10, method='nanmean', units=None):
    """
    Thin a pandas DataFrame of points based on their mutual distance. The first point in the DataFrame is taken, all
    points within the search radius are found, and the given method is applied to the data columns. The used points
    are then deleted from the frame and the procedure is repeated. The aggregated data are assigned to the point whose
    distance from the initial point is closest to half the search radius.
    
    Parameters
    ----------
    df: DataFrame of points
    xcol: name of the column with x coordinates
    ycol: name of the column with y coordinates
    datacols: names of the columns with data (list of strings)
    radius: search radius for the point distance (meters)
    method: aggregation method (a numpy function name). Default: 'nanmean'
    units: units of the x/y coordinates ('m' for meters, 'deg' for decimal degrees)
    
    Returns
    -------
    A thinned DataFrame
    
    """
    
    assert units is not None, 'Units of x/y must be meters (m) or degrees (deg).'
    
    thinned = pd.DataFrame([],columns=df.columns.values)
    
    print(len(df))
    ct=0
    while len(df)>0:
        df['DIST'] = p_dist(df[xcol].iloc[0], df[ycol].iloc[0], df[xcol], df[ycol], units=units)
        df = df.sort_values('DIST')

        subset = df[df.DIST <= radius]

        if not subset.empty:
            subset = subset.reset_index()
            # find the point whose distance is closest to half of the search radius
            # (the aggregated data will be assigned to this point)
            assign_id = (subset.DIST - (radius / 2.)).abs().idxmin()
            
            fill_dict = {}
            fill_dict[xcol] = subset.loc[assign_id, xcol]
            fill_dict[ycol] = subset.loc[assign_id, ycol]
            for data in datacols:
                try:
                    fill_dict[data] = getattr(np,method)(subset[data].values)
                except TypeError:
                    fill_dict[data] = np.nan
            thinned.loc[len(thinned)] = pd.Series(fill_dict)
            ct+=1

            if ct%100==0:
                print(ct,len(df))
                
        # delete the used points from the data
        df = df.drop(df[df.DIST <= radius].index.values)
        
    print(len(thinned))
    return thinned

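For reference, a minimal usage sketch of p_dist and p_thin on a tiny synthetic DataFrame; the coordinates and thickness values are made up for illustration and rely on the imports from the first cell:

# three hypothetical points in decimal degrees; the first two lie only a few meters apart
demo = pd.DataFrame({'POINT_LON': [10.50000, 10.50005, 10.50100],
                     'POINT_LAT': [46.50000, 46.50003, 46.50100],
                     'THICKNESS': [12.0, 14.0, 30.0]})

# haversine distances (in meters) from the first point to all points
print(p_dist(demo.POINT_LON.iloc[0], demo.POINT_LAT.iloc[0],
             demo.POINT_LON, demo.POINT_LAT, units='deg'))

# thinning with a 10 m search radius should merge the first two points
# (nanmean of their THICKNESS) and keep the third one separate
thinned_demo = p_thin(demo.copy(), xcol='POINT_LON', ycol='POINT_LAT',
                      datacols=['THICKNESS'], radius=10, method='nanmean', units='deg')
print(thinned_demo)
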
In [3]:
files = glob('C:\\Users\\jlandman\\Desktop\\newData\\AT-IT-KayHelfricht\\Fischer-etal-2015\\*ferner*\\*.dbf')

In [4]:
pop_list = ['Gliederferner', 'Langenferner', 'Rieserferner', 'Uebeltalferner', 'Weissbrunnferner', 'Weissbrunnferner1995']

In [5]:
# drop files whose path matches an entry in pop_list
files = [f for f in files if not any(name in f for name in pop_list)]
files

In [6]:
# number of consecutive rows averaged together in the former groupby-based thinning (now commented out in the loop below)
thin_dict = {'Feuersteinferner': 25,
             'Grafferner': 15,
             'Hangendenferner': 25,
             'ed_Hohenferner': 25,
             'Laaserferner': 65,
             'Langtaufererferner': 20,
             'Matscherferner': 20,
             'obererOrtlerferner': 2,
             'Schranferner': 10,
             'Suldenferner': 20}

# survey dates (YYYYMMDD) per glacier
date_dict = {'Feuersteinferner': 20130608,
             'Grafferner': 20130612,
             'Hangendenferner': 20130608,
             'ed_Hohenferner': 20130514,
             'Laaserferner': 20130513,
             'Langtaufererferner': 20130613,
             'Matscherferner': 20130612,
             'obererOrtlerferner': np.nan,
             'Schranferner': 20130514,
             'Suldenferner': 20130611}

In [7]:
search_dist = 10  # search radius for the point thinning (meters)

# Some constants
gtd_id = np.nan # GlaThiDa_ID
punit = 'IT' # political unit
gname = ''
pid = np.nan
thickness_unc = np.nan
data_flag = np.nan
remarks = ''

In [8]:
utm32 = pyproj.Proj(init='epsg:32632')  # WGS84 / UTM zone 32N
latlon = pyproj.Proj(init='epsg:4326')  # WGS84 lat/lon

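As a quick sanity check of the projection setup, a single made-up UTM32N coordinate pair (the easting/northing below is arbitrary, not taken from the data) can be converted with the same legacy pyproj.transform call used in the loop below:

e, n = 620000.0, 5170000.0  # hypothetical easting/northing in EPSG:32632
lon, lat = pyproj.transform(utm32, latlon, e, n)
print(lon, lat)  # expected to land roughly at 10-11 deg E, 46-47 deg N, i.e. in the South Tyrol area
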
In [10]:
for file in files:
    fname = os.path.basename(file)
    dbf = Dbf5(file)
    data = dbf.to_dataframe()
    data.columns = [c.lower() for c in data.columns]  # make all column names lowercase, that's easier
    #former data thinning
    #data = data.groupby(data.index // thin_dict[fname.split('.')[0]]).mean() # average over every x values as specified above
    
    try:
        # project UTM32N easting/northing to WGS84 lat/lon
        xs = data.x.values
        ys = data.y.values
        
        lons, lats = pyproj.transform(utm32, latlon, xs, ys)
        
        data['POINT_LAT'] = lats
        data['POINT_LON'] = lons
        
        try:
            # presumably surface elevation = ice thickness ('ed') + bed elevation ('ug')
            data['ELEVATION'] = (data['ed'] + data['ug']).round()
        except KeyError:
            data['ELEVATION'] = np.nan
            
        data['THICKNESS'] = data['ed'].round()
        data['THICKNESS_UNCERTAINTY'] = thickness_unc
        
        # time to thin the data points
        print(fname.split('.')[0].upper())
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            data = p_thin(data, xcol='POINT_LON', ycol='POINT_LAT',
                          datacols=['ELEVATION', 'THICKNESS', 'THICKNESS_UNCERTAINTY'],
                          radius=search_dist, method='nanmean', units='deg')
        
        data['ELEVATION'] = data['ELEVATION'].round()
        data['THICKNESS'] = data['THICKNESS'].round()
        
        data['GlaThiDa_ID'] = gtd_id
        data['POLITICAL_UNIT'] = punit
        data['GLACIER_NAME'] = fname.split('.')[0].upper()
        data['SURVEY_DATE'] = date_dict[fname.split('.')[0]]
        data['POINT_ID'] = range(1,len(data)+1)
        data['DATA_FLAG'] = data_flag
        data['REMARKS'] = remarks + ' Point data have been thinned (mean) within a search distance of %s m.' % search_dist
        
        data = data[['GlaThiDa_ID','POLITICAL_UNIT','GLACIER_NAME','SURVEY_DATE','POINT_ID','POINT_LAT','POINT_LON',
                     'ELEVATION','THICKNESS','THICKNESS_UNCERTAINTY','DATA_FLAG','REMARKS']]
        data.to_excel(os.path.join(os.path.dirname(file),fname.split('.')[0]+'.xlsx'), index=False)
    except AttributeError:
        print('Table %s has no coordinates' % fname)


FEUERSTEINFERNER
25287
100 21411
200 15322
300 11735
400 7667
500 2603
590
GRAFFERNER
13608
100 10544
200 7860
300 5063
400 3526
491
HANGENDENFERNER
23549
100 17558
200 13315
300 8714
400 5532
494
ED_HOHENFERNER
27486
100 23511
200 19624
300 17340
400 12510
500 6882
600 1624
631
LAASERFERNER
65895
100 54706
200 49549
300 43466
400 37018
500 32127
600 28036
700 18232
800 9728
891
LANGTAUFERERFERNER
20909
100 18738
200 17155
300 15386
400 13212
500 9932
600 4286
700 2280
800 425
830
MATSCHERFERNER
20392
100 16464
200 12221
300 8561
400 6440
500 4505
600 1034
641
Table obererOrtlerferner.dbf has no coordinates
SCHRANFERNER
9275
100 6273
200 3763
300 1040
315
SULDENFERNER
19358
100 13000
200 7718
300 5020
400 1030
431
Table Weissbrunn1995.dbf has no coordinates

In [ ]: