In [1]:
import pandas as pd
from glob import glob
from simpledbf import Dbf5
import os
import pyproj
import numpy as np
import warnings
In [2]:
def dist_pyth(lon1, lat1, lon2, lat2):
    """Calculate the planar (Pythagorean) distance between two points."""
    return np.sqrt((lat2 - lat1)**2 + (lon2 - lon1)**2)

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great-circle distance between one point on the earth
    and an array of points (all specified in decimal degrees).
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    r = 6371000  # mean radius of the earth in meters
    return c * r

def p_dist(lon1, lat1, lon2, lat2, units=None):
    """
    Calculate the distance between points in meters: Pythagoras for projected
    coordinates given in meters, haversine for coordinates given in degrees.
    """
    if units == 'm':
        dist = dist_pyth(lon1, lat1, lon2, lat2)
    elif units == 'deg':
        dist = haversine(lon1, lat1, lon2, lat2)
    else:
        raise ValueError('Units must be meters (m) or degrees (deg).')
    return dist
def p_thin(df, xcol='x', ycol='y', datacols='data', radius=10, method='nanmean', units=None):
    """
    Thin a pandas point DataFrame based on the distance between points. The first point in the
    DataFrame is taken, all points within the search radius are found, and the given method is
    applied to the data columns. The used points are then deleted from the frame and the procedure
    is repeated. The aggregated data are written to the point whose distance from the initial point
    is closest to half of the search radius. (See the example cell below for a quick check on
    synthetic data.)

    Parameters
    ----------
    df: DataFrame of points
    xcol: name of the column with x coordinates
    ycol: name of the column with y coordinates
    datacols: names of the columns with data (list of strings)
    radius: search radius for the point distance (meters)
    method: aggregation method (the name of a numpy function). Default: 'nanmean'
    units: units of the x/y coordinates, either 'm' (meters) or 'deg' (degrees)

    Returns
    -------
    A thinned DataFrame.
    """
    assert units is not None, 'Units of x/y must be meters (m) or degrees (deg).'

    thinned = pd.DataFrame([], columns=df.columns.values)
    print(len(df))
    ct = 0
    while len(df) > 0:
        df['DIST'] = p_dist(df[xcol].iloc[0], df[ycol].iloc[0], df[xcol], df[ycol], units=units)
        df = df.sort_values('DIST')
        subset = df[df.DIST <= radius]

        if not subset.empty:
            subset = subset.reset_index()
            # find the point whose distance is closest to half of the search radius
            # (the aggregated data will be assigned to this point)
            assign_id = (subset.DIST - (radius / 2.)).abs().idxmin()

            fill_dict = {}
            fill_dict[xcol] = subset.loc[assign_id, xcol]
            fill_dict[ycol] = subset.loc[assign_id, ycol]
            for data in datacols:
                try:
                    fill_dict[data] = getattr(np, method)(subset[data].values)
                except TypeError:
                    fill_dict[data] = np.nan
            thinned.loc[len(thinned)] = pd.Series(fill_dict)

        ct += 1
        if ct % 100 == 0:
            print(ct, len(df))

        # delete the used points from the data
        df = df.drop(df[df.DIST <= radius].index.values)

    print(len(thinned))
    return thinned
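As a quick sanity check of p_dist and p_thin, here is a minimal example on made-up coordinates (two points a few meters apart and one far away; the values are illustrative only and not part of the original data):
In [ ]:
# hedged example: synthetic points for a quick check of the helpers above
demo = pd.DataFrame({'x': [10.0, 10.00005, 10.2],
                     'y': [46.0, 46.00005, 46.2],
                     'data': [100., 200., 300.]})

# haversine distance between the first two points (a few meters)
print(p_dist(demo.x[0], demo.y[0], demo.x[1], demo.y[1], units='deg'))

# the first two points fall within the 10 m radius and are averaged,
# the third one is far away and kept as-is -> two rows remain
print(p_thin(demo, xcol='x', ycol='y', datacols=['data'], radius=10, units='deg'))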
In [3]:
files = glob('C:\\Users\\jlandman\\Desktop\\newData\\AT-IT-KayHelfricht\\Fischer-etal-2015\\*ferner*\\*.dbf')
In [4]:
pop_list = ['Gliederferner', 'Langenferner', 'Rieserferner', 'Uebeltalferner', 'Weissbrunnferner', 'Weissbrunnferner1995']
In [5]:
# filter out the glaciers in pop_list; a list comprehension is used because
# removing items from a list while iterating over it skips elements
files = [f for f in files if not any(name in f for name in pop_list)]
files
In [6]:
# per-glacier thinning factors (only used by the commented-out groupby thinning below)
thin_dict = {'Feuersteinferner': 25,
             'Grafferner': 15,
             'Hangendenferner': 25,
             'ed_Hohenferner': 25,
             'Laaserferner': 65,
             'Langtaufererferner': 20,
             'Matscherferner': 20,
             'obererOrtlerferner': 2,
             'Schranferner': 10,
             'Suldenferner': 20}

# survey dates (YYYYMMDD)
date_dict = {'Feuersteinferner': 20130608,
             'Grafferner': 20130612,
             'Hangendenferner': 20130608,
             'ed_Hohenferner': 20130514,
             'Laaserferner': 20130513,
             'Langtaufererferner': 20130613,
             'Matscherferner': 20130612,
             'obererOrtlerferner': np.nan,
             'Schranferner': 20130514,
             'Suldenferner': 20130611}
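Both dictionaries are keyed by the DBF base name, which the processing loop below derives via fname.split('.')[0]. A minimal lookup sketch ('Schranferner' is one of the real keys):
In [ ]:
# hedged sketch: how the dictionary lookups are keyed
fname = 'Schranferner.dbf'             # example base name as returned by os.path.basename
key = fname.split('.')[0]              # -> 'Schranferner'
print(date_dict[key], thin_dict[key])  # -> 20130514 10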
In [7]:
search_dist = 10  # forward search distance for thinning (m)

# Some constants
gtd_id = np.nan         # GlaThiDa_ID
punit = 'IT'            # political unit
gname = ''              # glacier name (unused; names are derived from the file names below)
pid = np.nan            # point ID (unused; IDs are assigned per glacier below)
thickness_unc = np.nan  # thickness uncertainty
data_flag = np.nan      # data flag
remarks = ''            # prefix for the REMARKS column
In [8]:
# WGS84 UTM zone 32N -> WGS84 lat/lon
# (Proj(init='epsg:...') and pyproj.transform are deprecated, so a Transformer is used)
utm32_to_latlon = pyproj.Transformer.from_crs('EPSG:32632', 'EPSG:4326', always_xy=True)
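A quick check of the transformer with a single made-up UTM32N coordinate (the easting/northing pair is illustrative only):
In [ ]:
# hedged example: one made-up UTM32N point (easting, northing) -> lon/lat
lon, lat = utm32_to_latlon.transform(620000, 5150000)
print(lon, lat)  # roughly 10.6 E, 46.4 N, i.e. somewhere in the Italian Alps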
In [10]:
for file in files:
    fname = os.path.basename(file)
    dbf = Dbf5(file)
    data = dbf.to_dataframe()
    data.columns = [c.lower() for c in data.columns]  # make all columns lowercase, that's easier

    # former data thinning
    # data = data.groupby(data.index // thin_dict[fname.split('.')[0]]).mean()  # average over every x values as specified above

    try:
        # transform UTM values to lat/lon
        xs = data.x.values
        ys = data.y.values
        lons, lats = utm32_to_latlon.transform(xs, ys)
        data['POINT_LAT'] = lats
        data['POINT_LON'] = lons

        try:
            data['ELEVATION'] = (data['ed'] + data['ug']).round()
        except KeyError:
            data['ELEVATION'] = np.nan
        data['THICKNESS'] = data['ed'].round()
        data['THICKNESS_UNCERTAINTY'] = thickness_unc

        # time to thin the data points
        print(fname.split('.')[0].upper())
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            data = p_thin(data, xcol='POINT_LON', ycol='POINT_LAT',
                          datacols=['ELEVATION', 'THICKNESS', 'THICKNESS_UNCERTAINTY'],
                          radius=search_dist, method='nanmean', units='deg')

        data['ELEVATION'] = data['ELEVATION'].round()
        data['THICKNESS'] = data['THICKNESS'].round()
        data['GlaThiDa_ID'] = gtd_id
        data['POLITICAL_UNIT'] = punit
        data['GLACIER_NAME'] = fname.split('.')[0].upper()
        data['SURVEY_DATE'] = date_dict[fname.split('.')[0]]
        data['POINT_ID'] = range(1, len(data) + 1)
        data['DATA_FLAG'] = data_flag
        data['REMARKS'] = remarks + ' Point data have been thinned (mean) within a search distance of %s m.' % search_dist

        data = data[['GlaThiDa_ID', 'POLITICAL_UNIT', 'GLACIER_NAME', 'SURVEY_DATE', 'POINT_ID', 'POINT_LAT',
                     'POINT_LON', 'ELEVATION', 'THICKNESS', 'THICKNESS_UNCERTAINTY', 'DATA_FLAG', 'REMARKS']]
        data.to_excel(os.path.join(os.path.dirname(file), fname.split('.')[0] + '.xlsx'), index=False)
    except AttributeError:
        print('Table %s has no coordinates' % fname)
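A possible follow-up check, reading back one of the written files to verify the column layout (assumes the loop above ran through; any of the generated .xlsx files would do):
In [ ]:
# hedged sketch: read back the output for the first remaining file
out_path = os.path.join(os.path.dirname(files[0]),
                        os.path.basename(files[0]).split('.')[0] + '.xlsx')
out = pd.read_excel(out_path)
print(out.columns.tolist())
print(len(out), 'thinned points')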
In [ ]: