Imports


In [1]:
# Imports
import math as m
import pandas as pd
import numpy as np
import scipy as sp
import os
my_plotly_api_key = os.environ.get('MY_PLOTLY_API_KEY') # retrive api_key from operating system
import plotly
plotly.tools.set_credentials_file(username='agu3rra', api_key=my_plotly_api_key) # setting up credentials; Plotly is an online service.
import plotly.plotly as py # import graphics library
import plotly.graph_objs as go
from scipy import integrate
import random

# Data loading
dfCoordinates = pd.read_csv('gpsDataUTM.csv') # reads data into dataframe

#---a--#-----#-----#-----#-----#-----#-----#-----#-----#-----#-----#-----#-----#
# Calculate volume of earth from the lowest measured point
# Idea: generate training set to a Neural Network and treat it as a regression problem

# Part 1: Generate training set
# Idea: generate a set of mathematical surface functions z=f(x,y) that go by
# close to the points we're getting from GPS. Use these funcitons to generate a
# set of (x,y,z) points and a double integral to calculate the exact valume of
# the solid delimited by it.

# Better idea: maybe the model can generelize from general space, no need to actually find functions that have values close to our sample set.
# The only remaining challenge is to select only a subset of points that match the input data size (random selection)

Main


In [2]:
# Determine bounds of interest
x_max = dfCoordinates['x_rel'].max()
y_max = dfCoordinates['y_rel'].max()
z_max = dfCoordinates['z_rel'].max()

In [3]:
# Remember: in terrain measurements, all x, y and z values will be positive
# Dev note: increasing number of samples may be required
x = np.linspace(0.0,x_max,num=500) # generate linear space for x values
y = np.linspace(0.0,y_max,num=500)
xGrid, yGrid = np.meshgrid(x,y) # generate mesh grid for plotting sample data

In [4]:
# Generate a equations and corresponding double integrals
# Data of interest: functions with variations bounded by 0:*_max values

# Dev note: How many double integral evaluations are needed for the model to be able to generalize well? Starting at 1000
integrate.dblquad(lambda y, x: 3 + x**2 - 2*y, 0.0, x_max, lambda x: -x, lambda x: x)


Out[4]:
(1175537925.0989282, 1.3051092707649128e-05)

In [5]:
def combination(n,k): # Combination of n samples taken k at a time.
    return m.factorial(n)/(m.factorial(n-k)*m.factorial(k))

In [6]:
combination(156,3) # there are 620620 planes defined by all 156 distinct points in my sample data


Out[6]:
620620.0

In [7]:
def combinations(iterable, r):
    # combinations('ABCD', 2) --> AB AC AD BC BD CD
    # combinations(range(4), 3) --> 012 013 023 123
    pool = tuple(iterable)
    n = len(pool)
    if r > n:
        return
    indices = list(range(r))
    yield tuple(pool[i] for i in indices)
    while True:
        for i in reversed(range(r)):
            if indices[i] != i + n - r:
                break
        else:
            return
        indices[i] += 1
        for j in range(i+1, r):
            indices[j] = indices[j-1] + 1
        yield tuple(pool[i] for i in indices)

In [8]:
c = combinations('ABCD',2)

In [9]:
for i in c:
    print(i)


('A', 'B')
('A', 'C')
('A', 'D')
('B', 'C')
('B', 'D')
('C', 'D')

In [10]:
# return a list with each of the 156 points from pandas dataframe and apply combination
# define all plane equations that can be obtained from combination of each point 3 at a time.
# evaluate bounded numerical integral of each plane
# select a subset of points that match that bounded space on the evaluated integral and match with corresponding volume calculation (double integral result)
# Your training set will be comprised of 620.620 sets of 156 points (156x3 = 468 inputs) and corresponding volume.
# Train your model on a neural network

Sample: Generating plane equation from 3 points


In [11]:
# Sample code: generate the plane equation of any 3 points available at the GPS dataframe

In [12]:
A = np.array([dfCoordinates.iloc[2].values[5],
              dfCoordinates.iloc[2].values[6],
              dfCoordinates.iloc[2].values[7]])

In [13]:
B = np.array([dfCoordinates.iloc[20].values[5],
              dfCoordinates.iloc[20].values[6],
              dfCoordinates.iloc[20].values[7]])

In [14]:
C = np.array([dfCoordinates.iloc[60].values[5],
              dfCoordinates.iloc[60].values[6],
              dfCoordinates.iloc[60].values[7]])

In [15]:
# A, B and C are relative coordinates collected from the GPS survey

In [16]:
# Plane equation: a(x-xo) + b(y-yo) + c(z-zo) = 0
# , where [xo,yo,zo] is a point in the plane (so A, B or C can be used)
# [a,b,c] is a vector perpendicular to the plane (it can be obtained from the cross product of any 2 vectors within the plane)

In [17]:
AB = B-A # Vector AB is within the desired plane

In [18]:
print(A)
print(B)
print(AB)
print(type(AB))


[ 214.019258     98.79277407   14.14      ]
[  0.0209151   14.59788367   9.47      ]
[-213.9983429  -84.1948904   -4.67     ]
<class 'numpy.ndarray'>

In [19]:
BC = C-B # Vector BC is also within the plane

In [20]:
VP = np.cross(AB,BC) # VP is a vector perpendicular to the desired plane

In [21]:
VP # VP contains a,b,c values of the plane equation


Out[21]:
array([  164.82014359,  -494.98349017,  1371.27261237])

In [22]:
# Plotting plane:

# Equation coeficients and point within plane
a = VP[0]
b = VP[1]
c = VP[2]
xo = A[0]
yo = A[1]
zo = A[2]

In [23]:
z = ((-a*(xGrid-xo)-b*(yGrid-yo))/c)+zo # plane equation

In [24]:
z.shape


Out[24]:
(500, 500)

In [25]:
surface = go.Surface(x=x, y=y, z=z)

xPoints = [A[0], B[0], C[0]]
yPoints = [A[1], B[1], C[1]]
zPoints = [A[2], B[2], C[2]]

points = go.Scatter3d(x=xPoints,
                    y=yPoints,
                    z=zPoints,
                    mode='markers',
                    marker=dict(size=5,opacity=0.8))
data = [surface, points]

layoutDictionary = dict(gridcolor='rgb(255, 255, 255)',
                        zerolinecolor='rgb(255, 255, 255)',
                        showbackground=True,
                        backgroundcolor='rgb(230, 230,230)')

layout = go.Layout(
    title='Plane Plot',
    scene=dict(
        xaxis=layoutDictionary,
        yaxis=layoutDictionary,
        zaxis=layoutDictionary
    )
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Plane plot')


Out[25]:

Plotting a cube and testing double integral volume calculation


In [26]:
# Defining a cuboid of 5x4x10 (Volume = 200)
A = np.array([5,0,10])
B = np.array([0,4,10])
C = np.array([5,4,10])
AB = B-A # Vector AB is within the desired plane
BC = C-B # Vector AB is within the desired plane
VP = np.cross(AB,BC) # perpendicular vector
# Equation coeficients and point within plane
a = VP[0]
b = VP[1]
c = VP[2]
xo = A[0]
yo = A[1]
zo = A[2]
z = ((-a*(xGrid-xo)-b*(yGrid-yo))/c)+zo # plane equation
surface = go.Surface(x=x, y=y, z=z)

xPoints = [A[0], B[0], C[0]]
yPoints = [A[1], B[1], C[1]]
zPoints = [A[2], B[2], C[2]]

points = go.Scatter3d(x=xPoints,
                    y=yPoints,
                    z=zPoints,
                    mode='markers',
                    marker=dict(size=5,opacity=0.8))
data = [surface, points]

layoutDictionary = dict(gridcolor='rgb(255, 255, 255)',
                        zerolinecolor='rgb(255, 255, 255)',
                        showbackground=True,
                        backgroundcolor='rgb(230, 230,230)')

layout = go.Layout(
    title='Cubic Plot',
    scene=dict(
        xaxis=layoutDictionary,
        yaxis=layoutDictionary,
        zaxis=layoutDictionary
    )
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='Cubic plot')


Out[26]:

In [27]:
# Calculating corresponding volume with double integral.
# Expected value: 200
integrate.dblquad(lambda y, x: ((-a*(x-xo)-b*(y-yo))/c)+zo, 0.0, 5.0, lambda x: 0.0, lambda x: 4.0)


Out[27]:
(200.0, 2.220446049250313e-12)

In [28]:
# Success! The above code shows us how to define a plane and calculate the corresponding volume of a bounded shape.

Main: resuming code


In [29]:
# return a list with each of the 155 points from pandas dataframe and apply combination
# define all plane equations that can be obtained from combination of each point 3 at a time.
dfSurvey = dfCoordinates[['x_rel',
                          'y_rel',
                          'z_rel']]

In [30]:
dfSurvey.head()


Out[30]:
x_rel y_rel z_rel
0 213.860510 127.527218 14.12
1 213.700623 127.211200 14.12
2 214.019258 98.792774 14.14
3 214.261816 70.346291 11.64
4 217.136937 44.608228 9.87

In [31]:
pointsCount = int(dfSurvey.count()[0]) # number of points available

In [32]:
combination(155,3) # number of possible combinations


Out[32]:
608685.0

In [60]:
planeCombinations = combinations(range(pointsCount),3) # indexes all possible combinations of available points
# DEBUG planeCombinations = combinations(range(4),3) # reducing set for debugging

mlDataset = pd.DataFrame([]) # initialize new dataframe for the machine learning dataset
# Determine plane equation for each set of 3 points and calculate corresponding volume
for pointSet in planeCombinations:
    point1index = pointSet[0] # index of the first point
    point2index = pointSet[1]
    point3index = pointSet[2]
    
    # fetching actual points according to index
    point1 = np.array([dfSurvey.iloc[point1index].values[0],
                       dfSurvey.iloc[point1index].values[1],
                       dfSurvey.iloc[point1index].values[2]])
    point2 = np.array([dfSurvey.iloc[point2index].values[0],
                       dfSurvey.iloc[point2index].values[1],
                       dfSurvey.iloc[point2index].values[2]])    
    point3 = np.array([dfSurvey.iloc[point3index].values[0],
                       dfSurvey.iloc[point3index].values[1],
                       dfSurvey.iloc[point3index].values[2]])
    
    planeVectorOne = point2 - point1 # a vector within the plane
    planeVectorTwo = point3 - point2 # a vector within the plane
    perpendicularVector = np.cross(planeVectorOne, planeVectorTwo) # plane equation coeficients obtained <a,b,c>

    # saving constants to literals
    a = perpendicularVector[0]
    b = perpendicularVector[1]
    c = perpendicularVector[2]
    xo = point1[0]
    yo = point1[1]
    zo = point1[2]
    
    # Plane equation a(x-xo)+b(y-yo)+c(z-zo)=0 with isolated z:
    z = ((-a*(xGrid-xo)-b*(yGrid-yo))/c)+zo
    
    pointsInGrid = z.shape[0]
    evalutePoints = pointsInGrid - pointsCount # evaluate number of available points within your grid - samples from GPS
    if evalutePoints>0: # normal scenario
        randomIndexesX = random.sample(range(pointsInGrid), pointsCount) # generate random list of x points, where x is the number of points collected by the GPS
        randomIndexesY = random.sample(range(pointsInGrid), pointsCount)
    else: # your grid has less points than colleted from the GPS survey. Increase grid size
        print("Your grid has %s points less than required. Increase grid size." % evaluatePoints)

    mlSample = pd.DataFrame([]) # initialize new dataframe for sample storage
    
    for i in range(pointsCount):
        x_value = xGrid[randomIndexesX[i]][randomIndexesX[i]] # obtain value of corresponding random value from the xGrid
        y_value = yGrid[randomIndexesY[i]][randomIndexesY[i]]
        z_value = z[randomIndexesX[i]][randomIndexesY[i]] # Corresponding z value evaluated for x,y

        # append data to dataframe
        samplePoint = pd.Series(dict(x=x_value,
                                     y=y_value,
                                     z=z_value))
        mlSample = mlSample.append(samplePoint,
                                   ignore_index=True) # append newly converted data to dataframe

    # the dfTrainingSample contains a set of points for which we need to evaluate the corresponding bounds and volume    
    # evaluate max and min bounds
    x_min = mlSample['x'].min()
    x_max = mlSample['x'].max()
    y_min = mlSample['y'].min()
    y_max = mlSample['y'].max()
    # evaluating bounds for z is not required for evaluating the double integral (plane equation is used for z)
    
    # Evaluating corresponding volume for this point set
    volume = integrate.dblquad(lambda y, x: ((-a*(x-xo)-b*(y-yo))/c)+zo, 
                               x_min, x_max, 
                               lambda x: y_min, lambda x: y_max)
    volume = volume[0] # saving the computed volume. The second element in the list holds the error.
#     print(dfTrainingSample.head())
#     print(volume)
    # at this point of the code we have a dataframe with corresponding points in a plane and a corresponding volume.
    # This is a sample! We made one! Now on to saving it in a format compatible with the neural network
    # Saving training sample: for 155 data points >> row = 466 elements all x's, all y's, all z's + volume
    # append data to mlDataset
    
    #create numpy array to store in mlDataframe (single row)
    sampleData = np.concatenate([mlSample['x'].values,
                                 mlSample['y'].values,
                                 mlSample['z'].values])
    datasetSample = pd.Series(dict(inputData = sampleData,
                                   inputLabel = volume))
    mlDataset = mlDataset.append(datasetSample,
                                 ignore_index=True)


# Debugging        
#     print("Here's a point in the <x,y,z> grid:")
#     print(xGrid[randomIndex][randomIndex])
#     print(yGrid[randomIndex][randomIndex])
#     print(z[randomIndex][randomIndex])
#     print("%s %s %s" % (a, b, c))
#     print(point1)

# Create dataframe from list of indexed random samples

In [62]:
mlDataset.to_csv('mlDataset.csv',index=False) # save to file

In [63]:
dfNew = pd.read_csv('mlDataset.csv')

In [64]:
dfNew


Out[64]:
inputData inputLabel
0 [ 77.22194358 202.54212629 0.8825365 13... 5.884609e+05
1 [ 6.17775549 0.8825365 135.91062069 2... 8.986781e+05
2 [ 1.95481834e+02 1.29291597e+02 1.6194544... 1.308210e+08
3 [ 1.79596177e+02 8.51647721e+01 2.6476094... -1.429454e+08

In [65]:
mlDataset


Out[65]:
inputData inputLabel
0 [77.2219435756, 202.542126293, 0.882536498007,... 5.884609e+05
1 [6.17775548605, 0.882536498007, 135.910620693,... 8.986781e+05
2 [195.481834309, 129.291596958, 161.945447384, ... 1.308210e+08
3 [179.596177344, 85.1647720577, 2.64760949402, ... -1.429454e+08

In [66]:
# Saving to CSV doesn't exactly read right when trying to import. I am going to pickle it.
import pickle
file = open('mlDataset.pickle', 'wb')
pickle.dump(mlDataset, file)
file.close()

In [67]:
# recovering data
file = open('mlDataset.pickle', 'rb')
pickledDataset = pickle.load(file)
file.close()

In [68]:
pickledDataset


Out[68]:
inputData inputLabel
0 [77.2219435756, 202.542126293, 0.882536498007,... 5.884609e+05
1 [6.17775548605, 0.882536498007, 135.910620693,... 8.986781e+05
2 [195.481834309, 129.291596958, 161.945447384, ... 1.308210e+08
3 [179.596177344, 85.1647720577, 2.64760949402, ... -1.429454e+08

In [37]:
trace1 = go.Scatter3d(
    x=dfTrainingSample['x'].values,
    y=dfTrainingSample['y'].values,
    z=dfTrainingSample['z'].values,
    mode='markers',
    marker=dict(
        size=5,
        line=dict(
            color='rgba(217, 217, 217, 0.14)',
            width=0.5
        ),
        opacity=0.8
    )
)

data = [trace1]
layout = go.Layout(
    margin=dict(
        l=0,
        r=0,
        b=0,
        t=0
    )
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='simple-3d-scatter')


High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~agu3rra/0 or inside your plot.ly account where it is named 'simple-3d-scatter'
Out[37]:

In [110]:
xGrid.shape


Out[110]:
(500, 500)