In [1]:
import pandas as pd
import numpy as np
import scipy
from sklearn import svm
import random

In [2]:
csvNames = ['Well1B3mths.csv','Well1C3mths.csv','Well1D3mths.csv','Well1E3mths.csv',
           'Well1F3mths.csv','Well1G3mths.csv','Well1H3mths.csv','Well1I3mths.csv',
           'Well1J3mths.csv','Well2A3mths.csv','Well2B3mths.csv','Well2C3mths.csv',
            'Well2D3mths.csv','Well2E3mths.csv','Well3A3mths.csv','Well3B3mths.csv',
            'Well3C3mths.csv','Well3D3mths.csv','Well3E3mths.csv','Well3F3mths.csv',
            'Well3G3mths.csv','Well3H3mths.csv','Well3I3mths.csv','Well4A3mths.csv',
            'Well4B3mths.csv']

In [3]:
featuresColNames = ['Casing Pressure',
                    'Gas Flow (Volume)',
                    'Motor Speed',
                    'Motor Torque',
                    'Pump Speed Actual', #this contains NULLs
                    'Tubing Flow Meter',
                    'Tubing Pressure',
                    'Water Flow Mag from Separator']
targetsName = ['Downhole Gauge Pressure']

allFeatures = []
allTargets = []

for well in csvNames:
    df = pd.DataFrame.from_csv(well)
    
    features = df[featuresColNames].dropna()
    target = df[targetsName].dropna()
    
    allFeatures.append(features)
    allTargets.append(target)

allFeatures = pd.concat(allFeatures).as_matrix()
allTargets = pd.concat(allTargets).as_matrix()

Split the Features and Targets into Training and Test Sets


In [4]:
# Split into trainingX, trainingY, testX, testY

# shuffle these arrays so that training and test sets are randomized
print "shuffling the arrays"
allFeatures = random.sample(allFeatures, len(allFeatures)) 
allTargets = random.sample(allTargets, len(allTargets))

# percentage to train on
sampleSize = 0.1

trainSize = int(0.7*len(allFeatures)*sampleSize)
testSize = int(0.3*len(allFeatures)*sampleSize)
print "trainSize is: ", trainSize
print "testSize is: ", testSize

print "separating them into trainX, trainY, testX, testY"
trainX = allFeatures[:trainSize]
trainY = allTargets[:trainSize]
testX = allFeatures[trainSize:trainSize+testSize]
testY = allTargets[trainSize:trainSize+testSize]

print "done"


shuffling the arrays
trainSize is:  305640
testSize is:  130988
separating them into trainX, trainY, testX, testY
done

🤖 Machine Learning (with sklearn)


In [ ]:
clf = svm.SVR()
clf.fit(trainX, trainY) 
predictions = clf.predict(testX)


/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.py:526: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)

Calculate the Percentage of Error

Go through each example and calculate the percentage of error. Add this percentage of error to an array. Then use this array to calculate the overall percentage of error within the predicted test set.


In [22]:
def calculateError(prediction, actual):
    return (predicted-actual)/actual

percentageErrors = []
for i, prediction in enumerate(predictions):
    actual = testY[i][0]
    percentageError = (prediction-actual)/actual
    percentageErrors.append(percentageError)

overallError = np.sum(percentageErrors) / len(percentageErrors)

print "Overall error was: " + str(overallError*100) + "%"


Overall error was: 51.8173081805%