In [1]:
import pandas as pd
import numpy as np
import scipy
from sklearn import svm
import random
In [2]:
# CSV files to load; the loading loop further down reads each entry with pandas.
# NOTE(review): this list literal appears cut off in this view (no closing
# bracket) -- confirm the full list of files is present before running.
csvNames = ['Well1B3mths.csv','Well1C3mths.csv','Well1D3mths.csv','Well1E3mths.csv',
In [3]:
# Columns used as model inputs.
featuresColNames = ['Casing Pressure',
                    'Gas Flow (Volume)',
                    'Motor Speed',
                    'Motor Torque',
                    'Pump Speed Actual',  # this contains NULLs
                    'Tubing Flow Meter',
                    'Tubing Pressure',
                    'Water Flow Mag from Separator']
# Column the model is trained to predict.
targetsName = ['Downhole Gauge Pressure']

# Collect one cleaned frame per CSV and concatenate once at the end.
perWellFeatures = []
perWellTargets = []
for well in csvNames:
    # read_csv(index_col=0, parse_dates=True) reproduces the defaults of the
    # removed DataFrame.from_csv.
    df = pd.read_csv(well, index_col=0, parse_dates=True)
    # Drop incomplete rows from features and target TOGETHER. Dropping them
    # independently removes different rows from each side (e.g. rows where only
    # 'Pump Speed Actual' is NULL), so X and y would no longer line up.
    complete = df[featuresColNames + targetsName].dropna()
    features = complete[featuresColNames]
    target = complete[targetsName]
    perWellFeatures.append(features)
    perWellTargets.append(target)

# 2-D arrays: allFeatures is (rows, len(featuresColNames)), allTargets is (rows, 1).
# `.values` replaces the removed `.as_matrix()` and works on old and new pandas.
allFeatures = pd.concat(perWellFeatures).values
allTargets = pd.concat(perWellTargets).values
In [4]:
# Split into trainingX, trainingY, testX, testY
# Shuffle features and targets with ONE shared permutation so that every feature
# row stays paired with its own target. Shuffling the two arrays independently
# would pair each X row with a random y.
print("shuffling the arrays")
allFeatures = np.asarray(allFeatures)
allTargets = np.asarray(allTargets)
if len(allFeatures) != len(allTargets):
    raise ValueError("features and targets have different lengths: %d vs %d"
                     % (len(allFeatures), len(allTargets)))
# Fixed seed so the split is reproducible across kernel restarts.
shuffleSeed = 0
shuffledOrder = np.random.RandomState(shuffleSeed).permutation(len(allFeatures))
allFeatures = allFeatures[shuffledOrder]
allTargets = allTargets[shuffledOrder]

# Fraction of the whole dataset to use (train + test together).
sampleSize = 0.1
trainSize = int(0.7*len(allFeatures)*sampleSize)
testSize = int(0.3*len(allFeatures)*sampleSize)
# Single-argument print(...) behaves the same on Python 2 and 3; the double
# space reproduces the output of the original `print "label: ", value` form.
print("trainSize is:  " + str(trainSize))
print("testSize is:  " + str(testSize))

print("separating them into trainX, trainY, testX, testY")
# Train and test are disjoint, consecutive slices of the shuffled data.
trainX = allFeatures[:trainSize]
trainY = allTargets[:trainSize]
testX = allFeatures[trainSize:trainSize+testSize]
testY = allTargets[trainSize:trainSize+testSize]
print("done")
In [ ]:
# Fit a support vector regressor on the training split, then predict the
# held-out rows.
clf = svm.SVR()
# trainY is a single-column 2-D array (rows are indexed as testY[i][0] later in
# the notebook); flatten it because SVR.fit expects a 1-D target.
clf.fit(trainX, np.ravel(trainY))
# NOTE(review): the features are not scaled before fitting; SVMs are
# scale-sensitive, so consider standardising the inputs.
predictions = clf.predict(testX)
In [22]:
def calculateError(prediction, actual):
    """Return the signed relative error of `prediction` with respect to `actual`.

    The result is a fraction, not a percentage: 0.5 means the prediction is 50%
    above the actual value, -0.5 means 50% below.

    prediction -- predicted value (number or numpy array)
    actual     -- reference value (number or numpy array); must be non-zero

    Raises ZeroDivisionError when `actual` is a plain Python number equal to 0.
    """
    # Multiplying by 1.0 forces true division, so integer inputs are not
    # floor-divided under Python 2.
    return (prediction - actual) / (1.0 * actual)
# Signed relative error of every test prediction, as a fraction (not yet x100).
percentageErrors = []
for i, prediction in enumerate(predictions):
    # testY rows are single-element sequences; take the scalar target.
    actual = testY[i][0]
    percentageError = (prediction - actual) / (1.0 * actual)
    # Record the error so it is included in the average below.
    percentageErrors.append(percentageError)

if not percentageErrors:
    raise ValueError("no predictions to evaluate")

# NOTE(review): this is the mean of SIGNED errors, so over- and under-predictions
# cancel each other out; use absolute errors if a magnitude is wanted.
overallError = np.sum(percentageErrors) / float(len(percentageErrors))
print("Overall error was: " + str(overallError*100) + "%")