In [1]:
import os
import urllib
from time import time

from numpy import array
import pyspark
from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.sql import SQLContext

# Custom imports
import MySQLConnection
In [2]:
sqlContext = SQLContext(sc)
# Get username and password from file in this format: {"user":"yourusername","password":"yourpassword"}
# The MYSQL_CREDENTIALS_FILE environment variable, when set, overrides the
# default location so the notebook is not tied to one user's home directory.
credentialsPath = os.environ.get('MYSQL_CREDENTIALS_FILE', '/home/erik/mysql_credentials.txt')
connectionProperties = MySQLConnection.getDBConnectionProps(credentialsPath)
# Get training data from the database...biosensor database and SensorTrainingReadings table,
# keeping only the five columns named in selectExpr.
data = sqlContext.read.jdbc(
    "jdbc:mysql://localhost/biosensor",
    "SensorTrainingReadings",
    properties=connectionProperties
).selectExpr("deviceID", "metricTypeID", "uomID", "positionID", "actualPitch")
# Parenthesized print matches the print(...) calls used in later cells and
# produces the same output under Python 2.
print("Train data size is {}".format(data.count()))
In [12]:
# Split data into training and test datasets (weights 0.9 / 0.1).
# A fixed seed keeps the split repeatable when the notebook is re-run against
# the same input data; without it every run trains and tests on different rows.
SPLIT_SEED = 42
(trainingDataTable, testDataTable) = data.randomSplit([0.9, 0.1], seed=SPLIT_SEED)
# Preview the leading rows of each split.
trainingDataTable.show()
testDataTable.show()
In [13]:
# The model requires LabeledPoints: rows made of a label and a vector of features.
def featurize(t):
    """Build a LabeledPoint from one reading row.

    The label is the row's positionID; the feature vector holds a single
    value, the row's actualPitch.
    """
    label = t.positionID
    features = [t.actualPitch]
    return LabeledPoint(label, features)
# Convert every training row to a LabeledPoint. Mapping over the DataFrame's
# underlying RDD (.rdd) also works on Spark 2.x, where DataFrame.map() is no
# longer available.
trainingData = trainingDataTable.rdd.map(featurize)
In [14]:
# Train the classifier/Build the model
# A fixed seed makes repeated runs on the same training data build the same forest.
MODEL_SEED = 42
startTime = time()
# Random Forest Model
model = RandomForest.trainClassifier(
    trainingData,
    numClasses=3,
    categoricalFeaturesInfo={},
    numTrees=6,
    featureSubsetStrategy="auto",
    impurity='gini',
    maxDepth=4,
    maxBins=32,
    seed=MODEL_SEED
)
elapsedTime = time() - startTime
# Parenthesized print matches the print(...) calls used in later cells and
# produces the same output under Python 2.
print("Classifier trained in {} seconds".format(round(elapsedTime, 3)))
# Save the model for use in evaluating readings
# NOTE(review): save presumably fails if this path already exists from an
# earlier run -- confirm, and clear the directory before re-running if so.
model.save(sc, "models/IoTBackBraceRandomForest.model")
In [15]:
# Evaluate model on test instances and compute test error
# (.rdd.map also works on Spark 2.x, where DataFrame.map() is unavailable.)
testData = testDataTable.rdd.map(featurize)
predictions = model.predict(testData.map(lambda x: x.features))
# Pair each true label with the prediction made for the same row.
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testCount = testData.count()
if testCount == 0:
    # An empty test split would otherwise raise ZeroDivisionError; report the
    # error rate as undefined instead.
    testErr = float('nan')
else:
    # Index into the (label, prediction) pair instead of using a
    # tuple-unpacking lambda parameter, which is Python-2-only syntax.
    misclassified = labelsAndPredictions.filter(lambda pair: pair[0] != pair[1]).count()
    testErr = misclassified / float(testCount)
print('Test Error = ' + str(testErr))
Another handy feature is that you can view the model's decision-tree logic by calling its "toDebugString()" method.
In [16]:
# Print a heading, then the text description of the trained forest.
print('Random Forest Classifcation Model:')
forestDescription = model.toDebugString()
print(forestDescription)
In [17]:
# Reload the forest from the path it was saved to in the training cell.
savedModelPath = "models/IoTBackBraceRandomForest.model"
loadedModel = RandomForestModel.load(sc, savedModelPath)
The example below passes each whole-degree pitch value to the loaded model, from -50 degrees (stooped) up through +10 degrees (standing) and on to +49 degrees, and prints the predicted position for each value.
In [18]:
# Class label -> human-readable position name. Built once, outside the loop,
# because it does not change between iterations.
positions = {
    0: "upright",
    1: "back bent",
    2: "stooped"
}
# Ask the reloaded model for a prediction at every whole-degree pitch in [-50, 49].
for i in range(-50, 50):
    prediction = loadedModel.predict([i])
    # The prediction comes back as a float class label; convert it explicitly
    # to the integer key used by `positions`.
    print(str(i) + " => " + str(positions[int(prediction)]))
In [ ]: