In [1]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.util import MLUtils

In [2]:
# Read the LIBSVM-formatted sample file through the SparkContext `sc` supplied by the
# notebook kernel, and cache the result because later cells traverse it several times.
libsvm_path = '/home/anant/projects/spark-examples/data/sample_libsvm_data.txt'
data = MLUtils.loadLibSVMFile(sc, libsvm_path).cache()

In [3]:
# Fit a regression decision tree on the full data set.
# The hyperparameters are collected in one dict so they can be read at a glance;
# an empty categoricalFeaturesInfo means every feature is treated as continuous.
tree_params = {
    'categoricalFeaturesInfo': {},
    'impurity': 'variance',
    'maxDepth': 5,
    'maxBins': 100,
}
model = DecisionTree.trainRegressor(data, **tree_params)

In [4]:
# Evaluate the model on the training instances and compute the training error.
# Predictions are made from the feature vectors only, then paired back with the labels.
predictions = model.predict(data.map(lambda lp: lp.features))
labelsAndPredictions = data.map(lambda lp: lp.label).zip(predictions)

# Each element is a (label, prediction) pair. Index it explicitly: the tuple-parameter
# form `lambda (v, p): ...` is a SyntaxError on Python 3 (removed by PEP 3113), while
# indexing works on both Python 2 and Python 3.
squaredErrors = labelsAndPredictions.map(
    lambda pair: (pair[0] - pair[1]) * (pair[0] - pair[1]))

# Mean squared error = sum of squared residuals / number of instances.
# float() keeps the division non-integer on Python 2 as well.
trainMSE = squaredErrors.sum() / float(data.count())

print('Training Mean Squared Error = ' + str(trainMSE))
print('Learned regression tree model:')
print(model)


Training Mean Squared Error = 0.0
Learned regression tree model:
DecisionTreeModel regressor
  If (feature 434 <= 0.0)
   If (feature 100 <= 193.5)
    Predict: 0.0
   Else (feature 100 > 193.5)
    Predict: 1.0
  Else (feature 434 > 0.0)
   Predict: 1.0


In [ ]: