Classification and Regression Algorithms

- SVM


In [13]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

# Build a LabeledPoint from each line of the input data
def parsePoint(line):
    values = [float(x) for x in line.split(' ')]
    return LabeledPoint(values[0], values[1:])

data = sc.textFile("data/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Build the SVM model using Stochastic Gradient Descent
model = SVMWithSGD.train(parsedData, iterations=100)

# Make predictions and compute the error on the training set
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda vp: vp[0] != vp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))


Training Error = 0.38198757764
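
The SVMModel class imported above is used for model persistence rather than training. A minimal sketch of saving the trained model and loading it back (the target path "data/svm_model" is an assumption; any writable location works):


In [ ]:
# Persist the trained model and reload it later
# (the path below is an assumed example, not from the original notebook)
model.save(sc, "data/svm_model")
sameModel = SVMModel.load(sc, "data/svm_model")
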

- Decision Tree


In [12]:
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load the data
data = MLUtils.loadLibSVMFile(sc, 'data/sample_libsvm_data.txt')
# Split the data into training (70%) and test (30%) sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a DecisionTree classifier.
# An empty categoricalFeaturesInfo indicates that all features are continuous.
# If there were a categorical feature, e.g. feature 0 with 2 categories,
# it would be declared as categoricalFeaturesInfo = {0: 2}.
model = DecisionTree.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     impurity='gini', maxDepth=5, maxBins=32)

# Make predictions and compute the error on the test set
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification tree model:')
print(model.toDebugString())


Test Error = 0.0769230769231
Learned classification tree model:
DecisionTreeModel classifier of depth 1 with 3 nodes
  If (feature 406 <= 20.0)
   Predict: 0.0
  Else (feature 406 > 20.0)
   Predict: 1.0
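
The section title also covers regression, and the same tree API has a regression counterpart. A minimal sketch on the same train/test split, using variance as the impurity measure and mean squared error for evaluation (this cell is illustrative and was not executed in the original notebook):


In [ ]:
# Train a DecisionTree regressor on the same split.
# 'variance' is the impurity measure used for regression trees.
regModel = DecisionTree.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                       impurity='variance', maxDepth=5, maxBins=32)

# Evaluate with mean squared error on the test set
regPredictions = regModel.predict(testData.map(lambda x: x.features))
labelsAndValues = testData.map(lambda lp: lp.label).zip(regPredictions)
testMSE = labelsAndValues.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
print('Mean Squared Error = ' + str(testMSE))
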

- Random Forest


In [9]:
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

data = MLUtils.loadLibSVMFile(sc, 'data/sample_libsvm_data.txt')
# Split the data into training (70%) and test (30%) sets
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a RandomForest classifier.
# Number of decision trees in the ensemble: 3.
model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                     numTrees=3, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=4, maxBins=32)

# Make predictions and compute the error on the test set
predictions = model.predict(testData.map(lambda x: x.features))
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(lambda vp: vp[0] != vp[1]).count() / float(testData.count())
print('Test Error = ' + str(testErr))
print('Learned classification forest model:')
print(model.toDebugString())


Test Error = 0.030303030303
Learned classification forest model:
TreeEnsembleModel classifier with 3 trees

  Tree 0:
    If (feature 511 <= 0.0)
     If (feature 402 <= 0.0)
      Predict: 1.0
     Else (feature 402 > 0.0)
      Predict: 0.0
    Else (feature 511 > 0.0)
     Predict: 0.0
  Tree 1:
    If (feature 401 <= 0.0)
     If (feature 370 <= 0.0)
      Predict: 1.0
     Else (feature 370 > 0.0)
      Predict: 0.0
    Else (feature 401 > 0.0)
     Predict: 0.0
  Tree 2:
    If (feature 399 <= 0.0)
     If (feature 494 <= 0.0)
      If (feature 434 <= 0.0)
       Predict: 0.0
      Else (feature 434 > 0.0)
       Predict: 1.0
     Else (feature 494 > 0.0)
      If (feature 606 <= 0.0)
       Predict: 0.0
      Else (feature 606 > 0.0)
       Predict: 1.0
    Else (feature 399 > 0.0)
     Predict: 0.0
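
RandomForest likewise exposes a trainRegressor method for regression. A minimal sketch on the same split, mirroring the classifier parameters above but with variance impurity and MSE evaluation (illustrative only, not executed in the original notebook):


In [ ]:
# Train a RandomForest regressor on the same train/test split
rfRegModel = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo={},
                                         numTrees=3, featureSubsetStrategy="auto",
                                         impurity='variance', maxDepth=4, maxBins=32)

# Evaluate with mean squared error on the test set
rfRegPredictions = rfRegModel.predict(testData.map(lambda x: x.features))
rfLabelsAndValues = testData.map(lambda lp: lp.label).zip(rfRegPredictions)
rfTestMSE = rfLabelsAndValues.map(lambda vp: (vp[0] - vp[1]) ** 2).sum() / float(testData.count())
print('Mean Squared Error = ' + str(rfTestMSE))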

