Métricas de Avaliação

- Para problemas de Classificação


In [5]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics

# Load the data (LibSVM text format: "label index1:value1 index2:value2 ...")
data = MLUtils.loadLibSVMFile(sc, "data/sample_multiclass_classification_data.txt")

# Split the dataset into training (60%) and test (40%) sets.
# FIX: the original used the Python-2-only long literal `11L`; a plain int
# seed is equivalent on Python 2 and also valid on Python 3.
training, test = data.randomSplit([0.6, 0.4], seed=11)
training.cache()

# Train a multinomial logistic regression model (3 classes)
model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

# Build (prediction, label) pairs over the test set
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Metrics object computed from the (prediction, label) pairs
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics. With no label argument these return the overall
# (document-level) values, which coincide for single-label multiclass
# data — see the identical numbers in the output. In Spark 2.x these
# no-arg forms are deprecated in favor of `metrics.accuracy`.
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Per-class statistics: collect the distinct labels, then query each metric
labels = data.map(lambda lp: lp.label).distinct().collect()
for label in sorted(labels):
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))


Summary Stats
Precision = 0.862068965517
Recall = 0.862068965517
F1 Score = 0.862068965517
Class 0.0 precision = 0.809523809524
Class 0.0 recall = 0.809523809524
Class 0.0 F1 Measure = 0.809523809524
Class 1.0 precision = 1.0
Class 1.0 recall = 1.0
Class 1.0 F1 Measure = 1.0
Class 2.0 precision = 0.789473684211
Class 2.0 recall = 0.789473684211
Class 2.0 F1 Measure = 0.789473684211

- Para problemas de Regressão


In [6]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.linalg import DenseVector

# Cria estrutura do tipo LabeledPoint
def parsePoint(line):
    """Parse one LibSVM-style text line into a LabeledPoint.

    The first whitespace-separated token is the numeric label; each
    remaining token has the form "index:value", of which only the value
    part is kept (indices are assumed dense and in order — TODO confirm
    against the data file).
    """
    tokens = line.split()
    label = float(tokens[0])
    feature_values = []
    for token in tokens[1:]:
        feature_values.append(float(token.split(':')[1]))
    return LabeledPoint(label, DenseVector(feature_values))

# Read the raw text file and convert each line into a LabeledPoint
data = sc.textFile("data/sample_linear_regression_data.txt")
parsedData = data.map(parsePoint)

# Fit a linear regression model via stochastic gradient descent
model = LinearRegressionWithSGD.train(parsedData)

# Pair each prediction with its observed label
valuesAndPreds = parsedData.map(
    lambda point: (float(model.predict(point.features)), point.label))

# Regression evaluation metrics over the (prediction, observation) pairs
metrics = RegressionMetrics(valuesAndPreds)

# Squared-error metrics
print("MSE = %s" % metrics.meanSquaredError)
print("RMSE = %s" % metrics.rootMeanSquaredError)

# Coefficient of determination
print("R-squared = %s" % metrics.r2)

# Mean absolute error
print("MAE = %s" % metrics.meanAbsoluteError)

# Explained variance
print("Explained variance = %s" % metrics.explainedVariance)


MSE = 103.309686818
RMSE = 10.1641372884
R-squared = 0.0276391109678
MAE = 8.14869190795
Explained variance = 2.88839520172

In [ ]: