In [5]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
# Load the multiclass sample dataset (LibSVM format).
# `sc` is not defined in this cell; presumably it is the SparkContext
# provided by the notebook / pyspark shell session.
data = MLUtils.loadLibSVMFile(sc, "data/sample_multiclass_classification_data.txt")

# Split the dataset into training (60%) and test (40%) sets.
# The seed is a plain int: the former `11L` long literal only parses on
# Python 2 and is a SyntaxError on Python 3, while `11` is the same value
# on both.
training, test = data.randomSplit([0.6, 0.4], seed=11)
training.cache()

# Run the training process to build the 3-class model.
model = LogisticRegressionWithLBFGS.train(training, numClasses=3)

# Compute (prediction, label) pairs on the held-out test set.
predictionAndLabels = test.map(lambda lp: (float(model.predict(lp.features)), lp.label))

# Instantiate the object that exposes the evaluation metrics.
metrics = MulticlassMetrics(predictionAndLabels)

# Overall statistics.
# NOTE(review): the no-argument forms of precision()/recall()/fMeasure()
# appear to be deprecated in later Spark releases in favour of
# `metrics.accuracy` -- confirm against the Spark version in use.
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()
print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)

# Per-class results: collect the distinct labels of the full dataset and
# report each metric for every label, in ascending label order.
labels = data.map(lambda lp: lp.label).distinct().collect()
for label in sorted(labels):
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
In [6]:
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.linalg import DenseVector
# Build a LabeledPoint from one line of text.
def parsePoint(line):
    """Convert one whitespace-separated text line into a LabeledPoint.

    The first token is parsed as the float label. Each remaining token is
    split on ':' and its second field is parsed as a float feature value;
    the first field is not used, so the features are stored densely in the
    order they appear on the line.
    """
    tokens = line.split()
    label = float(tokens[0])
    feature_values = [float(pair.split(':')[1]) for pair in tokens[1:]]
    return LabeledPoint(label, DenseVector(feature_values))
# Read the raw regression sample file and turn every line into a
# LabeledPoint via parsePoint.
# `sc` is not defined in this cell; presumably it is the SparkContext
# provided by the notebook / pyspark shell session.
data = sc.textFile("data/sample_linear_regression_data.txt")
parsedData = data.map(parsePoint)

# Build the model.
model = LinearRegressionWithSGD.train(parsedData)


def _prediction_and_label(point):
    """Return the (prediction, label) pair for one LabeledPoint."""
    return (float(model.predict(point.features)), point.label)


# Make the predictions on the same data the model was trained on.
valuesAndPreds = parsedData.map(_prediction_and_label)

# Instantiate the metrics object from the (prediction, label) pairs.
metrics = RegressionMetrics(valuesAndPreds)

# Report, in order: squared error (MSE, RMSE), R-squared, mean absolute
# error and explained variance. Each entry pairs the output template with
# the name of the metrics attribute to read; attributes are looked up one
# at a time, right before the corresponding line is printed.
_report = (
    ("MSE = %s", "meanSquaredError"),
    ("RMSE = %s", "rootMeanSquaredError"),
    ("R-squared = %s", "r2"),
    ("MAE = %s", "meanAbsoluteError"),
    ("Explained variance = %s", "explainedVariance"),
)
for _template, _attribute in _report:
    print(_template % getattr(metrics, _attribute))
In [ ]: