In [1]:
# Imports relativos ao sistema operacional
import os
import sys
In [2]:
# Spark installation and data-directory paths.
# NOTE(review): absolute, machine-specific paths -- make these configurable
# (e.g. via environment variables) before sharing this notebook.
SPARK_PATH = "/Users/flavio.clesio/Documents/spark-2.1.0"
ROOT_DIR = "/Users/flavio.clesio/Desktop/pyspark-regression/dataset/"
In [3]:
# Export the Spark paths as environment variables for the Spark launcher.
os.environ['SPARK_HOME'] = SPARK_PATH
os.environ['HADOOP_HOME'] = SPARK_PATH
# Put every relevant piece of the Spark installation on the Python path.
sys.path.append(SPARK_PATH + "/bin")
sys.path.append(SPARK_PATH + "/python")
sys.path.append(SPARK_PATH + "/python/pyspark/")
sys.path.append(SPARK_PATH + "/python/lib")
sys.path.append(SPARK_PATH + "/python/lib/pyspark.zip")
sys.path.append(SPARK_PATH + "/python/lib/py4j-0.10.4-src.zip") # py4j version must match the one bundled with this Spark release
In [4]:
%matplotlib inline
In [5]:
# Vamos fazer agora alguns imports iniciais em relação ao Spark
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.tree import DecisionTree
import matplotlib.pyplot
import numpy as np
import matplotlib
# BUGFIX: matplotlib.use() only reliably takes effect when called BEFORE
# matplotlib.pyplot is first imported, and the `%matplotlib inline` /
# `%pylab inline` magics used in this notebook re-select the inline backend
# anyway -- so this call was inert/conflicting and is disabled.
# matplotlib.use('Agg')
In [6]:
%pylab inline
# Default figure size for every plot in this notebook.
# NOTE(review): %pylab dumps numpy/matplotlib names (e.g. `plot`) into the
# namespace; later cells rely on that. Prefer explicit imports long-term.
pylab.rcParams['figure.figsize'] = (14, 9)
In [7]:
# Instantiate the SparkContext (local mode, app name "pyspark-rapiddo").
sc = SparkContext("local", "pyspark-rapiddo")
# Bare expression: displays the SparkContext repr in the notebook output.
sc
Out[7]:
In [8]:
# Load the raw .csv file as an RDD of text lines.
raw_data = sc.textFile(ROOT_DIR + "base_maio_scalled_one_hotted.csv")
In [9]:
# Simple record count (triggers a full read of the file).
num_data = raw_data.count()
print 'Quantidade de registros:', num_data
In [10]:
# Split each line into fields.
# NOTE(review): the original comment claimed a comma split, but the code
# splits on '|'; parsePoint below splits on comma/space instead -- confirm
# which delimiter the file actually uses.
records = raw_data.map(lambda x: x.split("|"))
In [11]:
# First record -- the original note claims the file has no header row
# (TODO confirm: nothing here strips a header).
first = records.first()
print first
In [12]:
# Cache `records`, since it will be read many times.
records.cache()
Out[12]:
In [13]:
# First record of the cached RDD.
records.first()
Out[13]:
Out[13]:
In [14]:
# Parse the raw text lines into LabeledPoint records.
def parsePoint(line):
    """Turn one delimited text line into a LabeledPoint.

    Commas are normalized to spaces, each field is cast to float; column 73
    becomes the label and columns 1..72 the feature vector (column 0 is
    skipped -- presumably an identifier; TODO confirm the CSV layout).
    """
    fields = line.replace(',', ' ').split(' ')
    values = list(map(float, fields))
    return LabeledPoint(values[73], values[1:73])
In [15]:
# Parse every raw line into a LabeledPoint.
# NOTE(review): this maps over `raw_data`, not the cached `records` RDD.
parsedData = raw_data.map(parsePoint)
In [16]:
# Peek at one parsed record.
parsedData.take(1)
Out[16]:
In [17]:
# Peek at one raw line for comparison.
raw_data.take(1)
Out[17]:
In [18]:
# 70/30 train/test split with a fixed seed for reproducibility.
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3], seed=100)
In [19]:
trainingData.take(1)
Out[19]:
In [20]:
testData.take(1)
Out[20]:
In [21]:
def get_categorical_fatures(num_features=69, arity=2):
    """Build the categoricalFeaturesInfo mapping for DecisionTree.trainRegressor.

    Maps each feature index in [0, num_features) to its category count.
    The original hand-wrote 69 identical binary (one-hot) entries; this
    generalizes the feature count and arity while keeping the default
    result identical.
    (The name keeps the original's typo -- callers use
    `get_categorical_fatures`.)

    :param num_features: number of leading categorical feature indices
    :param arity: number of categories per feature (2 for one-hot columns)
    :return: dict {feature_index: arity}
    """
    return {i: arity for i in range(num_features)}
In [22]:
# Train a DecisionTree regression model.
# The 69 leading features are declared binary-categorical via
# get_categorical_fatures(); remaining features are treated as continuous.
model = DecisionTree.trainRegressor(trainingData
,get_categorical_fatures()
,impurity='variance'
,maxDepth=8
,maxBins=200
,minInstancesPerNode=1
,minInfoGain = 0.0)
In [23]:
# Evaluate the model on the test instances.
predictions = model.predict(testData.map(lambda x: x.features))
In [24]:
# Pair each true label with its prediction.
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
In [25]:
# Mean squared error over the test set.
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum()/float(testData.count())
In [26]:
print('Test Mean Squared Error = ' + str(testMSE))
print('Test Root Mean Squared Error = ' + str(np.sqrt(testMSE)))
print('Learned regression tree model:')
print(model.toDebugString())
In [27]:
# Save and load model
#model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
#sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
In [ ]:
# Error helper functions used when evaluating the tree / linear models.
def squared_error(actual, pred):
    """Squared error: (pred - actual)**2."""
    return (pred - actual)**2

def abs_error(actual, pred):
    """Absolute error: |pred - actual|."""
    return np.abs(pred - actual)

def squared_log_error(actual, pred):
    """Squared log error: (log(pred + 1) - log(actual + 1))**2.

    The expression is symmetric in its two arguments, so unifying the
    parameter order with the other helpers (the original declared
    (pred, actual)) cannot change any result.
    """
    return (np.log(pred + 1) - np.log(actual + 1))**2
In [ ]:
# Helper to evaluate decision trees for the hyper-parameter sweeps below.
def evaluate_dt(train, test, maxDepth, maxBins):
    """Train a DecisionTree regressor (all features continuous) and return
    the RMSE on `test`.

    NOTE(review): a later cell redefines `evaluate_dt` to return RMSLE
    instead -- keep track of which definition is live when reading results.

    :param train: RDD[LabeledPoint] training data
    :param test: RDD[LabeledPoint] evaluation data
    :param maxDepth: maximum tree depth
    :param maxBins: maximum number of split bins
    :return: root mean squared error on `test`
    """
    model = DecisionTree.trainRegressor(train
                                        ,{}
                                        ,impurity='variance'
                                        ,maxDepth=maxDepth
                                        ,maxBins=maxBins)
    preds = model.predict(test.map(lambda p: p.features))
    actual = test.map(lambda p: p.label)
    tp = actual.zip(preds)
    # The original also computed an RMSLE here that was never used (it only
    # cost an extra Spark job); it has been removed. The tuple-unpacking
    # lambda parameters (Python-2-only syntax) are replaced by indexing.
    rmse = np.sqrt(tp.map(lambda pair: squared_error(pair[0], pair[1])).mean())
    return rmse
In [ ]:
# Hyper-parameter sweep: tree depth (maxBins fixed at 200... actually 32 here).
# NOTE(review): depth sweep uses maxBins=32 although the model above used 200.
params = [5,8,10,20,30]
metrics = [evaluate_dt(trainingData, testData, param, 32) for param in params]
print 'Parametros escolhidos:', params
print 'RMSE dos parametros', metrics
# `plot` comes from the %pylab magic namespace.
plot(params, metrics)
fig = matplotlib.pyplot.gcf()
In [ ]:
# Hyper-parameter sweep: maxBins (depth fixed at 5).
params = [2, 4, 8, 16, 32, 64, 100, 200]
metrics = [evaluate_dt(trainingData, testData, 5, param) for param in params]
print 'Parametros escolhidos:', params
print 'RMSE dos parametros', metrics
plot(params, metrics)
fig = matplotlib.pyplot.gcf()
In [ ]:
# Hyper-parameter sweep: larger maxBins values (depth fixed at 5).
params = [200, 400, 500, 700, 1000]
metrics = [evaluate_dt(trainingData, testData, 5, param) for param in params]
print 'Parametros escolhidos:', params
print 'RMSE dos parametros', metrics
plot(params, metrics)
fig = matplotlib.pyplot.gcf()
In [ ]:
# Linear-regression experiments: inspect the trainer's signature.
help(LinearRegressionWithSGD)
In [28]:
def linearRegression_MSE(data, regMode, regForce, numInteracoes, stepConvergence):
    """Train a LinearRegressionWithSGD model on `data` and print its metrics.

    :param data: RDD[LabeledPoint] used both to fit and to score the model
    :param regMode: regularizer type: None, 'l1' or 'l2'
    :param regForce: regularization strength (regParam)
    :param numInteracoes: number of SGD iterations
    :param stepConvergence: SGD step size
    :return: None (metrics are printed, not returned)
    """
    model = LinearRegressionWithSGD.train(data
                                          ,iterations=numInteracoes
                                          , step=stepConvergence
                                          , miniBatchFraction=1.0
                                          , initialWeights=None
                                          , regParam=regForce
                                          , regType=regMode
                                          , intercept=True
                                          , validateData=True
                                          , convergenceTol=0.001)
    # Information about the model / regularizer strength.
    print("Regularizador = " + str(regMode))
    print("Parametro de Regularizacao = " + str(regForce))
    # BUGFIX: the metrics were previously computed on the global `parsedData`
    # (the full dataset) rather than the `data` argument, so the train/test
    # distinction made by callers was silently ignored. An additional unused
    # `valuesAndPreds` RDD was also removed.
    Preds = data.map(lambda p: (p.label, float(model.predict(p.features))))
    metrics = RegressionMetrics(Preds)
    # Squared-error metrics.
    print("MSE = %s" % metrics.meanSquaredError)
    print("RMSE = %s" % metrics.rootMeanSquaredError)
    # R-squared.
    print("R-squared = %s" % metrics.r2)
    # Mean absolute error.
    print("MAE = %s" % metrics.meanAbsoluteError)
    # Explained variance.
    print("Explained variance = %s" % metrics.explainedVariance)
    return
In [29]:
# Linear regression without regularization.
# The original spelled out 12 near-identical cells; linearRegression_MSE
# prints its metrics and returns None, so the RegLinear_N bindings carried
# no information. A parameter-grid loop reproduces the original console
# output exactly (same order, same "Modelo N" headers).
modelo = 0
for step in (0.001, 0.01, 0.1, 1.0):
    for n_iter in (10000, 100000, 500000):
        modelo += 1
        print('\n Modelo %d' % modelo)
        linearRegression_MSE(trainingData, None, 0, n_iter, step)
In [30]:
# L1 (Lasso) regularization grid.
# Same refactor as the unregularized sweep: the LASSO_N bindings were all
# None, so a loop over (regParam, step) x iterations reproduces the
# original's console output exactly.
modelo = 0
for reg_force, step in ((0.0, 0.001), (0.01, 0.001), (0.025, 0.01),
                        (0.05, 0.1), (0.1, 1.0), (10.0, 1.0),
                        (100.0, 1.0), (1000.0, 1.0)):
    for n_iter in (10000, 100000, 500000):
        modelo += 1
        print('\n Modelo %d' % modelo)
        linearRegression_MSE(trainingData, "l1", reg_force, n_iter, step)
In [31]:
# L2 (Ridge) regularization grid (same refactor as the Lasso sweep; the
# RIDGE_N bindings were all None and are dropped).
modelo = 0
for reg_force, step in ((0.0, 0.001), (0.01, 0.001), (0.1, 1.0),
                        (1.0, 1.0), (5.0, 1.0), (10.0, 1.0), (20.0, 1.0)):
    for n_iter in (10000, 100000, 500000):
        modelo += 1
        print('\n Modelo %d' % modelo)
        linearRegression_MSE(trainingData, "l2", reg_force, n_iter, step)
In [ ]:
# More decision-tree experiments: inspect the trainer's signature.
help(DecisionTree.trainRegressor)
In [ ]:
# Another decision-tree experiment, this time with no categorical feature
# info (all features treated as continuous).
# NOTE(review): trained on the FULL parsedData set, which contains testData
# -- any evaluation against testData below leaks training data.
dt_model = DecisionTree.trainRegressor(parsedData
,categoricalFeaturesInfo = {}
, impurity='variance'
, maxDepth=5
, maxBins=32
, minInstancesPerNode=1
, minInfoGain=0.0)
In [ ]:
# Evaluate dt_model on the test instances and compute the test error.
# NOTE(review): dt_model was trained on parsedData, which CONTAINS testData,
# so this MSE is optimistic.
predictions = dt_model.predict(testData.map(lambda x: x.features))
In [ ]:
# Pair each true label with its prediction.
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)
In [ ]:
# Mean squared error over the test set.
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
float(testData.count())
In [ ]:
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
# BUGFIX: the original printed `model` (the first tree, trained much earlier)
# instead of `dt_model`, the tree evaluated in the cells just above.
print(dt_model.toDebugString())
In [ ]:
# Save and load model
#model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
#sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")
In [ ]:
# Paramgrid: https://github.com/MingChen0919/learning-apache-spark/blob/master/linear-regression.ipynb
# Paramgrid: https://mapr.com/blog/churn-prediction-pyspark-using-mllib-and-ml-packages/
In [ ]:
# NOTE(review): `data` is never defined anywhere in this notebook -- this
# cell (and the ones below using `data`/`data_dt`) will fail on a fresh
# Restart-&-Run-All. TODO: define `data` (presumably parsedData?) first.
linear_model = LinearRegressionWithSGD.train(data
,iterations=10
,step=0.1
,intercept=False)
In [ ]:
# Map the dataset to (true label, prediction) pairs with the trained model.
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))
In [ ]:
print "Linear Model predictions: " + str(true_vs_predicted.take(5))
In [ ]:
# Decision-tree regressor on `data_dt`. When categorical variables exist,
# their arities must be passed via categoricalFeaturesInfo; here it is left
# empty, so every feature is treated as continuous ("as-is").
# NOTE(review): `data_dt` is never defined in this notebook.
dt_model = DecisionTree.trainRegressor(data_dt
,{})
# Key difference vs. the linear model: dummy variables must be pre-built in
# the feature vector for linear models, while a decision tree's splitting
# can often handle raw categories -- though encoding is still recommended.
In [ ]:
preds = dt_model.predict(data_dt.map(lambda p: p.features))
In [ ]:
# NOTE(review): labels come from `data` but predictions from `data_dt`;
# the zip below assumes both RDDs align row-for-row -- verify.
actual = data.map(lambda p: p.label)
In [ ]:
true_vs_predicted_dt = actual.zip(preds)
In [ ]:
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))
In [ ]:
print "Decision Tree depth: " + str(dt_model.depth())
In [ ]:
print "Decision Tree number of nodes: " + str(dt_model.numNodes())
In [ ]:
# Index every record, producing (index, point) key-value pairs.
# NOTE(review): tuple-unpacking lambda parameters are Python-2-only syntax.
data_with_idx = data.zipWithIndex().map(lambda (k, v): (v, k))
In [ ]:
# Test set: a 20% sample of the whole dataset (no replacement, seed 42).
test = data_with_idx.sample(False, 0.2, 42)
In [ ]:
# subtractByKey removes the sampled (overlapping) instances from the full
# set, leaving the remaining ~80% as the TRAINING set (the original comment
# mislabeled this as the test base).
train = data_with_idx.subtractByKey(test)
In [ ]:
# Drop the index keys again, keeping only the points.
train_data = train.map(lambda (idx, p): p)
test_data = test.map(lambda (idx, p) : p)
train_size = train_data.count()
test_size = test_data.count()
In [ ]:
print "Training data size: %d" % train_size
print "Test data size: %d" % test_size
print "Total data size: %d " % num_data
print "Train + Test size : %d" % (train_size + test_size)
In [ ]:
# Same indexed 80/20 split for the decision-tree dataset.
data_with_idx_dt = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
test_dt = data_with_idx_dt.sample(False, 0.2, 42)
train_dt = data_with_idx_dt.subtractByKey(test_dt)
train_data_dt = train_dt.map(lambda (idx, p): p)
test_data_dt = test_dt.map(lambda (idx, p) : p)
In [ ]:
def evaluate(train, test, iterations, step, regParam, regType, intercept):
    """Train LinearRegressionWithSGD with the given hyper-parameters and
    return the RMSLE on `test`.

    :param train: RDD[LabeledPoint] training data
    :param test: RDD[LabeledPoint] evaluation data
    :param iterations: number of SGD iterations
    :param step: SGD step size
    :param regParam: regularization strength
    :param regType: 'l1', 'l2' or None
    :param intercept: whether to fit an intercept term
    :return: root mean squared log error on `test`
    """
    model = LinearRegressionWithSGD.train(train
                                          ,iterations
                                          ,step
                                          ,regParam=regParam
                                          ,regType=regType
                                          ,intercept=intercept)
    tp = test.map(lambda p: (p.label, model.predict(p.features)))
    # Indexed lambda instead of the Python-2-only tuple-unpacking parameter;
    # squared_log_error is symmetric, so the argument order is immaterial.
    rmsle = np.sqrt(tp.map(lambda pair: squared_log_error(pair[0], pair[1])).mean())
    return rmsle
In [ ]:
# Sweep: number of SGD iterations.
params = [1, 5, 10, 20, 50, 100]
metrics = [evaluate(train_data, test_data, param, 0.01, 0.0,'l2',False) for param in params]
print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics
In [ ]:
matplotlib.pyplot.plot(params, metrics)
matplotlib.pyplot.show()
In [ ]:
# Sweep: SGD step size.
params = [0.01, 0.025, 0.05, 0.1, 1.0]
metrics = [evaluate(train_data, test_data, 10, param, 0.0, 'l2',False) for param in params]
print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics
In [ ]:
# Sweep: Ridge (L2) regularization strength.
params = [0.0, 0.01, 0.1, 1.0, 5.0, 10.0, 20.0]
metrics = [evaluate(train_data, test_data, 10, 0.1, param, 'l2',False) for param in params]
print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics
In [ ]:
matplotlib.pyplot.plot(params, metrics)
matplotlib.pyplot.xscale('log')
matplotlib.pyplot.show()
In [ ]:
# Sweep: Lasso (L1) regularization strength.
params = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
metrics = [evaluate(train_data, test_data, 10, 0.1, param, 'l1',False) for param in params]
print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics
In [ ]:
matplotlib.pyplot.plot(params, metrics)
matplotlib.pyplot.xscale('log')
matplotlib.pyplot.show()
In [ ]:
# Train three Lasso models with increasing regularization to inspect sparsity.
model_l1 = LinearRegressionWithSGD.train(train_data, 10, 0.1,regParam=1.0, regType='l1', intercept=False)
model_l1_10 = LinearRegressionWithSGD.train(train_data, 10, 0.1,regParam=10.0, regType='l1', intercept=False)
model_l1_100 = LinearRegressionWithSGD.train(train_data, 10, 0.1,regParam=100.0, regType='l1', intercept=False)
In [ ]:
# Count zeroed weights per model.
# NOTE(review): `weights.array` is version-dependent; `weights.toArray()` is
# the documented accessor on pyspark vectors -- confirm it works on Spark 2.1.
print "L1 (1.0) number of zero weights: " + str(sum(model_l1.weights.array == 0))
print "L1 (10.0) number of zeros weights: " + str(sum(model_l1_10.weights.array == 0))
print "L1 (100.0) number of zeros weights: " + str(sum(model_l1_100.weights.array == 0))
In [ ]:
# Stronger (more aggressive) L1 regularization drives more weights to zero.
In [ ]:
# Sweep: with / without intercept.
params = [False, True]
metrics = [evaluate(train_data, test_data, 10, 0.1, 1.0, 'l2', param)for param in params]
print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics
In [ ]:
# Bar chart of the two settings.
# NOTE(review): a log x-scale over boolean x-values is dubious here.
matplotlib.pyplot.bar(params, metrics)
matplotlib.pyplot.xscale('log')
matplotlib.pyplot.show()
In [ ]:
def evaluate_dt(train, test, maxDepth, maxBins):
    """Train a DecisionTree regressor (all features continuous) and return
    the RMSLE on `test`.

    NOTE(review): this REDEFINES the earlier `evaluate_dt`, which returned
    RMSE -- the metric printed by the sweep cells below is therefore RMSLE.

    :param train: RDD[LabeledPoint] training data
    :param test: RDD[LabeledPoint] evaluation data
    :param maxDepth: maximum tree depth
    :param maxBins: maximum number of split bins
    :return: root mean squared log error on `test`
    """
    model = DecisionTree.trainRegressor(train
                                        , {}
                                        ,impurity='variance'
                                        ,maxDepth=maxDepth
                                        ,maxBins=maxBins)
    preds = model.predict(test.map(lambda p: p.features))
    actual = test.map(lambda p: p.label)
    tp = actual.zip(preds)
    # Indexed lambda instead of the Python-2-only tuple-unpacking parameter;
    # squared_log_error is symmetric, so argument order is immaterial.
    rmsle = np.sqrt(tp.map(lambda pair: squared_log_error(pair[0], pair[1])).mean())
    return rmsle
In [ ]:
# Sweep: tree depth (maxBins fixed at 32), using the RMSLE-returning
# redefinition of evaluate_dt above.
params = [1, 2, 3, 4, 5, 10, 20]
metrics = [evaluate_dt(train_data_dt, test_data_dt, param, 32) for param in params]
print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics
plot(params, metrics)
fig = matplotlib.pyplot.gcf()
In [ ]:
# Sweep: maxBins (depth fixed at 5).
params = [2, 4, 8, 16, 32, 64, 100]
metrics = [evaluate_dt(train_data_dt, test_data_dt, 5, param) for param in params]
print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics
plot(params, metrics)
fig = matplotlib.pyplot.gcf()
In [ ]:
# Save and load model
#model.save(sc, "home/myDecisionTreeClassificationModel")
In [ ]:
#sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")