In [1]:
# Operating system related imports
import os
import sys

In [2]:
# Paths to the Spark installation and to the data directory
SPARK_PATH = "/Users/flavio.clesio/Documents/spark-2.1.0" 
ROOT_DIR = "/Users/flavio.clesio/Desktop/pyspark-regression/dataset/"

In [3]:
# Export the Spark paths as environment variables
os.environ['SPARK_HOME'] = SPARK_PATH
os.environ['HADOOP_HOME'] = SPARK_PATH

# Add every relevant part of the Spark installation to the Python path
sys.path.append(SPARK_PATH + "/bin")
sys.path.append(SPARK_PATH + "/python")
sys.path.append(SPARK_PATH + "/python/pyspark/")
sys.path.append(SPARK_PATH + "/python/lib")
sys.path.append(SPARK_PATH + "/python/lib/pyspark.zip")
sys.path.append(SPARK_PATH + "/python/lib/py4j-0.10.4-src.zip") # The py4j version must match the one bundled with your Spark version
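
With the paths above in place, pyspark should be importable directly; an optional sanity check (a minimal sketch):

In [ ]:
# With SPARK_HOME set and the zips on sys.path, pyspark should import and
# report the expected version (2.1.0 for this setup).
import pyspark
print pyspark.__version__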

In [4]:
%matplotlib inline

In [5]:
# Initial Spark-related imports
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.evaluation import RegressionMetrics
import matplotlib
import matplotlib.pyplot
import numpy as np

In [6]:
%pylab inline
pylab.rcParams['figure.figsize'] = (14, 9)


Populating the interactive namespace from numpy and matplotlib

In [7]:
# Instantiate the SparkContext
sc = SparkContext("local", "pyspark-rapiddo")
sc


Out[7]:
<pyspark.context.SparkContext at 0x105dd7e90>
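
SparkConf is imported above but never used; a minimal sketch of the equivalent explicit configuration, kept commented out since only one SparkContext can be active at a time:

In [ ]:
# Equivalent context creation via SparkConf (sketch; do not run while the
# context above is alive):
# conf = SparkConf().setMaster("local").setAppName("pyspark-rapiddo")
# sc = SparkContext(conf=conf)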

Feature extraction for the linear model


In [8]:
# Load the .csv file
raw_data = sc.textFile(ROOT_DIR + "base_maio_scalled_one_hotted.csv")

In [9]:
# Simple count of the number of records
num_data = raw_data.count()
print 'Number of records:', num_data


Number of records: 118544

In [10]:
# Build an RDD of records. Note the file is actually comma-separated, so
# splitting on '|' keeps each line as a single field; the real parsing is
# done later in parsePoint().
records = raw_data.map(lambda x: x.split("|"))

In [11]:
# First record (the file has no header row)
first = records.first()
print first


[u'1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3.776331646,-0.303368592,-0.571108822,923']

In [12]:
# Cache the data, since it will be read many times
records.cache()


Out[12]:
PythonRDD[4] at RDD at PythonRDD.scala:48

In [13]:
# First record of the records RDD
records.first()


Out[13]:
[u'1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3.776331646,-0.303368592,-0.571108822,923']

In [14]:
# Load and parse the data: the label is the last field (index 73) and the
# features are fields 1 through 72; field 0 is dropped.
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[73], values[1:73])
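
A quick structural check of the indices used above (a sketch; the expected count of 74 fields follows from the label sitting at index 73):

In [ ]:
# Each line should have 74 comma-separated fields: column 0 (dropped),
# features 1-72, and the label at column 73.
print len(raw_data.first().split(','))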

In [15]:
parsedData = raw_data.map(parsePoint)

In [16]:
parsedData.take(1)


Out[16]:
[LabeledPoint(923.0, [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.776331646,-0.303368592,-0.571108822])]

In [17]:
raw_data.take(1)


Out[17]:
[u'1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3.776331646,-0.303368592,-0.571108822,923']

In [18]:
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3], seed=100)
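
A small sanity check on the split (sketch): randomSplit samples each record independently, so the 70/30 fractions are approximate rather than exact.

In [ ]:
# Approximate 70/30 sizes; randomSplit is proportional, not exact.
print 'Train:', trainingData.count(), 'Test:', testData.count()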

In [19]:
trainingData.take(1)


Out[19]:
[LabeledPoint(923.0, [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.776331646,-0.303368592,-0.571108822])]

In [20]:
testData.take(1)


Out[20]:
[LabeledPoint(25.0, [1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.332191362,-0.52114871,-0.752435134])]

In [21]:
def get_categorical_features():
    # Features 0 through 68 are one-hot/binary flags with 2 categories each;
    # the scaled continuous columns (69-71) are intentionally left out.
    return {i: 2 for i in range(69)}

In [22]:
# Train a DecisionTree regressor. The categoricalFeaturesInfo map built
# above marks the first 69 features as binary categorical; the remaining
# features are treated as continuous.
model = DecisionTree.trainRegressor(trainingData
                                    ,get_categorical_features()
                                    ,impurity='variance'
                                    ,maxDepth=8
                                    ,maxBins=200
                                    ,minInstancesPerNode=1
                                    ,minInfoGain = 0.0)
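
A quick look at the size of the trained tree (sketch; the same accessors are used near the end of this notebook):

In [ ]:
# Depth and node count of the fitted tree
print 'Depth:', model.depth(), 'Nodes:', model.numNodes()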

In [23]:
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))

In [24]:
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

In [25]:
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum()/float(testData.count())

In [26]:
print('Test Mean Squared Error = ' + str(testMSE))
print('Test Root Mean Squared Error = ' + str(np.sqrt(testMSE)))
print('Learned regression tree model:')
print(model.toDebugString())


Test Mean Squared Error = 78713.0239347
Test Root Mean Squared Error = 280.558414479
Learned regression tree model:
DecisionTreeModel regressor of depth 8 with 475 nodes
  If (feature 70 <= 0.983127614)
   If (feature 69 <= -0.332191362)
    If (feature 70 <= -0.081339107)
     If (feature 70 <= -0.362859746)
      If (feature 70 <= -0.444660083)
       If (feature 70 <= -0.494590159)
        If (feature 46 in {0.0})
         If (feature 71 <= -0.69199303)
          Predict: 24.88308366824991
         Else (feature 71 > -0.69199303)
          Predict: 36.66078865750997
        Else (feature 46 not in {0.0})
         If (feature 71 <= -0.69199303)
          Predict: 25.137724550898202
         Else (feature 71 > -0.69199303)
          Predict: 109.21428571428571
       Else (feature 70 > -0.494590159)
        If (feature 70 <= -0.464844582)
         If (feature 28 in {0.0})
          Predict: 47.93636885952313
         Else (feature 28 not in {0.0})
          Predict: 123.16666666666667
        Else (feature 70 > -0.464844582)
         If (feature 70 <= -0.460595214)
          Predict: 66.34393638170974
         Else (feature 70 > -0.460595214)
          Predict: 58.325383304940374
      Else (feature 70 > -0.444660083)
       If (feature 54 in {0.0})
        If (feature 70 <= -0.40641577)
         If (feature 71 <= 1.181712195)
          Predict: 71.91357779980179
         Else (feature 71 > 1.181712195)
          Predict: 103.95333333333333
        Else (feature 70 > -0.40641577)
         If (feature 70 <= -0.386231271)
          Predict: 88.17477414871439
         Else (feature 70 > -0.386231271)
          Predict: 99.95601633678919
       Else (feature 54 not in {0.0})
        If (feature 58 in {0.0})
         Predict: 68.0
        Else (feature 58 not in {0.0})
         Predict: 2004.0
     Else (feature 70 > -0.362859746)
      If (feature 70 <= -0.262999595)
       If (feature 70 <= -0.300181566)
        If (feature 71 <= 1.000385883)
         If (feature 70 <= -0.340550564)
          Predict: 109.96505505026329
         Else (feature 70 > -0.340550564)
          Predict: 123.23056702966579
        Else (feature 71 > 1.000385883)
         If (feature 38 in {0.0})
          Predict: 135.72455968688845
         Else (feature 38 not in {0.0})
          Predict: 277.51020408163265
       Else (feature 70 > -0.300181566)
        If (feature 71 <= 0.395964842)
         If (feature 71 <= -0.631550926)
          Predict: 121.739336492891
         Else (feature 71 > -0.631550926)
          Predict: 139.9868154158215
        Else (feature 71 > 0.395964842)
         If (feature 38 in {0.0})
          Predict: 153.3846153846154
         Else (feature 38 not in {0.0})
          Predict: 190.76923076923077
      Else (feature 70 > -0.262999595)
       If (feature 70 <= -0.150391339)
        If (feature 71 <= -0.38978251)
         If (feature 0 in {0.0})
          Predict: 152.36204576043068
         Else (feature 0 not in {0.0})
          Predict: 170.71108742004265
        Else (feature 71 > -0.38978251)
         If (feature 1 in {0.0})
          Predict: 190.7429906542056
         Else (feature 1 not in {0.0})
          Predict: 1593.0
       Else (feature 70 > -0.150391339)
        If (feature 71 <= 0.033312218)
         If (feature 1 in {0.0})
          Predict: 193.08574879227052
         Else (feature 1 not in {0.0})
          Predict: 619.0
        Else (feature 71 > 0.033312218)
         If (feature 67 in {1.0})
          Predict: 153.5
         Else (feature 67 not in {1.0})
          Predict: 239.9182839632278
    Else (feature 70 > -0.081339107)
     If (feature 70 <= 0.40733823)
      If (feature 70 <= 0.12050588)
       If (feature 71 <= -0.450224614)
        If (feature 70 <= 0.05251599)
         If (feature 57 in {0.0})
          Predict: 219.29820359281436
         Else (feature 57 not in {0.0})
          Predict: 298.8688524590164
        Else (feature 70 > 0.05251599)
         If (feature 25 in {0.0})
          Predict: 265.0388601036269
         Else (feature 25 not in {0.0})
          Predict: 779.0
       Else (feature 71 > -0.450224614)
        If (feature 57 in {0.0})
         If (feature 9 in {0.0})
          Predict: 269.49632193855473
         Else (feature 9 not in {0.0})
          Predict: 391.037037037037
        Else (feature 57 not in {0.0})
         If (feature 71 <= 1.181712195)
          Predict: 313.8279569892473
         Else (feature 71 > 1.181712195)
          Predict: 423.69565217391306
      Else (feature 70 > 0.12050588)
       If (feature 71 <= -0.571108822)
        If (feature 71 <= -0.752435134)
         If (feature 70 <= 0.253298635)
          Predict: 353.5466666666667
         Else (feature 70 > 0.253298635)
          Predict: 420.33962264150944
        Else (feature 71 > -0.752435134)
         If (feature 56 in {0.0})
          Predict: 265.3056338028169
         Else (feature 56 not in {0.0})
          Predict: 418.0769230769231
       Else (feature 71 > -0.571108822)
        If (feature 71 <= 3.236743732)
         If (feature 70 <= 0.253298635)
          Predict: 339.2782401902497
         Else (feature 70 > 0.253298635)
          Predict: 377.41248720573185
        Else (feature 71 > 3.236743732)
         If (feature 57 in {0.0})
          Predict: 404.6865671641791
         Else (feature 57 not in {0.0})
          Predict: 1013.4444444444445
     Else (feature 70 > 0.40733823)
      If (feature 70 <= 0.714355079)
       If (feature 71 <= -0.450224614)
        If (feature 71 <= -0.752435134)
         If (feature 70 <= 0.557128457)
          Predict: 495.97727272727275
         Else (feature 70 > 0.557128457)
          Predict: 566.65
        Else (feature 71 > -0.752435134)
         If (feature 0 in {0.0})
          Predict: 332.9298245614035
         Else (feature 0 not in {0.0})
          Predict: 426.96190476190475
       Else (feature 71 > -0.450224614)
        If (feature 67 in {1.0})
         If (feature 71 <= -0.329340406)
          Predict: 103.83333333333333
         Else (feature 71 > -0.329340406)
          Predict: 327.8
        Else (feature 67 not in {1.0})
         If (feature 70 <= 0.490200909)
          Predict: 480.97087378640776
         Else (feature 70 > 0.490200909)
          Predict: 538.8163265306123
      Else (feature 70 > 0.714355079)
       If (feature 67 in {1.0})
        If (feature 71 <= -0.69199303)
         If (feature 17 in {1.0})
          Predict: 171.0
         Else (feature 17 not in {1.0})
          Predict: 632.5
        Else (feature 71 > -0.69199303)
         If (feature 0 in {1.0})
          Predict: 194.24
         Else (feature 0 not in {1.0})
          Predict: 450.4166666666667
       Else (feature 67 not in {1.0})
        If (feature 0 in {0.0})
         If (feature 70 <= 0.863082964)
          Predict: 427.8425925925926
         Else (feature 70 > 0.863082964)
          Predict: 600.2564102564103
        Else (feature 0 not in {0.0})
         If (feature 71 <= 2.632322692)
          Predict: 605.6516129032258
         Else (feature 71 > 2.632322692)
          Predict: 691.2558139534884
   Else (feature 69 > -0.332191362)
    If (feature 69 <= 0.352562473)
     If (feature 70 <= 0.001523572)
      If (feature 71 <= -0.510666718)
       If (feature 71 <= -0.69199303)
        If (feature 70 <= -0.258750227)
         If (feature 70 <= -0.388355955)
          Predict: 64.33244680851064
         Else (feature 70 > -0.388355955)
          Predict: 138.95165394402036
        Else (feature 70 > -0.258750227)
         If (feature 71 <= -0.752435134)
          Predict: 198.01932367149757
         Else (feature 71 > -0.752435134)
          Predict: 283.7296137339056
       Else (feature 71 > -0.69199303)
        If (feature 22 in {0.0})
         If (feature 70 <= -0.522211052)
          Predict: 541.8196721311475
         Else (feature 70 > -0.522211052)
          Predict: 323.85006784260514
        Else (feature 22 not in {0.0})
         Predict: 3192.0
      Else (feature 71 > -0.510666718)
       If (feature 70 <= -0.352236326)
        If (feature 70 <= -0.39579235)
         If (feature 70 <= -0.500964211)
          Predict: 464.826164874552
         Else (feature 70 > -0.500964211)
          Predict: 398.28589993502277
        Else (feature 70 > -0.39579235)
         If (feature 71 <= -0.027129886)
          Predict: 440.26066350710903
         Else (feature 71 > -0.027129886)
          Predict: 551.603498542274
       Else (feature 70 > -0.352236326)
        If (feature 71 <= 0.033312218)
         If (feature 0 in {0.0})
          Predict: 456.0635451505017
         Else (feature 0 not in {0.0})
          Predict: 559.9235668789809
        Else (feature 71 > 0.033312218)
         If (feature 70 <= -0.196072047)
          Predict: 638.6203501094092
         Else (feature 70 > -0.196072047)
          Predict: 794.9477124183006
     Else (feature 70 > 0.001523572)
      If (feature 71 <= 0.335522738)
       If (feature 70 <= 0.619806638)
        If (feature 71 <= -0.510666718)
         If (feature 71 <= -0.752435134)
          Predict: 407.78846153846155
         Else (feature 71 > -0.752435134)
          Predict: 535.4775687409551
        Else (feature 71 > -0.510666718)
         If (feature 60 in {0.0})
          Predict: 719.3900293255132
         Else (feature 60 not in {0.0})
          Predict: 1711.0
       Else (feature 70 > 0.619806638)
        If (feature 71 <= -0.631550926)
         If (feature 71 <= -0.752435134)
          Predict: 635.5238095238095
         Else (feature 71 > -0.752435134)
          Predict: 749.5431034482758
        Else (feature 71 > -0.631550926)
         If (feature 26 in {0.0})
          Predict: 954.4099378881988
         Else (feature 26 not in {0.0})
          Predict: 1675.3333333333333
      Else (feature 71 > 0.335522738)
       If (feature 70 <= 0.490200909)
        If (feature 71 <= 3.05541742)
         If (feature 59 in {0.0})
          Predict: 866.0906344410876
         Else (feature 59 not in {0.0})
          Predict: 1315.8235294117646
        Else (feature 71 > 3.05541742)
         If (feature 65 in {0.0})
          Predict: 1177.2272727272727
         Else (feature 65 not in {0.0})
          Predict: 1415.68
       Else (feature 70 > 0.490200909)
        If (feature 57 in {0.0})
         If (feature 71 <= 2.994975316)
          Predict: 1239.5906040268455
         Else (feature 71 > 2.994975316)
          Predict: 1720.4444444444443
        Else (feature 57 not in {0.0})
         If (feature 71 <= 1.967459547)
          Predict: 3714.6666666666665
         Else (feature 71 > 1.967459547)
          Predict: 1348.3333333333333
    Else (feature 69 > 0.352562473)
     If (feature 71 <= -0.631550926)
      If (feature 70 <= 0.12050588)
       If (feature 71 <= -0.69199303)
        If (feature 70 <= -0.170575838)
         If (feature 70 <= -0.345862274)
          Predict: 73.6013986013986
         Else (feature 70 > -0.345862274)
          Predict: 196.61744966442953
        Else (feature 70 > -0.170575838)
         If (feature 71 <= -0.752435134)
          Predict: 258.1530612244898
         Else (feature 71 > -0.752435134)
          Predict: 392.1090909090909
       Else (feature 71 > -0.69199303)
        If (feature 69 <= 7.884854654)
         If (feature 69 <= 1.037316307)
          Predict: 491.57954545454544
         Else (feature 69 > 1.037316307)
          Predict: 814.1644736842105
        Else (feature 69 > 7.884854654)
         If (feature 57 in {1.0})
          Predict: 1433.6666666666667
         Else (feature 57 not in {1.0})
          Predict: 2704.6666666666665
      Else (feature 70 > 0.12050588)
       If (feature 71 <= -0.752435134)
        If (feature 70 <= 0.557128457)
         If (feature 70 <= 0.365906891)
          Predict: 397.1034482758621
         Else (feature 70 > 0.365906891)
          Predict: 486.95454545454544
        Else (feature 70 > 0.557128457)
         If (feature 70 <= 0.768534523)
          Predict: 575.8
         Else (feature 70 > 0.768534523)
          Predict: 677.4655172413793
       Else (feature 71 > -0.752435134)
        If (feature 70 <= 0.40733823)
         If (feature 43 in {0.0})
          Predict: 642.5689655172414
         Else (feature 43 not in {0.0})
          Predict: 2129.5
        Else (feature 70 > 0.40733823)
         If (feature 69 <= 3.091577811)
          Predict: 818.5474137931035
         Else (feature 69 > 3.091577811)
          Predict: 1040.6875
     Else (feature 71 > -0.631550926)
      If (feature 43 in {0.0})
       If (feature 69 <= 4.461085481)
        If (feature 70 <= 0.365906891)
         If (feature 69 <= 1.037316307)
          Predict: 720.0276679841897
         Else (feature 69 > 1.037316307)
          Predict: 897.9391891891892
        Else (feature 70 > 0.365906891)
         If (feature 71 <= -0.208456198)
          Predict: 1089.4397905759163
         Else (feature 71 > -0.208456198)
          Predict: 1840.8666666666666
       Else (feature 69 > 4.461085481)
        If (feature 65 in {0.0})
         If (feature 54 in {0.0})
          Predict: 1354.351851851852
         Else (feature 54 not in {0.0})
          Predict: 3994.0
        Else (feature 65 not in {0.0})
         If (feature 71 <= -0.510666718)
          Predict: 2012.0
         Else (feature 71 > -0.510666718)
          Predict: 6663.0
      Else (feature 43 not in {0.0})
       If (feature 69 <= 1.722070142)
        If (feature 70 <= -0.357548036)
         If (feature 71 <= -0.148014094)
          Predict: 666.1392405063291
         Else (feature 71 > -0.148014094)
          Predict: 973.1147540983607
        Else (feature 70 > -0.357548036)
         If (feature 67 in {1.0})
          Predict: 757.1756756756756
         Else (feature 67 not in {1.0})
          Predict: 1401.2072072072071
       Else (feature 69 > 1.722070142)
        If (feature 59 in {0.0})
         If (feature 58 in {0.0})
          Predict: 1715.712643678161
         Else (feature 58 not in {0.0})
          Predict: 3184.5
        Else (feature 59 not in {0.0})
         If (feature 70 <= -0.357548036)
          Predict: 3002.0
         Else (feature 70 > -0.357548036)
          Predict: 4655.0
  Else (feature 70 > 0.983127614)
   If (feature 70 <= 2.973956591)
    If (feature 69 <= -0.332191362)
     If (feature 70 <= 1.859559794)
      If (feature 70 <= 1.409126771)
       If (feature 71 <= -0.510666718)
        If (feature 71 <= -0.752435134)
         If (feature 70 <= 1.15310234)
          Predict: 772.3703703703703
         Else (feature 70 > 1.15310234)
          Predict: 862.1666666666666
        Else (feature 71 > -0.752435134)
         If (feature 67 in {1.0})
          Predict: 261.4
         Else (feature 67 not in {1.0})
          Predict: 628.7846715328467
       Else (feature 71 > -0.510666718)
        If (feature 63 in {0.0})
         If (feature 70 <= 1.15310234)
          Predict: 693.839393939394
         Else (feature 70 > 1.15310234)
          Predict: 785.6523076923077
        Else (feature 63 not in {0.0})
         If (feature 71 <= 1.000385883)
          Predict: 1587.5
         Else (feature 71 > 1.000385883)
          Predict: 1579.7142857142858
      Else (feature 70 > 1.409126771)
       If (feature 32 in {1.0})
        If (feature 62 in {0.0})
         If (feature 71 <= -0.450224614)
          Predict: 282.0
         Else (feature 71 > -0.450224614)
          Predict: 698.0
        Else (feature 62 not in {0.0})
         Predict: 906.0
       Else (feature 32 not in {1.0})
        If (feature 28 in {1.0})
         If (feature 58 in {0.0})
          Predict: 28.0
         Else (feature 58 not in {0.0})
          Predict: 30.0
        Else (feature 28 not in {1.0})
         If (feature 0 in {0.0})
          Predict: 717.6716417910447
         Else (feature 0 not in {0.0})
          Predict: 871.0103626943005
     Else (feature 70 > 1.859559794)
      If (feature 70 <= 2.626570745)
       If (feature 63 in {0.0})
        If (feature 9 in {0.0})
         If (feature 66 in {1.0})
          Predict: 802.013698630137
         Else (feature 66 not in {1.0})
          Predict: 1054.6842105263158
        Else (feature 9 not in {0.0})
         If (feature 70 <= 2.238815901)
          Predict: 989.4545454545455
         Else (feature 70 > 2.238815901)
          Predict: 1396.4716981132076
       Else (feature 63 not in {0.0})
        If (feature 0 in {0.0})
         Predict: 16.0
        Else (feature 0 not in {0.0})
         If (feature 33 in {0.0})
          Predict: 1416.4634146341464
         Else (feature 33 not in {0.0})
          Predict: 2483.0
      Else (feature 70 > 2.626570745)
       If (feature 32 in {1.0})
        If (feature 0 in {0.0})
         Predict: 52.0
        Else (feature 0 not in {0.0})
         Predict: 86.0
       Else (feature 32 not in {1.0})
        If (feature 64 in {0.0})
         If (feature 17 in {1.0})
          Predict: 272.5
         Else (feature 17 not in {1.0})
          Predict: 1301.5785123966941
        Else (feature 64 not in {0.0})
         If (feature 71 <= 0.21463853)
          Predict: 1502.892857142857
         Else (feature 71 > 0.21463853)
          Predict: 1725.3333333333333
    Else (feature 69 > -0.332191362)
     If (feature 70 <= 2.238815901)
      If (feature 71 <= -0.631550926)
       If (feature 70 <= 1.409126771)
        If (feature 71 <= -0.752435134)
         If (feature 70 <= 1.15310234)
          Predict: 772.2435897435897
         Else (feature 70 > 1.15310234)
          Predict: 869.6933333333334
        Else (feature 71 > -0.752435134)
         If (feature 69 <= 1.722070142)
          Predict: 930.9389312977099
         Else (feature 69 > 1.722070142)
          Predict: 1271.0
       Else (feature 70 > 1.409126771)
        If (feature 71 <= -0.752435134)
         If (feature 70 <= 1.859559794)
          Predict: 1032.5631067961165
         Else (feature 70 > 1.859559794)
          Predict: 1229.7692307692307
        Else (feature 71 > -0.752435134)
         If (feature 17 in {0.0})
          Predict: 1290.7329545454545
         Else (feature 17 not in {0.0})
          Predict: 2419.0
      Else (feature 71 > -0.631550926)
       If (feature 9 in {0.0})
        If (feature 70 <= 1.567415734)
         If (feature 71 <= 0.093754322)
          Predict: 1200.5727272727272
         Else (feature 71 > 0.093754322)
          Predict: 1501.7961165048544
        Else (feature 70 > 1.567415734)
         If (feature 71 <= 0.516849051)
          Predict: 1522.5165562913908
         Else (feature 71 > 0.516849051)
          Predict: 3245.3
       Else (feature 9 not in {0.0})
        If (feature 69 <= 1.037316307)
         If (feature 69 <= 0.352562473)
          Predict: 1333.9285714285713
         Else (feature 69 > 0.352562473)
          Predict: 2045.8
        Else (feature 69 > 1.037316307)
         If (feature 0 in {1.0})
          Predict: 2407.0
         Else (feature 0 not in {1.0})
          Predict: 28191.0
     Else (feature 70 > 2.238815901)
      If (feature 65 in {0.0})
       If (feature 0 in {0.0})
        If (feature 69 <= 1.722070142)
         If (feature 6 in {0.0})
          Predict: 1346.0625
         Else (feature 6 not in {0.0})
          Predict: 2960.0
        Else (feature 69 > 1.722070142)
         If (feature 71 <= -0.752435134)
          Predict: 1481.5217391304348
         Else (feature 71 > -0.752435134)
          Predict: 1903.6190476190477
       Else (feature 0 not in {0.0})
        If (feature 69 <= 0.352562473)
         If (feature 38 in {0.0})
          Predict: 1513.095238095238
         Else (feature 38 not in {0.0})
          Predict: 2115.777777777778
        Else (feature 69 > 0.352562473)
         If (feature 71 <= 1.181712195)
          Predict: 1926.0506329113923
         Else (feature 71 > 1.181712195)
          Predict: 4517.0
      Else (feature 65 not in {0.0})
       If (feature 37 in {0.0})
        If (feature 71 <= -0.69199303)
         If (feature 51 in {0.0})
          Predict: 1589.7777777777778
         Else (feature 51 not in {0.0})
          Predict: 2189.0
        Else (feature 71 > -0.69199303)
         If (feature 69 <= 1.037316307)
          Predict: 2204.5
         Else (feature 69 > 1.037316307)
          Predict: 2691.5714285714284
       Else (feature 37 not in {0.0})
        Predict: 7289.0
   Else (feature 70 > 2.973956591)
    If (feature 70 <= 4.584467118)
     If (feature 69 <= -0.332191362)
      If (feature 70 <= 3.787710591)
       If (feature 66 in {1.0})
        If (feature 38 in {1.0})
         If (feature 71 <= 0.275080634)
          Predict: 495.0
         Else (feature 71 > 0.275080634)
          Predict: 98.83333333333333
        Else (feature 38 not in {1.0})
         If (feature 0 in {0.0})
          Predict: 531.6
         Else (feature 0 not in {0.0})
          Predict: 1673.2857142857142
       Else (feature 66 not in {1.0})
        If (feature 46 in {1.0})
         If (feature 65 in {1.0})
          Predict: 67.0
         Else (feature 65 not in {1.0})
          Predict: 1366.6666666666667
        Else (feature 46 not in {1.0})
         If (feature 6 in {1.0})
          Predict: 66.0
         Else (feature 6 not in {1.0})
          Predict: 1681.7447916666667
      Else (feature 70 > 3.787710591)
       If (feature 67 in {1.0})
        If (feature 38 in {0.0})
         If (feature 37 in {0.0})
          Predict: 29.0
         Else (feature 37 not in {0.0})
          Predict: 76.0
        Else (feature 38 not in {0.0})
         Predict: 127.5
       Else (feature 67 not in {1.0})
        If (feature 0 in {0.0})
         If (feature 35 in {0.0})
          Predict: 1031.8333333333333
         Else (feature 35 not in {0.0})
          Predict: 2271.6666666666665
        Else (feature 0 not in {0.0})
         If (feature 37 in {0.0})
          Predict: 2086.1176470588234
         Else (feature 37 not in {0.0})
          Predict: 3427.25
     Else (feature 69 > -0.332191362)
      If (feature 71 <= -0.752435134)
       If (feature 70 <= 3.787710591)
        If (feature 60 in {0.0})
         If (feature 69 <= 1.722070142)
          Predict: 1829.8125
         Else (feature 69 > 1.722070142)
          Predict: 1906.3333333333333
        Else (feature 60 not in {0.0})
         If (feature 0 in {1.0})
          Predict: 2007.0
         Else (feature 0 not in {1.0})
          Predict: 2042.5
       Else (feature 70 > 3.787710591)
        If (feature 56 in {0.0})
         If (feature 35 in {0.0})
          Predict: 2175.2608695652175
         Else (feature 35 not in {0.0})
          Predict: 2284.6666666666665
        Else (feature 56 not in {0.0})
         If (feature 35 in {1.0})
          Predict: 2284.0
         Else (feature 35 not in {1.0})
          Predict: 2387.0
      Else (feature 71 > -0.752435134)
       If (feature 69 <= 2.406823977)
        If (feature 70 <= 3.787710591)
         If (feature 1 in {0.0})
          Predict: 2054.9
         Else (feature 1 not in {0.0})
          Predict: 4269.5
        Else (feature 70 > 3.787710591)
         If (feature 36 in {0.0})
          Predict: 2586.4285714285716
         Else (feature 36 not in {0.0})
          Predict: 4706.0
       Else (feature 69 > 2.406823977)
        If (feature 46 in {0.0})
         If (feature 29 in {0.0})
          Predict: 2541.6923076923076
         Else (feature 29 not in {0.0})
          Predict: 4691.0
        Else (feature 46 not in {0.0})
         If (feature 59 in {0.0})
          Predict: 3601.4
         Else (feature 59 not in {0.0})
          Predict: 7389.0
    Else (feature 70 > 4.584467118)
     If (feature 69 <= -0.332191362)
      If (feature 51 in {1.0})
       If (feature 71 <= -0.631550926)
        If (feature 62 in {1.0})
         Predict: 84.0
        Else (feature 62 not in {1.0})
         Predict: 526.0
       Else (feature 71 > -0.631550926)
        If (feature 62 in {0.0})
         Predict: 1067.0
        Else (feature 62 not in {0.0})
         Predict: 1173.0
      Else (feature 51 not in {1.0})
       If (feature 60 in {1.0})
        If (feature 0 in {0.0})
         If (feature 5 in {1.0})
          Predict: 16.0
         Else (feature 5 not in {1.0})
          Predict: 32.0
        Else (feature 0 not in {0.0})
         Predict: 2453.0
       Else (feature 60 not in {1.0})
        If (feature 67 in {1.0})
         Predict: 68.0
        Else (feature 67 not in {1.0})
         If (feature 59 in {1.0})
          Predict: 1377.0
         Else (feature 59 not in {1.0})
          Predict: 2629.415
     Else (feature 69 > -0.332191362)
      If (feature 57 in {0.0})
       If (feature 71 <= -0.510666718)
        If (feature 71 <= -0.571108822)
         If (feature 69 <= 1.722070142)
          Predict: 3481.159574468085
         Else (feature 69 > 1.722070142)
          Predict: 3075.296296296296
        Else (feature 71 > -0.571108822)
         If (feature 26 in {0.0})
          Predict: 2462.818181818182
         Else (feature 26 not in {0.0})
          Predict: 3048.3333333333335
       Else (feature 71 > -0.510666718)
        If (feature 0 in {1.0})
         If (feature 46 in {0.0})
          Predict: 2920.5833333333335
         Else (feature 46 not in {0.0})
          Predict: 4854.4
        Else (feature 0 not in {1.0})
         If (feature 69 <= 0.352562473)
          Predict: 4242.666666666667
         Else (feature 69 > 0.352562473)
          Predict: 5217.166666666667
      Else (feature 57 not in {0.0})
       If (feature 71 <= -0.450224614)
        If (feature 32 in {1.0})
         If (feature 0 in {1.0})
          Predict: 566.0
         Else (feature 0 not in {1.0})
          Predict: 3803.0
        Else (feature 32 not in {1.0})
         If (feature 71 <= -0.631550926)
          Predict: 3797.923076923077
         Else (feature 71 > -0.631550926)
          Predict: 5300.533333333334
       Else (feature 71 > -0.450224614)
        If (feature 46 in {1.0})
         Predict: 989.0
        Else (feature 46 not in {1.0})
         If (feature 1 in {1.0})
          Predict: 2504.0
         Else (feature 1 not in {1.0})
          Predict: 3164.8571428571427
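
The same test metrics can equivalently come from RegressionMetrics, which is already imported above; a minimal sketch (note it expects (prediction, observation) pairs):

In [ ]:
# labelsAndPredictions holds (label, prediction) pairs, so swap the order.
dt_metrics = RegressionMetrics(labelsAndPredictions.map(lambda lp: (lp[1], lp[0])))
print 'MSE =', dt_metrics.meanSquaredError
print 'RMSE =', dt_metrics.rootMeanSquaredError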


In [27]:
# Save and load model
#model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
#sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")

In [ ]:
# Some error functions for evaluating the trees
def squared_error(actual, pred):
    return (pred - actual)**2

def abs_error(actual, pred):
    return np.abs(pred - actual)

def squared_log_error(actual, pred):
    return (np.log(pred + 1) - np.log(actual + 1))**2
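
A tiny worked example of the three error functions (sketch):

In [ ]:
# For actual=100.0, pred=110.0:
# squared_error -> 100.0, abs_error -> 10.0,
# squared_log_error -> (ln(111) - ln(101))**2, roughly 0.0089
print squared_error(100.0, 110.0)
print abs_error(100.0, 110.0)
print squared_log_error(100.0, 110.0)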

In [ ]:
# Helper function to train and evaluate a tree for a given depth and bins
def evaluate_dt(train, test, maxDepth, maxBins):
    model = DecisionTree.trainRegressor(train
                                        ,{}
                                        ,impurity='variance'
                                        ,maxDepth=maxDepth
                                        ,maxBins=maxBins)
    
    preds = model.predict(test.map(lambda p: p.features))
    
    actual = test.map(lambda p: p.label)
    
    tp = actual.zip(preds)
    
    rmse = np.sqrt(tp.map(lambda (t, p): squared_error(t, p)).mean())
    
    return rmse

In [ ]:
# Tree depth
params = [5,8,10,20,30]
metrics = [evaluate_dt(trainingData, testData, param, 32) for param in params]

print 'Chosen parameters:', params
print 'RMSE per parameter:', metrics

plot(params, metrics)
fig = matplotlib.pyplot.gcf()

In [ ]:
# Maximum number of bins
params = [2, 4, 8, 16, 32, 64, 100, 200]
metrics = [evaluate_dt(trainingData, testData, 5, param) for param in params]

print 'Chosen parameters:', params
print 'RMSE per parameter:', metrics

plot(params, metrics)
fig = matplotlib.pyplot.gcf()

In [ ]:
# Maximum number of bins (larger values)
params = [200, 400, 500, 700, 1000]
metrics = [evaluate_dt(trainingData, testData, 5, param) for param in params]

print 'Chosen parameters:', params
print 'RMSE per parameter:', metrics

plot(params, metrics)
fig = matplotlib.pyplot.gcf()

In [ ]:
# Some experiments with linear regression
help(LinearRegressionWithSGD)

In [28]:
def linearRegression_MSE(data, regMode, regForce, numInteracoes, stepConvergence):
    model = LinearRegressionWithSGD.train(data
                                          ,iterations=numInteracoes
                                          , step=stepConvergence
                                          , miniBatchFraction=1.0
                                          , initialWeights=None
                                          , regParam=regForce
                                          , regType=regMode
                                          , intercept=True
                                          , validateData=True
                                          , convergenceTol=0.001)
   
    # Information about the model
    print "Regularizer = " + str(regMode)
    
    # Strength of the regularizer
    print "Regularization parameter = " + str(regForce)   
    
    # (label, prediction) pairs used for the metrics. Note these are computed
    # over the full parsedData set, i.e. in-sample, not on the held-out split.
    Preds = parsedData.map(lambda p: (p.label, float(model.predict(p.features))))
    
    # Get metrics
    metrics = RegressionMetrics(Preds)
    
    # Squared error
    print("MSE = %s" % metrics.meanSquaredError)
    print("RMSE = %s" % metrics.rootMeanSquaredError)

    # R-squared
    print("R-squared = %s" % metrics.r2)

    # Mean absolute error
    print("MAE = %s" % metrics.meanAbsoluteError)

    # Explained variance
    print("Explained variance = %s" % metrics.explainedVariance)
        
    return

In [29]:
# No regularization
print '\n Model 1'
RegLinear_1 = linearRegression_MSE(trainingData, None, 0, 10000, 0.001)
print '\n Model 2'
RegLinear_2 = linearRegression_MSE(trainingData, None, 0, 100000, 0.001)
print '\n Model 3'
RegLinear_3 = linearRegression_MSE(trainingData, None, 0, 500000, 0.001)
print '\n Model 4'
RegLinear_4 = linearRegression_MSE(trainingData, None, 0, 10000, 0.01)
print '\n Model 5'
RegLinear_5 = linearRegression_MSE(trainingData, None, 0, 100000, 0.01)
print '\n Model 6'
RegLinear_6 = linearRegression_MSE(trainingData, None, 0, 500000, 0.01)
print '\n Model 7'
RegLinear_7 = linearRegression_MSE(trainingData, None, 0, 10000, 0.1)
print '\n Model 8'
RegLinear_8 = linearRegression_MSE(trainingData, None, 0, 100000, 0.1)
print '\n Model 9'
RegLinear_9 = linearRegression_MSE(trainingData, None, 0, 500000, 0.1)
print '\n Model 10'
RegLinear_10 = linearRegression_MSE(trainingData, None, 0, 10000, 1.0)
print '\n Model 11'
RegLinear_11 = linearRegression_MSE(trainingData, None, 0, 100000, 1.0)
print '\n Model 12'
RegLinear_12 = linearRegression_MSE(trainingData, None, 0, 500000, 1.0)


/Users/flavio.clesio/Documents/spark-2.1.0/python/pyspark/mllib/regression.py:281: UserWarning: Deprecated in 2.0.0. Use ml.regression.LinearRegression.
  warnings.warn("Deprecated in 2.0.0. Use ml.regression.LinearRegression.")

 Model 1
Regularizer = None
Regularization parameter = 0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Model 2
Regularizer = None
Regularization parameter = 0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Model 3
Regularizer = None
Regularization parameter = 0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Model 4
Regularizer = None
Regularization parameter = 0
MSE = 213966.788636
RMSE = 462.565442544
R-squared = -20.5264279018
MAE = 202.79982463
Explained variance = 274539.42919

 Model 5
Regularizer = None
Regularization parameter = 0
MSE = 213966.788636
RMSE = 462.565442544
R-squared = -20.5264279018
MAE = 202.79982463
Explained variance = 274539.42919

 Model 6
Regularizer = None
Regularization parameter = 0
MSE = 213966.788636
RMSE = 462.565442544
R-squared = -20.5264279018
MAE = 202.79982463
Explained variance = 274539.42919

 Model 7
Regularizer = None
Regularization parameter = 0
MSE = 111282.773454
RMSE = 333.590727469
R-squared = -0.0837590722608
MAE = 158.749994678
Explained variance = 250733.366898

 Model 8
Regularizer = None
Regularization parameter = 0
MSE = 111282.773454
RMSE = 333.590727469
R-squared = -0.0837590722608
MAE = 158.749994678
Explained variance = 250733.366898

 Model 9
Regularizer = None
Regularization parameter = 0
MSE = 111282.773454
RMSE = 333.590727469
R-squared = -0.0837590722608
MAE = 158.749994678
Explained variance = 250733.366898

 Model 10
Regularizer = None
Regularization parameter = 0
MSE = 105210.015491
RMSE = 324.360933977
R-squared = 0.272697918031
MAE = 146.345274654
Explained variance = 250442.108293

 Model 11
Regularizer = None
Regularization parameter = 0
MSE = 105210.015491
RMSE = 324.360933977
R-squared = 0.272697918031
MAE = 146.345274654
Explained variance = 250442.108293

 Model 12
Regularizer = None
Regularization parameter = 0
MSE = 105210.015491
RMSE = 324.360933977
R-squared = 0.272697918031
MAE = 146.345274654
Explained variance = 250442.108293

In [30]:
# L1 regularization (Lasso)
print '\n Model 1'
LASSO_1 = linearRegression_MSE(trainingData, "l1", 0.0, 10000, 0.001)
print '\n Model 2'
LASSO_2 = linearRegression_MSE(trainingData, "l1", 0.0, 100000, 0.001)
print '\n Model 3'
LASSO_3 = linearRegression_MSE(trainingData, "l1", 0.0, 500000, 0.001)
print '\n Model 4'
LASSO_4 = linearRegression_MSE(trainingData, "l1", 0.01, 10000, 0.001)
print '\n Model 5'
LASSO_5 = linearRegression_MSE(trainingData, "l1", 0.01, 100000, 0.001)
print '\n Model 6'
LASSO_6 = linearRegression_MSE(trainingData, "l1", 0.01, 500000, 0.001)
print '\n Model 7'
LASSO_7 = linearRegression_MSE(trainingData, "l1", 0.025, 10000, 0.01)
print '\n Model 8'
LASSO_8 = linearRegression_MSE(trainingData, "l1", 0.025, 100000, 0.01)
print '\n Model 9'
LASSO_9 = linearRegression_MSE(trainingData, "l1", 0.025, 500000, 0.01)
print '\n Model 10'
LASSO_10 = linearRegression_MSE(trainingData, "l1", 0.05, 10000, 0.1)
print '\n Model 11'
LASSO_11 = linearRegression_MSE(trainingData, "l1", 0.05, 100000, 0.1)
print '\n Model 12'
LASSO_12 = linearRegression_MSE(trainingData, "l1", 0.05, 500000, 0.1)
print '\n Model 13'
LASSO_13 = linearRegression_MSE(trainingData, "l1", 0.1, 10000, 1.0)
print '\n Model 14'
LASSO_14 = linearRegression_MSE(trainingData, "l1", 0.1, 100000, 1.0)
print '\n Model 15'
LASSO_15 = linearRegression_MSE(trainingData, "l1", 0.1, 500000, 1.0)
print '\n Model 16'
LASSO_16 = linearRegression_MSE(trainingData, "l1", 10.0, 10000, 1.0)
print '\n Model 17'
LASSO_17 = linearRegression_MSE(trainingData, "l1", 10.0, 100000, 1.0)
print '\n Model 18'
LASSO_18 = linearRegression_MSE(trainingData, "l1", 10.0, 500000, 1.0)
print '\n Model 19'
LASSO_19 = linearRegression_MSE(trainingData, "l1", 100.0, 10000, 1.0)
print '\n Model 20'
LASSO_20 = linearRegression_MSE(trainingData, "l1", 100.0, 100000, 1.0)
print '\n Model 21'
LASSO_21 = linearRegression_MSE(trainingData, "l1", 100.0, 500000, 1.0)
print '\n Model 22'
LASSO_22 = linearRegression_MSE(trainingData, "l1", 1000.0, 10000, 1.0)
print '\n Model 23'
LASSO_23 = linearRegression_MSE(trainingData, "l1", 1000.0, 100000, 1.0)
print '\n Model 24'
LASSO_24 = linearRegression_MSE(trainingData, "l1", 1000.0, 500000, 1.0)


 Model 1
Regularizer = l1
Regularization parameter = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Model 2
Regularizer = l1
Regularization parameter = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Model 3
Regularizer = l1
Regularization parameter = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Model 4
Regularizer = l1
Regularization parameter = 0.01
MSE = 311154.803838
RMSE = 557.812516745
R-squared = -1857.22127405
MAE = 263.420528702
Explained variance = 319599.878099

 Model 5
Regularizer = l1
Regularization parameter = 0.01
MSE = 311154.803838
RMSE = 557.812516745
R-squared = -1857.22127405
MAE = 263.420528702
Explained variance = 319599.878099

 Model 6
Regularizer = l1
Regularization parameter = 0.01
MSE = 311154.803838
RMSE = 557.812516745
R-squared = -1857.22127405
MAE = 263.420528702
Explained variance = 319599.878099

 Model 7
Regularizer = l1
Regularization parameter = 0.025
MSE = 213980.660092
RMSE = 462.580436348
R-squared = -20.5333764229
MAE = 202.804773114
Explained variance = 274547.24839

 Model 8
Regularizer = l1
Regularization parameter = 0.025
MSE = 213980.660092
RMSE = 462.580436348
R-squared = -20.5333764229
MAE = 202.804773114
Explained variance = 274547.24839

 Model 9
Regularizer = l1
Regularization parameter = 0.025
MSE = 213980.660092
RMSE = 462.580436348
R-squared = -20.5333764229
MAE = 202.804773114
Explained variance = 274547.24839

 Model 10
Regularizer = l1
Regularization parameter = 0.05
MSE = 111295.372912
RMSE = 333.60961154
R-squared = -0.0844165037554
MAE = 158.738628187
Explained variance = 250737.025384

 Model 11
Regularizer = l1
Regularization parameter = 0.05
MSE = 111295.372912
RMSE = 333.60961154
R-squared = -0.0844165037554
MAE = 158.738628187
Explained variance = 250737.025384

 Model 12
Regularizer = l1
Regularization parameter = 0.05
MSE = 111295.372912
RMSE = 333.60961154
R-squared = -0.0844165037554
MAE = 158.738628187
Explained variance = 250737.025384

 Model 13
Regularizer = l1
Regularization parameter = 0.1
MSE = 105223.168648
RMSE = 324.381208839
R-squared = 0.271874252719
MAE = 146.326140281
Explained variance = 250442.198543

 Model 14
Regularizer = l1
Regularization parameter = 0.1
MSE = 105223.168648
RMSE = 324.381208839
R-squared = 0.271874252719
MAE = 146.326140281
Explained variance = 250442.198543

 Model 15
Regularizer = l1
Regularization parameter = 0.1
MSE = 105223.168648
RMSE = 324.381208839
R-squared = 0.271874252719
MAE = 146.326140281
Explained variance = 250442.198543

 Model 16
Regularizer = l1
Regularization parameter = 10.0
MSE = 106418.740074
RMSE = 326.218853033
R-squared = 0.189870124349
MAE = 142.568133568
Explained variance = 250576.613493

 Model 17
Regularizer = l1
Regularization parameter = 10.0
MSE = 106418.740074
RMSE = 326.218853033
R-squared = 0.189870124349
MAE = 142.568133568
Explained variance = 250576.613493

 Model 18
Regularizer = l1
Regularization parameter = 10.0
MSE = 106418.740074
RMSE = 326.218853033
R-squared = 0.189870124349
MAE = 142.568133568
Explained variance = 250576.613493

 Model 19
Regularizer = l1
Regularization parameter = 100.0
MSE = 149644.455923
RMSE = 386.839056874
R-squared = -2.03598190209
MAE = 166.271298465
Explained variance = 260229.426941

 Model 20
Regularizer = l1
Regularization parameter = 100.0
MSE = 149644.455923
RMSE = 386.839056874
R-squared = -2.03598190209
MAE = 166.271298465
Explained variance = 260229.426941

 Model 21
Regularizer = l1
Regularization parameter = 100.0
MSE = 149644.455923
RMSE = 386.839056874
R-squared = -2.03598190209
MAE = 166.271298465
Explained variance = 260229.426941

 Model 22
Regularizer = l1
Regularization parameter = 1000.0
MSE = 330165.38039
RMSE = 574.600191777
R-squared = -inf
MAE = 282.353109394
Explained variance = 330165.38039

 Model 23
Regularizer = l1
Regularization parameter = 1000.0
MSE = 330165.38039
RMSE = 574.600191777
R-squared = -inf
MAE = 282.353109394
Explained variance = 330165.38039

 Model 24
Regularizer = l1
Regularization parameter = 1000.0
MSE = 330165.38039
RMSE = 574.600191777
R-squared = -inf
MAE = 282.353109394
Explained variance = 330165.38039

In [31]:
# L2 regularization (Ridge)
print '\n Model 1'
RIDGE_1 = linearRegression_MSE(trainingData, "l2", 0.0, 10000, 0.001)
print '\n Model 2'
RIDGE_2 = linearRegression_MSE(trainingData, "l2", 0.0, 100000, 0.001)
print '\n Model 3'
RIDGE_3 = linearRegression_MSE(trainingData, "l2", 0.0, 500000, 0.001)
print '\n Model 4'
RIDGE_4 = linearRegression_MSE(trainingData, "l2", 0.01, 10000, 0.001)
print '\n Model 5'
RIDGE_5 = linearRegression_MSE(trainingData, "l2", 0.01, 100000, 0.001)
print '\n Model 6'
RIDGE_6 = linearRegression_MSE(trainingData, "l2", 0.01, 500000, 0.001)
print '\n Model 7'
RIDGE_7 = linearRegression_MSE(trainingData, "l2", 0.1, 10000, 1.0)
print '\n Model 8'
RIDGE_8 = linearRegression_MSE(trainingData, "l2", 0.1, 100000, 1.0)
print '\n Model 9'
RIDGE_9 = linearRegression_MSE(trainingData, "l2", 0.1, 500000, 1.0)
print '\n Model 10'
RIDGE_10 = linearRegression_MSE(trainingData, "l2", 1.0, 10000, 1.0)
print '\n Model 11'
RIDGE_11 = linearRegression_MSE(trainingData, "l2", 1.0, 100000, 1.0)
print '\n Model 12'
RIDGE_12 = linearRegression_MSE(trainingData, "l2", 1.0, 500000, 1.0)
print '\n Model 13'
RIDGE_13 = linearRegression_MSE(trainingData, "l2", 5.0, 10000, 1.0)
print '\n Model 14'
RIDGE_14 = linearRegression_MSE(trainingData, "l2", 5.0, 100000, 1.0)
print '\n Model 15'
RIDGE_15 = linearRegression_MSE(trainingData, "l2", 5.0, 500000, 1.0)
print '\n Model 16'
RIDGE_16 = linearRegression_MSE(trainingData, "l2", 10.0, 10000, 1.0)
print '\n Model 17'
RIDGE_17 = linearRegression_MSE(trainingData, "l2", 10.0, 100000, 1.0)
print '\n Model 18'
RIDGE_18 = linearRegression_MSE(trainingData, "l2", 10.0, 500000, 1.0)
print '\n Model 19'
RIDGE_19 = linearRegression_MSE(trainingData, "l2", 20.0, 10000, 1.0)
print '\n Model 20'
RIDGE_20 = linearRegression_MSE(trainingData, "l2", 20.0, 100000, 1.0)
print '\n Model 21'
RIDGE_21 = linearRegression_MSE(trainingData, "l2", 20.0, 500000, 1.0)


 Model 1
Regularizer = l2
Regularization parameter = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Model 2
Regularizer = l2
Regularization parameter = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Model 3
Regularizer = l2
Regularization parameter = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Model 4
Regularizer = l2
Regularization parameter = 0.01
MSE = 311157.64097
RMSE = 557.815059827
R-squared = -1857.82319897
MAE = 263.423132852
Explained variance = 319601.358708

 Model 5
Regularizer = l2
Regularization parameter = 0.01
MSE = 311157.64097
RMSE = 557.815059827
R-squared = -1857.82319897
MAE = 263.423132852
Explained variance = 319601.358708

 Model 6
Regularizer = l2
Regularization parameter = 0.01
MSE = 311157.64097
RMSE = 557.815059827
R-squared = -1857.82319897
MAE = 263.423132852
Explained variance = 319601.358708

 Model 7
Regularizer = l2
Regularization parameter = 0.1
MSE = 108533.636583
RMSE = 329.44443626
R-squared = 0.0274171276519
MAE = 150.366265573
Explained variance = 250896.924282

 Model 8
Regularizer = l2
Regularization parameter = 0.1
MSE = 108533.636583
RMSE = 329.44443626
R-squared = 0.0274171276519
MAE = 150.366265573
Explained variance = 250896.924282

 Model 9
Regularizer = l2
Regularization parameter = 0.1
MSE = 108533.636583
RMSE = 329.44443626
R-squared = 0.0274171276519
MAE = 150.366265573
Explained variance = 250896.924282

 Model 10
Regularizer = l2
Regularization parameter = 1.0
MSE = 169795.900715
RMSE = 412.06298149
R-squared = -5.44385954171
MAE = 182.182818202
Explained variance = 264117.928455

 Model 11
Regularizer = l2
Regularization parameter = 1.0
MSE = 169795.900715
RMSE = 412.06298149
R-squared = -5.44385954171
MAE = 182.182818202
Explained variance = 264117.928455

 Model 12
Regularizer = l2
Regularization parameter = 1.0
MSE = 169795.900715
RMSE = 412.06298149
R-squared = -5.44385954171
MAE = 182.182818202
Explained variance = 264117.928455

 Model 13
Regularizer = l2
Regularization parameter = 5.0
MSE = 263852.477011
RMSE = 513.665724972
R-squared = -99.4992005856
MAE = 227.78929503
Explained variance = 297209.056763

 Model 14
Regularizer = l2
Regularization parameter = 5.0
MSE = 263852.477011
RMSE = 513.665724972
R-squared = -99.4992005856
MAE = 227.78929503
Explained variance = 297209.056763

 Model 15
Regularizer = l2
Regularization parameter = 5.0
MSE = 263852.477011
RMSE = 513.665724972
R-squared = -99.4992005856
MAE = 227.78929503
Explained variance = 297209.056763

 Model 16
Regularizer = l2
Regularization parameter = 10.0
MSE = 291801.749924
RMSE = 540.186773185
R-squared = -372.840605223
MAE = 247.447366799
Explained variance = 310155.778655

 Model 17
Regularizer = l2
Regularization parameter = 10.0
MSE = 291801.749924
RMSE = 540.186773185
R-squared = -372.840605223
MAE = 247.447366799
Explained variance = 310155.778655

 Model 18
Regularizer = l2
Regularization parameter = 10.0
MSE = 291801.749924
RMSE = 540.186773185
R-squared = -372.840605223
MAE = 247.447366799
Explained variance = 310155.778655

 Model 19
Regularizer = l2
Regularization parameter = 20.0
MSE = 309325.923449
RMSE = 556.170768244
R-squared = -1435.40821365
MAE = 262.340113205
Explained variance = 318988.155718

 Model 20
Regularizer = l2
Regularization parameter = 20.0
MSE = 309325.923449
RMSE = 556.170768244
R-squared = -1435.40821365
MAE = 262.340113205
Explained variance = 318988.155718

 Model 21
Regularizer = l2
Regularization parameter = 20.0
MSE = 309325.923449
RMSE = 556.170768244
R-squared = -1435.40821365
MAE = 262.340113205
Explained variance = 318988.155718

In [ ]:
# More decision tree experiments
help(DecisionTree.trainRegressor)

In [ ]:
# More decision tree experiments
dt_model = DecisionTree.trainRegressor(parsedData
                                       ,categoricalFeaturesInfo = {}
                                       , impurity='variance'
                                       , maxDepth=5
                                       , maxBins=32
                                       , minInstancesPerNode=1
                                       , minInfoGain=0.0)

In [ ]:
# Evaluate model on test instances and compute test error
predictions = dt_model.predict(testData.map(lambda x: x.features))

In [ ]:
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

In [ ]:
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
    float(testData.count())

In [ ]:
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(dt_model.toDebugString())

In [ ]:
# Save and load model
#model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
#sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")

In [ ]:
# Paramgrid: https://github.com/MingChen0919/learning-apache-spark/blob/master/linear-regression.ipynb
# Paramgrid: https://mapr.com/blog/churn-prediction-pyspark-using-mllib-and-ml-packages/
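
A minimal sketch of the param-grid idea from the links above, using the pyspark.ml API. Here `df` is a hypothetical DataFrame with 'features' and 'label' columns, which this notebook never builds:

In [ ]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Grid over regularization strength and the L1/L2 mix
lr = LinearRegression(featuresCol='features', labelCol='label')
grid = (ParamGridBuilder()
        .addGrid(lr.regParam, [0.01, 0.1, 1.0])
        .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
        .build())

# 3-fold cross-validation scored by RMSE
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid,
                    evaluator=RegressionEvaluator(metricName='rmse'),
                    numFolds=3)
# cv_model = cv.fit(df)  # df: hypothetical DataFrame, not built in this notebook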

Training the linear model
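
The cells below refer to RDDs named `data` and `data_dt` that are never defined earlier in this notebook. A minimal sketch of one workable assumption (reusing the parsed LabeledPoint RDD for both) so the cells can run:

In [ ]:
# Assumption (hypothetical, not part of the original run): reuse parsedData
# for both the linear-model data and the decision-tree data. In the original
# source these may differ (e.g. the tree version may keep raw categorical
# values instead of dummy variables).
data = parsedData
data_dt = parsedData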


In [ ]:
linear_model = LinearRegressionWithSGD.train(data
                                             ,iterations=10
                                             ,step=0.1
                                             ,intercept=False)

In [ ]:
# Map over the dataset and, for each LabeledPoint, pair the true value
# (p.label) with the model's prediction on p.features
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))

In [ ]:
print "Linear Model predictions: " + str(true_vs_predicted.take(5))

In [ ]:
# Now the decision tree regressor. When there are categorical variables,
# they must be declared through the categoricalFeaturesInfo argument; here
# it is left empty, so the features are used as-is.
dt_model = DecisionTree.trainRegressor(data_dt
                                       ,{})

# The key difference to understand: the linear model strictly requires the
# dummy variables to be encoded in the feature vector, whereas the decision
# tree, because of how it splits, does not always need that encoding,
# although it is still recommended.

In [ ]:
preds = dt_model.predict(data_dt.map(lambda p: p.features))

In [ ]:
actual = data.map(lambda p: p.label)

In [ ]:
true_vs_predicted_dt = actual.zip(preds)

In [ ]:
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))

In [ ]:
print "Decision Tree depth: " + str(dt_model.depth())

In [ ]:
print "Decision Tree number of nodes: " + str(dt_model.numNodes())

Tuning the model


In [ ]:
# Index every record, producing (index, record) key-value pairs
data_with_idx = data.zipWithIndex().map(lambda (k, v): (v, k))

In [ ]:
# The test set is sampled as 20% of the full dataset
test = data_with_idx.sample(False, 0.2, 42)

In [ ]:
# subtractByKey() removes the overlapping instances from the full set,
# i.e. the remaining 80% becomes the training set
train = data_with_idx.subtractByKey(test)
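
A disjointness check on the manual split (sketch):

In [ ]:
# The sampled test keys were subtracted away, so the intersection of the
# two key sets should be empty (count of 0).
print train.keys().intersection(test.keys()).count()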

In [ ]:
train_data = train.map(lambda (idx, p): p)
test_data = test.map(lambda (idx, p) : p)
train_size = train_data.count()
test_size = test_data.count()

In [ ]:
print "Training data size: %d" % train_size
print "Test data size: %d" % test_size
print "Total data size: %d " % num_data
print "Train + Test size : %d" % (train_size + test_size)

In [ ]:
data_with_idx_dt = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
test_dt = data_with_idx_dt.sample(False, 0.2, 42)
train_dt = data_with_idx_dt.subtractByKey(test_dt)
train_data_dt = train_dt.map(lambda (idx, p): p)
test_data_dt = test_dt.map(lambda (idx, p) : p)

In [ ]:
def evaluate(train, test, iterations, step, regParam, regType,intercept):
    model = LinearRegressionWithSGD.train(train
                                          ,iterations
                                          ,step
                                          ,regParam=regParam
                                          ,regType=regType
                                          ,intercept=intercept)
    
    tp = test.map(lambda p: (p.label, model.predict(p.features)))
    
    rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t, p)).mean())
    
    return rmsle

In [ ]:
# Number of iterations as the parameter
params = [1, 5, 10, 20, 50, 100]
metrics = [evaluate(train_data, test_data, param, 0.01, 0.0, 'l2', False) for param in params]

print 'Chosen parameters:', params
print 'RMSLE per parameter:', metrics

In [ ]:
matplotlib.pyplot.plot(params, metrics)
matplotlib.pyplot.show()

In [ ]:
# Step size
params = [0.01, 0.025, 0.05, 0.1, 1.0]
metrics = [evaluate(train_data, test_data, 10, param, 0.0, 'l2',False) for param in params]

print 'Chosen parameters:', params
print 'RMSLE per parameter:', metrics

In [ ]:
# Ridge regularization (L2)
params = [0.0, 0.01, 0.1, 1.0, 5.0, 10.0, 20.0]
metrics = [evaluate(train_data, test_data, 10, 0.1, param, 'l2', False) for param in params]

print 'Chosen parameters:', params
print 'RMSLE per parameter:', metrics

In [ ]:
matplotlib.pyplot.plot(params, metrics)
matplotlib.pyplot.xscale('log')
matplotlib.pyplot.show()

In [ ]:
# Lasso regularization (L1)
params = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
metrics = [evaluate(train_data, test_data, 10, 0.1, param, 'l1', False) for param in params]

print 'Chosen parameters:', params
print 'RMSLE per parameter:', metrics

In [ ]:
matplotlib.pyplot.plot(params, metrics)
matplotlib.pyplot.xscale('log')
matplotlib.pyplot.show()

In [ ]:
model_l1 = LinearRegressionWithSGD.train(train_data, 10, 0.1,regParam=1.0, regType='l1', intercept=False)
model_l1_10 = LinearRegressionWithSGD.train(train_data, 10, 0.1,regParam=10.0, regType='l1', intercept=False)
model_l1_100 = LinearRegressionWithSGD.train(train_data, 10, 0.1,regParam=100.0, regType='l1', intercept=False)

In [ ]:
print "L1 (1.0) number of zero weights: " + str(sum(model_l1.weights.array == 0))
print "L1 (10.0) number of zero weights: " + str(sum(model_l1_10.weights.array == 0))
print "L1 (100.0) number of zero weights: " + str(sum(model_l1_100.weights.array == 0))
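
To see which coefficients the strongest penalty eliminated, the zeroed indices can be listed directly (sketch):

In [ ]:
# Indices of the weights driven exactly to zero by regParam=100.0
print np.where(model_l1_100.weights.array == 0)[0]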

In [ ]:
# The more aggressive the regularization, the more weights are driven to zero

In [ ]:
# Intercept
params = [False, True]
metrics = [evaluate(train_data, test_data, 10, 0.1, 1.0, 'l2', param) for param in params]
print 'Chosen parameters:', params
print 'RMSLE per parameter:', metrics

In [ ]:
# Bar chart for the two intercept settings; a log scale makes no sense for
# a boolean axis, so plain category positions are used instead
matplotlib.pyplot.bar(range(len(params)), metrics)
matplotlib.pyplot.xticks(range(len(params)), [str(p) for p in params])
matplotlib.pyplot.show()

Impact of the parameters on the decision tree regressor


In [ ]:
# Re-definition of evaluate_dt, this time returning RMSLE instead of RMSE
def evaluate_dt(train, test, maxDepth, maxBins):
    model = DecisionTree.trainRegressor(train
                                        , {}
                                        ,impurity='variance'
                                        ,maxDepth=maxDepth
                                        ,maxBins=maxBins)
    
    preds = model.predict(test.map(lambda p: p.features))
    
    actual = test.map(lambda p: p.label)
    
    tp = actual.zip(preds)
    
    rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t,p)).mean())
    
    return rmsle

In [ ]:
# Tree depth
params = [1, 2, 3, 4, 5, 10, 20]
metrics = [evaluate_dt(train_data_dt, test_data_dt, param, 32) for param in params]

print 'Chosen parameters:', params
print 'RMSLE per parameter:', metrics

plot(params, metrics)
fig = matplotlib.pyplot.gcf()

In [ ]:
# Maximum number of bins
params = [2, 4, 8, 16, 32, 64, 100]
metrics = [evaluate_dt(train_data_dt, test_data_dt, 5, param) for param in params]

print 'Chosen parameters:', params
print 'RMSLE per parameter:', metrics

plot(params, metrics)
fig = matplotlib.pyplot.gcf()

In [ ]:
# Save and load model
#model.save(sc, "home/myDecisionTreeClassificationModel")

In [ ]:
#sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")