In [1]:
from pyspark import SparkContext
sc = SparkContext('local', 'example')   # local Spark context named 'example'
In [2]:
from pyspark.mllib.linalg import Vectors
x = Vectors.dense([1,2,3,4])
x[0]
Out[2]:
1.0
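MLlib also provides sparse vectors for mostly-zero features; a minimal sketch (the variable xs is illustrative, not part of the original notebook):
In [ ]:
# sparse counterpart of the dense vector above: Vectors.sparse(size, indices, values)
xs = Vectors.sparse(4, [0, 2], [1.0, 3.0])   # represents [1.0, 0.0, 3.0, 0.0]
xs.toArray()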
In [6]:
x = [Vectors.dense([1, 2, 3, 4, 5]), Vectors.dense([6, 7, 8, 9, 10])]
xrdd = sc.parallelize(x, 2)   # distribute the list of vectors over 2 partitions
xrdd.glom().collect()         # glom() shows the contents of each partition
Out[6]:
[[DenseVector([1.0, 2.0, 3.0, 4.0, 5.0])], [DenseVector([6.0, 7.0, 8.0, 9.0, 10.0])]]
In [7]:
from pyspark.mllib.regression import LabeledPoint as LP
pt = LP(1, Vectors.dense(2, -1, 4))   # label 1 paired with a 3-dimensional feature vector
print("Label: ", pt.label)
print("Feature Vector: ", pt.features)
In [ ]:
from pyspark.mllib.classification import NaiveBayes as NB
nbmodel = NB.train(Xrdd_train)                                   # Xrdd_train: RDD of LabeledPoint
testpred = nbmodel.predict(Xrdd_test.map(lambda p: p.features))  # RDD of predicted labels
In [ ]:
trainpred = nbmodel.predict(Xrdd_train.map(lambda p: p.features))
cf_mat = [[0, 0], [0, 0]]   # confusion matrix: rows = true label, columns = prediction
for label, pred in Xrdd_train.map(lambda p: p.label).zip(trainpred).collect():
    cf_mat[int(label)][int(pred)] += 1
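One way to reduce the confusion matrix above to a single training-accuracy number (a sketch using the cf_mat built in the previous cell):
In [ ]:
# training accuracy = correctly classified points / total points
correct = cf_mat[0][0] + cf_mat[1][1]
total = sum(sum(row) for row in cf_mat)
print("Training accuracy: ", correct / float(total))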
In [ ]:
from pyspark.mllib.tree import DecisionTree
dtmodel = DecisionTree.trainClassifier(Xrdd_train,
                                       numClasses = 2,
                                       categoricalFeaturesInfo = {},  # all features treated as continuous
                                       impurity = 'entropy',          # options: 'gini' or 'entropy'
                                       maxDepth = 5,
                                       maxBins = 32,
                                       minInstancesPerNode = 2)
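A sketch of inspecting the fitted tree and scoring the test set, reusing the predict-then-zip pattern from the Naive Bayes cells (dtpred and test_err are illustrative names):
In [ ]:
# print the learned decision rules and compute the test error rate
print(dtmodel.toDebugString())
dtpred = dtmodel.predict(Xrdd_test.map(lambda p: p.features))
test_err = (Xrdd_test.map(lambda p: p.label).zip(dtpred)
            .filter(lambda lp: lp[0] != lp[1]).count() / float(Xrdd_test.count()))
print("Test error: ", test_err)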