In [1]:
import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext()
In [2]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from string import split,strip
from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
Data Set Information:
The data has been produced using Monte Carlo simulations. The first column is the class label (1 for signal, 0 for background), followed by 28 features. The first 21 features (columns 2-22) are kinematic properties measured by the particle detectors in the accelerator. The last seven features are functions of the first 21; these are high-level features derived by physicists to help discriminate between the two classes. There is interest in using deep learning methods to obviate the need for physicists to manually develop such features. Benchmark results using Bayesian Decision Trees from a standard physics package and 5-layer neural networks are presented in the original paper. The last 500,000 examples are used as a test set.
In [3]:
#define feature names
feature_text='lepton pT, lepton eta, lepton phi, missing energy magnitude, missing energy phi, jet 1 pt, jet 1 eta, jet 1 phi, jet 1 b-tag, jet 2 pt, jet 2 eta, jet 2 phi, jet 2 b-tag, jet 3 pt, jet 3 eta, jet 3 phi, jet 3 b-tag, jet 4 pt, jet 4 eta, jet 4 phi, jet 4 b-tag, m_jj, m_jjj, m_lv, m_jlv, m_bb, m_wbb, m_wwbb'
features=[strip(a) for a in split(feature_text,',')]
print len(features),features
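The feature list splits into the 21 low-level kinematic quantities and the 7 physicist-derived quantities described above. The following cell is a small sanity check of that split against the parsed names (not part of the original analysis).
In [ ]:
# Sanity check: 21 low-level detector features followed by 7 derived features
low_level = features[:21]   # lepton/jet kinematics and b-tags
high_level = features[21:]  # m_jj, m_jjj, m_lv, m_jlv, m_bb, m_wbb, m_wwbb
print len(low_level), len(high_level)   # expect 21 and 7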
In [4]:
# create a directory called higgs, download and decompress HIGGS.csv.gz into it
from os.path import exists
if not exists('higgs'):
    print "creating directory higgs"
    !mkdir higgs
%cd higgs
if not exists('HIGGS.csv'):
    if not exists('HIGGS.csv.gz'):
        print 'downloading HIGGS.csv.gz'
        !curl -O http://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz
    print 'decompressing HIGGS.csv.gz --- May take 5-10 minutes'
    !gunzip -f HIGGS.csv.gz
!ls -l
%cd ..
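As a quick, optional check that the download and decompression succeeded: the full HIGGS.csv file should contain 11,000,000 rows, one example per line (this assumes the shell command below is available).
In [ ]:
# Optional sanity check: the full dataset has 11,000,000 examples
!wc -l higgs/HIGGS.csv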
In [5]:
# Read the file into an RDD
# If doing this on a real cluster, you need the file to be available on all nodes, ideally in HDFS.
path='higgs/HIGGS.csv'
inputRDD=sc.textFile(path)
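Each line of the file is a comma-separated string: the label followed by the 28 feature values. A quick peek at the first line (a sketch, not part of the original notebook) confirms the layout before parsing.
In [ ]:
# Inspect one raw line: expect 29 comma-separated fields (1 label + 28 features)
line = inputRDD.first()
print len(line.split(','))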
In [6]:
# Transform the text RDD into an RDD of LabeledPoints
Data=inputRDD.map(lambda line: [float(strip(x)) for x in line.split(',')])\
             .map(lambda a: LabeledPoint(a[0], a[1:]))
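Each element of Data is now a LabeledPoint whose label is the first column (0.0 or 1.0) and whose feature vector holds the remaining 28 columns. The cell below is an optional check on the first parsed example.
In [ ]:
# Optional check on the parsed RDD
first = Data.first()
print first.label, len(first.features)   # expect 0.0 or 1.0, and 28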
In [7]:
# Keep a 1% sample of the data (sampled without replacement) so training runs quickly on one machine
Data1=Data.sample(False,0.01).cache()
(trainingData,testData)=Data1.randomSplit([0.7,0.3])
print 'Sizes: Data1=%d, trainingData=%d, testData=%d'%(Data1.count(),trainingData.cache().count(),testData.cache().count())
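The sample keeps roughly 1% of the 11 million examples, which is enough to train locally in minutes. An optional check of the label balance (signal = 1.0, background = 0.0) in the sample:
In [ ]:
# Optional: class balance in the 1% sample
print Data1.map(lambda lp: lp.label).countByValue()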
In [8]:
from time import time
errors={}
for depth in [1,3,6,10]:
    start=time()
    model=GradientBoostedTrees.trainClassifier(trainingData, categoricalFeaturesInfo={}, numIterations=10, maxDepth=depth)
    #print model.toDebugString()
    errors[depth]={}
    dataSets={'train':trainingData,'test':testData}
    for name in dataSets.keys(): # Calculate errors on train and test sets
        data=dataSets[name]
        Predicted=model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda (v,p): v != p).count()/float(data.count())
        errors[depth][name]=Err
    print depth,errors[depth],int(time()-start),'seconds'
print errors
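The errors dictionary maps each depth to its train and test error rates. A small helper (not part of the original notebook) picks the depth with the lowest test error:
In [ ]:
# Select the tree depth that minimizes the test error
best_depth = min(errors, key=lambda d: errors[d]['test'])
print best_depth, errors[best_depth]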
In [9]:
B10 = errors
# Plot train/test error vs tree depth
%pylab inline
from plot_utils import *
make_figure([B10],['10Trees'],Title='Boosting using 1% of data')
In [10]:
from time import time
errors={}
for depth in [1,3,6,10,15,20]:
    start=time()
    model = RandomForest.trainClassifier(trainingData, numClasses=2, categoricalFeaturesInfo={},
                                         numTrees=10, featureSubsetStrategy="auto",
                                         impurity='gini', maxDepth=depth)
    errors[depth]={}
    dataSets={'train':trainingData,'test':testData}
    for name in dataSets.keys(): # Calculate errors on train and test sets
        data=dataSets[name]
        Predicted=model.predict(data.map(lambda x: x.features))
        LabelsAndPredictions = data.map(lambda lp: lp.label).zip(Predicted)
        Err = LabelsAndPredictions.filter(lambda (v,p): v != p).count()/float(data.count())
        errors[depth][name]=Err
    print depth,errors[depth],int(time()-start),'seconds'
print errors
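As with boosting, the dictionary now holds train/test error rates per depth. The last trained forest itself can also be inspected; for example (optional, and note the dump is large at depth 20):
In [ ]:
# Optional: print the beginning of the last trained forest's structure
print model.toDebugString()[:1000]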
In [13]:
RF_10trees = errors
# Plot train/test error vs tree depth
make_figure([RF_10trees],['10Trees'],Title='Random Forests using 1% of data')
In [14]:
make_figure([RF_10trees, B10],['RF 10 trees', 'GBT 10 iterations'],Title='GBT vs RF')
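The GradientBoostedTreesModel and RandomForestModel classes imported at the top also support persisting a trained model and loading it back later. A minimal sketch, assuming a writable local path 'higgs/rf_model' (the path is illustrative):
In [ ]:
# Save the last trained random forest and reload it (path is illustrative)
model.save(sc, 'higgs/rf_model')
sameModel = RandomForestModel.load(sc, 'higgs/rf_model')
print sameModel.numTrees()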