(IDatributo, categoria)
através de um atributo binário. Nós podemos fazer isso no Python criando um dicionário que mapeia cada possível tupla em um inteiro que corresponde a sua posição no vetor de atributos binário.
In [25]:
# Data for manual OHE
# Note: the first data point does not include any value for the optional third feature
sampleOne = [(0, 'mouse'), (1, 'black')]
sampleTwo = [(0, 'cat'), (1, 'tabby'), (2, 'mouse')]
sampleThree = [(0, 'bear'), (1, 'black'), (2, 'salmon')]
sampleDataRDD = sc.parallelize([sampleOne, sampleTwo, sampleThree])
In [26]:
# EXERCICIO
sampleOHEDictManual = {}
sampleOHEDictManual[(0,'bear')] = 0
sampleOHEDictManual[(0,'cat')] = 1
sampleOHEDictManual[(0,'mouse')] = 2
sampleOHEDictManual[(1,'black')] = 3
sampleOHEDictManual[(1,'tabby')] = 4
sampleOHEDictManual[(2,'mouse')] = 5
sampleOHEDictManual[(2,'salmon')] = 6
In [27]:
# TEST One-hot-encoding (1a)
from test_helper import Test
Test.assertEqualsHashed(sampleOHEDictManual[(0,'bear')],
'b6589fc6ab0dc82cf12099d1c2d40ab994e8410c',
"incorrect value for sampleOHEDictManual[(0,'bear')]")
Test.assertEqualsHashed(sampleOHEDictManual[(0,'cat')],
'356a192b7913b04c54574d18c28d46e6395428ab',
"incorrect value for sampleOHEDictManual[(0,'cat')]")
Test.assertEqualsHashed(sampleOHEDictManual[(0,'mouse')],
'da4b9237bacccdf19c0760cab7aec4a8359010b0',
"incorrect value for sampleOHEDictManual[(0,'mouse')]")
Test.assertEqualsHashed(sampleOHEDictManual[(1,'black')],
'77de68daecd823babbb58edb1c8e14d7106e83bb',
"incorrect value for sampleOHEDictManual[(1,'black')]")
Test.assertEqualsHashed(sampleOHEDictManual[(1,'tabby')],
'1b6453892473a467d07372d45eb05abc2031647a',
"incorrect value for sampleOHEDictManual[(1,'tabby')]")
Test.assertEqualsHashed(sampleOHEDictManual[(2,'mouse')],
'ac3478d69a3c81fa62e60f5c3696165a4e5e6ac4',
"incorrect value for sampleOHEDictManual[(2,'mouse')]")
Test.assertEqualsHashed(sampleOHEDictManual[(2,'salmon')],
'c1dfd96eea8cc2b62785275bca38ac261256e278',
"incorrect value for sampleOHEDictManual[(2,'salmon')]")
Test.assertEquals(len(sampleOHEDictManual.keys()), 7,
'incorrect number of keys in sampleOHEDictManual')
Dense
para vetores esparsos. Utilize a classe SparseVector para representá-los e verifique que ambas as representações retornam o mesmo resultado nos cálculos dos produtos interno.SparseVector(tamanho, *args)
para criar um novo vetor esparso onde tamanho
é o tamanho do vetor e args
pode ser um dicionário, uma lista de tuplas (índice, valor) ou duas arrays separadas de índices e valores ordenados por índice.
In [28]:
import numpy as np
from pyspark.mllib.linalg import SparseVector
In [29]:
# EXERCICIO
aDense = np.array([0., 3., 0., 4.])
aSparse = SparseVector(4,[(1,3),(3,4)])
bDense = np.array([0., 0., 0., 1.])
bSparse = SparseVector(4,[(3,1)])
w = np.array([0.4, 3.1, -1.4, -.5])
print aDense.dot(w)
print aSparse.dot(w)
print bDense.dot(w)
print bSparse.dot(w)
In [30]:
# TEST Sparse Vectors (1b)
Test.assertTrue(isinstance(aSparse, SparseVector), 'aSparse needs to be an instance of SparseVector')
Test.assertTrue(isinstance(bSparse, SparseVector), 'aSparse needs to be an instance of SparseVector')
Test.assertTrue(aDense.dot(w) == aSparse.dot(w),
'dot product of aDense and w should equal dot product of aSparse and w')
Test.assertTrue(bDense.dot(w) == bSparse.dot(w),
'dot product of bDense and w should equal dot product of bSparse and w')
sampleOHEDictManual
, crie um vetor esparso para cada amostra de nossa base de dados. Todo atributo que ocorre em uma amostra deve ter valor 1.0. Por exemplo, um vetor para um ponto com os atributos 2 e 4 devem ser [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0]
.
In [31]:
# Reminder of the sample features
# sampleOne = [(0, 'mouse'), (1, 'black')]
# sampleTwo = [(0, 'cat'), (1, 'tabby'), (2, 'mouse')]
# sampleThree = [(0, 'bear'), (1, 'black'), (2, 'salmon')]
In [32]:
# EXERCICIO
sampleOneOHEFeatManual = SparseVector(7, [(sampleOHEDictManual[s],1.0) for s in sampleOne])
sampleTwoOHEFeatManual = SparseVector(7, [(sampleOHEDictManual[s],1.0) for s in sampleTwo])
sampleThreeOHEFeatManual = SparseVector(7, [(sampleOHEDictManual[s],1.0) for s in sampleThree])
In [33]:
# TEST OHE Features as sparse vectors (1c)
Test.assertTrue(isinstance(sampleOneOHEFeatManual, SparseVector),
'sampleOneOHEFeatManual needs to be a SparseVector')
Test.assertTrue(isinstance(sampleTwoOHEFeatManual, SparseVector),
'sampleTwoOHEFeatManual needs to be a SparseVector')
Test.assertTrue(isinstance(sampleThreeOHEFeatManual, SparseVector),
'sampleThreeOHEFeatManual needs to be a SparseVector')
Test.assertEqualsHashed(sampleOneOHEFeatManual,
'ecc00223d141b7bd0913d52377cee2cf5783abd6',
'incorrect value for sampleOneOHEFeatManual')
Test.assertEqualsHashed(sampleTwoOHEFeatManual,
'26b023f4109e3b8ab32241938e2e9b9e9d62720a',
'incorrect value for sampleTwoOHEFeatManual')
Test.assertEqualsHashed(sampleThreeOHEFeatManual,
'c04134fd603ae115395b29dcabe9d0c66fbdc8a7',
'incorrect value for sampleThreeOHEFeatManual')
In [34]:
# EXERCICIO
def oneHotEncoding(rawFeats, OHEDict, numOHEFeats):
"""Produce a one-hot-encoding from a list of features and an OHE dictionary.
Note:
You should ensure that the indices used to create a SparseVector are sorted.
Args:
rawFeats (list of (int, str)): The features corresponding to a single observation. Each
feature consists of a tuple of featureID and the feature's value. (e.g. sampleOne)
OHEDict (dict): A mapping of (featureID, value) to unique integer.
numOHEFeats (int): The total number of unique OHE features (combinations of featureID and
value).
Returns:
SparseVector: A SparseVector of length numOHEFeats with indicies equal to the unique
identifiers for the (featureID, value) combinations that occur in the observation and
with values equal to 1.0.
"""
return SparseVector(numOHEFeats, [(OHEDict[s],1.0) for s in rawFeats])
# Calculate the number of features in sampleOHEDictManual
numSampleOHEFeats = len(sampleOHEDictManual)
# Run oneHotEnoding on sampleOne
sampleOneOHEFeat = oneHotEncoding(sampleOne, sampleOHEDictManual, numSampleOHEFeats)
print sampleOneOHEFeat
In [35]:
# TEST Define an OHE Function (1d)
Test.assertTrue(sampleOneOHEFeat == sampleOneOHEFeatManual,
'sampleOneOHEFeat should equal sampleOneOHEFeatManual')
Test.assertEquals(sampleOneOHEFeat, SparseVector(7, [2,3], [1.0,1.0]),
'incorrect value for sampleOneOHEFeat')
Test.assertEquals(oneHotEncoding([(1, 'black'), (0, 'mouse')], sampleOHEDictManual,
numSampleOHEFeats), SparseVector(7, [2,3], [1.0,1.0]),
'incorrect definition for oneHotEncoding')
In [36]:
# EXERCICIO
sampleOHEData = sampleDataRDD.map(lambda x: oneHotEncoding(x,sampleOHEDictManual, numSampleOHEFeats))
print sampleOHEData.collect()
In [37]:
# TEST Apply OHE to a dataset (1e)
sampleOHEDataValues = sampleOHEData.collect()
Test.assertTrue(len(sampleOHEDataValues) == 3, 'sampleOHEData should have three elements')
Test.assertEquals(sampleOHEDataValues[0], SparseVector(7, {2: 1.0, 3: 1.0}),
'incorrect OHE for first sample')
Test.assertEquals(sampleOHEDataValues[1], SparseVector(7, {1: 1.0, 4: 1.0, 5: 1.0}),
'incorrect OHE for second sample')
Test.assertEquals(sampleOHEDataValues[2], SparseVector(7, {0: 1.0, 3: 1.0, 6: 1.0}),
'incorrect OHE for third sample')
(IDatributo, categoria)
(IDatributo, categoria)
. Em nossa base de dados você deve gerar (0, 'bear')
, (0, 'cat')
, (0, 'mouse')
, (1, 'black')
, (1, 'tabby')
, (2, 'mouse')
, (2, 'salmon')
. Repare que 'black'
aparece duas vezes em nossa base de dados mas contribui apenas para um item do RDD: (1, 'black')
, por outro lado 'mouse'
aparece duas vezes e contribui para dois itens: (0, 'mouse')
and (2, 'mouse')
.
In [38]:
# EXERCICIO
sampleDistinctFeats = (sampleDataRDD
.flatMap(lambda x : x )
.distinct()
)
In [39]:
# TEST Pair RDD of (featureID, category) (2a)
Test.assertEquals(sorted(sampleDistinctFeats.collect()),
[(0, 'bear'), (0, 'cat'), (0, 'mouse'), (1, 'black'),
(1, 'tabby'), (2, 'mouse'), (2, 'salmon')],
'incorrect value for sampleDistinctFeats')
(IDatributo, categoria)
em sampleDistinctFeats
. A chave da tupla é a própria tupla original, e o valor será um inteiro variando de 0 até número de tuplas - 1.RDD
em um dicionário, utilizando o comando collectAsMap
.
In [40]:
# EXERCICIO
sampleOHEDict = (sampleDistinctFeats
.zipWithIndex()
.collectAsMap())
print sampleOHEDict
In [41]:
# TEST OHE Dictionary from distinct features (2b)
Test.assertEquals(sorted(sampleOHEDict.keys()),
[(0, 'bear'), (0, 'cat'), (0, 'mouse'), (1, 'black'),
(1, 'tabby'), (2, 'mouse'), (2, 'salmon')],
'sampleOHEDict has unexpected keys')
Test.assertEquals(sorted(sampleOHEDict.values()), range(7), 'sampleOHEDict has unexpected values')
In [42]:
# EXERCICIO
def createOneHotDict(inputData):
"""Creates a one-hot-encoder dictionary based on the input data.
Args:
inputData (RDD of lists of (int, str)): An RDD of observations where each observation is
made up of a list of (featureID, value) tuples.
Returns:
dict: A dictionary where the keys are (featureID, value) tuples and map to values that are
unique integers.
"""
return (inputData
.flatMap(lambda x: x)
.distinct()
.zipWithIndex()
.collectAsMap()
)
sampleOHEDictAuto = createOneHotDict(sampleDataRDD)
print sampleOHEDictAuto
In [43]:
# TEST Automated creation of an OHE dictionary (2c)
Test.assertEquals(sorted(sampleOHEDictAuto.keys()),
[(0, 'bear'), (0, 'cat'), (0, 'mouse'), (1, 'black'),
(1, 'tabby'), (2, 'mouse'), (2, 'salmon')],
'sampleOHEDictAuto has unexpected keys')
Test.assertEquals(sorted(sampleOHEDictAuto.values()), range(7),
'sampleOHEDictAuto has unexpected values')
In [46]:
import os.path
baseDir = os.path.join('Data')
inputPath = os.path.join('dac_sample.txt')
fileName = os.path.join(baseDir, inputPath)
if os.path.isfile(fileName):
rawData = (sc
.textFile(fileName, 2)
.map(lambda x: x.replace('\t', ','))) # work with either ',' or '\t' separated data
print rawData.take(1)
In [47]:
# EXERCICIO
weights = [.8, .1, .1]
seed = 42
# Use randomSplit with weights and seed
rawTrainData, rawValidationData, rawTestData = rawData.randomSplit(weights, seed)
# Cache the data
rawTrainData.cache()
rawValidationData.cache()
rawTestData.cache()
nTrain = rawTrainData.count()
nVal = rawValidationData.count()
nTest = rawTestData.count()
print nTrain, nVal, nTest, nTrain + nVal + nTest
print rawData.take(1)
In [48]:
# TEST Loading and splitting the data (3a)
Test.assertTrue(all([rawTrainData.is_cached, rawValidationData.is_cached, rawTestData.is_cached]),
'you must cache the split data')
Test.assertEquals(nTrain, 79911, 'incorrect value for nTrain')
Test.assertEquals(nVal, 10075, 'incorrect value for nVal')
Test.assertEquals(nTest, 10014, 'incorrect value for nTest')
In [58]:
# EXERCICIO
def parsePoint(point):
"""Converts a comma separated string into a list of (featureID, value) tuples.
Note:
featureIDs should start at 0 and increase to the number of features - 1.
Args:
point (str): A comma separated string where the first value is the label and the rest
are features.
Returns:
list: A list of (featureID, value) tuples.
"""
seq = []
for x,y in enumerate(point.split(',')[1:]):
seq.append((x,y))
return seq
parsedTrainFeat = rawTrainData.map(parsePoint)
numCategories = (parsedTrainFeat
.flatMap(lambda x: x)
.distinct()
.map(lambda x: (x[0], 1))
.reduceByKey(lambda x, y: x + y)
.sortByKey()
.collect()
)
print numCategories[2][1]
In [59]:
# TEST Extract features (3b)
Test.assertEquals(numCategories[2][1], 855, 'incorrect implementation of parsePoint')
Test.assertEquals(numCategories[32][1], 4, 'incorrect implementation of parsePoint')
In [60]:
# EXERCICIO
ctrOHEDict = createOneHotDict(parsedTrainFeat)
numCtrOHEFeats = len(ctrOHEDict.keys())
print numCtrOHEFeats
print ctrOHEDict[(0, '')]
In [61]:
# TEST Create an OHE dictionary from the dataset (3c)
Test.assertEquals(numCtrOHEFeats, 233286, 'incorrect number of features in ctrOHEDict')
Test.assertTrue((0, '') in ctrOHEDict, 'incorrect features in ctrOHEDict')
parseOHEPoint
. Dica: essa função é uma extensão da função parsePoint
criada anteriormente e que usa a função oneHotEncoding
.
In [ ]:
from pyspark.mllib.regression import LabeledPoint
In [ ]:
# EXERCICIO
def parseOHEPoint(point, OHEDict, numOHEFeats):
"""Obtain the label and feature vector for this raw observation.
Note:
You must use the function `oneHotEncoding` in this implementation or later portions
of this lab may not function as expected.
Args:
point (str): A comma separated string where the first value is the label and the rest
are features.
OHEDict (dict of (int, str) to int): Mapping of (featureID, value) to unique integer.
numOHEFeats (int): The number of unique features in the training dataset.
Returns:
LabeledPoint: Contains the label for the observation and the one-hot-encoding of the
raw features based on the provided OHE dictionary.
"""
<COMPLETAR>
OHETrainData = rawTrainData.map(lambda point: parseOHEPoint(point, ctrOHEDict, numCtrOHEFeats))
OHETrainData.cache()
print OHETrainData.take(1)
# Check that oneHotEncoding function was used in parseOHEPoint
backupOneHot = oneHotEncoding
oneHotEncoding = None
withOneHot = False
try: parseOHEPoint(rawTrainData.take(1)[0], ctrOHEDict, numCtrOHEFeats)
except TypeError: withOneHot = True
oneHotEncoding = backupOneHot
In [ ]:
# TEST Apply OHE to the dataset (3d)
numNZ = sum(parsedTrainFeat.map(lambda x: len(x)).take(5))
numNZAlt = sum(OHETrainData.map(lambda lp: len(lp.features.indices)).take(5))
Test.assertEquals(numNZ, numNZAlt, 'incorrect implementation of parseOHEPoint')
Test.assertTrue(withOneHot, 'oneHotEncoding not present in parseOHEPoint')
In [ ]:
def bucketFeatByCount(featCount):
"""Bucket the counts by powers of two."""
for i in range(11):
size = 2 ** i
if featCount <= size:
return size
return -1
featCounts = (OHETrainData
.flatMap(lambda lp: lp.features.indices)
.map(lambda x: (x, 1))
.reduceByKey(lambda x, y: x + y))
featCountsBuckets = (featCounts
.map(lambda x: (bucketFeatByCount(x[1]), 1))
.filter(lambda (k, v): k != -1)
.reduceByKey(lambda x, y: x + y)
.collect())
print featCountsBuckets
In [ ]:
import matplotlib.pyplot as plt
x, y = zip(*featCountsBuckets)
x, y = np.log(x), np.log(y)
def preparePlot(xticks, yticks, figsize=(10.5, 6), hideLabels=False, gridColor='#999999',
gridWidth=1.0):
"""Template for generating the plot layout."""
plt.close()
fig, ax = plt.subplots(figsize=figsize, facecolor='white', edgecolor='white')
ax.axes.tick_params(labelcolor='#999999', labelsize='10')
for axis, ticks in [(ax.get_xaxis(), xticks), (ax.get_yaxis(), yticks)]:
axis.set_ticks_position('none')
axis.set_ticks(ticks)
axis.label.set_color('#999999')
if hideLabels: axis.set_ticklabels([])
plt.grid(color=gridColor, linewidth=gridWidth, linestyle='-')
map(lambda position: ax.spines[position].set_visible(False), ['bottom', 'top', 'left', 'right'])
return fig, ax
# generate layout and plot data
fig, ax = preparePlot(np.arange(0, 10, 1), np.arange(4, 14, 2))
ax.set_xlabel(r'$\log_e(bucketSize)$'), ax.set_ylabel(r'$\log_e(countInBucket)$')
plt.scatter(x, y, s=14**2, c='#d6ebf2', edgecolors='#8cbfd0', alpha=0.75)
pass
oneHotEncoding
para ignorar os atributos que não existem no dicionário.
In [ ]:
# EXERCICIO
def oneHotEncoding(rawFeats, OHEDict, numOHEFeats):
"""Produce a one-hot-encoding from a list of features and an OHE dictionary.
Note:
If a (featureID, value) tuple doesn't have a corresponding key in OHEDict it should be
ignored.
Args:
rawFeats (list of (int, str)): The features corresponding to a single observation. Each
feature consists of a tuple of featureID and the feature's value. (e.g. sampleOne)
OHEDict (dict): A mapping of (featureID, value) to unique integer.
numOHEFeats (int): The total number of unique OHE features (combinations of featureID and
value).
Returns:
SparseVector: A SparseVector of length numOHEFeats with indicies equal to the unique
identifiers for the (featureID, value) combinations that occur in the observation and
with values equal to 1.0.
"""
<COMPLETAR>
OHEValidationData = rawValidationData.map(lambda point: parseOHEPoint(point, ctrOHEDict, numCtrOHEFeats))
OHEValidationData.cache()
print OHEValidationData.take(1)
In [ ]:
# TEST Handling unseen features (3e)
numNZVal = (OHEValidationData
.map(lambda lp: len(lp.features.indices))
.sum())
Test.assertEquals(numNZVal, 372080, 'incorrect number of features')
OHETrainData
com a configuração de parâmetros dada. LogisticRegressionWithSGD
retorna um LogisticRegressionModel.LogisticRegressionModel.weights
e LogisticRegressionModel.intercept
para verificar o modelo gerado.
In [ ]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
# fixed hyperparameters
numIters = 50
stepSize = 10.
regParam = 1e-6
regType = 'l2'
includeIntercept = True
In [ ]:
# EXERCICIO
model0 = <COMPLETAR>
sortedWeights = sorted(model0.weights)
print sortedWeights[:5], model0.intercept
In [ ]:
# TEST Logistic regression (4a)
Test.assertTrue(np.allclose(model0.intercept, 0.56455084025), 'incorrect value for model0.intercept')
Test.assertTrue(np.allclose(sortedWeights[0:5],
[-0.45899236853575609, -0.37973707648623956, -0.36996558266753304,
-0.36934962879928263, -0.32697945415010637]), 'incorrect value for model0.weights')
In [ ]:
# EXERCICIO
from math import log
def computeLogLoss(p, y):
"""Calculates the value of log loss for a given probabilty and label.
Note:
log(0) is undefined, so when p is 0 we need to add a small value (epsilon) to it
and when p is 1 we need to subtract a small value (epsilon) from it.
Args:
p (float): A probabilty between 0 and 1.
y (int): A label. Takes on the values 0 and 1.
Returns:
float: The log loss value.
"""
<COMPLETAR>
print computeLogLoss(.5, 1)
print computeLogLoss(.5, 0)
print computeLogLoss(.99, 1)
print computeLogLoss(.99, 0)
print computeLogLoss(.01, 1)
print computeLogLoss(.01, 0)
print computeLogLoss(0, 1)
print computeLogLoss(1, 1)
print computeLogLoss(1, 0)
In [ ]:
# TEST Log loss (4b)
Test.assertTrue(np.allclose([computeLogLoss(.5, 1), computeLogLoss(.01, 0), computeLogLoss(.01, 1)],
[0.69314718056, 0.0100503358535, 4.60517018599]),
'computeLogLoss is not correct')
Test.assertTrue(np.allclose([computeLogLoss(0, 1), computeLogLoss(1, 1), computeLogLoss(1, 0)],
[25.3284360229, 1.00000008275e-11, 25.3284360229]),
'computeLogLoss needs to bound p away from 0 and 1 by epsilon')
In [ ]:
# EXERCICIO
# Note that our dataset has a very high click-through rate by design
# In practice click-through rate can be one to two orders of magnitude lower
classOneFracTrain = OHETrainData.<COMPLETAR>
print classOneFracTrain
logLossTrBase = OHETrainData.<COMPLETAR>
print 'Baseline Train Logloss = {0:.3f}\n'.format(logLossTrBase)
In [ ]:
# TEST Baseline log loss (4c)
Test.assertTrue(np.allclose(classOneFracTrain, 0.22717773523), 'incorrect value for classOneFracTrain')
Test.assertTrue(np.allclose(logLossTrBase, 0.535844), 'incorrect value for logLossTrBase')
predict
, porém esse método retorna apenas 0's e 1's. Para calcular a probabilidade de um evento, vamos criar uma função getP
que recebe como parâmetro o ponto x, o conjunto de pesos w
e o intercept
.
In [ ]:
# EXERCICIO
from math import exp # exp(-t) = e^-t
def getP(x, w, intercept):
"""Calculate the probability for an observation given a set of weights and intercept.
Note:
We'll bound our raw prediction between 20 and -20 for numerical purposes.
Args:
x (SparseVector): A vector with values of 1.0 for features that exist in this
observation and 0.0 otherwise.
w (DenseVector): A vector of weights (betas) for the model.
intercept (float): The model's intercept.
Returns:
float: A probability between 0 and 1.
"""
# calculate rawPrediction = w.x + intercept
rawPrediction = <COMPLETAR>
# Bound the raw prediction value
rawPrediction = min(rawPrediction, 20)
rawPrediction = max(rawPrediction, -20)
# calculate (1+e^-rawPrediction)^-1
return <COMPLETAR>
trainingPredictions = OHETrainData.<COMPLETAR>
print trainingPredictions.take(5)
In [ ]:
# TEST Predicted probability (4d)
Test.assertTrue(np.allclose(trainingPredictions.sum(), 18135.4834348),
'incorrect value for trainingPredictions')
In [ ]:
# EXERCICIO
def evaluateResults(model, data):
"""Calculates the log loss for the data given the model.
Args:
model (LogisticRegressionModel): A trained logistic regression model.
data (RDD of LabeledPoint): Labels and features for each observation.
Returns:
float: Log loss for the data.
"""
return (data
.<COMPLETAR>
.<COMPLETAR>
.<COMPLETAR>
)
logLossTrLR0 = evaluateResults(model0, OHETrainData)
print ('OHE Features Train Logloss:\n\tBaseline = {0:.3f}\n\tLogReg = {1:.3f}'
.format(logLossTrBase, logLossTrLR0))
In [ ]:
# TEST Evaluate the model (4e)
Test.assertTrue(np.allclose(logLossTrLR0, 0.456903), 'incorrect value for logLossTrLR0')
In [ ]:
# EXERCICIO
logLossValBase = OHEValidationData.<COMPLETAR>
logLossValLR0 = evaluateResults(model0, OHEValidationData)
print ('OHE Features Validation Logloss:\n\tBaseline = {0:.3f}\n\tLogReg = {1:.3f}'
.format(logLossValBase, logLossValLR0))
In [ ]:
# TEST Validation log loss (4f)
Test.assertTrue(np.allclose(logLossValBase, 0.527603), 'incorrect value for logLossValBase')
Test.assertTrue(np.allclose(logLossValLR0, 0.456957), 'incorrect value for logLossValLR0')
In [ ]:
labelsAndScores = OHEValidationData.map(lambda lp:
(lp.label, getP(lp.features, model0.weights, model0.intercept)))
labelsAndWeights = labelsAndScores.collect()
labelsAndWeights.sort(key=lambda (k, v): v, reverse=True)
labelsByWeight = np.array([k for (k, v) in labelsAndWeights])
length = labelsByWeight.size
truePositives = labelsByWeight.cumsum()
numPositive = truePositives[-1]
falsePositives = np.arange(1.0, length + 1, 1.) - truePositives
truePositiveRate = truePositives / numPositive
falsePositiveRate = falsePositives / (length - numPositive)
# Generate layout and plot data
fig, ax = preparePlot(np.arange(0., 1.1, 0.1), np.arange(0., 1.1, 0.1))
ax.set_xlim(-.05, 1.05), ax.set_ylim(-.05, 1.05)
ax.set_ylabel('True Positive Rate (Sensitivity)')
ax.set_xlabel('False Positive Rate (1 - Specificity)')
plt.plot(falsePositiveRate, truePositiveRate, color='#8cbfd0', linestyle='-', linewidth=3.)
plt.plot((0., 1.), (0., 1.), linestyle='--', color='#d6ebf2', linewidth=2.) # Baseline model
pass
numBuckets
e observe o resultado.
In [ ]:
from collections import defaultdict
import hashlib
def hashFunction(numBuckets, rawFeats, printMapping=False):
"""Calculate a feature dictionary for an observation's features based on hashing.
Note:
Use printMapping=True for debug purposes and to better understand how the hashing works.
Args:
numBuckets (int): Number of buckets to use as features.
rawFeats (list of (int, str)): A list of features for an observation. Represented as
(featureID, value) tuples.
printMapping (bool, optional): If true, the mappings of featureString to index will be
printed.
Returns:
dict of int to float: The keys will be integers which represent the buckets that the
features have been hashed to. The value for a given key will contain the count of the
(featureID, value) tuples that have hashed to that key.
"""
mapping = {}
for ind, category in rawFeats:
featureString = category + str(ind)
mapping[featureString] = int(int(hashlib.md5(featureString).hexdigest(), 16) % numBuckets)
if(printMapping): print mapping
sparseFeatures = defaultdict(float)
for bucket in mapping.values():
sparseFeatures[bucket] += 1.0
return dict(sparseFeatures)
# Reminder of the sample values:
# sampleOne = [(0, 'mouse'), (1, 'black')]
# sampleTwo = [(0, 'cat'), (1, 'tabby'), (2, 'mouse')]
# sampleThree = [(0, 'bear'), (1, 'black'), (2, 'salmon')]
In [ ]:
# EXERCICIO
# Use four buckets
sampOneFourBuckets = <COMPLETAR>
sampTwoFourBuckets = <COMPLETAR>
sampThreeFourBuckets = <COMPLETAR>
# Use one hundred buckets
sampOneHundredBuckets = <COMPLETAR>
sampTwoHundredBuckets = <COMPLETAR>
sampThreeHundredBuckets = <COMPLETAR>
print '\t\t 4 Buckets \t\t\t 100 Buckets'
print 'SampleOne:\t {0}\t\t {1}'.format(sampOneFourBuckets, sampOneHundredBuckets)
print 'SampleTwo:\t {0}\t\t {1}'.format(sampTwoFourBuckets, sampTwoHundredBuckets)
print 'SampleThree:\t {0}\t {1}'.format(sampThreeFourBuckets, sampThreeHundredBuckets)
In [ ]:
# TEST Hash function (5a)
Test.assertEquals(sampOneFourBuckets, {2: 1.0, 3: 1.0}, 'incorrect value for sampOneFourBuckets')
Test.assertEquals(sampThreeHundredBuckets, {72: 1.0, 5: 1.0, 14: 1.0},
'incorrect value for sampThreeHundredBuckets')
LabeledPoint
com os hashed features armazenados como um SparseVector
. Então use esta função para criar uma nova base de treino, validação e teste com hashed features. Dica: parsedHashPoint
é similar a parseOHEPoint
da Parte (3d).
In [ ]:
# EXERCICIO
def parseHashPoint(point, numBuckets):
"""Create a LabeledPoint for this observation using hashing.
Args:
point (str): A comma separated string where the first value is the label and the rest are
features.
numBuckets: The number of buckets to hash to.
Returns:
LabeledPoint: A LabeledPoint with a label (0.0 or 1.0) and a SparseVector of hashed
features.
"""
<COMPLETAR>
numBucketsCTR = 2 ** 15
hashTrainData = rawTrainData.map(lambda x: parseHashPoint(x,numBucketsCTR))
hashTrainData.cache()
hashValidationData = rawValidationData.map(lambda x: parseHashPoint(x,numBucketsCTR))
hashValidationData.cache()
hashTestData = rawTestData.map(lambda x: parseHashPoint(x,numBucketsCTR))
hashTestData.cache()
print hashTrainData.take(1)
In [ ]:
# TEST Creating hashed features (5b)
hashTrainDataFeatureSum = sum(hashTrainData
.map(lambda lp: len(lp.features.indices))
.take(20))
hashTrainDataLabelSum = sum(hashTrainData
.map(lambda lp: lp.label)
.take(100))
hashValidationDataFeatureSum = sum(hashValidationData
.map(lambda lp: len(lp.features.indices))
.take(20))
hashValidationDataLabelSum = sum(hashValidationData
.map(lambda lp: lp.label)
.take(100))
hashTestDataFeatureSum = sum(hashTestData
.map(lambda lp: len(lp.features.indices))
.take(20))
hashTestDataLabelSum = sum(hashTestData
.map(lambda lp: lp.label)
.take(100))
Test.assertEquals(hashTrainDataFeatureSum, 772, 'incorrect number of features in hashTrainData')
Test.assertEquals(hashTrainDataLabelSum, 24.0, 'incorrect labels in hashTrainData')
Test.assertEquals(hashValidationDataFeatureSum, 776,
'incorrect number of features in hashValidationData')
Test.assertEquals(hashValidationDataLabelSum, 16.0, 'incorrect labels in hashValidationData')
Test.assertEquals(hashTestDataFeatureSum, 774, 'incorrect number of features in hashTestData')
Test.assertEquals(hashTestDataLabelSum, 23.0, 'incorrect labels in hashTestData')
SparseVector
chamado sparse
, chamar len(sparse)
retornará o total de atributos, e não o número de valores não nulos. SparseVector
tem atributos indices
e values
que contém informações sobre quais atributos são não nulos.
In [ ]:
# EXERCICIO
def computeSparsity(data, d, n):
"""Calculates the average sparsity for the features in an RDD of LabeledPoints.
Args:
data (RDD of LabeledPoint): The LabeledPoints to use in the sparsity calculation.
d (int): The total number of features.
n (int): The number of observations in the RDD.
Returns:
float: The average of the ratio of features in a point to total features.
"""
return (data
.<COMPLETAR>
.<COMPLETAR>
)/(d*n*1.)
averageSparsityHash = computeSparsity(hashTrainData, numBucketsCTR, nTrain)
averageSparsityOHE = computeSparsity(OHETrainData, numCtrOHEFeats, nTrain)
print 'Average OHE Sparsity: {0:.7e}'.format(averageSparsityOHE)
print 'Average Hash Sparsity: {0:.7e}'.format(averageSparsityHash)
In [ ]:
# TEST Sparsity (5c)
Test.assertTrue(np.allclose(averageSparsityOHE, 1.6717677e-04),
'incorrect value for averageSparsityOHE')
Test.assertTrue(np.allclose(averageSparsityHash, 1.1805561e-03),
'incorrect value for averageSparsityHash')
stepSizes
de 1 e 10 e regParams
de 1e-6 e 1e-3.
In [ ]:
numIters = 500
regType = 'l2'
includeIntercept = True
# Initialize variables using values from initial model training
bestModel = None
bestLogLoss = 1e10
In [ ]:
# EXERCICIO
stepSizes = [1, 10]
regParams = [1e-6, 1e-3]
for stepSize in stepSizes:
for regParam in regParams:
model = (<COMPLETAR>)
logLossVa = <COMPLETAR>
print ('\tstepSize = {0:.1f}, regParam = {1:.0e}: logloss = {2:.3f}'
.format(stepSize, regParam, logLossVa))
if (logLossVa < bestLogLoss):
bestModel = model
bestLogLoss = logLossVa
print ('Hashed Features Validation Logloss:\n\tBaseline = {0:.3f}\n\tLogReg = {1:.3f}'
.format(logLossValBase, bestLogLoss))
In [ ]:
# TEST Logistic model with hashed features (5d)
Test.assertTrue(np.allclose(bestLogLoss, 0.4481683608), 'incorrect value for bestLogLoss')
In [ ]:
# EXERCICIO
# Log loss for the best model from (5d)
logLossValLR0 = <COMPLETAR>
logLossTest = <COMPLETAR>
# Log loss for the baseline model
logLossTestBaseline = hashTestData.map(lambda lp: computeLogLoss(classOneFracTrain,lp.label)).mean()
print ('Hashed Features Test Log Loss:\n\tBaseline = {0:.3f}\n\tLogReg = {1:.3f}'
.format(logLossTestBaseline, logLossTest))
In [ ]:
# TEST Evaluate on the test set (5e)
Test.assertTrue(np.allclose(logLossTestBaseline, 0.537438),
'incorrect value for logLossTestBaseline')
Test.assertTrue(np.allclose(logLossTest, 0.455616931), 'incorrect value for logLossTest')
In [ ]: