In [1]:
from pymongo import MongoClient
In [2]:
Sections = MongoClient('localhost').Stage_database.Stage_Sections
In [139]:
Sections.find({'type': 'move'}).count()
Out[139]:
In [4]:
import numpy as np
import scipy as sp
In [5]:
confirmedSections = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}]})
In [6]:
walkSections = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': 1}]})
In [7]:
modeList = []
for mode in MongoClient('localhost').Stage_database.Stage_Modes.find():
    modeList.append(mode)
    print mode
In [8]:
print Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}]}).count()
for mode in modeList:
    print "%s: %s" % (mode['mode_name'], Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': mode['mode_id']}]}).count())
In [13]:
from featurecalc import calDistance, calSpeed, calHeading, calAvgSpeed, calSpeeds, calAccels, getIthMaxSpeed, getIthMaxAccel, calHCR,\
calSR, calVCR, mode_cluster, mode_start_end_coverage, cluster_route_match_score,transit_route_match_score
In [14]:
def getSpeedsForMode(modeId):
    modeSectionCursor = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': modeId}]})
    speedList = []
    for section in modeSectionCursor:
        speeds = calSpeeds(section)
        if speeds is not None:
            # currHistogram = sp.histogram(speeds)
            speedList.append(speeds)
    return speedList
In [15]:
def showHists(speedList):
    # print histograms
    # round up so that a partial last row of histograms is not dropped
    nRows = max(1, (len(speedList) + 9) / 10)
    errCmpFig, axesMatrix = plt.subplots(nRows, 10, figsize=(25, nRows * 2))
    # print axesMatrix.shape, axesMatrix.flatten().shape
    axesVector = axesMatrix.flatten()
    for (i, axis) in enumerate(axesVector):
        # print("Plotting values %s for bins %s" % (histograms[i][0].shape, histograms[i][1].shape))
        if i < len(speedList):
            axis.hist(speedList[i])
            # axis.plot(histograms[i][1][:-1], histograms[i][0])
In [16]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
%config InlineBackend.figure_format='png'
In [17]:
walkSpeeds = getSpeedsForMode(1)
In [19]:
print len(walkSpeeds)
showHists(walkSpeeds[0:100])
In [20]:
bikeSpeeds = getSpeedsForMode(3)
In [21]:
print len(bikeSpeeds)
showHists(bikeSpeeds[0:100])
In [22]:
busSpeeds = getSpeedsForMode(5)
In [23]:
print len(busSpeeds)
showHists(busSpeeds[0:100])
In [24]:
trainSpeeds = getSpeedsForMode(6)
In [25]:
print len(trainSpeeds)
showHists(trainSpeeds[0:100])
In [216]:
carSpeeds = getSpeedsForMode(7)
In [217]:
print len(carSpeeds)
showHists(carSpeeds[0:100])
In [13]:
userIds = Sections.distinct("user_id")
for userId in userIds:
    print userId, Sections.find({"user_id": userId}).count()
Let's pick the user id with the largest number of sections.
In [79]:
userIdCounts = []
for userId in userIds:
    userIdCounts.append((userId, Sections.find({"user_id": userId}).count()))
sortedUserIds = sorted(userIdCounts, key=lambda k:k[1])
print sortedUserIds
maxUID = sortedUserIds[-1][0]
print maxUID, type(maxUID)
In [ ]:
def getSpeedsForModeAndUser(modeId, userId):
    print "userID = %s" % userId
    modeSectionCursor = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': modeId}, {'user_id': userId}]})
    print "Number of matches = %s" % modeSectionCursor.count()
    speedList = []
    for section in modeSectionCursor:
        speeds = calSpeeds(section)
        if speeds is not None:
            # currHistogram = sp.histogram(speeds)
            speedList.append(speeds)
    return speedList
In [ ]:
print maxUID
userWalkSpeeds = getSpeedsForModeAndUser(1, maxUID)
In [222]:
print len(userWalkSpeeds)
showHists(userWalkSpeeds[0:100])
In [223]:
userBikeSpeeds = getSpeedsForModeAndUser(3, maxUID)
In [224]:
print len(userBikeSpeeds)
showHists(userBikeSpeeds[0:10])
In [225]:
userCarSpeeds = getSpeedsForModeAndUser(7, maxUID)
In [226]:
print len(userCarSpeeds)
showHists(userCarSpeeds[0:10])
In [123]:
# Features are:
# 0. distance
# 1. duration
# 2. first filter mode
# 3. sectionId
# 4. avg speed
# 5. speed EV
# 6. speed variance
# 7. max speed
# 8. max accel
# 9. isCommute
# 10. heading change rate
# 11. stop rate
# 12. velocity change rate
# 13. start lat
# 14. start lng
# 15. stop lat
# 16. stop lng
# 17. start hour
# 18. end hour
# 19. both start and end close to bus stop
# 20. both start and end close to train station
# 21-28. routematching features
featureLabels = ["distance", "duration", "first filter mode", "sectionId", "avg speed",
"speed EV", "speed variance", "max speed", "max accel", "isCommute",
"heading change rate", "stop rate", "velocity change rate", "start lat", "start lng",
"stop lat", "stop lng", "start hour", "end hour", "close to bus stop", "close to train stop",
"walking","running","cycling","transport","bus","train","car","mixed"]
bus_cluster=mode_cluster(5,105,1)
train_cluster=mode_cluster(6,600,1)
def generateFeatureMatrixAndResultVector(sectionQuery):
    confirmedSections = Sections.find(sectionQuery)
    featureMatrix = np.zeros([confirmedSections.count(), len(featureLabels)])
    resultVector = np.zeros(confirmedSections.count())
    for (i, section) in enumerate(confirmedSections):
        featureMatrix[i, 0] = section['distance']
        featureMatrix[i, 1] = (section['section_end_datetime'] - section['section_start_datetime']).total_seconds()
        featureMatrix[i, 2] = section['mode']
        featureMatrix[i, 3] = section['section_id']
        featureMatrix[i, 4] = calAvgSpeed(section)
        speeds = calSpeeds(section)
        if speeds is not None:
            featureMatrix[i, 5] = np.mean(speeds)
            featureMatrix[i, 6] = np.std(speeds)
            featureMatrix[i, 7] = np.max(speeds)
        else:
            # They will remain zero
            pass
        accels = calAccels(section)
        if accels is not None and len(accels) > 0:
            featureMatrix[i, 8] = np.max(accels)
        else:
            # They will remain zero
            pass
        featureMatrix[i, 9] = ('commute' in section) and (section['commute'] == 'to' or section['commute'] == 'from')
        featureMatrix[i, 10] = calHCR(section)
        featureMatrix[i, 11] = calSR(section)
        featureMatrix[i, 12] = calVCR(section)
        if section['section_start_point'] is not None:
            startCoords = section['section_start_point']['coordinates']
            featureMatrix[i, 13] = startCoords[0]
            featureMatrix[i, 14] = startCoords[1]
        if section['section_end_point'] is not None:
            endCoords = section['section_end_point']['coordinates']
            featureMatrix[i, 15] = endCoords[0]
            featureMatrix[i, 16] = endCoords[1]
        featureMatrix[i, 17] = section['section_start_datetime'].time().hour
        featureMatrix[i, 18] = section['section_end_datetime'].time().hour
        ## try new bus matching
        featureMatrix[i, 19] = mode_start_end_coverage(section, bus_cluster, 105)
        featureMatrix[i, 20] = mode_start_end_coverage(section, train_cluster, 600)
        # TransitMap = transit_route_match_score(section, 100000, 100000, 'lcs', 500, 0.7)
        # featureMatrix[i, 20] = max(0, TransitMap['Bart'], TransitMap['CalTrain'])
        ## RouteMatching feature
        ModeCluster = cluster_route_match_score(section, step1=100000, step2=100000, method='DTW', radius1=2000, threshold=1000)
        # print(ModeCluster)
        for modeInd in range(len(modeList)):
            # print(modeInd)
            # print(modeList[modeInd]['mode_id'])
            featureMatrix[i, 21+modeInd] = ModeCluster[modeList[modeInd]['mode_id']]
        resultVector[i] = section['confirmed_mode']
    return (featureMatrix, resultVector)
In [124]:
from uuid import UUID
uuid_shankari=UUID('b0d937d0-70ef-305e-9563-440369012b39')
uuid_zeshi=UUID('9aad7d1b-aa58-3464-9cca-beee37c55d93')
uuid_shankari_hus=UUID('3a307244-ecf1-3e6e-a9a7-3aaf101b40fa')
(featureMatrix, resultVector) = generateFeatureMatrixAndResultVector({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}},
{'user_id':uuid_shankari}]})
# (featureMatrix, resultVector) = generateFeatureMatrixAndResultVector({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}]})
In [20]:
print(np.max(featureMatrix[:,10]))
print(np.max(featureMatrix[:,20]))
print(np.mean(featureMatrix[:,20]))
print(np.max(featureMatrix[:,12]))
print featureMatrix.shape, resultVector.shape
print(np.unique(resultVector))
In [21]:
runIndices = resultVector == 2
transportIndices = resultVector == 4
mixedIndices = resultVector == 8
strippedIndices = np.logical_not(runIndices | transportIndices | mixedIndices)
print np.nonzero(runIndices), np.nonzero(transportIndices), np.nonzero(mixedIndices), np.count_nonzero(strippedIndices)
Now, we filter out "mixed" and "running", since there are few instances of them and we don't intend to predict them initially. We also filter out any "transport" since it should never be in the confirmed set, and we don't want to deal with it if it is.
In [22]:
strippedFeatureMatrix = featureMatrix[strippedIndices]
strippedResultVector = resultVector[strippedIndices]
First, we visualize the distributions of some of the features, so that we can compare our dataset to Zheng et al. 2010.
In [23]:
def plotFeatureVector(featureMatrix, resultVector, featureIndex, modeList):
    avgSpeedFig, avgSpeedAxes = plt.subplots(1, 1, figsize=(12,10))
    currModeSpeedsList = []
    currModeNamesList = []
    for mode in modeList:
        currModeMask = resultVector == mode['mode_id']
        currModeSpeeds = featureMatrix[currModeMask, featureIndex]
        # print "For mode %s, shape is %s" % (mode['mode_id'], str(currModeSpeeds.shape))
        if np.count_nonzero(currModeMask) != 0:
            currModeNamesList.append(mode['mode_name'])
            currModeSpeedsList.append(currModeSpeeds)
    # with normed=True, the y axis is a density rather than a raw count
    avgSpeedAxes.hist(currModeSpeedsList, normed=True, histtype="bar", label=currModeNamesList)
    avgSpeedAxes.set_ylabel("density of segments")
    avgSpeedAxes.set_xlabel(featureLabels[featureIndex])
    plt.legend()
In [24]:
for col in range(0, len(featureLabels)):
    plotFeatureVector(strippedFeatureMatrix, strippedResultVector, col, modeList)
In spite of stripping out those modes, we see that there are clear outliers. These are almost certainly mis-classified trips, because the distance and speed are both very large while the mode is walking. Let's manually filter out these outliers.
In [25]:
distanceOutliers = strippedFeatureMatrix[:,0] > 500000
speedOutliers = strippedFeatureMatrix[:,4] > 100
speedMeanOutliers = strippedFeatureMatrix[:,5] > 80
speedVarianceOutliers = strippedFeatureMatrix[:,6] > 70
maxSpeedOutliers = strippedFeatureMatrix[:,7] > 160
print np.nonzero(distanceOutliers), np.nonzero(speedOutliers), \
np.nonzero(speedMeanOutliers), np.nonzero(speedVarianceOutliers), \
np.nonzero(maxSpeedOutliers)
nonOutlierIndices = np.logical_not(distanceOutliers | speedOutliers | speedMeanOutliers | speedVarianceOutliers | maxSpeedOutliers)
print nonOutlierIndices.shape
In [26]:
cleanedFeatureMatrix = strippedFeatureMatrix[nonOutlierIndices]
cleanedResultVector = strippedResultVector[nonOutlierIndices]
print(cleanedResultVector.shape)
In [27]:
for col in range(0, 10):
    plotFeatureVector(cleanedFeatureMatrix, cleanedResultVector, col, modeList)
Using the graphs above, we can estimate the separability of our input. Clearly, there is some separability: the car and train trips at 20-30+ mph are clearly separable from the walk/bike trips at lower speeds. But are they separable from each other? Eyeballing the data, at least 75% of car trips are actually not that fast: the speed EV is < 10 mph. Even for max speed, at least 25% of car trips appear to have a max speed of ~10 mph. Max accel doesn't seem to have as much predictive power as one might hope, since most max accel values cluster below 5. It would be nice to visualize the clusters in this data, but I'm just going to start trying decision trees and SVMs on this data now.
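Still, as a minimal sketch of such a cluster view, here is distance scattered against speed EV, colored by confirmed mode id:
In [ ]:
# quick cluster peek: feature 0 (distance) vs. feature 5 (speed EV),
# with point color indicating the confirmed mode id
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
sc = ax.scatter(cleanedFeatureMatrix[:,0], cleanedFeatureMatrix[:,5], c=cleanedResultVector)
ax.set_xlabel(featureLabels[0])
ax.set_ylabel(featureLabels[5])
plt.colorbar(sc)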
In [28]:
genericFeatureIndices = list(xrange(0,10))
AdvancedFeatureIndices = list(xrange(10,13))
LocationFeatureIndices = list(xrange(13,17))
TimeFeatureIndices = list(xrange(17,19))
BusTrainFeatureIndices = list(xrange(19,21))
RouteMatchingFeatureIndices = list(xrange(21,29))
print genericFeatureIndices
print AdvancedFeatureIndices
print LocationFeatureIndices
print TimeFeatureIndices
print BusTrainFeatureIndices
print RouteMatchingFeatureIndices
In [61]:
genericCleanedFM = cleanedFeatureMatrix[:,genericFeatureIndices]
print genericCleanedFM.shape
In [62]:
from sklearn import cross_validation
from sklearn import svm
In [63]:
svmClf = svm.LinearSVC()
svmScores = cross_validation.cross_val_score(svmClf, genericCleanedFM, cleanedResultVector, cv=5)
In [64]:
print svmScores
print svmScores.mean()
Using svm.SVC() takes significantly longer (hours instead of seconds) but generates higher accuracy. The accuracy is still lower than the random forest's, though.
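For reference, a minimal sketch of that svm.SVC() comparison (left unexecuted here, given the multi-hour runtime):
In [ ]:
# RBF-kernel SVC, cross-validated the same way as the LinearSVC above;
# expect a multi-hour runtime on this dataset
rbfSvmClf = svm.SVC()
rbfSvmScores = cross_validation.cross_val_score(rbfSvmClf, genericCleanedFM, cleanedResultVector, cv=5)
print rbfSvmScores
print rbfSvmScores.mean()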
In [65]:
from sklearn import ensemble
In [66]:
forestClf = ensemble.RandomForestClassifier()
forestScores = cross_validation.cross_val_score(forestClf, genericCleanedFM, cleanedResultVector, cv=5)
In [67]:
print forestScores
print forestScores.mean()
These results look pretty good, and largely parallel what the Zheng paper reported, even with just the basic features. We get 82% average accuracy for a linear SVM and 86% average accuracy for a random forest. But the 82% and 86% values are from cross validation, where we have a known value that we can validate against.
But what we really want to do is to decide, while looking at a section that we have no ground truth on, whether we want the user to classify it or not. And then we want to see, for the high confidence predictions that we will not prompt the user for, how accurate our classification really is.
In order to do this, we get the probabilities for each prediction in addition to the prediction itself. We can then test the accuracy of the high confidence predictions and compare it to the accuracy of all predictions.
To recap, we now return three metrics: the percentage of sections that were auto-classified (i.e., predicted above the confidence threshold), the accuracy on only those high confidence predictions, and the accuracy on all predictions.
In [68]:
# Generate folds of indices
def generateFoldArrays(nIndices, nFolds):
    currPermutation = np.random.permutation(nIndices)
    currPermutationParts = np.array_split(currPermutation, nFolds)
    foldArrays = []
    for i in range(0, nFolds):
        testIndices = currPermutationParts[i]
        trainIndicesParts = [currPermutationPart for (j, currPermutationPart) in enumerate(currPermutationParts) if j != i]
        trainIndices = np.concatenate(trainIndicesParts)
        foldArrays.append((trainIndices, testIndices))
    return foldArrays

def kFoldValidationWithProb(algo, X, y, nFolds, prob_threshold):
    foldArrays = generateFoldArrays(len(y), nFolds)
    scores = []
    highConfidenceScores = []
    percentAutoClassified = []
    percentAutoClassifiedByMode = []
    for (trainIndices, testIndices) in foldArrays:
        # print testIndices[0]
        model = algo.fit(X[trainIndices], y[trainIndices])
        testX = X[testIndices]
        testy = y[testIndices]
        predictedY = model.predict(testX)
        if hasattr(algo, "decision_function"):
            predictedYProb = algo.decision_function(testX)
        else:
            predictedYProb = algo.predict_proba(testX)
        # print ("predictedY.shape = %s, predictedYProb.shape = %s" %
        #        (str(predictedY.shape), str(predictedYProb.shape)))
        # As we can see below, we take the max confidence along the class axis
        highConfidencePredictions = np.max(predictedYProb, 1) > prob_threshold
        print "Found %s high confidence predictions out of %s" % (np.count_nonzero(highConfidencePredictions),
                                                                  len(testIndices))
        cmc = lambda m: np.count_nonzero(testy[highConfidencePredictions] == m)
        # Let us see how many of each mode were autoclassified
        # print("Autoclassifications split by confirmed modes: walk: %s, bike: %s, bus: %s, train: %s, car: %s" %
        #       (cmc(1), cmc(3), cmc(5), cmc(6), cmc(7)))
        pcmc = lambda m: float(np.count_nonzero(testy[highConfidencePredictions] == m))/np.count_nonzero(testy == m) if (np.count_nonzero(testy == m) != 0) else 0
        # Let us see what percentage of each mode was autoclassified
        # print("For threshold %s, autoclassifications split by confirmed mode percents: walk: %s, bike: %s, bus: %s, train: %s, car: %s" %
        #       (prob_threshold, pcmc(1), pcmc(3), pcmc(5), pcmc(6), pcmc(7)))
        percentAutoClassified.append(float(np.count_nonzero(highConfidencePredictions))/len(testIndices))
        percentAutoClassifiedByMode.append([pcmc(1), pcmc(3), pcmc(5), pcmc(6), pcmc(7)])
        # so now we are going to generate two scores.
        # the first is the score on only the high confidence predictions
        highConfidenceScore = model.score(testX[highConfidencePredictions], testy[highConfidencePredictions])
        highConfidenceScores.append(highConfidenceScore)
        score = model.score(X[testIndices], y[testIndices])
        scores.append(score)
    # print scores
    print("for prob %s, percentage auto classified %s" % (prob_threshold, np.array(percentAutoClassified).mean()))
    print("for prob %s, scoring only on high confidence predictions %s" % (prob_threshold, np.array(highConfidenceScores).mean()))
    print("for prob %s, scoring on all predictions %s" % (prob_threshold, np.array(scores).mean()))
    return (np.array(percentAutoClassified), np.array(percentAutoClassifiedByMode), np.array(highConfidenceScores), np.array(scores))
In [69]:
def exploreKFoldValidationSpace(algo, X, y, nFolds):
    (pac0, pacm0, hcs0, s0) = kFoldValidationWithProb(algo, X, y, nFolds, 0.90)
    (pac5, pacm5, hcs5, s5) = kFoldValidationWithProb(algo, X, y, nFolds, 0.95)
    (pac9, pacm9, hcs9, s9) = kFoldValidationWithProb(algo, X, y, nFolds, 0.99)
    probs = [0.90, 0.95, 0.99]
    pacs = [pac0.mean(), pac5.mean(), pac9.mean()]
    hcs = [hcs0.mean(), hcs5.mean(), hcs9.mean()]
    ss = [s0.mean(), s5.mean(), s9.mean()]
    pacmWalk = [pacm0[:,0].mean(), pacm5[:,0].mean(), pacm9[:,0].mean()]
    pacmBike = [pacm0[:,1].mean(), pacm5[:,1].mean(), pacm9[:,1].mean()]
    pacmBus = [pacm0[:,2].mean(), pacm5[:,2].mean(), pacm9[:,2].mean()]
    pacmTrain = [pacm0[:,3].mean(), pacm5[:,3].mean(), pacm9[:,3].mean()]
    pacmCar = [pacm0[:,4].mean(), pacm5[:,4].mean(), pacm9[:,4].mean()]
    fig, axes = plt.subplots(1, 1, figsize=(15, 10))
    print pacs
    axes.set_yticks(np.arange(0, 1, 0.1))
    axes.plot(probs, pacs, label="percentage auto classified")
    print pacmWalk
    axes.plot(probs, pacmWalk, linewidth=5, label="percent walk auto classified")
    print pacmBike
    axes.plot(probs, pacmBike, label="percent bike auto classified")
    print pacmBus
    axes.plot(probs, pacmBus, linewidth=5, label="percent bus auto classified")
    print pacmTrain
    axes.plot(probs, pacmTrain, label="percent train auto classified")
    print pacmCar
    axes.plot(probs, pacmCar, linewidth=5, label="percent car auto classified")
    print hcs
    axes.plot(probs, hcs, label="accuracy of high confidence samples")
    print ss
    axes.plot(probs, ss, linewidth=5, label="accuracy of all samples")
    plt.legend(loc='best')
In [70]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, genericCleanedFM, cleanedResultVector, 5)
The results of these three metrics for confidence thresholds of 90%, 95% and 99% are shown above, and they are all largely similar. The accuracy of the high confidence predictions is, as expected, very high at 97-98%. However, we were only able to auto-classify ~50% of the sections. Now, let's retry using the linear SVM from above.
In [71]:
svmClf = svm.LinearSVC()
exploreKFoldValidationSpace(svmClf, genericCleanedFM, cleanedResultVector, 5)
We see that the SVM is able to auto-classify more trips than the random forest, but at the cost of unacceptably lower accuracy on the high confidence predictions.
We now look at the random forest's parameters and feature importances so that we can better understand what it is doing.
In [72]:
forestClf.get_params()
Out[72]:
In [73]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance
So the highest importance features can be read off the printout above, although it is unsorted.
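To rank them explicitly, a small sketch (using only the forest already fit above):
In [ ]:
# sort (label, importance) pairs by descending importance
sortedImportances = sorted(zip(featureLabels, forestClf.feature_importances_),
                           key=lambda p: p[1], reverse=True)
for (label, importance) in sortedImportances:
    print label, importance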
Now, let's try another non-parametric method: nearest neighbors.
In [74]:
from sklearn import neighbors
In [75]:
knnClf = neighbors.KNeighborsClassifier()
In [76]:
exploreKFoldValidationSpace(knnClf, cleanedFeatureMatrix, cleanedResultVector, 5)
knn does almost the same as the random forest, except that the accuracy of the high confidence predictions is a bit lower. The percentages are around the same as well. Basically, we can classify walk pretty well and the others pretty poorly, so I am not sure what we are adding here over Moves :)
I'm surprised at the low prediction rate for cycling. Moves seems to get that pretty accurately for me.
I'm now going to plot this data and see what it looks like.
In [77]:
Advanced_indices=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18]
print(Advanced_indices)
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,Advanced_indices], cleanedResultVector, 5)
In [78]:
Spatial_indices=[0,1,2,3,4,5,6,7,8,9,13,14,15,16,17,18,19,20]
print(Spatial_indices)
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,Spatial_indices], cleanedResultVector, 5)
In [79]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance
In [80]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix, cleanedResultVector, 5)
In [81]:
knnClf = neighbors.KNeighborsClassifier()
exploreKFoldValidationSpace(knnClf, cleanedFeatureMatrix, cleanedResultVector, 5)
In [82]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance
In [83]:
from matplotlib import colors
import itertools
In [84]:
def printColorMap(algo, Xall, y):
    # we want to split each axis into roughly 10-20 cells
    nSplits = 20
    # setup parameters
    cmap_light = colors.ListedColormap(['#FAAAAA', '#AFAAAA', '#AAFAAA', '#AAAFAA', '#AAAAFA', '#AAAAAF'])
    cmap_bold = colors.ListedColormap(['#F00000', '#0F0000', '#00F000', '#000F00', '#0000F0', '#00000F'])
    # nFeatures = Xall.shape[1]
    nFeatures = 10
    fig, axes = plt.subplots(20, 5, figsize=(15, 50))
    plt.tight_layout()
    axesArr = axes.flatten()
    i = 0
    for selCombo in itertools.product(np.arange(nFeatures), np.arange(nFeatures)):
        if selCombo[0] == selCombo[1]:
            continue
        # print("Generating grid for combo %s,%s in slot %s" % (featureLabels[selCombo[0]], featureLabels[selCombo[1]], i))
        selMask = np.zeros(Xall.shape[1])
        # Otherwise, we won't be able to plot it properly below
        assert(len(selCombo) == 2)
        selMask[selCombo[0]] = 1
        selMask[selCombo[1]] = 1
        X = Xall[:, selMask == 1]
        algo.fit(X, y)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        # step sizes that split each axis into nSplits cells
        h_x = float(x_max - x_min) / nSplits
        h_y = float(y_max - y_min) / nSplits
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x),
                             np.arange(y_min, y_max, h_y))
        Z = algo.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        axesArr[i].pcolormesh(xx, yy, Z, cmap=cmap_light)
        # Plot also the training points
        axesArr[i].scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        # plt.scatter(X[:, 0], X[:, 1], c=y)
        axesArr[i].set_xlim(xx.min(), xx.max())
        axesArr[i].set_ylim(yy.min(), yy.max())
        axesArr[i].set_title("%s v/s %s" % (featureLabels[selCombo[0]], featureLabels[selCombo[1]]))
        # axesArr[i].legend(loc='best')
        i = i + 1
In [85]:
printColorMap(forestClf, cleanedFeatureMatrix, cleanedResultVector)
Let us also quickly take a look at the confusion matrix for the overall model, since perhaps we should not gate on the confidence of the predictions at all, and instead just weight the low confidence predictions lower.
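As a minimal sketch of that weighting idea (illustrative only; none of the analysis below uses it):
In [ ]:
# Sketch: instead of thresholding, carry each prediction's confidence along
# as a weight. For illustration only: fitting and predicting on the same
# data overstates the confidences.
weightClf = ensemble.RandomForestClassifier().fit(genericCleanedFM, cleanedResultVector)
predWeights = np.max(weightClf.predict_proba(genericCleanedFM), axis=1)
print "mean prediction weight = %s" % predWeights.mean()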
In [86]:
from sklearn import metrics
In [87]:
def printConfusionMatrix(algo, X, y):
    skf = cross_validation.StratifiedKFold(y, 5)
    confusize = len(np.unique(y))
    # print(confusize)
    sumPCM = np.zeros([confusize, confusize])
    for train, test in skf:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        y_pred = algo.fit(X_train, y_train).predict(X_test)
        cm = metrics.confusion_matrix(y_test, y_pred)
        # print(cm.shape)
        # normalize each row by its true-label count to get percentages
        sumArr = np.sum(cm, axis=1)
        repeatedSumArr = np.repeat(sumArr, cm.shape[1]).reshape(cm.shape)
        sumPCM = np.add(sumPCM, np.divide(cm.astype(float), repeatedSumArr))
    finalPCM = sumPCM / 5
    print(finalPCM)
    # Show confusion matrix in a separate window
    plt.matshow(finalPCM)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
In [88]:
forestClf = ensemble.RandomForestClassifier()
# print(np.unique(cleanedResultVector))
printConfusionMatrix(forestClf, genericCleanedFM, cleanedResultVector)
In [89]:
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,Spatial_indices], cleanedResultVector)
In [90]:
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, cleanedFeatureMatrix, cleanedResultVector)
Adding start and end points does improve the accuracy for bus and train. Train trips, in particular, are significantly improved.
In [91]:
knnClf = neighbors.KNeighborsClassifier()
printConfusionMatrix(knnClf, genericCleanedFM, cleanedResultVector)
In [92]:
knnClf = neighbors.KNeighborsClassifier()
printConfusionMatrix(knnClf, cleanedFeatureMatrix, cleanedResultVector)
knn does significantly worse, primarily because of bus trips. I suspect this is because different people make the same trip using different modes. Time for per-user models?
As we can see, the prediction rate is best for walk and bike, which are the modes for which we get the most data from Moves. It may be a mistake to use the same model for both types of trips, because Moves does a good job for walk/bike and a horrible job for transport, since we don't allow users to specify 'transport' in the output.
These also have zero carbon footprint. Let us see how well we do on the motorized trips alone.
In [93]:
transportTrips = cleanedFeatureMatrix[:,2] == 4
print np.count_nonzero(transportTrips)
In [94]:
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, genericCleanedFM[transportTrips], cleanedResultVector[transportTrips])
In [95]:
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, cleanedFeatureMatrix[transportTrips], cleanedResultVector[transportTrips])
In [96]:
knnClf = neighbors.KNeighborsClassifier()
printConfusionMatrix(knnClf, genericCleanedFM[transportTrips], cleanedResultVector[transportTrips])
In [97]:
knnClf = neighbors.KNeighborsClassifier()
printConfusionMatrix(knnClf, cleanedFeatureMatrix[transportTrips], cleanedResultVector[transportTrips])
As we can see, we are actually able to predict car trips with a fair degree of accuracy, but bus and train trips are pretty much a tossup. Ignore the entries for 0 and 1 above: since we stripped out all walk and bike trips, those are only the trips that Moves misclassified, not the entire dataset. Now we know why the Zheng paper only attempted to distinguish between bus and car trips, and not bus, train and car. The new features helped in the random forest case, but not by much, and did not help at all in the knn case.
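To check that framing directly, a quick sketch that restricts to bus (mode 5) and car (mode 7) sections only:
In [ ]:
# Zheng-style two-way comparison: bus vs. car only
busCarMask = (cleanedResultVector == 5) | (cleanedResultVector == 7)
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, cleanedFeatureMatrix[busCarMask], cleanedResultVector[busCarMask])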
In [98]:
def getUserModelComparison(isTransportOnly):
    userIds = Sections.distinct("user_id")
    # I'm not going to bother with testing against only the generic features
    # because the main issue here is personalization
    numberOfSections = []
    percentAutoClassified = []
    percentAutoClassifiedWalk = []
    percentAutoClassifiedBike = []
    percentAutoClassifiedBus = []
    percentAutoClassifiedTrain = []
    percentAutoClassifiedCar = []
    autoClassifiedAccuracy = []
    overallAccuracy = []
    labels = ["Number of sections", "% autoclassified", "% auto classified walk",
              "% auto classified bike", "% auto classified bus",
              "% auto classified train", "% auto classified car",
              "auto classified accuracy", "overall accuracy"]
    for userId in userIds:
        # random forest with all features
        if not isTransportOnly:
            query = {"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'user_id': userId}]}
        else:
            query = {"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'mode': 4}, {'user_id': userId}]}
        (userFeatureMatrix, userResultVector) = generateFeatureMatrixAndResultVector(query)
        # we only focus on users who have enough history with us
        if len(userResultVector) < 50:
            print("Skipping user with userId %s who has only %s confirmed sections" % (userId, len(userResultVector)))
            continue
        forestClf = ensemble.RandomForestClassifier()
        # printConfusionMatrix(forestClf, userFeatureMatrix, userResultVector)
        (pac5, pacm5, hcs5, s5) = kFoldValidationWithProb(forestClf, userFeatureMatrix, userResultVector, 5, 0.95)
        numberOfSections.append(len(userResultVector))
        percentAutoClassified.append(pac5.mean())
        # pacm5 has shape (nFolds, nModes), so average each mode column over the folds
        percentAutoClassifiedWalk.append(pacm5[:,0].mean())
        percentAutoClassifiedBike.append(pacm5[:,1].mean())
        percentAutoClassifiedBus.append(pacm5[:,2].mean())
        percentAutoClassifiedTrain.append(pacm5[:,3].mean())
        percentAutoClassifiedCar.append(pacm5[:,4].mean())
        autoClassifiedAccuracy.append(hcs5.mean())
        overallAccuracy.append(s5.mean())
    resultArray = np.array([numberOfSections, percentAutoClassified, percentAutoClassifiedWalk,
                            percentAutoClassifiedBike, percentAutoClassifiedBus, percentAutoClassifiedTrain,
                            percentAutoClassifiedCar, autoClassifiedAccuracy, overallAccuracy])
    print resultArray.shape
    return (resultArray, labels)
In [99]:
def displayUserVariation(ra, labels):
    ''' ra has rows = plots and cols = users
    '''
    fig, (axes, axesNum) = plt.subplots(2, 1, figsize=(25, 25))
    nUsers = ra.shape[1]
    for i in [1, -2, -1]:
        # each row is one plot
        print ra[i]
        axes.plot(np.arange(nUsers), ra[i], label=labels[i])
    axes.legend(loc='best')
    for i in [0]:
        # each row is one plot
        print ra[i]
        axesNum.plot(np.arange(nUsers), ra[i], label=labels[i])
    axesNum.legend(loc='best')
In [100]:
(userResultArray, labels) = getUserModelComparison(isTransportOnly=False)
In [101]:
displayUserVariation(userResultArray, labels)
So there's quite a bit of variability in both the overall accuracy and the number of trips per user, and the two don't seem to be correlated. We get some fairly uneven improvement: for some users, the general classification accuracy is over 90%, and we are also able to auto-classify over 80% of the trips for some users.
But that might just be due to a higher ratio of walk trips, which are classified more accurately. I can explore this only for transport, but first, I'm going to try to build a gesture library and build the associated features. Then maybe Mogeng can continue some of the exploration.
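As a quick check of that hypothesis, a sketch of the walk fraction among each user's confirmed sections (reusing the 50-section cutoff):
In [ ]:
# fraction of each user's confirmed sections that are walk (mode_id 1)
for userId in Sections.distinct("user_id"):
    total = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'user_id': userId}]}).count()
    walk = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': 1}, {'user_id': userId}]}).count()
    if total >= 50:
        print userId, float(walk)/total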
In [102]:
(userResultArrayTransOnly, labelsTransOnly) = getUserModelComparison(isTransportOnly=True)
In [103]:
displayUserVariation(userResultArrayTransOnly, labelsTransOnly)
So looking at transport-only trips, and focusing on users with enough transport history (50+ motorized transport trips), we are able to get an overall accuracy of around 70-80% even for the motorized trips. However, there are some clear outliers, like the one user with only 60% accuracy. Also, because our current threshold for high confidence is set so high, the high confidence predictions are > 95% correct, as before. We have to decide what to use.
We can autoclassify 20-50% of the motorized transport trips. In general, this is related to the number of trips: there is a very clear spike in the data for user 4. But the correlation is not exact. In particular, user 5 has > 50 trips, but only ~10% autoclassified trips.
It might be worthwhile to take a closer look at these 6 users, see what their transport trips look like, and get a sense of what the difference between user 4 and user 5 is, for example. This might help us figure out how to build better user models.
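As a starting point for that closer look, a sketch that prints the confirmed-mode breakdown of each user's motorized sections (again reusing the 50-section cutoff from getUserModelComparison):
In [ ]:
for userId in Sections.distinct("user_id"):
    transportQuery = {"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'mode': 4}, {'user_id': userId}]}
    if Sections.find(transportQuery).count() < 50:
        continue
    print userId
    for mode in modeList:
        count = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': mode['mode_id']}, {'mode': 4}, {'user_id': userId}]}).count()
        if count > 0:
            print "  %s: %s" % (mode['mode_name'], count)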
In [104]:
def buildRouteLibrary(userId, threshold):
    '''
    Here we attempt to build a route library for each user.
    Then, the probability of the top match can be a factor in our machine learning.
    Let us just start with the start and end points instead of a full dynamic time warp.
    The body below is parked inside the docstring until RouteLibrary exists:

    userSections = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'user_id': userId}]})
    existingRoutes = RouteLibrary()
    for section in userSections:
        existingRoutes.update(section)
    return existingRoutes
    '''
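Since RouteLibrary does not exist yet, here is a minimal sketch of what it might look like, matching routes purely on start/end proximity. The class itself, the 100m default threshold, and the assumption that calDistance(p1, p2) (imported from featurecalc above) returns the distance in meters between two coordinate pairs are all assumptions of this sketch:
In [ ]:
class RouteLibrary:
    '''Toy route library sketch: a "route" is just a (start, end) coordinate
       pair, and a section matches a stored route if both of its endpoints
       fall within `threshold` (assumed meters, via calDistance) of it.'''
    def __init__(self, threshold=100):
        self.threshold = threshold
        self.routes = []  # list of [startCoords, endCoords, count]
    def update(self, section):
        if section['section_start_point'] is None or section['section_end_point'] is None:
            return
        start = section['section_start_point']['coordinates']
        end = section['section_end_point']['coordinates']
        for route in self.routes:
            if (calDistance(route[0], start) < self.threshold and
                calDistance(route[1], end) < self.threshold):
                route[2] = route[2] + 1
                return
        self.routes.append([start, end, 1])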
In [126]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices], cleanedResultVector)
In [106]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance
In [127]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices], cleanedResultVector)
In [108]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance
In [131]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+BusTrainFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+BusTrainFeatureIndices], cleanedResultVector)
In [110]:
print(genericFeatureIndices+BusTrainFeatureIndices)
forestClf.feature_importances_
Out[110]:
In [111]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance
In [132]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices], cleanedResultVector)
In [113]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance
In [134]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
+LocationFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
+LocationFeatureIndices], cleanedResultVector)
In [137]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
+LocationFeatureIndices+TimeFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
+LocationFeatureIndices+TimeFeatureIndices], cleanedResultVector)
In [51]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance
In [138]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
+LocationFeatureIndices+TimeFeatureIndices+RouteMatchingFeatureIndices],
cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
+LocationFeatureIndices+TimeFeatureIndices+RouteMatchingFeatureIndices],
cleanedResultVector)
In [117]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance
In [99]:
print(cleanedFeatureMatrix.shape)