In [1]:
from pymongo import MongoClient

In [2]:
Sections = MongoClient('localhost').Stage_database.Stage_Sections

In [139]:
Sections.find({'type': 'move'}).count()


Out[139]:
7616

In [4]:
import numpy as np
import scipy as sp

In [5]:
confirmedSections = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}]})

In [6]:
walkSections = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': 1}]})

In [7]:
modeList = []
for mode in MongoClient('localhost').Stage_database.Stage_Modes.find():
    modeList.append(mode)
    print mode


{u'mode_id': 1, u'_id': ObjectId('534e78611e860a68729de34b'), u'mode_name': u'walking'}
{u'mode_id': 2, u'_id': ObjectId('534e78611e860a68729de34c'), u'mode_name': u'running'}
{u'mode_id': 3, u'_id': ObjectId('534e78611e860a68729de34d'), u'mode_name': u'cycling'}
{u'mode_id': 4, u'_id': ObjectId('534e78611e860a68729de34e'), u'mode_name': u'transport'}
{u'mode_id': 5, u'_id': ObjectId('534e78611e860a68729de34f'), u'mode_name': u'bus'}
{u'mode_id': 6, u'_id': ObjectId('534e78611e860a68729de350'), u'mode_name': u'train'}
{u'mode_id': 7, u'_id': ObjectId('534e78611e860a68729de351'), u'mode_name': u'car'}
{u'mode_id': 8, u'_id': ObjectId('534e78611e860a68729de352'), u'mode_name': u'mixed'}

In [8]:
print Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}]}).count()
for mode in modeList:
    print "%s: %s" % (mode['mode_name'], Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': mode['mode_id']}]}).count())


5174
walking: 2400
running: 53
cycling: 727
transport: 0
bus: 260
train: 190
car: 1463
mixed: 67

In [13]:
from featurecalc import calDistance, calSpeed, calHeading, calAvgSpeed, calSpeeds, calAccels, getIthMaxSpeed, getIthMaxAccel, calHCR, \
    calSR, calVCR, mode_cluster, mode_start_end_coverage, cluster_route_match_score, transit_route_match_score

In [14]:
def getSpeedsForMode(modeId):
    modeSectionCursor = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': modeId}]})
    speedList = []
    for section in modeSectionCursor:
        speeds = calSpeeds(section)
        if speeds is not None:
            # currHistogram = sp.histogram(speeds)
            speedList.append(speeds)
    return speedList

In [15]:
def showHists(speedList):
    # print histograms
    # use enough rows of 10 plots each to cover the whole list
    nRows = max(1, (len(speedList) + 9) / 10)
    errCmpFig, axesMatrix = plt.subplots(nRows, 10, figsize=(25, nRows * 2))
    # print axesMatrix.shape, axesMatrix.flatten().shape
    axesVector = axesMatrix.flatten()
    for (i, axis) in enumerate(axesVector):
        # print("Plotting values %s for bins %s" % (histograms[i][0].shape, histograms[i][1].shape))
        if i < len(speedList):
            axis.hist(speedList[i])
        # axis.plot(histograms[i][1][:-1], histograms[i][0])

In [16]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

%matplotlib inline
%config InlineBackend.figure_format='png'

In [17]:
walkSpeeds = getSpeedsForMode(1)

In [19]:
print len(walkSpeeds)
showHists(walkSpeeds[0:100])


2218

In [20]:
bikeSpeeds = getSpeedsForMode(3)

In [21]:
print len(bikeSpeeds)
showHists(bikeSpeeds[0:100])


679

In [22]:
busSpeeds = getSpeedsForMode(5)

In [23]:
print len(busSpeeds)
showHists(busSpeeds[0:100])


260

In [24]:
trainSpeeds = getSpeedsForMode(6)

In [25]:
print len(trainSpeeds)
showHists(trainSpeeds[0:100])


190

In [216]:
carSpeeds = getSpeedsForMode(7)

In [217]:
print len(carSpeeds)
showHists(carSpeeds[0:100])


716

In [13]:
userIds = Sections.distinct("user_id")
for userId in userIds:
    print userId, Sections.find({"user_id": userId}).count()


01477d94-e0bb-3903-a428-338551801391 332
03678a4f-efb9-3100-b0e5-8f296c197eb7 618
1c97e50a-aa2a-3016-ad31-f9f6b726d4c4 767
1cc03940-57f5-3e35-a189-55d067dc6460 1339
202a0732-d5c2-307d-aa29-2b39bc5e6660 709
2491630e-bb62-39c2-8349-df4dad434dee 772
298a1c8e-74c7-3631-b59f-a5ce246c4e6d 655
3537737e-8db4-3706-8c6c-ad4cafd9dc82 944
399ed209-2e4e-37fc-a6ac-37c25f7ee1f2 403
3a307244-ecf1-3e6e-a9a7-3aaf101b40fa 426
3b7f6e86-7260-336e-96f0-9e1a0bafc242 673
527e98f8-710d-3719-93f2-ab140d1439ba 599
5322f635-a82c-3677-a1a4-5c26804a90b7 319
560a0ea4-fcdf-3c1b-adf0-22d2b79af3ea 158
5c035ff1-e0d5-397f-b989-9cb8faa0a1da 765
5dd35f51-3d1a-30db-9b22-ce0a432c2a59 638
6a488797-5be1-38aa-9700-5532be978ff9 1170
7a88d272-3b9e-3d18-9749-57ddb6214440 8
89e30222-b446-30ad-852c-b305ddafe70d 258
8b3fd099-f3e9-3a8f-a70e-6914ad6b6799 505
8d235d9c-7af6-3132-ba91-626e40181449 83
8d718272-c46c-3e24-8bae-9696917deeec 100
8e230f78-6e37-3588-bf2c-6888a5b63c82 160
951779de-a10c-3373-b186-c1c9b14b5e38 23
9aad7d1b-aa58-3464-9cca-beee37c55d93 1157
9cc9d577-3a22-3659-8cd5-9142936f44ec 218
a1a10092-0136-37f1-8109-9ebf47ef72b2 17
a47da5c4-3ced-34ac-af25-6c3aa7998827 926
acd105e3-b221-35c5-81ac-fee5ffe9de66 581
adaa57a8-7fb8-3389-acce-db50ddc9e2dd 908
b0d937d0-70ef-305e-9563-440369012b39 866
b3b9fe85-d510-3e5b-bec2-031ca9da7c83 50
bcbefbfc-8021-353f-9f5c-648d539c8cff 13
c08ea44e-6fd7-3eea-8931-868cc8083a1b 543
cb8009c5-267c-37c4-b55b-4889e02c431e 815
d8a6b737-b6a5-3fc7-a144-76ac2d434252 63
e0509a4e-9842-3c73-8050-875b6383c097 168
e1a99120-9a5a-31ae-9d7a-93b9a3e0c51c 63
e211dd91-423f-31ff-a1f8-89e5fdecc164 210
ea3f7719-f4a3-36fd-9477-924114cfe0bd 835
ed8ca6a2-9092-37af-adde-46e51a18a310 494
f4254389-3592-3cd5-b206-462fb6eaa7b5 966
f8fee20c-0f32-359d-ba75-bce97a7ac83b 818
fb457463-9a06-3c84-947d-db1fefb5a199 248
ff860db0-f3fe-387c-a005-c5837f5ba970 7

Let's pick the user id with the largest number of sections


In [79]:
userIdCounts = []
for userId in userIds:
    userIdCounts.append((userId, Sections.find({"user_id": userId}).count()))
sortedUserIds = sorted(userIdCounts, key=lambda k:k[1])
print sortedUserIds
maxUID = sortedUserIds[-1][0]
print maxUID, type(maxUID)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-79-ec9f03e643f8> in <module>()
      1 userIdCounts = []
----> 2 for userId in userIds:
      3     userIdCounts.append((userId, Sections.find({"user_id": userId}).count()))
      4 sortedUserIds = sorted(userIdCounts, key=lambda k:k[1])
      5 print sortedUserIds

NameError: name 'userIds' is not defined

In [ ]:
def getSpeedsForModeAndUser(modeId, userId):
    print "userID = %s" % userId
    modeSectionCursor = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': modeId}, {'user_id': userId}]})
    print "Number of matches = %s" % modeSectionCursor.count()
    speedList = []
    for section in modeSectionCursor:
        speeds = calSpeeds(section)
        if speeds is not None:
            # currHistogram = sp.histogram(speeds)
            speedList.append(speeds)
    return speedList

In [ ]:
print maxUID
userWalkSpeeds = getSpeedsForModeAndUser(1, maxUID)

In [222]:
print len(userWalkSpeeds)
showHists(userWalkSpeeds[0:100])


37

In [223]:
userBikeSpeeds = getSpeedsForModeAndUser(3, maxUID)


userID = 6a488797-5be1-38aa-9700-5532be978ff9
Number of matches = 17

In [224]:
print len(userBikeSpeeds)
showHists(userBikeSpeeds[0:10])


17

In [225]:
userCarSpeeds = getSpeedsForModeAndUser(7, maxUID)


userID = 6a488797-5be1-38aa-9700-5532be978ff9
Number of matches = 38

In [226]:
print len(userCarSpeeds)
showHists(userCarSpeeds[0:10])


38

Feature matrix construction


In [123]:
# Features are:
# 0. distance
# 1. duration
# 2. first filter mode
# 3. sectionId
# 4. avg speed
# 5. speed EV
# 6. speed variance
# 7. max speed
# 8. max accel
# 9. isCommute
# 10. heading change rate (currently unfilled)
# 11. stop rate (currently unfilled)
# 12. velocity change rate (currently unfilled)
# 13. start lat
# 14. start lng
# 15. stop lat
# 16. stop lng
# 17. start hour
# 18. end hour
# 19. both start and end close to bus stop
# 20. both start and end close to train station
# 21-28. routematching features
featureLabels = ["distance", "duration", "first filter mode", "sectionId", "avg speed",
                 "speed EV", "speed variance", "max speed", "max accel", "isCommute",
                 "heading change rate", "stop rate", "velocity change rate", "start lat", "start lng",
                 "stop lat", "stop lng", "start hour", "end hour", "close to bus stop", "close to train stop",
                 "walking","running","cycling","transport","bus","train","car","mixed"]
bus_cluster = mode_cluster(5, 105, 1)
train_cluster = mode_cluster(6, 600, 1)
def generateFeatureMatrixAndResultVector(sectionQuery):
    confirmedSections = Sections.find(sectionQuery)
    featureMatrix = np.zeros([confirmedSections.count(), len(featureLabels)])
    resultVector = np.zeros(confirmedSections.count())
    for (i, section) in enumerate(confirmedSections):
        featureMatrix[i, 0] = section['distance']
        featureMatrix[i, 1] = (section['section_end_datetime'] - section['section_start_datetime']).total_seconds()
        featureMatrix[i, 2] = section['mode']
        featureMatrix[i, 3] = section['section_id']
        featureMatrix[i, 4] = calAvgSpeed(section)
        speeds = calSpeeds(section)
        if speeds is not None:
            featureMatrix[i, 5] = np.mean(speeds)
            featureMatrix[i, 6] = np.std(speeds)
            featureMatrix[i, 7] = np.max(speeds)
        else:
            # They will remain zero
            pass
        accels = calAccels(section)
        if accels is not None and len(accels) > 0:
            featureMatrix[i, 8] = np.max(accels)
        else:
            # They will remain zero
            pass
        featureMatrix[i, 9] = ('commute' in section) and (section['commute'] == 'to' or section['commute'] == 'from')
        featureMatrix[i, 10] = calHCR(section)
        featureMatrix[i, 11] = calSR(section)
        featureMatrix[i, 12] = calVCR(section)
        if section['section_start_point'] is not None:
            startCoords = section['section_start_point']['coordinates']
            featureMatrix[i, 13] = startCoords[0]
            featureMatrix[i, 14] = startCoords[1]
        
        if section['section_end_point'] is not None:
            endCoords = section['section_end_point']['coordinates']
            featureMatrix[i, 15] = endCoords[0]
            featureMatrix[i, 16] = endCoords[1]
        
        featureMatrix[i, 17] = section['section_start_datetime'].time().hour
        featureMatrix[i, 18] = section['section_end_datetime'].time().hour
        
        ## try new bus matching
        featureMatrix[i, 19] = mode_start_end_coverage(section,bus_cluster,105)
        featureMatrix[i, 20] = mode_start_end_coverage(section,train_cluster,600)
#         TransitMap=transit_route_match_score(section,100000,100000,'lcs',500,0.7)
#         featureMatrix[i, 20] = max(0,TransitMap['Bart'],TransitMap['CalTrain'])
        
        ## RouteMatching Feature
        ModeCluster=cluster_route_match_score(section,step1=100000,step2=100000,method='DTW',radius1=2000,threshold=1000)
#         print(ModeCluster)
        for modeInd in range(len(modeList)):
#             print(modeInd)
#             print(modeList[modeInd]['mode_id'])
            featureMatrix[i, 21+modeInd] = ModeCluster[modeList[modeInd]['mode_id']]
        resultVector[i] = section['confirmed_mode']
    return (featureMatrix, resultVector)

In [124]:
from uuid import UUID
uuid_shankari=UUID('b0d937d0-70ef-305e-9563-440369012b39')
uuid_zeshi=UUID('9aad7d1b-aa58-3464-9cca-beee37c55d93')
uuid_shankari_hus=UUID('3a307244-ecf1-3e6e-a9a7-3aaf101b40fa')
(featureMatrix, resultVector) = generateFeatureMatrixAndResultVector({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}},
                                                                               {'user_id':uuid_shankari}]})
# (featureMatrix, resultVector) = generateFeatureMatrixAndResultVector({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}]})

In [20]:
print(np.max(featureMatrix[:,10]))
print(np.max(featureMatrix[:,20]))
print(np.mean(featureMatrix[:,20]))
print(np.max(featureMatrix[:,12]))
print featureMatrix.shape, resultVector.shape
print(np.unique(resultVector))


0.0242130750605
1.0
0.046511627907
0.0320754716981
(258, 29) (258,)
[ 1.  3.  6.  7.  8.]

In [21]:
runIndices = resultVector == 2
transportIndices = resultVector == 4
mixedIndices = resultVector == 8
strippedIndices = np.logical_not(runIndices | transportIndices | mixedIndices)
print np.nonzero(runIndices), np.nonzero(transportIndices), np.nonzero(mixedIndices), np.count_nonzero(strippedIndices)


(array([], dtype=int64),) (array([], dtype=int64),) (array([90]),) 257

Now, we filter out "mixed" and "running", since there are few instances of them and we don't intend to predict them initially. We also filter out any "transport" since it should never be in the confirmed set, and we don't want to deal with it if it is.


In [22]:
strippedFeatureMatrix = featureMatrix[strippedIndices]
strippedResultVector = resultVector[strippedIndices]

First, we visualize the distribution of some of the features. This is so that we can compare our dataset to Zheng et al 2010.


In [23]:
def plotFeatureVector(featureMatrix, resultVector, featureIndex, modeList):
    avgSpeedFig, avgSpeedAxes = plt.subplots(1,1, figsize=(12,10))
    currModeSpeedsList = []
    currModeNamesList = []
    for mode in modeList:
        currModeMask = resultVector == mode['mode_id']
        currModeSpeeds = featureMatrix[currModeMask, featureIndex]
        # print "For mode %s, shape is %s" % (mode['mode_id'], str(currModeSpeeds.shape))
        if np.count_nonzero(currModeMask) != 0:
            currModeNamesList.append(mode['mode_name'])
            currModeSpeedsList.append(currModeSpeeds)
    avgSpeedAxes.hist(currModeSpeedsList, normed=True, histtype="bar", label=currModeNamesList)
    avgSpeedAxes.set_ylabel("number of segments")
    avgSpeedAxes.set_xlabel(featureLabels[featureIndex])
    plt.legend()

In [24]:
for col in range(0, len(featureLabels)):
    plotFeatureVector(strippedFeatureMatrix, strippedResultVector, col, modeList)


/Library/Python/2.7/site-packages/matplotlib-override/matplotlib/pyplot.py:412: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_num_figures`).
  max_open_warning, RuntimeWarning)

In spite of stripping out those values, we see that there are clear outliers. These are almost certainly mis-classified trips, because the distance and speed are both really large even though the mode is walking. Let's manually filter out these outliers.


In [25]:
distanceOutliers = strippedFeatureMatrix[:,0] > 500000
speedOutliers = strippedFeatureMatrix[:,4] > 100
speedMeanOutliers = strippedFeatureMatrix[:,5] > 80
speedVarianceOutliers = strippedFeatureMatrix[:,6] > 70
maxSpeedOutliers = strippedFeatureMatrix[:,7] > 160
print np.nonzero(distanceOutliers), np.nonzero(speedOutliers), \
        np.nonzero(speedMeanOutliers), np.nonzero(speedVarianceOutliers), \
        np.nonzero(maxSpeedOutliers)
nonOutlierIndices = np.logical_not(distanceOutliers | speedOutliers | speedMeanOutliers | speedVarianceOutliers | maxSpeedOutliers)
print nonOutlierIndices.shape


(array([], dtype=int64),) (array([], dtype=int64),) (array([], dtype=int64),) (array([], dtype=int64),) (array([], dtype=int64),)
(257,)

In [26]:
cleanedFeatureMatrix = strippedFeatureMatrix[nonOutlierIndices]
cleanedResultVector = strippedResultVector[nonOutlierIndices]
print(cleanedResultVector.shape)


(257,)

In [27]:
for col in range(0, 10):
    plotFeatureVector(cleanedFeatureMatrix, cleanedResultVector, col, modeList)


Using the graphs above, we can estimate the separability of our input. Clearly, there is some separability - the car and train trips at speeds of 20-30+ are clearly separable from the walk/bike trips at lower speeds. But are they separable from each other? At least eyeballing the data, it looks like at least 75% of car trips are actually not that fast - the mean speed EV is < 10 mph. Even for max speed, at least 25% of car trips appear to have a max speed of ~10 mph. Max accel doesn't seem to have as much predictive power as one might hope - most max accel values cluster below 5. It would be nice to visualize the clusters in this data (see the sketch below), but I'm just going to start trying decision trees and SVMs on this data now.
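
A quick sketch of that visualization (assumed, not part of the original run): scatter two of the feature columns against each other, one color per confirmed mode. The column indices follow featureLabels, and the color list is an arbitrary choice.


In [ ]:
# Sketch only: 2-D scatter of avg speed (col 4) vs max speed (col 7),
# colored by confirmed mode; the color list is an arbitrary assumption.
colorList = ['blue', 'green', 'red', 'cyan', 'magenta', 'black', 'orange', 'purple']
scatterFig, scatterAxes = plt.subplots(1, 1, figsize=(8, 6))
for (i, mode) in enumerate(modeList):
    currModeMask = cleanedResultVector == mode['mode_id']
    if np.count_nonzero(currModeMask) > 0:
        scatterAxes.scatter(cleanedFeatureMatrix[currModeMask, 4],
                            cleanedFeatureMatrix[currModeMask, 7],
                            c=colorList[i % len(colorList)], label=mode['mode_name'])
scatterAxes.set_xlabel(featureLabels[4])
scatterAxes.set_ylabel(featureLabels[7])
plt.legend(loc='best')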

Feature Indices


In [28]:
genericFeatureIndices = list(xrange(0,10))
AdvancedFeatureIndices = list(xrange(10,13))
LocationFeatureIndices = list(xrange(13,17))
TimeFeatureIndices = list(xrange(17,19))
BusTrainFeatureIndices = list(xrange(19,21))
RouteMatchingFeatureIndices = list(xrange(21,29))
print genericFeatureIndices
print AdvancedFeatureIndices
print LocationFeatureIndices
print TimeFeatureIndices
print BusTrainFeatureIndices
print RouteMatchingFeatureIndices


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[10, 11, 12]
[13, 14, 15, 16]
[17, 18]
[19, 20]
[21, 22, 23, 24, 25, 26, 27, 28]

Generic model, generic features


In [61]:
genericCleanedFM = cleanedFeatureMatrix[:,genericFeatureIndices]
print genericCleanedFM.shape


(257, 10)

In [62]:
from sklearn import cross_validation
from sklearn import svm

In [63]:
svmClf = svm.LinearSVC()
svmScores = cross_validation.cross_val_score(svmClf, genericCleanedFM, cleanedResultVector, cv=5)

In [64]:
print svmScores
print svmScores.mean()


[ 0.26923077  0.44230769  0.70588235  0.07843137  0.07843137]
0.314856711916

Using svm.SVC() takes significantly longer (hours instead of seconds) but generates higher accuracy. The accuracy is still lower than the random forest, though.
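
For reference, a sketch of what that would look like (not executed here); probability=True enables Platt-scaled probability estimates, which is part of what makes it so slow. The rbfSvmClf name is just for illustration.


In [ ]:
# Sketch only: RBF-kernel SVC with probability estimates enabled,
# so that predict_proba is available for the thresholding used later.
rbfSvmClf = svm.SVC(probability=True)
rbfSvmScores = cross_validation.cross_val_score(rbfSvmClf, genericCleanedFM, cleanedResultVector, cv=5)
print rbfSvmScores.mean()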


In [65]:
from sklearn import ensemble

In [66]:
forestClf = ensemble.RandomForestClassifier()
forestScores = cross_validation.cross_val_score(forestClf, genericCleanedFM, cleanedResultVector, cv=5)

In [67]:
print forestScores
print forestScores.mean()


[ 0.86538462  0.86538462  0.88235294  0.90196078  0.82352941]
0.867722473605

The random forest results look pretty good, and pretty much parallel what the Zheng paper got, even with just the basic features: ~87% average accuracy, compared to only ~31% for the linear SVM above. But these values are for cross validation, where we have a known value that we can validate against.

But what we really want to do is to decide, while looking at a section that we have no ground truth on, whether we want the user to classify it or not. And then we want to see, for the high confidence predictions that we will not prompt the user for, how accurate our classification really is.

In order to do this, we get the probabilities for each prediction in addition to the prediction itself. We can then test the accuracy of the high confidence predictions and compare it to the accuracy of all predictions.

To recap, we now return three metrics:

  • The number of entries that would be autoclassified given a particular target confidence interval
  • The accuracy of the entries that would be autoclassified
  • The accuracy of all entries, including ones that had low confidence

In [68]:
# Generate folds of indices
def generateFoldArrays(nIndices, nFolds):
    currPermutation = np.random.permutation(nIndices)
    currPermutationParts = np.array_split(currPermutation, nFolds)
    
    foldArrays = []
    for i in range(0, nFolds):
        testIndices = currPermutationParts[i]
        trainIndicesParts = [currPermutationPart for (j, currPermutationPart) in enumerate(currPermutationParts) if j != i]
        trainIndices = np.concatenate(trainIndicesParts)
        foldArrays.append((trainIndices, testIndices))
    return foldArrays

def kFoldValidationWithProb(algo, X, y, nFolds, prob_threshold):
    foldArrays = generateFoldArrays(len(y), nFolds)
    
    scores = []
    highConfidenceScores = []
    percentAutoClassified = []
    percentAutoClassifiedByMode = []
    for (trainIndices, testIndices) in foldArrays:
        # print testIndices[0]
        model = algo.fit(X[trainIndices], y[trainIndices])
        testX = X[testIndices]
        testy = y[testIndices]
        
        predictedY = model.predict(testX)
        if hasattr(algo, "decision_function"):
            predictedYProb = algo.decision_function(testX)
        else:
            predictedYProb = algo.predict_proba(testX)
        
        # print ("predictedY.shape = %s, predictedYProb.shape = %s" %
        #        (str(predictedY.shape), str(predictedYProb.shape)))
        
        # take the max confidence across classes (axis=1) for each sample
        highConfidencePredictions = np.max(predictedYProb, 1) > prob_threshold
        print "Found %s high confidence predictions out of %s" % (np.count_nonzero(highConfidencePredictions),
                                                                  len(testIndices))
        
        cmc = lambda m:np.count_nonzero(testy[highConfidencePredictions] == m)
        
        # Let us see how many of each mode were autoclassified
        # print("Autoclassifications split by confirmed modes: walk: %s, bike: %s, bus: %s, train: %s, car: %s" %
        #       (cmc(1), cmc(3), cmc(5), cmc(6), cmc(7)))
        
        pcmc = lambda m: float(np.count_nonzero(testy[highConfidencePredictions] == m))/np.count_nonzero(testy == m) if ((np.count_nonzero(testy == m) != 0)) else 0 
        # Let us see what percentage of each mode was autoclassified
        # print("For threshold %s, autoclassifications split by confirmed mode percents: walk: %s, bike: %s, bus: %s, train: %s, car: %s" %
        #        (prob_threshold, pcmc(1), pcmc(3), pcmc(5), pcmc(6), pcmc(7)))
        
        percentAutoClassified.append(float(np.count_nonzero(highConfidencePredictions))/len(testIndices))
        percentAutoClassifiedByMode.append([pcmc(1), pcmc(3), pcmc(5), pcmc(6), pcmc(7)])
        
        # so now we are going to generate two scores.
        # the first is the score on only the high confidence predictions
        highConfidenceScore = model.score(testX[highConfidencePredictions], testy[highConfidencePredictions])
        highConfidenceScores.append(highConfidenceScore)
        
        score = model.score(X[testIndices], y[testIndices])
        scores.append(score)
    # print scores
    
    print("for prob %s, percentage auto classified %s" % (prob_threshold, np.array(percentAutoClassified).mean()))
    print("for prob %s, scoring only on high confidence predictions %s" % (prob_threshold, np.array(highConfidenceScores).mean()))
    print("for prob %s, scoring on all predictions %s" % (prob_threshold, np.array(scores).mean()))

    return (np.array(percentAutoClassified), np.array(percentAutoClassifiedByMode), np.array(highConfidenceScores), np.array(scores))
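
As an aside, generateFoldArrays above is essentially a hand-rolled shuffled k-fold split; a sketch of the equivalent using sklearn's built-in KFold (assuming the same older sklearn API used elsewhere in this notebook):


In [ ]:
# Sketch only: shuffled (trainIndices, testIndices) pairs from sklearn,
# interchangeable with the output of generateFoldArrays.
sklearnFoldArrays = list(cross_validation.KFold(len(cleanedResultVector), n_folds=5, shuffle=True))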

In [69]:
def exploreKFoldValidationSpace(algo, X, y, nFolds):
    (pac0, pacm0, hcs0, s0) = kFoldValidationWithProb(algo, X, y, nFolds, 0.90)    
    (pac5, pacm5, hcs5, s5) = kFoldValidationWithProb(algo, X, y, nFolds, 0.95)
    (pac9, pacm9, hcs9, s9) = kFoldValidationWithProb(algo, X, y, nFolds, 0.99)
    
    probs = [0.90, 0.95, 0.99]
    pacs = [pac0.mean(), pac5.mean(), pac9.mean()]
    hcs = [hcs0.mean(), hcs5.mean(), hcs9.mean()]
    ss = [s0.mean(), s5.mean(), s9.mean()]
    
    pacmWalk = [pacm0[:,0].mean(), pacm5[:,0].mean(), pacm9[:,0].mean()]
    pacmBike = [pacm0[:,1].mean(), pacm5[:,1].mean(), pacm9[:,1].mean()]
    pacmBus = [pacm0[:,2].mean(), pacm5[:,2].mean(), pacm9[:,2].mean()]
    pacmTrain = [pacm0[:,3].mean(), pacm5[:,3].mean(), pacm9[:,3].mean()]
    pacmCar = [pacm0[:,4].mean(), pacm5[:,4].mean(), pacm9[:,4].mean()]
    
    fig, axes = plt.subplots(1, 1, figsize=(15, 10))
    print pacs
    axes.set_yticks(np.arange(0,1,0.1))
    axes.plot(probs, pacs, label="percentage auto classified")
    
    print pacmWalk
    axes.plot(probs, pacmWalk, linewidth = 5, label="percent walk auto classified")
    print pacmBike
    axes.plot(probs, pacmBike, label="percent bike auto classified")
    print pacmBus
    axes.plot(probs, pacmBus, linewidth=5, label="percent bus auto classified")
    print pacmTrain
    axes.plot(probs, pacmTrain, label="percent train auto classified")
    print pacmCar
    axes.plot(probs, pacmCar, linewidth=5, label="percent car auto classified")
    
    print hcs
    axes.plot(probs, hcs, label="accuracy of high confidence samples")
    print ss
    axes.plot(probs, ss, linewidth = 5, label="accuracy of all samples")
    plt.legend(loc='best')

In [70]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, genericCleanedFM, cleanedResultVector, 5)


Found 18 high confidence predictions out of 52
Found 25 high confidence predictions out of 52
Found 19 high confidence predictions out of 51
Found 26 high confidence predictions out of 51
Found 23 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.432051282051
for prob 0.9, scoring only on high confidence predictions 0.965473684211
for prob 0.9, scoring on all predictions 0.860256410256
Found 30 high confidence predictions out of 52
Found 21 high confidence predictions out of 52
Found 21 high confidence predictions out of 51
Found 25 high confidence predictions out of 51
Found 23 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.466742081448
for prob 0.95, scoring only on high confidence predictions 0.977971014493
for prob 0.95, scoring on all predictions 0.886877828054
Found 26 high confidence predictions out of 52
Found 31 high confidence predictions out of 52
Found 21 high confidence predictions out of 51
Found 23 high confidence predictions out of 51
Found 17 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.458446455505
for prob 0.99, scoring only on high confidence predictions 0.956327543424
for prob 0.99, scoring on all predictions 0.844645550528
[0.43205128205128202, 0.46674208144796381, 0.45844645550527902]
[0.28333333333333333, 0.45999999999999996, 0.31047619047619046]
[0.44438672438672439, 0.48648841354723704, 0.4966666666666667]
[0.0, 0.0, 0.0]
[0.066666666666666666, 0.16666666666666666, 0.13333333333333333]
[0.49308201058201051, 0.50592324576097392, 0.49437149004274239]
[0.96547368421052615, 0.97797101449275359, 0.95632754342431758]
[0.8602564102564102, 0.88687782805429871, 0.84464555052790347]

The results of these three metrics for confidence thresholds of 0.90, 0.95 and 0.99 are shown above, and they are all largely similar. The accuracy of the high confidence predictions is, as expected, really high at 97-98%. However, we were only able to auto-classify ~50% of the sections. Now, let's retry using the linear SVM above.


In [71]:
svmClf = svm.LinearSVC()
exploreKFoldValidationSpace(svmClf, genericCleanedFM, cleanedResultVector, 5)


Found 10 high confidence predictions out of 52
Found 46 high confidence predictions out of 52
Found 19 high confidence predictions out of 51
Found 33 high confidence predictions out of 51
Found 44 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.59185520362
for prob 0.9, scoring only on high confidence predictions 0.556642396505
for prob 0.9, scoring on all predictions 0.532730015083
Found 5 high confidence predictions out of 52
Found 41 high confidence predictions out of 52
Found 50 high confidence predictions out of 51
Found 45 high confidence predictions out of 51
Found 6 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.573001508296
for prob 0.95, scoring only on high confidence predictions 0.371046070461
for prob 0.95, scoring on all predictions 0.37790346908
Found 25 high confidence predictions out of 52
Found 34 high confidence predictions out of 52
Found 36 high confidence predictions out of 51
Found 29 high confidence predictions out of 51
Found 38 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.630844645551
for prob 0.99, scoring only on high confidence predictions 0.576852638696
for prob 0.99, scoring on all predictions 0.558823529412
[0.59185520361990951, 0.57300150829562591, 0.63084464555052799]
[0.22857142857142856, 0.58714285714285708, 0.38428571428571429]
[0.31848484848484848, 0.47640692640692645, 0.22967032967032966]
[0.0, 0.0, 0.0]
[0.88000000000000012, 0.69999999999999996, 1.0]
[0.72147484699208841, 0.5979716024340771, 0.74915602013162996]
[0.55664239650509661, 0.37104607046070459, 0.5768526386961319]
[0.53273001508295637, 0.37790346907993966, 0.55882352941176472]

We see that the SVM is able to classify more trips than the random forest, but at the cost of unacceptably lower accuracy on the high confidence predictions.

We now look at the parameters and feature importances for the random forest so that we can better understand what it is doing.


In [72]:
forestClf.get_params()


Out[72]:
{'bootstrap': True,
 'compute_importances': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'min_density': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0}

In [73]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance


distance 0.101989807486
duration 0.0461485667333
first filter mode 0.157485362457
sectionId 0.012461548282
avg speed 0.12343212752
speed EV 0.078909478219
speed variance 0.320828572326
max speed 0.0985251507983
max accel 0.0514042999585
isCommute 0.00881508621924

So the highest importance features are:

  • speed variance
  • first filter mode (moves mode)
  • avg speed
  • distance
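
A quick way to double-check this ranking is to sort the importances directly (a sketch, not part of the original run):


In [ ]:
# Sketch only: print features in decreasing order of importance.
# zip truncates featureLabels to the 10 generic features used in this fit.
for (label, importance) in sorted(zip(featureLabels, forestClf.feature_importances_),
                                  key=lambda p: p[1], reverse=True):
    print label, importance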

Now, let's try another non-parametric method like nearest neighbor


In [74]:
from sklearn import neighbors

In [75]:
knnClf = neighbors.KNeighborsClassifier()

In [76]:
exploreKFoldValidationSpace(knnClf, cleanedFeatureMatrix, cleanedResultVector, 5)


Found 26 high confidence predictions out of 52
Found 30 high confidence predictions out of 52
Found 21 high confidence predictions out of 51
Found 35 high confidence predictions out of 51
Found 26 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.536953242836
for prob 0.9, scoring only on high confidence predictions 0.920586080586
for prob 0.9, scoring on all predictions 0.844871794872
Found 29 high confidence predictions out of 52
Found 27 high confidence predictions out of 52
Found 33 high confidence predictions out of 51
Found 26 high confidence predictions out of 51
Found 27 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.552639517345
for prob 0.95, scoring only on high confidence predictions 0.925153390671
for prob 0.95, scoring on all predictions 0.844268476621
Found 34 high confidence predictions out of 52
Found 28 high confidence predictions out of 52
Found 25 high confidence predictions out of 51
Found 26 high confidence predictions out of 51
Found 29 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.552187028658
for prob 0.99, scoring only on high confidence predictions 0.930671065243
for prob 0.99, scoring on all predictions 0.8407239819
[0.53695324283559587, 0.55263951734539973, 0.55218702865761693]
[0.15555555555555553, 0.17714285714285713, 0.21333333333333332]
[0.34904761904761905, 0.35446553446553442, 0.33286713286713288]
[0.0, 0.0, 0.0]
[0.028571428571428571, 0.18666666666666668, 0.25]
[0.72631946326917873, 0.72106012027624278, 0.72514073371283994]
[0.92058608058608049, 0.92515339067063207, 0.93067106524307341]
[0.84487179487179487, 0.8442684766214178, 0.8407239819004525]

knn does almost the same as the random forest, except that the accuracy of the high confidence predictions is a bit lower. I think that the percentages are around the same as well. Basically, we can classify walk pretty well and the others pretty poorly. So I am not sure what we are adding here over moves :)

I'm surprised at the low prediction rate for cycling. Moves seems to get that pretty accurately for me.

I'm now going to plot this data and see what it looks like.

Advanced features added


In [77]:
Advanced_indices = genericFeatureIndices + AdvancedFeatureIndices + LocationFeatureIndices + TimeFeatureIndices
print(Advanced_indices)
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,Advanced_indices], cleanedResultVector, 5)


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]
Found 24 high confidence predictions out of 52
Found 25 high confidence predictions out of 52
Found 15 high confidence predictions out of 51
Found 26 high confidence predictions out of 51
Found 20 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.427677224736
for prob 0.9, scoring only on high confidence predictions 0.983333333333
for prob 0.9, scoring on all predictions 0.899019607843
Found 19 high confidence predictions out of 52
Found 22 high confidence predictions out of 52
Found 25 high confidence predictions out of 51
Found 23 high confidence predictions out of 51
Found 24 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.440045248869
for prob 0.95, scoring only on high confidence predictions 0.973880105402
for prob 0.95, scoring on all predictions 0.918250377074
Found 27 high confidence predictions out of 52
Found 17 high confidence predictions out of 52
Found 25 high confidence predictions out of 51
Found 27 high confidence predictions out of 51
Found 19 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.44766214178
for prob 0.99, scoring only on high confidence predictions 0.984592592593
for prob 0.99, scoring on all predictions 0.933936651584
[0.42767722473604824, 0.44004524886877822, 0.44766214177978886]
[0.040000000000000001, 0.13999999999999999, 0.17777777777777776]
[0.24914529914529915, 0.29324009324009326, 0.31072261072261076]
[0.0, 0.0, 0.0]
[0.11666666666666665, 0.089999999999999997, 0.25]
[0.59042931688804556, 0.57463203463203461, 0.5597985000451795]
[0.98333333333333317, 0.97388010540184455, 0.98459259259259257]
[0.89901960784313728, 0.91825037707390644, 0.93393665158371042]

Spatial knowledge added


In [78]:
Spatial_indices = genericFeatureIndices + LocationFeatureIndices + TimeFeatureIndices + BusTrainFeatureIndices
print(Spatial_indices)
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,Spatial_indices], cleanedResultVector, 5)


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19, 20]
Found 27 high confidence predictions out of 52
Found 31 high confidence predictions out of 52
Found 27 high confidence predictions out of 51
Found 26 high confidence predictions out of 51
Found 26 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.532880844646
for prob 0.9, scoring only on high confidence predictions 0.969515669516
for prob 0.9, scoring on all predictions 0.910407239819
Found 24 high confidence predictions out of 52
Found 21 high confidence predictions out of 52
Found 26 high confidence predictions out of 51
Found 19 high confidence predictions out of 51
Found 24 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.443665158371
for prob 0.95, scoring only on high confidence predictions 0.982142857143
for prob 0.95, scoring on all predictions 0.902714932127
Found 28 high confidence predictions out of 52
Found 20 high confidence predictions out of 52
Found 23 high confidence predictions out of 51
Found 22 high confidence predictions out of 51
Found 27 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.466968325792
for prob 0.99, scoring only on high confidence predictions 0.983501683502
for prob 0.99, scoring on all predictions 0.914404223228
[0.53288084464555052, 0.44366515837104076, 0.46696832579185521]
[0.29999999999999999, 0.14523809523809522, 0.18666666666666668]
[0.40717171717171718, 0.30571428571428572, 0.23805860805860807]
[0.0, 0.0, 0.0]
[0.36666666666666664, 0.11666666666666665, 0.3833333333333333]
[0.66404040404040399, 0.58443721935101256, 0.62111018014243813]
[0.96951566951566959, 0.9821428571428571, 0.98350168350168354]
[0.91040723981900451, 0.90271493212669685, 0.91440422322775261]

In [79]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    # map back through Spatial_indices, since the matrix was sliced above
    print featureLabels[Spatial_indices[i]], importance


distance 0.0639816410955
duration 0.0284049524753
first filter mode 0.128673321612
sectionId 0.0139580138381
avg speed 0.110869607959
speed EV 0.131658189459
speed variance 0.134385814468
max speed 0.0400202286035
max accel 0.0489940811131
isCommute 0.00355619636267
start lat 0.0169936111001
start lng 0.103680251204
stop lat 0.0177435107174
stop lng 0.0704408237554
start hour 0.037107440129
end hour 0.0102258783795
close to bus stop 0.0
close to train stop 0.0393064377293

Location and time features added


In [80]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix, cleanedResultVector, 5)


Found 18 high confidence predictions out of 52
Found 28 high confidence predictions out of 52
Found 30 high confidence predictions out of 51
Found 29 high confidence predictions out of 51
Found 22 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.494570135747
for prob 0.9, scoring only on high confidence predictions 0.986436781609
for prob 0.9, scoring on all predictions 0.93371040724
Found 29 high confidence predictions out of 52
Found 27 high confidence predictions out of 52
Found 24 high confidence predictions out of 51
Found 23 high confidence predictions out of 51
Found 18 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.470286576169
for prob 0.95, scoring only on high confidence predictions 0.963058748404
for prob 0.95, scoring on all predictions 0.926093514329
Found 32 high confidence predictions out of 52
Found 28 high confidence predictions out of 52
Found 28 high confidence predictions out of 51
Found 21 high confidence predictions out of 51
Found 27 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.528808446456
for prob 0.99, scoring only on high confidence predictions 0.986607142857
for prob 0.99, scoring on all predictions 0.926093514329
[0.49457013574660635, 0.4702865761689291, 0.52880844645550529]
[0.16190476190476191, 0.12222222222222223, 0.24666666666666665]
[0.22363858363858363, 0.25999999999999995, 0.34659340659340659]
[0.0, 0.0, 0.0]
[0.28333333333333333, 0.52000000000000002, 0.37333333333333335]
[0.68666301769750049, 0.61193351566576726, 0.67681301210712974]
[0.98643678160919546, 0.96305874840357608, 0.9866071428571429]
[0.93371040723981902, 0.92609351432880849, 0.92609351432880849]

In [81]:
knnClf = neighbors.KNeighborsClassifier()
exploreKFoldValidationSpace(knnClf, cleanedFeatureMatrix, cleanedResultVector, 5)


Found 30 high confidence predictions out of 52
Found 24 high confidence predictions out of 52
Found 30 high confidence predictions out of 51
Found 32 high confidence predictions out of 51
Found 25 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.548868778281
for prob 0.9, scoring only on high confidence predictions 0.91825
for prob 0.9, scoring on all predictions 0.83665158371
Found 34 high confidence predictions out of 52
Found 33 high confidence predictions out of 52
Found 25 high confidence predictions out of 51
Found 23 high confidence predictions out of 51
Found 33 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.575339366516
for prob 0.95, scoring only on high confidence predictions 0.915054793459
for prob 0.95, scoring on all predictions 0.840271493213
Found 32 high confidence predictions out of 52
Found 33 high confidence predictions out of 52
Found 21 high confidence predictions out of 51
Found 31 high confidence predictions out of 51
Found 32 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.579411764706
for prob 0.99, scoring only on high confidence predictions 0.899737292278
for prob 0.99, scoring on all predictions 0.844570135747
[0.54886877828054303, 0.57533936651583706, 0.57941176470588229]
[0.25666666666666665, 0.22999999999999998, 0.26333333333333331]
[0.2614094239094239, 0.38769230769230767, 0.35714285714285715]
[0.0, 0.0, 0.0]
[0.14999999999999999, 0.12, 0.29999999999999999]
[0.73583062186346881, 0.73830751736431244, 0.75155165792679701]
[0.91825000000000012, 0.91505479345888552, 0.8997372922776149]
[0.83665158371040715, 0.84027149321266958, 0.84457013574660633]

In [82]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance


distance 0.0231248211626
duration 0.0153017313871
first filter mode 0.0738837516593
sectionId 0.00869007619383
avg speed 0.156694951419
speed EV 0.101225093569
speed variance 0.0538383131174
max speed 0.0920023484993
max accel 0.0207811246462
isCommute 0.00315433082022
heading change rate 0.0413629626794
stop rate 0.0869018391535
velocity change rate 0.0974757843874
start lat 0.00962633831856
start lng 0.0443292777696
stop lat 0.0201653257629
stop lng 0.0351866777418
start hour 0.00969283221957
end hour 0.00664907204558
close to bus stop 0.0
close to train stop 0.0
walking 0.00153153218181
running 0.0
cycling 0.021347462992
transport 0.0
bus 0.0
train 0.0301109875681
car 0.0399735063751
mixed 0.00694985833076

Some more contour plots to help us visualize the data


In [83]:
from matplotlib import colors
import itertools

In [84]:
def printColorMap(algo, Xall, y):
    # we want to split each axis into roughly 10-20 sections
    nSplits = 20
    
    # setup parameters
    cmap_light = colors.ListedColormap(['#FAAAAA', '#AFAAAA', '#AAFAAA', '#AAAFAA', '#AAAAFA', '#AAAAAF'])
    cmap_bold = colors.ListedColormap(['#F00000', '#0F0000', '#00F000', '#000F00', '#0000F0', '#00000F'])
   
    # nFeatures = Xall.shape[1]
    nFeatures = 10
    fig, axes = plt.subplots(20, 5, figsize=(15,50))
    plt.tight_layout()
    axesArr = axes.flatten()
        
    i = 0
    for selCombo in itertools.product(np.arange(nFeatures), np.arange(nFeatures)):
        if selCombo[0] == selCombo[1]:
            continue
        # print("Generating grid for combo %s,%s in slot %s" % (featureLabels[selCombo[0]], featureLabels[selCombo[1]], i))
        
        selMask = np.zeros(Xall.shape[1])
        # Otherwise, we won't be able to plot it properly below
        assert(len(selCombo) == 2)
        selMask[selCombo[0]] = 1
        selMask[selCombo[1]] = 1
    
        X = Xall[:,selMask == 1]
    
        algo.fit(X, y)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, m_max]x[y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        # grid step sizes so that each axis is split into roughly nSplits sections
        h_x = float(x_max - x_min) / nSplits
        h_y = float(y_max - y_min) / nSplits
        
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x),
                             np.arange(y_min, y_max, h_y))
        Z = algo.predict(np.c_[xx.ravel(), yy.ravel()])
    
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)

        axesArr[i].pcolormesh(xx, yy, Z, cmap=cmap_light)

        # Plot also the training points
        axesArr[i].scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        # plt.scatter(X[:, 0], X[:, 1], c=y)
        axesArr[i].set_xlim(xx.min(), xx.max())
        axesArr[i].set_ylim(yy.min(), yy.max())
        axesArr[i].set_title("%s v/s %s" % (featureLabels[selCombo[0]], featureLabels[selCombo[1]]))
        # axesArr[i].legend(loc='best')
        i = i+1

In [85]:
printColorMap(forestClf, cleanedFeatureMatrix, cleanedResultVector)


Let us also quickly take a look at the confusion matrix for the overall model, because maybe we should not care about the confidence of the predictions, and should instead just weight the low-confidence ones lower.


In [86]:
from sklearn import metrics

In [87]:
def printConfusionMatrix(algo, X, y):
    skf = cross_validation.StratifiedKFold(y, 5)
    confusize = len(np.unique(y))
#     print(confusize)
    sumPCM = np.zeros([confusize, confusize])
    for train, test in skf:
        X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
        y_pred = algo.fit(X_train, y_train).predict(X_test)
        cm = metrics.confusion_matrix(y_test, y_pred)
#         print(cm.shape)
        # normalize each row by the number of true instances of that label
        sumArr = np.sum(cm, axis=1)
        repeatedSumArr = np.repeat(sumArr, cm.shape[1]).reshape(cm.shape)
        sumPCM = np.add(sumPCM, np.divide(cm.astype(float), repeatedSumArr))
    
    finalPCM = sumPCM / 5
    print(finalPCM)

    # Show confusion matrix in a separate window
    plt.matshow(finalPCM)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [88]:
forestClf = ensemble.RandomForestClassifier()
# print(np.unique(cleanedResultVector))
printConfusionMatrix(forestClf, genericCleanedFM, cleanedResultVector)


[[ 0.85        0.15        0.          0.        ]
 [ 0.05        0.79545455  0.          0.15454545]
 [ 0.          0.          0.41666667  0.58333333]
 [ 0.00625     0.03830645  0.03125     0.92419355]]

In [89]:
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,Spatial_indices], cleanedResultVector)


[[ 0.96        0.04        0.          0.        ]
 [ 0.05151515  0.84545455  0.03333333  0.06969697]
 [ 0.          0.          0.9         0.1       ]
 [ 0.00625     0.03830645  0.00625     0.94919355]]

In [90]:
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, cleanedFeatureMatrix, cleanedResultVector)


[[ 0.86        0.14        0.          0.        ]
 [ 0.03333333  0.91666667  0.01666667  0.03333333]
 [ 0.          0.05        0.9         0.05      ]
 [ 0.01270161  0.03165323  0.          0.95564516]]

Adding start and end points does improve the accuracy of the transit modes; train trips, in particular, are significantly improved. (The rows/columns here are the confirmed modes present for this user, in sorted order: walking, cycling, train, car.)


In [91]:
knnClf = neighbors.KNeighborsClassifier()
printConfusionMatrix(knnClf, genericCleanedFM, cleanedResultVector)


[[ 0.95        0.05        0.          0.        ]
 [ 0.08484848  0.7469697   0.01818182  0.15      ]
 [ 0.          0.          0.83333333  0.16666667]
 [ 0.01915323  0.08306452  0.03205645  0.86572581]]

In [92]:
knnClf = neighbors.KNeighborsClassifier()
printConfusionMatrix(knnClf, cleanedFeatureMatrix, cleanedResultVector)


[[ 0.95        0.05        0.          0.        ]
 [ 0.08484848  0.7469697   0.01818182  0.15      ]
 [ 0.          0.          0.83333333  0.16666667]
 [ 0.01915323  0.08306452  0.03205645  0.86572581]]

knn does significantly worse, primarily because of bus trips. I suspect this is because different people make the same trip using different modes. Time for per-user models?

Only for transport trips

As we can see, the prediction rate is best for walk and bike, which are the ones for which we get the most data from moves. It may be a mistake to use the same model for both types of trips, because moves will do a good job for walk/bike and a horrible job for transport, since we don't allow users to specify 'transport' in the output.

These also have zero carbon footprint. Let us see how well we do on the motorized trips alone.


In [93]:
transportTrips = cleanedFeatureMatrix[:,2] == 4
print np.count_nonzero(transportTrips)


187

In [94]:
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, genericCleanedFM[transportTrips], cleanedResultVector[transportTrips])


[[ 0.33333333  0.          0.66666667]
 [ 0.          0.4         0.6       ]
 [ 0.03870968  0.03225806  0.92903226]]

In [95]:
forestClf = ensemble.RandomForestClassifier()
printConfusionMatrix(forestClf, cleanedFeatureMatrix[transportTrips], cleanedResultVector[transportTrips])


[[ 0.43333333  0.06666667  0.5       ]
 [ 0.05        0.9         0.05      ]
 [ 0.03850806  0.          0.96149194]]

In [96]:
knnClf = neighbors.KNeighborsClassifier()
printConfusionMatrix(knnClf, genericCleanedFM[transportTrips], cleanedResultVector[transportTrips])


[[ 0.1         0.          0.9       ]
 [ 0.          0.8         0.2       ]
 [ 0.02580645  0.03870968  0.93548387]]

In [97]:
knnClf = neighbors.KNeighborsClassifier()
printConfusionMatrix(knnClf, cleanedFeatureMatrix[transportTrips], cleanedResultVector[transportTrips])


[[ 0.1         0.          0.9       ]
 [ 0.          0.8         0.2       ]
 [ 0.02580645  0.03870968  0.93548387]]

As we can see, we are actually able to predict car trips with a fair degree of accuracy. But bus and train trips are pretty much a tossup. Ignore the entries for 0 and 1 above, since we stripped out all walk and bike trips, and so these are only trips which moves misclassified, and not the entire dataset. Now we know why the Zheng paper only attempted to distinguish between bus and car trips, and not bus, train and car. The new features helped in the random forest case, but not by that much, and did not help us at all in the knn case.

User-specific models


In [98]:
def getUserModelComparison(isTransportOnly):
    userIds = Sections.distinct("user_id")

    # I'm not going to bother with testing against only the generic features
    # because the main issue here is personalization

    numberOfSections = []
    percentAutoClassified = []
    percentAutoClassifiedWalk = []
    percentAutoClassifiedBike = []
    percentAutoClassifiedBus = []
    percentAutoClassifiedTrain = []
    percentAutoClassifiedCar = []
    autoClassifiedAccuracy = []
    overallAccuracy = []

    labels = ["Number of sections", "% autoclassified", "% auto classified walk",
              "% auto classified bike", "% auto classified bus",
              "% auto classified train", "% auto classified car",
              "auto classified accuracy", "overall accuracy"]
    
    for userId in userIds:
        # decision tree with all features
        if not isTransportOnly:
            query = {"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'user_id': userId}]}
        else:
            query = {"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'mode': 4}, {'user_id': userId}]}
        (userFeatureMatrix, userResultVector) = generateFeatureMatrixAndResultVector(query)
    
        # we only focus on users who have enough history with us
        if len(userResultVector) < 50:
            print("Skipping user with userId %s who has %s unconfirmed sections" % (userId, len(userResultVector)))
            continue
        
        forestClf = ensemble.RandomForestClassifier()
        # printConfusionMatrix(forestClf, userFeatureMatrix, userResultVector)
        (pac5, pacm5, hcs5, s5) = kFoldValidationWithProb(forestClf, userFeatureMatrix, userResultVector, 5, 0.95)
        numberOfSections.append(len(userResultVector))
        percentAutoClassified.append(pac5.mean())
        percentAutoClassifiedWalk.append(pacm5[:, 0].mean())
        percentAutoClassifiedBike.append(pacm5[:, 1].mean())
        percentAutoClassifiedBus.append(pacm5[:, 2].mean())
        percentAutoClassifiedTrain.append(pacm5[:, 3].mean())
        percentAutoClassifiedCar.append(pacm5[:, 4].mean())
        autoClassifiedAccuracy.append(hcs5.mean())
        overallAccuracy.append(s5.mean())
    resultArray = np.array([numberOfSections, percentAutoClassified, percentAutoClassifiedWalk,
                            percentAutoClassifiedBike, percentAutoClassifiedBus, percentAutoClassifiedTrain,
                            percentAutoClassifiedCar, autoClassifiedAccuracy, overallAccuracy])
    print resultArray.shape
    return (resultArray, labels)

In [99]:
def displayUserVariation(ra, labels):
    ''' ra has rows = plots and cols = users
    '''
    fig, (axes, axesNum) = plt.subplots(2, 1, figsize=(25, 25))
    nUsers = ra.shape[1]
    for i in [1,-2,-1]:
        # each row is one plot
        print ra[i]
        axes.plot(np.arange(nUsers), ra[i], label=labels[i])
        axes.legend(loc='best')
    for i in [0]:
        # each row is one plot
        print ra[i]
        axesNum.plot(np.arange(nUsers), ra[i], label=labels[i])    
        axesNum.legend(loc='best')

In [100]:
(userResultArray, labels) = getUserModelComparison(isTransportOnly=False)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-100-13de48a5696d> in <module>()
----> 1 (userResultArray, labels) = getUserModelComparison(isTransportOnly=False)

<ipython-input-98-bbe559a0ef13> in getUserModelComparison(isTransportOnly)
     26         else:
     27             query = {"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'mode': 4}, {'user_id': userId}]}
---> 28         (userFeatureMatrix, userResultVector) = generateFeatureMatrixAndResultVector(query)
     29 
     30         # we only focus on users who have enough history with us

<ipython-input-18-6b5090dc8ae3> in generateFeatureMatrixAndResultVector(sectionQuery)
     77 
     78         ## RouteMatching Feature
---> 79         ModeCluster=cluster_route_match_score(section,step1=100000,step2=100000,method='DTW',radius1=2000,threshold=1000)
     80 #         print(ModeCluster)
     81         for modeInd in range(len(modeList)):

/Users/Mogeng/e-mission-server/CFC_WebApp/main/featurecalc.pyc in cluster_route_match_score(segment, step1, step2, method, radius1, threshold)
    335 
    336 def cluster_route_match_score(segment,step1=100000,step2=100000,method='lcs',radius1=2000,threshold=0.5):
--> 337     userRouteClusters=get_routeCluster_db().find_one({'$and':[{'user':segment['user_id']},{'method':method}]})['clusters']
    338     route_seg = getRoute(segment['_id'])
    339 

TypeError: 'NoneType' object has no attribute '__getitem__'

In [101]:
displayUserVariation(userResultArray, labels)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-101-ebf8f818bf51> in <module>()
----> 1 displayUserVariation(userResultArray, labels)

NameError: name 'userResultArray' is not defined

So there's quite a bit of variability in both the overall accuracy, and in the number of trips for a user. The two don't seem to be correlated though. We get some fairly uneven improvement - for some users, the general classification is over 90%. We are also able to classify over 80% of the trips for some users.

But that might just be due to a higher ratio of walk trips, which are classified more accurately. I can explore this only for transport, but first, I'm going to try to build a route library and the associated features. Then maybe Mogeng can continue some of the exploration.


In [102]:
(userResultArrayTransOnly, labelsTransOnly) = getUserModelComparison(isTransportOnly=True)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-102-a4e3b4479b64> in <module>()
----> 1 (userResultArrayTransOnly, labelsTransOnly) = getUserModelComparison(isTransportOnly=True)

<ipython-input-98-bbe559a0ef13> in getUserModelComparison(isTransportOnly)
     26         else:
     27             query = {"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'mode': 4}, {'user_id': userId}]}
---> 28         (userFeatureMatrix, userResultVector) = generateFeatureMatrixAndResultVector(query)
     29 
     30         # we only focus on users who have enough history with us

<ipython-input-18-6b5090dc8ae3> in generateFeatureMatrixAndResultVector(sectionQuery)
     77 
     78         ## RouteMatching Feature
---> 79         ModeCluster=cluster_route_match_score(section,step1=100000,step2=100000,method='DTW',radius1=2000,threshold=1000)
     80 #         print(ModeCluster)
     81         for modeInd in range(len(modeList)):

/Users/Mogeng/e-mission-server/CFC_WebApp/main/featurecalc.pyc in cluster_route_match_score(segment, step1, step2, method, radius1, threshold)
    335 
    336 def cluster_route_match_score(segment,step1=100000,step2=100000,method='lcs',radius1=2000,threshold=0.5):
--> 337     userRouteClusters=get_routeCluster_db().find_one({'$and':[{'user':segment['user_id']},{'method':method}]})['clusters']
    338     route_seg = getRoute(segment['_id'])
    339 

TypeError: 'NoneType' object has no attribute '__getitem__'

In [103]:
displayUserVariation(userResultArrayTransOnly, labelsTransOnly)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-103-4798d3743565> in <module>()
----> 1 displayUserVariation(userResultArrayTransOnly, labelsTransOnly)

NameError: name 'userResultArrayTransOnly' is not defined

So looking at transport-only trips, and focusing on users with enough transport history (50+ motorized transport trips), we are able to get an overall accuracy of around 70 - 80% even for the motorized trips. However, there are some clear outliers, like the user with only 60% accuracy. Also, because our current threshold for high confidence is set so high, the high confidence predictions are > 95% correct, as before. We still have to decide which threshold to use.

We can autoclassify 20 - 50% of the motorized transport trips. In general, this is related to the number of trips - there is a very clear spike in the data for user 4. But the correlation is not exact. In particular, user 5 has > 50 trips, but only ~ 10% autoclassified trips.

It might be worthwhile to take a closer look at these 6 users, see what their transport trips look like, and get a sense of the difference between, say, user 4 and user 5. That might help us figure out how to build better user models.
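
As a starting point, a quick per-user breakdown of the motorized trips (bus=5, train=6, car=7 in the modes collection) can reuse the same distinct/find pattern used earlier; a sketch:

userIds = Sections.distinct("user_id")
for userId in userIds:
    # Count confirmed motorized trips (bus, train, car) for each user
    motorizedCount = Sections.find({"$and": [{'type': 'move'},
                                             {'confirmed_mode': {'$in': [5, 6, 7]}},
                                             {'user_id': userId}]}).count()
    print "user %s: %s motorized trips" % (userId, motorizedCount)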


In [104]:
def buildRouteLibrary(userId, threshold):
    '''
    Here we attempt to build a route library for each user.
    Then, the probability of the top match can be a factor in our machine learning.
    Let us just start with the start and end points instead of a full dynamic time warp.
    '''
    # The implementation is still a sketch - RouteLibrary doesn't exist yet -
    # so it is left commented out and the function currently returns None.
    # userSections = Sections.find({"$and": [{'type': 'move'}, {'confirmed_mode': {'$ne': ''}}, {'user_id': userId}]})
    # existingRoutes = RouteLibrary()
    # for section in userSections:
    #     existingRoutes.update(section)
    # return existingRoutes
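
For reference, here is what a minimal start/end-point RouteLibrary might look like. Everything in it is a sketch: the class itself is hypothetical, and the section_start_point/section_end_point fields holding GeoJSON-style coordinates are assumptions about the section schema, not confirmed anywhere in this notebook.

from collections import defaultdict

class RouteLibrary(object):
    # Strawman route library: bucket sections by rounded start/end coordinates.
    # Rounding to 3 decimal places (roughly 100m) stands in for a real
    # distance threshold like the `threshold` parameter above.
    def __init__(self, precision=3):
        self.precision = precision
        self.routes = defaultdict(list)

    def _key(self, section):
        # Assumed schema: GeoJSON-style [lng, lat] pairs in these two fields
        start = section['section_start_point']['coordinates']
        end = section['section_end_point']['coordinates']
        return (round(start[0], self.precision), round(start[1], self.precision),
                round(end[0], self.precision), round(end[1], self.precision))

    def update(self, section):
        self.routes[self._key(section)].append(section['_id'])

    def topMatch(self, section):
        # Fraction of all stored sections that share this section's bucket
        matches = self.routes.get(self._key(section), [])
        total = sum(len(v) for v in self.routes.values())
        return float(len(matches)) / total if total > 0 else 0.0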

A summary comparison of models
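
For reference, the feature index groups combined below appear to break down as follows. This is a reconstruction from the featureLabels printouts, the index list printed for the Generic + BusTrain model, and the (257, 29) matrix shape; the authoritative definitions live in an earlier cell.

# Reconstructed for readability; not the original definitions.
genericFeatureIndices = range(0, 10)         # distance .. isCommute
AdvancedFeatureIndices = range(10, 13)       # heading change rate, stop rate, velocity change rate
LocationFeatureIndices = range(13, 17)       # start/stop lat/lng
TimeFeatureIndices = range(17, 19)           # start hour, end hour
BusTrainFeatureIndices = range(19, 21)       # close to bus stop, close to train stop
RouteMatchingFeatureIndices = range(21, 29)  # one cluster match score per mode (walking .. mixed)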

Generic model


In [126]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices], cleanedResultVector)


Found 27 high confidence predictions out of 52
Found 20 high confidence predictions out of 52
Found 23 high confidence predictions out of 51
Found 22 high confidence predictions out of 51
Found 15 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.416063348416
for prob 0.9, scoring only on high confidence predictions 0.958880105402
for prob 0.9, scoring on all predictions 0.887254901961
Found 22 high confidence predictions out of 52
Found 27 high confidence predictions out of 52
Found 24 high confidence predictions out of 51
Found 17 high confidence predictions out of 51
Found 21 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.431598793363
for prob 0.95, scoring only on high confidence predictions 0.965644540645
for prob 0.95, scoring on all predictions 0.863800904977
Found 20 high confidence predictions out of 52
Found 30 high confidence predictions out of 52
Found 16 high confidence predictions out of 51
Found 26 high confidence predictions out of 51
Found 12 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.40407239819
for prob 0.99, scoring only on high confidence predictions 0.943974358974
for prob 0.99, scoring on all predictions 0.85987933635
[0.41606334841628961, 0.43159879336349921, 0.40407239819004526]
[0.26190476190476186, 0.22857142857142856, 0.26999999999999996]
[0.36923076923076925, 0.48888888888888893, 0.41075757575757577]
[0.0, 0.0, 0.0]
[0.15666666666666668, 0.089999999999999997, 0.073333333333333334]
[0.4923312923312923, 0.46912102149171114, 0.46136710239651418]
[0.95888010540184454, 0.96564454064454064, 0.94397435897435888]
[0.88725490196078416, 0.86380090497737549, 0.85987933634992453]
[[ 0.91        0.04        0.          0.05      ]
 [ 0.05        0.83030303  0.          0.11969697]
 [ 0.          0.          0.48333333  0.51666667]
 [ 0.00625     0.04475806  0.01270161  0.93629032]]
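
To make the numbers above easier to read: "Found k high confidence predictions out of n" counts test trips whose top predicted class probability clears the threshold; "scoring only on high confidence predictions" is accuracy on just that subset, while "scoring on all predictions" is accuracy on the whole fold. A per-fold sketch of that logic (the real exploreKFoldValidationSpace is defined in an earlier cell; this is an assumed reconstruction, expecting numpy arrays):

import numpy as np

def highConfidenceStats(clf, XTrain, yTrain, XTest, yTest, prob=0.9):
    # Fit on the training fold, then split the test fold into high confidence
    # predictions (top class probability > prob) and everything else.
    clf.fit(XTrain, yTrain)
    probs = clf.predict_proba(XTest)
    preds = clf.classes_[np.argmax(probs, axis=1)]
    highConf = np.max(probs, axis=1) > prob
    print "Found %d high confidence predictions out of %d" % (highConf.sum(), len(yTest))
    hcScore = np.mean(preds[highConf] == yTest[highConf]) if highConf.any() else 0.0
    return highConf.mean(), hcScore, np.mean(preds == yTest)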

In [106]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance


distance 0.0296112725131
duration 0.105345796636
first filter mode 0.156262575133
sectionId 0.0196439806407
avg speed 0.0957797497725
speed EV 0.18871602775
speed variance 0.223488011349
max speed 0.0882178378311
max accel 0.0878481391335
isCommute 0.00508660924174

Generic + Advanced model


In [127]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices], cleanedResultVector)


Found 27 high confidence predictions out of 52
Found 26 high confidence predictions out of 52
Found 22 high confidence predictions out of 51
Found 20 high confidence predictions out of 51
Found 13 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.419532428356
for prob 0.9, scoring only on high confidence predictions 0.975524475524
for prob 0.9, scoring on all predictions 0.859803921569
Found 17 high confidence predictions out of 52
Found 19 high confidence predictions out of 52
Found 21 high confidence predictions out of 51
Found 19 high confidence predictions out of 51
Found 22 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.381598793363
for prob 0.95, scoring only on high confidence predictions 0.925471432592
for prob 0.95, scoring on all predictions 0.867571644042
Found 16 high confidence predictions out of 52
Found 21 high confidence predictions out of 52
Found 19 high confidence predictions out of 51
Found 18 high confidence predictions out of 51
Found 19 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.361915535445
for prob 0.99, scoring only on high confidence predictions 0.956338763576
for prob 0.99, scoring on all predictions 0.883182503771
[0.41953242835595772, 0.38159879336349928, 0.36191553544494726]
[0.34714285714285714, 0.25, 0.45999999999999996]
[0.41878787878787876, 0.42389277389277391, 0.29328171828171834]
[0.0, 0.0, 0.0]
[0.10666666666666666, 0.16, 0.11857142857142858]
[0.48022528415080601, 0.40154887316439358, 0.41395502645502641]
[0.97552447552447552, 0.92547143259217557, 0.95633876357560577]
[0.85980392156862739, 0.86757164404223241, 0.88318250377073892]
[[ 0.86        0.14        0.          0.        ]
 [ 0.05        0.88333333  0.          0.06666667]
 [ 0.          0.          0.36666667  0.63333333]
 [ 0.01270161  0.03165323  0.03145161  0.92419355]]

In [108]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance


distance 0.024486884262
duration 0.0389122260387
first filter mode 0.120573608401
sectionId 0.00604740743722
avg speed 0.117449080123
speed EV 0.169444454398
speed variance 0.154185256254
max speed 0.112006879998
max accel 0.0252823191124
isCommute 0.00249689269519
heading change rate 0.12482988159
stop rate 0.0311160891948
velocity change rate 0.0731690204964

Generic + BusTrain model


In [131]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+BusTrainFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+BusTrainFeatureIndices], cleanedResultVector)


Found 23 high confidence predictions out of 52
Found 25 high confidence predictions out of 52
Found 13 high confidence predictions out of 51
Found 27 high confidence predictions out of 51
Found 24 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.435595776772
for prob 0.9, scoring only on high confidence predictions 0.975201288245
for prob 0.9, scoring on all predictions 0.89125188537
Found 20 high confidence predictions out of 52
Found 21 high confidence predictions out of 52
Found 26 high confidence predictions out of 51
Found 26 high confidence predictions out of 51
Found 25 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.459653092006
for prob 0.95, scoring only on high confidence predictions 0.92863003663
for prob 0.95, scoring on all predictions 0.848114630468
Found 24 high confidence predictions out of 52
Found 23 high confidence predictions out of 52
Found 22 high confidence predictions out of 51
Found 27 high confidence predictions out of 51
Found 18 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.443514328808
for prob 0.99, scoring only on high confidence predictions 0.952188552189
for prob 0.99, scoring on all predictions 0.856033182504
[0.43559577677224742, 0.45965309200603316, 0.44351432880844649]
[0.37857142857142856, 0.42666666666666664, 0.20666666666666669]
[0.48730158730158735, 0.41153846153846152, 0.48321678321678319]
[0.0, 0.0, 0.0]
[0.083333333333333329, 0.48333333333333328, 0.33333333333333331]
[0.47365501039125329, 0.46049865694078218, 0.49092741935483869]
[0.9752012882447666, 0.92863003663003663, 0.95218855218855225]
[0.89125188536953248, 0.84811463046757163, 0.8560331825037707]
[[ 0.9         0.05        0.          0.05      ]
 [ 0.05        0.86515152  0.          0.08484848]
 [ 0.          0.          0.61666667  0.38333333]
 [ 0.00625     0.05766129  0.01875     0.91733871]]

In [110]:
print(genericFeatureIndices+BusTrainFeatureIndices)
forestClf.feature_importances_


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 19, 20]
Out[110]:
array([ 0.03444598,  0.06069715,  0.21183028,  0.01391201,  0.18800568,
        0.1093163 ,  0.18382435,  0.12828708,  0.02470621,  0.00646429,
        0.        ,  0.03851068])

In [111]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance


distance 0.0344459818984
duration 0.0606971482021
first filter mode 0.21183027511
sectionId 0.0139120099857
avg speed 0.188005676094
speed EV 0.109316298761
speed variance 0.183824354331
max speed 0.128287075003
max accel 0.0247062121768
isCommute 0.00646429320599
heading change rate 0.0
stop rate 0.038510675232
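
Note that the labels in this printout (and in the similar printouts for the combined models below) are off: featureLabels is indexed by position in the reduced matrix, but the columns here are [0-9, 19, 20], so the last two importances actually belong to "close to bus stop" and "close to train stop", not "heading change rate" and "stop rate". Indexing through the selected column list fixes it:

selectedIndices = genericFeatureIndices + BusTrainFeatureIndices
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[selectedIndices[i]], importance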

Generic + Advanced + BusTrain model


In [132]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices], cleanedResultVector)


Found 22 high confidence predictions out of 52
Found 26 high confidence predictions out of 52
Found 21 high confidence predictions out of 51
Found 14 high confidence predictions out of 51
Found 17 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.388536953243
for prob 0.9, scoring only on high confidence predictions 0.966000666001
for prob 0.9, scoring on all predictions 0.883031674208
Found 20 high confidence predictions out of 52
Found 20 high confidence predictions out of 52
Found 19 high confidence predictions out of 51
Found 18 high confidence predictions out of 51
Found 20 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.377375565611
for prob 0.95, scoring only on high confidence predictions 0.957251461988
for prob 0.95, scoring on all predictions 0.879336349925
Found 17 high confidence predictions out of 52
Found 24 high confidence predictions out of 52
Found 24 high confidence predictions out of 51
Found 18 high confidence predictions out of 51
Found 17 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.389064856712
for prob 0.99, scoring only on high confidence predictions 0.95637254902
for prob 0.99, scoring on all predictions 0.863574660633
[0.3885369532428356, 0.37737556561085972, 0.38906485671191554]
[0.21666666666666665, 0.39523809523809522, 0.28714285714285714]
[0.33888888888888885, 0.32754578754578756, 0.29390637140637138]
[0.0, 0.0, 0.0]
[0.20000000000000001, 0.10666666666666666, 0.18333333333333332]
[0.4578411910669975, 0.43100179763138602, 0.45999427043544694]
[0.96600066600066603, 0.95725146198830413, 0.95637254901960778]
[0.88303167420814466, 0.87933634992458531, 0.86357466063348409]
[[ 0.91        0.09        0.          0.        ]
 [ 0.05        0.86515152  0.          0.08484848]
 [ 0.          0.          0.53333333  0.46666667]
 [ 0.00625     0.05766129  0.00625     0.92983871]]

In [113]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance


distance 0.0632373845336
duration 0.0359033077901
first filter mode 0.119684338591
sectionId 0.0
avg speed 0.206093155033
speed EV 0.0825402105556
speed variance 0.128700261096
max speed 0.158400619851
max accel 0.0135804720675
isCommute 0.00170382401561
heading change rate 0.0476230363208
stop rate 0.0287572531945
velocity change rate 0.0666113509365
start lat 0.0
start lng 0.047164786014

Generic + Advanced + BusTrain + Location model


In [134]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
                                                            +LocationFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
                                                            +LocationFeatureIndices], cleanedResultVector)


Found 28 high confidence predictions out of 52
Found 31 high confidence predictions out of 52
Found 26 high confidence predictions out of 51
Found 23 high confidence predictions out of 51
Found 21 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.501432880845
for prob 0.9, scoring only on high confidence predictions 0.954870305011
for prob 0.9, scoring on all predictions 0.926093514329
Found 22 high confidence predictions out of 52
Found 26 high confidence predictions out of 52
Found 30 high confidence predictions out of 51
Found 20 high confidence predictions out of 51
Found 18 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.451282051282
for prob 0.95, scoring only on high confidence predictions 0.981196581197
for prob 0.95, scoring on all predictions 0.906485671192
Found 25 high confidence predictions out of 52
Found 23 high confidence predictions out of 52
Found 20 high confidence predictions out of 51
Found 27 high confidence predictions out of 51
Found 25 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.466968325792
for prob 0.99, scoring only on high confidence predictions 0.985185185185
for prob 0.99, scoring on all predictions 0.922322775264
[0.50143288084464555, 0.45128205128205129, 0.46696832579185521]
[0.17999999999999999, 0.40714285714285714, 0.25714285714285712]
[0.31641025641025644, 0.21877289377289375, 0.34649517149517151]
[0.0, 0.0, 0.0]
[0.14999999999999999, 0.040000000000000001, 0.29523809523809519]
[0.63937119675456389, 0.62295369190530481, 0.57077044577044567]
[0.95487030501055747, 0.98119658119658126, 0.98518518518518516]
[0.92609351432880838, 0.90648567119155354, 0.92232277526395179]
[[ 0.87        0.13        0.          0.        ]
 [ 0.03333333  0.9         0.          0.06666667]
 [ 0.          0.          0.95        0.05      ]
 [ 0.00625     0.04455645  0.          0.94919355]]

Generic + Advanced + BusTrain + Location + Time model


In [137]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
                                                            +LocationFeatureIndices+TimeFeatureIndices], cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
                                                            +LocationFeatureIndices+TimeFeatureIndices], cleanedResultVector)


Found 24 high confidence predictions out of 52
Found 28 high confidence predictions out of 52
Found 22 high confidence predictions out of 51
Found 27 high confidence predictions out of 51
Found 25 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.490196078431
for prob 0.9, scoring only on high confidence predictions 0.97744973545
for prob 0.9, scoring on all predictions 0.918250377074
Found 20 high confidence predictions out of 52
Found 23 high confidence predictions out of 52
Found 30 high confidence predictions out of 51
Found 22 high confidence predictions out of 51
Found 23 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.459502262443
for prob 0.95, scoring only on high confidence predictions 0.984637681159
for prob 0.95, scoring on all predictions 0.918325791855
Found 26 high confidence predictions out of 52
Found 28 high confidence predictions out of 52
Found 20 high confidence predictions out of 51
Found 16 high confidence predictions out of 51
Found 23 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.439064856712
for prob 0.99, scoring only on high confidence predictions 0.975714285714
for prob 0.99, scoring on all predictions 0.906787330317
[0.49019607843137258, 0.45950226244343889, 0.43906485671191547]
[0.29714285714285715, 0.13, 0.089999999999999997]
[0.29173604173604173, 0.32311688311688314, 0.29255799755799761]
[0.0, 0.0, 0.0]
[0.14857142857142858, 0.17333333333333334, 0.18333333333333332]
[0.62998647734956048, 0.58961990595611291, 0.58046296296296296]
[0.97744973544973546, 0.9846376811594203, 0.97571428571428565]
[0.91825037707390644, 0.91832579185520358, 0.90678733031674208]
[[ 0.92        0.08        0.          0.        ]
 [ 0.06818182  0.86515152  0.01666667  0.05      ]
 [ 0.          0.          0.95        0.05      ]
 [ 0.00625     0.05080645  0.00645161  0.93649194]]

In [51]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance


distance 0.0256169893858
duration 0.0268814393768
first filter mode 0.172151263686
sectionId 0.0
avg speed 0.0625403321809
speed EV 0.0960213388378
speed variance 0.143150275028
max speed 0.0931226349765
max accel 0.00531485834888
isCommute 0.000423831736124
heading change rate 0.0254355893953
stop rate 0.0402968711831
velocity change rate 0.08843047259
start lat 0.0
start lng 0.0242146059607
stop lat 0.0117110940089
stop lng 0.0796086808924
start hour 0.00881680182146
end hour 0.0812257452367
close to bus stop 0.0118881169651
close to train stop 0.00314905838986

Generic + Advanced + BusTrain + Location + Time + RouteMatching model


In [138]:
forestClf = ensemble.RandomForestClassifier()
exploreKFoldValidationSpace(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
                                                            +LocationFeatureIndices+TimeFeatureIndices+RouteMatchingFeatureIndices],
                            cleanedResultVector, 5)
printConfusionMatrix(forestClf, cleanedFeatureMatrix[:,genericFeatureIndices+AdvancedFeatureIndices+BusTrainFeatureIndices
                                                            +LocationFeatureIndices+TimeFeatureIndices+RouteMatchingFeatureIndices], 
                     cleanedResultVector)


Found 32 high confidence predictions out of 52
Found 28 high confidence predictions out of 52
Found 26 high confidence predictions out of 51
Found 24 high confidence predictions out of 51
Found 21 high confidence predictions out of 51
for prob 0.9, percentage auto classified 0.509200603318
for prob 0.9, scoring only on high confidence predictions 0.986057692308
for prob 0.9, scoring on all predictions 0.910256410256
Found 32 high confidence predictions out of 52
Found 21 high confidence predictions out of 52
Found 24 high confidence predictions out of 51
Found 25 high confidence predictions out of 51
Found 28 high confidence predictions out of 51
for prob 0.95, percentage auto classified 0.50580693816
for prob 0.95, scoring only on high confidence predictions 0.970833333333
for prob 0.95, scoring on all predictions 0.918476621418
Found 29 high confidence predictions out of 52
Found 26 high confidence predictions out of 52
Found 23 high confidence predictions out of 51
Found 26 high confidence predictions out of 51
Found 23 high confidence predictions out of 51
for prob 0.99, percentage auto classified 0.493891402715
for prob 0.99, scoring only on high confidence predictions 0.97671548841
for prob 0.99, scoring on all predictions 0.922247360483
[0.50920060331825034, 0.50580693815987932, 0.49389140271493215]
[0.3833333333333333, 0.21999999999999997, 0.22333333333333333]
[0.29048337283631404, 0.24551448551448551, 0.32264069264069267]
[0.0, 0.0, 0.0]
[0.53333333333333344, 0.66666666666666663, 0.10666666666666666]
[0.62142127349023901, 0.64612333810053502, 0.6481710390249289]
[0.98605769230769236, 0.97083333333333344, 0.9767154884096414]
[0.91025641025641024, 0.91847662141779796, 0.92224736048265465]
[[ 0.9         0.1         0.          0.        ]
 [ 0.05151515  0.8469697   0.01666667  0.08484848]
 [ 0.          0.          0.95        0.05      ]
 [ 0.          0.05080645  0.          0.94919355]]

In [117]:
for (i, importance) in enumerate(forestClf.feature_importances_):
    print featureLabels[i], importance


distance 0.0149112832964
duration 0.0149490182512
first filter mode 0.0936947642492
sectionId 0.00134812685151
avg speed 0.0246087197342
speed EV 0.0579008538752
speed variance 0.159546611099
max speed 0.0697263977761
max accel 0.0265435833545
isCommute 0.000393058016184
heading change rate 0.114038868064
stop rate 0.119591253375
velocity change rate 0.0552726874989
start lat 0.0
start lng 0.00878045573343
stop lat 0.0119286869256
stop lng 0.0878309027517
start hour 0.0114351948012
end hour 0.0381995245631
close to bus stop 0.0160942828294
close to train stop 0.00680133546922
walking 0.00440596531638
running 0.0
cycling 0.0070542681769
transport 0.0
bus 0.0
train 0.0241884816585
car 0.0157533180849
mixed 0.0150023582487

In [99]:
print(cleanedFeatureMatrix.shape)


(257, 29)
