Training: Cruising Identification

Written by Orysya Stus


In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
%pylab inline

Multi-Step Classification


In [ ]:
# Load the week of trip-metadata JSON files into a single DataFrame
data = !ls *.json
one_week = []
for d in data:
    with open(d) as f:
        resulting_json = json.load(f)
        one_week.append(pd.DataFrame(resulting_json))
metadata = pd.concat(one_week, axis=0)
metadata = metadata.reset_index(drop=True)

Check if duplicate hashes exist and remove unneeded columns.


In [ ]:
# Duplicate checker
print('Number of instances: ', metadata.shape[0])
print('Number of unique hashes: ', metadata['group'].nunique())
a = pd.DataFrame(metadata.groupby('group')['group'].count())
group_duplicates = list(a.index[a['group'] > 1])
print('Number of duplicates: ', len(group_duplicates))
index_hash_mapping = dict(zip(list(metadata.index), list(metadata['group'])))
del metadata['group']
del metadata['id']
del metadata['reduction']
del metadata['timeofday']
del metadata['weekday']
del metadata['date']
del metadata['fhv']

# These features would not provide additional information gain
del metadata['distance_shortest']
del metadata['distance_total']
del metadata['duration_of_trip']
metadata = metadata.astype(float)

# Convert speeds to mph (1 m/s ≈ 2.23694 mph)
metadata['speed_average'] = metadata['speed_average']*2.23694
metadata['speed_max'] = metadata['speed_max']*2.23694
metadata['speed_standard_deviation'] = metadata['speed_standard_deviation']*2.23694

In [ ]:
print('For modeling, the shape of the metadata is: ', metadata.shape)
metadata.head()

Overview of Metadata

Summary Statistics
Attribute Histograms
Attribute Correlations
Attribute Covariances

In [ ]:
metadata.describe()

In [ ]:
metadata.hist(bins=100, figsize=(20,20));

In [ ]:
metadata.corr()

In [ ]:
metadata.cov()

In [ ]:
# from pandas.plotting import scatter_matrix
# scatter_matrix(metadata, alpha=0.03, figsize=(20, 20), diagonal='kde');

Training Data Creation

Not Cruising:


In [ ]:
print('time_percentage_driving == 0 and time_percentage_bogus == 0 count: ', metadata[(metadata['time_percentage_driving'] == 0) & (metadata['time_percentage_bogus'] == 0)].shape[0])
print('time_percentage_driving == 0 and time_percentage_bogus == 0 %: ', round(100.0*metadata[(metadata['time_percentage_driving'] == 0) & (metadata['time_percentage_bogus'] == 0)].shape[0]/metadata.shape[0], 2),'%')

print('\ntime_percentage_walking == 100 count: ', metadata['time_percentage_walking'][metadata['time_percentage_walking'] == 100].count())
print('time_percentage_walking == 100 %: ', round(100.0*metadata['time_percentage_walking'][metadata['time_percentage_walking'] == 100].count()/metadata.shape[0], 2))

print('\nMark above cases as not cruising, to use as training data.')
not_cruising = []
not_cruising.extend(list(metadata[(metadata['time_percentage_driving'] == 0) & (metadata['time_percentage_bogus'] == 0)].index))
not_cruising.extend(list(metadata['time_percentage_walking'][metadata['time_percentage_walking'] == 100].index))

not_cruising = list(set(not_cruising))
print('Not cruising count: ', len(not_cruising))
print('Not cruising %: ', round(100.0*len(not_cruising)/metadata.shape[0], 2),'%')
metadata1 = metadata[~metadata.index.isin(not_cruising)]

# Trips where distance_ratio == 1 --> not cruising
not_cruising_threshold = 1
print('\nNumber of instances where distance_ratio ==', not_cruising_threshold,': ', metadata1['distance_ratio'][metadata1['distance_ratio'] == not_cruising_threshold].count())
print('% of instances where distance_ratio ==', not_cruising_threshold, ': ', round(100.0*metadata1['distance_ratio'][metadata1['distance_ratio'] == not_cruising_threshold].count()/metadata.shape[0], 2), '%')
not_cruising.extend(list(metadata1[metadata1['distance_ratio'] == not_cruising_threshold].index))

print('\n*********************For Training: Not Cruising********************* \nTotal not cruising count: ', len(not_cruising))
print('Total not cruising %: ', round(100.0*len(not_cruising)/metadata.shape[0], 2),'%')

metadata1 = metadata[~metadata.index.isin(not_cruising)]

Cruising:


In [ ]:
# Trips where distance_ratio < 0.3 --> cruising
cruising_threshold = 0.3
print('\n*********************For Training: Cruising********************* \ndistance_ratio <', cruising_threshold,'count: ', metadata1['distance_ratio'][metadata1['distance_ratio'] < cruising_threshold].count())
print('distance_ratio <', cruising_threshold, '%: ', round(100.0*metadata1['distance_ratio'][metadata1['distance_ratio'] < cruising_threshold].count()/metadata.shape[0], 2), '%')
cruising = list(metadata1[metadata1['distance_ratio'] < cruising_threshold].index)
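
For reuse, the two labeling passes above can be collapsed into one helper. The sketch below is illustrative only (the name label_indices is not part of the original notebook); it applies the same heuristics in the same order, so calling it on metadata should reproduce the not_cruising and cruising index lists built above.


In [ ]:
# Illustrative helper (an assumption, not part of the original analysis) applying the
# labeling heuristics above: no driving/bogus time, all walking, or distance_ratio at the
# not-cruising threshold --> not cruising; distance_ratio below the cruising threshold --> cruising.
def label_indices(df, not_cruising_threshold=1, cruising_threshold=0.3):
    not_cruising = set(df[(df['time_percentage_driving'] == 0) & (df['time_percentage_bogus'] == 0)].index)
    not_cruising |= set(df[df['time_percentage_walking'] == 100].index)
    remaining = df[~df.index.isin(not_cruising)]
    not_cruising |= set(remaining[remaining['distance_ratio'] == not_cruising_threshold].index)
    remaining = df[~df.index.isin(not_cruising)]
    cruising = list(remaining[remaining['distance_ratio'] < cruising_threshold].index)
    return list(not_cruising), cruising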

Modeling


In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import seaborn as sns
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn import preprocessing, metrics, svm, ensemble
from sklearn.metrics import accuracy_score, classification_report
from sklearn.externals import joblib
from sklearn.tree import DecisionTreeClassifier, export_graphviz

Separate the data into knowns (labeled, used for training) and unknowns (to be predicted).


In [ ]:
not_cruising_data = metadata[metadata.index.isin(not_cruising)].copy()
not_cruising_data['label'] = 'not_cruising'
cruising_data = metadata[metadata.index.isin(cruising)].copy()
cruising_data['label'] = 'cruising'

knowns = pd.concat([not_cruising_data, cruising_data])
unknowns = metadata[(~metadata.index.isin(not_cruising)) & (~metadata.index.isin(cruising))].copy()
del knowns['distance_ratio']
del unknowns['distance_ratio']
print('Knowns: ', knowns.shape, 'Unknowns: ', unknowns.shape)
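
Before building the training arrays, a quick look at the class balance of the labeled data is useful (a one-line check, not in the original notebook):


In [ ]:
# Class balance of the labeled training data
knowns['label'].value_counts()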

Build training dataset for modeling.


In [ ]:
def training_data(dataframe, standardize, random_state):
    """Prepare the train & test arrays for modeling.
    Pass standardize='True' to scale the features; the fitted scaler is saved to disk."""
    data = np.array(dataframe.loc[:, dataframe.columns != 'label'])
    dic = {'cruising':1, 'not_cruising':0}
    target = np.array([dic[n] if n in dic else n for n in np.array(dataframe.iloc[:, -1])])

    if standardize == 'True':
        # Scale the data. This assumes all features should be centered around 0 with variances
        # of the same order; a feature whose variance is orders of magnitude larger than the
        # others can dominate the objective function and keep the estimator from learning from
        # the remaining features. Note that StandardScaler requires the data to contain no nulls.
        scaler = preprocessing.StandardScaler().fit(data)
        data = scaler.transform(data)
        joblib.dump(scaler, 'standardize_X.pkl')

    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.4, random_state=random_state)
    return X_train, y_train, X_test, y_test
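
A minimal usage sketch, assuming the knowns DataFrame built above: with standardize='True' the features are scaled and the fitted scaler is persisted to standardize_X.pkl, so the identical scaling can be reapplied to new data later.


In [ ]:
# Illustrative call (not part of the original run, which passes standardize='False')
X_train, y_train, X_test, y_test = training_data(knowns, 'True', 0)
scaler = joblib.load('standardize_X.pkl')  # the scaler persisted inside training_data
print(X_train.shape, X_test.shape)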

Decision Tree


In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    print('\n********************** For round', i, 'validation set. ********************** ')
    X_train, y_train, X_test, y_test = training_data(knowns, 'False', i)
    kf = StratifiedKFold(y_train, n_folds=10, shuffle=True, random_state=0)
    gridparams = dict(criterion=['gini', 'entropy'], max_depth=[2,3,4,5,6,7,8,9,10])
    params = {'random_state': None}
    dt = GridSearchCV(DecisionTreeClassifier(**params), gridparams, cv=kf, scoring='f1', n_jobs=-1)
    
    # Training the model
    dt.fit(X_train, y_train)
    print('Best model: ')
    print(dt.best_estimator_)
    print("")
    
    # Predicting on the test set
    y_pred = dt.predict(X_test)
    
    # Scoring the performance of the model
    print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred, normalize=True))
    print("Aucroc: %0.3f" % metrics.roc_auc_score(y_test, y_pred))
    print("f1 score: %0.3f" % metrics.f1_score(y_test, y_pred))
    print("Recall: %0.3f" % metrics.recall_score(y_test, y_pred))
    print("Precision: %0.3f" % metrics.precision_score(y_test, y_pred))
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))    

print('\n********************** Decision Tree Performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (mean(accuracy), std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (mean(roc), std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (mean(precision), std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (mean(recall), std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (mean(f1_score), std(f1_score)))

Retrain on all of the known data using the best model (selected by majority vote over the validation rounds above):

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
        max_features=None, max_leaf_nodes=None,
        min_impurity_split=1e-07, min_samples_leaf=1,
        min_samples_split=2, min_weight_fraction_leaf=0.0,
        presort=False, random_state=None, splitter='best')

In [ ]:
dt = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
data = np.array(knowns.loc[:, knowns.columns != 'label'])
dic = {'cruising':1, 'not_cruising':0}
target = np.array([dic[n] if n in dic else n for n in np.array(knowns.iloc[:, -1])])

dt.fit(data, target)
joblib.dump(dt, 'decision_tree.pkl')

In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    X_train, y_train, X_test, y_test = training_data(knowns, 'False', i)
    
    # Predicting on the test set
    y_pred = dt.predict(X_test)
    
    # Scoring the performance of the model
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))    

print('\n********************** Decision Tree Final Model Performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (mean(accuracy), std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (mean(roc), std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (mean(precision), std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (mean(recall), std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (mean(f1_score), std(f1_score)))

In [ ]:
cm = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred))
sns.heatmap(cm, annot=True, fmt=".3f", linewidth=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Decision Tree Confusion Matrix', size = 10);

Logistic Regression


In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    print('\n********************** For round', i, 'validation set. ********************** ')
    X_train, y_train, X_test, y_test = training_data(knowns, 'False', i)
    kf = StratifiedKFold(y_train, n_folds=10, shuffle=True, random_state=0)
    gridparams = dict(C=list(np.power(10.0, np.arange(-10, 10))))
    params = {'penalty':'l2'}
    logreg = GridSearchCV(LogisticRegression(**params), gridparams, cv=kf, scoring='f1', n_jobs=-1)
    
    # Training the model
    logreg.fit(X_train, y_train)
    print('Best model: ')
    print(logreg.best_estimator_)
    print("")
    
    # Predicting on the test set
    y_pred = logreg.predict(X_test)
    
    # Scoring the performance of the model
    print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred, normalize=True))
    print("Aucroc: %0.3f" % metrics.roc_auc_score(y_test, y_pred))
    print("f1 score: %0.3f" % metrics.f1_score(y_test, y_pred))
    print("Recall: %0.3f" % metrics.recall_score(y_test, y_pred))
    print("Precision: %0.3f" % metrics.precision_score(y_test, y_pred))
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))    

print('\n********************** Logistic Regression Performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (mean(accuracy), std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (mean(roc), std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (mean(precision), std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (mean(recall), std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (mean(f1_score), std(f1_score)))

Retrain on all of the known data using the best model (selected by majority vote over the validation rounds above):

LogisticRegression(C=100000000.0, class_weight=None, dual=False,
      fit_intercept=True, intercept_scaling=1, max_iter=100,
      multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
      solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [ ]:
logreg = LogisticRegression(C=100000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
data = np.array(knowns.loc[:, knowns.columns != 'label'])
dic = {'cruising':1, 'not_cruising':0}
target = np.array([dic[n] if n in dic else n for n in np.array(knowns.iloc[:, -1])])

logreg.fit(data, target)
joblib.dump(logreg, 'logistic_regression.pkl')

In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    X_train, y_train, X_test, y_test = training_data(knowns, 'False', i)
    
    # Predicting on the test set
    y_pred = logreg.predict(X_test)
    
    # Scoring the performance of the model
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))    

print('\n********************** Logistic Regression Final Model Performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (mean(accuracy), std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (mean(roc), std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (mean(precision), std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (mean(recall), std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (mean(f1_score), std(f1_score)))

In [ ]:
cm = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred))
sns.heatmap(cm, annot=True, fmt=".3f", linewidth=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Logistic Regression Confusion Matrix', size = 10);

Gradient Boosting Classifier


In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    print('\n********************** For round', i, 'validation set. ********************** ')
    X_train, y_train, X_test, y_test = training_data(knowns, 'False', i)
    kf = StratifiedKFold(y_train, n_folds=10, shuffle=True, random_state=0)
    gridparams = dict(learning_rate=[0.01, 0.1],loss=['deviance','exponential'])
    params = {'n_estimators': 100, 'max_depth': 4}
    gbclf = GridSearchCV(ensemble.GradientBoostingClassifier(**params), gridparams, cv=kf, scoring='f1', n_jobs=-1)
    
    # Training the model
    gbclf.fit(X_train, y_train)
    print('Best model: ')
    print(gbclf.best_estimator_)
    print("")
    
    # Predicting on the test set
    y_pred = gbclf.predict(X_test)
    
    # Scoring the performance of the model
    print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred, normalize=True))
    print("Aucroc: %0.3f" % metrics.roc_auc_score(y_test, y_pred))
    print("f1 score: %0.3f" % metrics.f1_score(y_test, y_pred))
    print("Recall: %0.3f" % metrics.recall_score(y_test, y_pred))
    print("Precision: %0.3f" % metrics.precision_score(y_test, y_pred))
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))    

print('\n********************** Gradient Boosting Classifier Performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (mean(accuracy), std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (mean(roc), std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (mean(precision), std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (mean(recall), std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (mean(f1_score), std(f1_score)))

Retrain on all of the known data using the best model (selected by majority vote over the validation rounds above):

GradientBoostingClassifier(criterion='friedman_mse', init=None,
          learning_rate=0.1, loss='deviance', max_depth=4,
          max_features=None, max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=100, presort='auto', random_state=None,
          subsample=1.0, verbose=0, warm_start=False)

In [ ]:
gbclf = ensemble.GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
data = np.array(knowns.loc[:, knowns.columns != 'label'])
dic = {'cruising':1, 'not_cruising':0}
target = np.array([dic[n] if n in dic else n for n in np.array(knowns.iloc[:, -1])])

gbclf.fit(data, target)
joblib.dump(gbclf, 'gradient_boosting_classifier.pkl')

In [ ]:
print('Gradient Boosting Classifier Feature Importance')
for f in range(len(knowns.columns[:-1])):
    print('\nFeature: ', knowns.columns[:-1][f], '\nImportance: ', gbclf.feature_importances_[f])
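
The same importances are easier to scan when sorted; the one-liner below is a small convenience sketch rather than part of the original output.


In [ ]:
# Convenience view: feature importances sorted from largest to smallest
pd.Series(gbclf.feature_importances_, index=knowns.columns[:-1]).sort_values(ascending=False)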

In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    X_train, y_train, X_test, y_test = training_data(knowns, 'False', i)
    
    # Predicting on the test set
    y_pred = gbclf.predict(X_test)
    
    # Scoring the performance of the model
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))    

print('\n********************** Gradient Boosting Classifier Final Model Performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (mean(accuracy), std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (mean(roc), std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (mean(precision), std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (mean(recall), std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (mean(f1_score), std(f1_score)))

In [ ]:
cm = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred))
sns.heatmap(cm, annot=True, fmt=".3f", linewidth=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Gradient Boosting Classifier Confusion Matrix', size = 10);
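
With the three models persisted, the unknown trips can be labeled. The cell below is a minimal sketch, assuming the unknowns DataFrame, the index_hash_mapping dictionary, and the decision_tree.pkl file produced above; it is not output that appears in the original notebook.


In [ ]:
# Sketch: load a persisted model and predict cruising (1) vs. not cruising (0) for the unknowns
dt = joblib.load('decision_tree.pkl')
unknowns_labeled = unknowns.copy()
unknowns_labeled['predicted_label'] = dt.predict(np.array(unknowns))  # same feature order as training
unknowns_labeled['group'] = [index_hash_mapping[i] for i in unknowns_labeled.index]  # recover trip hashes
unknowns_labeled['predicted_label'].value_counts()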
