In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
%matplotlib inline
In [ ]:
data = !ls *.json
one_week = []
for d in data:
    with open(d) as f:
        resulting_json = json.load(f)
    one_week.append(pd.DataFrame(resulting_json))
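# Stack the week's worth of trip metadata into a single DataFrame with a fresh index.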
metadata = pd.concat(one_week, axis=0)
metadata = metadata.reset_index(drop=True)
Check if duplicate hashes exist and remove unneeded columns.
In [ ]:
# Duplicate checker
print('Number of instances: ', metadata.shape[0])
print('Number of unique hashes: ', metadata['group'].nunique())
a = pd.DataFrame(metadata.groupby('group')['group'].count())
group_duplicates = list(a.index[a['group'] > 1])
print('Number of duplicates: ', len(group_duplicates))
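# Keep an index-to-hash mapping so rows can be traced back after the 'group' column is dropped below.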
index_hash_mapping = dict(zip(list(metadata.index), list(metadata['group'])))
del metadata['group']
del metadata['id']
del metadata['reduction']
del metadata['timeofday']
del metadata['weekday']
del metadata['date']
del metadata['fhv']
# These features would not provide additional information gain
del metadata['distance_shortest']
del metadata['distance_total']
del metadata['duration_of_trip']
metadata = metadata.astype(float)
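# Convert speeds to mph (2.23694 is the m/s -> mph factor; this assumes the raw speeds are recorded in m/s).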
metadata['speed_average'] = metadata['speed_average']*2.23694
metadata['speed_max'] = metadata['speed_max']*2.23694
metadata['speed_standard_deviation'] = metadata['speed_standard_deviation']*2.23694
In [ ]:
print('For modeling, the shape of the metadata is: ', metadata.shape)
metadata.head()
In [ ]:
metadata.describe()
In [ ]:
metadata.hist(bins=100, figsize=(20,20));
In [ ]:
metadata.corr()
In [ ]:
metadata.cov()
In [ ]:
# from pandas.plotting import scatter_matrix
# scatter_matrix(metadata, alpha=0.03, figsize=(20, 20), diagonal='kde');
In [ ]:
print('time_percentage_driving == 0 and time_percentage_bogus == 0 count: ', metadata[(metadata['time_percentage_driving'] == 0) & (metadata['time_percentage_bogus'] == 0)].shape[0])
print('time_percentage_driving == 0 and time_percentage_bogus == 0 %: ', round(100.0*metadata[(metadata['time_percentage_driving'] == 0) & (metadata['time_percentage_bogus'] == 0)].shape[0]/metadata.shape[0], 2),'%')
print('\ntime_percentage_walking == 100 count: ', metadata['time_percentage_walking'][metadata['time_percentage_walking'] == 100].count())
print('time_percentage_walking == 100 %: ', round(100.0*metadata['time_percentage_walking'][metadata['time_percentage_walking'] == 100].count()/metadata.shape[0], 2))
print('\nMark above cases as not cruising, to use as training data.')
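# Heuristic: trips with no driving and no bogus time, or with 100% walking time, cannot be cruising.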
not_cruising = []
not_cruising.extend(list(metadata[(metadata['time_percentage_driving'] == 0) & (metadata['time_percentage_bogus'] == 0)].index))
not_cruising.extend(list(metadata['time_percentage_walking'][metadata['time_percentage_walking'] == 100].index))
not_cruising = list(set(not_cruising))
print('Not cruising count: ', len(not_cruising))
print('Not cruising %: ', round(100.0*len(not_cruising)/metadata.shape[0], 2),'%')
metadata1 = metadata[~metadata.index.isin(not_cruising)]
# Trips where distance_ratio == 1 are treated as not cruising.
not_cruising_threshold = 1
print('\nNumber of instances where distance_ratio ==', not_cruising_threshold,': ', metadata1['distance_ratio'][metadata1['distance_ratio'] == not_cruising_threshold].count())
print('% of instances where distance_ratio ==', not_cruising_threshold, ': ', round(100.0*metadata1['distance_ratio'][metadata1['distance_ratio'] == not_cruising_threshold].count()/metadata.shape[0], 2), '%')
not_cruising.extend(list(metadata1[metadata1['distance_ratio'] == not_cruising_threshold].index))
print('\n*********************For Training: Not Cruising********************* \nTotal not cruising count: ', len(not_cruising))
print('Total not cruising %: ', round(100.0*len(not_cruising)/metadata.shape[0], 2),'%')
metadata1 = metadata[~metadata.index.isin(not_cruising)]
Cruising:
In [ ]:
# Trips where distance_ratio < 0.3 are treated as cruising.
cruising_threshold = 0.3
print('\n*********************For Training: Cruising********************* \ndistance_ratio <', cruising_threshold,'count: ', metadata1['distance_ratio'][metadata1['distance_ratio'] < cruising_threshold].count())
print('distance_ratio <', cruising_threshold, '%: ', round(100.0*metadata1['distance_ratio'][metadata1['distance_ratio'] < cruising_threshold].count()/metadata.shape[0], 2), '%')
cruising = list(metadata1[metadata1['distance_ratio'] < cruising_threshold].index)
In [ ]:
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     cross_val_score, cross_val_predict, StratifiedKFold)
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import seaborn as sns
from sklearn import preprocessing, metrics, svm, ensemble
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import joblib  # replaces the deprecated sklearn.externals.joblib
Separate the data into knowns (labeled for training) and unknowns (still to be predicted).
In [ ]:
not_cruising_data = metadata[metadata.index.isin(not_cruising)].copy()
not_cruising_data['label'] = 'not_cruising'
cruising_data = metadata[metadata.index.isin(cruising)].copy()
cruising_data['label'] = 'cruising'
knowns = pd.concat([not_cruising_data, cruising_data])
unknowns = metadata[(~metadata.index.isin(not_cruising)) & (~metadata.index.isin(cruising))].copy()
del knowns['distance_ratio']
del unknowns['distance_ratio']
print('Knowns: ', knowns.shape, 'Unknowns: ', unknowns.shape)
Build training dataset for modeling.
In [ ]:
def training_data(dataframe, standardize, random_state):
    """Prepare the train and test arrays for modeling.
    Set standardize=True to scale the features before splitting."""
    data = np.array(dataframe.loc[:, dataframe.columns != 'label'])
    dic = {'cruising': 1, 'not_cruising': 0}
    target = np.array([dic[n] if n in dic else n for n in np.array(dataframe.iloc[:, -1])])
    if standardize:
        # Scale the data. This assumes all features should be centered around 0 with variances
        # of the same order; a feature with a variance orders of magnitude larger than the others
        # can dominate the objective function and keep the estimator from learning from the
        # remaining features. Note that StandardScaler requires the data to be free of nulls
        # before fitting.
        scaler = preprocessing.StandardScaler().fit(data)
        data = scaler.transform(data)
        joblib.dump(scaler, 'standardize_X.pkl')
    X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.4, random_state=random_state)
    return X_train, y_train, X_test, y_test
In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    print('\n********************** For round', i, 'validation set. ********************** ')
    X_train, y_train, X_test, y_test = training_data(knowns, False, i)
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    gridparams = dict(criterion=['gini', 'entropy'], max_depth=[2, 3, 4, 5, 6, 7, 8, 9, 10])
    params = {'random_state': None}
    dt = GridSearchCV(DecisionTreeClassifier(**params), gridparams, cv=kf, scoring='f1', n_jobs=-1)
    # Training the model
    dt.fit(X_train, y_train)
    print('Best model: ')
    print(dt.best_estimator_)
    print("")
    # Predicting on the test set
    y_pred = dt.predict(X_test)
    # Scoring the performance of the model
    print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred, normalize=True))
    print("Aucroc: %0.3f" % metrics.roc_auc_score(y_test, y_pred))
    print("f1 score: %0.3f" % metrics.f1_score(y_test, y_pred))
    print("Recall: %0.3f" % metrics.recall_score(y_test, y_pred))
    print("Precision: %0.3f" % metrics.precision_score(y_test, y_pred))
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))
print('\n********************** Decision tree performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (np.mean(accuracy), np.std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (np.mean(roc), np.std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (np.mean(precision), np.std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (np.mean(recall), np.std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (np.mean(f1_score), np.std(f1_score)))
Retrain on all of the known data using the best model (selected by majority vote across the validation rounds above):
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
max_features=None, max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
In [ ]:
dt = DecisionTreeClassifier(criterion='gini', max_depth=8)
data = np.array(knowns.loc[:, knowns.columns != 'label'])
dic = {'cruising': 1, 'not_cruising': 0}
target = np.array([dic[n] if n in dic else n for n in np.array(knowns.iloc[:, -1])])
dt.fit(data, target)
joblib.dump(dt, 'decision_tree.pkl')
In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    X_train, y_train, X_test, y_test = training_data(knowns, False, i)
    # Predicting on the test set
    y_pred = dt.predict(X_test)
    # Scoring the performance of the model
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))
print('\n********************** Decision Tree Final Model Performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (np.mean(accuracy), np.std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (np.mean(roc), np.std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (np.mean(precision), np.std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (np.mean(recall), np.std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (np.mean(f1_score), np.std(f1_score)))
In [ ]:
cm = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred))
sns.heatmap(cm, annot=True, fmt=".3f", linewidth=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Decision Tree Confusion Matrix', size = 10);
In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    print('\n********************** For round', i, 'validation set. ********************** ')
    X_train, y_train, X_test, y_test = training_data(knowns, False, i)
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    gridparams = dict(C=list(np.power(10.0, np.arange(-10, 10))))
    params = {'penalty': 'l2'}
    logreg = GridSearchCV(LogisticRegression(**params), gridparams, cv=kf, scoring='f1', n_jobs=-1)
    # Training the model
    logreg.fit(X_train, y_train)
    print('Best model: ')
    print(logreg.best_estimator_)
    print("")
    # Predicting on the test set
    y_pred = logreg.predict(X_test)
    # Scoring the performance of the model
    print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred, normalize=True))
    print("Aucroc: %0.3f" % metrics.roc_auc_score(y_test, y_pred))
    print("f1 score: %0.3f" % metrics.f1_score(y_test, y_pred))
    print("Recall: %0.3f" % metrics.recall_score(y_test, y_pred))
    print("Precision: %0.3f" % metrics.precision_score(y_test, y_pred))
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))
print('\n********************** Logistic Regression performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (np.mean(accuracy), np.std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (np.mean(roc), np.std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (np.mean(precision), np.std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (np.mean(recall), np.std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (np.mean(f1_score), np.std(f1_score)))
Retrain on all of the known data using the best model (selected by majority vote across the validation rounds above):
LogisticRegression(C=100000000.0, class_weight=None, dual=False,
fit_intercept=True, intercept_scaling=1, max_iter=100,
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
solver='liblinear', tol=0.0001, verbose=0, warm_start=False)
In [ ]:
logreg = LogisticRegression(C=100000000.0, penalty='l2', solver='liblinear')
data = np.array(knowns.loc[:, knowns.columns != 'label'])
dic = {'cruising': 1, 'not_cruising': 0}
target = np.array([dic[n] if n in dic else n for n in np.array(knowns.iloc[:, -1])])
logreg.fit(data, target)
joblib.dump(logreg, 'logistic_regression.pkl')
In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    X_train, y_train, X_test, y_test = training_data(knowns, False, i)
    # Predicting on the test set
    y_pred = logreg.predict(X_test)
    # Scoring the performance of the model
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))
print('\n********************** Logistic Regression Final Model Performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (np.mean(accuracy), np.std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (np.mean(roc), np.std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (np.mean(precision), np.std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (np.mean(recall), np.std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (np.mean(f1_score), np.std(f1_score)))
In [ ]:
cm = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred))
sns.heatmap(cm, annot=True, fmt=".3f", linewidth=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Logistic Regression Confusion Matrix', size = 10);
In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    print('\n********************** For round', i, 'validation set. ********************** ')
    X_train, y_train, X_test, y_test = training_data(knowns, False, i)
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
    # Note: newer scikit-learn releases rename the 'deviance' loss to 'log_loss'.
    gridparams = dict(learning_rate=[0.01, 0.1], loss=['deviance', 'exponential'])
    params = {'n_estimators': 100, 'max_depth': 4}
    gbclf = GridSearchCV(ensemble.GradientBoostingClassifier(**params), gridparams, cv=kf, scoring='f1', n_jobs=-1)
    # Training the model
    gbclf.fit(X_train, y_train)
    print('Best model: ')
    print(gbclf.best_estimator_)
    print("")
    # Predicting on the test set
    y_pred = gbclf.predict(X_test)
    # Scoring the performance of the model
    print("Accuracy: %0.3f" % accuracy_score(y_test, y_pred, normalize=True))
    print("Aucroc: %0.3f" % metrics.roc_auc_score(y_test, y_pred))
    print("f1 score: %0.3f" % metrics.f1_score(y_test, y_pred))
    print("Recall: %0.3f" % metrics.recall_score(y_test, y_pred))
    print("Precision: %0.3f" % metrics.precision_score(y_test, y_pred))
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))
print('\n********************** Gradient Boosting Classifier performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (np.mean(accuracy), np.std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (np.mean(roc), np.std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (np.mean(precision), np.std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (np.mean(recall), np.std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (np.mean(f1_score), np.std(f1_score)))
Retrain on all of the known data using the best model (selected by majority vote across the validation rounds above):
GradientBoostingClassifier(criterion='friedman_mse', init=None,
learning_rate=0.1, loss='deviance', max_depth=4,
max_features=None, max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=100, presort='auto', random_state=None,
subsample=1.0, verbose=0, warm_start=False)
In [ ]:
gbclf = ensemble.GradientBoostingClassifier(max_depth=4, n_estimators=100)
data = np.array(knowns.loc[:, knowns.columns != 'label'])
dic = {'cruising': 1, 'not_cruising': 0}
target = np.array([dic[n] if n in dic else n for n in np.array(knowns.iloc[:, -1])])
gbclf.fit(data, target)
joblib.dump(gbclf, 'gradient_boosting_classifier.pkl')
In [ ]:
print('Gradient Boosting Classifier Feature Importance')
for f in range(len(knowns.columns[:-1])):
    print('\nFeature: ', knowns.columns[:-1][f], '\nImportance: ', gbclf.feature_importances_[f])
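For readability, the same importances can also be collected into a sorted pandas Series; a minimal sketch using only objects already defined above:
In [ ]:
# Rank features by their contribution to the gradient boosting model.
feature_importance = pd.Series(gbclf.feature_importances_, index=knowns.columns[:-1])
print(feature_importance.sort_values(ascending=False))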
In [ ]:
accuracy = []
roc = []
f1_score = []
recall = []
precision = []
for i in range(10):
    X_train, y_train, X_test, y_test = training_data(knowns, False, i)
    # Predicting on the test set
    y_pred = gbclf.predict(X_test)
    # Scoring the performance of the model
    accuracy.append(accuracy_score(y_test, y_pred, normalize=True))
    roc.append(metrics.roc_auc_score(y_test, y_pred))
    f1_score.append(metrics.f1_score(y_test, y_pred))
    recall.append(metrics.recall_score(y_test, y_pred))
    precision.append(metrics.precision_score(y_test, y_pred))
print('\n********************** Gradient Boosting Classifier Final Model Performance ********************** ')
print('The accuracy for this model is: %0.3f +/- %0.3f' % (np.mean(accuracy), np.std(accuracy)))
print('The auc_roc for this model is: %0.3f +/- %0.3f' % (np.mean(roc), np.std(roc)))
print('The precision for this model is: %0.3f +/- %0.3f' % (np.mean(precision), np.std(precision)))
print('The recall for this model is: %0.3f +/- %0.3f' % (np.mean(recall), np.std(recall)))
print('The f1_score for this model is: %0.3f +/- %0.3f' % (np.mean(f1_score), np.std(f1_score)))
In [ ]:
cm = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred))
sns.heatmap(cm, annot=True, fmt=".3f", linewidth=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
plt.title('Gradient Boosting Classifier Confusion Matrix', size = 10);
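The unknowns can now be labeled with any of the persisted models. Below is a minimal sketch (an addition for illustration, not part of the original analysis), assuming the pickled file written above is in the working directory and using the unknowns DataFrame built earlier:
In [ ]:
# Load a persisted model and label the trips the heuristics could not classify.
clf = joblib.load('gradient_boosting_classifier.pkl')
unknown_predictions = pd.Series(clf.predict(np.array(unknowns)), index=unknowns.index)
unknown_predictions = unknown_predictions.map({1: 'cruising', 0: 'not_cruising'})
print(unknown_predictions.value_counts())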
In [ ]: