In [8]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from sklearn.learning_curve import validation_curve
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
In [7]:
# Load pre-extracted image features; the first CSV column is an exported
# index artefact, so drop it before building X / y.
data = pd.read_csv("data/driver_image.csv")
data = data.iloc[:, 1:]      # .ix is deprecated (removed in pandas 1.0) — use .iloc
X_data = data.iloc[:, :-1]   # feature columns
y_data = data.iloc[:, -1]    # last column is the class label
# random_state pins the split so every model below is evaluated on the same data
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25, random_state=0)
In [33]:
from sklearn.naive_bayes import GaussianNB
In [34]:
%%time
gaussian_nb = GaussianNB().fit(X_train, y_train)
In [35]:
%%time
gaussian_predict = gaussian_nb.predict(X_test)
In [36]:
print(classification_report(y_test, gaussian_predict))
In [17]:
confusion_matrix(y_test, gaussian_predict)
Out[17]:
In [56]:
sum(y_test != gaussian_predict)
Out[56]:
In [58]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
In [59]:
%%time
qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
In [60]:
%%time
qda_predict = qda.predict(X_test)
In [61]:
print(classification_report(y_test, qda_predict))
In [62]:
confusion_matrix(y_test, qda_predict)
Out[62]:
In [63]:
sum(y_test != qda_predict)
Out[63]:
In [4]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
In [5]:
%%time
lda = LinearDiscriminantAnalysis(n_components=3).fit(X_train, y_train)
In [19]:
%%time
lda_predict = lda.predict(X_test)
In [20]:
print(classification_report(y_test, lda_predict))
In [21]:
confusion_matrix(y_test, lda_predict)
Out[21]:
In [55]:
sum(y_test != lda_predict)
In [9]:
from sklearn.tree import DecisionTreeClassifier
In [10]:
max_depth_range = np.arange(1,102, 20)
In [11]:
%%time
train_score, test_score = validation_curve(DecisionTreeClassifier(), data.ix[:,:-1], data.ix[:,-1],
param_name = "max_depth", param_range=max_depth_range,scoring="accuracy", n_jobs=4)
In [12]:
# Plot mean accuracy with a ±1 std band for the training and CV folds.
for scores, colour, label in ((train_score, "r", "Training score"),
                              (test_score, "g", "Cross-validation score")):
    mean = scores.mean(axis=1)
    std = scores.std(axis=1)
    plt.semilogx(max_depth_range, mean, label=label, color=colour)
    plt.fill_between(max_depth_range, mean - std, mean + std, alpha=0.2, color=colour)
plt.title("Validation Curve with Tree")
plt.xlabel("max_depth")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.legend(loc="best")
plt.show()
In [37]:
%%time
tree = DecisionTreeClassifier(max_depth=60).fit(X_train, y_train)
In [38]:
%%time
tree_predict = tree.predict(X_test)
In [39]:
print(classification_report(y_test, tree_predict))
In [40]:
confusion_matrix(y_test, tree_predict)
Out[40]:
In [54]:
sum(y_test != tree_predict)
Out[54]:
In [45]:
%%time
tree_e = DecisionTreeClassifier(criterion="entropy", max_depth=60).fit(X_train, y_train)
In [42]:
%%time
tree_e_predict = tree.predict(X_test)
In [43]:
print(classification_report(y_test, tree_e_predict))
In [44]:
confusion_matrix(y_test, tree_e_predict)
Out[44]:
In [53]:
sum(y_test != tree_e_predict)
Out[53]:
In [27]:
from io import StringIO
import pydot  # FIX: `from pydot import pydot` only worked on an old package layout
from sklearn.tree import export_graphviz
from IPython.display import Image
# Render the fitted Gini tree as a PNG via graphviz dot source.
dot_buf = StringIO()
export_graphviz(tree, out_file=dot_buf, feature_names=["X" + str(i) for i in range(5600)])
# pydot >= 1.2 returns a list of graphs; take the first (and only) one.
graph = pydot.graph_from_dot_data(dot_buf.getvalue())[0]
# create_png() yields bytes — pass them straight to Image; routing them through
# a StringIO (as before) raises TypeError on Python 3.
Image(graph.create_png())
In [7]:
max_depth_range = np.arange(1,101)
In [8]:
%%time
train_score, test_score = validation_curve(RandomForestClassifier(), data.ix[:,:-1], data.ix[:,-1],
param_name = "max_depth", param_range=max_depth_range,scoring="accuracy", n_jobs=4)
In [12]:
# Mean ± 1 std bands for the forest's depth sweep.
for scores, colour, label in ((train_score, "r", "Training score"),
                              (test_score, "g", "Cross-validation score")):
    mean = scores.mean(axis=1)
    std = scores.std(axis=1)
    plt.semilogx(max_depth_range, mean, label=label, color=colour)
    plt.fill_between(max_depth_range, mean - std, mean + std, alpha=0.2, color=colour)
plt.title("Validation Curve with RanFo")
plt.xlabel("max_depth")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.legend(loc="best")
plt.show()
In [4]:
n_estimators_range = np.arange(1,201,10)
In [5]:
%%time
train_score2, test_score2 = validation_curve(RandomForestClassifier(), data.ix[:,:-1], data.ix[:,-1],
param_name = "n_estimators", param_range=n_estimators_range, scoring="accuracy", n_jobs=4)
In [6]:
# Mean ± 1 std bands for the forest's n_estimators sweep.
train_scores_mean = np.mean(train_score2, axis=1)
train_scores_std = np.std(train_score2, axis=1)
test_scores_mean = np.mean(test_score2, axis=1)
test_scores_std = np.std(test_score2, axis=1)
plt.title("Validation Curve with RanFo")
plt.xlabel("n_estimators")   # FIX: x-axis is n_estimators here, not max_depth
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.semilogx(n_estimators_range, train_scores_mean, label="Training score", color="r")
plt.fill_between(n_estimators_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color="r")
plt.semilogx(n_estimators_range, test_scores_mean, label="Cross-validation score", color="g")
plt.fill_between(n_estimators_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()
In [50]:
%%time
optimized_random_forest = RandomForestClassifier(max_depth=60, n_estimators=160).fit(X_train, y_train)
In [51]:
%%time
orf_predict = optimized_random_forest.predict(X_test)
In [48]:
print(classification_report(y_test, orf_predict))
In [49]:
confusion_matrix(y_test, orf_predict)
Out[49]:
In [52]:
sum(y_test != orf_predict)
Out[52]:
In [64]:
%%time
optimized_random_forest = RandomForestClassifier(max_depth=60, n_estimators=160, n_jobs=4).fit(X_data, y_data)
In [65]:
test_data = pd.read_csv("data/test_img_data.csv")
In [68]:
%%time
predict = optimized_random_forest.predict_log_proba(test_data.ix[:,1:])
In [69]:
submission = pd.DataFrame(np.hstack([np.array(test_data.ix[:,0])[:,np.newaxis], predict]), columns=["img"]+["c"+str(i) for i in range(10)])
In [70]:
submission.to_csv("submission/random_forest_log_submission.csv", index=False)
In [ ]:
# Kaggle log-loss for this random-forest submission: 2.30259 (= ln 10) — no
# better than a uniform guess, because log-probabilities were submitted
# instead of probabilities (see the predict cell above).
In [7]:
data
Out[7]:
In [ ]: