In [8]:
    
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline
from sklearn.learning_curve import validation_curve
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
    
In [7]:
    
# Load the per-image feature matrix. The first CSV column is an index
# artifact from the export, so drop it before separating features/label.
data = pd.read_csv("data/driver_image.csv")
data = data.iloc[:, 1:]      # .ix was removed in pandas 1.0; use positional .iloc
X_data = data.iloc[:, :-1]   # every column except the last is a pixel feature
y_data = data.iloc[:, -1]    # last column is the class label
# Fixed random_state so the split (and all downstream scores) are reproducible
# under Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.25, random_state=0)
    
In [33]:
    
from sklearn.naive_bayes import GaussianNB
    
In [34]:
    
%%time
# Baseline model: Gaussian Naive Bayes fitted on the training split.
gaussian_nb = GaussianNB().fit(X_train, y_train)
    
    
In [35]:
    
%%time
# Class predictions for the held-out split (evaluated in the next cells).
gaussian_predict = gaussian_nb.predict(X_test)
    
    
In [36]:
    
# Per-class precision / recall / F1 for the Naive Bayes predictions.
nb_report = classification_report(y_test, gaussian_predict)
print(nb_report)
    
    
In [17]:
    
# Rows = true class, columns = predicted class (sklearn convention).
confusion_matrix(y_test, gaussian_predict)
    
    Out[17]:
In [56]:
    
# Total number of misclassified test samples for the Naive Bayes model.
sum(y_test != gaussian_predict)
    
    Out[56]:
In [58]:
    
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    
In [59]:
    
%%time
# Quadratic Discriminant Analysis: one full covariance matrix per class.
qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
    
    
    
In [60]:
    
%%time
# QDA predictions on the held-out split.
qda_predict = qda.predict(X_test)
    
    
In [61]:
    
# Per-class precision / recall / F1 for the QDA predictions.
qda_report = classification_report(y_test, qda_predict)
print(qda_report)
    
    
In [62]:
    
# Rows = true class, columns = predicted class.
confusion_matrix(y_test, qda_predict)
    
    Out[62]:
In [63]:
    
# Total number of misclassified test samples for QDA.
sum(y_test != qda_predict)
    
    Out[63]:
In [4]:
    
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    
In [5]:
    
%%time
# Linear Discriminant Analysis classifier.
# NOTE(review): n_components only affects transform()/dimensionality
# reduction, not predict() — confirm it is intentional here.
lda = LinearDiscriminantAnalysis(n_components=3).fit(X_train, y_train)
    
    
In [19]:
    
%%time
# LDA predictions on the held-out split.
lda_predict = lda.predict(X_test)
    
    
In [20]:
    
# Per-class precision / recall / F1 for the LDA predictions.
lda_report = classification_report(y_test, lda_predict)
print(lda_report)
    
    
In [21]:
    
# Rows = true class, columns = predicted class.
confusion_matrix(y_test, lda_predict)
    
    Out[21]:
In [55]:
    
# Total number of misclassified test samples for LDA.
sum(y_test != lda_predict)
    
    
In [9]:
    
from sklearn.tree import DecisionTreeClassifier
    
In [10]:
    
# Depth grid for the decision-tree validation curve: 1, 21, 41, 61, 81, 101.
max_depth_range = np.arange(1,102, 20)
    
In [11]:
    
%%time
train_score, test_score = validation_curve(DecisionTreeClassifier(), data.ix[:,:-1], data.ix[:,-1],
                          param_name = "max_depth", param_range=max_depth_range,scoring="accuracy", n_jobs=4)
    
    
In [12]:
    
# Summarize CV scores across folds (mean ± std per max_depth value).
train_scores_mean = np.mean(train_score, axis=1)
train_scores_std = np.std(train_score, axis=1)
test_scores_mean = np.mean(test_score, axis=1)
test_scores_std = np.std(test_score, axis=1)
plt.title("Validation Curve with Tree")
plt.xlabel("max_depth")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
# Training score with a ±1 std band; log x-axis mirrors the sklearn example.
plt.semilogx(max_depth_range, train_scores_mean, label="Training score", color="r")
plt.fill_between(max_depth_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color="r")
# Cross-validation score with its own ±1 std band.
plt.semilogx(max_depth_range, test_scores_mean, label="Cross-validation score", color="g")
plt.fill_between(max_depth_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()
    
    
In [37]:
    
%%time
# Fit a single decision tree at the depth chosen from the validation curve.
tree = DecisionTreeClassifier(max_depth=60).fit(X_train, y_train)
    
    
In [38]:
    
%%time
# Gini-tree predictions on the held-out split.
tree_predict = tree.predict(X_test)
    
    
In [39]:
    
# Per-class precision / recall / F1 for the Gini decision tree.
tree_report = classification_report(y_test, tree_predict)
print(tree_report)
    
    
In [40]:
    
# Rows = true class, columns = predicted class.
confusion_matrix(y_test, tree_predict)
    
    Out[40]:
In [54]:
    
# Total number of misclassified test samples for the Gini tree.
sum(y_test != tree_predict)
    
    Out[54]:
In [45]:
    
%%time
# Same depth as the Gini tree, but splitting on information gain (entropy).
tree_e = DecisionTreeClassifier(criterion="entropy", max_depth=60).fit(X_train, y_train)
    
    
In [42]:
    
%%time
tree_e_predict = tree.predict(X_test)
    
    
In [43]:
    
# Per-class precision / recall / F1 for the entropy decision tree.
tree_e_report = classification_report(y_test, tree_e_predict)
print(tree_e_report)
    
    
In [44]:
    
# Rows = true class, columns = predicted class.
confusion_matrix(y_test, tree_e_predict)
    
    Out[44]:
In [53]:
    
# Total number of misclassified test samples for the entropy tree.
sum(y_test != tree_e_predict)
    
    Out[53]:
In [27]:
    
from io import StringIO
import pydot  # `from pydot import pydot` fails: the package exposes no `pydot` attribute
from sklearn.tree import export_graphviz
from IPython.display import Image

# Render the fitted Gini tree as a PNG via graphviz/pydot.
dot_buf = StringIO()
export_graphviz(tree, out_file=dot_buf,
                feature_names=["X" + str(i) for i in range(5600)])
# pydot.graph_from_dot_data() returns a *list* of graphs (pydot >= 1.2);
# unpack the single graph produced by export_graphviz.
(graph,) = pydot.graph_from_dot_data(dot_buf.getvalue())
# create_png() returns raw bytes — pass them straight to Image. The old code
# wrote bytes into a text-mode StringIO, which raises TypeError on Python 3.
Image(graph.create_png())
    
    
    
In [7]:
    
# Depth grid for the random-forest validation curve: every depth 1..100.
# NOTE(review): 100 depths x CV folds over the full data is expensive —
# consider a coarser grid.
max_depth_range = np.arange(1,101)
    
In [8]:
    
%%time
train_score, test_score = validation_curve(RandomForestClassifier(), data.ix[:,:-1], data.ix[:,-1],
                          param_name = "max_depth", param_range=max_depth_range,scoring="accuracy", n_jobs=4)
    
    
In [12]:
    
# Summarize CV scores across folds (mean ± std per max_depth value).
train_scores_mean = np.mean(train_score, axis=1)
train_scores_std = np.std(train_score, axis=1)
test_scores_mean = np.mean(test_score, axis=1)
test_scores_std = np.std(test_score, axis=1)
plt.title("Validation Curve with RanFo")
plt.xlabel("max_depth")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
# Training score with a ±1 std band; log x-axis mirrors the sklearn example.
plt.semilogx(max_depth_range, train_scores_mean, label="Training score", color="r")
plt.fill_between(max_depth_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color="r")
# Cross-validation score with its own ±1 std band.
plt.semilogx(max_depth_range, test_scores_mean, label="Cross-validation score", color="g")
plt.fill_between(max_depth_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()
    
    
In [4]:
    
# Forest-size grid for the second validation curve: 1, 11, 21, ..., 191.
n_estimators_range = np.arange(1,201,10)
    
In [5]:
    
%%time
train_score2, test_score2 = validation_curve(RandomForestClassifier(), data.ix[:,:-1], data.ix[:,-1],
                          param_name = "n_estimators", param_range=n_estimators_range, scoring="accuracy", n_jobs=4)
    
    
In [6]:
    
# Summarize CV scores across folds (mean ± std per n_estimators value).
train_scores_mean = np.mean(train_score2, axis=1)
train_scores_std = np.std(train_score2, axis=1)
test_scores_mean = np.mean(test_score2, axis=1)
test_scores_std = np.std(test_score2, axis=1)
plt.title("Validation Curve with RanFo")
# BUG fix: axis was mislabelled "max_depth" (copy-paste from the previous
# plot); this sweep varies n_estimators.
plt.xlabel("n_estimators")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.semilogx(n_estimators_range, train_scores_mean, label="Training score", color="r")
plt.fill_between(n_estimators_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2, color="r")
plt.semilogx(n_estimators_range, test_scores_mean, label="Cross-validation score", color="g")
plt.fill_between(n_estimators_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2, color="g")
plt.legend(loc="best")
plt.show()
    
    
In [50]:
    
%%time
# Final forest at the hyperparameters chosen from the two validation curves.
optimized_random_forest = RandomForestClassifier(max_depth=60, n_estimators=160).fit(X_train, y_train)
    
    
In [51]:
    
%%time
# Tuned-forest predictions on the held-out split.
orf_predict = optimized_random_forest.predict(X_test)
    
    
In [48]:
    
# Per-class precision / recall / F1 for the tuned random forest.
orf_report = classification_report(y_test, orf_predict)
print(orf_report)
    
    
In [49]:
    
# Rows = true class, columns = predicted class.
confusion_matrix(y_test, orf_predict)
    
    Out[49]:
In [52]:
    
# Total number of misclassified test samples for the tuned forest.
sum(y_test != orf_predict)
    
    Out[52]:
In [64]:
    
%%time
# Refit the tuned forest on ALL labelled data before scoring the Kaggle
# test set (no held-out evaluation after this point).
optimized_random_forest = RandomForestClassifier(max_depth=60, n_estimators=160, n_jobs=4).fit(X_data, y_data)
    
    
In [65]:
    
# Kaggle test images; column 0 is the image filename, the rest are features.
test_data = pd.read_csv("data/test_img_data.csv")
    
In [68]:
    
%%time
predict = optimized_random_forest.predict_log_proba(test_data.ix[:,1:])
    
    
In [69]:
    
# Submission frame: image id column followed by the 10 per-class
# log-probability columns. .ix was removed in pandas 1.0 — use .iloc.
submission = pd.DataFrame(
    np.hstack([np.array(test_data.iloc[:, 0])[:, np.newaxis], predict]),
    columns=["img"] + ["c" + str(i) for i in range(10)])
    
In [70]:
    
# index=False keeps the pandas row index out of the Kaggle upload file.
submission.to_csv("submission/random_forest_log_submission.csv", index=False)
    
In [ ]:
    
# Kaggle leaderboard score for the random-forest log-probability submission: 2.30259
    
In [7]:
    
# Preview only the first rows — displaying the full frame floods the
# notebook output and bloats the saved file.
data.head()
    
    Out[7]:
In [ ]: