Topic: Challenge Set 8
Subject: Classification Errors
Date: 02/13/2017
Name: Prashant Tatineni
In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Note: sklearn.cross_validation was removed in later scikit-learn releases;
# the same functions now live in sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
%matplotlib inline
In [13]:
# Column 0 is the party label (democrat/republican); columns 1-16 are the 16 votes ('y'/'n'/'?')
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data', header=None)
In [14]:
df = df.replace('y', 1)
df = df.replace('n', 0)
# Impute each '?' with the mean of that column's recorded votes
for i in range(1, 17):
    df[i] = df[i].replace('?', df[i].replace('?', np.nan).mean())
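The nested replace works but reads awkwardly; an equivalent formulation with pd.to_numeric and fillna is sketched below (a sketch only, not run in this notebook):

for i in range(1, 17):
    col = pd.to_numeric(df[i].replace('?', np.nan))   # '?' -> NaN, then force numeric dtype
    df[i] = col.fillna(col.mean())                    # fill NaN with the column mean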
In [15]:
y = df[0]                     # party label: 'democrat' or 'republican'
X = df[list(range(1, 17))]    # the 16 votes
In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=4444)
Challenge 1
KNN
In [22]:
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)
y_test_predicted = knn.predict(X_test)
accuracy_score(y_test, y_test_predicted)
Out[22]:
In [18]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y_test, y_test_predicted)
Out[18]:
Logistic Regression
In [26]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_score_lr = lr.decision_function(X_test)  # scores saved for the ROC curves in Challenge 2
y_test_predicted = lr.predict(X_test)
accuracy_score(y_test, y_test_predicted)
Out[26]:
In [27]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y_test, y_test_predicted)
Out[27]:
Guess Democrat
In [28]:
y_test_predicted = ['democrat'] * len(y_test)  # all-democrat baseline
accuracy_score(y_test, y_test_predicted)
Out[28]:
In [30]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y_test, y_test_predicted)
Out[30]:
Guess Republican
In [31]:
y_test_predicted = ['republican'] * len(y_test)  # all-republican baseline
accuracy_score(y_test, y_test_predicted)
Out[31]:
In [32]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y_test, y_test_predicted)
Out[32]:
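As an aside, scikit-learn's DummyClassifier builds these baseline guessers without hardcoding the test-set size; a minimal sketch, assuming the same train/test split as above:

from sklearn.dummy import DummyClassifier

# Majority-class baseline ('democrat' is the majority class overall in this dataset)
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
accuracy_score(y_test, dummy.predict(X_test))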
Gaussian Naive Bayes
In [35]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_test_predicted = nb.predict(X_test)
accuracy_score(y_test, y_test_predicted)
Out[35]:
In [36]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y_test, y_test_predicted)
Out[36]:
Support Vector Machine
In [37]:
svm = SVC()
svm.fit(X_train, y_train)
y_score_svm = svm.decision_function(X_test)  # scores saved for the ROC curves in Challenge 2
y_test_predicted = svm.predict(X_test)
accuracy_score(y_test, y_test_predicted)
Out[37]:
In [38]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y_test, y_test_predicted)
Out[38]:
Decision Tree
In [40]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_test_predicted = dt.predict(X_test)
accuracy_score(y_test, y_test_predicted)
Out[40]:
In [41]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y_test, y_test_predicted)
Out[41]:
Random Forest
In [46]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_test_predicted = rf.predict(X_test)
accuracy_score(y_test, y_test_predicted)
Out[46]:
In [47]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y_test, y_test_predicted)
Out[47]:
Challenge 2
ROC curves are plotted using .decision_function(X_test), where available, for the classifiers above; of those, only Logistic Regression and the SVC expose it.
In [87]:
# Treat 'republican' as the positive class throughout
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score_lr, pos_label='republican')
auc_lr = auc(fpr_lr, tpr_lr)
In [88]:
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_score_svm, pos_label='republican')
auc_svm = auc(fpr_svm, tpr_svm)
In [91]:
plt.plot(fpr_lr, tpr_lr, color='darkorange',
         lw=2, label='Logistic Regression (area = %0.3f)' % auc_lr)
plt.plot(fpr_svm, tpr_svm, color='green',
         lw=2, label='Support Vector Classifier (area = %0.3f)' % auc_svm)
plt.plot([0,1], [0,1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([-0.05,1.0])
plt.ylim([0.0,1.05])
plt.title('Receiver operating characteristic')
plt.legend(loc=4)
Out[91]:
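Classifiers without a decision_function (KNN, Gaussian NB, Decision Tree, Random Forest) can still get ROC curves via predict_proba; a minimal sketch for the random forest fitted in Challenge 1:

# predict_proba columns follow rf.classes_; take the 'republican' column as the score
proba_rf = rf.predict_proba(X_test)[:, list(rf.classes_).index('republican')]
fpr_rf, tpr_rf, _ = roc_curve(y_test, proba_rf, pos_label='republican')
auc(fpr_rf, tpr_rf)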
Challenge 3
KNN
In [117]:
y_predicted = cross_val_predict(KNeighborsClassifier(n_neighbors=6), X, y, cv=10)
accuracy_score(y, y_predicted)
Out[117]:
In [118]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y, y_predicted)
Out[118]:
Logistic Regression
In [119]:
y_predicted = cross_val_predict(LogisticRegression(), X, y, cv=10)
accuracy_score(y, y_predicted)
Out[119]:
In [120]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y, y_predicted)
Out[120]:
Gaussian NB
In [121]:
y_predicted = cross_val_predict(GaussianNB(), X, y, cv=10)
accuracy_score(y, y_predicted)
Out[121]:
In [122]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y, y_predicted)
Out[122]:
Support Vector
In [123]:
y_predicted = cross_val_predict(SVC(), X, y, cv=10)
accuracy_score(y, y_predicted)
Out[123]:
In [124]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y, y_predicted)
Out[124]:
Decision Tree
In [133]:
y_predicted = cross_val_predict(DecisionTreeClassifier(), X, y, cv=10)
accuracy_score(y, y_predicted)
Out[133]:
In [134]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y, y_predicted)
Out[134]:
Random Forest
In [136]:
y_predicted = cross_val_predict(RandomForestClassifier(), X, y, cv=10)
accuracy_score(y, y_predicted)
Out[136]:
In [137]:
# Precision, Recall, F1, Support ['democrat','republican']
precision_recall_fscore_support(y, y_predicted)
Out[137]:
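Note that cross_val_predict pools every out-of-fold prediction into a single score, which hides fold-to-fold variation; the cross_val_score imported above reports per-fold accuracy instead. A quick sketch for logistic regression:

scores = cross_val_score(LogisticRegression(), X, y, cv=10)
scores.mean(), scores.std()  # average accuracy and its spread across the 10 folds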
Challenge 4
In [145]:
mv = pd.read_csv('2013_movies.csv')
model_mv = mv[['Rating','Budget','DomesticTotalGross','Runtime']].dropna()
y = model_mv['Rating']
X = model_mv[['Budget','DomesticTotalGross','Runtime']]
model_mv.groupby('Rating')['Budget'].count()  # class counts per rating
Out[145]:
KNN
In [146]:
y_predicted = cross_val_predict(KNeighborsClassifier(n_neighbors=5), X, y, cv=10)
# Precision, Recall, F1, Support ['PG','PG-13','R']
precision_recall_fscore_support(y, y_predicted)
Out[146]:
Logistic Regression
In [147]:
y_predicted = cross_val_predict(LogisticRegression(), X, y, cv=10)
# Precision, Recall, F1, Support ['PG','PG-13','R']
precision_recall_fscore_support(y, y_predicted)
Out[147]:
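One caveat for the KNN run above: Budget and DomesticTotalGross are in dollars while Runtime is in minutes, so unscaled Euclidean distances are dominated by the dollar-valued features. A sketch of scaling inside the cross-validation, using scikit-learn's Pipeline and StandardScaler:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize features within each CV fold, then run KNN on the scaled values
knn_scaled = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
y_predicted = cross_val_predict(knn_scaled, X, y, cv=10)
precision_recall_fscore_support(y, y_predicted)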
Challenge 5
In [158]:
# Haberman survival data: survived5 is 1 (survived 5+ years) or 2 (died within 5 years)
hab = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data', header=None)
hab.columns = ['age','yr','nodes','survived5']
y = hab['survived5']
X = hab[['age','yr','nodes']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=4444)
In [159]:
lr = LogisticRegression()
y_score_lr = lr.fit(X_train, y_train).decision_function(X_test)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_score_lr, pos_label=2)  # positive class: died within 5 years
auc_lr = auc(fpr_lr, tpr_lr)
In [160]:
plt.plot(fpr_lr, tpr_lr, color='darkorange',
         lw=2, label='Logistic Regression (AUC = %0.2f)' % auc_lr)
plt.plot([0,1], [0,1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim([-0.05,1.0])
plt.ylim([0.0,1.05])
plt.title('Receiver operating characteristic')
plt.legend(loc=4)
Out[160]:
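As a cross-check, the roc_auc_score imported at the top (but unused above) returns the same area directly once the positive class is recoded; a sketch:

# Recode so that 2 (died within 5 years) maps to 1, the positive class
roc_auc_score((y_test == 2).astype(int), y_score_lr)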