Topic: Challenge Set 7
Subject: Classification
Date: 02/13/2017
Name: Prashant Tatineni
In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
%matplotlib inline
Challenge 1
In [16]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data', header=None)
In [4]:
df.head()
Out[4]:
In [6]:
df.shape
Out[6]:
In [17]:
df = df.replace('y',1)
df = df.replace('n',0)
In [18]:
# Impute '?' with the mean of the remaining 0/1 votes in each column
for i in range(1,17):
    df[i] = df[i].replace('?', df[i].replace('?', np.nan).mean())
Challenge 2
In [19]:
y = df[0]
X = df[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]]
In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=4444)
Challenge 3
In [21]:
accuracy = []
for i in range(1,21):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    y_test_predicted = knn.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_test_predicted))
In [39]:
max(accuracy)
Out[39]:
In [40]:
# k value giving highest accuracy
accuracy.index(max(accuracy)) + 1
Out[40]:
Challenge 4
In [11]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_test_predicted = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_test_predicted)
lr_accuracy
Out[11]:
Challenge 5
In [22]:
(y.groupby(y).count()).plot.bar()
Out[22]:
In [14]:
y_predicted = ['democrat']*len(X)
dem_accuracy = accuracy_score(y, y_predicted)
dem_accuracy
Out[14]:
In [15]:
y_predicted = ['republican']*len(X)
rep_accuracy = accuracy_score(y, y_predicted)
rep_accuracy
Out[15]:
Challenge 6
In [34]:
k = range(1,21)
plt.scatter(k, accuracy, color='k', label='KNN')
plt.scatter(k, [lr_accuracy]*20, color='g', label='Logistic Regression')
plt.scatter(k, [dem_accuracy]*20, color='b', label='Predict Democrat')
plt.scatter(k, [rep_accuracy]*20, color='r', label='Predict Republican')
plt.legend(loc=4)
Out[34]:
Challenge 7
In [17]:
m, train_err, test_err = learning_curve(LogisticRegression(), X, y, cv=10)
In [18]:
m
Out[18]:
In [19]:
# learning_curve returns accuracy scores; convert to error rates to match the plot labels below
train_cv_err = 1 - np.mean(train_err, axis=1)
test_cv_err = 1 - np.mean(test_err, axis=1)
In [25]:
plt.scatter(m, train_cv_err, color='b', label='Training Error')
plt.scatter(m, test_cv_err, color='g', label='Test Error')
plt.title('Learning Curve for Logistic Regression')
plt.legend()
Out[25]:
In [23]:
m, train_err, test_err = learning_curve(KNeighborsClassifier(n_neighbors=6), X, y, cv=10)
train_cv_err = 1 - np.mean(train_err, axis=1)
test_cv_err = 1 - np.mean(test_err, axis=1)
In [24]:
plt.scatter(m, train_cv_err, color='b', label='Training Error')
plt.scatter(m, test_cv_err, color='g', label='Test Error')
plt.title('Learning Curve for KNN')
plt.legend()
Out[24]:
Challenge 8
Gaussian Naive Bayes
In [48]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_test_predicted = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_test_predicted)
nb_accuracy
Out[48]:
Support Vector Machine
In [49]:
svm = SVC()
svm.fit(X_train, y_train)
y_test_predicted = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_test_predicted)
svm_accuracy
Out[49]:
Decision Tree
In [50]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)
y_test_predicted = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_test_predicted)
dt_accuracy
Out[50]:
Random Forest
In [52]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_test_predicted = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_test_predicted)
rf_accuracy
Out[52]:
Challenge 9
In [66]:
np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=6), X, y, cv=10))
Out[66]:
In [68]:
np.mean(cross_val_score(LogisticRegression(), X, y, cv=10))
Out[68]:
In [71]:
np.mean(cross_val_score(GaussianNB(), X, y, cv=10))
Out[71]:
In [72]:
np.mean(cross_val_score(SVC(), X, y, cv=10))
Out[72]:
In [73]:
np.mean(cross_val_score(DecisionTreeClassifier(), X, y, cv=10))
Out[73]:
In [83]:
np.mean(cross_val_score(RandomForestClassifier(), X, y, cv=10))
Out[83]:
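The six comparisons above can also be run in one loop; the cell below is a sketch added for illustration (not an original cell), and the tree/forest scores will vary from run to run.
In [ ]:
# Sketch: repeat the 10-fold cross-validation comparison for all six models in one pass.
models = {
    'KNN (k=6)': KNeighborsClassifier(n_neighbors=6),
    'Logistic Regression': LogisticRegression(),
    'Gaussian NB': GaussianNB(),
    'SVM': SVC(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
}
pd.Series({name: np.mean(cross_val_score(model, X, y, cv=10))
           for name, model in models.items()}).sort_values(ascending=False)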
Challenge 10
In [119]:
vp = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data', header=None)
In [120]:
vp = vp.replace('y',1)
vp = vp.replace('n',0)
vp.head()
Out[120]:
Vote 16 (the last column) has the most '?'; let's try to predict that one.
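A quick count of the remaining '?' entries per column (a sketch, not an original cell) can verify that claim:
In [ ]:
# Sketch: count '?' votes per column; column 16 should show the largest count.
(vp == '?').sum()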
In [134]:
# Impute '?' with each column's most common vote
for i in range(1,17):
    vp[i] = vp[i].replace('?', vp[i].mode()[0])
In [135]:
vy = vp[16]
vX = vp[[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]]
In [136]:
np.mean(cross_val_score(LogisticRegression(), vX, vy, cv=10))
Out[136]:
Challenge 11
In [4]:
mv = pd.read_csv('2013_movies.csv')
mv.head()
Out[4]:
In [199]:
mv.shape
Out[199]:
In [5]:
ratings = mv.groupby('Rating')['Title'].count()
In [11]:
ratings.plot.bar()
Out[11]:
In [260]:
model_mv = mv[['Rating','Budget','DomesticTotalGross','Runtime']].dropna()
In [261]:
y = model_mv['Rating']
X = model_mv[['Budget','DomesticTotalGross','Runtime']]
KNN
In [298]:
accuracy = []
for k in range(1,50):
    accuracy.append(np.mean(cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=15)))
In [299]:
max(accuracy)
Out[299]:
In [300]:
# k value giving highest accuracy
accuracy.index(max(accuracy)) + 1
Out[300]:
Logistic Regression
In [297]:
np.mean(cross_val_score(LogisticRegression(), X, y, cv=15))
Out[297]:
In [296]:
np.mean(cross_val_score(LogisticRegression(multi_class='multinomial', solver='lbfgs'), X, y, cv=15))
Out[296]:
In [295]:
np.mean(cross_val_score(LogisticRegression(multi_class='multinomial', solver='newton-cg'), X, y, cv=15))
Out[295]:
In [311]:
lr = LogisticRegression(multi_class='multinomial', solver='newton-cg')
lr.fit(X,y)
lr.coef_
Out[311]:
Baseline Predictor
In [307]:
pg13_predictor = pd.Series(['PG-13']*len(y))  # predict 'PG-13' for every movie in the modeling data
In [308]:
accuracy_score(y, pg13_predictor)
Out[308]:
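The baseline above assumes PG-13 is the most common rating; a quick look at the class counts (a sketch, not an original cell) confirms whether that holds in the modeling data:
In [ ]:
# Sketch: check the rating counts in the modeling data used above.
y.value_counts()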
Challenge 12
In [26]:
hab = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data', header=None)
In [27]:
hab.columns = ['age','yr','nodes','survived5']
hab.head()
Out[27]:
Age - all patients
In [318]:
hab.age.mean()
Out[318]:
In [319]:
hab.age.std()
Out[319]:
Age - patients who survived 5+ yrs
In [327]:
hab.groupby('survived5')['age'].mean()[1]
Out[327]:
In [328]:
hab.groupby('survived5')['age'].std()[1]
Out[328]:
Age - patients who survived < 5 yrs
In [329]:
hab.groupby('survived5')['age'].mean()[2]
Out[329]:
In [330]:
hab.groupby('survived5')['age'].std()[2]
Out[330]:
In [30]:
hab.age.plot.hist()
Out[30]:
In [31]:
hab.nodes.plot.hist()
Out[31]:
In [348]:
hab.yr.sort_values().head()
Out[348]:
In [349]:
hab.yr.sort_values(ascending=False).head()
Out[349]:
In [32]:
y = hab['survived5']
X = hab[['age','yr','nodes']]
In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=4444)
In [34]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_test_predicted = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_test_predicted)
lr_accuracy
Out[34]:
In [35]:
lr.coef_
Out[35]:
The coefficient magnitudes suggest that 'nodes' (the number of positive axillary nodes) has the strongest effect in reducing the predicted chance of surviving five years.
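To make that easier to read, the coefficients can be paired with their column names; this is a sketch, not an original cell.
In [ ]:
# Sketch: label each logistic regression coefficient with its feature name.
# The positive class here is survived5 == 2 (died within 5 years), so a
# positive coefficient lowers the predicted chance of 5-year survival.
pd.Series(lr.coef_[0], index=X.columns)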
In [36]:
m, train_err, test_err = learning_curve(LogisticRegression(), X, y)
In [37]:
m
Out[37]:
In [38]:
train_cv_err = 1 - np.mean(train_err, axis=1)
test_cv_err = 1 - np.mean(test_err, axis=1)
In [39]:
plt.scatter(m, train_cv_err, color='b', label='Training Error')
plt.scatter(m, test_cv_err, color='g', label='Test Error')
plt.title('Learning Curve for Logistic Regression on Haberman')
plt.legend()
Out[39]:
In [ ]: