In [6]:
# import common APIs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn import model_selection, naive_bayes, tree, svm, ensemble
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve
from xgboost import XGBClassifier
In [2]:
# Data observation
filepath = '/Users/mac/Desktop/Kaggle_datasets/Mushroom_classification/'
filename01 = 'mushrooms.csv'
df_full = pd.read_csv(os.path.join(filepath, filename01))
df_full.head()
Out[2]:
In [3]:
df_full.info() # no numeric columns at all; every feature is categorical
In [4]:
df_full['class'].value_counts()
Out[4]:
In [5]:
df_full.columns
Out[5]:
In [12]:
df_encode = df_full.apply(LabelEncoder().fit_transform)
df_encode.head() # after encoding: 1 = p (poisonous), 0 = e (edible)
Out[12]:
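Note: df_full.apply(LabelEncoder().fit_transform) refits a single throwaway encoder column by column, so the letter-to-integer mappings are not kept. A minimal sketch (not in the original notebook; df_encode_alt is a hypothetical name) that stores one encoder per column so the codes can be inverted later:
encoders = {}
df_encode_alt = df_full.copy()
for col in df_full.columns:
    le = LabelEncoder()
    df_encode_alt[col] = le.fit_transform(df_full[col])
    encoders[col] = le
# e.g. encoders['class'].inverse_transform([0, 1]) recovers the original letters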
In [13]:
sns.countplot(x='class', data=df_encode)
Out[13]:
In [25]:
sns.jointplot(x="cap-shape", y="class", data=df_encode)
Out[25]:
In [15]:
sns.barplot(x="cap-shape", y="cap-color", hue="class", data=df_encode)
Out[15]:
In [16]:
sns.pointplot(x="cap-shape", y="cap-color", hue="class", data=df_encode)
Out[16]:
In [19]:
plt.figure(figsize=(20,5))
sns.boxplot(data=df_encode, orient="v");
In [22]:
g = sns.PairGrid(df_encode,
                 x_vars=['cap-shape', 'cap-surface', 'cap-color'],
                 y_vars=['bruises', 'odor'],
                 aspect=1, size=3.5)
g.map(sns.barplot, palette="pastel");
In [23]:
sns.lmplot(x="cap-color", y="class", data=df_encode, x_jitter=.05); # jitter keeps the points from overlapping
In [51]:
fig, ([axis1,axis2],[axis3,axis4]) = plt.subplots(2,2,figsize=(10,10))
sns.countplot(x='cap-shape', data=df_encode, ax=axis1)
sns.countplot(x='class', data=df_encode, ax=axis2)
sns.stripplot(x='class', y='cap-shape', data=df_encode, ax=axis3, jitter=True)
sns.barplot(x='cap-shape', y='class', data=df_encode, ax=axis4)
plt.show()
In [52]:
fig, ([axis1,axis2],[axis3,axis4]) = plt.subplots(2,2,figsize=(10,10))
sns.countplot(x='cap-color', data=df_encode, ax=axis1)
sns.countplot(x='class', data=df_encode, ax=axis2)
sns.stripplot(x='class', y='cap-color', data=df_encode, ax=axis3, jitter=True)
sns.barplot(x='cap-color', y='class', data=df_encode, ax=axis4)
plt.show()
In [10]:
df_dum = pd.get_dummies(df_full)
df_dum.head()
Out[10]:
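get_dummies avoids the artificial ordering that LabelEncoder imposes on nominal categories, so the correlations below are over 0/1 indicator columns. A quick sketch (assuming class_e is the edible indicator, as the heatmap cell below does) of the strongest single indicators:
corr_with_edible = df_dum.corr()['class_e'].drop(['class_e', 'class_p'])
print(corr_with_edible.abs().sort_values(ascending=False).head(5))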
In [11]:
k = 10 #number of variables for heatmap
corrmat = df_dum.corr()
cols = corrmat.nlargest(k, 'class_e')['class_e'].index
cm = np.corrcoef(df_dum[cols].values.T)
plt.figure(figsize=(15,15)) # the figure size can be adjusted here
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values, cmap='rainbow')
#hm.xaxis.set_ticks_position('top')
plt.show()
In [12]:
# Data preprocessing
from sklearn.utils import shuffle
shuffle_df = shuffle(df_encode, random_state=42)
df_label = shuffle_df['class']
df_feature = shuffle_df.drop('class', axis=1)
cut_point = round(len(df_encode)*0.6)
train_feature = np.array(df_feature.values[:cut_point,:])
train_label = np.array(df_label.values[:cut_point])
test_feature = np.array(df_feature.values[cut_point:,:])
test_label = np.array(df_label.values[cut_point:])
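For reference, scikit-learn can produce an equivalent 60/40 split directly, with optional stratification; a sketch with hypothetical names (the manual slicing above is what the model cells actually use):
from sklearn.model_selection import train_test_split
tr_f, te_f, tr_l, te_l = train_test_split(df_feature.values, df_label.values,
                                          test_size=0.4, random_state=42,
                                          stratify=df_label.values)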
In [53]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X=scaler.fit_transform(df_feature.values)
X
Out[53]:
In [54]:
pca = PCA()
pca.fit_transform(X)
Out[54]:
In [66]:
covariance=pca.get_covariance()
#pd.DataFrame(covariance)
In [56]:
explained_variance= pca.explained_variance_
explained_variance
Out[56]:
In [65]:
plt.figure(figsize=(8,6))
plt.bar(range(len(explained_variance)), explained_variance, alpha=0.5, align='center', label='individual explained variance')
plt.ylabel('Explained variance')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.tight_layout()
plt.grid(False)
plt.show()
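A cumulative view of the same information makes it easier to choose n_components; a short sketch using the ratio form:
cum_ratio = np.cumsum(pca.explained_variance_ratio_)
n_90 = int(np.argmax(cum_ratio >= 0.90)) + 1  # components covering ~90% of the variance
print(n_90, cum_ratio[n_90 - 1])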
In [69]:
pca = PCA(n_components=2)
x = pca.fit_transform(X)
plt.figure(figsize = (8,8))
plt.scatter(x[:,0],x[:,1], alpha=0.5)
plt.show()
In [72]:
pca = PCA(n_components=3)
x = pca.fit_transform(X)
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x[:,0],x[:,1],x[:,2], alpha=0.5)
plt.show()
In [71]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=5)
X_clustered = kmeans.fit_predict(X)
LABEL_COLOR_MAP = {0: 'g',
                   1: 'y'}
label_color = [LABEL_COLOR_MAP[l] for l in X_clustered]
plt.figure(figsize=(8,8))
plt.scatter(x[:,0], x[:,1], c=label_color)
plt.show()
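Since cluster ids are arbitrary, a permutation-invariant score is the safer way to check how well the two clusters track the true edible/poisonous labels; a sketch:
from sklearn.metrics import adjusted_rand_score
print('ARI vs. true class:', adjusted_rand_score(df_label.values, X_clustered))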
In [81]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=5)
X_clustered = kmeans.fit_predict(X)
LABEL_COLOR_MAP = {0: 'g',
                   1: 'y',
                   2: 'r'}
label_color = [LABEL_COLOR_MAP[l] for l in X_clustered]
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x[:,0], x[:,1], x[:,2], c=label_color, alpha=0.5)
plt.show()
In [13]:
### naive_bayes.BernoulliNB()
from sklearn import model_selection, naive_bayes
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                    test_size=0.25, random_state=0, stratify=train_label)
clf = naive_bayes.BernoulliNB()
clf.fit(X_train, y_train)
print("Training Score: %f" % clf.score(train_feature, train_label))
print("Testing Score: %f" % clf.score(test_feature, test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
In [14]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # putting the x labels on top is common in textbooks
plt.show()
# model_efficacy
def model_efficacy(conf):
    # conf[i][j]: true label i, predicted label j (sklearn convention)
    total_num = np.sum(conf)
    sen = conf[0][0] / (conf[0][0] + conf[1][0])
    spe = conf[1][1] / (conf[1][0] + conf[1][1])
    false_positive_rate = conf[0][1] / (conf[0][1] + conf[1][1])
    false_negative_rate = conf[1][0] / (conf[0][0] + conf[1][0])
    print('total_num: ', total_num)
    print('G1P1: ', conf[0][0])  # G = gold standard; P = prediction
    print('G0P1: ', conf[0][1])
    print('G1P0: ', conf[1][0])
    print('G0P0: ', conf[1][1])
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
conf = confusion_matrix(label2_list, prediction2_list)
model_efficacy(conf)
# ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(label2_list, prediction2_list)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
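One caveat that applies to every ROC block in this notebook: roc_curve is fed hard 0/1 predictions, so the "curve" is a single operating point joined to the corners. A sketch of a full curve from predicted probabilities (BernoulliNB supports predict_proba):
y_score = clf.predict_proba(test_feature)[:, 1]
fpr, tpr, _ = roc_curve(label2_list, y_score)
print('probability-based AUC:', auc(fpr, tpr))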
In [15]:
### naive_bayes.GaussianNB()
from sklearn import model_selection, naive_bayes
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                    test_size=0.25, random_state=0, stratify=train_label)
clf = naive_bayes.GaussianNB()
clf.fit(X_train, y_train)
print("Training Score: %f" % clf.score(train_feature, train_label))
print("Testing Score: %f" % clf.score(test_feature, test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
In [16]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # putting the x labels on top is common in textbooks
plt.show()
# model_efficacy
def model_efficacy(conf):
    # conf[i][j]: true label i, predicted label j (sklearn convention)
    total_num = np.sum(conf)
    sen = conf[0][0] / (conf[0][0] + conf[1][0])
    spe = conf[1][1] / (conf[1][0] + conf[1][1])
    false_positive_rate = conf[0][1] / (conf[0][1] + conf[1][1])
    false_negative_rate = conf[1][0] / (conf[0][0] + conf[1][0])
    print('total_num: ', total_num)
    print('G1P1: ', conf[0][0])  # G = gold standard; P = prediction
    print('G0P1: ', conf[0][1])
    print('G1P0: ', conf[1][0])
    print('G0P0: ', conf[1][1])
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
conf = confusion_matrix(label2_list, prediction2_list)
model_efficacy(conf)
# ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(label2_list, prediction2_list)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [17]:
### tree.DecisionTreeClassifier()
from sklearn import model_selection, tree
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                    test_size=0.25, random_state=0, stratify=train_label)
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
print("Training Score: %f" % clf.score(train_feature, train_label))
print("Testing Score: %f" % clf.score(test_feature, test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
In [18]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # putting the x labels on top is common in textbooks
plt.show()
# model_efficacy
def model_efficacy(conf):
    # conf[i][j]: true label i, predicted label j (sklearn convention)
    total_num = np.sum(conf)
    sen = conf[0][0] / (conf[0][0] + conf[1][0])
    spe = conf[1][1] / (conf[1][0] + conf[1][1])
    false_positive_rate = conf[0][1] / (conf[0][1] + conf[1][1])
    false_negative_rate = conf[1][0] / (conf[0][0] + conf[1][0])
    print('total_num: ', total_num)
    print('G1P1: ', conf[0][0])  # G = gold standard; P = prediction
    print('G0P1: ', conf[0][1])
    print('G1P0: ', conf[1][0])
    print('G0P0: ', conf[1][1])
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
conf = confusion_matrix(label2_list, prediction2_list)
model_efficacy(conf)
# ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(label2_list, prediction2_list)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
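The fitted tree also exposes feature_importances_; pairing them with the column names shows which attributes drive the splits (a short sketch, not in the original):
importances = pd.Series(clf.feature_importances_, index=df_feature.columns)
print(importances.sort_values(ascending=False).head(5))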
In [19]:
### svm.LinearSVC()
from sklearn import model_selection, svm
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                    test_size=0.25, random_state=0, stratify=train_label)
clf = svm.LinearSVC()
clf.fit(X_train, y_train)
print("Training Score: %f" % clf.score(train_feature, train_label))
print("Testing Score: %f" % clf.score(test_feature, test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
In [20]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # putting the x labels on top is common in textbooks
plt.show()
# model_efficacy
def model_efficacy(conf):
    # conf[i][j]: true label i, predicted label j (sklearn convention)
    total_num = np.sum(conf)
    sen = conf[0][0] / (conf[0][0] + conf[1][0])
    spe = conf[1][1] / (conf[1][0] + conf[1][1])
    false_positive_rate = conf[0][1] / (conf[0][1] + conf[1][1])
    false_negative_rate = conf[1][0] / (conf[0][0] + conf[1][0])
    print('total_num: ', total_num)
    print('G1P1: ', conf[0][0])  # G = gold standard; P = prediction
    print('G0P1: ', conf[0][1])
    print('G1P0: ', conf[1][0])
    print('G0P0: ', conf[1][1])
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
conf = confusion_matrix(label2_list, prediction2_list)
model_efficacy(conf)
# ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(label2_list, prediction2_list)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
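LinearSVC has no predict_proba, so a score-based ROC for this model would use decision_function instead; a minimal sketch:
scores = clf.decision_function(test_feature)
fpr, tpr, _ = roc_curve(label2_list, scores)
print('decision_function AUC:', auc(fpr, tpr))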
In [21]:
### svm.SVC()
from sklearn import model_selection, svm
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                    test_size=0.25, random_state=0, stratify=train_label)
clf = svm.SVC()
clf.fit(X_train, y_train)
print("Training Score: %f" % clf.score(train_feature, train_label))
print("Testing Score: %f" % clf.score(test_feature, test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
In [22]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # putting the x labels on top is common in textbooks
plt.show()
# model_efficacy
def model_efficacy(conf):
    # conf[i][j]: true label i, predicted label j (sklearn convention)
    total_num = np.sum(conf)
    sen = conf[0][0] / (conf[0][0] + conf[1][0])
    spe = conf[1][1] / (conf[1][0] + conf[1][1])
    false_positive_rate = conf[0][1] / (conf[0][1] + conf[1][1])
    false_negative_rate = conf[1][0] / (conf[0][0] + conf[1][0])
    print('total_num: ', total_num)
    print('G1P1: ', conf[0][0])  # G = gold standard; P = prediction
    print('G0P1: ', conf[0][1])
    print('G1P0: ', conf[1][0])
    print('G0P0: ', conf[1][1])
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
conf = confusion_matrix(label2_list, prediction2_list)
model_efficacy(conf)
# ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(label2_list, prediction2_list)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [23]:
### ensemble.AdaBoostClassifier()
from sklearn import model_selection, ensemble
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                    test_size=0.25, random_state=0, stratify=train_label)  # stratified sampling
clf = ensemble.AdaBoostClassifier()
clf.fit(X_train, y_train)
print("Training Score: %f" % clf.score(train_feature, train_label))
print("Testing Score: %f" % clf.score(test_feature, test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
In [24]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # putting the x labels on top is common in textbooks
plt.show()
# model_efficacy
def model_efficacy(conf):
    # conf[i][j]: true label i, predicted label j (sklearn convention)
    total_num = np.sum(conf)
    sen = conf[0][0] / (conf[0][0] + conf[1][0])
    spe = conf[1][1] / (conf[1][0] + conf[1][1])
    false_positive_rate = conf[0][1] / (conf[0][1] + conf[1][1])
    false_negative_rate = conf[1][0] / (conf[0][0] + conf[1][0])
    print('total_num: ', total_num)
    print('G1P1: ', conf[0][0])  # G = gold standard; P = prediction
    print('G0P1: ', conf[0][1])
    print('G1P0: ', conf[1][0])
    print('G0P0: ', conf[1][1])
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
conf = confusion_matrix(label2_list, prediction2_list)
model_efficacy(conf)
# ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(label2_list, prediction2_list)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [25]:
### ensemble.GradientBoostingClassifier()
from sklearn import model_selection, ensemble
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                    test_size=0.25, random_state=0, stratify=train_label)  # stratified sampling
clf = ensemble.GradientBoostingClassifier()
clf.fit(X_train, y_train)
print("Training Score: %f" % clf.score(train_feature, train_label))
print("Testing Score: %f" % clf.score(test_feature, test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
In [26]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # putting the x labels on top is common in textbooks
plt.show()
# model_efficacy
def model_efficacy(conf):
    # conf[i][j]: true label i, predicted label j (sklearn convention)
    total_num = np.sum(conf)
    sen = conf[0][0] / (conf[0][0] + conf[1][0])
    spe = conf[1][1] / (conf[1][0] + conf[1][1])
    false_positive_rate = conf[0][1] / (conf[0][1] + conf[1][1])
    false_negative_rate = conf[1][0] / (conf[0][0] + conf[1][0])
    print('total_num: ', total_num)
    print('G1P1: ', conf[0][0])  # G = gold standard; P = prediction
    print('G0P1: ', conf[0][1])
    print('G1P0: ', conf[1][0])
    print('G0P0: ', conf[1][1])
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
conf = confusion_matrix(label2_list, prediction2_list)
model_efficacy(conf)
# ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(label2_list, prediction2_list)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [27]:
### ensemble.RandomForestClassifier()
from sklearn import model_selection, ensemble
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                    test_size=0.25, random_state=0, stratify=train_label)  # stratified sampling
clf = ensemble.RandomForestClassifier()
clf.fit(X_train, y_train)
print("Training Score: %f" % clf.score(train_feature, train_label))
print("Testing Score: %f" % clf.score(test_feature, test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
In [81]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # putting the x labels on top is common in textbooks
plt.show()
# model_efficacy
def model_efficacy(conf):
    # conf[i][j]: true label i, predicted label j (sklearn convention)
    total_num = np.sum(conf)
    sen = conf[0][0] / (conf[0][0] + conf[1][0])
    spe = conf[1][1] / (conf[1][0] + conf[1][1])
    false_positive_rate = conf[0][1] / (conf[0][1] + conf[1][1])
    false_negative_rate = conf[1][0] / (conf[0][0] + conf[1][0])
    print('total_num: ', total_num)
    print('G1P1: ', conf[0][0])  # G = gold standard; P = prediction
    print('G0P1: ', conf[0][1])
    print('G1P0: ', conf[1][0])
    print('G0P0: ', conf[1][1])
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
conf = confusion_matrix(label2_list, prediction2_list)
model_efficacy(conf)
# ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(label2_list, prediction2_list)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [28]:
# XGBClassifier()
from xgboost import XGBClassifier
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_feature, train_label,
                                                                    test_size=0.25, random_state=0, stratify=train_label)  # stratified sampling
clf = XGBClassifier()
clf.fit(X_train, y_train)
print("Training Score: %f" % clf.score(train_feature, train_label))
print("Testing Score: %f" % clf.score(test_feature, test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
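xgboost also ships a built-in importance plot; a sketch using the fitted classifier (plot_importance accepts a fitted XGBModel):
from xgboost import plot_importance
fig, ax = plt.subplots(figsize=(8, 6))
plot_importance(clf, ax=ax)
plt.show()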
In [29]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # putting the x labels on top is common in textbooks
plt.show()
# model_efficacy
def model_efficacy(conf):
    # conf[i][j]: true label i, predicted label j (sklearn convention)
    total_num = np.sum(conf)
    sen = conf[0][0] / (conf[0][0] + conf[1][0])
    spe = conf[1][1] / (conf[1][0] + conf[1][1])
    false_positive_rate = conf[0][1] / (conf[0][1] + conf[1][1])
    false_negative_rate = conf[1][0] / (conf[0][0] + conf[1][0])
    print('total_num: ', total_num)
    print('G1P1: ', conf[0][0])  # G = gold standard; P = prediction
    print('G0P1: ', conf[0][1])
    print('G1P0: ', conf[1][0])
    print('G0P0: ', conf[1][1])
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
conf = confusion_matrix(label2_list, prediction2_list)
model_efficacy(conf)
# ROC curve
false_positive_rate, true_positive_rate, thresholds = roc_curve(label2_list, prediction2_list)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()