In [2]:
# import common APIs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split  # cross_validation was removed from sklearn; model_selection replaces it
from sklearn import naive_bayes, tree, svm, ensemble
from sklearn.metrics import classification_report,confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve
from xgboost import XGBClassifier
In [5]:
# Data observation
filepath = '/Users/mac/Desktop/Kaggle_datasets/Glass_classification/'
filename01 = 'glass.csv'
df = pd.read_csv(os.path.join(filepath, filename01))
df.head()
Out[5]:
In [7]:
df.info()
In [6]:
df.Type.value_counts() # note: there is no Type 4 at all
Out[6]:
In [13]:
sns.countplot(x="Type", data=df, palette="Greens_d");
In [15]:
sns.barplot(x="Type", y="RI", data=df); #預設取y的平均值
In [16]:
sns.stripplot(x="Type", y="RI", data=df, jitter=True); #讓overlap的部分可以擺在橫向上
In [20]:
sns.barplot(x="Type", y="Na", data=df); #預設取y的平均值
In [19]:
sns.stripplot(x="Type", y="Na", data=df, jitter=True)
Out[19]:
In [11]:
sns.regplot(x="Type", y="RI", data=df, x_jitter=.05);
In [12]:
sns.lmplot(x="Na", y="Mg", hue="Type", data=df)
Out[12]:
In [41]:
T1 = df.loc[df.Type == 1]
T2 = df.loc[df.Type == 2]
T3 = df.loc[df.Type == 3]
T5 = df.loc[df.Type == 5]
T6 = df.loc[df.Type == 6]
T7 = df.loc[df.Type == 7]
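A more compact equivalent (a sketch, not from the original run) is to build the per-type subsets with a single groupby; `by_type` is a hypothetical name:
In [ ]:
# hedged sketch: per-type subsets in one pass (`by_type` is a hypothetical name)
by_type = {t: g for t, g in df.groupby('Type')}
# by_type[1] holds the same rows as T1 above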
In [63]:
### Separate colored contour (KDE) heatmaps, one per glass type
f, ([ax1,ax2],[ax3,ax5]) = plt.subplots(2,2,figsize=(10, 10))
#cmap = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse=True)
# keyword x/y with fill/levels replaces seaborn's old positional/shade/n_levels API
sns.kdeplot(x=T1.Na, y=T1.Mg, cmap='Reds', levels=60, fill=True, ax=ax1)
sns.kdeplot(x=T2.Na, y=T2.Mg, cmap='Blues', levels=60, fill=True, ax=ax2)
sns.kdeplot(x=T3.Na, y=T3.Mg, cmap='Greens', levels=60, fill=True, ax=ax3)
sns.kdeplot(x=T5.Na, y=T5.Mg, cmap='Greys', levels=60, fill=True, ax=ax5)
In [64]:
### Overlaid contour plots; the colors do not blend, so the result is hard to read
f, ax = plt.subplots(figsize=(10, 10))
#cmap = sns.cubehelix_palette(as_cmap=True, dark=0, light=1, reverse=True)
# keyword x/y with fill/levels/thresh replaces seaborn's old positional/shade/n_levels/shade_lowest API
sns.kdeplot(x=T1.Na, y=T1.Mg, cmap='Reds', levels=60, thresh=0.05, fill=True, ax=ax)
sns.kdeplot(x=T2.Na, y=T2.Mg, cmap='Blues', levels=60, thresh=0.05, fill=True, ax=ax)
sns.kdeplot(x=T3.Na, y=T3.Mg, cmap='Greens', levels=60, thresh=0.05, fill=True, ax=ax)
sns.kdeplot(x=T5.Na, y=T5.Mg, cmap='Greys', levels=60, thresh=0.05, fill=True, ax=ax)
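For a broader view than one Na/Mg pair, a pairplot colored by Type shows every pairwise relationship at once; a minimal sketch (not in the original notebook), restricted to a few columns to stay readable:
In [ ]:
# hedged sketch: pairwise relationships for a few oxides, colored by glass type
sns.pairplot(df, vars=['RI', 'Na', 'Mg', 'Al'], hue='Type')
plt.show()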
In [21]:
# Data preprocessing
from sklearn.utils import shuffle
shuffle_df = shuffle(df, random_state=42)
df_label = shuffle_df['Type']
df_feature = shuffle_df.drop('Type', axis=1)
cut_point = round(len(df)*0.6)  # 60/40 train/test split
train_feature = np.array(df_feature.values[:cut_point,:])
train_label = np.array(df_label.values[:cut_point])
test_feature = np.array(df_feature.values[cut_point:,:])
test_label = np.array(df_label.values[cut_point:])
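Because the classes are imbalanced (and Type 4 is absent), a plain 60/40 cut can distort class proportions even after shuffling. A sketch of the stratified alternative, using suffixed names so the arrays above are untouched:
In [ ]:
# hedged sketch: stratified 60/40 split as an alternative to the manual cut point
from sklearn.model_selection import train_test_split
train_feature_s, test_feature_s, train_label_s, test_label_s = train_test_split(
    df.drop('Type', axis=1).values, df['Type'].values,
    test_size=0.4, random_state=42, stratify=df['Type'].values)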
In [74]:
### naive_bayes.GaussianNB()
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split  # sklearn's cross_validation module was removed in favor of model_selection
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label)
clf=naive_bayes.GaussianNB()
clf.fit(X_train,y_train)
print("Training Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
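With only ~214 rows, a single 75/25 split gives a noisy score; stratified cross-validation over the training portion is a steadier estimate. A sketch, not part of the original run:
In [ ]:
# hedged sketch: 5-fold cross-validated accuracy for GaussianNB
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(naive_bayes.GaussianNB(), train_feature, train_label, cv=5)
print('CV accuracy: %.3f +/- %.3f' % (cv_scores.mean(), cv_scores.std()))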
In [26]:
### naive_bayes.MultinomialNB()
from sklearn import naive_bayes
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label)
clf=naive_bayes.MultinomialNB()  # expects non-negative features; the oxide percentages and RI qualify
clf.fit(X_train,y_train)
print("Training Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
In [75]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax= plt.subplots(figsize = (7, 6))
sns.heatmap(conf, annot=True, ax=ax, fmt='d', annot_kws={'size':20})
ax.xaxis.set_ticks_position('top')  # x tick labels on top, as is common in textbooks
plt.show()
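The report-plus-heatmap block above is repeated after several models below; a small helper would keep those cells short. A sketch only (`plot_confusion` is a hypothetical name; the original cells are left as written):
In [ ]:
# hedged sketch: reusable classification report + confusion-matrix heatmap
def plot_confusion(y_true, y_pred, size=(7, 6)):
    y_true = np.asarray(y_true).astype(int)
    y_pred = np.asarray(y_pred).reshape(-1).astype(int)
    print(classification_report(y_true, y_pred))
    conf = confusion_matrix(y_true, y_pred)
    f, ax = plt.subplots(figsize=size)
    sns.heatmap(conf, annot=True, ax=ax, fmt='d', annot_kws={'size': 20})
    ax.xaxis.set_ticks_position('top')  # x tick labels on top, as is common in textbooks
    plt.show()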
In [65]:
### tree.DecisionTreeClassifier()
from sklearn import tree
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label)
clf=tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)
print("Training Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
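An unconstrained tree can memorize the small training set, so the gap between training and testing scores is usually large here. A sketch of one standard remedy (the depth value is arbitrary):
In [ ]:
# hedged sketch: cap tree depth to trade training fit for generalization
clf_shallow = tree.DecisionTreeClassifier(max_depth=4, random_state=0)
clf_shallow.fit(X_train, y_train)
print("Training Score:%f" % clf_shallow.score(train_feature, train_label))
print("Testing Score:%f" % clf_shallow.score(test_feature, test_label))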
In [73]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax= plt.subplots(figsize = (7, 6))
sns.heatmap(conf, annot=True, ax=ax, fmt='d', annot_kws={'size':20})
ax.xaxis.set_ticks_position('top')  # x tick labels on top, as is common in textbooks
plt.show()
In [29]:
### svm.LinearSVC()
from sklearn import svm
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label)
clf=svm.LinearSVC()
clf.fit(X_train,y_train)
print("Training Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
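Linear SVMs are sensitive to feature scale (RI sits near 1.52 while Si is around 72), so scaling usually helps convergence and accuracy. A sketch using a pipeline, not part of the original run:
In [ ]:
# hedged sketch: standardize features before the linear SVM
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(), svm.LinearSVC())
pipe.fit(X_train, y_train)
print("Testing Score:%f" % pipe.score(test_feature, test_label))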
In [30]:
### svm.SVC()
from sklearn import svm
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label)
clf=svm.SVC()
clf.fit(X_train,y_train)
print("Training Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
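The RBF kernel's C and gamma strongly affect SVC; a small grid search is the usual way to pick them. A sketch with an arbitrary grid, not part of the original run:
In [ ]:
# hedged sketch: small grid search over SVC hyperparameters (grid values are arbitrary)
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(svm.SVC(), {'C': [1, 10, 100], 'gamma': ['scale', 0.1, 1]}, cv=5)
grid.fit(train_feature, train_label)
print(grid.best_params_, grid.best_score_)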
In [31]:
### ensemble.AdaBoostClassifier()
from sklearn import ensemble
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf=ensemble.AdaBoostClassifier()
clf.fit(X_train,y_train)
print("Training Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
In [32]:
### ensemble.GradientBoostingClassifier()
from sklearn import ensemble
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf=ensemble.GradientBoostingClassifier()
clf.fit(X_train,y_train)
print("Training Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
In [104]:
### ensemble.RandomForestClassifier()
from sklearn import ensemble
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf=ensemble.RandomForestClassifier()
clf.fit(X_train,y_train)
print("Training Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
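Random forests expose per-feature importances, which shows which oxides drive the classification. A sketch assuming `clf` is the forest fit above:
In [ ]:
# hedged sketch: rank features by the fitted forest's importances
importances = pd.Series(clf.feature_importances_, index=df_feature.columns)
print(importances.sort_values(ascending=False))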
In [76]:
# XGBClassifier()
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf=XGBClassifier()  # note: recent xgboost versions require labels encoded as 0..n-1 (e.g. via LabelEncoder)
clf.fit(X_train,y_train)
print("Training Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
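xgboost ships its own importance plot; a sketch assuming `clf` is the XGBClassifier fit above:
In [ ]:
# hedged sketch: xgboost's built-in feature-importance plot
from xgboost import plot_importance
plot_importance(clf)
plt.show()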
In [77]:
# confusion matrix
prediction2 = clf.predict(test_feature)
prediction2_list = prediction2.reshape(-1).astype(int)
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax= plt.subplots(figsize = (7, 6))
sns.heatmap(conf, annot=True, ax=ax, fmt='d', annot_kws={'size':20})
ax.xaxis.set_ticks_position('top')  # x tick labels on top, as is common in textbooks
plt.show()
In [78]:
# Scale features to [0, 1] with MinMaxScaler (fit on the training set only to avoid leakage)
scaler = MinMaxScaler()
scaler.fit(train_feature)
train_feature_trans = scaler.transform(train_feature)
test_feature_trans = scaler.transform(test_feature)
In [89]:
# Onehot encoding
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(train_label.reshape(-1,1))  # six categories: Types 1, 2, 3, 5, 6, 7 (no Type 4)
train_label_OHE = enc.transform(train_label.reshape(-1,1)).toarray()
test_label_OHE = enc.transform(test_label.reshape(-1,1)).toarray()
In [90]:
train_label_OHE[0:5]
Out[90]:
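Since Type 4 is absent, the six one-hot columns correspond to Types [1, 2, 3, 5, 6, 7] in sorted order; `enc.categories_` recovers that order, which matters later when turning network outputs back into Type values. A short check against the fitted encoder:
In [ ]:
# the column order of the one-hot encoding, and a round trip back to labels
print(enc.categories_[0])  # expected: [1 2 3 5 6 7]
print(enc.inverse_transform(train_label_OHE[0:5]).ravel())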
In [100]:
# Keras MLP models: categorical_clf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
def show_train_history(train_history, train, validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
model = Sequential()
model.add(Dense(units=200,
                input_dim=9,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=200,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=6,  # output layer: one unit per one-hot-encoded class
                kernel_initializer='uniform',
                activation='softmax'))
model.summary()  # prints the layer structure and parameter counts
model.compile(loss='categorical_crossentropy',  # categorical crossentropy for multiclass
              optimizer='adam', metrics=['accuracy'])
train_history = model.fit(x=train_feature_trans, y=train_label_OHE,  # Keras does the validation split internally
                          validation_split=0.8, epochs=500,  # note: 0.8 holds out 80% of the rows for validation
                          batch_size=2000, verbose=2)  # verbose=2 prints one line per epoch
show_train_history(train_history, 'accuracy', 'val_accuracy')  # recent Keras logs 'accuracy' rather than 'acc'
show_train_history(train_history, 'loss', 'val_loss')
scores = model.evaluate(test_feature_trans, test_label_OHE)
print('\n')
print('accuracy=',scores[1])
prediction = np.argmax(model.predict(test_feature_trans), axis=1)  # predict_classes was removed from Keras; argmax over the softmax output is equivalent
In [101]:
# Map the one-hot column indices (0..5) back to the original Type values for the confusion matrix below
pred_types = enc.categories_[0][prediction].astype(int)
df_ans = pd.DataFrame({'Real Class': test_label})
df_ans['Prediction'] = pred_types
In [103]:
# confusion matrix (compare mapped Type values, not raw argmax indices, against the labels)
prediction2_list = pred_types
label2_list = test_label.astype(int)
print(classification_report(label2_list, prediction2_list))
print(confusion_matrix(label2_list, prediction2_list))
# conf heatmap
conf = confusion_matrix(label2_list, prediction2_list)
f, ax= plt.subplots(figsize = (7, 7))
sns.heatmap(conf, annot=True, ax=ax, fmt='d', annot_kws={'size':20})  # annot_kws sets the annotation font size
ax.xaxis.set_ticks_position('top')  # x tick labels on top, as is common in textbooks
plt.show()
In [ ]: