In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report,confusion_matrix,precision_recall_curve, auc,roc_auc_score, roc_curve
In [2]:
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
filepath = '/Users/mac/Desktop/Kaggle_datasets/Voice_Gender/'
filename01 = 'voice.csv'
# Load the Kaggle "Voice Gender" dataset; last expression shows the first rows.
df_full = pd.read_csv(os.path.join(filepath, filename01))
df_full.head()
Out[2]:
In [3]:
df_full.info()  # column dtypes and non-null counts for a quick sanity check
In [12]:
df_full.columns  # all acoustic feature names plus the 'label' target column
Out[12]:
In [7]:
# Encode the target column: male -> 0, female -> 1.
sex_label = {'male': 0, 'female': 1}
df_full['label'] = df_full['label'].map(sex_label)
In [8]:
df_full  # NOTE(review): displays the entire frame; .head() would be lighter output
Out[8]:
In [9]:
df_full['label'].value_counts() # fairly balanced class distribution
Out[9]:
In [11]:
k = 10  # number of variables shown in the heatmap
corrmat = df_full.corr()
# Select the k features most correlated with the target.
cols = corrmat.nlargest(k, 'label')['label'].index
# BUG FIX: the original stored this matrix in `cm`, shadowing the
# `from matplotlib import cm` import and breaking later cells that use cm.jet.
corr_top = np.corrcoef(df_full[cols].values.T)
plt.figure(figsize=(15, 15))  # adjust figure size here
sns.set(font_scale=1.25)
hm = sns.heatmap(corr_top, cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values, cmap='rainbow')
plt.show()
In [15]:
cols = ['meanfreq', 'sd', 'median', 'Q25', 'Q75', 'IQR', 'skew', 'kurt',
        'sp.ent', 'sfm', 'mode', 'centroid', 'meanfun', 'minfun', 'maxfun',
        'meandom', 'mindom', 'maxdom', 'dfrange', 'modindx']
# One KDE plot per feature, split by gender label, to eyeball separability.
for col in cols:
    # FIX: seaborn >= 0.9 renamed FacetGrid's `size` parameter to `height`
    # (the old name was later removed entirely).
    facet = sns.FacetGrid(df_full, hue='label', aspect=4, height=4)
    facet.map(sns.kdeplot, col, shade=True)
    facet.set()
    facet.add_legend()
    plt.show()
In [17]:
from sklearn.utils import shuffle

# Shuffle once with a fixed seed, then take the first 60% as the training
# split and the remaining 40% as the test split.
shuffle_df = shuffle(df_full, random_state=42)
df_label = shuffle_df['label']
df_feature = shuffle_df.drop('label', axis=1)

cut_point = round(len(df_full) * 0.6)
feature_values = df_feature.values
label_values = df_label.values
train_feature = np.array(feature_values[:cut_point, :])
train_label = np.array(label_values[:cut_point])
test_feature = np.array(feature_values[cut_point:, :])
test_label = np.array(label_values[cut_point:])
In [18]:
# Fit min-max scaling on the training split only, then apply the same
# transform to both splits (avoids test-set leakage into the scaler).
scaler = MinMaxScaler()
train_feature_trans = scaler.fit_transform(train_feature)
test_feature_trans = scaler.transform(test_feature)
In [19]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

def show_train_history(train_history, train, validation):
    """Plot one training metric and its validation counterpart over epochs.

    Handles both older Keras history keys ('acc'/'val_acc') and the
    newer ones ('accuracy'/'val_accuracy').
    """
    hist = train_history.history
    aliases = {'acc': 'accuracy', 'val_acc': 'val_accuracy',
               'accuracy': 'acc', 'val_accuracy': 'val_acc'}
    train_key = train if train in hist else aliases.get(train, train)
    val_key = validation if validation in hist else aliases.get(validation, validation)
    plt.plot(hist[train_key])
    plt.plot(hist[val_key])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()

# MLP: 20 scaled features -> two hidden layers of 200 relu units -> 1 sigmoid output.
model = Sequential()
model.add(Dense(units=200,
                input_dim=20,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=200,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=1,  # single sigmoid unit for the binary output
                kernel_initializer='uniform',
                activation='sigmoid'))
print(model.summary())  # shows the layer stack and parameter counts

model.compile(loss='binary_crossentropy',  # binary classification loss
              optimizer='adam', metrics=['accuracy'])

# BUG FIX: the original used validation_split=0.8, which holds out 80% of the
# data for validation and trains on only 20%; 0.2 matches the stated intent
# of a standard train/validation split.
train_history = model.fit(x=train_feature_trans, y=train_label,
                          validation_split=0.2, epochs=300,
                          batch_size=2000, verbose=1)  # verbose=1 prints per-epoch progress

show_train_history(train_history, 'acc', 'val_acc')
show_train_history(train_history, 'loss', 'val_loss')

scores = model.evaluate(test_feature_trans, test_label)
print('\n')
print('accuracy=', scores[1])

# FIX: Sequential.predict_classes was removed in TF 2.6; threshold the
# sigmoid outputs at 0.5 instead (equivalent for binary classification).
prediction = (model.predict(test_feature_trans) > 0.5).astype('int32')
#model.save_weights("Keras_VoiceGender_MLP.h5")
#print('model saved to disk')
In [20]:
# Flatten predictions and cast both arrays to int for the sklearn metrics.
prediction2 = prediction
prediction2_list = prediction2.ravel().astype(int)
label2_list = test_label.astype(int)
report = classification_report(label2_list, prediction2_list)
matrix = confusion_matrix(label2_list, prediction2_list)
print(report)
print(matrix)
In [23]:
# Confusion matrix rendered as an annotated heatmap.
conf = confusion_matrix(label2_list, prediction2_list)
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # x labels on top, as is common in textbooks
plt.show()
In [24]:
def model_efficacy(conf):
    """Summarize a binary sklearn confusion matrix (rows = true, cols = predicted).

    Treats class 1 as the positive class, matching the
    sklearn.metrics.confusion_matrix layout [[TN, FP], [FN, TP]].

    Parameters
    ----------
    conf : 2x2 array-like confusion matrix.

    Returns
    -------
    tuple
        (total_num, sensitivity, specificity,
         false_positive_rate, false_negative_rate)
    """
    total_num = np.sum(conf)
    # BUG FIX: the original indexed the matrix inconsistently — its
    # "sensitivity" was TP/(TP+FP) (i.e. precision) and "specificity"
    # mixed cells from the wrong row. Use the sklearn cell layout.
    tn, fp = conf[0][0], conf[0][1]
    fn, tp = conf[1][0], conf[1][1]
    sen = tp / (tp + fn)                  # sensitivity / recall
    spe = tn / (tn + fp)                  # specificity
    false_positive_rate = fp / (fp + tn)  # = 1 - specificity
    false_negative_rate = fn / (fn + tp)  # = 1 - sensitivity
    print('total_num: ', total_num)
    print('TN: ', tn)  # true 0, predicted 0
    print('FP: ', fp)  # true 0, predicted 1
    print('FN: ', fn)  # true 1, predicted 0
    print('TP: ', tp)  # true 1, predicted 1
    print('##########################')
    print('sensitivity: ', sen)
    print('specificity: ', spe)
    print('false_positive_rate: ', false_positive_rate)
    print('false_negative_rate: ', false_negative_rate)
    return total_num, sen, spe, false_positive_rate, false_negative_rate
conf = confusion_matrix(label2_list, prediction2_list)  # rows = true labels, cols = predictions
model_efficacy(conf)
Out[24]:
In [27]:
# FIX: feed continuous scores (the model's sigmoid outputs) to roc_curve
# instead of hard 0/1 class predictions — thresholded predictions produce a
# degenerate ROC curve with a single operating point and understate the AUC.
pred_score = model.predict(test_feature_trans).ravel()
false_positive_rate, true_positive_rate, thresholds = roc_curve(label2_list, pred_score)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal for reference
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [42]:
# Scikit-learn baseline: random forest on the scaled features.
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn import datasets, ensemble
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_feature_trans, train_label,
    test_size=0.25, random_state=0, stratify=train_label)  # stratified sampling

clf = ensemble.RandomForestClassifier()
clf.fit(X_train, y_train)
print("Training Score:%f" % clf.score(train_feature_trans, train_label))  # typo fixed: "Traing"
print("Testing Score:%f" % clf.score(test_feature_trans, test_label))
In [43]:
# Score the random forest on the held-out test split.
prediction2 = clf.predict(test_feature_trans)
prediction2_list = prediction2.ravel().astype(int)
label2_list = test_label.astype(int)
report = classification_report(label2_list, prediction2_list)
matrix = confusion_matrix(label2_list, prediction2_list)
print(report)
print(matrix)
In [44]:
# Confusion-matrix heatmap for the random-forest predictions.
conf = confusion_matrix(label2_list, prediction2_list)
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # x labels on top, as is common in textbooks
plt.show()
In [45]:
# model_efficacy
# FIX: this cell redefined model_efficacy with an identical body, silently
# shadowing the version from cell In[24]. Reuse the existing function instead
# of maintaining a duplicate definition.
conf = confusion_matrix(label2_list, prediction2_list)
model_efficacy(conf)
Out[45]:
In [46]:
# ROC curve.
# FIX: use class-1 probabilities rather than hard 0/1 predictions so the
# ROC curve sweeps a full range of thresholds instead of a single point.
pred_score = clf.predict_proba(test_feature_trans)[:, 1]
false_positive_rate, true_positive_rate, thresholds = roc_curve(label2_list, pred_score)
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal for reference
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
In [90]:
# Fit a 10-component PCA on the scaled training features; the last
# expression displays the variance ratio explained by each component.
pca = PCA(n_components=10).fit(train_feature_trans)
pca_score = pca.explained_variance_ratio_
pca_score
Out[90]:
In [91]:
# Project both splits onto the principal components fitted on the training data.
train_feature_trans_PCA = pca.transform(train_feature_trans)
test_feature_trans_PCA = pca.transform(test_feature_trans)
In [92]:
# Random forest re-trained on the PCA-reduced features.
# FIX: sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn import datasets, ensemble
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_feature_trans_PCA, train_label,
    test_size=0.25, random_state=0, stratify=train_label)  # stratified sampling

clf = ensemble.RandomForestClassifier()
clf.fit(X_train, y_train)
print("Training Score:%f" % clf.score(train_feature_trans_PCA, train_label))  # typo fixed: "Traing"
print("Testing Score:%f" % clf.score(test_feature_trans_PCA, test_label))
In [93]:
# Score the PCA-based random forest on the held-out test split.
prediction2 = clf.predict(test_feature_trans_PCA)
prediction2_list = prediction2.ravel().astype(int)
label2_list = test_label.astype(int)
report = classification_report(label2_list, prediction2_list)
matrix = confusion_matrix(label2_list, prediction2_list)
print(report)
print(matrix)
In [96]:
# Confusion-matrix heatmap for the PCA-based random-forest predictions.
conf = confusion_matrix(label2_list, prediction2_list)
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf, annot=True, ax=ax, fmt='d')
ax.xaxis.set_ticks_position('top')  # x labels on top, as is common in textbooks
plt.show()
In [97]:
# 2-D view of the first two principal components, colored by gender label.
# FIX: reference the colormap as plt.cm.jet — the bare name `cm` is shadowed
# by cell In[11], which reassigns it to a numpy correlation matrix, so
# `cm.jet` crashes on an in-order run.
plt.scatter(train_feature_trans_PCA[:, 0], train_feature_trans_PCA[:, 1],
            alpha=0.3, c=train_label, cmap=plt.cm.jet, vmin=0., vmax=1.)
plt.show()
In [101]:
# 3-D view of the first three principal components, colored by gender label.
# FIX: dropped the redundant Axes3D re-import (already imported in the top
# import cell) and use plt.cm.jet, which cell In[11]'s `cm` reassignment
# cannot shadow.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(train_feature_trans_PCA[:, 0], train_feature_trans_PCA[:, 1],
           train_feature_trans_PCA[:, 2], alpha=0.3,
           c=train_label, cmap=plt.cm.jet, vmin=0., vmax=1.)
plt.show()
In [ ]: