In [1]:
# import common APIs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn import model_selection, naive_bayes, tree, svm, ensemble
from sklearn.metrics import classification_report,confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve
from xgboost import XGBClassifier
In [2]:
# Data observation
filepath = '/Users/mac/Desktop/Kaggle_datasets/Med_Appointment/'
filename01 = 'KaggleV2-052016.csv'
df_full = pd.read_csv(os.path.join(filepath, filename01))
df_full.head()
Out[2]:
In [3]:
df_full.info()
In [4]:
df_full.columns
Out[4]:
In [5]:
df_full.Handcap.value_counts()
Out[5]:
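In [ ]:
# Sketch: Handcap takes small integer values (0-4 here) rather than being a
# binary flag, so a per-level no-show rate is more informative than raw counts.
df_full.groupby('Handcap')['No-show'].value_counts(normalize=True).unstack()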
In [ ]:
## Not run: this is far too slow, because ScheduledDay and AppointmentDay are raw timestamps and get_dummies creates one column per unique value
df_dum = pd.get_dummies(df_full[['Gender', 'ScheduledDay','AppointmentDay', 'Age',
'Neighbourhood', 'Scholarship', 'Hipertension','Diabetes',
'Alcoholism', 'Handcap', 'SMS_received', 'No-show']])
# Corr heatmap: view the ranked correlation coefficients at a glance
k = 20 #number of variables for heatmap
corrmat = df_dum.corr()
cols = corrmat.nlargest(k, 'No-show_Yes')['No-show_Yes'].index  # get_dummies splits 'No-show' into No-show_No / No-show_Yes
cm = np.corrcoef(df_dum[cols].values.T)
plt.figure(figsize=(15,15))  # adjust figure size as needed
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
yticklabels = cols.values, xticklabels = cols.values, cmap='rainbow')
hm.xaxis.set_ticks_position('top')
plt.show()
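In [ ]:
# Sketch (not run): the dummy-encoding cell above is slow mainly because
# ScheduledDay and AppointmentDay are raw timestamps. A lighter alternative
# is to parse them and derive numeric features; the column names follow this
# dataset's schema, the derived feature names are illustrative.
scheduled = pd.to_datetime(df_full['ScheduledDay'])
appointment = pd.to_datetime(df_full['AppointmentDay'])
df_full['WaitingDays'] = (appointment.dt.normalize() - scheduled.dt.normalize()).dt.days
df_full['AppointmentWeekday'] = appointment.dt.weekday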
In [38]:
fig, ([axis1,axis2],[axis3,axis4]) = plt.subplots(2,2,figsize=(10,10))
sns.countplot(x='Gender', data=df_num, ax=axis1)
sns.countplot(x='No-show', data=df_num, ax=axis2)
sns.stripplot(x='No-show', y='Gender', data=df_num, ax=axis3, jitter=True)
sns.barplot(x='Gender', y='No-show', data=df_num, ax=axis4)
plt.show()
In [39]:
fig, ([axis1,axis2],[axis3,axis4]) = plt.subplots(2,2,figsize=(10,10))
sns.countplot(x='Handcap', data=df_num, ax=axis1)
sns.countplot(x='No-show', data=df_num, ax=axis2)
sns.stripplot(x='No-show', y='Handcap', data=df_num, ax=axis3, jitter=True)
sns.barplot(x='Handcap', y='No-show', data=df_num, ax=axis4)
plt.show()
In [6]:
sns.distplot(df_full['Age'], kde=False, bins=15)
plt.show()
In [11]:
g = sns.FacetGrid(df_full, col='No-show', row='Gender')
g.map(plt.hist, 'Age')  # plot the Age distribution for each combination of the two facet variables
plt.show()
In [12]:
g = sns.FacetGrid(df_full, col='No-show', row='Handcap')
g.map(plt.hist, 'Age')  # plot the Age distribution for each combination of the two facet variables
plt.show()  # this suggests Handcap is an influential factor
In [13]:
# graph distribution of sms reminders
sns.countplot(x='SMS_received', hue='No-show', data=df_full)
plt.show()
In [16]:
sns.stripplot(x='No-show', y='Age', data=df_full,
hue='SMS_received', jitter=True)
plt.show()
In [33]:
cols = ['Gender','Age', 'Neighbourhood', 'Scholarship', 'Hipertension',
'Diabetes', 'Alcoholism', 'Handcap', 'SMS_received', 'No-show']
df_num = df_full[cols].apply(LabelEncoder().fit_transform)
df_num.head() #F:0/M:1, No:0/Yes:1
Out[33]:
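In [ ]:
# Sketch: LabelEncoder assigns codes in sorted order, which is where the
# F:0/M:1 and No:0/Yes:1 mappings noted above come from. To inspect a mapping:
le = LabelEncoder().fit(df_full['Gender'])
print(dict(zip(le.classes_, le.transform(le.classes_))))  # {'F': 0, 'M': 1}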
In [34]:
# scale Age to [0, 1]; in practice this did not improve the results
scaler = MinMaxScaler()
scaler.fit(df_num.Age.values.reshape(-1, 1))
df_num['Age'] = scaler.transform(df_num.Age.values.reshape(-1, 1))
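In [ ]:
# Sketch (alternative, not applied): standardization is the usual thing to
# try when min-max scaling does not help; this would replace the cell above.
scaler = StandardScaler()
df_num['Age'] = scaler.fit_transform(df_num.Age.values.reshape(-1, 1))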
In [35]:
df_num = pd.get_dummies(df_num, columns=['Neighbourhood'])
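In [ ]:
# Sketch: a quick dimensionality check. One-hot encoding Neighbourhood
# expands the frame to roughly 90 columns, i.e. 89 features once 'No-show'
# is dropped, matching input_dim=89 in the Keras model further below.
print(df_num.shape)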
In [41]:
df_num.head()
Out[41]:
In [24]:
# Data preprocessing
from sklearn.utils import shuffle
shuffle_df = shuffle(df_num, random_state=42)
df_label = shuffle_df['No-show']
df_feature = shuffle_df.drop('No-show', axis=1)
cut_point = round(len(df_num)*0.6)
train_feature = np.array(df_feature.values[:cut_point,:])
train_label = np.array(df_label.values[:cut_point])
test_feature = np.array(df_feature.values[cut_point:,:])
test_label = np.array(df_label.values[cut_point:])
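In [ ]:
# Sketch: the manual 60/40 cut above can equivalently be written with
# train_test_split, which shuffles and can stratify on the label in one call.
from sklearn.model_selection import train_test_split
tr_X, te_X, tr_y, te_y = train_test_split(df_feature, df_label, test_size=0.4,
                                          random_state=42, stratify=df_label)
# Roughly 80% of appointments are kept (No-show = 0), so a majority-class
# predictor already scores about 0.8; compare the model accuracies below
# against this baseline.
print('majority-class baseline: %f' % (1 - df_label.mean()))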
In [25]:
### naive_bayes.BernoulliNB()
from sklearn import model_selection, naive_bayes
X_train,X_test,y_train,y_test = model_selection.train_test_split(train_feature,train_label,
                                test_size=0.25, random_state=0,stratify=train_label)
clf=naive_bayes.BernoulliNB()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
In [26]:
### naive_bayes.GaussianNB()
from sklearn import model_selection, naive_bayes
X_train,X_test,y_train,y_test = model_selection.train_test_split(train_feature,train_label,
                                test_size=0.25, random_state=0,stratify=train_label)
clf=naive_bayes.GaussianNB()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
In [27]:
### tree.DecisionTreeClassifier()
from sklearn import model_selection,tree
X_train,X_test,y_train,y_test = model_selection.train_test_split(train_feature,train_label,
                                test_size=0.25, random_state=0,stratify=train_label)
clf=tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
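In [ ]:
# Sketch: an unconstrained DecisionTreeClassifier typically memorizes the
# training set; capping max_depth is a quick check on how much of the
# training score is overfitting.
clf = tree.DecisionTreeClassifier(max_depth=5)
clf.fit(X_train, y_train)
print("Training Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))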
In [28]:
### svm.LinearSVC()
from sklearn import model_selection,svm
X_train,X_test,y_train,y_test = model_selection.train_test_split(train_feature,train_label,
                                test_size=0.25, random_state=0,stratify=train_label)
clf=svm.LinearSVC()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
In [29]:
### ensemble.AdaBoostClassifier()
from sklearn import model_selection,ensemble
X_train,X_test,y_train,y_test = model_selection.train_test_split(train_feature,train_label,
                                test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf=ensemble.AdaBoostClassifier()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
In [30]:
### ensemble.RandomForestClassifier()
from sklearn import model_selection,ensemble
X_train,X_test,y_train,y_test = model_selection.train_test_split(train_feature,train_label,
                                test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf=ensemble.RandomForestClassifier()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
In [31]:
# XGBClassifier()
from xgboost import XGBClassifier
X_train,X_test,y_train,y_test = model_selection.train_test_split(train_feature,train_label,
                                test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf=XGBClassifier()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
y_predict = clf.predict(X_test)
print('\n'+classification_report(y_test,y_predict))
y_predict2 = clf.predict(test_feature)
print('\n'+classification_report(test_label,y_predict2))
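In [ ]:
# Sketch: roc_curve/auc are imported at the top but never used; this plots
# the ROC for the last fitted classifier (here the XGBClassifier) on the
# held-out 40%.
y_score = clf.predict_proba(test_feature)[:, 1]
fpr, tpr, _ = roc_curve(test_label, y_score)
plt.plot(fpr, tpr, label='AUC = %.3f' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='best')
plt.show()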
In [43]:
# Keras MLP models
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
def show_train_history(train_history, train, validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
model = Sequential()
model.add(Dense(units=500,
                input_dim=89,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=200,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=1,  # single output unit
                kernel_initializer='uniform',
                activation='sigmoid'))
print(model.summary())  # shows the model architecture and parameter counts
model.compile(loss='binary_crossentropy',  # binary target, so binary cross-entropy
              optimizer='adam', metrics=['accuracy'])
train_history = model.fit(x=train_feature, y=train_label,  # Keras performs the validation split internally
                          validation_split=0.2, epochs=50,  # hold out 20% of the training data for validation
                          batch_size=2000, verbose=2)  # verbose=2 prints per-epoch progress
show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')
scores = model.evaluate(test_feature, test_label)
print('\n')
print('accuracy=',scores[1])
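In [ ]:
# Sketch: to compare the MLP with the sklearn models above, threshold the
# sigmoid outputs at 0.5 and print the same classification_report.
y_prob = model.predict(test_feature)
y_pred = (y_prob > 0.5).astype(int).ravel()
print(classification_report(test_label, y_pred))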
In [ ]: