In [47]:
# import common APIs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import os
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve
In [120]:
# Load the data and take a first look
filepath = '/Users/mac/Desktop/Kaggle_datasets/Pokemon/'
filename01 = 'Pokemon721.csv'
df_full = pd.read_csv(os.path.join(filepath, filename01))
df_full.head()
Out[120]:
In [4]:
df_full.info()
In [9]:
df_full.columns
Out[9]:
In [6]:
sns.jointplot(x="HP", y="Attack", data=df_full)
plt.show()
In [7]:
sns.jointplot(x="Attack", y="Defense", data=df_full)
plt.show()
In [8]:
sns.jointplot(x="Attack", y="Sp. Atk", data=df_full)
plt.show()
In [12]:
# Keep only the purely numeric stat columns
num_cols = ['HP', 'Attack', 'Defense','Sp. Atk', 'Sp. Def', 'Speed']
df_num = df_full[num_cols]
In [19]:
plt.figure(figsize=(8,6))
sns.boxplot(data=df_num)
plt.show()
In [13]:
sns.pairplot(df_num)
plt.show()
In [125]:
plt.figure(figsize=(6,6)) # adjust the figure size here
sns.set(font_scale=1.25)
hm = sns.heatmap(df_num.corr(), cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
cmap='rainbow')
hm.xaxis.set_ticks_position('top')
plt.show()
In [36]:
pkmn = pd.melt(df_full, id_vars=["Name", "Type 1", "Type 2"], value_vars=num_cols)
pkmn.head()
Out[36]:
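In [ ]:
# A quick sanity check (sketch): melt stacks the six stat columns into
# (variable, value) pairs, so the long-format table should have six times
# as many rows as df_full.
print(df_full.shape, pkmn.shape)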
In [38]:
plt.figure(figsize=(10,10))
sns.swarmplot(x='variable', y="value", data=pkmn, hue="Type 1");
plt.show()
In [41]:
plt.figure(figsize=(12,10))
plt.ylim(0, 275)
sns.swarmplot(x='variable', y="value", data=pkmn, hue="Type 1", dodge=True, size=7)
plt.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.);
plt.show()
In [46]:
sns.set_style("whitegrid")
with sns.color_palette([
"#8ED752", "#F95643", "#53AFFE", "#C3D221", "#BBBDAF",
"#AD5CA2", "#F8E64E", "#F0CA42", "#F9AEFE", "#A35449",
"#FB61B4", "#CDBD72", "#7673DA", "#66EBFF", "#8B76FF",
"#8E6856", "#C3C1D7", "#75A4F9"], n_colors=18, desat=.9):
plt.figure(figsize=(12,10))
plt.ylim(0, 275)
sns.swarmplot(x="variable", y="value", data=pkmn, hue="Type 1", dodge=True, size=7)
plt.legend(bbox_to_anchor=(1, 1), loc=2, borderaxespad=0.);
plt.show()
In [48]:
scaler = StandardScaler().fit(df_num)
df_scaled = scaler.transform(df_num)
print(df_scaled[:,0].mean()) # zero (or very close)
print(df_scaled[:,0].std()) # 1 (or very close)
In [50]:
pca = PCA(n_components=0.8) # consider enough components to explain 80% of the variance
pca.fit(df_scaled)
pcscores = pd.DataFrame(pca.transform(df_scaled))
pcscores.columns = ['PC'+str(i+1) for i in range(len(pcscores.columns))]
In [52]:
pcscores.head()
Out[52]:
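In [ ]:
# Sanity check (sketch): with n_components=0.8, PCA keeps the smallest
# number of components whose explained variance ratios sum to >= 0.8.
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())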
In [55]:
pca.components_ # how strongly each principal component loads on each original column
Out[55]:
In [53]:
# num_cols = ['HP', 'Attack', 'Defense','Sp. Atk', 'Sp. Def', 'Speed']
loadings = pd.DataFrame(pca.components_, columns=num_cols)
loadings.index = ['PC'+str(i+1) for i in range(len(pcscores.columns))]
In [54]:
loadings
Out[54]:
In [72]:
plt.figure(figsize=(6,6))
load_sqr = loadings**2
ax = sns.heatmap(load_sqr.T, linewidths=0.5, cmap="BuGn", annot=True, annot_kws={"size": 15})
ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=0, fontsize=15)
ax.set_yticklabels(ax.yaxis.get_majorticklabels(), rotation=0, fontsize=15)
plt.show()
In [71]:
plt.figure(figsize=(6,6))
ax = sns.heatmap(loadings.T, center=0, linewidths=0.5,
cmap="RdBu", vmin=-1, vmax=1, annot=True, annot_kws={"size": 15})
ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=0, fontsize=15)
ax.set_yticklabels(ax.yaxis.get_majorticklabels(), rotation=0, fontsize=15)
plt.show()
In [77]:
best = pcscores.sort_values(by='PC4', ascending=False)[:5] # the rows with the largest PC4 also have the highest HP
df_full.loc[best.index]
Out[77]:
In [75]:
df_full.sort_values(by='HP', ascending=False)[:5] # same five rows as above
Out[75]:
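In [ ]:
# Sketch of a check for the claim above: the five largest-PC4 rows should
# be the same Pokemon as the five largest-HP rows (ignoring order).
top_hp = df_full.sort_values(by='HP', ascending=False)[:5]
print(set(best.index) == set(top_hp.index))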
In [121]:
dict_legend = {True:1, False:0} # the column holds bools, so the dict keys must be booleans, not the strings 'True'/'False'!
df_full['Legendary'] = df_full['Legendary'].map(dict_legend)
df_full.head()
Out[121]:
In [88]:
cols = ['HP', 'Attack', 'Defense','Sp. Atk', 'Sp. Def','Speed','Legendary']
df_fl = df_full[cols]
In [89]:
from sklearn.utils import shuffle
shuffle_df = shuffle(df_fl, random_state=42)
df_label = shuffle_df['Legendary']
df_feature = shuffle_df.drop('Legendary', axis=1)
cut_point = round(len(df_fl)*0.6)
train_feature = np.array(df_feature.values[:cut_point,:])
train_label = np.array(df_label.values[:cut_point])
test_feature = np.array(df_feature.values[cut_point:,:])
test_label = np.array(df_label.values[cut_point:])
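In [ ]:
# Legendary Pokemon are rare, so the label is imbalanced; a quick count
# (sketch) of 0s and 1s in each split makes that explicit.
print(np.bincount(train_label))
print(np.bincount(test_label))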
In [90]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split # sklearn.cross_validation was removed in scikit-learn 0.20
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf = ensemble.RandomForestClassifier()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
In [91]:
from sklearn import tree
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf = tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label)) #over-training
In [92]:
from sklearn import svm
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf = svm.SVC()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label)) #over-training
In [93]:
# Scale features to [0, 1] (min-max scaling, not standardization)
scaler = MinMaxScaler()
scaler.fit(train_feature)
train_feature_trans = scaler.transform(train_feature)
test_feature_trans = scaler.transform(test_feature)
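In [ ]:
# Sanity check (sketch): the scaled training features should lie in [0, 1];
# the test set may fall slightly outside, since the scaler was fit on the
# training data only.
print(train_feature_trans.min(axis=0), train_feature_trans.max(axis=0))
print(test_feature_trans.min(axis=0), test_feature_trans.max(axis=0))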
In [98]:
# Keras MLP models
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
def show_train_history(train_history,train,validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
model = Sequential()
model.add(Dense(units=200,
                input_dim=6,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=200,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=1, # single output unit
                kernel_initializer='uniform',
                activation='sigmoid'))
model.summary() # prints the architecture and parameter counts itself, no print() needed
model.compile(loss='binary_crossentropy', # binary cross-entropy for a binary label
              optimizer='adam', metrics=['accuracy'])
train_history = model.fit(x=train_feature_trans, y=train_label, # the extra validation split done above is built into Keras here
                          validation_split=0.8, epochs=200,
                          batch_size=2000, verbose=2) # verbose=2 prints one line per epoch
show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')
scores = model.evaluate(test_feature_trans, test_label)
print('\n')
print('accuracy=',scores[1])
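In [ ]:
# Sketch: threshold the sigmoid outputs at 0.5 to get hard 0/1 labels,
# then inspect the confusion matrix on the test set.
prob = model.predict(test_feature_trans)
pred = (prob > 0.5).astype(int).ravel()
print(confusion_matrix(test_label, pred))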
In [112]:
from sklearn.utils import shuffle
df_PCA = pcscores
df_PCA['Legendary'] = df_full['Legendary']
shuffle_df = shuffle(df_PCA, random_state=42)
df_label = shuffle_df['Legendary']
df_feature = shuffle_df.drop('Legendary', axis=1)
cut_point = round(len(df_PCA)*0.6)
train_feature = np.array(df_feature.values[:cut_point,:])
train_label = np.array(df_label.values[:cut_point])
test_feature = np.array(df_feature.values[cut_point:,:])
test_label = np.array(df_label.values[cut_point:])
In [114]:
df_PCA.head()
Out[114]:
In [115]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf = ensemble.RandomForestClassifier()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label))
In [116]:
from sklearn import tree
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf = tree.DecisionTreeClassifier()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label)) #over-training
In [117]:
from sklearn import svm
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(train_feature,train_label,
                                                 test_size=0.25, random_state=0,stratify=train_label) # stratified sampling
clf = svm.SVC()
clf.fit(X_train,y_train)
print("Traing Score:%f"%clf.score(train_feature,train_label))
print("Testing Score:%f"%clf.score(test_feature,test_label)) #over-training
In [119]:
# Keras MLP models
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
def show_train_history(train_history,train,validation):
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[validation])
    plt.title('Train History')
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='best')
    plt.show()
model = Sequential()
model.add(Dense(units=200,
                input_dim=4, # matches the number of principal components kept above
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=200,
                kernel_initializer='uniform',
                activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=1, # single output unit
                kernel_initializer='uniform',
                activation='sigmoid'))
model.summary() # show the architecture and parameter counts
model.compile(loss='binary_crossentropy', # binary cross-entropy for a binary label
              optimizer='adam', metrics=['accuracy'])
train_history = model.fit(x=train_feature, y=train_label, # Keras handles the validation split internally via validation_split
                          validation_split=0.8, epochs=120,
                          batch_size=2000, verbose=2) # verbose=2 prints one line per epoch
show_train_history(train_history,'acc','val_acc')
show_train_history(train_history,'loss','val_loss')
scores = model.evaluate(test_feature, test_label)
print('\n')
print('accuracy=',scores[1])