In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.family']='SimHei' # display Chinese characters

%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

Data Loading and Exploration


In [3]:
# Load the train/test/submission datasets
train = pd.read_csv('input/train.csv', encoding = "utf-8", dtype = {'type': np.int32})
test = pd.read_csv('input/test.csv', encoding = "utf-8")
submission = pd.read_csv('input/submission.csv', encoding = "utf-8", dtype = {'type': np.int32})

In [4]:
train.head(3)


Out[4]:
id 花萼長度 花萼寬度 花瓣長度 花瓣寬度 屬種 type
0 1 5.4 3.7 1.5 0.2 Iris-setosa 1
1 2 4.8 3.4 1.6 0.2 Iris-setosa 1
2 3 4.8 3.0 1.4 0.1 Iris-setosa 1

In [5]:
test.head(3)


Out[5]:
id 花萼長度 花萼寬度 花瓣長度 花瓣寬度
0 1 5.1 3.5 1.4 0.2
1 2 4.9 3.0 1.4 0.2
2 3 4.7 3.2 1.3 0.2

In [6]:
submission.head(3)


Out[6]:
id type
0 1 1
1 2 1
2 3 1

One-hot Encoding


In [7]:
df1 = pd.get_dummies(train['屬種'])
df1.sample(5)


Out[7]:
Iris-new_type Iris-setosa Iris-versicolor Iris-virginica
116 0 0 0 1
121 1 0 0 0
113 0 0 0 1
55 0 0 1 0
42 0 0 1 0
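
If the dummy columns are to be used as model features, they are usually concatenated back onto the frame and the original categorical column dropped. A minimal sketch of that step (not run in this notebook):

train_ohe = pd.concat([train.drop('屬種', axis=1), df1], axis=1)  # numeric features + dummies
train_ohe.head(3)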

Label Encoding


In [8]:
df2 = train['屬種'].replace({'Iris-setosa':1,'Iris-versicolor':2,'Iris-virginica':3})
df2.sample(5)


Out[8]:
113    3
93     3
94     3
6      1
32     1
Name: 屬種, dtype: object
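
Note the object dtype above: 'Iris-new_type' is absent from the replacement map, so those rows keep their string value. A sketch of the same idea with scikit-learn's LabelEncoder, which assigns an integer to every category it sees (classes are sorted alphabetically):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df2_le = pd.Series(le.fit_transform(train['屬種']), index=train.index)
print(le.classes_)  # the category-to-integer mapping, in sorted order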

Data Cleaning - Missing Value Handling


In [9]:
#missing data
miss_sum = train.isnull().sum().sort_values(ascending=False)
miss_sum


Out[9]:
花萼寬度    1
花萼長度    1
type    0
屬種      0
花瓣寬度    0
花瓣長度    0
id      0
dtype: int64

In [10]:
# Find the rows that contain missing values
print(train[train['花萼寬度'].isnull()])
print("--------------------------------")
print(train[train['花萼長度'].isnull()])


      id  花萼長度  花萼寬度  花瓣長度  花瓣寬度             屬種  type
121  122   5.2   NaN   5.1   1.8  Iris-new_type     4
--------------------------------
      id  花萼長度  花萼寬度  花瓣長度  花瓣寬度             屬種  type
120  121   NaN   3.0   4.9   1.2  Iris-new_type     4

In [11]:
# Drop the NaN rows directly (fine when they are few and dropping them does not affect modeling)
train_d_na = train.dropna().reset_index(drop=True)
train_d_na.isnull().sum().sort_values(ascending=False)


Out[11]:
type    0
屬種      0
花瓣寬度    0
花瓣長度    0
花萼寬度    0
花萼長度    0
id      0
dtype: int64

In [12]:
# Fill the missing value with the mean
#train.loc[train['花萼寬度'].isnull(),['花萼寬度']] = train['花萼寬度'].mean() # 花萼寬度: column 2
train['花萼寬度'] = train['花萼寬度'].fillna(train['花萼寬度'].mean())

train.plot(kind='line',y='花萼寬度',figsize=(10,6),fontsize=14,title='花萼寬度')


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0xa30df98>
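
If the test set also had gaps, the fill values should come from training-set statistics so that no test information leaks into preprocessing. A sketch of that pattern:

feature_cols = ['花瓣寬度','花瓣長度','花萼寬度','花萼長度']
train_means = train[feature_cols].mean()
test[feature_cols] = test[feature_cols].fillna(train_means)  # fill any test gaps with train means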

In [13]:
# Fill the missing value with the mode
#train.loc[train['花萼長度'].isnull(),['花萼長度']] = train['花萼長度'].mode()[0] # 花萼長度: column 1
train['花萼長度'] = train['花萼長度'].fillna(train['花萼長度'].mode()[0])

train.plot(kind='line',y='花萼長度',figsize=(10,6),fontsize=14,title='花萼長度')


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xa426908>

In [14]:
from pandas.plotting import scatter_matrix
scatter_matrix( train[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']],figsize=(10, 10),color='b')


Out[14]:
<4x4 array of matplotlib AxesSubplot objects (scatter-matrix grid)>

Correlation Analysis


In [15]:
corr = train[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']].corr()
print(corr)


          花瓣寬度      花瓣長度      花萼寬度      花萼長度
花瓣寬度  1.000000  0.958595 -0.367957  0.810219
花瓣長度  0.958595  1.000000 -0.436213  0.850649
花萼寬度 -0.367957 -0.436213  1.000000 -0.130174
花萼長度  0.810219  0.850649 -0.130174  1.000000

In [16]:
import seaborn as sns
plt.rcParams['font.family']='DFKai-SB' # display Chinese characters
plt.figure(figsize=(10,10))
sns.heatmap(corr, square=True, annot=True, cmap="RdBu_r") #center=0, cmap="YlGnBu"
#sns.plt.show()

# http://seaborn.pydata.org/tutorial/color_palettes.html


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0xd25af28>

Outlier Analysis


In [17]:
#train[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']]
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(10, 10), sharey=True)

axes[0, 0].boxplot(train['花萼寬度'],showmeans=True)
axes[0, 0].set_title('Train: 花萼寬度')

axes[0, 1].boxplot(train['花瓣寬度'],showmeans=True)
axes[0, 1].set_title('Train: 花瓣寬度')

axes[0, 2].boxplot(train['花瓣長度'],showmeans=True)
axes[0, 2].set_title('Train: 花瓣長度')

axes[0, 3].boxplot(train['花萼長度'],showmeans=True)
axes[0, 3].set_title('Train: 花萼長度')

axes[1, 0].boxplot(test['花萼寬度'],showmeans=True)
axes[1, 0].set_title('Test: 花萼寬度')

axes[1, 1].boxplot(test['花瓣寬度'],showmeans=True)
axes[1, 1].set_title('Test: 花瓣寬度')

axes[1, 2].boxplot(test['花瓣長度'],showmeans=True)
axes[1, 2].set_title('Test: 花瓣長度')

axes[1, 3].boxplot(test['花萼長度'],showmeans=True)
axes[1, 3].set_title('Test: 花萼長度')


Out[17]:
<matplotlib.text.Text at 0xdae91d0>

In [18]:
train.plot(kind='bar',y='花萼寬度',figsize=(30,6),fontsize=14,title='花萼寬度')


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0xd68b940>

In [19]:
#IQR = Q3-Q1
IQR = np.percentile(train['花萼寬度'],75) - np.percentile(train['花萼寬度'],25)

In [20]:
# Upper outliers: values above Q3 + 1.5*IQR
train[train['花萼寬度'] > np.percentile(train['花萼寬度'],75)+1.5*IQR]


Out[20]:
id 花萼長度 花萼寬度 花瓣長度 花瓣寬度 屬種 type
5 6 5.7 4.4 1.5 0.4 Iris-setosa 1
22 23 5.2 4.1 1.5 0.1 Iris-setosa 1
23 24 5.5 4.2 1.4 0.2 Iris-setosa 1

In [21]:
# Lower outliers: values below Q1 - 1.5*IQR
train[train['花萼寬度'] < np.percentile(train['花萼寬度'],25)-1.5*IQR]


Out[21]:
id 花萼長度 花萼寬度 花瓣長度 花瓣寬度 屬種 type
40 41 5.0 2.0 3.5 1.0 Iris-versicolor 2
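
The same IQR rule can be wrapped into a small helper for any numeric column; a sketch that should reproduce the rows found in the two cells above:

def iqr_outliers(s, k=1.5):
    # return the values of s outside [Q1 - k*IQR, Q3 + k*IQR]
    q1, q3 = np.percentile(s, [25, 75])
    iqr = q3 - q1
    return s[(s < q1 - k * iqr) | (s > q3 + k * iqr)]

iqr_outliers(train['花萼寬度'])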

In [22]:
#fix_X = X.drop(X.index[[5,23,40]])
#fix_y = y.drop(y.index[[5,23,40]])

Splitting the Data (held out from the official training data)


In [23]:
# Remove the demo type 4 rows so they do not interfere with modeling
train = train[train['type']!=4]

In [24]:
from sklearn.model_selection import train_test_split

X = train[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']]
y = train['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=100)

Standardization


In [25]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [26]:
X_train_std[0:5]


Out[26]:
array([[ 0.82172682,  1.27937769,  0.47468749,  1.65481456],
       [ 1.36523117,  1.04600587, -0.44195042,  0.65747341],
       [-1.35229059, -1.2877124 ,  0.24552801, -1.33720889],
       [ 0.41409855,  0.22920447,  0.01636853,  0.03413519],
       [-1.08053842, -1.46274127,  2.07880383, -0.58920303]])
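
A quick sanity check on the scaling: each standardized training column should have mean ~0 and std ~1, while the test split is only approximately standardized because it reuses the training statistics. A sketch:

print(X_train_std.mean(axis=0).round(6))  # ~[0, 0, 0, 0]
print(X_train_std.std(axis=0).round(6))   # ~[1, 1, 1, 1]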

In [27]:
y_test[0:5]


Out[27]:
69     2
54     2
37     1
46     2
106    3
Name: type, dtype: int32

Building Baseline Models

KNN


In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

knn = KNeighborsClassifier(n_neighbors=3, weights='uniform')
knn.fit(X_train_std, y_train)

print(metrics.classification_report(y_test, knn.predict(X_test_std)))
print(metrics.confusion_matrix(y_test, knn.predict(X_test_std)))


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       0.90      0.90      0.90        10
          3       0.92      0.92      0.92        12

avg / total       0.94      0.94      0.94        36

[[14  0  0]
 [ 0  9  1]
 [ 0  1 11]]
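
n_neighbors=3 is a fixed choice here; it could instead be tuned with cross-validation. A sketch (the search range and cv=5 are illustrative choices, not from the original run):

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(KNeighborsClassifier(weights='uniform'),
                    {'n_neighbors': list(range(1, 16))}, cv=5)
grid.fit(X_train_std, y_train)
print(grid.best_params_, grid.best_score_)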

Random Forest


In [29]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=500, criterion='gini', max_features='auto', oob_score=True)
rfc.fit(X_train, y_train) # unstandardized features; tree models are scale-insensitive

print("oob_score (accuracy):", rfc.oob_score_)
print(metrics.classification_report(y_test, rfc.predict(X_test)))


oob_score (accuracy): 0.916666666667
             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.90      0.95        10
          3       0.92      1.00      0.96        12

avg / total       0.97      0.97      0.97        36
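
Beyond the OOB score, the fitted forest also exposes per-feature importances; a sketch that pairs them with the column names:

for name, imp in sorted(zip(X.columns, rfc.feature_importances_), key=lambda t: -t[1]):
    print(name, round(imp, 3))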

Naive Bayes Classifier


In [30]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_std, y_train)

print(metrics.classification_report(y_test, gnb.predict(X_test_std)))
print(metrics.confusion_matrix(y_test, gnb.predict(X_test_std)))


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.90      0.95        10
          3       0.92      1.00      0.96        12

avg / total       0.97      0.97      0.97        36

[[14  0  0]
 [ 0  9  1]
 [ 0  0 12]]

SVM


In [31]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf", probability=True)
svc.fit(X_train_std, y_train)

print(metrics.classification_report(y_test, svc.predict(X_test_std)))
print(metrics.confusion_matrix(y_test, svc.predict(X_test_std)))


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.90      0.95        10
          3       0.92      1.00      0.96        12

avg / total       0.97      0.97      0.97        36

[[14  0  0]
 [ 0  9  1]
 [ 0  0 12]]

In [32]:
#from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import xgboost as xgb

clf1 = KNeighborsClassifier(n_neighbors=3, weights='uniform')
clf2 = RandomForestClassifier(n_estimators=500, criterion='gini', max_features='auto', oob_score=True)
clf3 = GaussianNB()
clf4 = SVC(C=1.0, kernel="rbf", probability=True)
meta_clf = xgb.XGBClassifier(n_estimators= 2000, max_depth= 4)
stacking_clf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4], meta_classifier=meta_clf)

clf1.fit(X_train_std, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train_std, y_train)
clf4.fit(X_train_std, y_train)
stacking_clf.fit(X_train_std, y_train)

print('KNN Score:',clf1.score(X_test_std, y_test))
print('RF Score:',clf2.score(X_test, y_test))
print('GNB Score:',clf3.score(X_test_std, y_test))
print('SVC Score:',clf4.score(X_test_std, y_test))
print('Stacking Score:',stacking_clf.score(X_test_std, y_test))


C:\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
KNN Score: 0.944444444444
RF Score: 0.972222222222
GNB Score: 0.972222222222
SVC Score: 0.972222222222
Stacking Score: 0.972222222222
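
Note that mlxtend's StackingClassifier re-fits its base classifiers on whatever data is passed to its own fit, so the four separate fit calls above are only needed for the individual score printouts (the forest inside the stack therefore sees standardized features, which is harmless for a tree model). A sketch of comparing the models with cross-validation instead of a single held-out split (cv=5 is an illustrative choice):

from sklearn.model_selection import cross_val_score

for label, clf in [('KNN', clf1), ('GNB', clf3), ('SVC', clf4), ('Stacking', stacking_clf)]:
    print(label, round(cross_val_score(clf, X_train_std, y_train, cv=5).mean(), 3))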

XGBoost

Detailed guides:

(ENG) https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

(CHT) http://www.itread01.com/articles/1476146171.html


In [33]:
import xgboost as xgb

gbm = xgb.XGBClassifier(n_estimators= 2000, max_depth= 4).fit(X_train, y_train)

print(metrics.classification_report(y_test, gbm.predict(X_test)))
print("Score:", gbm.score(X_test, y_test))


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.90      0.95        10
          3       0.92      1.00      0.96        12

avg / total       0.97      0.97      0.97        36

Score: 0.972222222222

In [34]:
print(gbm.feature_importances_)


[ 0.23069175  0.1608462   0.53055739  0.07790463]
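
The array follows the column order of X; a one-line sketch to label it:

print(dict(zip(X.columns, gbm.feature_importances_.round(3))))  # 花萼寬度 weighs heaviest in this run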

In [35]:
from xgboost import plot_importance
plot_importance(gbm)
plt.show()



In [36]:
pred = gbm.predict(test[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']])

In [37]:
pred


Out[37]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3])

In [38]:
# Generate the submission file (using the XGBoost predictions from above)
gbm_submission = pd.DataFrame({ 'id': submission.id, 'type': pred })
gbm_submission.to_csv("submission.csv", index=False)

In [39]:
submission = pd.read_csv('submission.csv', encoding = "utf-8", dtype = {'type': np.int32})
submission


Out[39]:
id type
0 1 1
1 2 1
2 3 1
3 4 1
4 5 1
5 6 1
6 7 1
7 8 1
8 9 1
9 10 1
10 11 2
11 12 2
12 13 2
13 14 2
14 15 2
15 16 2
16 17 2
17 18 2
18 19 2
19 20 2
20 21 3
21 22 3
22 23 3
23 24 3
24 25 3
25 26 3
26 27 2
27 28 3
28 29 3
29 30 3

In [40]:
test[20:30]


Out[40]:
id 花萼長度 花萼寬度 花瓣長度 花瓣寬度
20 21 6.3 3.3 6.0 2.5
21 22 5.8 2.7 5.1 1.9
22 23 7.1 3.0 5.9 2.1
23 24 6.3 2.9 5.6 1.8
24 25 6.5 3.0 5.8 2.2
25 26 7.6 3.0 6.6 2.1
26 27 4.9 2.5 4.5 1.7
27 28 7.3 2.9 6.3 1.8
28 29 6.7 2.5 5.8 1.8
29 30 7.2 3.6 6.1 2.5

Comparing Predictions on the Test Set


In [41]:
# Scale with the scaler fitted on the training set earlier
test_std = sc.transform(test[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']])

In [42]:
submission_stk = stacking_clf.predict(test_std)
submission_stk


Out[42]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3])

In [43]:
submission_rfc = rfc.predict(test[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']])
submission_rfc


Out[43]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3])

In [44]:
submission_knn =knn.predict(test_std)
submission_knn


Out[44]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3])

In [45]:
submission_gnb = gnb.predict(test_std)
submission_gnb


Out[45]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 3, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3])

In [46]:
submission_svc = svc.predict(test_std)
submission_svc


Out[46]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3])

In [47]:
from sklearn.ensemble import VotingClassifier
clf1 = knn
clf2 = rfc
clf3 = gnb
clf4 = svc

eclf = VotingClassifier(estimators=[('knn', clf1), ('rfc', clf2),('gnb', clf3),('svc',clf4)], voting='hard', weights=[1, 1, 1, 4])
eclf.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, eclf.predict(X_test_std)))


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.90      0.95        10
          3       0.92      1.00      0.96        12

avg / total       0.97      0.97      0.97        36
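
Since every base model exposes predict_proba (the SVC was built with probability=True), a soft-voting variant that averages class probabilities is a small change; a sketch:

eclf_soft = VotingClassifier(estimators=[('knn', clf1), ('rfc', clf2), ('gnb', clf3), ('svc', clf4)],
                             voting='soft', weights=[1, 1, 1, 4])
eclf_soft.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, eclf_soft.predict(X_test_std)))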


In [48]:
submission_eclf = eclf.predict(test_std)
submission_eclf


Out[48]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3])

In [ ]: