In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.family']='SimHei' # display Chinese characters

%matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')

Data Loading and Exploration


In [3]:
# Load the train/test/submission datasets
train = pd.read_csv('input/train.csv', encoding = "utf-8", dtype = {'type': np.int32})
test = pd.read_csv('input/test.csv', encoding = "utf-8")
submission = pd.read_csv('input/submission.csv', encoding = "utf-8", dtype = {'type': np.int32})

In [4]:
train.head(3)


Out[4]:
id 花萼長度 花萼寬度 花瓣長度 花瓣寬度 屬種 type
0 1 5.4 3.7 1.5 0.2 Iris-setosa 1
1 2 4.8 3.4 1.6 0.2 Iris-setosa 1
2 3 4.8 3.0 1.4 0.1 Iris-setosa 1

In [5]:
test.head(3)


Out[5]:
id 花萼長度 花萼寬度 花瓣長度 花瓣寬度
0 1 5.1 3.5 1.4 0.2
1 2 4.9 3.0 1.4 0.2
2 3 4.7 3.2 1.3 0.2

In [6]:
submission.head(3)


Out[6]:
id type
0 1 1
1 2 1
2 3 1

One-hot Encoding


In [7]:
df1 = pd.get_dummies(train['屬種'])
df1.sample(5)


Out[7]:
Iris-new_type Iris-setosa Iris-versicolor Iris-virginica
116 0 0 0 1
121 1 0 0 0
113 0 0 0 1
55 0 0 1 0
42 0 0 1 0
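
If the dummy columns are to be used as model features, they are usually concatenated back onto the frame and the original categorical column dropped. A minimal sketch of that step (not run in this notebook):

train_ohe = pd.concat([train.drop('屬種', axis=1), df1], axis=1)  # numeric features + dummies
train_ohe.head(3)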

Label Encoding


In [8]:
df2 = train['屬種'].replace({'Iris-setosa':1,'Iris-versicolor':2,'Iris-virginica':3})
df2.sample(5)


Out[8]:
113    3
93     3
94     3
6      1
32     1
Name: 屬種, dtype: object
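
Note the object dtype above: 'Iris-new_type' is absent from the replacement map, so those rows keep their string value. A sketch of the same idea with scikit-learn's LabelEncoder, which assigns an integer to every category it sees (classes are sorted alphabetically):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df2_le = pd.Series(le.fit_transform(train['屬種']), index=train.index)
print(le.classes_)  # the category-to-integer mapping, in sorted order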

Data Cleaning - Missing Value Handling


In [9]:
#missing data
miss_sum = train.isnull().sum().sort_values(ascending=False)
miss_sum


Out[9]:
花萼寬度    1
花萼長度    1
type    0
屬種      0
花瓣寬度    0
花瓣長度    0
id      0
dtype: int64

In [10]:
# Find the rows that contain missing values
print(train[train['花萼寬度'].isnull()])
print("--------------------------------")
print(train[train['花萼長度'].isnull()])


      id  花萼長度  花萼寬度  花瓣長度  花瓣寬度             屬種  type
121  122   5.2   NaN   5.1   1.8  Iris-new_type     4
--------------------------------
      id  花萼長度  花萼寬度  花瓣長度  花瓣寬度             屬種  type
120  121   NaN   3.0   4.9   1.2  Iris-new_type     4

In [11]:
# Drop the NaN rows directly (fine when they are few and dropping them does not affect modeling)
train_d_na = train.dropna().reset_index(drop=True)
train_d_na.isnull().sum().sort_values(ascending=False)


Out[11]:
type    0
屬種      0
花瓣寬度    0
花瓣長度    0
花萼寬度    0
花萼長度    0
id      0
dtype: int64

In [12]:
# Fill the missing value with the mean
#train.loc[train['花萼寬度'].isnull(),['花萼寬度']] = train['花萼寬度'].mean() # 花萼寬度: column 2
train['花萼寬度'] = train['花萼寬度'].fillna(train['花萼寬度'].mean())

train.plot(kind='line',y='花萼寬度',figsize=(10,6),fontsize=14,title='花萼寬度')


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0xa30df98>
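
If the test set also had gaps, the fill values should come from training-set statistics so that no test information leaks into preprocessing. A sketch of that pattern:

feature_cols = ['花瓣寬度','花瓣長度','花萼寬度','花萼長度']
train_means = train[feature_cols].mean()
test[feature_cols] = test[feature_cols].fillna(train_means)  # fill any test gaps with train means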

In [13]:
# Fill the missing value with the mode
#train.loc[train['花萼長度'].isnull(),['花萼長度']] = train['花萼長度'].mode()[0] # 花萼長度: column 1
train['花萼長度'] = train['花萼長度'].fillna(train['花萼長度'].mode()[0])

train.plot(kind='line',y='花萼長度',figsize=(10,6),fontsize=14,title='花萼長度')


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0xa426908>

In [14]:
from pandas.plotting import scatter_matrix
scatter_matrix( train[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']],figsize=(10, 10),color='b')


Out[14]:
<4x4 array of matplotlib AxesSubplot objects (scatter-matrix grid)>

Correlation Analysis


In [15]:
corr = train[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']].corr()
print(corr)


          花瓣寬度      花瓣長度      花萼寬度      花萼長度
花瓣寬度  1.000000  0.958595 -0.367957  0.810219
花瓣長度  0.958595  1.000000 -0.436213  0.850649
花萼寬度 -0.367957 -0.436213  1.000000 -0.130174
花萼長度  0.810219  0.850649 -0.130174  1.000000

In [16]:
import seaborn as sns
plt.rcParams['font.family']='DFKai-SB' # display Chinese characters
plt.figure(figsize=(10,10))
sns.heatmap(corr, square=True, annot=True, cmap="RdBu_r") #center=0, cmap="YlGnBu"
#sns.plt.show()

# http://seaborn.pydata.org/tutorial/color_palettes.html


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0xd25af28>

Outlier Analysis


In [17]:
#train[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']]
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(10, 10), sharey=True)

axes[0, 0].boxplot(train['花萼寬度'],showmeans=True)
axes[0, 0].set_title('Train: 花萼寬度')

axes[0, 1].boxplot(train['花瓣寬度'],showmeans=True)
axes[0, 1].set_title('Train: 花瓣寬度')

axes[0, 2].boxplot(train['花瓣長度'],showmeans=True)
axes[0, 2].set_title('Train: 花瓣長度')

axes[0, 3].boxplot(train['花萼長度'],showmeans=True)
axes[0, 3].set_title('Train: 花萼長度')

axes[1, 0].boxplot(test['花萼寬度'],showmeans=True)
axes[1, 0].set_title('Test: 花萼寬度')

axes[1, 1].boxplot(test['花瓣寬度'],showmeans=True)
axes[1, 1].set_title('Test: 花瓣寬度')

axes[1, 2].boxplot(test['花瓣長度'],showmeans=True)
axes[1, 2].set_title('Test: 花瓣長度')

axes[1, 3].boxplot(test['花萼長度'],showmeans=True)
axes[1, 3].set_title('Test: 花萼長度')


Out[17]:
<matplotlib.text.Text at 0xdae91d0>

In [18]:
train.plot(kind='bar',y='花萼寬度',figsize=(30,6),fontsize=14,title='花萼寬度')


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0xd68b940>

In [19]:
#IQR = Q3-Q1
IQR = np.percentile(train['花萼寬度'],75) - np.percentile(train['花萼寬度'],25)

In [20]:
# Upper outliers: values above Q3 + 1.5*IQR
train[train['花萼寬度'] > np.percentile(train['花萼寬度'],75)+1.5*IQR]


Out[20]:
id 花萼長度 花萼寬度 花瓣長度 花瓣寬度 屬種 type
5 6 5.7 4.4 1.5 0.4 Iris-setosa 1
22 23 5.2 4.1 1.5 0.1 Iris-setosa 1
23 24 5.5 4.2 1.4 0.2 Iris-setosa 1

In [21]:
# Lower outliers: values below Q1 - 1.5*IQR
train[train['花萼寬度'] < np.percentile(train['花萼寬度'],25)-1.5*IQR]


Out[21]:
id 花萼長度 花萼寬度 花瓣長度 花瓣寬度 屬種 type
40 41 5.0 2.0 3.5 1.0 Iris-versicolor 2
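
The same IQR rule can be wrapped into a small helper for any numeric column; a sketch that should reproduce the rows found in the two cells above:

def iqr_outliers(s, k=1.5):
    # return the values of s outside [Q1 - k*IQR, Q3 + k*IQR]
    q1, q3 = np.percentile(s, [25, 75])
    iqr = q3 - q1
    return s[(s < q1 - k * iqr) | (s > q3 + k * iqr)]

iqr_outliers(train['花萼寬度'])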

In [22]:
#fix_X = X.drop(X.index[[5,23,40]])
#fix_y = y.drop(y.index[[5,23,40]])

Splitting the Data (held out from the official training data)


In [23]:
# Remove the demo type 4 rows so they do not interfere with modeling
train = train[train['type']!=4]

In [24]:
from sklearn.model_selection import train_test_split

X = train[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']]
y = train['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=100)

Standardization


In [25]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [26]:
X_train_std[0:5]


Out[26]:
array([[ 0.82172682,  1.27937769,  0.47468749,  1.65481456],
       [ 1.36523117,  1.04600587, -0.44195042,  0.65747341],
       [-1.35229059, -1.2877124 ,  0.24552801, -1.33720889],
       [ 0.41409855,  0.22920447,  0.01636853,  0.03413519],
       [-1.08053842, -1.46274127,  2.07880383, -0.58920303]])
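
A quick sanity check on the scaling: each standardized training column should have mean ~0 and std ~1, while the test split is only approximately standardized because it reuses the training statistics. A sketch:

print(X_train_std.mean(axis=0).round(6))  # ~[0, 0, 0, 0]
print(X_train_std.std(axis=0).round(6))   # ~[1, 1, 1, 1]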

In [27]:
y_test[0:5]


Out[27]:
69     2
54     2
37     1
46     2
106    3
Name: type, dtype: int32

Building Baseline Models

KNN


In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

knn = KNeighborsClassifier(n_neighbors=3, weights='uniform')
knn.fit(X_train_std, y_train)

print(metrics.classification_report(y_test, knn.predict(X_test_std)))
print(metrics.confusion_matrix(y_test, knn.predict(X_test_std)))


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       0.90      0.90      0.90        10
          3       0.92      0.92      0.92        12

avg / total       0.94      0.94      0.94        36

[[14  0  0]
 [ 0  9  1]
 [ 0  1 11]]
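
n_neighbors=3 is a fixed choice here; it could instead be tuned with cross-validation. A sketch (the search range and cv=5 are illustrative choices, not from the original run):

from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(KNeighborsClassifier(weights='uniform'),
                    {'n_neighbors': list(range(1, 16))}, cv=5)
grid.fit(X_train_std, y_train)
print(grid.best_params_, grid.best_score_)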

Random Forest


In [29]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=500, criterion='gini', max_features='auto', oob_score=True)
rfc.fit(X_train, y_train) # unstandardized features; tree models are scale-insensitive

print("oob_score (accuracy):", rfc.oob_score_)
print(metrics.classification_report(y_test, rfc.predict(X_test)))


oob_score (accuracy): 0.916666666667
             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.90      0.95        10
          3       0.92      1.00      0.96        12

avg / total       0.97      0.97      0.97        36
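
Beyond the OOB score, the fitted forest also exposes per-feature importances; a sketch that pairs them with the column names:

for name, imp in sorted(zip(X.columns, rfc.feature_importances_), key=lambda t: -t[1]):
    print(name, round(imp, 3))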

Naive Bayes Classifier


In [30]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train_std, y_train)

print(metrics.classification_report(y_test, gnb.predict(X_test_std)))
print(metrics.confusion_matrix(y_test, gnb.predict(X_test_std)))


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.90      0.95        10
          3       0.92      1.00      0.96        12

avg / total       0.97      0.97      0.97        36

[[14  0  0]
 [ 0  9  1]
 [ 0  0 12]]

SVM


In [31]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf", probability=True)
svc.fit(X_train_std, y_train)

print(metrics.classification_report(y_test, svc.predict(X_test_std)))
print(metrics.confusion_matrix(y_test, svc.predict(X_test_std)))


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.90      0.95        10
          3       0.92      1.00      0.96        12

avg / total       0.97      0.97      0.97        36

[[14  0  0]
 [ 0  9  1]
 [ 0  0 12]]

In [32]:
#from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import xgboost as xgb

clf1 = KNeighborsClassifier(n_neighbors=3, weights='uniform')
clf2 = RandomForestClassifier(n_estimators=500, criterion='gini', max_features='auto', oob_score=True)
clf3 = GaussianNB()
clf4 = SVC(C=1.0, kernel="rbf", probability=True)
meta_clf = xgb.XGBClassifier(n_estimators= 2000, max_depth= 4)
stacking_clf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4], meta_classifier=meta_clf)

clf1.fit(X_train_std, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train_std, y_train)
clf4.fit(X_train_std, y_train)
stacking_clf.fit(X_train_std, y_train)

print('KNN Score:',clf1.score(X_test_std, y_test))
print('RF Score:',clf2.score(X_test, y_test))
print('GNB Score:',clf3.score(X_test_std, y_test))
print('SVC Score:',clf4.score(X_test_std, y_test))
print('Stacking Score:',stacking_clf.score(X_test_std, y_test))


C:\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
KNN Score: 0.944444444444
RF Score: 0.972222222222
GNB Score: 0.972222222222
SVC Score: 0.972222222222
Stacking Score: 0.972222222222
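
Note that mlxtend's StackingClassifier re-fits its base classifiers on whatever data is passed to its own fit, so the four separate fit calls above are only needed for the individual score printouts (the forest inside the stack therefore sees standardized features, which is harmless for a tree model). A sketch of comparing the models with cross-validation instead of a single held-out split (cv=5 is an illustrative choice):

from sklearn.model_selection import cross_val_score

for label, clf in [('KNN', clf1), ('GNB', clf3), ('SVC', clf4), ('Stacking', stacking_clf)]:
    print(label, round(cross_val_score(clf, X_train_std, y_train, cv=5).mean(), 3))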

XGBoost

Detailed guides:

(ENG) https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

(CHT) http://www.itread01.com/articles/1476146171.html


In [33]:
import xgboost as xgb

gbm = xgb.XGBClassifier(n_estimators= 2000, max_depth= 4).fit(X_train, y_train)

print(metrics.classification_report(y_test, gbm.predict(X_test)))
print("Score:", gbm.score(X_test, y_test))


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.90      0.95        10
          3       0.92      1.00      0.96        12

avg / total       0.97      0.97      0.97        36

Score: 0.972222222222

In [34]:
print(gbm.feature_importances_)


[ 0.23069175  0.1608462   0.53055739  0.07790463]
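
The array follows the column order of X; a one-line sketch to label it:

print(dict(zip(X.columns, gbm.feature_importances_.round(3))))  # 花萼寬度 weighs heaviest in this run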

In [35]:
from xgboost import plot_importance
plot_importance(gbm)
plt.show()



In [36]:
pred = gbm.predict(test[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']])

In [37]:
pred


Out[37]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3])

In [38]:
# Generate the submission file (using the XGBoost predictions from above)
gbm_submission = pd.DataFrame({ 'id': submission.id, 'type': pred })
gbm_submission.to_csv("submission.csv", index=False)

In [39]:
submission = pd.read_csv('submission.csv', encoding = "utf-8", dtype = {'type': np.int32})
submission


Out[39]:
id type
0 1 1
1 2 1
2 3 1
3 4 1
4 5 1
5 6 1
6 7 1
7 8 1
8 9 1
9 10 1
10 11 2
11 12 2
12 13 2
13 14 2
14 15 2
15 16 2
16 17 2
17 18 2
18 19 2
19 20 2
20 21 3
21 22 3
22 23 3
23 24 3
24 25 3
25 26 3
26 27 2
27 28 3
28 29 3
29 30 3

In [40]:
test[20:30]


Out[40]:
id 花萼長度 花萼寬度 花瓣長度 花瓣寬度
20 21 6.3 3.3 6.0 2.5
21 22 5.8 2.7 5.1 1.9
22 23 7.1 3.0 5.9 2.1
23 24 6.3 2.9 5.6 1.8
24 25 6.5 3.0 5.8 2.2
25 26 7.6 3.0 6.6 2.1
26 27 4.9 2.5 4.5 1.7
27 28 7.3 2.9 6.3 1.8
28 29 6.7 2.5 5.8 1.8
29 30 7.2 3.6 6.1 2.5

Comparing Predictions on the Test Set


In [41]:
# Scale with the scaler fitted on the training set earlier
test_std = sc.transform(test[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']])

In [42]:
submission_stk = stacking_clf.predict(test_std)
submission_stk


Out[42]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3])

In [43]:
submission_rfc = rfc.predict(test[['花瓣寬度','花瓣長度','花萼寬度','花萼長度']])
submission_rfc


Out[43]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3])

In [44]:
submission_knn =knn.predict(test_std)
submission_knn


Out[44]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3])

In [45]:
submission_gnb = gnb.predict(test_std)
submission_gnb


Out[45]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 3, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3])

In [46]:
submission_svc = svc.predict(test_std)
submission_svc


Out[46]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3])

In [47]:
from sklearn.ensemble import VotingClassifier
clf1 = knn
clf2 = rfc
clf3 = gnb
clf4 = svc

eclf = VotingClassifier(estimators=[('knn', clf1), ('rfc', clf2),('gnb', clf3),('svc',clf4)], voting='hard', weights=[1, 1, 1, 4])
eclf.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, eclf.predict(X_test_std)))


             precision    recall  f1-score   support

          1       1.00      1.00      1.00        14
          2       1.00      0.90      0.95        10
          3       0.92      1.00      0.96        12

avg / total       0.97      0.97      0.97        36
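
Since every base model exposes predict_proba (the SVC was built with probability=True), a soft-voting variant that averages class probabilities is a small change; a sketch:

eclf_soft = VotingClassifier(estimators=[('knn', clf1), ('rfc', clf2), ('gnb', clf3), ('svc', clf4)],
                             voting='soft', weights=[1, 1, 1, 4])
eclf_soft.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, eclf_soft.predict(X_test_std)))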


In [48]:
submission_eclf = eclf.predict(test_std)
submission_eclf


Out[48]:
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3])

In [ ]: