In [41]:
import pandas as pd
import numpy as np
import sklearn as sk

In [42]:
dftrain = pd.read_csv('dftrain.csv',delimiter=',')

In [43]:
dftrain.head()


Out[43]:
id water_availability terrain num_private region_code district_code population water_quality quality_group quantity_group source_type source_class waterpoint_type wp_age daysdiff payment status_group
0 69572 1 2 0 11 5 109 1 1 1 1 1 1 18 2225.0 1 1
1 8776 0 2 0 20 2 280 1 1 2 2 2 1 7 1502.0 2 1
2 34310 1 1 0 21 4 250 1 1 1 3 2 2 8 1511.0 3 1
3 67743 0 0 0 90 63 58 1 1 3 4 1 2 31 1539.0 2 2
4 19728 0 0 0 18 1 0 1 1 4 2 2 1 2017 2104.0 2 1

In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

In [45]:
features = list(dftrain.columns)
print(features)


['id', 'water_availability', 'terrain', 'num_private', 'region_code', 'district_code', 'population', 'water_quality', 'quality_group', 'quantity_group', 'source_type', 'source_class', 'waterpoint_type', 'wp_age', 'daysdiff', 'payment', 'status_group']

In [51]:
selfeats = features[1:16]  # every feature except id and status_group

In [52]:
trainfeats = dftrain[selfeats]
labels = dftrain['status_group']

In [53]:
clf = ExtraTreesClassifier()
clf = clf.fit(trainfeats,labels)

In [54]:
clf.feature_importances_


Out[54]:
array([ 0.03329123,  0.01796841,  0.00338161,  0.06100113,  0.07694766,
        0.12684107,  0.01593033,  0.0176505 ,  0.13753921,  0.04741611,
        0.0115643 ,  0.09086236,  0.11472087,  0.20008682,  0.0447984 ])
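
The raw importance array is easier to read when each value is paired with its column name. A minimal sketch, reusing the clf and selfeats objects from the cells above:

for name, imp in sorted(zip(selfeats, clf.feature_importances_), key=lambda t: t[1], reverse=True):
    print('%s: %.4f' % (name, imp))

On the array above, daysdiff, quantity_group, population, and wp_age dominate the ranking.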

In [55]:
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(trainfeats)
print(model.get_support())
X_new.shape


[False False False False  True  True False False  True False False  True
  True  True False]
Out[55]:
(47520, 6)
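
Rather than decoding the boolean mask by eye, the selected columns can be recovered by name (a small sketch; on the mask above this yields district_code, population, quantity_group, waterpoint_type, wp_age, and daysdiff):

kept = [f for f, keep in zip(selfeats, model.get_support()) if keep]
print(kept)

These are the same six columns selected in the validation and test cells further down.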

In [56]:
clf2 = DecisionTreeClassifier()
clf2 = clf2.fit(X_new,labels)

In [57]:
dfvalidation = pd.read_csv('validation.csv',delimiter=',')

In [58]:
dfvalidation.head()


Out[58]:
id water_availability terrain num_private region_code district_code population water_quality quality_group quantity_group source_type source_class waterpoint_type wp_age daysdiff payment status_group
0 45078 0 2 0 19 4 0 1 1 1 5 1 3 2017 2080.0 2 1
1 31768 1 2 0 80 13 860 1 1 1 5 1 3 32 1546.0 3 1
2 36242 0 2 0 3 4 30 1 1 1 1 1 0 41 1504.0 2 3
3 18510 1 0 0 4 8 40 1 1 2 1 1 1 27 2223.0 6 1
4 45518 0 0 0 12 6 0 1 1 1 6 2 1 2017 2108.0 2 1

In [59]:
dfvaltest = dfvalidation[['id', 'water_availability', 'terrain', 'num_private', 'region_code', 'district_code', 'population', 'water_quality', 'quality_group', 'quantity_group', 'source_type', 'source_class', 'waterpoint_type', 'wp_age', 'daysdiff','payment']].copy()

In [60]:
dfvaltest.head()


Out[60]:
id water_availability terrain num_private region_code district_code population water_quality quality_group quantity_group source_type source_class waterpoint_type wp_age daysdiff payment
0 45078 0 2 0 19 4 0 1 1 1 5 1 3 2017 2080.0 2
1 31768 1 2 0 80 13 860 1 1 1 5 1 3 32 1546.0 3
2 36242 0 2 0 3 4 30 1 1 1 1 1 0 41 1504.0 2
3 18510 1 0 0 4 8 40 1 1 2 1 1 1 27 2223.0 6
4 45518 0 0 0 12 6 0 1 1 1 6 2 1 2017 2108.0 2

In [61]:
selfeats = ['district_code', 'population', 'quantity_group', 'waterpoint_type', 'wp_age', 'daysdiff']
valfeats = dfvaltest[selfeats]

In [62]:
dfvaltest['prediction'] = clf2.predict(valfeats)

In [63]:
dfvaltest.head()


Out[63]:
id water_availability terrain num_private region_code district_code population water_quality quality_group quantity_group source_type source_class waterpoint_type wp_age daysdiff payment prediction
0 45078 0 2 0 19 4 0 1 1 1 5 1 3 2017 2080.0 2 1
1 31768 1 2 0 80 13 860 1 1 1 5 1 3 32 1546.0 3 1
2 36242 0 2 0 3 4 30 1 1 1 1 1 0 41 1504.0 2 3
3 18510 1 0 0 4 8 40 1 1 2 1 1 1 27 2223.0 6 1
4 45518 0 0 0 12 6 0 1 1 1 6 2 1 2017 2108.0 2 1

In [64]:
dfvaltest['status_group'] = dfvalidation['status_group']

# Let's check the accuracy score.


In [65]:
from sklearn.metrics import accuracy_score

In [66]:
y_pred = dfvaltest['prediction']
y_true = dfvaltest['status_group']

In [67]:
accuracy_score(y_true,y_pred)


Out[67]:
0.74705387205387208

# Open the test dataframe.


In [68]:
dftest = pd.read_csv('testsetfix.csv',delimiter=',')

In [69]:
dftest.head()


Out[69]:
id water_availability terrain num_private region_code district_code population water_quality quality_group quantity_group source_type source_class waterpoint_type wp_age daysdiff payment
0 50785 0 2 0 21 3 321 1 1 4 2 2 0 5 1532.0 2
1 51630 0 2 0 2 2 300 1 1 2 1 1 1 17 1532.0 2
2 17168 0 2 0 13 2 500 1 1 2 2 2 0 7 1535.0 2
3 45559 0 0 0 80 43 250 1 1 3 5 1 0 30 1545.0 0
4 49871 1 2 0 10 3 60 1 1 1 1 1 1 17 1481.0 6

In [70]:
selfeat_p = ['district_code', 'population', 'quantity_group', 'waterpoint_type', 'wp_age', 'daysdiff']
predfeats = dftest[selfeat_p]

In [71]:
dftest['status_group'] = clf2.predict(predfeats)

In [72]:
dftest.head()


Out[72]:
id water_availability terrain num_private region_code district_code population water_quality quality_group quantity_group source_type source_class waterpoint_type wp_age daysdiff payment status_group
0 50785 0 2 0 21 3 321 1 1 4 2 2 0 5 1532.0 2 2
1 51630 0 2 0 2 2 300 1 1 2 1 1 1 17 1532.0 2 1
2 17168 0 2 0 13 2 500 1 1 2 2 2 0 7 1535.0 2 1
3 45559 0 0 0 80 43 250 1 1 3 5 1 0 30 1545.0 0 2
4 49871 1 2 0 10 3 60 1 1 1 1 1 1 17 1481.0 6 1

In [73]:
dftest.loc[dftest['status_group']==3]


Out[73]:
id water_availability terrain num_private region_code district_code population water_quality quality_group quantity_group source_type source_class waterpoint_type wp_age daysdiff payment status_group
8 36301 1 1 0 90 33 40 1 1 2 1 1 1 20 1544.0 3 3
18 28330 0 0 0 17 5 0 1 1 4 2 2 1 2017 1653.0 0 3
34 37034 1 2 0 11 4 360 1 1 1 6 2 1 39 2212.0 6 3
37 707 1 2 3 3 2 45 1 1 1 1 1 1 45 1514.0 6 3
38 48489 0 0 0 5 6 1 1 1 2 6 2 1 33 2240.0 2 3
56 17853 0 2 0 19 1 1200 1 1 2 1 1 4 18 2079.0 2 3
71 8869 1 0 0 10 1 1 1 1 1 6 2 1 31 1533.0 6 3
87 20773 0 0 0 1 1 0 1 1 1 4 1 1 2017 2209.0 2 3
135 30309 0 2 0 17 1 250 1 1 1 5 1 3 19 1537.0 2 3
137 3496 0 0 0 5 1 189 2 2 1 4 1 1 12 2190.0 2 3
141 74104 0 0 0 1 4 0 1 1 2 4 1 2 2017 2225.0 0 3
192 13323 0 0 0 19 2 0 2 2 2 5 1 3 2017 2062.0 2 3
198 52145 0 0 0 90 33 40 1 1 1 1 1 1 22 1524.0 2 3
236 14853 0 0 0 12 1 0 1 1 1 4 1 1 2017 2207.0 3 3
249 72139 1 2 0 21 5 1800 1 1 1 4 1 2 12 1519.0 3 3
266 39300 0 0 0 1 1 0 1 1 1 1 1 1 2017 2191.0 2 3
274 67682 0 2 0 2 2 150 1 1 1 1 1 1 12 1483.0 0 3
281 64903 0 2 0 2 1 1 1 1 4 2 2 1 17 1487.0 2 3
294 19450 1 0 0 5 3 200 1 1 2 6 2 1 14 2210.0 6 3
297 23254 1 1 0 10 1 1 1 1 1 4 1 2 6 1523.0 3 3
332 70288 0 0 0 1 1 0 1 1 1 4 1 1 2017 2191.0 2 3
356 38301 0 2 0 16 1 350 1 1 1 1 1 1 5 1528.0 0 3
359 61415 0 2 0 19 1 300 1 1 2 5 1 3 23 2091.0 2 3
372 68162 0 2 0 16 1 450 1 1 4 2 2 1 12 1529.0 0 3
378 45590 0 0 0 12 7 0 1 1 1 6 2 1 2017 2208.0 1 3
409 51000 0 0 0 18 30 0 1 1 2 1 1 1 2017 2090.0 2 3
437 55279 0 0 0 17 3 0 1 1 4 2 2 0 2017 1651.0 2 3
440 1998 0 0 0 18 2 0 1 1 1 1 1 1 2017 2097.0 2 3
490 31787 0 2 0 19 1 350 1 1 2 5 1 0 13 2086.0 2 3
496 3138 0 0 0 17 2 0 1 1 4 2 2 1 2017 1624.0 5 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14370 47689 1 2 0 2 7 180 1 1 2 6 2 1 53 1505.0 3 3
14385 46851 0 2 0 5 1 150 1 1 1 4 1 1 45 2209.0 2 3
14400 17105 0 0 0 12 4 0 1 1 2 6 2 1 2017 2082.0 2 3
14409 51525 1 2 0 16 2 410 1 1 1 6 2 2 16 1533.0 6 3
14418 50629 0 2 0 5 1 150 1 1 1 1 1 1 7 2211.0 2 3
14478 57348 0 0 0 1 1 0 1 1 2 1 1 1 2017 2191.0 2 3
14486 55433 0 0 0 5 6 200 1 1 1 6 2 1 50 2237.0 2 3
14490 4371 1 2 0 16 2 310 1 1 1 6 2 1 17 1528.0 6 3
14531 14280 0 2 0 3 1 1 1 1 2 1 1 1 10 1503.0 6 3
14559 41653 0 2 0 13 2 1 1 1 2 4 1 1 57 1550.0 0 3
14563 63295 0 2 0 3 4 1 1 1 1 1 1 0 47 1486.0 6 3
14574 15834 0 2 0 11 4 50 1 1 1 5 1 3 14 2239.0 2 3
14608 28800 0 0 0 17 1 0 1 1 2 5 1 3 2017 1548.0 2 3
14624 26244 0 0 0 12 7 0 5 5 4 6 2 1 2017 2192.0 2 3
14636 31723 1 1 0 16 3 172 1 1 1 6 2 2 8 1540.0 6 3
14659 17950 1 0 0 5 3 107 1 1 1 6 2 1 22 2229.0 6 3
14710 56401 0 0 0 12 7 0 5 5 4 6 2 1 2017 2192.0 2 3
14717 44895 1 1 0 15 1 280 1 1 1 1 1 1 30 2087.0 1 3
14729 12363 0 1 0 10 5 500 1 1 0 4 1 0 6 1510.0 0 3
14738 71022 0 0 0 17 5 0 1 1 1 5 1 0 2017 1640.0 0 3
14741 51268 0 0 0 18 1 0 1 1 1 1 1 1 2017 2090.0 2 3
14742 65296 0 0 0 3 3 603 1 1 1 6 2 1 8 1413.0 2 3
14743 49042 0 0 0 12 1 0 1 1 1 4 1 1 2017 2210.0 2 3
14745 47336 1 2 0 5 1 250 1 1 1 1 1 1 11 2186.0 3 3
14746 47648 0 0 0 12 6 0 1 1 1 5 1 3 2017 2197.0 4 3
14775 49339 0 0 0 17 8 0 2 2 1 5 1 0 2017 1640.0 3 3
14783 36768 0 2 0 15 2 100 1 1 2 4 1 3 32 2068.0 2 3
14787 26494 0 2 0 20 1 1200 2 2 1 5 1 0 23 1598.0 0 3
14834 12592 0 0 0 1 1 0 1 1 2 1 1 1 2017 2212.0 6 3
14842 57731 1 1 0 16 3 230 1 1 1 6 2 2 8 1540.0 6 3

860 rows × 17 columns


In [94]:
submission = dftest[['id','status_group']].copy()

In [95]:
submission['status_group'] = submission['status_group'].replace(
    {1: 'functional', 2: 'non functional', 3: 'functional needs repair'})

In [96]:
submission.to_csv('submission6baggedDT_77.csv',sep=',',header=True,index=False)

In [97]:
print "done!"


done!

In [78]:
from sklearn.ensemble import BaggingClassifier

In [126]:
clf3 = BaggingClassifier(n_estimators=50)
clf3 = clf3.fit(X_new,labels)

In [127]:
dftest['status_group'] = clf3.predict(predfeats)

In [128]:
dfvaltest['prediction'] = clf3.predict(valfeats)

In [129]:
# refresh y_pred so the score reflects clf3's predictions rather than the earlier column object
y_pred = dfvaltest['prediction']
accuracy_score(y_true, y_pred)


Out[129]:
0.76986531986531992

Done! Conclusion: the decision tree is an unstable ML algorithm (its prediction accuracy can change substantially when the training data changes), and its accuracy can be improved with bagging (bootstrap aggregating): drawing sample subsets into many "bags", fitting a classifier on each bag, and then using the aggregated prediction. The aim is to reduce variance. The number of bags, n_estimators, has to be tuned; in this case 50 turned out to be optimal, raising validation accuracy to 0.77 from the 0.74 of the plain decision tree. Further improvement will need a different approach.
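
One way to find that optimum is a simple sweep over candidate n_estimators values, scoring each bagged model on the validation set. A sketch, assuming the candidate list below and reusing X_new, labels, valfeats, and y_true from the cells above:

for n in [10, 25, 50, 100]:
    bag = BaggingClassifier(n_estimators=n, random_state=0).fit(X_new, labels)
    # larger n usually lowers variance but costs training time
    print('n_estimators=%d  accuracy=%.4f' % (n, accuracy_score(y_true, bag.predict(valfeats))))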

That next approach is probably a return to feature engineering: building new features (columns) that capture distinctive characteristics or patterns in the data. (By analogy: a feature here is like a moustache or the shape of the lips when classifying faces as male/female.)
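
As one concrete, hypothetical example: wp_age is 2017 in many rows above, which looks like a placeholder for an unknown construction year. A binary flag (the name age_unknown is invented here) would let a tree treat those rows separately:

for df in (dftrain, dfvalidation, dftest):
    # 1 where the construction year appears to be missing, 0 otherwise (assumption: 2017 is a sentinel)
    df['age_unknown'] = (df['wp_age'] == 2017).astype(int)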


In [ ]: