In [41]:
import pandas as pd
import numpy as np
import sklearn as sk
In [42]:
# Load the training set; ',' is read_csv's default separator (sep and delimiter are aliases).
dftrain = pd.read_csv('dftrain.csv', sep=',')
In [43]:
# Peek at the first five rows of the training set (rendered via the notebook's rich display).
dftrain.head()
Out[43]:
In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
In [45]:
# Collect every column name of the training set.
features = list(dftrain.columns)
# Use the print() call form: it behaves identically on Python 2 (parenthesised
# expression) and Python 3, and matches the print() style used later in this
# notebook — the bare Py2 statement form is a SyntaxError on Py3.
print(features)
In [51]:
# Candidate features are columns 1..15; column 0 is deliberately excluded
# (presumably an 'id' column — TODO confirm against dftrain.csv).
# A slice replaces the fifteen hand-written index lookups of the original and
# is equivalent, shorter, and robust to typos.
selfeats = features[1:16]
In [52]:
# Feature matrix and target vector used to fit the tree models below.
trainfeats = dftrain.loc[:, selfeats]
labels = dftrain.loc[:, 'status_group']
In [53]:
# Fit an extra-trees ensemble, used only to rank feature importances.
# NOTE(review): no random_state is set, so the importances (and hence the
# selected feature subset) can vary between runs — confirm whether
# reproducibility matters here.
clf = ExtraTreesClassifier()
clf.fit(trainfeats, labels)
In [54]:
# Impurity-based importance of each column of trainfeats (cell output).
clf.feature_importances_
Out[54]:
In [55]:
# Reduce trainfeats to the columns whose importance clears SelectFromModel's
# default threshold (the mean importance), reusing the already-fitted
# ensemble (prefit=True).
model = SelectFromModel(clf, prefit=True)
X_new = model.transform(trainfeats)
# Boolean mask of the kept columns. NOTE(review): later cells appear to
# hard-code the positions of the True entries — keep this printout in sync
# with any change to the selection.
print(model.get_support())
X_new.shape
Out[55]:
In [56]:
# Train a single (unseeded) decision tree on the reduced feature matrix.
clf2 = DecisionTreeClassifier().fit(X_new, labels)
In [57]:
# Load the held-out validation split; ',' is read_csv's default separator.
dfvalidation = pd.read_csv('validation.csv', sep=',')
In [58]:
# Quick look at the first rows of the validation split.
dfvalidation.head()
Out[58]:
In [59]:
# Work on an explicit copy of the candidate columns so the prediction columns
# added later cannot raise chained-assignment warnings against dfvalidation.
dfvaltest = dfvalidation[[
    'id', 'water_availability', 'terrain', 'num_private', 'region_code',
    'district_code', 'population', 'water_quality', 'quality_group',
    'quantity_group', 'source_type', 'source_class', 'waterpoint_type',
    'wp_age', 'daysdiff', 'payment',
]].copy()
In [60]:
# Sanity-check the copied validation columns.
dfvaltest.head()
Out[60]:
In [61]:
# Rebuild the model's chosen feature subset by column position.
feat2test = list(dfvaltest.columns)
# NOTE(review): indices 5,6,9,12,13,14 (district_code, population,
# quantity_group, waterpoint_type, wp_age, daysdiff) presumably mirror the
# True entries of model.get_support() printed earlier — TODO confirm. This
# silently breaks if the selector ever keeps a different subset or the column
# order changes. Also note 'selfeats' rebinds the training-cell name to a
# different list here.
selfeats = [feat2test[5],feat2test[6],feat2test[9],feat2test[12],feat2test[13],feat2test[14]]
valfeats = dfvaltest[selfeats]
In [62]:
# Predict on the validation features with the plain decision tree.
dfvaltest['prediction'] = clf2.predict(valfeats)
In [63]:
# Eyeball the predictions alongside the input columns.
dfvaltest.head()
Out[63]:
In [64]:
# Copy the ground-truth labels across; pandas aligns on the shared row index
# (dfvaltest was sliced from dfvalidation, so the indexes match).
dfvaltest['status_group'] = dfvalidation['status_group']
In [65]:
from sklearn.metrics import accuracy_score
In [66]:
# NOTE(review): these names capture the column objects as they exist right
# now. dfvaltest['prediction'] is re-assigned later (for the bagging model),
# and whether y_pred then reflects the new values depends on pandas'
# column-replacement internals — re-read the column directly when re-scoring.
y_pred = dfvaltest['prediction']
y_true = dfvaltest['status_group']
In [67]:
# Validation accuracy of the single decision tree (cell output).
accuracy_score(y_true,y_pred)
Out[67]:
In [68]:
# Load the competition test set; ',' is read_csv's default separator.
dftest = pd.read_csv('testsetfix.csv', sep=',')
In [69]:
# Quick look at the test set.
dftest.head()
Out[69]:
In [70]:
# Select the model's feature subset from the test set by column position.
feat2pred = list(dftest.columns)
# NOTE(review): same hard-coded positions as the validation cell — presumably
# the columns flagged True by model.get_support(); TODO confirm, and verify
# dftest's column order matches dfvaltest's, otherwise the model is silently
# fed the wrong features.
selfeat_p = [feat2pred[5],feat2pred[6],feat2pred[9],feat2pred[12],feat2pred[13],feat2pred[14]]
predfeats = dftest[selfeat_p]
In [71]:
# Predict test-set classes with the plain decision tree.
dftest['status_group'] = clf2.predict(predfeats)
In [72]:
# Eyeball the test-set predictions.
dftest.head()
Out[72]:
In [73]:
# Inspect every row predicted as class 3 ('functional needs repair' per the
# label mapping below). NOTE(review): this displays the entire slice —
# consider .head() or .shape if it is large.
dftest.loc[dftest['status_group']==3]
Out[73]:
In [94]:
# Build the submission frame from an explicit copy so the label mapping below
# cannot touch dftest.
submission = dftest.loc[:, ['id', 'status_group']].copy()
In [95]:
# Map the numeric class labels to the competition's expected string labels.
# A single dict-based replace with explicit reassignment supersedes the three
# chained inplace=True calls of the original: inplace replace on a column
# obtained via __getitem__ is the chained-assignment anti-pattern and is
# unreliable/deprecated under pandas copy-on-write.
submission['status_group'] = submission['status_group'].replace({
    1: 'functional',
    2: 'non functional',
    3: 'functional needs repair',
})
In [96]:
# Write the submission file. sep=',' and header=True are pandas defaults, so
# only index=False is needed to drop the row index from the output.
submission.to_csv('submission6baggedDT_77.csv', index=False)
In [97]:
# print() call form — identical output on Python 2 and valid on Python 3,
# where the bare 'print "done!"' statement is a SyntaxError.
print("done!")
In [78]:
from sklearn.ensemble import BaggingClassifier
In [126]:
# Bagged ensemble of 50 base estimators over the same reduced feature matrix
# (n_estimators=50 is the manually-tuned value; unseeded, so results vary).
clf3 = BaggingClassifier(n_estimators=50).fit(X_new, labels)
In [127]:
# Overwrite the test-set predictions with the bagged model's output.
# NOTE(review): the submission cells above carry execution counts 94-96 while
# this cell is 127 — if the submission file is meant to contain the bagged
# predictions, those cells must be re-run after this one. Confirm run order.
dftest['status_group'] = clf3.predict(predfeats)
In [128]:
# Overwrite the validation predictions with the bagged model's output.
# NOTE(review): y_pred was bound to the previous column object in an earlier
# cell and may not see this re-assignment — read dfvaltest['prediction']
# directly when scoring.
dfvaltest['prediction'] = clf3.predict(valfeats)
In [129]:
# Validation accuracy of the bagging model (cell output).
# BUG FIX: the original scored y_pred, a name bound to the prediction column
# BEFORE it was overwritten with the bagged predictions; whether that binding
# sees the new values depends on pandas' column-replacement internals, so it
# can silently report the plain decision tree's accuracy. Reading the column
# directly is unambiguous.
accuracy_score(y_true, dfvaltest['prediction'])
Out[129]:
Done! Kesimpulan: DT sebagai unstable ML algorithm (akurasi prediksi bisa berubah banyak jika jumlah data berubah) dapat dibantu akurasinya dengan teknik Bagging (Bootstrap Aggregating), yakni bagging ("mewadahi dalam tas") sample subset ke dalam banyak "bag" dan melakukan klasifikasi pada masing-masing wadah, lalu nilai agregatnya yang dipakai. Hal ini bertujuan mengurangi variansi. Untuk n_estimators (banyaknya "bag"/"tas"), perlu dicari nilai optimalnya; dalam kasus ini 50 paling optimal dengan hasil akurasi 0.77, naik dari semula (DT tanpa bagging) sebesar 0.74. Sehingga perlu dicari cara lain lagi untuk bisa lebih meningkatkan akurasi.
Cara berikutnya mungkin kembali ke feature engineering: membuat feature (kolom) baru yang bisa menggambarkan ciri khas/pattern dalam data. (Pengibaratan: feature di sini adalah seperti kumis pada wajah, atau bentuk bibir, saat harus mengklasifikasi wajah menjadi male/female.)
In [ ]: