In [1]:
import os
import sys
# Resolve the directory three levels above the current working directory and
# make it importable. This must run before the `Util` import below, which is
# presumably located under that directory (verify against the repo layout).
root_path = os.path.abspath("../../../")
if root_path not in sys.path:
    # Guard against adding the same entry again when the cell is re-run.
    sys.path.append(root_path)

import numpy as np
from Util.Util import DataUtil

# Number of samples requested for the training split (passed as `n_train`).
train_num = 6000

# Load the mushroom data set; the call returns a (train, test) pair of
# (features, labels) pairs, which are unpacked in two steps here.
mushroom_path = "../../../_Data/mushroom.txt"
train_split, test_split = DataUtil.get_dataset(
    "mushroom", mushroom_path, n_train=train_num, tar_idx=0
)
x_train, y_train = train_split
x_test, y_test = test_split

# Quantize the training split, then apply the returned `wc`, `feat_dicts` and
# `label_dict` to the test split so both go through the same transformation.
# NOTE(review): the exact meaning of `wc` / `features` is defined by DataUtil
# and is not visible from this notebook.
quantized = DataUtil.quantize_data(x_train, y_train)
x_train, y_train, wc, features, feat_dicts, label_dict = quantized
x_test, y_test = DataUtil.transform_data(
    x_test, y_test, wc, feat_dicts, label_dict
)

from sklearn.preprocessing import OneHotEncoder

# Fit the one-hot encoder on the training features only and reuse the fitted
# encoder for the test features.
enc = OneHotEncoder()
x_train_one_hot = enc.fit_transform(x_train)
x_test_one_hot = enc.transform(x_test)

In [2]:
from sklearn.naive_bayes import MultinomialNB

# Train multinomial naive Bayes on the one-hot encoded training features
# (`fit` returns the estimator itself, so construction and fitting are chained).
clf = MultinomialNB().fit(x_train_one_hot, y_train)

# Test accuracy: fraction of test labels matched by the predictions.
nb_predictions = clf.predict(x_test_one_hot)
print(np.mean(y_test == nb_predictions))


0.953389830508

In [3]:
from sklearn.tree import DecisionTreeClassifier

# scikit-learn randomly permutes the candidate features at every split, so an
# unseeded tree is not guaranteed to be identical between runs. Fixing
# `random_state` makes the fitted tree, and therefore the printed accuracy,
# reproducible under Restart & Run All.
clf = DecisionTreeClassifier(random_state=0)

# Evaluate the same estimator on both feature representations, in this order:
#   1. the features as produced by the data-preparation cell,
#   2. their one-hot encoding.
# Each `fit` call retrains from scratch, so the two scores are independent.
for train_features, test_features in (
    (x_train, x_test),
    (x_train_one_hot, x_test_one_hot),
):
    clf.fit(train_features, y_train)
    # Test accuracy: fraction of test labels matched by the predictions.
    print(np.mean(y_test == clf.predict(test_features)))


1.0
1.0

In [4]:
from sklearn.svm import SVC

# Support-vector classifier with the library's default settings.
clf = SVC()

# Score the estimator on both feature representations, in this order:
#   1. the features as produced by the data-preparation cell,
#   2. their one-hot encoding.
# The same estimator object is refitted for each representation.
for train_features, test_features in (
    (x_train, x_test),
    (x_train_one_hot, x_test_one_hot),
):
    clf.fit(train_features, y_train)
    svc_predictions = clf.predict(test_features)
    # Test accuracy: fraction of test labels matched by the predictions.
    print(np.mean(y_test == svc_predictions))


1.0
0.998116760829

In [5]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default settings.
clf = LogisticRegression()

# Score the estimator on both feature representations, in this order:
#   1. the features as produced by the data-preparation cell,
#   2. their one-hot encoding.
# The same estimator object is refitted for each representation.
for train_features, test_features in (
    (x_train, x_test),
    (x_train_one_hot, x_test_one_hot),
):
    clf.fit(train_features, y_train)
    lr_predictions = clf.predict(test_features)
    # Test accuracy: fraction of test labels matched by the predictions.
    print(np.mean(y_test == lr_predictions))


0.946798493409
0.999529190207