In [7]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
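
Note: this transcript was produced with Python 2.7 and a pre-0.18 scikit-learn. The cross_validation and grid_search modules have since been replaced by sklearn.model_selection, and under Python 3 the print statements below become print() calls. On a current install the last two imports would instead be:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

Newer scikit-learn releases also reject min_samples_split=1, so the grid searched below would have to start that parameter at 2.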

In [8]:
# The ad.data file has no header row; the last column holds the label ('ad.' or
# 'nonad.') and the preceding columns are the explanatory variables.
df = pd.read_csv('./data/ad-dataset/ad.data', header=None)
explanatory_variable_columns = set(df.columns.values)
response_variable_column = df[len(df.columns.values)-1]
explanatory_variable_columns.remove(len(df.columns.values)-1)

# Encode the response: 1 for advertisements ('ad.'), 0 for everything else.
y = [1 if e == 'ad.' else 0 for e in response_variable_column]
# Take an explicit copy so the replacement below operates on its own frame
# rather than on a view of df, which would trigger a SettingWithCopyWarning.
X = df[list(explanatory_variable_columns)].copy()

# Missing values appear as whitespace followed by '?'; replace them with -1.
X.replace(to_replace=r' *\?', value=-1, regex=True, inplace=True)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
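
The split is random, so the exact numbers below will vary a little from run to run; passing random_state to train_test_split (for example, train_test_split(X, y, random_state=1)) makes the split reproducible.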

In [10]:
pipeline = Pipeline([
        ('clf', DecisionTreeClassifier(criterion='entropy'))
    ])

# Grid-search over the tree's maximum depth and the minimum number of samples
# required to split an internal node or to form a leaf.
parameters = {
    'clf__max_depth': (150, 155, 160),
    'clf__min_samples_split': (1, 2, 3),
    'clf__min_samples_leaf': (1, 2, 3)
}

# The classes are imbalanced (far more non-ads than ads), so optimize F1 rather
# than accuracy.
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')

grid_search.fit(X_train, y_train)

print 'Best score: %0.3f' % grid_search.best_score_
print 'Best parameters set:'
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print '\t%s: %r' % (param_name, best_parameters[param_name])
    
# Evaluate the best model found by the grid search on the held-out test set.
predictions = grid_search.predict(X_test)
print classification_report(y_test, predictions)


Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done  67 out of  81 | elapsed:   17.8s remaining:    3.7s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:   21.2s finished
Best score: 0.891
Best parameters set:
	clf__max_depth: 160
	clf__min_samples_leaf: 1
	clf__min_samples_split: 2
             precision    recall  f1-score   support

          0       0.99      0.98      0.98       706
          1       0.88      0.92      0.90       114

avg / total       0.97      0.97      0.97       820
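
As a possible follow-up (not part of the original run), the fitted tree can be pulled out of the winning pipeline to see which columns it leans on most heavily; the 'clf' key below is the step name given to the Pipeline above.

best_tree = grid_search.best_estimator_.named_steps['clf']
ranked = sorted(zip(X.columns, best_tree.feature_importances_),
                key=lambda item: item[1], reverse=True)
for column, importance in ranked[:5]:
    print 'column %s: importance %.3f' % (column, importance)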


In [ ]: