In [9]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_clipboard()

In [6]:
df.head()


Out[6]:
   Training_instance Plays_fetch Is_grumpy Favorite_food Species
 0                 1         Yes        No         Bacon     Dog
 1                 2          No       Yes      Dog_Food     Dog
 2                 3          No       Yes      Cat_food     Cat
 3                 4          No       Yes         Bacon     Cat
 4                 5          No        No      Cat_food     Cat

In [7]:
df


Out[7]:
   Training_instance Plays_fetch Is_grumpy Favorite_food Species
 0                 1         Yes        No         Bacon     Dog
 1                 2          No       Yes      Dog_Food     Dog
 2                 3          No       Yes      Cat_food     Cat
 3                 4          No       Yes         Bacon     Cat
 4                 5          No        No      Cat_food     Cat
 5                 6          No       Yes         Bacon     Cat
 6                 7          No       Yes      Cat_Food     Cat
 7                 8          No        No      Dog_Food     Dog
 8                 9          No       Yes      Cat_food     Cat
 9                10         Yes        No      Dog_Food     Dog
10                11         Yes        No         Bacon     Dog
11                12          No        No      Cat_food     Cat
12                13         Yes       Yes      Cat_food     Cat
13                14         Yes       Yes         Bacon     Dog

In [8]:
!pwd


/home/topo

In [16]:
df.to_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_train.csv", header=True, index=False)

In [15]:
#pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_train.csv")

In [17]:
test = pd.read_clipboard()

In [18]:
test


Out[18]:
   Training_instance Plays_fetch Is_grumpy Favorite_food Species
 0                 1         Yes        No         Bacon     Dog
 1                 2         Yes       Yes      Dog_Food     Dog
 2                 3          No       Yes      Dog_Food     Cat
 3                 4          No       Yes         Bacon     Cat
 4                 5          No        No      Cat_food     Cat

In [19]:
test.to_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_test.csv", header=True, index=False)

In [21]:
#pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_test.csv")
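
The two CSVs written above can be read straight back in, and the categorical columns one-hot encoded, which is the form a tree classifier would eventually need. A minimal sketch, not executed in this session; pd.get_dummies and DataFrame.reindex are standard pandas, and the column names are the ones shown in the tables above:

In [ ]:
# Sketch (not executed): reload the saved CSVs and one-hot encode the categorical columns.
animal_train = pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_train.csv")
animal_test = pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/animal_food_test.csv")
X_animal_train = pd.get_dummies(animal_train[["Plays_fetch", "Is_grumpy", "Favorite_food"]])
y_animal_train = animal_train["Species"]
# Note: Favorite_food mixes 'Cat_food' and 'Cat_Food' in the training table above,
# so they become separate dummy columns unless the strings are normalised first.
# Align the test columns with the training columns (unseen categories become 0).
X_animal_test = pd.get_dummies(animal_test[["Plays_fetch", "Is_grumpy", "Favorite_food"]]).reindex(
    columns=X_animal_train.columns, fill_value=0)
y_animal_test = animal_test["Species"]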

In [22]:
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
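
A note on these imports: sklearn.cross_validation and sklearn.grid_search are the pre-0.18 module names. From scikit-learn 0.18 onward both were folded into sklearn.model_selection (the old modules were removed in 0.20), and newer releases also reject min_samples_split=1, which appears in the grids below (it must be at least 2). If re-running this notebook on a recent release, the equivalent imports would be roughly:

In [ ]:
# Equivalent imports for scikit-learn >= 0.18, where cross_validation and
# grid_search were consolidated into model_selection.
from sklearn.model_selection import train_test_split, GridSearchCV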

In [54]:
if __name__=="__main__":
    df = pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/ad-data/ad.data", header=None)
    explanatory_variable_columns = set(df.columns.values)
    response_variable_column = df[len(df.columns.values)-1]
    # The last column describes the targets
    explanatory_variable_columns.remove(len(df.columns.values)-1)
    y = [1 if e == 'ad.' else 0 for e in response_variable_column]
    X = df[list(explanatory_variable_columns)]
    # print X.head(5)
    
    X.replace(to_replace=r' *\?', value=-1, regex=True, inplace=True)  # '?' marks missing values; recode as -1
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    # Single-step pipeline so the classifier's hyperparameters can be addressed as clf__<param>
    pipeline = Pipeline([
        ('clf', DecisionTreeClassifier(criterion='gini'))
    ])
    parameters = {
        'clf__max_depth': (50,80, 100, 120, 150, 155, 160),
        'clf__min_samples_split': (1, 2, 3, 4, 5),
        'clf__min_samples_leaf': (1, 2, 3, 4, 5)
    }
    
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    print classification_report(y_test, predictions)
    print best_parameters


/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
[Parallel(n_jobs=-1)]: Done  49 tasks       | elapsed:   18.6s
[Parallel(n_jobs=-1)]: Done 199 tasks       | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 449 tasks       | elapsed:  2.9min
Fitting 3 folds for each of 175 candidates, totalling 525 fits
Best score: 0.974
Best parameters set:
	clf__max_depth: 155
	clf__min_samples_leaf: 1
	clf__min_samples_split: 5
             precision    recall  f1-score   support

          0       0.97      0.98      0.98       706
          1       0.88      0.81      0.84       114

avg / total       0.96      0.96      0.96       820

{'clf__class_weight': None, 'clf__splitter': 'best', 'clf__max_leaf_nodes': None, 'clf': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=155,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'), 'clf__max_depth': 155, 'clf__min_samples_leaf': 1, 'clf__min_weight_fraction_leaf': 0.0, 'clf__presort': False, 'steps': [('clf', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=155,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))], 'clf__random_state': None, 'clf__max_features': None, 'clf__criterion': 'gini', 'clf__min_samples_split': 5}
[Parallel(n_jobs=-1)]: Done 525 out of 525 | elapsed:  3.5min finished
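
The tree the grid search settled on can also be inspected directly. A minimal sketch, not executed here, assuming Graphviz is installed for rendering; best_estimator_ is the refit pipeline and named_steps['clf'] pulls the DecisionTreeClassifier out of it:

In [ ]:
# Sketch (not executed): dump the best decision tree in Graphviz format.
from sklearn.tree import export_graphviz
best_tree = grid_search.best_estimator_.named_steps['clf']
export_graphviz(best_tree, out_file='ad_tree.dot', class_names=['not ad', 'ad'])
# Render from a shell with: dot -Tpng ad_tree.dot -o ad_tree.png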

In [55]:
# Using a RandomForestClassifier on the same dataset
from sklearn.ensemble import RandomForestClassifier
if __name__=="__main__":
    df = pd.read_csv("/home/topo/repos/ghub/mml_sklearn/datasets/ad-data/ad.data", header=None)
    explanatory_variable_columns = set(df.columns.values)
    response_variable_column = df[len(df.columns.values)-1]
    # The last column describes the targets
    explanatory_variable_columns.remove(len(df.columns.values)-1)
    y = [1 if e == 'ad.' else 0 for e in response_variable_column]
    X = df[list(explanatory_variable_columns)]
    # print X.head(5)
    
    X.replace(to_replace=r' *\?', value=-1, regex=True, inplace=True)  # '?' marks missing values; recode as -1
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    # Same single-step pipeline, now wrapping a random forest
    pipeline = Pipeline([
        ('clf', RandomForestClassifier(criterion='gini'))
    ])
    parameters = {
        'clf__n_estimators': (5, 10, 20, 50),
        'clf__max_depth': (50, 150, 250),
        'clf__min_samples_split': (1, 2, 3),
        'clf__min_samples_leaf': (1, 2, 3)
    }
    
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print 'Best score: %0.3f' % grid_search.best_score_
    print 'Best parameters set:'
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print '\t%s: %r' % (param_name, best_parameters[param_name])
    predictions = grid_search.predict(X_test)
    print classification_report(y_test, predictions)
    print best_parameters


/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
[Parallel(n_jobs=-1)]: Done  49 tasks       | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 199 tasks       | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  2.1min finished
Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best score: 0.978
Best parameters set:
	clf__max_depth: 150
	clf__min_samples_leaf: 1
	clf__min_samples_split: 1
	clf__n_estimators: 50
             precision    recall  f1-score   support

          0       0.98      1.00      0.99       700
          1       0.97      0.88      0.92       120

avg / total       0.98      0.98      0.98       820

{'clf__class_weight': None, 'clf__n_estimators': 50, 'clf__warm_start': False, 'clf__max_leaf_nodes': None, 'clf__oob_score': False, 'clf__bootstrap': True, 'clf': RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=150, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=1,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False), 'clf__max_depth': 150, 'clf__min_samples_leaf': 1, 'clf__verbose': 0, 'steps': [('clf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=150, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=1,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))], 'clf__min_weight_fraction_leaf': 0.0, 'clf__n_jobs': 1, 'clf__max_features': 'auto', 'clf__criterion': 'gini', 'clf__min_samples_split': 1, 'clf__random_state': None}
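
Unlike the single tree, the forest also exposes averaged, impurity-based feature importances, which give a rough ranking of which ad.data columns drive the prediction. A minimal sketch against the fitted grid search above; the columns are plain integer indices because ad.data was read with header=None:

In [ ]:
# Sketch (not executed): rank columns by the forest's impurity-based importances.
best_forest = grid_search.best_estimator_.named_steps['clf']
importances = best_forest.feature_importances_
top10 = np.argsort(importances)[::-1][:10]  # indices of the ten largest importances
print zip(X_train.columns[top10], importances[top10])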

In [ ]: