In [19]:
from pprint import pprint
from time import time
import logging

import numpy as np
import pandas as pd

# ----------- Helper functions -------------- #
from sklearn import cross_validation
from sklearn.pipeline import make_pipeline

# ------------ Pre-processing --------------- #
from sklearn.decomposition import PCA

# ------------ Model functions -------------- #
from sklearn import tree
from sklearn import ensemble

Data Ingestion


In [3]:
filename = '../data/train.csv'
Y_COLUMN = 'label' # Name of column containing training labels.

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

def create_training_set(filename, Y_COLUMN, test_size=0.4):
    """Takes in the training data location and column name corresponding to training labels. Returns an training
    test sets via a scikit cross validation train_test_split method."""

    sample = pd.read_csv(filename)

    data = sample.drop(Y_COLUMN, 1)

    target = sample.label.values.copy()

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        data, target, test_size=test_size, random_state=0)

    return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = create_training_set(filename, Y_COLUMN)

Decision Tree Pipeline


In [21]:
pipeline = make_pipeline(
    PCA(copy=True, n_components=None, whiten=False),
    tree.DecisionTreeClassifier(),
)
pipeline.fit(X_train, y_train)
pipeline.score(X_train, y_train)

scores = cross_validation.cross_val_score(pipeline, X_train, y_train)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Accuracy: 0.76 (+/- 0.01)

Visualize Decision Tree

Random Forest Pipeline


In [20]:
pipeline = make_pipeline(
    #PCA(copy=True, n_components=None, whiten=False),
    ensemble.RandomForestClassifier(),
)
pipeline.fit(X_train, y_train)
pipeline.score(X_train, y_train)

scores = cross_validation.cross_val_score(pipeline, X_train, y_train)

print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Accuracy: 0.93 (+/- 0.00)

In [ ]: