In [19]:
from pprint import pprint
from time import time
import logging
import numpy as np
import pandas as pd
# ----------- Helper functions -------------- #
from sklearn import cross_validation
from sklearn.pipeline import make_pipeline
# ------------ Pre-processing --------------- #
from sklearn.decomposition import PCA
# ------------ Model functions -------------- #
from sklearn import tree
from sklearn import ensemble
In [3]:
# --- Configuration ---------------------------------------------------------
# Location of the raw training data and the name of its label column.
filename = '../data/train.csv'
Y_COLUMN = 'label'  # Name of column containing training labels.

# Emit progress logs on stdout so long-running fits show activity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s %(message)s',
)
def create_training_set(filename, Y_COLUMN, test_size=0.4):
    """Split a labelled CSV file into train and test sets.

    Parameters
    ----------
    filename : str
        Path to a CSV file containing the training data.
    Y_COLUMN : str
        Name of the column holding the training labels.
    test_size : float, optional
        Fraction of rows held out for the test set (default 0.4).

    Returns
    -------
    X_train, X_test, y_train, y_test
        The split feature frames and label arrays, as produced by
        scikit-learn's ``train_test_split`` (seeded with random_state=0).
    """
    # sklearn.cross_validation was removed in scikit-learn 0.20;
    # model_selection is the supported home of train_test_split.
    from sklearn.model_selection import train_test_split

    sample = pd.read_csv(filename)
    # Use the Y_COLUMN parameter throughout: the original hard-coded
    # `sample.label`, silently ignoring the argument, and passed the
    # axis positionally to drop() (deprecated in pandas).
    data = sample.drop(columns=[Y_COLUMN])
    target = sample[Y_COLUMN].values.copy()
    X_train, X_test, y_train, y_test = train_test_split(
        data, target, test_size=test_size, random_state=0)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = create_training_set(filename, Y_COLUMN)
In [21]:
# PCA + decision-tree baseline.
# sklearn.cross_validation was removed in scikit-learn 0.20; use
# model_selection instead.
from sklearn.model_selection import cross_val_score

pipeline = make_pipeline(
    PCA(copy=True, n_components=None, whiten=False),
    # Seed the tree: unseeded tie-breaking makes the fit non-reproducible.
    tree.DecisionTreeClassifier(random_state=0),
)
pipeline.fit(X_train, y_train)
# Report the (optimistic) training-set accuracy explicitly; the original
# computed this score but discarded the result.
print("Training accuracy: %0.2f" % pipeline.score(X_train, y_train))
scores = cross_val_score(pipeline, X_train, y_train)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
In [20]:
# Random-forest baseline (no PCA step: the original left it commented out;
# the dead line is removed here).
# sklearn.cross_validation was removed in scikit-learn 0.20; use
# model_selection instead.
from sklearn.model_selection import cross_val_score

pipeline = make_pipeline(
    # Seed the forest: bagging and feature sub-sampling are stochastic,
    # so an unseeded run is not reproducible.
    ensemble.RandomForestClassifier(random_state=0),
)
pipeline.fit(X_train, y_train)
# Report the (optimistic) training-set accuracy explicitly; the original
# computed this score but discarded the result.
print("Training accuracy: %0.2f" % pipeline.score(X_train, y_train))
scores = cross_val_score(pipeline, X_train, y_train)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
In [ ]: