In [60]:
from pprint import pprint
from time import time
import logging

import numpy as np
import pandas as pd

# ----------- Helper functions -------------- #
from sklearn import cross_validation
from sklearn.pipeline import make_pipeline

# ------------ Pre-processing --------------- #
from sklearn.decomposition import PCA

# ------------ Model functions -------------- #
from sklearn import linear_model, svm

Data Ingestion


In [54]:
filename = '../data/train.csv'
Y_COLUMN = 'label' # Name of column containing training labels.

# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

def create_training_set(filename, Y_COLUMN, test_size=0.4):
    """Takes in the training data location and column name corresponding to training labels. Returns an training
    test sets via a scikit cross validation train_test_split method."""

    sample = pd.read_csv(filename)

    data = sample.drop(Y_COLUMN, 1)

    target = sample.label.values.copy()

    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        data, target, test_size=test_size, random_state=0)

    return X_train, X_test, y_train, y_test

Main Pipeline


In [62]:
pipeline = make_pipeline(PCA(
    copy=True, n_components=None, whiten=False),
    linear_model.LinearRegression(),
)

In [50]:
pipeline.fit(X_train, y_train)
pipeline.score(X_train, y_train)


Out[50]:
0.62615431243537911

In [63]:
scores = cross_validation.cross_val_score(pipeline, X_train, y_train)

In [64]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))


Accuracy: -504477321372171960320.00 (+/- 627751994475329880064.00)

In [ ]:


In [ ]: