In [1]:
#https://www.kaggle.com/c/amazon-employee-access-challenge/forums/t/4797/starter-code-in-python-with-scikit-learn-auc-885?limit=all
#Vowpal Wabbit code: 
#http://www.kaggle.com/c/amazon-employee-access-challenge/forums/t/4735/beating-the-benchmark?page=2

In [1]:
from __future__ import division

import numpy as np
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20 -- on a modern install use sklearn.model_selection
# (train_test_split lives there with the same signature).
from sklearn import (metrics, cross_validation, linear_model, preprocessing, ensemble)

SEED = 42  # always use a seed for randomized procedures


def load_data(filename, use_labels=True):
    """
    Load "data/<filename>" and return (labels, data) as numpy arrays.

    Column layout (Amazon access challenge CSVs): column 0 is the class
    label (train) or the row id (test); columns 1-8 are the categorical
    features; the trailing column is ignored, matching the original code.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside the "data/" directory.
    use_labels : bool
        If True, return column 0 as the label vector; if False, return
        an all-zeros placeholder of matching length (test set).

    Returns
    -------
    (labels, data) : tuple of 1-D and 2-D numpy arrays
    """
    # Read the file ONCE (the original parsed it twice -- once for the
    # features and once for the labels -- and never closed the handles it
    # opened).  Passing the path lets np.loadtxt manage the file itself;
    # ndmin=2 keeps the shape stable even for a single-row file.
    raw = np.loadtxt("data/" + filename, delimiter=',',
                     usecols=range(9), skiprows=1, ndmin=2)
    data = raw[:, 1:]        # columns 1..8 are the features
    if use_labels:
        labels = raw[:, 0]   # column 0 holds the class label
    else:
        labels = np.zeros(data.shape[0])
    return labels, data


def save_results(predictions, filename):
    """Write `predictions` to `filename` as a Kaggle submission CSV.

    Rows are numbered from 1 and formatted as "<id>,<probability>" under
    an "id,ACTION" header line.
    """
    rows = ["id,ACTION\n"]
    rows.extend("%d,%f\n" % (row_id + 1, prob)
                for row_id, prob in enumerate(predictions))
    with open(filename, 'w') as out:
        out.writelines(rows)

In [2]:
cd "/home/bakuda/ageekrepo/kaggle/amazon-access-challenge/sklearn/"


/home/bakuda/oldUbunuHome/bakuda/sandbox/ageekrepo/kaggle/amazon-access-challenge/sklearn

In [3]:
!ls


classifier_corrected.py  Starter code in python with scikit-learn (AUC .885) - Amazon.pdf
classifier.py		 submitAEAC2014.csv
data

In [6]:
model = linear_model.LogisticRegression(C=2)  # the classifier we'll use
    
    #model = ensemble.RandomForestClassifier()   # RandomForest wont accept sparse matrix

    # === load data in memory === #
    print "loading data"
    y, X = load_data('train.csv')
    y_test, X_test = load_data('test.csv', use_labels=False)

    # === one-hot encoding === #
    # we want to encode the category IDs encountered both in
    # the training and the test set, so we fit the encoder on both
    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    
    X = encoder.transform(X)  # Returns a sparse matrix (see numpy.sparse)
    X_test = encoder.transform(X_test)


loading data

In [36]:
# sanity check: train/test share the one-hot feature space (same column count)
X.shape, y.shape, X_test.shape


Out[36]:
((32769, 16600), (32769,), (58921, 16600))

In [7]:
# === training & metrics === #
    mean_auc = 0.0
    n = 10  # repeat the CV procedure 10 times to get more precise results
    for i in range(n):
        # for each iteration, randomly hold out 20% of the data as CV set
        X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
            X, y, test_size=.20, random_state=i*SEED)

        # if you want to perform feature selection / hyperparameter
        # optimization, this is where you want to do it

        # train model and make predictions
        model.fit(X_train, y_train) 
        preds = model.predict_proba(X_cv)[:, 1]

        # compute AUC metric for this CV fold
        fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
        roc_auc = metrics.auc(fpr, tpr)
        print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc)
        mean_auc += roc_auc

    print "Mean AUC: %f" % (mean_auc/n)


AUC (fold 1/10): 0.864114
AUC (fold 2/10): 0.872416
AUC (fold 3/10): 0.875735
AUC (fold 4/10): 0.880353
AUC (fold 5/10): 0.850458
AUC (fold 6/10): 0.876558
AUC (fold 7/10): 0.852691
AUC (fold 8/10): 0.861178
AUC (fold 9/10): 0.862445
AUC (fold 10/10): 0.872142
Mean AUC: 0.866809

In [21]:
# When making predictions, retrain the model on the whole training set.
# (Dedented -- the original cell's column-4 body lines raised
# IndentationError on a fresh kernel.)
model.fit(X, y)
preds = model.predict_proba(X_test)[:, 1]
filename = raw_input("Enter name for submission file: ")
save_results(preds, filename + ".csv")


Enter name for submission file: submitAEAC2014

In [ ]: