In [1]:
%matplotlib inline
import pylab as pl
import numpy as np
import scipy as sp
import pandas as pd
from datetime import datetime, timedelta

import util

In [2]:
# Load the cached training features (X) and labels (y) from the same
# timestamped snapshot; both go through util.cache_path -> util.fetch.
X, y = (
    util.fetch(util.cache_path(cache_name))
    for cache_name in ('train_X_before_2014-08-01_22-00-47',
                       'train_y_before_2014-08-01_22-00-47')
)

In [3]:
# Binarize the first 45 feature columns: every strictly positive entry
# becomes 1.0; zero and negative entries are left as they are.
# The mask is built from the (n_samples, 45) slice, so it must be applied to
# that same slice -- indexing the full-width X with the narrower mask raises
# IndexError (shape mismatch) on current NumPy. X[:, :45] is a basic slice,
# i.e. a view, so the assignment still writes into X.
# NOTE(review): assumes X is a dense NumPy array -- confirm what util.fetch returns.
X[:, :45][X[:, :45] > 0] = 1.

In [4]:
# Rescale feature column 47 by the fixed divisor 3 and write it back in place.
# NOTE(review): 3 is presumably the largest value this feature can take -- confirm.
X[:, 47] = np.true_divide(X[:, 47], 3.)

In [5]:
# Rescale feature column 48 by the fixed divisor 6 and write it back in place.
# NOTE(review): 6 is presumably the largest value this feature can take -- confirm.
X[:, 48] = np.true_divide(X[:, 48], 6.)

In [6]:
# Rescale feature column 49 by the fixed divisor 20104 (same constant as column 50).
# NOTE(review): 20104 looks like a dataset-specific maximum -- confirm its origin.
X[:, 49] = np.true_divide(X[:, 49], 20104.)

In [7]:
# Rescale feature column 50 by the fixed divisor 20104 (same constant as column 49).
# NOTE(review): 20104 looks like a dataset-specific maximum -- confirm its origin.
X[:, 50] = np.true_divide(X[:, 50], 20104.)

In [8]:
# Max-scale every column from index 141 onward by its own column maximum.
# The reduction runs over the tail slice only (the original reduced all
# columns and then threw the first 141 results away).
tail_max = np.amax(X[:, 141:], axis=0)
# A column whose maximum is 0 would turn into NaN/inf after the division and
# poison the model fits further down; divide such columns by 1 instead so
# they are left unchanged.
X[:, 141:] = X[:, 141:] / np.where(tail_max == 0, 1., tail_max)

In [9]:
# Max-scale every fifth column from 55 through 95 (55, 60, ..., 95) by its
# own maximum. range() with a step replaces the hand-maintained counter.
for col in range(55, 96, 5):
    col_max = np.max(X[:, col])
    # Guard against a zero maximum, which would otherwise yield NaN/inf;
    # such a column is left unchanged (divided by 1).
    X[:, col] = X[:, col] / (col_max if col_max != 0 else 1.)

In [10]:
from sklearn.svm import LinearSVC
from sklearn.cross_validation import cross_val_score, StratifiedKFold

In [11]:
# Linear SVM with dual=False and per-class weighting; every other
# hyper-parameter stays at its default (see the repr printed after svc.fit).
# NOTE(review): class_weight='auto' is a legacy option that newer scikit-learn
# releases replaced with 'balanced' (different weighting formula) -- confirm
# before upgrading.
svc = LinearSVC(class_weight='auto', dual=False)

In [12]:
# 5-fold cross-validated ROC AUC for the linear SVM; returns one score per fold.
# cv is given as an integer: for a classifier scikit-learn then builds the
# stratified 5-fold split itself, which yields the same folds as the explicit
# StratifiedKFold(y, 5) object while not depending on that legacy
# constructor signature (labels as first positional argument).
cross_val_score(svc, X, y, scoring='roc_auc', cv=5)


Out[12]:
array([ 0.89339357,  0.9134252 ,  0.85240336,  0.86367327,  0.81409625])

In [13]:
# Fit the SVM on the complete training set; the fitted model's coefficients
# (svc.coef_) are inspected in a later cell.
svc.fit(X=X, y=y)


Out[13]:
LinearSVC(C=1.0, class_weight='auto', dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [14]:
from sklearn.metrics import roc_auc_score

In [15]:
from sklearn.calibration import CalibratedClassifierCV

# Wrap the linear SVM in isotonic probability calibration so that
# predict_proba is available. cv is given as an integer: scikit-learn builds
# the stratified 5-fold split for the classifier itself, equivalent to the
# explicit StratifiedKFold(y, 5) object but independent of that legacy
# constructor signature.
clf = CalibratedClassifierCV(svc, cv=5, method='isotonic')
clf.fit(X, y)

# ROC AUC of the positive-class probability (column 1).
# NOTE: this is scored on the same X, y the model was fitted on, so it is an
# in-sample figure, not a held-out estimate.
roc_auc_score(y, clf.predict_proba(X)[:, 1])


Out[15]:
0.85831888308739623

In [16]:
# Coefficients the fitted SVM assigns to the first 45 features
# (row 0 of coef_, columns 0..44).
svc.coef_[0, :45]


Out[16]:
array([ 0.19560413,  0.17089759,  0.65764372, -0.16353356, -0.03430508,
        0.03438299,  0.0305282 , -0.81062161, -0.18258275, -0.02177491,
       -0.61820751, -0.1728157 ,  0.02736452, -0.05324924, -0.23152444,
       -0.13226027,  0.00292948, -0.13227422, -0.07393709, -0.01571165,
       -0.18954343, -0.1581372 ,  0.04015358,  0.1046442 , -0.06462214,
        0.1141877 , -0.0793758 ,  0.09890401, -0.1229745 , -0.02287298,
       -0.00665293,  0.02671975,  0.6407645 ,  0.72272215,  0.59951251,
        0.42662929, -0.0743573 ,  0.10653245, -0.2188587 , -0.08949457,
       -0.12845542,  0.27643448, -0.02939936,  0.27823046, -0.03132662])

In [17]:
from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression configured with 5 folds and ROC AUC
# as the scoring metric, fitted on the full training set.
lr = LogisticRegressionCV(scoring='roc_auc', cv=5)
lr.fit(X=X, y=y)

# ROC AUC of the positive-class probability (column 1), computed on the
# same data the model was fitted on.
roc_auc_score(y_true=y, y_score=lr.predict_proba(X)[:, 1])


Out[17]:
0.85801240818802649

In [19]:
# Coefficients the fitted logistic regression assigns to the first 45
# features (row 0 of coef_, columns 0..44), for comparison with the SVM's.
lr.coef_[0, :45]


Out[19]:
array([ 0.55524728,  0.58337573,  3.14866739, -0.42387956, -0.08976478,
        0.13819725,  0.35633133, -3.41059384, -0.29468168, -0.03769797,
       -2.78221699, -0.59845394, -0.11601482,  0.0636001 , -0.54406701,
       -0.26319308, -0.25419769,  0.30933589, -0.50214663, -0.04211469,
       -0.49922947, -0.42282014, -0.13151163,  0.35713128, -0.17807037,
        0.32982723, -0.21705695,  0.3376418 , -1.53369482, -0.03792951,
       -0.19220702, -0.04559449,  1.55751474,  1.77052335,  1.64272848,
        1.67319217, -0.09988238,  0.69010081, -1.11456405, -0.2379652 ,
       -0.46870361,  1.34808414, -0.24040566,  2.74425955, -0.10113288])

In [ ]: