In [2]:
import numpy as np

In [3]:
cd kaggle


/Users/martiom/kaggle

In [6]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn import preprocessing as pre
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression

In [20]:
# Load the raw training data and normalize sentinel "missing" markers
# (blank/space strings and the -1 placeholder) to NaN so downstream
# isnull()/NaNCount logic sees them as missing.
train = pd.read_csv('train.csv')
# Assignment instead of inplace=True keeps this cell idempotent on re-run
# (inplace mutation of a frame an earlier run displayed is a hidden-state trap).
train = train.replace([' ', '', -1], np.nan)

In [8]:


In [9]:
# Load helper definitions (e.g. convert_data, used below) into the kernel namespace.
# NOTE(review): execfile is Python 2 only — under Python 3 this would need
# exec(open('preprocessing.py').read()) or, better, a proper import.
execfile('preprocessing.py')

In [23]:
# Shape bookkeeping: temporarily add a per-row missing-value count, then drop
# it again before building the feature matrix (exploratory — net effect nil).
print train.shape
train['NaNCount'] = train.isnull().sum(axis=1)  # per-row count of NaN cells
print train.shape
train.drop('NaNCount', axis = 1, inplace=True)
print train.shape
# convert_data comes from preprocessing.py (loaded via execfile earlier);
# presumably it encodes the frame into features X and target y — the printed
# output shows it also skips ID/date/target columns. TODO confirm its contract.
X, y = convert_data(train, cat2vectors = False, normalize_numeric = False)
print X.shape, y.shape


(260753, 300)
(260753, 300)
(260753, 299)
QuoteNumber skipped
Original_Quote_Date skipped
QuoteConversion_Flag skipped
(260753, 285) (260753,)

In [24]:
# NOTE(review): test_size=0.90 keeps only ~10% of rows for training —
# presumably to make the model-comparison cells below fast; confirm this is
# intentional before any final fit. (train_test_split is imported from the
# deprecated sklearn.cross_validation; modern sklearn uses model_selection.)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.90, random_state=36)

In [16]:
def check_classifier(Xtrain, ytrain, Xtest, ytest, clf):
    """Fit `clf` on the training split and report accuracy on both splits.

    Parameters
    ----------
    Xtrain, ytrain : training features / labels, passed straight to clf.fit.
    Xtest, ytest : held-out features / labels, used only for scoring.
    clf : any estimator exposing fit(X, y) and score(X, y).

    Returns
    -------
    (train_score, test_score) : the two accuracies. The original only printed
    them (returning None), so returning the tuple is backward-compatible —
    existing call sites ignore the return value.
    """
    clf.fit(Xtrain, ytrain)
    train_score = clf.score(Xtrain, ytrain)
    test_score = clf.score(Xtest, ytest)
    # Single-argument print(...) behaves identically under Python 2 and 3,
    # unlike the bare `print x` statement the cell originally used.
    print(train_score)
    print(test_score)
    return train_score, test_score

In [28]:
# NOTE(review): leftover interactive help lookup (`?` shows the docstring) —
# remove from the final notebook.
clf.fit?

In [25]:
%%timeit -r 1 -n 1
# Time a single fit/score run of a tuned gradient-boosted classifier.
# NOTE(review): %%timeit executes in a temporary namespace, so the `clf`
# bound here may not be the `clf` later cells see — confirm which object
# the kernel actually holds before reusing it.
dc = lambda: GBC(learning_rate = 0.02, n_estimators = 100, max_depth = 10, min_samples_split = 4,
   subsample = 0.8, max_features = 0.66)
clf = dc()
check_classifier(Xtrain, ytrain, Xtest, ytest,clf )


0.947382550336
0.920235386359
1 loops, best of 1: 4min 12s per loop

In [15]:
# Scratch note cell — presumably the rounded train/test accuracies (x1000)
# from the timed run above; remove from the final notebook.
948, 920

In [29]:
# Re-evaluate whichever `clf` is currently bound in the kernel.
# NOTE(review): hidden state — the %%timeit cell above ran in its own
# namespace, so the provenance of this `clf` is unclear; verify before
# trusting these numbers.
check_classifier(Xtrain, ytrain, Xtest, ytest,clf )


0.922440820318
0.923088723131

In [16]:
%%timeit -r 1 -n 1
# Compare a plain XGBoost classifier against a bagged and an
# isotonic-calibrated variant on the same train/test split.
import xgboost as xgb
dc = lambda: xgb.XGBClassifier()

# Baseline: default XGBClassifier.
clf = dc()
check_classifier(Xtrain, ytrain, Xtest, ytest,clf )

# Bagging: 5 bootstrap-trained copies of a fresh base model.
clf = dc()
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xtest, ytest,clfbag )

# Isotonic probability calibration with 5-fold CV on a fresh base model.
clf = dc()
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xtest, ytest,clf_isotonic )


0.921606696005
0.921688941727
0.921692984727
0.921631416464
0.923495460254
0.924066652605

In [ ]:
from sklearn.neighbors import KNeighborsClassifier

def dc():
    """Return a fresh, unfitted 3-nearest-neighbour classifier."""
    return KNeighborsClassifier(3)

# Baseline: plain 3-NN.
clf = dc()
check_classifier(Xtrain, ytrain, Xtest, ytest, clf)

# Bagging ensemble of 5 bootstrap-trained 3-NN models.
clf = dc()
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xtest, ytest, clfbag)

# Isotonic probability calibration with 5-fold CV.
clf = dc()
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xtest, ytest, clf_isotonic)

In [9]:
# Same bagging / isotonic-calibration comparison for a default GBC.
# NOTE(review): this run was interrupted by hand — see the KeyboardInterrupt
# traceback below; no results were produced.
clf = GBC()
clfbag = BaggingClassifier(clf, n_estimators=5)
check_classifier(Xtrain, ytrain, Xtest, ytest,clfbag )

clf = GBC()
clf_isotonic = CalibratedClassifierCV(clf, cv=5, method='isotonic')
check_classifier(Xtrain, ytrain, Xtest, ytest,clf_isotonic )


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-9-8f860b18794f> in <module>()
      1 clf = GBC()
      2 clfbag = BaggingClassifier(clf, n_estimators=5)
----> 3 check_classifier(Xtrain, ytrain, Xtest, ytest,clfbag )
      4 
      5 clf = GBC()

<ipython-input-8-d806c0394eef> in check_classifier(Xtrain, ytrain, Xtest, ytest, clf)
      1 def check_classifier(Xtrain, ytrain, Xtest, ytest, clf):
----> 2     clf.fit(Xtrain, ytrain)
      3     print clf.score(Xtrain, ytrain)
      4     print clf.score(Xtest, ytest)

/Users/martiom/anaconda/lib/python2.7/site-packages/sklearn/ensemble/bagging.pyc in fit(self, X, y, sample_weight)
    335                 seeds[starts[i]:starts[i + 1]],
    336                 verbose=self.verbose)
--> 337             for i in range(n_jobs))
    338 
    339         # Reduce

/Users/martiom/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    802             self._iterating = True
    803 
--> 804             while self.dispatch_one_batch(iterator):
    805                 pass
    806 

/Users/martiom/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    660                 return False
    661             else:
--> 662                 self._dispatch(tasks)
    663                 return True
    664 

/Users/martiom/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
    568 
    569         if self._pool is None:
--> 570             job = ImmediateComputeBatch(batch)
    571             self._jobs.append(job)
    572             self.n_dispatched_batches += 1

/Users/martiom/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
    181         # Don't delay the application, to avoid keeping the input
    182         # arguments in memory
--> 183         self.results = batch()
    184 
    185     def get(self):

/Users/martiom/anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
     73 
     74     def __len__(self):

/Users/martiom/anaconda/lib/python2.7/site-packages/sklearn/ensemble/bagging.pyc in _parallel_build_estimators(n_estimators, ensemble, X, y, sample_weight, seeds, verbose)
    111                 curr_sample_weight[not_indices] = 0
    112 
--> 113             estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
    114             samples = curr_sample_weight > 0.
    115 

/Users/martiom/anaconda/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in fit(self, X, y, sample_weight, monitor)
   1023         # fit the boosting stages
   1024         n_stages = self._fit_stages(X, y, y_pred, sample_weight, random_state,
-> 1025                                     begin_at_stage, monitor, X_idx_sorted)
   1026         # change shape of arrays after fit (early-stopping or additional ests)
   1027         if n_stages != self.estimators_.shape[0]:

/Users/martiom/anaconda/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in _fit_stages(self, X, y, y_pred, sample_weight, random_state, begin_at_stage, monitor, X_idx_sorted)
   1078             y_pred = self._fit_stage(i, X, y, y_pred, sample_weight,
   1079                                      sample_mask, random_state, X_idx_sorted,
-> 1080                                      X_csc, X_csr)
   1081 
   1082             # track deviance (= loss)

/Users/martiom/anaconda/lib/python2.7/site-packages/sklearn/ensemble/gradient_boosting.pyc in _fit_stage(self, i, X, y, y_pred, sample_weight, sample_mask, random_state, X_idx_sorted, X_csc, X_csr)
    782             else:
    783                 tree.fit(X, residual, sample_weight=sample_weight,
--> 784                          check_input=False, X_idx_sorted=X_idx_sorted)
    785 
    786             # update tree leaves

/Users/martiom/anaconda/lib/python2.7/site-packages/sklearn/tree/tree.pyc in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
    304         # into each tree.
    305         if X_idx_sorted is None and presort:
--> 306                 X_idx_sorted = np.asfortranarray(np.argsort(X, axis=0),
    307                                                  dtype=np.int32)
    308 

/Users/martiom/anaconda/lib/python2.7/site-packages/numpy/core/fromnumeric.pyc in argsort(a, axis, kind, order)
    906     except AttributeError:
    907         return _wrapit(a, 'argsort', axis, kind, order)
--> 908     return argsort(axis, kind, order)
    909 
    910 

KeyboardInterrupt: 

In [ ]: