In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, linear_model


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn import grid_search
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split

from sklearn.metrics import roc_auc_score, log_loss
from sklearn import metrics


/home/analyst/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/analyst/anaconda3/lib/python3.6/site-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

In [2]:
train = pd.read_csv('numerai_training_data.csv')
test = pd.read_csv('numerai_tournament_data.csv')
example = pd.read_csv('example_predictions.csv')

In [3]:
id_test = test.id
id_test[0:5]
target = train.target

In [4]:
test.drop(['id', 'target','data_type'], axis=1, inplace = True )
train.drop(['target', 'id', 'data_type', ], axis=1, inplace = True)

In [5]:
train.head()


Out[5]:
era feature1 feature2 feature3 feature4 feature5 feature6 feature7 feature8 feature9 ... feature41 feature42 feature43 feature44 feature45 feature46 feature47 feature48 feature49 feature50
0 era1 0.55098 0.42673 0.40180 0.44622 0.68562 0.45346 0.24763 0.61223 0.52564 ... 0.53232 0.55924 0.64714 0.62358 0.40199 0.51210 0.42287 0.33241 0.54669 0.55408
1 era1 0.32694 0.37829 0.38716 0.41725 0.50691 0.38413 0.61237 0.40076 0.52302 ... 0.46287 0.29351 0.57591 0.40191 0.60666 0.53842 0.52236 0.55653 0.26194 0.33737
2 era1 0.45440 0.45144 0.55052 0.64551 0.63833 0.51962 0.34126 0.57061 0.44524 ... 0.40919 0.69339 0.44649 0.58797 0.51314 0.42471 0.31818 0.61949 0.66547 0.57674
3 era1 0.64494 0.60252 0.43466 0.60305 0.52179 0.42805 0.59592 0.33314 0.48087 ... 0.40814 0.19793 0.45149 0.56702 0.31475 0.42197 0.38904 0.59570 0.42558 0.44277
4 era1 0.39060 0.62302 0.73704 0.58155 0.42124 0.54693 0.51778 0.35616 0.37648 ... 0.37677 0.64195 0.22353 0.48502 0.46545 0.52634 0.32485 0.62126 0.51344 0.55221

5 rows × 51 columns


In [6]:
test.head()


Out[6]:
era feature1 feature2 feature3 feature4 feature5 feature6 feature7 feature8 feature9 ... feature41 feature42 feature43 feature44 feature45 feature46 feature47 feature48 feature49 feature50
0 era86 0.28523 0.52729 0.60784 0.43518 0.32576 0.63765 0.44005 0.49780 0.53072 ... 0.53073 0.35296 0.46170 0.50857 0.40087 0.55512 0.65612 0.60729 0.37915 0.46449
1 era86 0.38658 0.57589 0.36267 0.36722 0.52405 0.54712 0.63671 0.40546 0.68287 ... 0.57903 0.26381 0.56040 0.36975 0.50206 0.59444 0.65968 0.42385 0.33500 0.35268
2 era86 0.33371 0.54650 0.49027 0.40156 0.43806 0.47818 0.55603 0.27314 0.42985 ... 0.40163 0.34934 0.46677 0.28978 0.63833 0.70284 0.46035 0.49885 0.29836 0.52302
3 era86 0.29859 0.35833 0.47076 0.38464 0.52346 0.48471 0.55128 0.45734 0.62827 ... 0.66385 0.44130 0.55330 0.37002 0.42181 0.45798 0.54207 0.58005 0.48274 0.44154
4 era86 0.60599 0.69024 0.80057 0.52854 0.43206 0.61740 0.44755 0.36829 0.50066 ... 0.26655 0.39515 0.32171 0.62867 0.53641 0.54626 0.67708 0.67124 0.33609 0.44527

5 rows × 51 columns


In [7]:
from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()

shapeTrain = train.shape[0]
shapeTest = test.shape[0]
train = train.append(test)

eratrain = train.era.str.get_dummies()
eratrain = pd.concat((eratrain,train),axis=1) 

#toTransform = ['era']
#for f in toTransform: 
#    lbl = preprocessing.LabelEncoder() 
#    lbl.fit(list(eratrain[f].values)) 
#    eratrain[f] = lbl.transform(list(eratrain[f].values))
  
test = eratrain[shapeTrain:shapeTrain+shapeTest]
train = eratrain[0:shapeTrain]

In [ ]:


In [8]:
eratrain.head()


Out[8]:
era1 era10 era11 era12 era13 era14 era15 era16 era17 era18 ... feature41 feature42 feature43 feature44 feature45 feature46 feature47 feature48 feature49 feature50
0 1 0 0 0 0 0 0 0 0 0 ... 0.53232 0.55924 0.64714 0.62358 0.40199 0.51210 0.42287 0.33241 0.54669 0.55408
1 1 0 0 0 0 0 0 0 0 0 ... 0.46287 0.29351 0.57591 0.40191 0.60666 0.53842 0.52236 0.55653 0.26194 0.33737
2 1 0 0 0 0 0 0 0 0 0 ... 0.40919 0.69339 0.44649 0.58797 0.51314 0.42471 0.31818 0.61949 0.66547 0.57674
3 1 0 0 0 0 0 0 0 0 0 ... 0.40814 0.19793 0.45149 0.56702 0.31475 0.42197 0.38904 0.59570 0.42558 0.44277
4 1 0 0 0 0 0 0 0 0 0 ... 0.37677 0.64195 0.22353 0.48502 0.46545 0.52634 0.32485 0.62126 0.51344 0.55221

5 rows × 149 columns


In [ ]:


In [9]:
train.columns


Out[9]:
Index(['era1', 'era10', 'era11', 'era12', 'era13', 'era14', 'era15', 'era16',
       'era17', 'era18',
       ...
       'feature41', 'feature42', 'feature43', 'feature44', 'feature45',
       'feature46', 'feature47', 'feature48', 'feature49', 'feature50'],
      dtype='object', length=149)

In [25]:
train = train.apply(pd.to_numeric, errors='coerce')
target = target.apply(pd.to_numeric, errors='coerce')


train.fillna(0, inplace=True)
train.fillna(0, inplace=True)

X = train
y = target


X_fit, X_eval, y_fit, y_eval= train_test_split(
    X, y, test_size=0.10, random_state=1
)

In [26]:
X_fit['era21'].dtypes


Out[26]:
dtype('int64')

In [27]:
rf = RandomForestClassifier(n_estimators=301, n_jobs=5, max_depth =8)
rf.fit(X_fit, y_fit)
print ('RandomForestClassifier ', (log_loss(y_eval, rf.predict_proba(X_eval))))


RandomForestClassifier  0.691705044448

In [28]:
submission = pd.DataFrame({"id":id_test, "probability":rf.predict_proba(test)[:,1]})
submission.to_csv("submission_rf_5(n-300, defth=9 with eradummies).csv", index=False)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-28-2ba07938d919> in <module>()
----> 1 submission = pd.DataFrame({"id":id_test, "probability":rf.predict_proba(test)[:,1]})
      2 submission.to_csv("submission_rf_5(n-300, defth=9 with eradummies).csv", index=False)

~/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py in predict_proba(self, X)
    576         check_is_fitted(self, 'estimators_')
    577         # Check data
--> 578         X = self._validate_X_predict(X)
    579 
    580         # Assign chunk of trees to jobs

~/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py in _validate_X_predict(self, X)
    355                                  "call `fit` before exploiting the model.")
    356 
--> 357         return self.estimators_[0]._validate_X_predict(X, check_input=True)
    358 
    359     @property

~/anaconda3/lib/python3.6/site-packages/sklearn/tree/tree.py in _validate_X_predict(self, X, check_input)
    371         """Validate X whenever one tries to predict, apply, predict_proba"""
    372         if check_input:
--> 373             X = check_array(X, dtype=DTYPE, accept_sparse="csr")
    374             if issparse(X) and (X.indices.dtype != np.intc or
    375                                 X.indptr.dtype != np.intc):

~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    431                                       force_all_finite)
    432     else:
--> 433         array = np.array(array, dtype=dtype, order=order, copy=copy)
    434 
    435         if ensure_2d:

ValueError: could not convert string to float: 'eraX'

In [ ]:
lr = LogisticRegression()
lrCV = LogisticRegression()
lrCV.fit(X_fit, y_fit)

### write log_reg default
logloss_train = log_loss(y_fit, lrCV.predict_proba(X_fit))
logloss_val = log_loss(y_eval, lrCV.predict_proba(X_eval))

print ('logloss_train: ', logloss_train)
print ('logloss_val: ', logloss_val)


submission = pd.DataFrame({"id":id_test, "probability":lrCV.predict_proba(test)[:,1]})
submission.to_csv("submission_logreg.csv", index=False)