notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing, linear_model


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn import grid_search
from sklearn.metrics import log_loss
from sklearn.cross_validation import train_test_split

from sklearn.metrics import roc_auc_score, log_loss
from sklearn import metrics









    



/home/analyst/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/analyst/anaconda3/lib/python3.6/site-packages/sklearn/grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)



In [2]:

    
train = pd.read_csv('numerai_training_data.csv')
test = pd.read_csv('numerai_tournament_data.csv')
example = pd.read_csv('example_predictions.csv')



In [3]:

    
id_test = test.id
id_test[0:5]
target = train.target



In [4]:

    
test.drop(['id', 'target','data_type'], axis=1, inplace = True )
train.drop(['target', 'id', 'data_type', ], axis=1, inplace = True)



In [5]:

    
train.head()









    Out[5]:







  
    
      
      era
      feature1
      feature2
      feature3
      feature4
      feature5
      feature6
      feature7
      feature8
      feature9
      ...
      feature41
      feature42
      feature43
      feature44
      feature45
      feature46
      feature47
      feature48
      feature49
      feature50
    
  
  
    
      0
      era1
      0.55098
      0.42673
      0.40180
      0.44622
      0.68562
      0.45346
      0.24763
      0.61223
      0.52564
      ...
      0.53232
      0.55924
      0.64714
      0.62358
      0.40199
      0.51210
      0.42287
      0.33241
      0.54669
      0.55408
    
    
      1
      era1
      0.32694
      0.37829
      0.38716
      0.41725
      0.50691
      0.38413
      0.61237
      0.40076
      0.52302
      ...
      0.46287
      0.29351
      0.57591
      0.40191
      0.60666
      0.53842
      0.52236
      0.55653
      0.26194
      0.33737
    
    
      2
      era1
      0.45440
      0.45144
      0.55052
      0.64551
      0.63833
      0.51962
      0.34126
      0.57061
      0.44524
      ...
      0.40919
      0.69339
      0.44649
      0.58797
      0.51314
      0.42471
      0.31818
      0.61949
      0.66547
      0.57674
    
    
      3
      era1
      0.64494
      0.60252
      0.43466
      0.60305
      0.52179
      0.42805
      0.59592
      0.33314
      0.48087
      ...
      0.40814
      0.19793
      0.45149
      0.56702
      0.31475
      0.42197
      0.38904
      0.59570
      0.42558
      0.44277
    
    
      4
      era1
      0.39060
      0.62302
      0.73704
      0.58155
      0.42124
      0.54693
      0.51778
      0.35616
      0.37648
      ...
      0.37677
      0.64195
      0.22353
      0.48502
      0.46545
      0.52634
      0.32485
      0.62126
      0.51344
      0.55221
    
  

5 rows × 51 columns



In [6]:

    
test.head()









    Out[6]:







  
    
      
      era
      feature1
      feature2
      feature3
      feature4
      feature5
      feature6
      feature7
      feature8
      feature9
      ...
      feature41
      feature42
      feature43
      feature44
      feature45
      feature46
      feature47
      feature48
      feature49
      feature50
    
  
  
    
      0
      era86
      0.28523
      0.52729
      0.60784
      0.43518
      0.32576
      0.63765
      0.44005
      0.49780
      0.53072
      ...
      0.53073
      0.35296
      0.46170
      0.50857
      0.40087
      0.55512
      0.65612
      0.60729
      0.37915
      0.46449
    
    
      1
      era86
      0.38658
      0.57589
      0.36267
      0.36722
      0.52405
      0.54712
      0.63671
      0.40546
      0.68287
      ...
      0.57903
      0.26381
      0.56040
      0.36975
      0.50206
      0.59444
      0.65968
      0.42385
      0.33500
      0.35268
    
    
      2
      era86
      0.33371
      0.54650
      0.49027
      0.40156
      0.43806
      0.47818
      0.55603
      0.27314
      0.42985
      ...
      0.40163
      0.34934
      0.46677
      0.28978
      0.63833
      0.70284
      0.46035
      0.49885
      0.29836
      0.52302
    
    
      3
      era86
      0.29859
      0.35833
      0.47076
      0.38464
      0.52346
      0.48471
      0.55128
      0.45734
      0.62827
      ...
      0.66385
      0.44130
      0.55330
      0.37002
      0.42181
      0.45798
      0.54207
      0.58005
      0.48274
      0.44154
    
    
      4
      era86
      0.60599
      0.69024
      0.80057
      0.52854
      0.43206
      0.61740
      0.44755
      0.36829
      0.50066
      ...
      0.26655
      0.39515
      0.32171
      0.62867
      0.53641
      0.54626
      0.67708
      0.67124
      0.33609
      0.44527
    
  

5 rows × 51 columns



In [7]:

    
from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()

shapeTrain = train.shape[0]
shapeTest = test.shape[0]
train = train.append(test)

eratrain = train.era.str.get_dummies()
eratrain = pd.concat((eratrain,train),axis=1) 

#toTransform = ['era']
#for f in toTransform: 
#    lbl = preprocessing.LabelEncoder() 
#    lbl.fit(list(eratrain[f].values)) 
#    eratrain[f] = lbl.transform(list(eratrain[f].values))
  
test = eratrain[shapeTrain:shapeTrain+shapeTest]
train = eratrain[0:shapeTrain]



In [ ]:



In [8]:

    
eratrain.head()









    Out[8]:







  
    
      
      era1
      era10
      era11
      era12
      era13
      era14
      era15
      era16
      era17
      era18
      ...
      feature41
      feature42
      feature43
      feature44
      feature45
      feature46
      feature47
      feature48
      feature49
      feature50
    
  
  
    
      0
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0.53232
      0.55924
      0.64714
      0.62358
      0.40199
      0.51210
      0.42287
      0.33241
      0.54669
      0.55408
    
    
      1
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0.46287
      0.29351
      0.57591
      0.40191
      0.60666
      0.53842
      0.52236
      0.55653
      0.26194
      0.33737
    
    
      2
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0.40919
      0.69339
      0.44649
      0.58797
      0.51314
      0.42471
      0.31818
      0.61949
      0.66547
      0.57674
    
    
      3
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0.40814
      0.19793
      0.45149
      0.56702
      0.31475
      0.42197
      0.38904
      0.59570
      0.42558
      0.44277
    
    
      4
      1
      0
      0
      0
      0
      0
      0
      0
      0
      0
      ...
      0.37677
      0.64195
      0.22353
      0.48502
      0.46545
      0.52634
      0.32485
      0.62126
      0.51344
      0.55221
    
  

5 rows × 149 columns



In [ ]:



In [9]:

    
train.columns









    Out[9]:





Index(['era1', 'era10', 'era11', 'era12', 'era13', 'era14', 'era15', 'era16',
       'era17', 'era18',
       ...
       'feature41', 'feature42', 'feature43', 'feature44', 'feature45',
       'feature46', 'feature47', 'feature48', 'feature49', 'feature50'],
      dtype='object', length=149)



In [25]:

    
train = train.apply(pd.to_numeric, errors='coerce')
target = target.apply(pd.to_numeric, errors='coerce')


train.fillna(0, inplace=True)
train.fillna(0, inplace=True)

X = train
y = target


X_fit, X_eval, y_fit, y_eval= train_test_split(
    X, y, test_size=0.10, random_state=1
)



In [26]:

    
X_fit['era21'].dtypes









    Out[26]:





dtype('int64')



In [27]:

    
rf = RandomForestClassifier(n_estimators=301, n_jobs=5, max_depth =8)
rf.fit(X_fit, y_fit)
print ('RandomForestClassifier ', (log_loss(y_eval, rf.predict_proba(X_eval))))









    



RandomForestClassifier  0.691705044448



In [28]:

    
submission = pd.DataFrame({"id":id_test, "probability":rf.predict_proba(test)[:,1]})
submission.to_csv("submission_rf_5(n-300, defth=9 with eradummies).csv", index=False)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-28-2ba07938d919> in <module>()
----> 1 submission = pd.DataFrame({"id":id_test, "probability":rf.predict_proba(test)[:,1]})
      2 submission.to_csv("submission_rf_5(n-300, defth=9 with eradummies).csv", index=False)

~/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py in predict_proba(self, X)
    576         check_is_fitted(self, 'estimators_')
    577         # Check data
--> 578         X = self._validate_X_predict(X)
    579 
    580         # Assign chunk of trees to jobs

~/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/forest.py in _validate_X_predict(self, X)
    355                                  "call `fit` before exploiting the model.")
    356 
--> 357         return self.estimators_[0]._validate_X_predict(X, check_input=True)
    358 
    359     @property

~/anaconda3/lib/python3.6/site-packages/sklearn/tree/tree.py in _validate_X_predict(self, X, check_input)
    371         """Validate X whenever one tries to predict, apply, predict_proba"""
    372         if check_input:
--> 373             X = check_array(X, dtype=DTYPE, accept_sparse="csr")
    374             if issparse(X) and (X.indices.dtype != np.intc or
    375                                 X.indptr.dtype != np.intc):

~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, warn_on_dtype, estimator)
    431                                       force_all_finite)
    432     else:
--> 433         array = np.array(array, dtype=dtype, order=order, copy=copy)
    434 
    435         if ensure_2d:

ValueError: could not convert string to float: 'eraX'



In [ ]:

    
lr = LogisticRegression()
lrCV = LogisticRegression()
lrCV.fit(X_fit, y_fit)

### write log_reg default
logloss_train = log_loss(y_fit, lrCV.predict_proba(X_fit))
logloss_val = log_loss(y_eval, lrCV.predict_proba(X_eval))

print ('logloss_train: ', logloss_train)
print ('logloss_val: ', logloss_val)


submission = pd.DataFrame({"id":id_test, "probability":lrCV.predict_proba(test)[:,1]})
submission.to_csv("submission_logreg.csv", index=False)

	era	feature1	feature2	feature3	feature4	feature5	feature6	feature7	feature8	feature9	...	feature41	feature42	feature43	feature44	feature45	feature46	feature47	feature48	feature49	feature50
0	era1	0.55098	0.42673	0.40180	0.44622	0.68562	0.45346	0.24763	0.61223	0.52564	...	0.53232	0.55924	0.64714	0.62358	0.40199	0.51210	0.42287	0.33241	0.54669	0.55408
1	era1	0.32694	0.37829	0.38716	0.41725	0.50691	0.38413	0.61237	0.40076	0.52302	...	0.46287	0.29351	0.57591	0.40191	0.60666	0.53842	0.52236	0.55653	0.26194	0.33737
2	era1	0.45440	0.45144	0.55052	0.64551	0.63833	0.51962	0.34126	0.57061	0.44524	...	0.40919	0.69339	0.44649	0.58797	0.51314	0.42471	0.31818	0.61949	0.66547	0.57674
3	era1	0.64494	0.60252	0.43466	0.60305	0.52179	0.42805	0.59592	0.33314	0.48087	...	0.40814	0.19793	0.45149	0.56702	0.31475	0.42197	0.38904	0.59570	0.42558	0.44277
4	era1	0.39060	0.62302	0.73704	0.58155	0.42124	0.54693	0.51778	0.35616	0.37648	...	0.37677	0.64195	0.22353	0.48502	0.46545	0.52634	0.32485	0.62126	0.51344	0.55221

	era	feature1	feature2	feature3	feature4	feature5	feature6	feature7	feature8	feature9	...	feature41	feature42	feature43	feature44	feature45	feature46	feature47	feature48	feature49	feature50
0	era86	0.28523	0.52729	0.60784	0.43518	0.32576	0.63765	0.44005	0.49780	0.53072	...	0.53073	0.35296	0.46170	0.50857	0.40087	0.55512	0.65612	0.60729	0.37915	0.46449
1	era86	0.38658	0.57589	0.36267	0.36722	0.52405	0.54712	0.63671	0.40546	0.68287	...	0.57903	0.26381	0.56040	0.36975	0.50206	0.59444	0.65968	0.42385	0.33500	0.35268
2	era86	0.33371	0.54650	0.49027	0.40156	0.43806	0.47818	0.55603	0.27314	0.42985	...	0.40163	0.34934	0.46677	0.28978	0.63833	0.70284	0.46035	0.49885	0.29836	0.52302
3	era86	0.29859	0.35833	0.47076	0.38464	0.52346	0.48471	0.55128	0.45734	0.62827	...	0.66385	0.44130	0.55330	0.37002	0.42181	0.45798	0.54207	0.58005	0.48274	0.44154
4	era86	0.60599	0.69024	0.80057	0.52854	0.43206	0.61740	0.44755	0.36829	0.50066	...	0.26655	0.39515	0.32171	0.62867	0.53641	0.54626	0.67708	0.67124	0.33609	0.44527