In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, log_loss
In [2]:
train = pd.read_csv('numerai_training_data.csv')
test = pd.read_csv('numerai_tournament_data.csv')
example = pd.read_csv('example_predictions.csv')
In [3]:
id_test = test.id
id_test[0:5]
target = train.target
In [4]:
test.drop(['id', 'target', 'data_type'], axis=1, inplace=True)
train.drop(['id', 'target', 'data_type'], axis=1, inplace=True)
In [5]:
train.head()
Out[5]:
In [6]:
test.head()
Out[6]:
In [7]:
# One-hot encode the era column on the combined train+test frame so both
# splits end up with the same dummy columns.
shapeTrain = train.shape[0]
shapeTest = test.shape[0]
train = pd.concat([train, test])
eratrain = train.era.str.get_dummies()
eratrain = pd.concat((eratrain, train), axis=1)
#toTransform = ['era']
#for f in toTransform:
#    lbl = preprocessing.LabelEncoder()
#    lbl.fit(list(eratrain[f].values))
#    eratrain[f] = lbl.transform(list(eratrain[f].values))
test = eratrain.iloc[shapeTrain:shapeTrain + shapeTest]
train = eratrain.iloc[0:shapeTrain]
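The same indicator columns could also be built with pd.get_dummies; a minimal sketch of that alternative (not part of the original notebook), assuming a combined DataFrame with a string 'era' column:
In [ ]:
# Sketch: equivalent era one-hot encoding via pd.get_dummies on the combined frame.
# `combined` is a hypothetical stand-in for the concatenated train+test DataFrame above.
def add_era_dummies(combined):
    era_dummies = pd.get_dummies(combined['era'])        # one 0/1 column per era value
    return pd.concat([era_dummies, combined], axis=1)    # keep original features alongside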
In [ ]:
In [8]:
eratrain.head()
Out[8]:
In [ ]:
In [9]:
train.columns
Out[9]:
In [25]:
train = train.apply(pd.to_numeric, errors='coerce')
test = test.apply(pd.to_numeric, errors='coerce')
target = pd.to_numeric(target, errors='coerce')
# The original string 'era' column turns into NaN after coercion; fill it (and
# any other missing values) with 0 so the models can consume both frames.
train.fillna(0, inplace=True)
test.fillna(0, inplace=True)
X = train
y = target
X_fit, X_eval, y_fit, y_eval= train_test_split(
X, y, test_size=0.10, random_state=1
)
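A single 90/10 hold-out can be noisy; a cross-validated log-loss estimate is a common complement. A minimal sketch, assuming X and y as defined in the cell above:
In [ ]:
# Sketch: 5-fold cross-validated log loss for a simple baseline model.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

cv_scores = cross_val_score(LogisticRegression(), X, y,
                            scoring='neg_log_loss', cv=5)
print('mean CV log loss:', -cv_scores.mean())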
In [26]:
X_fit['era21'].dtypes
Out[26]:
In [27]:
rf = RandomForestClassifier(n_estimators=301, n_jobs=5, max_depth=8)
rf.fit(X_fit, y_fit)
print('RandomForestClassifier', log_loss(y_eval, rf.predict_proba(X_eval)))
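CalibratedClassifierCV is imported at the top; a hedged sketch of how it could wrap the forest to calibrate its probabilities, reusing X_fit, y_fit, X_eval, y_eval from the split above:
In [ ]:
# Sketch: probability calibration of the random forest with isotonic regression.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

base_rf = RandomForestClassifier(n_estimators=301, max_depth=8, n_jobs=5)
calibrated_rf = CalibratedClassifierCV(base_rf, method='isotonic', cv=3)
calibrated_rf.fit(X_fit, y_fit)
print('Calibrated RF', log_loss(y_eval, calibrated_rf.predict_proba(X_eval)))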
In [28]:
submission = pd.DataFrame({"id": id_test, "probability": rf.predict_proba(test)[:, 1]})
submission.to_csv("submission_rf_5(n=301, depth=8 with eradummies).csv", index=False)
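example_predictions.csv is loaded at the top but otherwise unused; one possible use is a quick format check on the submission before uploading. A sketch, assuming the example file has the column layout Numerai expects:
In [ ]:
# Sketch: sanity-check the submission against the example predictions file.
assert list(submission.columns) == list(example.columns), 'unexpected columns'
assert len(submission) == len(example), 'unexpected number of rows'
print('submission format matches the example file')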
In [ ]:
lr = LogisticRegression()
lr.fit(X_fit, y_fit)
### baseline logistic regression with default parameters
logloss_train = log_loss(y_fit, lr.predict_proba(X_fit))
logloss_val = log_loss(y_eval, lr.predict_proba(X_eval))
print('logloss_train:', logloss_train)
print('logloss_val:', logloss_val)
submission = pd.DataFrame({"id": id_test, "probability": lr.predict_proba(test)[:, 1]})
submission.to_csv("submission_logreg.csv", index=False)