In [1]:
# Load the Numerai training set and split into features / target.
# NOTE(review): no import cell is visible in this chunk — assumes
# `import pandas as pd` (and `numpy as np` for later cells) ran earlier.
df = pd.read_csv('numerai_training_data.csv')
df.sample(5)

# X: all feature columns; y: the binary `target` label.
X = df.drop('target', axis=1)
y = df['target']

# Python 3 fix: `print X.shape` is a SyntaxError on Python 3.
print(X.shape)


(136573, 50)

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Baseline model: plain logistic regression.
clf = LogisticRegression()

# 4-fold CV; sklearn's 'neg_log_loss' is negated log loss (higher = better),
# so the mean is negated back below to report the usual log loss.
scores = cross_val_score(clf, X, y, scoring='neg_log_loss', cv=4, n_jobs=4)
# Python 3 fix: print statements converted to the print() function.
print(scores)
print(-np.mean(scores))


[-0.69121732 -0.69219839 -0.69171326 -0.69104339]
0.69154308969

In [3]:
from xgboost import XGBClassifier

# Gradient-boosted trees via XGBoost, default depth/learning rate,
# 100 boosting rounds — same CV protocol as the logistic baseline.
clf = XGBClassifier(n_estimators=100)

# 'neg_log_loss' is negated log loss; negate the mean back for reporting.
scores = cross_val_score(clf, X, y, scoring='neg_log_loss', cv=4, n_jobs=4)
# Python 3 fix: print statements converted to the print() function.
print(scores)
print(-np.mean(scores))


[-0.69176224 -0.69260348 -0.69211006 -0.69157048]
0.692011566601

In [4]:
from sklearn.ensemble import GradientBoostingClassifier

# sklearn's own gradient boosting with 100 estimators, for comparison
# against XGBoost under the identical CV protocol.
clf = GradientBoostingClassifier(n_estimators=100)

# 'neg_log_loss' is negated log loss; negate the mean back for reporting.
scores = cross_val_score(clf, X, y, scoring='neg_log_loss', cv=4, n_jobs=4)
# Python 3 fix: print statements converted to the print() function.
print(scores)
print(-np.mean(scores))


[-0.69175972 -0.69270824 -0.69236938 -0.6916588 ]
0.692124033941