Workflow adapted from Katie Malone's Workflows in Python series
See also
sklearn.model_selection.cross_val_score
function
In [2]:
import pandas as pd
import sklearn
In [3]:
# Location of the raw training data.
data_dir = '../data/raw/'
data_filename = 'blood_train.csv'

# Read the training CSV and preview the first 10 rows.
train_csv_path = data_dir + data_filename
df_blood = pd.read_csv(train_csv_path)
df_blood.head(10)
Out[3]:
Use `iloc` to drop the subject-id column and separate out the prediction column (i.e. 'Made Donation in March 2007'). See also the `predict_proba` and `train_test_split` functions.
In [4]:
# Feature matrix: columns 1-4 (column 0 is the subject-id column).
# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
X = df_blood.iloc[:, 1:5].values
# Target labels: the column we are trying to predict.
y = list(df_blood["Made Donation in March 2007"])
In [19]:
from sklearn.model_selection import train_test_split

# 50/50 train/test split with a fixed seed for reproducibility.
# Call the imported `train_test_split` directly: the original
# `sklearn.model_selection.train_test_split` spelling only resolves as a
# side effect of the import above binding the submodule onto `sklearn`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=0)
print("No. Rows in training set:\t", len(X_train))
print("No. Rows in testing set:\t", len(X_test))
In [20]:
# NOTE(review): dead code — an earlier manual 1/3 train / 1/3 validation /
# 1/3 test split, superseded by `train_test_split` in the cell above.
# Kept commented out for reference; consider deleting this cell entirely.
# Split data into 4 partitions
# - training set
# - validation set
# - combined training & validation set
# - testing set
# nrows_total = df_blood.count()[1]
# nrows_train = int(nrows_total/3)
# nrows_valid = int(nrows_total*2/3)
# X_train, y_train = X[:nrows_train] , y[:nrows_train]
# X_valid, y_valid = X[nrows_train:nrows_valid], y[nrows_train:nrows_valid]
# X_test , y_test = X[nrows_valid:] , y[nrows_valid:]
# X_train_valid, y_train_valid = X[:nrows_valid] , y[:nrows_valid]
# print("Total number of rows:\t", nrows_total)
# print("Training rows:\t\t" , 0 ,"-", nrows_train)
# print("Validation rows:\t" , nrows_train,"-", nrows_valid)
# print("Testing rows:\t\t" ,nrows_valid ,"-" , nrows_total)
With the data loaded, transformed and split, can now pass it into different classifiers and see how they perform
Basic workflow for each classifier:
1. Assign the classifier to a `clf` variable
2. Fit the training data (`X_train`, `y_train`) into the classifier
3. Predict probabilities for the `X_test` data with `predict_proba`
4. Score the predictions with the `sklearn.metrics.log_loss` function (against the `y_test` data)
In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

# Fit a logistic-regression classifier on the training split and score
# its predicted probabilities on the test split with log-loss.
# Import `log_loss` explicitly: the original relied on
# `sklearn.metrics` being importable as an attribute of `sklearn`,
# which is only true if some other import already pulled it in.
clf = LogisticRegression()
clf.fit(X_train, y_train)
clf_probs = clf.predict_proba(X_test)
score = log_loss(y_test, clf_probs)
print("Log-loss score:\t", score)
In [22]:
from sklearn.linear_model import LogisticRegression

# Load Test Data (competition test set, no labels)
data_filename = 'blood_test.csv'
df_test = pd.read_csv(data_dir + data_filename)

# Transform data
# - drop the ID column (column 0), keep the 4 feature columns
# - `.values` replaces the deprecated `as_matrix()` (removed in pandas 1.0)
Z = df_test.iloc[:, 1:5].values

# Predict probabilities with the classifier fitted in the previous cell
clf_probs = clf.predict_proba(Z)

# Add the positive-class probability back into the test data frame
df_test['Made Donation in March 2007'] = clf_probs[:, 1]
df_test.head()

# Setup save filename and directory
submit_dir = '../data/processed/'
submit_filename = 'submit-logistic_regression.csv'

# Save to CSV-file using only the subject-id and prediction columns
df_test.to_csv(submit_dir + submit_filename,
               columns=('Unnamed: 0', 'Made Donation in March 2007'),
               index=False)
In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

# Train an uncalibrated random-forest classifier on the training split
# and evaluate its predicted probabilities on the test split.
# Use the imported names directly instead of the fragile
# `sklearn.ensemble.` / `sklearn.metrics.` attribute paths.
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)
# Get probabilities
clf_probs = clf.predict_proba(X_test)
# Evaluate the model with log-loss
score = log_loss(y_test, clf_probs)
print("Log-loss score:\t", score)
In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

# Train a random-forest classifier, then calibrate its output
# probabilities with sigmoid (Platt) scaling and evaluate on the test split.
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)
clf_probs = clf.predict_proba(X_test)

# cv="prefit" wraps the already-fitted forest; the calibrator is then
# fitted on the data passed to `fit` below.
# NOTE(review): calibrating on the same data the forest was trained on
# can give over-optimistic calibration — a held-out validation set would
# be preferable here.
sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
sig_clf.fit(X_train, y_train)
# Get calibrated prediction probabilities from the model
sig_clf_probs = sig_clf.predict_proba(X_test)
# Test quality of predictions using the `log_loss` function
sig_score = log_loss(y_test, sig_clf_probs)
print("Log-loss score:\t", sig_score)
In [ ]:
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss

# Fit ONE scaler on the training data only, then apply the same
# transform to the test data. The original fitted a second MinMaxScaler
# on the test set, which leaks test-set statistics and produces feature
# ranges inconsistent with the training data.
scaler = MinMaxScaler()
X_trainNorm = scaler.fit_transform(X_train)
X_testNorm = scaler.transform(X_test)

# Linear-kernel SVM with probability estimates enabled.
# BUG FIX: the original fitted on the unscaled `X_train` but predicted
# on the scaled `X_testNorm`; train and test must use the same scaling.
clf = svm.SVC(kernel='linear', cache_size=7000, degree=2, probability=True)
clf.fit(X_trainNorm, y_train)

# Get prediction probabilities from the model
clf_probs = clf.predict_proba(X_testNorm)

# Test quality of predictions using the `log_loss` function
score = log_loss(y_test, clf_probs)
print("Log-loss score:\t", score)
sklearn.model_selection.cross_val_score
From Katie Malone's Workflows in Python:
The cheapest and easiest way to train on one portion of my dataset and test on another, and to get a measure of model quality at the same time, is to use sklearn.cross_validation.cross_val_score() (in current scikit-learn this lives at sklearn.model_selection.cross_val_score()).
cross_val_score()
- splits data into 3 equal portions
- trains on 2 portions
- tests on the third
This process repeats 3 times. That’s why 3 numbers get printed in the code block below.
Note: `log_loss` results are negated and the metric is labelled `neg_log_loss` for the `cross_val_score` function, so higher (closer to 0) is better. See:
In [10]:
# Rebuild the full (unsplit) feature matrix and label list for
# cross-validation. `.values` replaces the deprecated `as_matrix()`
# (removed in pandas 1.0).
X = df_blood.iloc[:, 1:5].values
y = list(df_blood["Made Donation in March 2007"])
In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Cross-validated log-loss for logistic regression.
# Use the imported names directly instead of the fragile
# `sklearn.linear_model.` / `sklearn.model_selection.` attribute paths.
clf = LogisticRegression()
score = cross_val_score(
    clf,
    X, y,
    scoring="neg_log_loss")
print(score)
In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Cross-validated log-loss for a decision tree, using the imported
# names directly instead of the `sklearn.tree.` attribute path.
clf = DecisionTreeClassifier()
score = cross_val_score(
    clf,
    X, y,
    scoring="neg_log_loss")
print(score)
In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Cross-validated log-loss for a random forest, using the imported
# names directly instead of the `sklearn.ensemble.` attribute path.
clf = RandomForestClassifier()
score = cross_val_score(
    clf,
    X, y,
    scoring="neg_log_loss")
print(score)
Examples:
Documentation
Discussion
In [ ]: