Workflow adapted from Katie Malone's Workflows in Python series
See also
sklearn.model_selection.cross_val_score function
In [40]:
import pandas as pd
import sklearn
In [41]:
# Location of the raw competition data (relative to the notebooks/ directory)
data_dir = '../data/raw/'
data_filename = 'blood_train.csv'
# Load the blood-donation training set into a DataFrame
df_blood = pd.read_csv(data_dir+data_filename)
# Preview the first ten rows to sanity-check the load
df_blood.head(10)
Out[41]:
Use `iloc` to drop the subject-id column and the prediction column (i.e. 'Made Donation in March 2007'), keeping only the feature columns. The resulting matrix is what gets passed to the `predict_proba` and `train_test_split` functions.
In [74]:
# Feature matrix: columns 1-4 (skip the subject-id column 0).
# `.as_matrix()` was deprecated and removed in pandas 1.0 — use `.to_numpy()`.
X = df_blood.iloc[:, 1:5].to_numpy()
# Target vector: whether the subject donated in March 2007 (0/1)
y = list(df_blood["Made Donation in March 2007"])
In [43]:
# Log-transform the features to reduce the right-skew of the donation
# counts/volumes; log1p(x) = log(1 + x) is well-defined at x == 0.
import numpy as np
from sklearn.preprocessing import FunctionTransformer

transformer = FunctionTransformer(np.log1p)
# Use fit_transform: scikit-learn transformers must be fitted before
# transform() is called (FunctionTransformer's fit is a no-op, but the
# API contract requires it and newer versions enforce it).
X = transformer.fit_transform(X)

# Normalize the features using StandardScaler (not applied here;
# the SVM cell below standardizes on the training split only)
#X_test = sklearn.preprocessing.StandardScaler().fit_transform(X_test)
#X_train = sklearn.preprocessing.StandardScaler().fit_transform(X_train)
In [117]:
from sklearn.model_selection import train_test_split

# Hold out half of the data for testing; fix the seed so the split
# (and every score below) is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=0,
)

print("No. Rows in training set:\t", len(X_train))
print("No. Rows in testing set:\t", len(X_test))
In [118]:
# NOTE: superseded by the train_test_split cell above; kept for reference.
# Split data into 4 partitions
# - training set
# - validation set
# - combined training & validation set
# - testing set
# nrows_total = df_blood.count()[1]
# nrows_train = int(nrows_total/3)
# nrows_valid = int(nrows_total*2/3)
# X_train, y_train = X[:nrows_train] , y[:nrows_train]
# X_valid, y_valid = X[nrows_train:nrows_valid], y[nrows_train:nrows_valid]
# X_test , y_test = X[nrows_valid:] , y[nrows_valid:]
# X_train_valid, y_train_valid = X[:nrows_valid] , y[:nrows_valid]
# print("Total number of rows:\t", nrows_total)
# print("Training rows:\t\t" , 0 ,"-", nrows_train)
# print("Validation rows:\t" , nrows_train,"-", nrows_valid)
# print("Testing rows:\t\t" ,nrows_valid ,"-" , nrows_total)
With the data loaded, transformed and split, can now pass it into different classifiers and see how they perform
Basic workflow for each classifier:
1. Instantiate the classifier into a `clf` variable. 2. Fit the training data (`X_train, y_train`) into the classifier. 3. Predict probabilities for the `X_test` data. 4. Score with the `sklearn.metrics.log_loss` function (predictions & `y_test` data).
In [119]:
from sklearn.linear_model import LogisticRegression

# Baseline: logistic regression with default hyper-parameters
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Class-membership probabilities for the held-out set
clf_probs = clf.predict_proba(X_test)

# Log-loss rewards well-calibrated probabilities; lower is better
score = sklearn.metrics.log_loss(y_test, clf_probs)
print("Log-loss score:\t", score)
In [81]:
# ElasticNet is a *regressor*: predict() returns unbounded real values,
# not probabilities.  log_loss requires values in [0, 1], so clip the
# predictions into that range before scoring (newer scikit-learn raises
# a ValueError for out-of-range inputs).
clfE = sklearn.linear_model.ElasticNet(l1_ratio=0.24)
clfE.fit(X_train, y_train)
clfE_probs = np.clip(clfE.predict(X_test), 1e-15, 1 - 1e-15)
score = sklearn.metrics.log_loss(y_test, clfE_probs)
print("Log-loss score:\t", score)
In [112]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with explicit inverse-regularization strength C=1
# (the default value, written out so it is easy to tune)
clfL = LogisticRegression(C=1)
clfL.fit(X_train, y_train)

clfL_probs = clfL.predict_proba(X_test)

score = sklearn.metrics.log_loss(y_test, clfL_probs)
print("Log-loss score:\t", score)
In [102]:
# ElasticNetCV cross-validates the regularization path, but it is still a
# regressor: predict() is unbounded, while log_loss needs values in [0, 1].
# Clip before scoring (newer scikit-learn raises otherwise).
clfECV = sklearn.linear_model.ElasticNetCV(l1_ratio=0.24)
clfECV.fit(X_train, y_train)
clfECV_probs = np.clip(clfECV.predict(X_test), 1e-15, 1 - 1e-15)
score = sklearn.metrics.log_loss(y_test, clfECV_probs)
print("Log-loss score:\t", score)
In [100]:
# Debug/inspection cell: list the attributes of the fitted ElasticNetCV
# model.  Leftover from development; safe to remove from the final notebook.
dir(clfECV)
Out[100]:
In [46]:
from sklearn.linear_model import LogisticRegression

# Load the competition test data
data_filename = 'blood_test.csv'
df_test = pd.read_csv(data_dir+data_filename)

# Transform data
# - drop the ID column (column 0), keep the four feature columns
# - convert to a NumPy array for input to `predict_proba`
#   (`.as_matrix()` was removed in pandas 1.0; `.to_numpy()` replaces it)
Z = df_test.iloc[:, 1:5].to_numpy()
# The classifier was trained on log1p-transformed features (see the
# FunctionTransformer cell above), so the same transform must be applied
# to the test features before predicting.
Z = np.log1p(Z)

# Predict the probability of donation for each subject
clf_probs = clf.predict_proba(Z)

# Add predictions (probability of the positive class) back into the frame
df_test['Made Donation in March 2007'] = clf_probs[:, 1]
df_test.head()

# Setup save filename and directory
submit_dir = '../data/processed/'
submit_filename = 'submit-logistic_regression.csv'

# Save to CSV-file using only the subject-id and prediction columns
df_test.to_csv(submit_dir+submit_filename,
               columns=('Unnamed: 0', 'Made Donation in March 2007'),
               index=False)
In [30]:
from sklearn.ensemble import RandomForestClassifier

# Uncalibrated random forest: fit on the training split,
# evaluate on the held-out test split.
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)

# Class-membership probabilities for the test split
clf_probs = clf.predict_proba(X_test)

# Evaluate the model with log-loss (lower is better)
score = sklearn.metrics.log_loss(y_test, clf_probs)
print("Log-loss score:\t", score)
In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV

# Random forest with probability calibration:
# fit the forest, then calibrate its probabilities with a sigmoid
# (Platt scaling) on top of the already-fitted model (cv="prefit").
clf = RandomForestClassifier(n_estimators=25)
clf.fit(X_train, y_train)
clf_probs = clf.predict_proba(X_test)

# Wrap the fitted forest in the calibrator
sig_clf = CalibratedClassifierCV(clf, method="sigmoid", cv="prefit")
sig_clf.fit(X_train, y_train)

# Calibrated prediction probabilities
sig_clf_probs = sig_clf.predict_proba(X_test)

# Test quality of the calibrated predictions with log-loss
sig_score = sklearn.metrics.log_loss(y_test, sig_clf_probs)
print("Log-loss score:\t", sig_score)
In [132]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler

# SVMs are sensitive to feature scale, so standardize the features first.
# Fit the scaler on the training split only and reuse it for the test
# split, to avoid leaking test-set statistics into training.
# (Fixes a NameError: X_trainNorm / X_testNorm were never defined.)
scaler = StandardScaler().fit(X_train)
X_trainNorm = scaler.transform(X_train)
X_testNorm = scaler.transform(X_test)

# probability=True enables predict_proba (fits an internal calibration,
# which makes training slower)
clf = svm.SVC(kernel='linear', probability=True)
clf.fit(X_trainNorm, y_train)

# Get prediction probabilities from model
clf_probs = clf.predict_proba(X_testNorm)

# Test quality of predictions using `log_loss` function
score = sklearn.metrics.log_loss(y_test, clf_probs)
print("Log-loss score:\t", score)
In [22]:
## sklearn.model_selection.cross_val_score — from Katie Malone's Workflows in Python:
The cheapest and easiest way to train on one portion of my dataset and test on another, and to get a measure of model quality at the same time, is to use sklearn.cross_validation.cross_val_score().
cross_val_score()
- splits data into 3 equal portions
- trains on 2 portions
- tests on the third
This process repeats 3 times. That’s why 3 numbers get printed in the code block below.
Note: `log_loss` results are reported as negative values and the scorer is labelled `neg_log_loss` for the `cross_val_score` function (scikit-learn scorers follow a higher-is-better convention). See:
In [10]:
# Rebuild the feature matrix / target for cross-validation.
# `.as_matrix()` was removed in pandas 1.0 — use `.to_numpy()`.
X = df_blood.iloc[:, 1:5].to_numpy()
y = list(df_blood["Made Donation in March 2007"])
In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Cross-validated log-loss for logistic regression; one (negated)
# score is printed per fold.
clf = LogisticRegression()
score = cross_val_score(clf, X, y, scoring="neg_log_loss")
print(score)
In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Cross-validated log-loss for a single decision tree; one (negated)
# score is printed per fold.
clf = DecisionTreeClassifier()
score = cross_val_score(clf, X, y, scoring="neg_log_loss")
print(score)
In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Cross-validated log-loss for a random forest; one (negated)
# score is printed per fold.
clf = RandomForestClassifier()
score = cross_val_score(clf, X, y, scoring="neg_log_loss")
print(score)
Examples:
Documentation
Discussion
In [ ]: