In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
%%file utils.py
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
pd.options.mode.chained_assignment = None
# Centralized validation error messages used by X_y_split.
# Messages containing "%s" are filled in with the offending column names.
error_messages = {
"No clear target in training data":
("The training data must have "
"exactly one more column than "
"the test data."),
"Training data has too many columns":
("The training data has more "
"than one column different than "
"the testing data: %s"),
"Column names inconsistent":
("The training columns and the "
"test columns must have "
"identical names except for "
"the target variable. "
"Different columns: %s")
}
def X_y_split(X_train, X_test):
    """
    Infer which column of the training data is the
    target by comparing the training columns against
    the test columns. Returns the features and the
    target from the training set as an (X, y) tuple.

    Example usage:
        X, y = learn.X_y_split(X_train, X_test)

    Parameters
    ----------
    X_train: pandas dataframe
        The data that has the target in it.
    X_test: pandas dataframe
        The data that does not have the target in it.

    Raises
    ------
    ValueError
        If the column counts or column names of the two
        frames are inconsistent with a single target.
    """
    # Work on a copy so the caller's frame is not mutated by pop() below.
    X_train = X_train.copy()
    # The training frame must have exactly one extra column: the target.
    if X_train.shape[1] != X_test.shape[1] + 1:
        raise ValueError(error_messages["No clear target in training data"])
    train_only = set(X_train.columns) - set(X_test.columns)
    if len(train_only) > 1:
        msg = error_messages["Training data has too many columns"]
        raise ValueError(msg % str(train_only))
    test_only = set(X_test.columns) - set(X_train.columns)
    if test_only:
        msg = error_messages["Column names inconsistent"]
        raise ValueError(msg % str(test_only))
    # Exactly one candidate remains; that column is the target.
    target_name = train_only.pop()
    y = X_train.pop(target_name)
    return X_train, y
def make_data(source="boston",
              missing_data=None,
              categorical=None,
              outliers=None):
    """
    Utility function to assist in loading different
    sample datasets. Returns training data (that
    contains the target) and testing data (that
    does not contain the target).

    Parameters
    ----------
    source: string, optional (default="boston")
        The specific dataset to load. Options:
        - Regression: "boston", "diabetes"
        - Classification: "cancer", "digits", "iris"
    missing_data: bool or NoneType (default=None)
        To be implemented
        Determines if there is missing data
    categorical: bool or NoneType (default=None)
        To be implemented
        Determines if there is categorical data
    outliers: bool or NoneType (default=None)
        To be implemented
        Determines if there are outliers in the dataset

    Raises
    ------
    ValueError
        If `source` is not one of the recognized dataset names.
    """
    if source == "boston":
        data = datasets.load_boston()
    elif source == "diabetes":
        data = datasets.load_diabetes()
        # Replace the dataset's feature names with generic f0..f9 labels.
        data["feature_names"] = ["f{}".format(v)
                                 for v in range(10)]
    elif source == "cancer":
        data = datasets.load_breast_cancer()
    elif source == "digits":
        data = datasets.load_digits()
        # digits has 64 pixel features; give them generic f0..f63 labels.
        data["feature_names"] = ["f{}".format(v)
                                 for v in range(64)]
    elif source == "iris":
        data = datasets.load_iris()
    else:
        # Previously an unknown source fell through to a NameError on
        # `data`; fail fast with an explicit error instead.
        raise ValueError("Unknown dataset source: %r" % source)
    X = pd.DataFrame(data=data.data,
                     columns=data.feature_names)
    y = pd.Series(data=data.target)
    X_train, X_test, y_train, _ = train_test_split(X,
                                                   y,
                                                   test_size=.5,
                                                   random_state=42)
    # Attach the target to the training frame only; the test frame is
    # returned without it so X_y_split can recover the target name.
    X_train["target"] = y_train
    return X_train, X_test
def is_classification_problem(y, max_classes="auto"):
"""
Check if a target variable is a classification
problem or a regression problem. Returns True if
classification and False if regression. On failure,
raises a ValueError.
Parameters
----------
y: array-like
This should be the target variable. Ideally,
you should convert it to be numeric before
using this function.
max_classes: int or float, optional (default="auto")
Determines the max number of unique target values
there can be for classification problems
If "auto" - sets it equal to 10% of the dataset or
100, whichever is smaller
If float - interprets as percent of dataset size
If int - interprets as number of classes
"""
y = pd.Series(y)
n = len(y)
n_unique = len(y.unique())
if max_classes == "auto":
n_max_classes = int(n*.1)
max_classes = min(n_max_classes, 100)
if isinstance(max_classes, float):
n_max_classes = int(n*max_classes)
max_classes = min(n_max_classes, int(n/2))
# If y is numeric
if y.dtype.kind in 'bifc':
# If there are more than max_classes
# classify as a regression problem
if n_unique > max_classes:
return False
# If there are floating point numbers
# classify as a regression problem
decimals = (y - y.astype(int)).mean()
if decimals > .01:
return False
if n_unique <= max_classes:
return True
try:
y.astype(float)
return False
except ValueError:
msg = ("Malformed target data. "
"Target is non-numeric "
"and there are more "
"unique values than allowed "
"by max_classes")
raise ValueError(msg)
In [3]:
%%file ../tests/test_utils.py
import unittest
from learn import utils
class TestUtils(unittest.TestCase):
    """Tests for the data-loading and target-detection utilities."""

    def test_making_data_simple(self):
        for source in ("boston", "iris"):
            X_train, X_test = utils.make_data(source=source)
            # Training data should have exactly one additional column
            self.assertEqual(len(X_train.columns), len(X_test.columns) + 1)
            # Ensure only one column name is different
            extra = set(X_train.columns) - set(X_test.columns)
            self.assertEqual(1, len(extra))

    def test_is_classification_problem(self):
        # Shorten function name
        icp = utils.is_classification_problem
        # Regression because floats
        self.assertEqual(icp([1.1, 2.1]), 0)
        # Regression because number of unique
        self.assertEqual(icp([1, 2, 3, 4]), 0)
        # Classification because words
        self.assertEqual(icp(["cat"] * 20 + ["dog"] * 20), 1)
        # Classification because number of uniques
        self.assertEqual(icp([0] * 20 + [1] * 20), 1)
        # Real data tests: regression datasets expect 0,
        # classification datasets expect 1.
        expectations = {"boston": 0, "diabetes": 0,
                        "cancer": 1, "digits": 1, "iris": 1}
        for source, expected in expectations.items():
            data = utils.make_data(source=source)
            _, y = utils.X_y_split(*data)
            self.assertEqual(icp(y), expected)
# class TestXYSplit(unittest.TestCase):
# pass
# Allow running this test module directly as a script.
if __name__ == '__main__':
    unittest.main()
In [4]:
%%file forall.py
import pandas as pd
from sklearn.preprocessing import label_binarize
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from learn import utils
class Regression():
    """Random-forest regressor scored via out-of-bag R2."""

    def __init__(self, time_to_compute=None):
        # Time budget placeholder; not used by fit/predict yet.
        self.time_to_compute = time_to_compute

    def fit(self, X, y):
        """Fit the forest and record its out-of-bag R2 score."""
        forest = RandomForestRegressor(n_estimators=100,
                                       oob_score=True)
        forest.fit(X, y)
        self.model = forest
        # Out-of-bag predictions give a held-out-style estimate
        # without a separate validation split.
        self.oob_predictions = forest.oob_prediction_
        self.score_type = "R2"
        self.score = r2_score(y, self.oob_predictions)
        return self

    def predict(self, X):
        """Predict with the fitted forest."""
        return self.model.predict(X)
class Classification():
    """Random-forest classifier scored via out-of-bag AUC."""

    def __init__(self, time_to_compute=None):
        """
        Parameters
        ----------
        time_to_compute: optional time budget; currently unused.
        """
        self.time_to_compute = time_to_compute

    def fit(self, X, y):
        """
        Fit the forest and record an out-of-bag AUC score.

        Currently y must be numeric. Wrap
        LabelVectorizer as TODO.
        """
        y = pd.Series(y)
        self.n_classes = len(y.unique())
        model = RandomForestClassifier(n_estimators=100,
                                       oob_score=True)
        model.fit(X, y)
        self.model = model
        # Evaluation metrics
        if self.n_classes == 2:
            # Binary: score on the positive-class OOB probability.
            self.oob_predictions = model.oob_decision_function_[:, 1]
            self.score_type = "AUC"
            self.score = roc_auc_score(y, self.oob_predictions)
        else:
            # Multiclass: one-hot the labels and score against the
            # full OOB probability matrix. `classes` must be passed
            # as a keyword argument: scikit-learn deprecated positional
            # arguments here in 0.23 and removed them in 1.0, so the
            # old positional call raises a TypeError on current versions.
            self.oob_predictions = model.oob_decision_function_
            self.score_type = "AUC"
            y_bin = label_binarize(y, classes=sorted(y.unique()))
            self.score = roc_auc_score(y_bin,
                                       self.oob_predictions)
        return self

    def predict(self, X):
        """Predict class labels with the fitted forest."""
        predictions = self.model.predict(X)
        return predictions
class All():
    """Auto-dispatching model: picks Classification or Regression
    based on the target variable, then delegates to it."""

    def __init__(self, time_to_compute=None):
        # Time budget placeholder; not used by fit/predict yet.
        self.time_to_compute = time_to_compute

    def fit(self, X, y):
        """Detect the problem type, fit the matching model, and
        copy its score up for convenient access."""
        self.classification = utils.is_classification_problem(y)
        model = Classification() if self.classification else Regression()
        model.fit(X, y)
        self.model = model
        self.score = model.score
        self.score_type = model.score_type
        return self

    def predict(self, X):
        """Delegate prediction to the fitted underlying model."""
        return self.model.predict(X)
In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import os
import sys
# Allows importing of local modules
# Make the parent directory importable so the local `learn`
# package can be found from the notebook's working directory.
module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)
from learn import forall as fa
from learn import utils
In [6]:
# Run the full pipeline over every bundled sample dataset and report
# the out-of-bag score that was computed during fitting.
for dataset in ["boston", "diabetes", "cancer", "digits", "iris"]:
    # In the flask app:
    train_df, test_df = utils.make_data(source=dataset)
    X, y = utils.X_y_split(X_train=train_df, X_test=test_df)
    estimator = fa.All()
    estimator.fit(X, y)
    predictions = estimator.predict(test_df)
    print("%s: %.3f (%s)" % (dataset, estimator.score, estimator.score_type))