In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%%file utils.py

import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

pd.options.mode.chained_assignment = None

# Error-message templates keyed by a short failure description.
# Values containing "%s" are %-formatted with the offending column
# names before being raised as ValueError.
error_messages = {
    "No clear target in training data": 
        ("The training data must have " 
         "exactly one more column than " 
         "the test data."),
    "Training data has too many columns":
        ("The training data has more "
         "than one column different than "
         "the testing data: %s"),
    "Column names inconsistent":
        # Fixed typo: "excepts for the target variables" -> "except
        # for the target variable".
        ("The training columns and the "
         "test columns must have "
         "identical names except for "
         "the target variable. "
         "Different columns: %s")
    }

def X_y_split(X_train, X_test):
    """
    Identify which column of the training data is the
    target by comparing column names against the test
    data, then split the training data into features
    and target.
    
    Example usage:
    X, y = learn.X_y_split(X_train, X_test)
    
    Parameters
    ----------
    X_train: pandas dataframe
        The data that has the target in it.
    
    X_test: pandas dataframe
        The data that does not have the target in it.
    
    Returns
    -------
    tuple of (features dataframe, target series), both
    taken from the training data. The input frame is
    not mutated.
    
    Raises
    ------
    ValueError
        If the column layouts do not identify exactly
        one target column.
    """
    # Work on a copy so popping the target never mutates the caller's frame.
    X_train = X_train.copy()

    # The training data must have exactly one extra column: the target.
    if X_train.shape[1] != X_test.shape[1] + 1:
        raise ValueError(error_messages["No clear target in training data"])

    train_only = set(X_train.columns) - set(X_test.columns)
    test_only = set(X_test.columns) - set(X_train.columns)

    if len(train_only) > 1:
        template = error_messages["Training data has too many columns"]
        raise ValueError(template % str(train_only))

    if test_only:
        template = error_messages["Column names inconsistent"]
        raise ValueError(template % str(test_only))

    # Exactly one candidate remains; remove it from the copied frame.
    target_name = train_only.pop()
    y = X_train.pop(target_name)
    return X_train, y


def make_data(source="boston",
              missing_data=None, 
              categorical=None, 
              outliers=None):
    """
    Utility function to assist in loading different 
    sample datasets. Returns training data (that 
    contains the target in a column named "target")
    and testing data (that does not contain the target).
    
    Parameters
    ----------
    source: string, optional (default="boston")
        The specific dataset to load. Options:
        - Regression: "boston", "diabetes"
        - Classification: "cancer", "digits", "iris"
        
    missing_data: bool or NoneType (default=None)
        To be implemented
        Determines if there is missing data
        
    categorical: bool or NoneType (default=None)
        To be implemented
        Determines if there is categorical data
        
    outliers: bool or NoneType (default=None)
        To be implemented
        Determines if there are outliers in the dataset

    Raises
    ------
    ValueError
        If `source` is not one of the supported dataset names.
    """
    if source == "boston":
        # NOTE(review): load_boston was removed in scikit-learn 1.2 —
        # confirm the pinned sklearn version still provides it.
        data = datasets.load_boston()
    elif source == "diabetes":
        # Replace the default feature names with short uniform ones.
        data = datasets.load_diabetes()
        data["feature_names"] = ["f{}".format(v) 
                                 for v in range(10)]
    elif source == "cancer":
        data = datasets.load_breast_cancer()
    elif source == "digits":
        # Give the 64 pixel features short uniform names.
        data = datasets.load_digits()
        data["feature_names"] = ["f{}".format(v) 
                                 for v in range(64)]        
    elif source == "iris":
        data = datasets.load_iris()
    else:
        # Previously an unknown source fell through to a NameError on
        # `data`; fail fast with a clear message instead.
        raise ValueError("Unknown dataset source: %r" % (source,))
    X = pd.DataFrame(data=data.data, 
                     columns=data.feature_names)
    y = pd.Series(data=data.target)
    # Fixed seed so the train/test split is reproducible across runs.
    X_train, X_test, y_train, _ = train_test_split(X, 
                                                   y, 
                                                   test_size=.5,
                                                   random_state=42)
    X_train["target"] = y_train
    return X_train, X_test


def is_classification_problem(y, max_classes="auto"):
    """
    Check if a target variable is a classification
    problem or a regression problem. Returns True if
    classification and False if regression. On failure,
    raises a ValueError.
    
    Parameters
    ----------
    y: array-like
        This should be the target variable. Ideally, 
        you should convert it to be numeric before 
        using this function.
        
    max_classes: int or float, optional (default="auto")
        Determines the max number of unique target values
        there can be for classification problems
        
        If "auto" - sets it equal to 10% of the dataset or
            100, whichever is smaller
        If float - interprets as percent of dataset size
        If int - interprets as number of classes

    Raises
    ------
    ValueError
        If the target is non-numeric and has more unique
        values than max_classes allows.
    """
    y = pd.Series(y)
    n = len(y)
    n_unique = len(y.unique())
    # Resolve max_classes to a concrete integer threshold.
    if max_classes == "auto":
        n_max_classes = int(n*.1)
        max_classes = min(n_max_classes, 100)
    if isinstance(max_classes, float):
        n_max_classes = int(n*max_classes)
        max_classes = min(n_max_classes, int(n/2))
    # If y is numeric
    if y.dtype.kind in 'bifc':
        # If there are more than max_classes
        # classify as a regression problem
        if n_unique > max_classes:
            return False
        # If there are floating point numbers
        # classify as a regression problem.
        # BUGFIX: take the mean of the *absolute* fractional parts so
        # that positive and negative fractions (e.g. 0.5 and -0.5)
        # cannot cancel out and misclassify a float target.
        decimals = (y - y.astype(int)).abs().mean()
        if decimals > .01:
            return False
    if n_unique <= max_classes:
        return True
    # Non-numeric data that parses as float is treated as regression;
    # otherwise the target is malformed.
    try:
        y.astype(float)
        return False
    except ValueError:
        msg = ("Malformed target data. "
               "Target is non-numeric "
               "and there are more "
               "unique values than allowed "
               "by max_classes")
        raise ValueError(msg)


Overwriting utils.py

In [3]:
%%file ../tests/test_utils.py
import unittest
from learn import utils

class TestUtils(unittest.TestCase):
    """Tests for the helper functions in learn.utils."""

    def test_making_data_simple(self):
        """make_data: training frame has exactly one extra, uniquely named column."""
        for source in ["boston", "iris"]:
            X_train, X_test = utils.make_data(source=source)
            # Training data should have exactly one additional column
            self.assertEqual(X_train.shape[1], X_test.shape[1] + 1)
            # Ensure only one column name is different
            extra = set(X_train.columns) - set(X_test.columns)
            self.assertEqual(len(extra), 1)

    def test_is_classification_problem(self):
        """is_classification_problem on synthetic and real targets."""
        # Shorten function name
        icp = utils.is_classification_problem
        # Regression because floats
        self.assertEqual(icp([1.1, 2.1]), 0)
        # Regression because number of unique
        self.assertEqual(icp([1, 2, 3, 4]), 0)
        # Classification because words
        self.assertEqual(icp(["cat"] * 20 + ["dog"] * 20), 1)
        # Classification because number of uniques
        self.assertEqual(icp([0] * 20 + [1] * 20), 1)
        # Real data tests - Regression
        for dataset in ["boston", "diabetes"]:
            X, y = utils.X_y_split(*utils.make_data(source=dataset))
            self.assertEqual(icp(y), 0)
        # Real data tests - Classification
        for dataset in ["cancer", "digits", "iris"]:
            X, y = utils.X_y_split(*utils.make_data(source=dataset))
            self.assertEqual(icp(y), 1)
            
# class TestXYSplit(unittest.TestCase):
#     pass

# Run the suite when this file is executed directly (python test_utils.py).
if __name__ == '__main__':
    unittest.main()


Overwriting ../tests/test_utils.py

In [4]:
%%file forall.py

import pandas as pd
from sklearn.preprocessing import label_binarize
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, roc_auc_score
from learn import utils

class Regression():
    """Random-forest regressor scored by its out-of-bag R2."""

    def __init__(self, time_to_compute=None):
        # Budget for fitting; currently unused placeholder.
        self.time_to_compute = time_to_compute

    def fit(self, X, y):
        """Fit a 100-tree random forest and record its OOB R2 score."""
        forest = RandomForestRegressor(n_estimators=100,
                                       oob_score=True)
        forest.fit(X, y)
        self.model = forest
        # Out-of-bag predictions give a training-time quality estimate
        # without a separate validation split.
        self.oob_predictions = forest.oob_prediction_
        self.score_type = "R2"
        self.score = r2_score(y, self.oob_predictions)
        return self

    def predict(self, X):
        """Predict targets with the fitted forest."""
        return self.model.predict(X)
    
    
class Classification():
    """Random-forest classifier scored by its out-of-bag AUC."""

    def __init__(self, time_to_compute=None):
        """
        time_to_compute: budget for fitting; currently unused placeholder.
        """
        self.time_to_compute = time_to_compute
        
    def fit(self, X, y):
        """
        Fit a 100-tree random forest and record its out-of-bag AUC.

        Currently y must be numeric. Wrap 
        LabelVectorizer as TODO.
        """
        y = pd.Series(y)
        self.n_classes = len(y.unique())
        model = RandomForestClassifier(n_estimators=100, 
                                       oob_score=True)
        model.fit(X, y)
        self.model = model
        
        # Evaluation metrics
        if self.n_classes == 2:
            # Binary AUC uses the OOB probability of the positive class.
            self.oob_predictions = model.oob_decision_function_[:, 1]
            self.score_type = "AUC"
            self.score = roc_auc_score(y, self.oob_predictions)
        else:
            # Multiclass AUC needs one-hot (binarized) labels.
            self.oob_predictions = model.oob_decision_function_
            self.score_type = "AUC"
            # BUGFIX: `classes` is keyword-only in modern scikit-learn;
            # passing it positionally raises a TypeError. `y` is already
            # a Series here, so the redundant pd.Series(y) was dropped.
            y_bin = label_binarize(y, classes=sorted(y.unique()))
            self.score = roc_auc_score(y_bin, 
                                       self.oob_predictions)
        return self
        
    def predict(self, X):
        """Predict class labels with the fitted forest."""
        predictions = self.model.predict(X)
        return predictions

    
class All():
    """Auto-detects the problem type and delegates to Classification or Regression."""

    def __init__(self, time_to_compute=None):
        # Budget for fitting; currently unused placeholder.
        self.time_to_compute = time_to_compute

    def fit(self, X, y):
        """Detect the problem type, fit the matching model, and copy its score."""
        self.classification = utils.is_classification_problem(y)
        model = Classification() if self.classification else Regression()
        model.fit(X, y)
        self.model = model
        # Surface the underlying model's evaluation on this wrapper.
        self.score = model.score
        self.score_type = model.score_type
        return self

    def predict(self, X):
        """Delegate prediction to the underlying fitted model."""
        return self.model.predict(X)


Overwriting forall.py

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

import os
import sys

# Allows importing of local modules
# Add the repository root (one level up) to sys.path exactly once so
# the `learn` package can be imported from the notebook directory.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from learn import forall as fa
from learn import utils

In [6]:
# End-to-end smoke test: fit forall.All on every sample dataset and
# report its out-of-bag score (R2 for regression, AUC for classification).
for dataset in ["boston", "diabetes", "cancer", "digits", "iris"]:
    # In the flask app:
    X_train, X_test = utils.make_data(source=dataset)
    X, y = utils.X_y_split(X_train=X_train, X_test=X_test)
    model = fa.All()
    model.fit(X, y)
    predictions = model.predict(X_test)
    print("%s: %.3f (%s)" % (dataset, model.score, model.score_type))


boston: 0.814 (R2)
diabetes: 0.435 (R2)
cancer: 0.980 (AUC)
digits: 0.998 (AUC)
iris: 0.981 (AUC)