In [1]:
from harness import Harness

import sklearn.datasets
import sklearn.discriminant_analysis
import sklearn.mixture
import sklearn.model_selection

from pandas import ( 
    CategoricalIndex, DataFrame, Index, Series
)

from IPython.display import (
    Markdown,
)

In [2]:
"""Can I load the iris data"""
# Load the bundled iris dataset; later cells read its 'data', 'target',
# 'feature_names' and 'target_names' entries.
iris = sklearn.datasets.load_iris()

In [3]:
"""Can I initialize a Harness DataFrame"""
# Wrap the iris measurements in a Harness, indexed by the integer class label
# and carrying an LDA estimator.
# NOTE(review): `feature_level='target'` presumably tells Harness which index
# level holds the supervised labels — confirm against the Harness docs.
df = Harness(
    data=iris['data'],
    index=CategoricalIndex(iris['target'], name='target'),
    columns=iris['feature_names'],
    estimator=sklearn.discriminant_analysis.LinearDiscriminantAnalysis(),
    feature_level='target',
)

# Both extra levels are derived from the same original 'target' index, with
# only the category labels and the level name swapped out.
target_labels = df.index
species_level = (
    target_labels
    .rename_categories(iris['target_names'])
    .rename('target_name')
)
color_level = (
    target_labels
    .rename_categories(['red', 'green', 'blue'])
    .rename('color')
)

# Append the species name and a per-class colour as additional index levels.
df = (
    df
    .set_index(species_level, append=True)
    .set_index(color_level, append=True)
)

df.sample(5)


Out[3]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
target target_name color
2 virginica blue 6.4 2.8 5.6 2.1
blue 6.9 3.2 5.7 2.3
blue 6.7 3.3 5.7 2.5
blue 5.7 2.5 5.0 2.0
0 setosa red 5.4 3.9 1.3 0.4

In [4]:
# Take only the first fold of a 2-way stratified split. `split()` yields a
# (train positions, test positions) pair, so element 0 is the training half.
# NOTE(review): `df.Index('target')` is presumably a Harness helper returning
# the values of the 'target' index level — confirm.
test_train = next(
    sklearn.model_selection
    .StratifiedKFold(n_splits=2)
    .split(df.values, df.Index('target'))
)

# Label every row with the half it fell into, using positional indexing
# because the fold arrays hold row positions rather than index labels.
split = DataFrame(index=df.index, columns=['split'])
split.iloc[test_train[0]] = 'train'
split.iloc[test_train[1]] = 'test'

# Append 'split' as an index level and swap it to the front so that a single
# label lookup on the outermost level selects one half.
df = df.set_index(split.set_index('split', append=True).swaplevel(-1,0).index)

# `.loc` replaces the deprecated `.ix` indexer (removed in pandas 1.0); for a
# string label on the outer level the selection is the same.
df.loc['train'].fit().sample(5)


Out[4]:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
target_name color target
setosa red 0 4.7 3.2 1.6 0.2
0 5.0 3.3 1.4 0.2
versicolor green 1 6.7 3.0 5.0 1.7
1 5.7 2.9 4.2 1.3
virginica blue 2 6.9 3.1 5.4 2.1

In [5]:
from sklearn.metrics import confusion_matrix

In [6]:
from sklearn import preprocessing, decomposition, discriminant_analysis

In [7]:
# Chain three estimators: robust scaling, then PCA, then swap in an LDA
# estimator that is fitted separately below.
# NOTE(review): both `fit_transform()` calls run on the whole frame (train and
# test rows together), so the scaler and PCA presumably see the test data —
# confirm this is intended.
# NOTE(review): this cell rebinds `df` to its own transformed output, so
# re-running it without re-running the earlier cells applies the steps twice.
df = (
    df
    .set_params(estimator=preprocessing.RobustScaler())
    .fit_transform()
    .set_params(estimator=decomposition.PCA())
    .fit_transform()
    .set_params(estimator=discriminant_analysis.LinearDiscriminantAnalysis())
)

# Fit the LDA on the training half only. `.loc` replaces the deprecated `.ix`
# indexer (removed in pandas 1.0).
df.loc['train'].fit().sample(2)


Out[7]:
0 1 2 3
target_name color target
setosa red 0 1.436901 -0.033947 -0.002798 -0.055323
virginica blue 2 -1.228911 -0.415372 -0.009152 -0.112343

In [8]:
# Compute the transformed frame first, then the predictions, and attach the
# predicted class as an extra 'prediction' index level next to the true labels.
projected = df.transform()
predicted_level = CategoricalIndex(df.predict()[0], name='prediction')

projected.set_index(predicted_level, append=True).sample(2)


Out[8]:
0 1
split target_name color target prediction
test versicolor green 1 1 -1.184620 -1.742177
train setosa red 0 0 7.017018 0.623815

In [9]:
# Score the estimator on the training half. `.loc` replaces the deprecated
# `.ix` indexer (removed in pandas 1.0).
df.loc['train'].score()


Out[9]:
0.97333333333333338

In [10]:
# Register a Markdown template named 'scoreboard'. The `{{ ... }}` expressions
# are evaluated against `df` when the template is rendered; they use `.loc`
# because the `.ix` indexer was removed in pandas 1.0.
# The trailing `;` suppresses the cell's output.
df.add_template(
scoreboard="""
The table presents some information about the `{{
    df.estimator.__str__().split('(',1)[0]
}}` model applied to the `iris` dataset.

|         |           Test          |           Training       |
|---------|-------------------------|--------------------------|
| Score   |{{df.loc['test'].score()}}|{{df.loc['train'].score()}}|
| Samples | {{df.loc['test'] | len}} | {{df.loc['train'] | len}} |
"""
);

In [11]:
# Fetch the 'scoreboard' template from `df` and display it as rich Markdown.
Markdown(df.get_template('scoreboard'))


Out[11]:

The table presents some information about the LinearDiscriminantAnalysis model applied to the iris dataset.

Test Training
Score 0.986666666667 0.973333333333
Samples 75 75

In [12]:
# Keep the output of `df.transform()` under its own name; the plotting cell
# below reads `transformed`.
transformed = df.transform()
# Preview two random rows rather than the whole frame.
transformed.sample(2)


Out[12]:
0 1
split target_name color target
train virginica blue 2 -5.364102 -0.009331
setosa red 0 7.341003 -0.791289

In [13]:
# Move the index levels ('split', 'target_name', 'color', ...) into ordinary
# columns so the plot can reference them by name.
flat = transformed.reset_index()

# Columns 0 and 1 of the transformed frame are used as the x/y coordinates.
# NOTE(review): `DataSource`, `Scatter` and `save` are Harness-provided plotting
# helpers; where `save()` writes its output is not visible here — confirm.
with flat.DataSource(
    x=0,
    y=1,
    fill_color='color',
    text='target_name',
) as source:
    source.Scatter(color='text', marker='split')
    source.save()