notebook.community

Edit and run



In [1]:

    
from harness import Harness

import sklearn.datasets
import sklearn.discriminant_analysis
import sklearn.mixture
import sklearn.model_selection

from pandas import ( 
    CategoricalIndex, DataFrame, Index, Series
)

from IPython.display import (
    Markdown,
)



In [2]:

    
"""Can I load the iris data"""
# Load scikit-learn's bundled iris dataset. Later cells read its 'data',
# 'target', 'feature_names' and 'target_names' entries.
iris = sklearn.datasets.load_iris()



In [3]:

    
"""Can I initialize a Harness DataFrame"""
# Wrap the iris measurements in a Harness frame. The rows are indexed by the
# numeric class label, and that same index level is named as the
# `feature_level` handed to Harness alongside the estimator.
df = Harness(
    data=iris['data'],
    index=CategoricalIndex(iris['target']).rename('target'),
    columns=iris['feature_names'],
    estimator=sklearn.discriminant_analysis.LinearDiscriminantAnalysis(),
    feature_level='target',
)

# Derive two extra index levels from the categorical `target` index:
# the species name and a display colour per class.
target_level = df.index
target_name_level = (
    target_level
    .rename_categories(iris['target_names'])
    .rename('target_name')
)
color_level = (
    target_level
    .rename_categories(['red', 'green', 'blue'])
    .rename('color')
)

# Append both levels so each row is keyed by (target, target_name, color).
df = (
    df
    .set_index(target_name_level, append=True)
    .set_index(color_level, append=True)
)

df.sample(5)









    Out[3]:






  
    
      
      
      
      sepal length (cm)
      sepal width (cm)
      petal length (cm)
      petal width (cm)
    
    
      target
      target_name
      color
      
      
      
      
    
  
  
    
      2
      virginica
      blue
      6.4
      2.8
      5.6
      2.1
    
    
      blue
      6.9
      3.2
      5.7
      2.3
    
    
      blue
      6.7
      3.3
      5.7
      2.5
    
    
      blue
      5.7
      2.5
      5.0
      2.0
    
    
      0
      setosa
      red
      5.4
      3.9
      1.3
      0.4



In [4]:

    
# Take the first fold of a 2-way stratified split, which holds out half of the
# rows. `split()` yields (train_indices, test_indices) pairs, so despite the
# variable's name position 0 holds the training rows and position 1 the test
# rows.
test_train = next(
    sklearn.model_selection
    .StratifiedKFold(n_splits=2)
    .split(df.values, df.Index('target'))
)
train_rows, test_rows = test_train

# Label every row by position as 'train' or 'test'.
split = DataFrame(index=df.index, columns=['split'])
split.iloc[train_rows] = 'train'
split.iloc[test_rows] = 'test'

# Append the label as an index level and swap it with the first level, so a
# split can be selected with a single leading key such as df.loc['train'].
df = df.set_index(split.set_index('split', append=True).swaplevel(-1,0).index)

# `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent.
# Fit the estimator on the training rows only and preview a few of them.
df.loc['train'].fit().sample(5)









    Out[4]:






  
    
      
      
      
      sepal length (cm)
      sepal width (cm)
      petal length (cm)
      petal width (cm)
    
    
      target_name
      color
      target
      
      
      
      
    
  
  
    
      setosa
      red
      0
      4.7
      3.2
      1.6
      0.2
    
    
      0
      5.0
      3.3
      1.4
      0.2
    
    
      versicolor
      green
      1
      6.7
      3.0
      5.0
      1.7
    
    
      1
      5.7
      2.9
      4.2
      1.3
    
    
      virginica
      blue
      2
      6.9
      3.1
      5.4
      2.1



In [5]:

    
from sklearn.metrics import confusion_matrix



In [6]:

    
from sklearn import preprocessing, decomposition, discriminant_analysis



In [7]:

    
# Chain the preprocessing stages: robust scaling, then PCA, each applied with
# fit_transform, and finally swap the estimator back to LDA for classification.
# NOTE(review): both fit_transform calls run on the full frame, so the scaler
# and PCA are fitted on the test rows as well as the training rows. Confirm
# that is intended.
# NOTE(review): this cell rebinds `df` to its own transformed output, so
# running it twice applies the scaling and PCA twice.
df = (
    df
    .set_params(estimator=preprocessing.RobustScaler())
    .fit_transform()
    .set_params(estimator=decomposition.PCA())
    .fit_transform()
    .set_params(estimator=discriminant_analysis.LinearDiscriminantAnalysis())
)
# `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent.
# Fit LDA on the training rows only.
df.loc['train'].fit().sample(2)









    Out[7]:






  
    
      
      
      
      0
      1
      2
      3
    
    
      target_name
      color
      target
      
      
      
      
    
  
  
    
      setosa
      red
      0
      1.436901
      -0.033947
      -0.002798
      -0.055323
    
    
      virginica
      blue
      2
      -1.228911
      -0.415372
      -0.009152
      -0.112343



In [8]:

    
# Project every row with the current estimator, then attach the predicted
# class as an extra 'prediction' index level next to the true labels.
projected = df.transform()
predicted_level = CategoricalIndex(df.predict()[0], name='prediction')

projected.set_index(predicted_level, append=True).sample(2)









    Out[8]:






  
    
      
      
      
      
      
      0
      1
    
    
      split
      target_name
      color
      target
      prediction
      
      
    
  
  
    
      test
      versicolor
      green
      1
      1
      -1.184620
      -1.742177
    
    
      train
      setosa
      red
      0
      0
      7.017018
      0.623815



In [9]:

    
# Score the estimator on the training rows.
# `.ix` was removed in pandas 1.0; `.loc` is the label-based equivalent.
df.loc['train'].score()









    Out[9]:





0.97333333333333338



In [10]:

    
# Register a template named 'scoreboard' that reports the estimator's class
# name plus the score and row count of each split. The template expressions
# are evaluated against `df`, so they must use `.loc`: `.ix` was removed in
# pandas 1.0 and would fail when the template is rendered.
df.add_template(
scoreboard="""
The table presents some information about the `{{
    df.estimator.__str__().split('(',1)[0]
}}` model applied to the `iris` dataset.

|         |           Test          |           Training       |
|---------|-------------------------|--------------------------|
| Score   |{{df.loc['test'].score()}}|{{df.loc['train'].score()}}|
| Samples | {{df.loc['test'] | len}} | {{df.loc['train'] | len}} |
"""
);



In [11]:

    
# Fetch the 'scoreboard' template registered with add_template and display it
# as Markdown (the output below shows the placeholders already filled in).
Markdown(df.get_template('scoreboard'))









    Out[11]:




The table presents some information about the `LinearDiscriminantAnalysis` model applied to the `iris` dataset.

|         | Test           | Training       |
|---------|----------------|----------------|
| Score   | 0.986666666667 | 0.973333333333 |
| Samples | 75             | 75             |



In [12]:

    
# Keep the estimator's projection of every row (both splits) for plotting in
# the next cell, and preview two random rows of it.
transformed = df.transform()
transformed.sample(2)









    Out[12]:






  
    
      
      
      
      
      0
      1
    
    
      split
      target_name
      color
      target
      
      
    
  
  
    
      train
      virginica
      blue
      2
      -5.364102
      -0.009331
    
    
      setosa
      red
      0
      7.341003
      -0.791289



In [13]:

    
# Flatten the index levels into ordinary columns so they can be referenced by
# name, then open a plotting data source over the frame: columns 0 and 1 are
# mapped to x/y, 'color' to the fill colour and 'target_name' to the text.
# NOTE(review): DataSource/Scatter/save come from Harness, not pandas —
# confirm the exact semantics of the keyword mappings against that library.
with transformed.reset_index().DataSource(
    x=0, y=1, fill_color='color', text='target_name'
) as source:
    # presumably draws one glyph per row, styled by the 'text' and 'split'
    # columns — TODO confirm
    source.Scatter(color='text', marker='split')
    source.save()

			sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)
target	target_name	color
2	virginica	blue	6.4	2.8	5.6	2.1
		blue	6.9	3.2	5.7	2.3
		blue	6.7	3.3	5.7	2.5
		blue	5.7	2.5	5.0	2.0
0	setosa	red	5.4	3.9	1.3	0.4

					0	1
split	target_name	color	target	prediction
test	versicolor	green	1	1	-1.184620	-1.742177
train	setosa	red	0	0	7.017018	0.623815