In [63]:
# Inspired from the Data Science for business, on Introduction to predictive modeling example
# dataset from https://archive.ics.uci.edu/ml/datasets/Mushroom

In [64]:
# necessary imports
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../lib_math")
import information as info
import scatter_boxplot as sbp
%matplotlib inline
import seaborn as sns
from scipy.stats.stats import pearsonr
from scipy import stats


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Data input

""" class: edible=e, poisonous=p cap-shape: bell=b, conical=c, convex=x, flat=f, knobbed=k, sunken=s cap-surface: fibrous=f, grooves=g, scaly=y, smooth=s cap-color: brown=n ,buff=b, cinnamon=c, gray=g, green=r, pink=p, purple=u, red=e, white=w, yellow=y bruises?: bruises=t, no=f odor: almond=a, anise=l, creosote=c, fishy=y, foul=f, musty=m, none=n, pungent=p, spicy=s gill-attachment: attached=a, descending=d, free=f, notched=n gill-spacing: close=c, crowded=w, distant=d gill-size: broad=b, narrow=n gill-color: black=k, brown=n, buff=b, chocolate=h, gray=g, green=r, orange=o, pink=p, purple=u, red=e, white=w, yellow=y stalk-shape: enlarging=e, tapering=t stalk-root: bulbous=b, club=c, cup=u, equal=e, rhizomorphs=z, rooted=r, missing=? stalk-surface-above-ring: fibrous=f, scaly=y, silky=k, smooth=s stalk-surface-below-ring: fibrous=f, scaly=y, silky=k, smooth=s stalk-color-above-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y stalk-color-below-ring: brown=n, buff=b, cinnamon=c, gray=g, orange=o, pink=p, red=e, white=w, yellow=y veil-type: partial=p, universal=u veil-color: brown=n, orange=o, white=w, yellow=y ring-number: none=n, one=o, two=t ring-type: cobwebby=c, evanescent=e, flaring=f, large=l, none=n, pendant=p, sheathing=s, zone=z spore-print-color: black=k, brown=n, buff=b, chocolate=h, green=r, orange=o, purple=u, white=w, yellow=y population: abundant=a, clustered=c, numerous=n, scattered=s, several=v, solitary=y habitat: grasses=g, leaves=l, meadows=m, paths=p, urban=u, waste=w, woods=d """

In [65]:
#https://archive.ics.uci.edu/ml/datasets/Mushroom
df = pd.read_csv('../small_data_samples/agaricus-lepiota.data')
print len(df)


8123

In [66]:
# Peek at the first three records to sanity-check the parse and the column names.
df.iloc[:3]


Out[66]:
p x s n t p.1 f c n.1 k ... s.2 w w.1 p.2 w.2 o p.3 k.1 s.3 u
0 e x s y t a f c b k ... s w w p w o p n n g
1 e b s w t l f c b n ... s w w p w o p n n m
2 p x y w t p f c n n ... s w w p w o p k s u

3 rows × 23 columns

Ranking of attributes


In [67]:
# Here we deal with a Classification Task: the target variable "p" is categorical
# and the attributes are also categorical
# => we can use the information gain

In [69]:
# Calculate the global entropy of the target variable "p" (class: edible/poisonous):
print 'global entropy is :', info.entropy(df, "p")
# If the two classes were perfectly balanced in the dataset, entropy would be exactly 1 bit.
# Since entropy ~ 1 we should not be far from a 50/50 split; count each class to see how far:
# NOTE(review): the captured output shows a stray "Asdf" line before the entropy value --
# presumably leftover debug printing inside info.entropy; verify ../lib_math/information.py.
print 'total edible: ',  len(df[df.p=='e'])
print 'total poisonous : ',  len(df[df.p=='p'])


global entropy is : Asdf
0.999061269148
total edible:  4208
total poisonous :  3915

In [75]:
# Information gain of attribute "k" (GILL-COLOR, per the attribute legend above;
# "k" is the mangled column name taken from the first record's gill-color value)
# with respect to the target "p" (class).
info.information_gain(df, "k", "p")


Out[75]:
0.4172294080316872

In [76]:
# Information gain of attribute "k.1" (SPORE-PRINT-COLOR) with respect to the
# target "p" (class).
info.information_gain(df, "k.1", "p")


Out[76]:
0.4810119039604144

In [77]:
# Information gain of attribute "p.1" (ODOR -- the original comment said
# SPORE-PRINT-COLOR, a copy-paste slip: "p.1" maps to the odor column, which is
# consistent with the conclusion below) with respect to the target "p" (class).
info.information_gain(df, "p.1", "p")


Out[77]:
0.9060569015595047

In [ ]:
# => We see that the best information gain, among these 3 features, comes from the odor.
# This is the most informative attribute.
# If we were about to build a classification tree, it should be the top node.