In [4]:
import sys
import os
import itertools
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy.stats as stats
sys.path.append(os.path.abspath('..'))
from preprocessing import shroom_dealer
To get a first look at which attribute values might distinguish edible from poisonous mushrooms, we generated bar charts comparing the frequency of each attribute value among edible and poisonous mushrooms.
The following function can be used to collect the frequencies of each attribute value for two attributes.
In [36]:
def attr_freqs(attr1, attr2):
    """Count how often each value of ``attr2`` occurs per category of ``attr1``.

    Parameters
    ----------
    attr1 : str
        Name of a categorical column. Each of its categories becomes one
        column of the result, labelled through the attribute dictionary.
    attr2 : str
        Name of the column whose values are counted; those values form the
        row index of the result.

    Returns
    -------
    pandas.DataFrame
        One column per ``attr1`` category, holding ``value_counts()`` of
        ``attr2`` restricted to the rows with that category.
    """
    df = shroom_dealer.get_data_frame()
    labels = shroom_dealer.get_attribute_dictionary()[attr1]
    categories = df[attr1].cat.categories

    # One frequency Series per attr1 category; concat aligns them on the
    # attr2 values so every column shares the same row index.
    counts = [df[attr2][df[attr1] == category].value_counts()
              for category in categories]
    observed = pd.concat(counts, axis=1)
    observed.columns = [labels[category] for category in categories]
    return observed
# Example: counts of each 'poisonous' value for every odor category.
attr_freqs('odor', 'poisonous')
Out[36]:
Next we can use this function to plot the comparative data.
In [37]:
def plot_comparative_data(attr, plot=True, save=False):
    """Draw paired bars of edible vs. poisonous counts for each value of ``attr``.

    Parameters
    ----------
    attr : str
        Attribute name forwarded to ``attr_freqs(attr, 'poisonous')``.
    plot : bool, optional
        Accepted for backward compatibility; currently not acted on (the
        figure is always shown).
    save : bool, optional
        Accepted for backward compatibility; currently not acted on (nothing
        is written to disk).
    """
    data = attr_freqs(attr, 'poisonous')

    # Size the x axis from this attribute's own table. The previous code read
    # a notebook-level variable that is only defined in a later cell and that
    # always describes 'odor'.
    index = np.arange(data.shape[1])
    bar_width = 0.35
    opacity = 0.4

    fig, ax = plt.subplots()
    ax.bar(index, data.loc['e', :].values, bar_width, align='center',
           color='b', label='edible', alpha=opacity)
    ax.bar(index + bar_width, data.loc['p', :].values, bar_width,
           align='center', color='r', label='poisonous', alpha=opacity)

    ax.set_xlabel('Attributes')
    ax.set_ylabel('Frequency')
    ax.set_title('Frequency by attribute and edibility ({})'.format(attr))
    # Centre each tick between the edible and poisonous bar of a pair.
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels(data.columns)
    ax.legend()

    fig.tight_layout()
    plt.show()
    plt.close(fig)
In [38]:
# Edible vs. poisonous frequencies for each odor value.
plot_comparative_data('odor')
From the plot we can see that any mushroom with a foul, spicy or fishy smell is almost certainly poisonous. Mushrooms with no smell are almost always edible, but in some rare cases they can be poisonous.
Let's take a look at spore print color.
In [39]:
# Edible vs. poisonous frequencies for each spore print color.
plot_comparative_data('spore-print-color')
We can see that mushrooms with a chocolate or white spore print are usually poisonous, so it is best to avoid those. Mushrooms with a black or brown spore print are usually edible, but not always.
To determine the association between attributes and edibility we used Pearson's chi-squared test on the frequency of attribute values and then ordered the attributes in descending order of the chi-squared statistic. The chi-squared test works by comparing the observed data to the expected data (the counts we would expect under the null hypothesis that the attribute and edibility are independent, computed from the row and column totals of the observed table) with the following equation,
$$ \chi^2 = \sum^n_{i=1} \frac{ (O_i - E_i)^2 }{ E_i } $$
where $O_i$ is the observed count in cell $i$ of the contingency table, $E_i$ is the corresponding expected count, and $n$ is the number of cells.
With the following functions we can get a contingency table of the expected and observed values of any two attributes:
In [40]:
def expected_data(observed):
    """Return the expected counts for a contingency table.

    Each cell is ``row_total * column_total / grand_total``, the count
    expected if the row and column variables were independent.

    Parameters
    ----------
    observed : pandas.DataFrame
        Table of observed counts, with any number of rows and columns.

    Returns
    -------
    pandas.DataFrame
        Float table with the same index and columns as ``observed``.
    """
    row_totals = observed.sum(axis=1).values
    col_totals = observed.sum(axis=0).values
    total = col_totals.sum()

    # The outer product of the marginals fills every cell at once, so the
    # table is no longer limited to two rows. Casting to float first keeps the
    # division exact regardless of the interpreter's integer-division rules.
    expected = np.outer(row_totals, col_totals).astype(float) / total
    return pd.DataFrame(expected, index=observed.index,
                        columns=observed.columns)
In [41]:
# Observed counts of each 'poisonous' value per odor category; the bare name
# on the last line displays the table.
o = attr_freqs('odor', 'poisonous')
o
Out[41]:
In [42]:
# Expected counts for the same table, derived from its row and column totals.
expected_data(o)
Out[42]:
Using these two tables for each attribute we can collect the chi-squared test statistic for each, and then sort them in descending order to rank the attributes by association with being poisonous or edible.
In [43]:
# Rank every attribute by its chi-squared statistic against edibility and
# plot the strongest ones.
TOP_N = 10  # number of attributes shown in the bar chart

cat_names = shroom_dealer.get_attribute_dictionary().keys()
chisqrs = []
for cat in cat_names:
    if cat == 'poisonous':
        # The target itself is not a candidate predictor.
        continue
    # attr_freqs builds the observed contingency table; the previous code
    # called an undefined name here and failed on a fresh kernel.
    observed = attr_freqs(cat, 'poisonous')
    expected = expected_data(observed)
    chisqr = (((observed - expected) ** 2) / expected).sum().sum()
    chisqrs.append((chisqr, cat))

# Largest statistic first, keeping only the top entries.
chisqrs = sorted(chisqrs, reverse=True)[:TOP_N]

values = [stat for stat, _ in chisqrs]
# Break hyphenated names across lines so the tick labels do not overlap.
labels = [name.replace("-", "\n") for _, name in chisqrs]

index = np.arange(len(chisqrs))
bar_width = .35

fig, ax = plt.subplots()
ax.set_title("Attributes most associated with edibility")
ax.bar(index, values, bar_width, align='center')
ax.set_xticks(index)
ax.set_xticklabels(labels)
ax.set_ylabel("Chi-squared values")
ax.set_xlabel("Attributes")
ax.autoscale()
fig.tight_layout()
plt.show()
plt.close(fig)
As we can see from the plot, odor is the attribute most strongly associated with edibility, followed by spore print color and gill color. These rankings agree closely with our comparative bar charts.
While this is not the traditional use of the chi-squared test statistic (computing a p-value and accepting or rejecting the null hypothesis of independence), it still provides a metric for ranking the attributes by their association with edibility.
Next we decided to plot a scatterplot matrix of the top 5 most associated attributes with edibility. In order to plot categorical variables on a scatterplot we needed to add some jitter to the data. This was done by adding a random number between -0.167 and 0.167 to all the categorical codes.
In [46]:
# Scatterplot matrix of the five attributes most associated with edibility.
JITTERED_ATTRS = ['odor', 'spore-print-color', 'gill-color', 'ring-type',
                  'stalk-surface-above-ring']
JITTER_SEED = 0  # fixed seed so the figure is identical on every run

rng = np.random.RandomState(JITTER_SEED)
raw_df = shroom_dealer.get_data_frame()

# Build a fresh frame instead of deleting columns from the loaded one while
# iterating over it; column order follows the loaded frame.
kept = [col for col in raw_df if col in JITTERED_ATTRS or col == 'poisonous']
df = pd.DataFrame(index=raw_df.index)
for col in kept:
    codes = raw_df[col].cat.codes
    if col != 'poisonous':
        # Uniform jitter in [-1/6, 1/6) separates points that share a code.
        codes = codes + (rng.rand(len(raw_df)) - .5) / 3
    df[col] = codes

g = sns.pairplot(df, hue='poisonous')
plt.autoscale()
plt.tight_layout()
plt.show()
plt.close()
From the scatter plots we can clearly see how values of certain variables are grouped between poisonous and edible. Because the values were converted to their categorical codes for plotting, we have generated a legend for the values of each attribute.
In [68]:
# Legend mapping each categorical code (the row position) to its readable
# label for the attributes shown in the scatterplot matrix.
LEGEND_ATTRS = ['odor', 'spore-print-color', 'gill-color', 'ring-type',
                'stalk-surface-above-ring', 'poisonous']

df = shroom_dealer.get_data_frame()
attr = shroom_dealer.get_attribute_dictionary()

labels = {}
for col in df:
    if col in LEGEND_ATTRS:
        labels[col] = [attr[col][c] for c in df[col].cat.categories]

# Pad every column to the longest label list so the frame is rectangular.
# Deriving the length avoids a hard-coded row count that breaks as soon as an
# attribute has more categories than expected.
n_rows = max(len(names) for names in labels.values()) if labels else 0
for col in labels:
    labels[col] = labels[col] + (n_rows - len(labels[col])) * [" "]

pd.DataFrame(labels)
Out[68]:
In [ ]:
In [ ]: