In [45]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
In [46]:
# Load the mushroom dataset and print a quick summary: row/column counts,
# total number of missing cells, and the distinct values of the 'class' label.
data = pd.read_csv("mushrooms.csv")
print('n_rows: %d' % len(data))
print('n_columns: %d' % len(data.columns))
print('Null Values: %d' % data.isna().to_numpy().sum())
# Wrap the array in a 1-tuple so it is formatted as a single %s argument.
print('Classes: %s' % (data['class'].unique(),))
In [47]:
# Preview the first five rows of the raw dataset (rich DataFrame display).
data.head(n=5)
Out[47]:
In [48]:
# Count mushrooms per 'cap-color' code across the entire DataFrame.
# value_counts() returns the codes ordered by descending count.
cap_colors = data['cap-color'].value_counts()
m_height = cap_colors.values.tolist()          # bar heights
cap_color_labels = cap_colors.index.tolist()   # cap-color codes, aligned with m_height

# Display name and bar color for each cap-color code (codes as given in the
# UCI Mushroom data dictionary -- verify against the dataset description).
# Names and colors are looked up by code rather than listed positionally,
# because codes with equal counts have no guaranteed relative order in
# value_counts(), which would silently mislabel the bars.
CAP_COLOR_NAMES = {
    'n': 'brown', 'g': 'gray', 'e': 'red', 'y': 'yellow', 'w': 'white',
    'b': 'buff', 'p': 'pink', 'c': 'cinnamon', 'u': 'purple', 'r': 'green',
}
CAP_COLOR_HEX = {
    'n': '#DEB887', 'g': '#778899', 'e': '#DC143C', 'y': '#FFFF99', 'w': '#f8f8ff',
    'b': '#F0DC82', 'p': '#FF69B4', 'c': '#D22D1E', 'u': '#C000C5', 'r': 'g',
}

#=====PLOT Preparations and Plotting====#
ind = np.arange(len(cap_color_labels))  # the x locations for the groups
width = 0.7                             # the width of the bars
# Unknown codes fall back to a neutral gray / their raw code.
colors = [CAP_COLOR_HEX.get(code, '#999999') for code in cap_color_labels]

fig, ax = plt.subplots(figsize=(10,7))
mushroom_bars = ax.bar(ind, m_height, width, color=colors)

# Add some text for labels, title and axes ticks
ax.set_xlabel("Cap Color", fontsize=20)
ax.set_ylabel('Number of Mushrooms', fontsize=20)
ax.set_title('Mushroom Cap Color and Quantity', fontsize=22)
ax.set_xticks(ind)  # Positioning on the x axis
ax.set_xticklabels([CAP_COLOR_NAMES.get(code, code) for code in cap_color_labels],
                   fontsize=12)


def autolabel(rects, fontsize=14, axes=None):
    """
    Attach a text label above each bar displaying its (integer) height.

    Parameters
    ----------
    rects : iterable of bar patches, e.g. the container returned by ``ax.bar``.
    fontsize : font size of the labels.
    axes : Axes to draw the labels on. When omitted, each label is drawn on
        the Axes its bar belongs to, so the function does not depend on which
        global ``ax`` happens to be bound when it is called from a later cell.
    """
    for rect in rects:
        target = axes if axes is not None else rect.axes
        height = rect.get_height()
        target.text(rect.get_x() + rect.get_width() / 2., height, '%d' % int(height),
                    ha='center', va='bottom', fontsize=fontsize)


autolabel(mushroom_bars)
plt.show()  # Display bars.
In [49]:
# Split the per-cap-color counts into edible ('e') and non-edible mushrooms.
# Two value_counts() passes replace the per-color boolean scans of the frame.
cap_color_totals = data['cap-color'].value_counts()
cap_color_edibles = data.loc[data['class'] == 'e', 'cap-color'].value_counts()

edible_cc = []     # Edible color cap list, aligned with cap_color_labels
poisonous_cc = []  # Poisonous color cap list (everything that is not 'e')
for capColor in cap_color_labels:
    edibles = int(cap_color_edibles.get(capColor, 0))
    size = int(cap_color_totals.get(capColor, 0))
    edible_cc.append(edibles)
    poisonous_cc.append(size - edibles)

# Cap-color code -> display name (codes as given in the UCI Mushroom data
# dictionary -- verify against the dataset description). Tick labels are looked
# up by code because the order of cap_color_labels is data-dependent.
CAP_COLOR_NAMES = {
    'n': 'brown', 'g': 'gray', 'e': 'red', 'y': 'yellow', 'w': 'white',
    'b': 'buff', 'p': 'pink', 'c': 'cinnamon', 'u': 'purple', 'r': 'green',
}

#=====PLOT Preparations and Plotting====#
# Size the x positions from the data instead of relying on a leftover `ind`.
ind = np.arange(len(cap_color_labels))
width = 0.40
fig, ax = plt.subplots(figsize=(12,7))
edible_bars = ax.bar(ind, edible_cc, width, color='#ADFF2F')
poison_bars = ax.bar(ind + width, poisonous_cc, width, color='#DA70D6')

# Add some text for labels, title and axes ticks
ax.set_xlabel("Cap Color", fontsize=20)
ax.set_ylabel('Quantity', fontsize=20)
ax.set_title('Edible and Poisonous Mushrooms Based on Cap Color', fontsize=22)
ax.set_xticks(ind + width / 2)  # Center each tick between the paired bars
ax.set_xticklabels([CAP_COLOR_NAMES.get(code, code) for code in cap_color_labels],
                   fontsize=12)
ax.legend((edible_bars, poison_bars), ('edible', 'poisonous'), fontsize=17)
autolabel(edible_bars, 10)
autolabel(poison_bars, 10)
plt.show()
In [50]:
# Count mushrooms per 'odor' code across the entire DataFrame.
# value_counts() returns the codes ordered by descending count.
odors = data['odor'].value_counts()
odor_height = odors.values.tolist()   # bar heights
odor_labels = odors.index.tolist()    # odor codes, aligned with odor_height

# Odor code -> (display name, legend description, bar color). Codes as given in
# the UCI Mushroom data dictionary -- verify against the dataset description.
# Everything is looked up by code rather than listed positionally, because
# codes with equal counts have no guaranteed relative order in value_counts().
ODOR_STYLE = {
    'n': ('none', 'no smell', '#FFFF99'),
    'f': ('foul', 'rotten eggs', '#ADFF2F'),
    'y': ('fishy', 'fresh fish', '#00BFFF'),
    's': ('spicy', 'pepper', '#FA8072'),
    'a': ('almond', 'nutlike kernel', '#FFEBCD'),
    'l': ('anise', 'sweet herbal', '#800000'),
    'p': ('pungent', 'vinegar', '#40E0D0'),
    'c': ('creosote', 'smoky chimney', '#808080'),
    'm': ('musty', 'mold mildew', '#2E8B57'),
}
# Unknown codes fall back to their raw code and a neutral gray.
odor_styles = [ODOR_STYLE.get(code, (code, 'unknown', '#999999')) for code in odor_labels]

#=====PLOT Preparations and Plotting====#
width = 0.7
ind = np.arange(len(odor_labels))  # the x locations for the groups
colors = [color for _, _, color in odor_styles]

fig, ax = plt.subplots(figsize=(10,7))
odor_bars = ax.bar(ind, odor_height, width, color=colors)

# Add some text for labels, title and axes ticks
ax.set_xlabel("Odor", fontsize=20)
ax.set_ylabel('Quantity', fontsize=20)
ax.set_title('Mushroom Odor and Quantity', fontsize=22)
ax.set_xticks(ind)  # Positioning on the x axis
ax.set_xticklabels([name for name, _, _ in odor_styles], fontsize=12)
# One legend entry per bar, e.g. "foul: rotten eggs".
ax.legend(odor_bars, ['%s: %s' % (name, description) for name, description, _ in odor_styles],
          fontsize=17)
autolabel(odor_bars)
plt.show()  # Display bars.
In [51]:
# Split the per-odor counts into edible ('e') and non-edible mushrooms.
# Two value_counts() passes replace the per-odor boolean scans of the frame.
odor_totals = data['odor'].value_counts()
odor_edibles = data.loc[data['class'] == 'e', 'odor'].value_counts()

edible_od = []     # Edible odor list, aligned with odor_labels
poisonous_od = []  # Poisonous odor list (everything that is not 'e')
for odor in odor_labels:
    edibles = int(odor_edibles.get(odor, 0))
    size = int(odor_totals.get(odor, 0))
    edible_od.append(edibles)
    poisonous_od.append(size - edibles)

# Odor code -> display name (codes as given in the UCI Mushroom data
# dictionary -- verify against the dataset description). Tick labels are looked
# up by code because the order of odor_labels is data-dependent.
ODOR_NAMES = {
    'n': 'none', 'f': 'foul', 'y': 'fishy', 's': 'spicy', 'a': 'almond',
    'l': 'anise', 'p': 'pungent', 'c': 'creosote', 'm': 'musty',
}

#=====PLOT Preparations and Plotting====#
# Size the x positions from the data instead of relying on a leftover `ind`.
ind = np.arange(len(odor_labels))
width = 0.40
fig, ax = plt.subplots(figsize=(12,7))
edible_bars = ax.bar(ind, edible_od, width, color='#ADFF2F')
poison_bars = ax.bar(ind + width, poisonous_od, width, color='#DA70D6')

# Add some text for labels, title and axes ticks
ax.set_xlabel("Odor", fontsize=20)
ax.set_ylabel('Quantity', fontsize=20)
ax.set_title('Edible and Poisonous Mushrooms Based on Odor', fontsize=22)
ax.set_xticks(ind + width / 2)  # Center each tick between the paired bars
ax.set_xticklabels([ODOR_NAMES.get(code, code) for code in odor_labels],
                   fontsize=12)
ax.legend((edible_bars, poison_bars), ('edible', 'poisonous'), fontsize=17)
autolabel(edible_bars, 10)
autolabel(poison_bars, 10)
plt.show()
In [52]:
# Get the population types and their counts for a single pie chart.
# value_counts() returns the codes ordered by descending count.
populations = data['population'].value_counts()
pop_size = populations.values.tolist()   # slice sizes
pop_types = populations.index.tolist()   # population codes, aligned with pop_size
print(pop_size)

# Population code -> (display name, slice color). Codes as given in the UCI
# Mushroom data dictionary -- verify against the dataset description. Labels
# and colors are looked up by code so they stay attached to the right slice
# whatever order value_counts() produces.
POPULATION_STYLE = {
    'v': ('Several', '#F38181'),
    'y': ('Solitary', '#EAFFD0'),
    's': ('Scattered', '#95E1D3'),
    'n': ('Numerous', '#FCE38A'),
    'a': ('Abundant', '#BDE4F4'),
    'c': ('Clustered', '#9EF4E6'),
}
# Unknown codes fall back to their raw code and a neutral gray.
pop_styles = [POPULATION_STYLE.get(code, (code, '#CCCCCC')) for code in pop_types]

# Data to plot
pop_labels = tuple(name for name, _ in pop_styles)
colors = [color for _, color in pop_styles]
# Pull the 'Solitary' slice out of the pie; all other slices stay in place.
explode = tuple(0.1 if code == 'y' else 0 for code in pop_types)

fig = plt.figure(figsize=(12,8))
# Plot
plt.title('Mushroom Population Type Percentage', fontsize=22)
patches, texts, autotexts = plt.pie(pop_size, explode=explode, labels=pop_labels, colors=colors,
                                    autopct='%1.1f%%', shadow=True, startangle=150)
# Enlarge both the category labels and the percentage annotations.
for text, autotext in zip(texts, autotexts):
    text.set_fontsize(14)
    autotext.set_fontsize(14)

plt.axis('equal')  # Equal aspect ratio so the pie is drawn as a circle
plt.show()
In [53]:
# Get the habitat types and their counts for a single pie chart.
# value_counts() returns the codes ordered by descending count.
habitats = data['habitat'].value_counts()
hab_size = habitats.values.tolist()   # slice sizes
hab_types = habitats.index.tolist()   # habitat codes, aligned with hab_size

# Habitat code -> (display name, slice color). Codes as given in the UCI
# Mushroom data dictionary -- verify against the dataset description. Labels
# and colors are looked up by code so they stay attached to the right slice
# whatever order value_counts() produces.
HABITAT_STYLE = {
    'd': ('Woods', '#F5AD6F'),
    'g': ('Grasses', '#EAFFD0'),
    'p': ('Paths', '#FFFF66'),
    'l': ('Leaves', '#84D9E2'),
    'u': ('Urban', '#C0C0C0'),
    'm': ('Meadows', '#DE7E7E'),
    'w': ('Waste', '#FFB6C1'),
}
# Unknown codes fall back to their raw code and a neutral gray.
hab_styles = [HABITAT_STYLE.get(code, (code, '#CCCCCC')) for code in hab_types]

# Data to plot
hab_labels = tuple(name for name, _ in hab_styles)
colors = [color for _, color in hab_styles]
# Pull the 'Waste' slice well out of the pie; all other slices stay in place.
explode = tuple(0.5 if code == 'w' else 0 for code in hab_types)

fig = plt.figure(figsize=(12,8))
# Plot
plt.title('Mushroom Habitat Type Percentage', fontsize=22)
patches, texts, autotexts = plt.pie(hab_size, explode=explode, labels=hab_labels, colors=colors,
                                    autopct='%1.1f%%', shadow=True, startangle=360)
# Enlarge both the category labels and the percentage annotations.
for text, autotext in zip(texts, autotexts):
    text.set_fontsize(14)
    autotext.set_fontsize(14)

plt.axis('equal')  # Equal aspect ratio so the pie is drawn as a circle
plt.show()
In [54]:
# Integer-encode every column (features and the 'class' target) in place.
# NOTE: this overwrites the string codes in `data`, so the plotting cells above
# (which compare against values such as 'e') cannot be re-run afterwards
# without reloading the CSV.
labelencoder = LabelEncoder()
for col in data.columns:
    # The encoder is refit for each column; after the loop it only remembers
    # the classes of the last column.
    data[col] = labelencoder.fit_transform(data[col])

X = data.drop(['class'], axis=1)  # feature columns
y = data['class']                 # integer-encoded target

# One-hot encode the features into a dense array. The `sparse` keyword was
# renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4, so try the
# new spelling first and fall back for older installations.
try:
    onehotencoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    onehotencoder = OneHotEncoder(sparse=False)
a = onehotencoder.fit_transform(X)
In [55]:
# Hold out 20% of the samples for evaluation. A fixed random_state makes the
# split (and therefore the reported accuracies) reproducible across runs; the
# value matches the random_state used for the SVC models.
X_train, X_test, y_train, y_test = train_test_split(a, y, test_size=0.2, random_state=0)
In [57]:
# Fit each classifier on the training split and report its accuracy on the
# held-out test split.
models = [SVC(kernel='rbf', random_state=0), SVC(kernel='linear', random_state=0), LogisticRegression()]
model_names = ['SVC_rbf', 'SVC_linear', 'Logistic Regression']

# Iterate over (name, model) pairs directly instead of indexing one list with
# the position of the other.
for model_name, model in zip(model_names, models):
    model.fit(X_train, y_train)
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    print('The accuracy of ' + model_name + ' is ' + str(test_accuracy))