In [13]:
# Numeric and tabular libraries used throughout the notebook.
import numpy as np
import pandas as pd
# IPython magic: turns on inline matplotlib rendering and populates the
# namespace with numpy/matplotlib names. The bare `pylab` name used on the
# next line (and in later cells) has no other import in this cell, so it is
# expected to come from this magic.
%pylab inline
# Apply matplotlib's 'ggplot' style sheet to every figure drawn afterwards.
pylab.style.use('ggplot')
import seaborn as sns
In [2]:
# Read the wine dataset from a CSV file located next to the notebook.
wine_csv_path = 'wine.csv'
wine_data = pd.read_csv(wine_csv_path)
In [3]:
# Preview the first five rows to sanity-check the loaded columns.
wine_data.head(n=5)
Out[3]:
In [4]:
# Count how many samples belong to each 'Wine' label, ordered by label value,
# and show the class balance as a bar chart.
label_counts = wine_data['Wine'].value_counts().sort_index()
label_counts.plot.bar()
Out[4]:
In [5]:
# Every column except the 'Wine' label is treated as a feature.
feature_names = wine_data.columns.drop('Wine')

# Draw one histogram per feature. The loop body must be indented under the
# `for`; a fresh figure is opened on each iteration so the histograms do not
# overlay each other on a single set of axes.
for fname in feature_names:
    _ = pylab.figure()
    _ = wine_data.loc[:, fname].plot(kind='hist', title=fname)
In [6]:
# Draw one box plot per feature, grouped by the 'Wine' label.
# No explicit pylab.figure() call is made here: DataFrame.boxplot(by=...)
# opens its own figure when no `ax` is passed, so a preceding figure() call
# would only leave an empty figure behind for every feature.
for fname in feature_names:
    wine_data.boxplot(column=fname, by='Wine')
In [11]:
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between each feature and the 'Wine' label.
# The estimator adds small random noise to continuous features, so a fixed
# random_state is passed to make the scores (and the resulting ranking)
# reproducible across kernel restarts.
mutual_info_importances = mutual_info_classif(
    wine_data.loc[:, feature_names],
    wine_data.loc[:, 'Wine'],
    random_state=0,
)

# Label each score with its feature name and plot them, smallest first, so the
# most informative feature ends up at the top of the horizontal bar chart.
importances = pd.Series(mutual_info_importances, index=feature_names)
importances.sort_values(ascending=True).plot(kind='barh')
Out[11]:
In [14]:
# Pairwise correlation matrix of the feature columns, rendered as an
# annotated heatmap.
feature_frame = wine_data[feature_names]
corrs = feature_frame.corr()
sns.heatmap(data=corrs, annot=True)
Out[14]:
In [15]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# Split the frame into the feature matrix and the target label column.
label_data = wine_data['Wine']
feature_data = wine_data[feature_names]

# Evaluate a Gaussian naive Bayes classifier with 5-fold cross-validation,
# scoring each fold by macro-averaged F1.
model = GaussianNB()
cv_options = dict(cv=5, scoring='f1_macro')
fold_scores = cross_val_score(estimator=model, X=feature_data, y=label_data, **cv_options)

# One bar per fold.
scores = pd.Series(fold_scores)
scores.plot.bar()
Out[15]: