The purpose of this study is to determine how well a model can predict the perceived quality of a wine based on some of its most relevant physical and chemical properties. The dataset was taken from: 'P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. Decision Support Systems, Elsevier, 47(4):547-553, 2009'.
There are two datasets, one for red wines and another for white wines. Both contain the same variables but a different number of instances. Only one of the datasets will be chosen for the analysis.
In [172]:
%matplotlib notebook
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import IPython
from IPython.display import display
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from yellowbrick.features import Rank2D
from sklearn.ensemble import GradientBoostingClassifier
from yellowbrick.features.importances import FeatureImportances
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
import pickle
from yellowbrick.classifier import ClassificationReport
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import multiprocessing
from multiprocessing import Process
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
In [173]:
# Setting display options:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
In [174]:
raw_df_red = pd.read_csv("winequality-red.csv", sep =';')
raw_df_white = pd.read_csv("winequality-white.csv", sep =';')
In [175]:
raw_df_red.describe()
Out[175]:
In [176]:
raw_df_white.describe()
Out[176]:
The dataset chosen for this exercise is the white wine dataset, since it contains more instances (4,898). The dataset contains no missing values and no non-numerical data.
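As a quick confirmation (a hypothetical extra cell, not part of the original run), the missing-value counts can be checked directly:
In [ ]:
# Hypothetical check: count missing values per column; every count should be 0.
print(raw_df_white.isnull().sum())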
In [177]:
raw_df_white.info()
In [178]:
raw_df_red.info()
In [179]:
df = raw_df_white
In [25]:
features = df.keys()
visualizer = Rank2D(features=features, algorithm='pearson')
visualizer.fit(df.values)
visualizer.transform(df.values)
visualizer.poof()
The two most strongly correlated pairs here are density with residual sugar (positive) and density with alcohol content (negative).
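To back this up numerically (a hypothetical extra cell using the same df), the Pearson correlations with density can be ranked directly:
In [ ]:
# Hypothetical check: rank all variables by their Pearson correlation with density.
print(df.corr()['density'].sort_values())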
In [180]:
X = df.iloc[:,:-1] # independent variables X
y = df['quality'] # dependent variable y
In [59]:
cols2 = list(X.columns)
boxplot = X.boxplot(column=cols2, rot=90, fontsize=10)
In [60]:
minmax_scaler = MinMaxScaler(feature_range=(0, 1))
X_minmax = minmax_scaler.fit(X).transform(X)
X_minmax = pd.DataFrame(X_minmax)
X_minmax.columns = X.columns
cols2 = list(X_minmax.columns)
boxplot = X_minmax.boxplot(column=cols2, rot=90, fontsize=10)
In [61]:
scaler = StandardScaler()
X_std = scaler.fit(X).transform(X)
X_std = pd.DataFrame(X_std)
X_std.columns = X.columns
cols2 = list(X_std.columns)
boxplot = X_std.boxplot(column=cols2, rot=90, fontsize=10)
Comparing the boxplots, StandardScaler is clearly the better scaling method here: MinMaxScaler squeezes most observations into a narrow band because of the outliers, whereas standardization keeps the spread of each feature comparable.
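As a quick sanity check (a hypothetical extra cell, not in the original run), the standardized features should have mean ≈ 0 and standard deviation ≈ 1:
In [ ]:
# Hypothetical check: each standardized column should center on 0 with unit variance.
print(X_std.describe().loc[['mean', 'std']])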
In [62]:
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2,random_state= 0)
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test) # transform only: refitting on the test set would leak its statistics
In [63]:
features = X.keys()
Xi = X[features]
yi = y
figure = plt.figure()
axis = figure.add_subplot()
viz = FeatureImportances(GradientBoostingClassifier(), ax=axis)
viz.fit(Xi, yi)
viz.poof()
In [78]:
def rec_fe(target, data, filename):
    if __name__ == '__main__':
        cls = GradientBoostingClassifier()
        rfecv = RFECV(estimator=cls, step=1, cv=KFold(12), scoring='accuracy', verbose=10, n_jobs=-1)
        rfecv.fit(data, target)
        optimal_features = rfecv.n_features_
        print("Optimal number of features : %d" % rfecv.n_features_)
        with open(filename, 'wb') as features:
            pickle.dump([optimal_features, rfecv], features)
In [79]:
rec_fe(y_train,X_train, 'OptimalFeatures_classification.pickle')
In [80]:
with open('OptimalFeatures_classification.pickle', "rb") as feature:
    feat = pickle.load(feature, encoding="utf8")
rfecv = feat[1]
In [82]:
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
In this case, the best approach is to use all of the features, since the cross-validation score keeps improving as features are added.
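The same conclusion can be read off the fitted selector (a hypothetical extra cell using RFECV's standard support_ attribute):
In [ ]:
# Columns kept by RFECV; when all features are selected, every support_ entry is True.
print(X.columns[rfecv.support_])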
In [109]:
yi = pd.DataFrame(y)
yi.describe()
Out[109]:
In [136]:
def classifier_graph(classifier):
    classes = ['3', '4', '5', '6', '7', '8', '9']  # white wine quality grades run from 3 to 9
    model = classifier
    visualizer = ClassificationReport(model, classes=classes)
    X_train_ = pd.DataFrame(X_train)
    X_test_ = pd.DataFrame(X_test)
    visualizer.fit(X_train_, y_train)
    visualizer.score(X_test_, y_test)
    g = visualizer.poof()
In [137]:
knn = classifier_graph(KNeighborsClassifier())
In [140]:
forest = classifier_graph(RandomForestClassifier(n_estimators=100,n_jobs = -1))
In [142]:
baggin = classifier_graph(BaggingClassifier())
In [143]:
MLP = classifier_graph(MLPClassifier())
As can be seen above, there is significant class imbalance. Before proceeding, I will deal with this using upsampling. From this point on, I will use only RandomForestClassifier and KNeighborsClassifier.
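To quantify the imbalance before resampling (a hypothetical extra cell, not part of the original run):
In [ ]:
# Hypothetical check: instance counts per quality grade before oversampling;
# the middle grades dominate while the extreme grades are rare.
print(y.value_counts().sort_index())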
In [188]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
rovs = RandomOverSampler(random_state=4)
X, y = rovs.fit_resample(X, y) # fit_sample was renamed to fit_resample in recent imblearn versions
In [189]:
X = pd.DataFrame(X)
y = pd.Series(y)
print (X.info())
print (y.value_counts())
In [148]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2,random_state= 0)
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test) # transform only; do not refit on the test set
In [149]:
knn = classifier_graph(KNeighborsClassifier())
In [150]:
forest = classifier_graph(RandomForestClassifier(n_estimators=100,n_jobs = -1))
As can be seen here, upsampling largely resolved the class imbalance: precision and recall are now far more uniform across the quality classes.
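A minimal check of the resampled distribution (a hypothetical extra cell reusing the Counter import from above):
In [ ]:
# After random oversampling, every quality class should have the same count.
print(Counter(y))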
In [157]:
def gridsearch(model, parameters):
    grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring='accuracy', cv=4, n_jobs=-1, verbose=10)
    grid_search = grid_search.fit(X_train, y_train)
    score = grid_search.best_score_
    best_params = grid_search.best_params_
    return score, best_params
In [160]:
KNN = KNeighborsClassifier()
RNF = RandomForestClassifier()
list_params = [{'n_neighbors': [3, 5, 10, 30, 50]}, {'n_estimators': [50, 128, 300, 500, 1000]}]
models = [KNN, RNF]
In [161]:
scores_dict = {'Accuracy': [], 'best_params': []}
if __name__ == '__main__':
    for model, param in zip(models, list_params):
        acu, best_params = gridsearch(model, param)
        scores_dict['Accuracy'].append(acu)
        scores_dict['best_params'].append(best_params)
    print(scores_dict)
    with open('gridsearch_class.pickle', 'wb') as grid:
        pickle.dump(scores_dict, grid)
In [192]:
knn = KNeighborsClassifier(n_neighbors = 3, n_jobs= -1)
rnf = RandomForestClassifier(n_estimators = 500, n_jobs= -1)
In [194]:
def sampling_offensegroup(features, target, average, pickle_file, pickle_estimators):
    scores = {'accuracy_knn': [], 'f1_knn': [], 'precision_knn': [], 'recall_knn': [],
              'accuracy_rnf': [], 'f1_rnf': [], 'precision_rnf': [], 'recall_rnf': []}
    skf = StratifiedKFold(n_splits=12)
    for train_index, test_index in skf.split(features, target):
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = target[train_index], target[test_index]
        # scaling: fit on the training fold only, then transform both folds
        scaler = StandardScaler()
        X_train = scaler.fit(X_train).transform(X_train)
        X_test = scaler.transform(X_test)
        # knn:
        fitted_knn = knn.fit(X_train, y_train)
        predicted_knn = knn.predict(X_test)
        report_knn = classification_report(y_test, predicted_knn)
        scores['accuracy_knn'].append(accuracy_score(y_test, predicted_knn))
        scores['f1_knn'].append(f1_score(y_test, predicted_knn, average=average))
        scores['precision_knn'].append(precision_score(y_test, predicted_knn, average=average))
        scores['recall_knn'].append(recall_score(y_test, predicted_knn, average=average))
        # rnf:
        fitted_rnf = rnf.fit(X_train, y_train)
        predicted_rnf = rnf.predict(X_test)
        report_rnf = classification_report(y_test, predicted_rnf)
        scores['accuracy_rnf'].append(accuracy_score(y_test, predicted_rnf))
        scores['f1_rnf'].append(f1_score(y_test, predicted_rnf, average=average))
        scores['precision_rnf'].append(precision_score(y_test, predicted_rnf, average=average))
        scores['recall_rnf'].append(recall_score(y_test, predicted_rnf, average=average))
    print(scores)
    # write fitted models and last-fold predictions/reports to disk
    with open(pickle_file, 'wb') as files:
        pickle.dump([fitted_knn, fitted_rnf], files)
    with open(pickle_estimators, 'wb') as estimator:
        pickle.dump([predicted_knn, predicted_rnf, report_knn, report_rnf], estimator)
    return scores
In [200]:
if __name__ == '__main__':
    report = sampling_offensegroup(X.values, y.values, 'micro', 'files.pickle',
                                   'estimators.pickle')
In [203]:
report = pd.DataFrame(report)
report
Out[203]:
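To condense the 12 folds into a single figure per metric (a hypothetical follow-up cell, not part of the original run):
In [ ]:
# Mean of each metric across the 12 stratified folds.
print(report.mean())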