In [8]:
%matplotlib notebook
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import IPython
from IPython.display import display
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn import cross_validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from keras.utils import np_utils
In this part, other models (an artificial neural network and XGBoost) will be used to see whether greater accuracy can be achieved.
Importing Dataset:
In [9]:
raw_df_white = pd.read_csv("winequality-white.csv", sep =';')
In [10]:
# Features = every column except the last; target = the 'quality' score.
X = raw_df_white.iloc[:,:-1]
y = raw_df_white['quality']

# Integer-encode, then one-hot encode the quality scores so they can feed a
# softmax output layer (one column per quality class).
encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = y.reshape(-1, 1)
onehotencoder = OneHotEncoder()
y = onehotencoder.fit_transform(y).toarray()

# Name the columns after the classes actually present in the data instead of
# hard-coding '3'..'9' — this no longer breaks if a quality level is absent.
y = pd.DataFrame(y, columns=[str(c) for c in encoder.classes_])
In [11]:
X_train_white, X_test_white, y_train_white, y_test_white = cross_validation.train_test_split(X, y, test_size = 0.2, random_state = 0)
In [12]:
y_train_white.describe()
Out[12]:
Performing feature scaling on the dataset
In [13]:
scaler = StandardScaler()
#scaler = MinMaxScaler()
#scaler = Normalizer()
# Fit the scaler on the TRAINING data only, then apply the same transform to
# the test set. Re-fitting on the test set (as before) leaks test-set
# statistics and scales the two splits with different parameters.
X_train_white = scaler.fit_transform(X_train_white)
X_test_white = scaler.transform(X_test_white)
In [14]:
# Feed-forward classifier: 11 input features -> 17 hidden ReLU layers of
# 11 units each -> softmax over the 7 quality classes.
# The original hand-unrolled stack contained an accidental doubled
# Activation('relu'); since relu(relu(x)) == relu(x) the duplicate was a
# no-op, so dropping it is behavior-neutral. The repeated layers are built
# in a loop instead of copy-pasted cells.
N_HIDDEN = 17

model = Sequential()
model.add(Dense(units=11, input_dim=11))  # input_dim = number of features
model.add(Activation('relu'))
for _ in range(N_HIDDEN - 1):
    model.add(Dense(units=11))
    model.add(Activation('relu'))
model.add(Dense(units=7))                 # one output unit per quality class
model.add(Activation('softmax'))

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.fit(X_train_white, y_train_white, batch_size = 5, epochs = 100)
Out[14]:
Performing predictions: a class is counted as predicted only when its softmax probability in y_predict exceeds 0.7.
In [15]:
# Per-class probabilities from the softmax output layer.
y_predict = model.predict(X_test_white)
y_test_white = y_test_white.astype(float)
# Keep only confident predictions (probability > 0.7).
# NOTE(review): rows where no class exceeds 0.7 become all-False, and
# .argmax(axis=1) on such a row silently yields class 0 — confirm this is
# the intended handling before trusting the confusion matrix below.
# (The redundant `y_predict = y_predict` no-op was removed.)
y_predict = y_predict > 0.7
Creating confusion matrix:
In [16]:
from sklearn.metrics import confusion_matrix, accuracy_score
from numpy import argmax  # NOTE(review): unused below; remove if no later cell needs it

# Collapse the one-hot ground truth and the boolean prediction matrix back
# to class indices before building the confusion matrix.
cf = confusion_matrix(y_test_white.values.argmax(axis=1), y_predict.argmax(axis=1))
cf
Out[16]:
The results are not accurate; below, the model is modified by adding dropout regularization to see if it improves.
Improving the model
In [19]:
from keras.layers import Dropout

# Regularized version of the network: same width, with 20% dropout after
# every hidden layer. Fixes relative to the original cell:
#  * `odel = Sequential()` typo meant every .add() appended layers to the
#    PREVIOUS model instead of building a fresh one;
#  * a stray Dense(units=7) in the middle of the stack created an accidental
#    bottleneck followed by a duplicated block of layers;
#  * Dropout's keyword argument is `rate` in Keras 2 (`p` was the Keras 1 name).
model = Sequential()
model.add(Dense(units=11, input_dim=11))
model.add(Activation('relu'))
model.add(Dropout(rate=0.2))
for _ in range(9):
    model.add(Dense(units=11))
    model.add(Activation('relu'))
    model.add(Dropout(rate=0.2))
model.add(Dense(units=7))
model.add(Activation('softmax'))

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.fit(X_train_white, y_train_white, batch_size = 5, epochs = 100)
Out[19]:
In [20]:
# Per-class probabilities from the dropout model.
y_predict = model.predict(X_test_white)
y_test_white = y_test_white.astype(float)
# Same confidence threshold as before; rows with no probability > 0.7 become
# all-False and argmax to class 0 (see earlier NOTE).
# (The redundant `y_predict = y_predict` no-op was removed.)
y_predict = y_predict > 0.7
In [21]:
# Confusion matrix for the dropout model: collapse one-hot truth and the
# thresholded boolean predictions back to class indices.
cf = confusion_matrix(y_test_white.values.argmax(axis=1), y_predict.argmax(axis=1))
cf
Out[21]:
The model accuracy was reduced.
In [ ]:
from xgboost.sklearn import XGBClassifier
from xgboost.sklearn import XGBRegressor  # NOTE(review): unused in this cell

# XGBClassifier expects 1-D integer class labels, not the one-hot DataFrames
# built earlier — passing the one-hot frame makes fit() and
# confusion_matrix() misbehave or raise, so collapse back to class indices.
y_train_labels = y_train_white.values.argmax(axis=1)
y_test_labels = y_test_white.values.argmax(axis=1)

xclas = XGBClassifier() # for classifier
xclas.fit(X_train_white, y_train_labels)
y_pred = xclas.predict(X_test_white)

# Print the cross-validation scores instead of silently discarding them
# (a non-final expression in a cell is never displayed).
print(cross_val_score(xclas, X_train_white, y_train_labels))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test_labels, y_pred)
print (cm)
For Windows, xgboost is available only through Anaconda:
Cross_val_score: array([0.59770992, 0.55708812, 0.58940906])
In this analysis, the two models that performed the best were the KNN and the kernel SVC. Those models achieved about 0.65 accuracy on the test set. The accuracy on the training sets was close to 1.0, which is indicative of overfitting. Feature extraction did not improve the accuracy of the models. In the research paper, the authors manage to reach an accuracy of 0.868 using the Fuzzy Inductive Reasoning methodology, an absolute error tolerance of 1.0, and the discretization of the input variables. Their results vary greatly: for the SVM model, they range from 0.503 to 0.868 (0.25 to 1.0 absolute error tolerance). In contrast to the research paper, the discretization of the input variables into two categories did not increase accuracy; on the contrary, it significantly reduced it. Apparently, the ANN and XGBoost models are not better than the KNN and SVC models.
In [ ]: