Create a classifier to predict the wine color from wine quality attributes using this dataset: http://archive.ics.uci.edu/ml/datasets/Wine+Quality

The data is in the database we've been using:

  • host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com'
  • database='training'
  • port=5432
  • user='dot_student'
  • password='qgis'
  • table name = 'winequality'

Query for the data and create a NumPy array


In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pg8000
from io import StringIO   # sklearn.externals.six is deprecated; the stdlib StringIO works with pydotplus
import pydotplus
from sklearn import tree
from sklearn.model_selection import train_test_split   # sklearn.cross_validation was removed in scikit-learn 0.20
from sklearn import metrics

In [39]:
conn = pg8000.connect(host="training.c1erymiua9dx.us-east-1.rds.amazonaws.com", user='dot_student', password='qgis', database='training')
cursor = conn.cursor()

In [40]:
cursor.execute("select column_name from information_schema.columns where table_name='winequality'")
column_list = []
for row in cursor.fetchall():
    column_list.append(row[0])
column_list


Out[40]:
['fixed_acidity',
 'volatile_acidity',
 'citric_acid',
 'residual_sugar',
 'chlorides',
 'free_sulfur_dioxide',
 'total_sulfur_dioxide',
 'density',
 'ph',
 'sulphates',
 'alcohol',
 'color']

In [5]:
# pandas can run the query directly against the open connection; no cursor needed here
df = pd.read_sql("SELECT * FROM winequality", conn)

In [6]:
df.head()


Out[6]:
b'fixed_acidity' b'volatile_acidity' b'citric_acid' b'residual_sugar' b'chlorides' b'free_sulfur_dioxide' b'total_sulfur_dioxide' b'density' b'ph' b'sulphates' b'alcohol' b'color'
0 7.0 0.27 0.36 20.7 0.045 45.0 170.0 1.0010 3.00 0.45 8.8 W
1 6.3 0.30 0.34 1.6 0.049 14.0 132.0 0.9940 3.30 0.49 9.5 W
2 8.1 0.28 0.40 6.9 0.050 30.0 97.0 0.9951 3.26 0.44 10.1 W
3 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40 9.9 W
4 7.2 0.23 0.32 8.5 0.058 47.0 186.0 0.9956 3.19 0.40 9.9 W

In [12]:
df.columns


Out[12]:
Index([       b'fixed_acidity',     b'volatile_acidity',
                b'citric_acid',       b'residual_sugar',
                  b'chlorides',  b'free_sulfur_dioxide',
       b'total_sulfur_dioxide',              b'density',
                         b'ph',            b'sulphates',
                    b'alcohol',                b'color'],
      dtype='object')

In [7]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 12 columns):
b'fixed_acidity'           6497 non-null float64
b'volatile_acidity'        6497 non-null float64
b'citric_acid'             6497 non-null float64
b'residual_sugar'          6497 non-null float64
b'chlorides'               6497 non-null float64
b'free_sulfur_dioxide'     6497 non-null float64
b'total_sulfur_dioxide'    6497 non-null float64
b'density'                 6497 non-null float64
b'ph'                      6497 non-null float64
b'sulphates'               6497 non-null float64
b'alcohol'                 6497 non-null float64
b'color'                   6497 non-null object
dtypes: float64(11), object(1)
memory usage: 583.8+ KB
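
The b'...' prefixes show that pg8000 returned the column names as bytes rather than strings. Decoding them makes later column access less awkward; a minimal sketch, assuming the names are UTF-8:

In [ ]:
df.columns = [c.decode('utf-8') if isinstance(c, bytes) else c for c in df.columns]
df.columns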

Split the data into features (x) and target (y, the last column in the table)

Remember that you can cast the results into a NumPy array and then slice out the columns you want


In [9]:
numpyMatrix = df.values  # df.as_matrix() is deprecated in newer pandas; .values is equivalent
numpyMatrix


Out[9]:
array([[7.0, 0.27, 0.36, ..., 0.45, 8.8, 'W'],
       [6.3, 0.3, 0.34, ..., 0.49, 9.5, 'W'],
       [8.1, 0.28, 0.4, ..., 0.44, 10.1, 'W'],
       ..., 
       [6.3, 0.51, 0.13, ..., 0.75, 11.0, 'R'],
       [5.9, 0.645, 0.12, ..., 0.71, 10.2, 'R'],
       [6.0, 0.31, 0.47, ..., 0.66, 11.0, 'R']], dtype=object)

In [13]:
x = numpyMatrix[:,:11]  # all rows, the 11 chemistry attributes
x


Out[13]:
array([[7.0, 0.27, 0.36, ..., 3.0, 0.45, 8.8],
       [6.3, 0.3, 0.34, ..., 3.3, 0.49, 9.5],
       [8.1, 0.28, 0.4, ..., 3.26, 0.44, 10.1],
       ..., 
       [6.3, 0.51, 0.13, ..., 3.42, 0.75, 11.0],
       [5.9, 0.645, 0.12, ..., 3.57, 0.71, 10.2],
       [6.0, 0.31, 0.47, ..., 3.39, 0.66, 11.0]], dtype=object)

In [14]:
y = numpyMatrix[:,11:]  # the color column; note this slice keeps y 2-D
y


Out[14]:
array([['W'],
       ['W'],
       ['W'],
       ..., 
       ['R'],
       ['R'],
       ['R']], dtype=object)
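
Slicing with 11: keeps y as an n-by-1 column, and scikit-learn warns about that shape when fitting (it expects a flat label array). Flattening it is a one-liner; a minimal sketch:

In [ ]:
y = y.ravel()   # same result as slicing with numpyMatrix[:,11], as done further below
y.shape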

Create a decision tree with the data


In [17]:
dt = tree.DecisionTreeClassifier()

In [18]:
dt = dt.fit(x,y)  # a first fit on all the data; superseded by the train/test fit below

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.25,train_size=0.75)

In [21]:
dt = dt.fit(x_train,y_train)
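
The StringIO and pydotplus imports at the top are presumably there to render the fitted tree. A sketch of that step (it assumes Graphviz is installed locally; max_depth just keeps the image readable, and the R/W class order matches the sorted dt.classes_):

In [ ]:
dot_data = StringIO()
tree.export_graphviz(dt, out_file=dot_data, max_depth=3,
                     feature_names=column_list[:11],
                     class_names=['R', 'W'], filled=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('winetree.png')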

In [23]:
def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")

In [24]:
measure_performance(x_test,y_test,dt) #measure on the test data (rather than train)


Accuracy:0.984 

Classification report
             precision    recall  f1-score   support

          R       0.96      0.97      0.97       396
          W       0.99      0.99      0.99      1229

avg / total       0.98      0.98      0.98      1625
 

Confusion matrix
[[ 386   10]
 [  16 1213]] 


In [26]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(dt.classes_))  # was iris.target_names, left over from the iris example
    plt.xticks(tick_marks, dt.classes_, rotation=45)
    plt.yticks(tick_marks, dt.classes_)
    plt.tight_layout()
    plt.ylabel('True color')
    plt.xlabel('Predicted color')

In [27]:
y_pred = dt.fit(x_train, y_train).predict(x_test)
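
The helper above is defined but never called; a follow-up cell to actually draw the matrix for the test predictions might look like this:

In [ ]:
cm = metrics.confusion_matrix(y_test, y_pred)
plot_confusion_matrix(cm, title='Wine color confusion matrix')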

Run 10-fold cross validation on the model


In [29]:
from sklearn.model_selection import cross_val_score  # cross_validation was removed in scikit-learn 0.20

In [44]:
x = numpyMatrix[:,:11]
x


Out[44]:
array([[7.0, 0.27, 0.36, ..., 3.0, 0.45, 8.8],
       [6.3, 0.3, 0.34, ..., 3.3, 0.49, 9.5],
       [8.1, 0.28, 0.4, ..., 3.26, 0.44, 10.1],
       ..., 
       [6.3, 0.51, 0.13, ..., 3.42, 0.75, 11.0],
       [5.9, 0.645, 0.12, ..., 3.57, 0.71, 10.2],
       [6.0, 0.31, 0.47, ..., 3.39, 0.66, 11.0]], dtype=object)

In [45]:
y = numpyMatrix[:,11]  # re-sliced as 1-D this time; cross_val_score expects a flat label array
y


Out[45]:
array(['W', 'W', 'W', ..., 'R', 'R', 'R'], dtype=object)

In [46]:
scores = cross_val_score(dt,x,y,cv=10)

In [47]:
scores


Out[47]:
array([ 0.98,  0.98,  0.98,  0.98,  0.98,  0.98,  0.98,  0.97,  0.99,  0.98])

In [48]:
np.mean(scores)


Out[48]:
0.98060755314992609
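
The mean alone hides fold-to-fold variation; it is cheap to report the spread as well (a small sketch):

In [ ]:
print("{:.3f} +/- {:.3f}".format(scores.mean(), scores.std()))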

If you have time, calculate the feature importances and graph them, based on the code in the slides from last class


In [43]:
cursor.execute("Select * FROM winequality")
colnames = [desc[0] for desc in cursor.description]
colnames


Out[43]:
[b'fixed_acidity',
 b'volatile_acidity',
 b'citric_acid',
 b'residual_sugar',
 b'chlorides',
 b'free_sulfur_dioxide',
 b'total_sulfur_dioxide',
 b'density',
 b'ph',
 b'sulphates',
 b'alcohol',
 b'color']

In [50]:
plt.plot(dt.feature_importances_,'o')
plt.ylim(-5,10)


Out[50]:
(-5, 10)
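
With the y-axis spanning (-5, 10) the points are hard to read, since feature importances always lie in [0, 1]. A labeled version is more useful; a sketch that decodes the byte column names fetched above and drops the target column:

In [ ]:
names = [c.decode('utf-8') for c in colnames[:11]]  # all but 'color', the target
plt.bar(range(len(names)), dt.feature_importances_)
plt.xticks(range(len(names)), names, rotation=90)
plt.ylim(0, 1)
plt.ylabel('Feature importance')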

In [ ]: