Create a classifier to predict the wine color from wine quality attributes using this dataset: http://archive.ics.uci.edu/ml/datasets/Wine+Quality

The data is in the database we've been using

  • host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com'
  • database='training'
  • port=5432
  • user='dot_student'
  • password='qgis'
  • table name = 'winequality'

Query for the data and create a numpy array


In [2]:
import pandas as pd
import numpy as np
import pg8000

In [4]:
conn = pg8000.connect(host='training.c1erymiua9dx.us-east-1.rds.amazonaws.com', database='training', user='dot_student', password='qgis')

In [15]:
cursor = conn.cursor()

In [16]:
cursor.execute('SELECT * FROM winequality')


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pg8000\core.py in execute(self, cursor, operation, vals)
   1890         try:
-> 1891             ps = cache['ps'][key]
   1892             cursor.ps = ps

KeyError: ('SELECT * FROM winequality', ())

During handling of the above exception, another exception occurred:

ProgrammingError                          Traceback (most recent call last)
<ipython-input-16-784de8fa3a0c> in <module>()
----> 1 cursor.execute('SELECT * FROM winequality')

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pg8000\core.py in execute(self, operation, args, stream)
    904                 if not self._c.in_transaction and not self._c.autocommit:
    905                     self._c.execute(self, "begin transaction", None)
--> 906                 self._c.execute(self, operation, args)
    907         except AttributeError as e:
    908             if self._c is None:

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pg8000\core.py in execute(self, cursor, operation, vals)
   1938                 raise OperationalError(str(e))
   1939 
-> 1940             self.handle_messages(cursor)
   1941 
   1942             # We've got row_desc that allows us to identify what we're

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pg8000\core.py in handle_messages(self, cursor)
   2086 
   2087         if self.error is not None:
-> 2088             raise self.error
   2089 
   2090     # Byte1('C') - Identifies the message as a close command.

ProgrammingError: ('ERROR', '25P02', 'current transaction is aborted, commands ignored until end of transaction block', 'postgres.c', '1283', 'exec_parse_message', '', '')

In [14]:
data = []
for item in cursor.fetchall():
    data.append(item)


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pg8000\core.py in __next__(self)
   1025             try:
-> 1026                 return self._cached_rows.popleft()
   1027             except IndexError:

IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

IndexError                                Traceback (most recent call last)
c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pg8000\core.py in __next__(self)
   1035                 try:
-> 1036                     return self._cached_rows.popleft()
   1037                 except IndexError:

IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

ProgrammingError                          Traceback (most recent call last)
<ipython-input-14-e3c193e52732> in <module>()
      1 data = []
----> 2 for item in cursor.fetchall():
      3     data.append(item)

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pg8000\core.py in fetchall(self)
    988         """
    989         try:
--> 990             return tuple(self)
    991         except TypeError:
    992             raise ProgrammingError("attempting to use unexecuted cursor")

c:\users\kate\appdata\local\programs\python\python35-32\lib\site-packages\pg8000\core.py in __next__(self)
   1039                         raise ProgrammingError("A query hasn't been issued.")
   1040                     elif len(self.ps['row_desc']) == 0:
-> 1041                         raise ProgrammingError("no result set")
   1042                     else:
   1043                         raise StopIteration()

ProgrammingError: no result set

In [17]:
from numpy import array

In [ ]:
myarray = array(data)

Split the data into features (x) and target (y, the last column in the table)

Remember you can cast the results into an numpy array and then slice out what you want


In [ ]:
x = myarray[:,:11]
y = myarray[:,11:]

Create a decision tree with the data


In [ ]:
from sklearn.tree import DecisionTreeClassifier

In [ ]:
dt = DecisionTreeClassifier()

In [ ]:
dt = dt.fix(x,y)

Run 10-fold cross validation on the model


In [ ]:
from sklearn.cross_validation import cross_val_score

In [ ]:
scores = cross_val_score(dt,x,y2,cv=10)

If you have time, calculate the feature importance and graph based on the code in the slides from last class


In [18]:
plt.plot(dt.feature_importances_,'o')
plt.ylim(0,1)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-9fda9843c1d7> in <module>()
----> 1 plt.plot(dt.feature_importances_,'o')
      2 plt.ylim(0,1)

NameError: name 'plt' is not defined

In [ ]: