In [22]:
import os

import dateutil.parser
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pg8000
import pydotplus
from pandas import DataFrame
from sklearn import metrics
from sklearn import tree
from sklearn.cross_validation import train_test_split
from sklearn.externals.six import StringIO

%matplotlib inline
In [39]:
# Connection settings are read from the environment when present so that
# credentials do not have to live in the notebook. The fallbacks are the
# values this notebook originally hard-coded, so it still runs unchanged
# when no WINE_DB_* variables are set.
conn = pg8000.connect(
    host=os.environ.get("WINE_DB_HOST", "training.c1erymiua9dx.us-east-1.rds.amazonaws.com"),
    user=os.environ.get("WINE_DB_USER", "dot_student"),
    password=os.environ.get("WINE_DB_PASSWORD", "qgis"),
    database=os.environ.get("WINE_DB_NAME", "training"),
)
# One shared cursor, reused by the metadata queries in later cells.
cursor = conn.cursor()
In [40]:
# Ask the information_schema catalog which columns the winequality table has.
cursor.execute("select column_name from information_schema.columns where table_name='winequality'")
# Each fetched row holds a single value: the column name.
column_list = [record[0] for record in cursor.fetchall()]
column_list
Out[40]:
In [5]:
# Load the whole winequality table into a DataFrame over the open connection.
# pd.read_sql issues the query itself, so no separate cursor.execute call is
# needed, and pandas / matplotlib are already imported in the first cell.
df = pd.read_sql("SELECT * FROM winequality", conn)
In [6]:
# Preview the first rows of the loaded table.
df.head()
Out[6]:
In [12]:
# Show the column labels of the DataFrame.
df.columns
Out[12]:
In [7]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
df.info()
In [9]:
# Convert the DataFrame to a plain NumPy array for scikit-learn.
# `.values` replaces `DataFrame.as_matrix()`, which pandas deprecated and then
# removed in 1.0; both return the same array.
numpyMatrix = df.values
numpyMatrix
Out[9]:
In [13]:
# Feature matrix: every row, columns 0 through 10 of the array.
# NOTE(review): the hard-coded 11 assumes the features occupy the first 11
# columns of the table — confirm against the column listing.
x = numpyMatrix[:,:11]
x
Out[13]:
In [14]:
# Target: every row, from column 11 to the last column.
# NOTE(review): the `11:` slice keeps a second axis, so y is 2-D here (and
# would hold several columns if the table has more than 12). A later cell
# rebuilds y with the integer index `[:,11]`, which gives a 1-D vector —
# confirm which form is intended for the fits below.
y = numpyMatrix[:,11:]
y
Out[14]:
In [17]:
# Decision-tree classifier with default hyper-parameters. A fixed random_state
# makes the fitted tree, and every score derived from it, repeatable between
# runs of the notebook.
dt = tree.DecisionTreeClassifier(random_state=42)
In [18]:
# Fit the tree on the complete x / y arrays.
# NOTE(review): the classifier is fitted again on the training split in a
# later cell, so this full-data fit looks superseded — confirm it is needed.
dt = dt.fit(x,y)
In [20]:
# Hold out 25% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the train/test membership, identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, train_size=0.75, random_state=42
)
In [21]:
# Fit the tree on the training split only; the test rows are kept out of it.
dt = dt.fit(x_train,y_train)
In [23]:
def measure_performance(X,y,clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    """Print evaluation metrics for an already-fitted classifier.

    Parameters
    ----------
    X : features passed to ``clf.predict``.
    y : true labels that the predictions are compared against.
    clf : object exposing ``predict(X)``.
    show_accuracy : when true, print the accuracy to three decimals.
    show_classification_report : when true, print
        ``metrics.classification_report``.
    show_confussion_matrix : when true, print ``metrics.confusion_matrix``.

    Returns
    -------
    None. All results are written to standard output.
    """
    predicted = clf.predict(X)

    if show_accuracy:
        accuracy_text = "Accuracy:{0:.3f}".format(metrics.accuracy_score(y, predicted))
        print(accuracy_text, "\n")

    if show_classification_report:
        print("Classification report")
        report = metrics.classification_report(y, predicted)
        print(report, "\n")

    if show_confussion_matrix:
        print("Confusion matrix")
        matrix = metrics.confusion_matrix(y, predicted)
        print(matrix, "\n")
In [24]:
# Report the metrics on the held-out test split rather than on the rows the
# tree was trained on.
measure_performance(x_test,y_test,dt)
In [26]:
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues, labels=None):
    """Draw a confusion matrix as a colour-coded image on the current figure.

    Parameters
    ----------
    cm : square confusion matrix (rows = true classes, columns = predicted).
    title : text placed above the plot.
    cmap : matplotlib colormap used for the cells.
    labels : optional sequence of class names for the tick marks. When
        omitted, the row indices ``0 .. len(cm) - 1`` are used; these are
        positions in ``cm``, not necessarily the class values themselves.

    Returns
    -------
    None. The plot is drawn through the pyplot state machine.
    """
    # The tick labels used to come from an `iris` global that this notebook
    # never defines, so any call raised NameError. Take them from the caller
    # instead, with a default derived from the matrix itself.
    if labels is None:
        labels = np.arange(len(cm))

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45)
    plt.yticks(tick_marks, labels)

    plt.tight_layout()
    plt.ylabel('True color')
    plt.xlabel('Predicted color')
In [27]:
# Refit the tree on the training split, then predict labels for the test rows.
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)
In [29]:
# cross_val_score lives in sklearn.model_selection since scikit-learn 0.18;
# the sklearn.cross_validation module was removed in 0.20. Prefer the current
# location and fall back to the legacy one only on old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
In [44]:
# Feature matrix for cross-validation: every row, columns 0 through 10
# (the same slice as the earlier feature cell).
x = numpyMatrix[:,:11]
x
Out[44]:
In [45]:
# Target vector: column 11 picked with an integer index, which yields a 1-D
# array (unlike the earlier `[:,11:]` slice, which keeps a second axis).
y = numpyMatrix[:,11]
y
Out[45]:
In [46]:
# Cross-validate the classifier on the full x / y arrays with cv=10 folds.
scores = cross_val_score(dt,x,y,cv=10)
In [47]:
# Show the individual score obtained on each fold.
scores
Out[47]:
In [48]:
# Average the per-fold scores into a single summary figure.
scores.mean()
Out[48]:
In [43]:
# Run the query only to populate cursor.description, whose entries start with
# the column name.
cursor.execute("Select * FROM winequality")
colnames = [column_meta[0] for column_meta in cursor.description]
colnames
Out[43]:
In [50]:
# One marker per feature: x is the feature's position in the feature matrix,
# y is the importance the fitted tree assigned to it.
plt.plot(dt.feature_importances_, 'o')
plt.xlabel('Feature index')
plt.ylabel('Importance')
# NOTE(review): the y-range of (-5, 10) is kept from the original cell (which
# set it twice); it is very wide for importance values — confirm the range.
plt.ylim(-5, 10)
Out[50]:
In [ ]: