In [12]:
import os

import pg8000

# Connection settings for the training PostgreSQL instance. Every value can be
# overridden through a TRAINING_DB_* environment variable so that real
# credentials never need to be committed with the notebook. The fallbacks are
# the original classroom settings, so nothing changes when no variable is set.
conn = pg8000.connect(
    host=os.environ.get('TRAINING_DB_HOST', 'training.c1erymiua9dx.us-east-1.rds.amazonaws.com'),
    database=os.environ.get('TRAINING_DB_NAME', 'training'),
    port=int(os.environ.get('TRAINING_DB_PORT', '5432')),
    user=os.environ.get('TRAINING_DB_USER', 'dot_student'),
    password=os.environ.get('TRAINING_DB_PASSWORD', 'qgis'),
)
cursor = conn.cursor()
In [13]:
database=cursor.execute("SELECT * FROM winequality")
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_sql("SELECT * FROM winequality", conn)
In [14]:
# Preview the first five rows of the loaded table.
df.head(n=5)
Out[14]:
In [15]:
def _clean_column_name(name):
    """Return a column label as plain text.

    ``bytes`` labels are decoded as UTF-8; anything else goes through ``str``.
    """
    if isinstance(name, bytes):
        return name.decode('utf-8')
    return str(name)


# The earlier cleanup took str(label)[1:] and then stripped single quotes,
# which turns the repr of a bytes label (b'color') into color -- presumably the
# form the driver returned; confirm against the df.head() output. That approach
# silently eats the first character of labels that are already text, and eats
# one more on every re-run of this cell. Decoding explicitly gives the same
# names for bytes labels and makes the cell safe to re-run.
df = df.rename(columns=_clean_column_name)
In [16]:
# Show the column labels as they stand after the renaming step above.
df.columns
Out[16]:
In [17]:
# Print pandas' summary of the DataFrame (columns, dtypes, non-null counts).
df.info()
In [ ]:
In [ ]:
In [ ]:
In [18]:
# Feature matrix: every column except the 'color' label, as a NumPy array.
# .loc / .values replace .ix / .as_matrix(), which were deprecated and later
# removed from pandas; the boolean column mask and resulting array are the same.
x = df.loc[:, df.columns != 'color'].values
x
Out[18]:
In [20]:
# Target vector: the 'color' column, i.e. the class label to predict
# (not the attributes). .values replaces the removed .as_matrix().
y = df['color'].values
y
Out[20]:
In [ ]:
In [24]:
from sklearn import tree
import matplotlib.pyplot as plt
import numpy as np
In [25]:
# Fix the seed so the fitted tree -- and therefore the cross-validation score
# and feature importances reported below -- is the same on every run.
dt = tree.DecisionTreeClassifier(random_state=0)
In [26]:
# Train the tree on the full feature matrix and label vector.
dt = dt.fit(X=x, y=y)
In [27]:
# cross_val_score lives in sklearn.model_selection in current scikit-learn;
# the sklearn.cross_validation module was deprecated and later removed.
# Fall back to the old location only for very old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
In [28]:
# Evaluate the tree with 10-fold cross-validation over the same data.
scores = cross_val_score(estimator=dt, X=x, y=y, cv=10)
In [29]:
# Average of the cross-validation scores.
scores.mean()
Out[29]:
In [45]:
# Column labels again -- presumably for matching positions in the
# feature-importance plot to column names.
df.columns
Out[45]:
In [44]:
# Plot the fitted decision tree's importance for each feature, one marker each.
importances = dt.feature_importances_
# x was built from every column except 'color', in column order, so these
# labels line up with the importance positions.
feature_names = df.columns[df.columns != 'color']
positions = list(range(len(importances)))

plt.plot(positions, importances, 'o')
plt.ylim(0, 1)
# Derive the x-range from the feature count instead of hard-coding 0-10, so no
# marker is clipped or drawn on the frame, whatever the number of columns.
plt.xlim(-0.5, len(importances) - 0.5)
plt.xticks(positions, feature_names, rotation=90)
plt.ylabel('feature importance')
# Author's reading of the original run: free_sulfur_dioxide is the most
# important feature.
# NOTE(review): re-check this against the labelled plot after re-running.
Out[44]:
In [ ]: