In [1]:
import os
import pg8000

# Open the PostgreSQL connection used by the rest of the notebook.
# Each setting is read from the standard libpq environment variable when it is
# set, so credentials can be supplied without editing the notebook. The values
# previously hard-coded here are kept only as fallbacks so a plain
# "Run All" keeps working.
# NOTE(review): the fallback password is still stored in the notebook; drop the
# defaults once PGPASSWORD (etc.) is configured wherever this is run.
conn = pg8000.connect(
    host=os.environ.get('PGHOST', 'training.c1erymiua9dx.us-east-1.rds.amazonaws.com'),
    database=os.environ.get('PGDATABASE', "training"),
    port=int(os.environ.get('PGPORT', 5432)),
    user=os.environ.get('PGUSER', 'dot_student'),
    password=os.environ.get('PGPASSWORD', 'qgis'),
)
In [61]:
import pandas as pd

# Pull every row and column of the winequality table into a DataFrame over the
# open connection, then preview the first few rows.
wine_query = "select * from winequality"
df = pd.read_sql(wine_query, conn)
df.head()
Out[61]:
In [8]:
import numpy as np
# Convert the DataFrame to a plain 2-D NumPy array (one row per record).
# DataFrame.as_matrix() was deprecated and later removed from pandas, so it
# raises AttributeError on current versions; .values yields the same array and
# is available on both old and new pandas.
data = df.values
# Row count, shown as the cell output.
len(data)
Out[8]:
In [28]:
# Split every row into features (all columns before the last) and the target
# (the last column).
lastColIndex = len(data[0]) - 1
x = [row[:lastColIndex] for row in data]
y = [row[lastColIndex] for row in data]  # red or white
In [30]:
from sklearn import tree
# Fit a decision tree that predicts y from the feature rows in x.
# random_state is pinned so the fitted tree -- and therefore the
# cross-validation scores and feature importances derived from it -- are the
# same on every Restart & Run All. Without it, scikit-learn may break ties
# between equally good splits differently from run to run.
dt = tree.DecisionTreeClassifier(random_state=0)
dt.fit(x, y)
Out[30]:
In [31]:
# cross_val_score moved to sklearn.model_selection; the old
# sklearn.cross_validation module has since been removed. Try the current
# location first and fall back only on legacy installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# 10-fold cross-validated score of the classifier on the full data set.
scores = cross_val_score(dt, x, y, cv=10)
scores
Out[31]:
In [50]:
# Build the feature labels for the importance plot: every column of df except
# the last one, which is the column used as the target when x and y were split.
column_names = list(df.columns)
# pop() removes the final column wherever it sits, instead of assuming the
# target lives at index 11.
target_name = column_names.pop()
# The labels must line up one-to-one with the fitted tree's importances.
assert len(column_names) == len(dt.feature_importances_), \
    "feature names and feature importances are out of step"
# Show the removed (target) column name as the cell output.
target_name
Out[50]:
In [60]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(dt.feature_importances_, 'o')
plt.xticks(range(data.shape[1]),column_names, rotation=90)
plt.ylim(0,1)
Out[60]: