In [ ]:
# Set Up the Code
In [60]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy
from pandas import DataFrame
from sklearn import datasets
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
#matplotlib.style.use('ggplot')
In [68]:
RANDOM_SEED = 666
TEST_FRACTION = 0.25
COLORS = 'r m b'.split()
In [46]:
iris = datasets.load_iris()
In [47]:
print(iris.keys())
In [48]:
print(iris.DESCR)
In [49]:
frame = DataFrame(iris.data, columns=iris.feature_names)
frame.describe()
Out[49]:
In [50]:
figure = plt.figure()
axe = figure.gca()
axes = frame.plot(kind='kde', ax=axe)
In [51]:
fig = plt.figure()
axe = fig.gca()
axe = frame.plot(kind='box', ax=axe)
In [52]:
# scatter plot each pair of features, coloring the points by species
for x_feature in range(3):
    for y_feature in range(x_feature + 1, 4):
        fig = plt.figure()
        axe = fig.gca()
        for target in range(len(COLORS)):
            data = frame[iris.target == target]
            label = iris.target_names[target]
            color = COLORS[target]
            x_data = iris.feature_names[x_feature]
            y_data = iris.feature_names[y_feature]
            axe = data.plot(x=x_data, y=y_data,
                            kind='scatter',
                            ax=axe,
                            label=label,
                            facecolor=color)
It looks like setosa is always easily separable from the other two species, and that petal width and petal length might be the most useful features for separating the classes.
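To make that impression a little more concrete, here is a quick sketch (an addition, not part of the original analysis) that groups the petal measurements by species; mostly non-overlapping ranges would support the idea that the petal features separate the classes well.
In [ ]:
# group the petal measurements by species and compare their ranges
# (a small check added to back up the eyeballed conclusion above)
petal_frame = frame[iris.feature_names[2:]].copy()
petal_frame['species'] = iris.target_names[iris.target]
print(petal_frame.groupby('species').describe())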
In [ ]:
class XYData(object):
    """Holds the names of the x and y features for a plot."""
    def __init__(self, x_name, y_name):
        self.x_name = x_name
        self.y_name = y_name
        return
# end XYData
In [53]:
x_train = {}
x_test = {}
y_train = {}
y_test = {}
sepal_data = frame[iris.feature_names[:2]]
xtrain, xtest, ytrain, ytest = train_test_split(sepal_data,
                                                iris.target,
                                                test_size=TEST_FRACTION,
                                                random_state=RANDOM_SEED)
# fit the scaler on the training split only, so no test-set statistics leak in
scaler = preprocessing.StandardScaler().fit(xtrain)
xtrain, xtest = scaler.transform(xtrain), scaler.transform(xtest)
In [58]:
classifier = SGDClassifier(random_state=RANDOM_SEED)
classifier = classifier.fit(xtrain, ytrain)
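The held-out split isn't scored anywhere below, so here is a quick sanity check (a small addition, not in the original notebook) of how well the two sepal features do on their own.
In [ ]:
# accuracy on the training split versus the held-out split
# (an added check; the original notebook never scores the test data)
print('training accuracy: {0:.2f}'.format(classifier.score(xtrain, ytrain)))
print('test accuracy: {0:.2f}'.format(classifier.score(xtest, ytest)))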
In [70]:
def plot_fit(x_train, y_train, classifier):
    """Plot each one-vs-rest decision boundary over the (scaled) training data."""
    x_min, x_max = x_train[:, 0].min() - .5, x_train[:, 0].max() + .5
    y_min, y_max = x_train[:, 1].min() - .5, x_train[:, 1].max() + .5
    xs = numpy.arange(x_min, x_max, 0.5)
    for plot in range(len(iris.target_names)):
        figure = plt.figure()
        class_name = iris.target_names[plot]
        axe = figure.gca()
        axe.set_title('Class {0} versus the rest'.format(class_name))
        axe.set_xlabel('Sepal Length (scaled)')
        axe.set_ylabel('Sepal Width (scaled)')
        axe.set_xlim(x_min, x_max)
        axe.set_ylim(y_min, y_max)
        # scatter the training points, colored by their true class
        for index, classification in enumerate(iris.target_names):
            this_train = x_train[y_train == index]
            axe.scatter(this_train[:, 0], this_train[:, 1],
                        c=COLORS[index], label=classification)
        # the boundary is where coef[0] * x + coef[1] * y + intercept == 0,
        # solved here for y as a function of x
        ys = (-classifier.intercept_[plot] - xs * classifier.coef_[plot, 0]) / classifier.coef_[plot, 1]
        axe.legend()
        axe.plot(xs, ys)
    return
plot_fit(xtrain, ytrain, classifier)
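As a check on the boundary formula used in plot_fit (a sketch added here, not in the original): a point that sits on the plotted line for a given class should get a decision_function value of roughly zero for that class.
In [ ]:
# take the point on class 0's boundary at x = 0 and confirm that the
# one-vs-rest decision function is approximately zero there
x_point = 0.0
y_point = (-classifier.intercept_[0] - x_point * classifier.coef_[0, 0]) / classifier.coef_[0, 1]
point = numpy.array([[x_point, y_point]])
print(classifier.decision_function(point)[0, 0])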
In [ ]: