In [1]:
import numpy as np
from sklearn import datasets
from sklearn.utils import shuffle  # NOTE(review): imported but never used in this notebook
# Seeded RNG for reproducibility.
# NOTE(review): random_state is never referenced again — confirm it (and the
# shuffle import above) can be removed.
random_state = np.random.RandomState(0)
# Load the Iris dataset; feature matrix X and integer class labels y
# (three classes: 0, 1, 2 — see the filtering in the next cell).
iris = datasets.load_iris()
X = iris.data
y = iris.target
Make it a binary classification problem by removing the first class (label 0) and relabeling the remaining two classes as 0 and 1
In [2]:
# Drop every sample of class 0, leaving only classes 1 and 2.
keep = y != 0
X, y = X[keep], y[keep]
n_samples, n_features = X.shape
# Shift the surviving labels {1, 2} down to {0, 1} in place.
y -= 1
In [3]:
# Sanity check: shapes after filtering, and the remaining label set.
# Python-2 print statements are syntax errors on Python 3; the function
# form below produces identical output.
print(X.shape, y.shape)
print(set(y))
In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# sklearn.cross_validation was removed in scikit-learn 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 20% of the data for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred_test = clf.predict(X_test)
y_pred_train = clf.predict(X_train)
# Report train accuracy first, then test accuracy.
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))
In [5]:
print(y_train.shape)
print(y_train.reshape(y_train.shape[0], 1).shape)
print(X_train.shape)
# Prepend the label as column 0 of the matrix. Infer the row count with -1
# instead of the hard-coded 80, so this works for any train-split size.
cX = np.concatenate((y_train.reshape(-1, 1), X_train), axis=1)
cX.shape
Out[5]:
Write the training data (label in the first column, features after) to a file.
In [6]:
# Save one row per sample: label then the four features, space-separated,
# formatted to 4 decimal places.
np.savetxt('iris_train.csv', cX, delimiter=' ', fmt='%0.4f')
# Preview the written file (IPython shell escape).
!head iris_train.csv
In [7]:
# Same layout as the training file: label in column 0, features after.
# column_stack treats the 1-D label vector as a column automatically.
cX = np.column_stack((y_test, X_test))
np.savetxt('iris_test.csv', cX, delimiter=' ', fmt='%0.4f')
In [10]:
# Read the training CSV as an RDD of raw text lines, split into 18 partitions.
# NOTE(review): `sc` (a SparkContext) must already exist in this session —
# it is not created anywhere in this notebook.
points = sc.textFile('../pyspark-exercises/iris_train.csv', 18)
points.take(5)
Out[10]:
In [11]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
# LabeledPoint is defined in pyspark.mllib.regression; the classification
# module only re-exports it, so import it from its home module.
from pyspark.mllib.regression import LabeledPoint

# Each line is "label f1 f2 f3 f4" (space-separated floats, written above).
# Parse to a float array, then wrap as LabeledPoint(label, features).
parsed_data = points.map(lambda line: np.array([float(x) for x in line.split(' ')]))
parsed_data = parsed_data.map(lambda arr: LabeledPoint(arr[0], arr[1:]))
# Python-3 print function (the original print statement fails on Python 3).
print(type(parsed_data))
parsed_data.take(1)
Out[11]:
In [13]:
# Train a logistic-regression model with stochastic gradient descent,
# using MLlib's default hyperparameters (iterations, step size, etc.).
model = LogisticRegressionWithSGD.train(parsed_data)
Any idea about the "Cleaned shuffle" messages? Hint: narrow versus wide transformations.
In [14]:
# True labels as an RDD, aligned element-for-element with parsed_data.
y = parsed_data.map(lambda x: x.label)
# Predictions: the model is captured in the closure and shipped to workers.
# NOTE(review): some newer Spark versions disallow referencing an MLlib
# model inside a transformation — verify against the Spark version in use.
y_pred = parsed_data.map(lambda x: model.predict(x.features))
In [15]:
# Pair each true label with its prediction: an RDD of (label, prediction).
# zip requires both RDDs to have identical partitioning — true here since
# both derive from parsed_data via map.
tmp = y.zip(y_pred)
tmp.take(5)
Out[15]:
Training accuracy
In [16]:
# Training accuracy = 1 - misclassification rate. Tuple-parameter lambdas
# (`lambda (y, p): ...`) are Python-2-only syntax; unpack by index instead.
1.0 - tmp.filter(lambda yp: yp[0] != yp[1]).count() / float(parsed_data.count())
Out[16]:
Test accuracy
In [17]:
# Repeat the parse/predict pipeline on the held-out test file.
points = sc.textFile('../pyspark-exercises/iris_test.csv', 18)
parsed_data = points.map(lambda line: np.array([float(x) for x in line.split(' ')]))
parsed_data = parsed_data.map(lambda arr: LabeledPoint(arr[0], arr[1:]))
y_pred = parsed_data.map(lambda x: model.predict(x.features))
y = parsed_data.map(lambda x: x.label)
tmp = y.zip(y_pred)
# Test accuracy; indexed lambda replaces Python-2-only tuple unpacking,
# which is a syntax error on Python 3.
1.0 - tmp.filter(lambda yp: yp[0] != yp[1]).count() / float(parsed_data.count())
Out[17]: