Logistic Regression

Again, thanks to Monte!


In [1]:
import numpy as np
from sklearn import datasets
from sklearn.utils import shuffle

random_state = np.random.RandomState(0)

iris = datasets.load_iris()
X = iris.data
y = iris.target

Make it a binary classification problem by dropping the first class (setosa) and relabeling the remaining two classes as 0 and 1


In [2]:
X, y = X[y != 0], y[y != 0]
n_samples, n_features = X.shape

y[y==1] = 0
y[y==2] = 1

In [3]:
print X.shape, y.shape
print set(y)


(100, 4) (100,)
set([0, 1])
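
A quick sanity check (not part of the original run): iris ships 50 samples per class, so after dropping setosa the two remaining classes should be perfectly balanced.

print np.bincount(y)   # expected: [50 50]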

Using sklearn


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split  # moved to sklearn.model_selection in newer scikit-learn

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred_test = clf.predict(X_test)
y_pred_train = clf.predict(X_train)
print accuracy_score(y_train, y_pred_train)
print accuracy_score(y_test, y_pred_test)


0.9875
0.85
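
Under the hood the model estimates P(y=1|x) = 1 / (1 + exp(-(w.x + b))). A minimal sketch (not part of the original notebook) that recomputes the test-set probabilities by hand from clf.coef_ and clf.intercept_ and checks them against predict_proba:

z = X_test.dot(clf.coef_.ravel()) + clf.intercept_      # linear scores w.x + b
p = 1.0 / (1.0 + np.exp(-z))                            # logistic sigmoid
print np.allclose(p, clf.predict_proba(X_test)[:, 1])   # expected: True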

Save to file


In [5]:
print y_train.shape
print y_train.reshape(y_train.shape[0],1).shape
print X_train.shape
cX = np.concatenate((y_train.reshape(y_train.shape[0], 1), X_train), axis=1)
cX.shape


(80,)
(80, 1)
(80, 4)
Out[5]:
(80, 5)

Write to file: the label goes in the first column, followed by the four features, space-delimited.


In [6]:
np.savetxt('iris_train.csv', cX, delimiter=' ', fmt='%0.4f')
!head iris_train.csv


1.0000 7.6000 3.0000 6.6000 2.1000
1.0000 6.0000 3.0000 4.8000 1.8000
0.0000 6.8000 2.8000 4.8000 1.4000
0.0000 5.8000 2.6000 4.0000 1.2000
1.0000 6.0000 2.2000 5.0000 1.5000
0.0000 6.7000 3.1000 4.4000 1.4000
0.0000 5.5000 2.6000 4.4000 1.2000
1.0000 6.3000 2.5000 5.0000 1.9000
0.0000 5.2000 2.7000 3.9000 1.4000
1.0000 7.7000 2.8000 6.7000 2.0000

In [7]:
cX = np.concatenate((y_test.reshape(len(y_test),1), X_test), axis=1)
np.savetxt('iris_test.csv', cX, delimiter=' ', fmt='%0.4f')

With Spark


In [10]:
points = sc.textFile('../pyspark-exercises/iris_train.csv', 18)  # second argument: minimum number of partitions
points.take(5)


Out[10]:
[u'1.0000 7.6000 3.0000 6.6000 2.1000',
 u'1.0000 6.0000 3.0000 4.8000 1.8000',
 u'0.0000 6.8000 2.8000 4.8000 1.4000',
 u'0.0000 5.8000 2.6000 4.0000 1.2000',
 u'1.0000 6.0000 2.2000 5.0000 1.5000']

In [11]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint

# each line holds the label followed by the four features, space-delimited
parsed_data = points.map(lambda line: np.array([float(x) for x in line.split(' ')]))
parsed_data = parsed_data.map(lambda arr: LabeledPoint(arr[0], arr[1:]))

print type(parsed_data)
parsed_data.take(1)


<class 'pyspark.rdd.PipelinedRDD'>
Out[11]:
[LabeledPoint(1.0, [7.6,3.0,6.6,2.1])]

In [13]:
model = LogisticRegressionWithSGD.train(parsed_data)
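
train() runs with its defaults here; in this MLlib version it also accepts SGD hyperparameters such as the number of iterations and the step size. A hedged sketch (the values are illustrative, not tuned):

# iterations and step are real train() parameters; these values are arbitrary
model_tuned = LogisticRegressionWithSGD.train(parsed_data, iterations=200, step=0.1)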

Any idea about the "Cleaned shuffle" messages? Hint: narrow versus wide transformations.
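
A toy illustration of the difference, independent of the iris data: map is narrow (each output partition depends on one input partition), while reduceByKey is wide (it must move records with the same key onto the same partition, producing the shuffle files those messages refer to).

rdd = sc.parallelize(range(8), 4)
narrow = rdd.map(lambda x: (x % 2, x))           # narrow: no data movement
wide = narrow.reduceByKey(lambda a, b: a + b)    # wide: triggers a shuffle
wide.collect()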


In [14]:
y = parsed_data.map(lambda x: x.label)
y_pred = parsed_data.map(lambda x: model.predict(x.features))

In [15]:
tmp = y.zip(y_pred)
tmp.take(5)


Out[15]:
[(1.0, 1), (1.0, 1), (0.0, 0), (0.0, 0), (1.0, 1)]

Training accuracy


In [16]:
1.0 - tmp.filter(lambda (y, p): y != p).count() / float(parsed_data.count())


Out[16]:
0.975
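
Note that parsed_data is traversed several times (once for training, then again for the predictions and counts), so each pass re-reads and re-parses the text file. Caching is the usual fix; a one-line sketch, not in the original run:

parsed_data.cache()   # keep the parsed LabeledPoints in memory across passes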

Test accuracy


In [17]:
points = sc.textFile('../pyspark-exercises/iris_test.csv', 18)
parsed_data = points.map(lambda line: np.array([float(x) for x in line.split(' ')]))
parsed_data = parsed_data.map(lambda arr: LabeledPoint(arr[0], arr[1:]))
y_pred = parsed_data.map(lambda x: model.predict(x.features))
y = parsed_data.map(lambda x: x.label)
tmp = y.zip(y_pred)
1.0 - tmp.filter(lambda (y, p): y != p).count() / float(parsed_data.count())


Out[17]:
0.85
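
Beyond plain accuracy, MLlib (Spark 1.4+) ships an evaluation helper. A sketch using BinaryClassificationMetrics, assuming the same model and parsed_data as above; clearThreshold() makes predict() return raw probabilities instead of hard 0/1 labels:

from pyspark.mllib.evaluation import BinaryClassificationMetrics

model.clearThreshold()   # predict() now returns probabilities
score_and_labels = parsed_data.map(lambda p: (float(model.predict(p.features)), p.label))
metrics = BinaryClassificationMetrics(score_and_labels)
print metrics.areaUnderROC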