Logistic Regression
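
For a binary label $y \in \{0, 1\}$ and feature vector $x$, logistic regression models the class probability with the sigmoid of a linear score,

$$P(y = 1 \mid x) = \sigma(w^\top x + b), \qquad \sigma(z) = \frac{1}{1 + e^{-z}},$$

and predicts class 1 when that probability exceeds 0.5. Both parts of this notebook fit the weights $w$ and bias $b$ on the Iris data: scikit-learn with its default regularized solver, and Spark MLlib with stochastic gradient descent.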


In [ ]:
import numpy as np
from sklearn import datasets
from sklearn.utils import shuffle

random_state = np.random.RandomState(0)

iris = datasets.load_iris()
X = iris.data
y = iris.target

print(y)

In [ ]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline
iris_sns = sns.load_dataset("iris")

g = sns.PairGrid(iris_sns)
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot, cmap="Blues_d", n_levels=6);
plt.show()

Make this a binary classification problem by removing the first class (setosa) and relabeling the remaining classes as 0 and 1


In [ ]:
X, y = X[y != 0], y[y != 0]
n_samples, n_features = X.shape

y[y==1] = 0
y[y==2] = 1

In [ ]:
print(X.shape, y.shape)
print(set(y))

Using sklearn


In [ ]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LogisticRegression()
clf.fit(X_train,y_train)
y_pred_test = clf.predict(X_test)
y_pred_train = clf.predict(X_train)
print(accuracy_score(y_train, y_pred_train))
print(accuracy_score(y_test, y_pred_test))

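The fitted coefficients and intercept are the $w$ and $b$ of the logistic model above; a small optional check (not part of the original notebook) is to print them:


In [ ]:
# Optional: inspect the learned parameters of the sklearn model
print(clf.coef_)       # one weight per feature (w)
print(clf.intercept_)  # bias term (b)
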
Save to file: prepend the label column to the feature matrix


In [ ]:
print(y_train.shape)
print(y_train.reshape(y_train.shape[0], 1).shape)
print(X_train.shape)
cX = np.concatenate((y_train.reshape(-1, 1), X_train), axis=1)
cX.shape

Write the combined array to a CSV file and inspect the first lines


In [ ]:
np.savetxt('iris_train.csv', cX, delimiter=' ', fmt='%0.4f')
!head iris_train.csv

In [ ]:
cX = np.concatenate((y_test.reshape(len(y_test),1), X_test), axis=1)
np.savetxt('iris_test.csv', cX, delimiter=' ', fmt='%0.4f')

With Spark


In [ ]:
import findspark
import os
findspark.init()  # must be called before importing pyspark

import pyspark
sc = pyspark.SparkContext()

In [ ]:
points = sc.textFile('iris_train.csv', 18)
points.take(5)

In [ ]:
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint

parsed_data = points.map(lambda line: np.array([float(x) for x in line.split(' ')]))
parsed_data = parsed_data.map(lambda arr: LabeledPoint(arr[0],arr[1:]))

print(type(parsed_data))
parsed_data.take(1)

In [ ]:
model = LogisticRegressionWithSGD.train(parsed_data)

Any idea about the "Cleaned shuffle" messages? Hint: narrow versus wide transformations.
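
As a brief sketch of the distinction (an illustrative example using the `sc` defined above, not part of the original notebook): `map` is a narrow transformation, since each output partition depends on exactly one input partition, while `groupByKey` is wide, since records sharing a key must be brought together, forcing a shuffle. The SGD trainer aggregates gradients across the partitions on every iteration; once an iteration's intermediate shuffle data is no longer referenced, Spark's ContextCleaner removes it and logs a "Cleaned shuffle" message.


In [ ]:
rdd = sc.parallelize(range(8), 4)

# Narrow: each output partition is computed from a single input partition,
# so no data moves between executors and no shuffle files are written.
narrow = rdd.map(lambda x: (x % 2, x))

# Wide: records sharing a key must end up in the same partition,
# so Spark writes shuffle files and redistributes the data.
wide = narrow.groupByKey()
print(wide.mapValues(list).collect())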


In [ ]:
y = parsed_data.map(lambda x: x.label)
y_pred = parsed_data.map(lambda x: model.predict(x.features))

In [ ]:
tmp = y.zip(y_pred)
tmp.take(5)

Training accuracy


In [ ]:
1.0 - tmp.filter(lambda yp: yp[0] != yp[1]).count() / float(parsed_data.count())

Test accuracy


In [ ]:
points = sc.textFile('iris_test.csv', 18)
parsed_data = points.map(lambda line: np.array([float(x) for x in line.split(' ')]))
parsed_data = parsed_data.map(lambda arr: LabeledPoint(arr[0], arr[1:]))
y_pred = parsed_data.map(lambda x: model.predict(x.features))
y = parsed_data.map(lambda x: x.label)
tmp = y.zip(y_pred)
1.0 - tmp.filter(lambda yp: yp[0] != yp[1]).count() / float(parsed_data.count())

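Finally, it is good practice to stop the SparkContext so the local Spark application shuts down cleanly (a standard cleanup step, not part of the original notebook):


In [ ]:
sc.stop()  # release the resources held by the driver and executors
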
In [ ]: