In [17]:
! pip install sklearn-pandas
! pip install -U scikit-learn


Requirement already satisfied: sklearn-pandas in /usr/local/lib/python2.7/site-packages
Requirement already satisfied: scikit-learn>=0.15.0 in /usr/local/lib/python2.7/site-packages (from sklearn-pandas)
Requirement already satisfied: pandas>=0.11.0 in /usr/local/lib/python2.7/site-packages (from sklearn-pandas)
Requirement already satisfied: scipy>=0.14 in /usr/local/lib/python2.7/site-packages (from sklearn-pandas)
Requirement already satisfied: numpy>=1.6.1 in /usr/local/lib/python2.7/site-packages (from sklearn-pandas)
Requirement already satisfied: python-dateutil in /usr/local/lib/python2.7/site-packages (from pandas>=0.11.0->sklearn-pandas)
Requirement already satisfied: pytz>=2011k in /usr/local/lib/python2.7/site-packages (from pandas>=0.11.0->sklearn-pandas)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python2.7/site-packages (from python-dateutil->pandas>=0.11.0->sklearn-pandas)
Requirement already up-to-date: scikit-learn in /usr/local/lib/python2.7/site-packages

In [23]:
import json
import numpy as np
from pandas import read_csv
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
#from sklearn_pandas import DataFrameMapper
from sklearn2pmml import PMMLPipeline, sklearn2pmml
from sklearn.preprocessing import FunctionTransformer


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-23-d598e9571c67> in <module>()
      5 from sklearn.linear_model import LogisticRegression
      6 #from sklearn_pandas import DataFrameMapper
----> 7 from sklearn2pmml import PMMLPipeline, sklearn2pmml
      8 from sklearn.preprocessing import FunctionTransformer

ImportError: No module named sklearn2pmml

In [2]:
# Remote location of the CSV that is passed to read_csv further down.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
)

In [3]:
# Column names for the eight input features, in the order they are handed to
# read_csv; `label` names the target column, which is appended after them.
features = [
    'preg',
    'plas',
    'pres',
    'skin',
    'test',
    'mass',
    'pedi',
    'age',
]
label = 'label'

In [4]:
# Read the CSV at `url`, naming the columns explicitly: the feature names
# followed by the label column.
dataframe = read_csv(
    url,
    names=features + [label],
)

In [5]:
# Separate the frame into the feature columns (X) and the target column (Y).
X, Y = dataframe[features], dataframe[label]

In [6]:
# Split into train and test portions with test_size=0.33; random_state is
# fixed so the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [9]:
# Create the classifier with no arguments, i.e. with the library's default
# hyper-parameters (the fitted estimator's repr in the next cell's output
# lists the values that were in effect).
clf = LogisticRegression()

In [10]:
# Fit the classifier on the training split. The call is left as the bare last
# expression so the notebook echoes its return value as the cell output.
clf.fit(X_train, Y_train)


Out[10]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [11]:
# Print the classifier's score on the held-out test split. The call form of
# print is used so the cell is valid on Python 3 as well; with a single
# argument it prints the same text under Python 2.
print(clf.score(X_test, Y_test))


0.767716535433

In [13]:
# Persist the fitted coefficient array as JSON (nested lists) in
# 'logreg_coefs.json'.
# NOTE(review): only clf.coef_ is written here; clf.intercept_ is not --
# confirm that whatever reads this file does not need the intercept.
with open('logreg_coefs.json', 'w') as f:
    f.write(json.dumps(clf.coef_.tolist()))