In [2]:
# code for logistic regression

import pandas as pd
import numpy as np
import json
import csv

from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

In [3]:
# Path to the training file — adjust for your environment.
# Hoisted into a named constant so it is tunable in one place.
TRAIN_PATH = 'yourpath/train.json'

df = pd.read_json(TRAIN_PATH)

In [4]:
# Select the model features + target, encode categoricals, log-transform price.
feature_cols = ['bathrooms', 'bedrooms', 'building_id', 'latitude', 'longitude',
                'manager_id', 'price', 'interest_level']
# .copy() so the column assignments below hit a real frame, not a view of df
# (avoids SettingWithCopyWarning / silently lost writes).
d = df.loc[:, feature_cols].copy()

# Log-transform price to compress its heavy right tail.
d['price'] = np.log(d['price'])

# One encoder per column: re-fitting a single shared LabelEncoder (as before)
# clobbers its classes_, making any later inverse_transform impossible.
d['building_id'] = LabelEncoder().fit_transform(d['building_id'])
d['manager_id'] = LabelEncoder().fit_transform(d['manager_id'])

# Encode the string target as integer class labels (alphabetical class order).
target_encoder = LabelEncoder()
y = target_encoder.fit_transform(d.loc[:, 'interest_level'])

# Sanity check on the transformed price range.
print(min(d['price']), max(d['price']))


3.76120011569 15.3173632597

In [5]:
# Unregularized-in-practice logistic regression: C=1e6 makes the L2 penalty
# negligible. class_weight='balanced' reweights classes by inverse frequency.
# NOTE: solver='liblinear' fits one-vs-rest (it does not support multinomial).
# max_iter must be an int — newer scikit-learn versions reject the float 1e3.
logit = linear_model.LogisticRegression(C=1e6, class_weight='balanced',
                                        penalty='l2', max_iter=1000,
                                        solver='liblinear')
logit.fit(d.iloc[:, :-1], y)

# Mean accuracy on the TRAINING data — optimistic, since nothing is held out.
print(logit.score(d.iloc[:, :-1], y))

# One row of coefficients per class (one-vs-rest), plus per-class intercepts.
print([logit.coef_, logit.intercept_])


0.650632193224
[array([[  2.26513369e-01,   6.78565584e-01,   1.06437588e-04,
         -4.92785082e-02,  -3.66350159e-01,  -5.23551156e-05,
         -3.47762858e+00],
       [ -1.68144612e-01,  -6.17698604e-01,  -1.50601154e-04,
          7.05298508e-03,   2.74802254e-01,  -2.63436091e-06,
          2.70907793e+00],
       [ -1.14225498e-02,   4.25120236e-01,   1.29242476e-04,
          2.77565666e-02,  -1.34009608e-01,   2.53296499e-05,
         -1.61667342e+00]]), array([ 0.00555604, -0.00822092,  0.00303156])]

In [6]:
# In-sample class-probability matrix (n_samples x n_classes); columns are
# ordered by logit.classes_, i.e. the LabelEncoder's integer codes.
a = logit.predict_proba(d.iloc[:, :-1])

In [56]:
# Dump the predicted probabilities to CSV.
# (csv is already imported in the top imports cell — no re-import needed.)
# Python 3: csv files are opened in text mode with newline='' — the old
# binary 'wb' mode is a Python 2 idiom and raises TypeError under Python 3.
with open("logistic.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(a)

In [9]:
# Multiclass log loss of the in-sample probabilities.
# Score against the encoded target `y` (the labels the model was fit on),
# not the raw string column: the original d.iloc[:, -1] only matched the
# columns of `a` because both label sets happen to sort alphabetically.
ll = log_loss(y, a)

In [10]:
# Training-set multiclass log loss (bare last expression -> rich display).
ll


Out[10]:
0.8086483084591477