In [2]:
# code for logistic regression
import pandas as pd
import numpy as np
import json
import csv
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
In [3]:
# Load the raw listings data (JSON records) into a DataFrame.
# NOTE(review): 'yourpath' is a placeholder -- point it at the real train.json.
train_path = 'yourpath/train.json'
df = pd.read_json(train_path)
In [4]:
le=LabelEncoder()
mycols = ['bathrooms', 'bedrooms', 'building_id', 'latitude', 'longitude', 'manager_id', \
'price', 'interest_level']
d=df.loc[:, mycols]
d.price = np.log(d.price)
d.building_id = le.fit_transform(d.building_id)
d.manager_id = le.fit_transform(d.manager_id)
y = d.loc[:, 'interest_level']
y = le.fit_transform(y)
print min(d.price), max(d.price)
In [5]:
logit = linear_model.LogisticRegression(C=1e6, class_weight='balanced', penalty = 'l2', \
max_iter = 1e3, solver = 'liblinear') #, multi_class = 'multinomial')
logit.fit(d.iloc[:, :-1], y)
## The score (accuracy for classification problems):
print logit.score(d.iloc[:, :-1], y)
print [logit.coef_, logit.intercept_]
In [6]:
# Predicted class-membership probabilities on the training features;
# columns follow the order of logit.classes_.
X_train = d.iloc[:, :-1]
a = logit.predict_proba(X_train)
In [56]:
# Persist the predicted probabilities row-by-row to CSV.
# (Redundant in-cell `import csv` removed -- csv is already imported in the
# notebook's top import cell.)
# "wb" is the correct mode for csv.writer under Python 2.
with open("logistic.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerows(a)
In [9]:
# Computing multiclass log loss on the training predictions.
# Score against the encoded target `y` (what the model was fit on), not the
# raw string column d.iloc[:, -1]: predict_proba's columns are ordered by
# logit.classes_ (the encoded integer labels), and passing raw strings only
# gave the right answer by the coincidence that log_loss and LabelEncoder
# both order labels alphabetically.
ll = log_loss(y, a)
In [10]:
# Display the training-set log loss (lower is better).
ll
Out[10]: