In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
In [2]:
# Location of the input data.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run unchanged on the original author's machine.
base_path = "/Users/qiangzhang/Google Drive/workspace (1)/Agent-analysis/data/"
csv_name = "Regression.csv"
parcel_path = base_path + csv_name
In [3]:
# Read the CSV, then collapse the raw `sold` column into a binary 0/1 label:
# 1 where the original value is strictly positive, 0 otherwise.
raw_data = pd.read_csv(parcel_path)
data = raw_data
data['sold'] = data['sold'].gt(0).astype(int)
In [ ]:
In [4]:
# Per-class column means, grouped by the binary `sold` label.
# numeric_only=True keeps the historical behaviour of skipping non-numeric
# columns; since pandas 2.0 the default raises TypeError on such columns.
data.groupby('sold').mean(numeric_only=True)
Out[4]:
In [5]:
# Per-year column means.
# numeric_only=True keeps the historical behaviour of skipping non-numeric
# columns; since pandas 2.0 the default raises TypeError on such columns.
data.groupby('year').mean(numeric_only=True)
Out[5]:
In [6]:
# Partition the rows by label so each class can be sampled separately.
sold_flag = data['sold']
data_sold = data[sold_flag.eq(1)]
data_unsold = data[sold_flag.eq(0)]
In [7]:
# Build class-balanced train/test sets: draw 2000 training rows and 2000 test
# rows independently from the sold rows and from the unsold rows, then stack
# the two classes (sold first, unsold second) into plain NumPy arrays.
# Features are the 'age', 'salary' and 'price' columns; the label is 'sold'.
# NOTE(review): each class needs at least 4000 rows for these sizes to be
# satisfiable -- confirm against the data.
feature_cols = ['age', 'salary', 'price']
XS_train, XS_test, yS_train, yS_test = train_test_split(
    data_sold[feature_cols], data_sold['sold'],
    train_size=2000, test_size=2000, random_state=42)
XU_train, XU_test, yU_train, yU_test = train_test_split(
    data_unsold[feature_cols], data_unsold['sold'],
    train_size=2000, test_size=2000, random_state=42)

# pd.concat replaces DataFrame.append / Series.append, which were deprecated
# in pandas 1.4 and removed in pandas 2.0.
X_train = pd.concat([XS_train, XU_train]).values
X_test = pd.concat([XS_test, XU_test]).values
y_train = pd.concat([yS_train, yU_train]).values
y_test = pd.concat([yS_test, yU_test]).values
In [8]:
# Fit a logistic regression with scikit-learn's default settings on the
# training arrays, then report its score on that same training data.
model = LogisticRegression()
model = model.fit(X_train, y_train)
# Single-argument print(...) prints identically on Python 2 and Python 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(model.score(X_train, y_train))
In [9]:
# Mean of the 0/1 training labels, i.e. the fraction of sold rows in the
# training set. Uses print(...) so the cell runs on Python 2 and Python 3.
print(y_train.mean())
In [10]:
# Show the fitted coefficients, preceded by a header line naming the features.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print("age, salary, price coefficients")
print(model.coef_)
In [11]:
# Score the held-out rows with the fitted model.
# Class-probability estimates for every test row.
probs = model.predict_proba(X_test)
# Predicted class labels for the same rows.
predicted = model.predict(X_test)
In [12]:
# Display the class probabilities computed for the test set.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(probs)
In [13]:
# Evaluation metrics on the test set: accuracy of the predicted labels, and
# ROC AUC computed from the second column of the probability array.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(metrics.accuracy_score(y_test, predicted))
print(metrics.roc_auc_score(y_test, probs[:, 1]))
In [14]:
# Text classification report for the test-set predictions.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(metrics.classification_report(y_test, predicted))
In [15]:
# Confusion matrix of true vs. predicted labels on the test set.
# print(...) with one argument behaves the same on Python 2 and Python 3.
print(metrics.confusion_matrix(y_test, predicted))
In [ ]:
In [ ]: