In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Location of the regression dataset.
# NOTE(review): hardcoded absolute local path — consider a configurable DATA_DIR.
base_path = "/Users/qiangzhang/Google Drive/workspace (1)/Agent-analysis/data/"
parcel_path = "%sRegression.csv" % base_path

In [3]:
# Load the parcel table and binarize the target: any positive sale count -> 1.
data = pd.read_csv(parcel_path)
data['sold'] = data['sold'].gt(0).astype(int)

In [ ]:


In [4]:
data.groupby('sold').mean()


Out[4]:
Unnamed: 0 PIN age price salary year
sold
0 54347.107916 5.352586e+13 51.571959 106475.372446 59388.106064 2013.995865
1 56861.829400 5.372249e+13 52.434077 289176.123043 58641.998376 2014.083568

In [5]:
data.groupby('year').mean()


Out[5]:
Unnamed: 0 PIN age price salary sold
year
2013 18107.5 5.353428e+13 50.627043 114177.364995 59256.484123 0.075574
2014 54380.5 5.354174e+13 51.682191 113424.259400 59012.693752 0.082356
2015 90833.0 5.355143e+13 52.615858 138092.627642 59702.884101 0.095940

In [6]:
# Partition by class so equal-sized samples can be drawn from each.
data_sold = data.loc[data['sold'].eq(1)]
data_unsold = data.loc[data['sold'].eq(0)]

In [7]:
# Balanced sampling: 2000 train + 2000 test rows from each class, so the
# combined train and test sets are exactly 50/50 sold vs. unsold.
# features = ['age', 'salary', 'price']
feature_cols = ['age', 'salary', 'price']
XS_train, XS_test, yS_train, yS_test = train_test_split(
    data_sold[feature_cols], data_sold['sold'],
    train_size=2000, test_size=2000, random_state=42)
XU_train, XU_test, yU_train, yU_test = train_test_split(
    data_unsold[feature_cols], data_unsold['sold'],
    train_size=2000, test_size=2000, random_state=42)

# DataFrame/Series.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the per-class splits.
X_train = pd.concat([XS_train, XU_train]).values
X_test = pd.concat([XS_test, XU_test]).values
y_train = pd.concat([yS_train, yU_train]).values
y_test = pd.concat([yS_test, yU_test]).values

In [8]:
# Fit a logistic regression on the balanced training set.
# NOTE(review): features (age ~50, salary/price ~1e5) are unscaled — the
# coefficients are not directly comparable; consider StandardScaler.
model = LogisticRegression()
model = model.fit(X_train, y_train)

# Mean accuracy on the training data. print() works on both Python 2 and 3
# (the Python-2-only `print x` statement breaks under Python 3).
print(model.score(X_train, y_train))


0.67875

In [9]:
print y_train.mean()


0.5

In [10]:
# Fitted coefficients, in feature order. print() is Py2/Py3 compatible,
# unlike the Python-2-only `print "..."` statement form.
print("age, salary, price coefficients")
print(model.coef_)


age, salary, price coefficients
[[ -1.77362350e-08  -1.97805758e-05   7.88090537e-06]]

In [11]:
# Class-membership probabilities for each test sample (columns: P(0), P(1)).
probs = model.predict_proba(X_test)
# Hard 0/1 labels at the default 0.5 decision threshold.
predicted = model.predict(X_test)

In [12]:
# generate class probabilities (numpy truncates the printed array).
# print() is Py2/Py3 compatible, unlike the Python-2-only print statement.
print(probs)


[[ 0.59993621  0.40006379]
 [ 0.25368794  0.74631206]
 [ 0.24158786  0.75841214]
 ..., 
 [ 0.34754046  0.65245954]
 [ 0.62314857  0.37685143]
 [ 0.36337563  0.63662437]]

In [13]:
# generate evaluation metrics: test accuracy and ROC AUC (AUC uses the
# positive-class probability, not the hard label). print() is Py2/Py3 compatible.
print(metrics.accuracy_score(y_test, predicted))
print(metrics.roc_auc_score(y_test, probs[:, 1]))


0.6825
0.73320175

In [14]:
print metrics.classification_report(y_test, predicted)


             precision    recall  f1-score   support

          0       0.66      0.74      0.70      2000
          1       0.71      0.62      0.66      2000

avg / total       0.69      0.68      0.68      4000


In [15]:
print metrics.confusion_matrix(y_test, predicted)


[[1485  515]
 [ 755 1245]]

In [ ]:


In [ ]: