The client, bank XYZ, is running a direct marketing campaign based on phone calls. The classification goal is to predict whether a contacted customer will subscribe to a term deposit.
The data is obtained from the UCI Machine Learning Repository.
In [1]:
#1. import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (10, 6)
In [2]:
#2. import data
# Load the campaign records into a DataFrame. The path is relative to the
# notebook's working directory; read_csv is called with its default parsing
# options (presumably a comma-separated file with a header row — TODO confirm).
bank = pd.read_csv("data/bank.csv")
In [3]:
#3. Display first few records
# Quick sanity check that the file was parsed into the expected columns.
bank.head()
Out[3]:
In [4]:
#4. check column names
# The modelling cells below select predictors and target by column position,
# so the column order shown here matters.
bank.columns
Out[4]:
In [5]:
#5. do label encoding
from sklearn import preprocessing
In [6]:
# Label-encode only the non-numeric (categorical/text) columns. Numeric columns
# are copied through unchanged: running LabelEncoder over them would replace
# their real magnitudes with ordinal codes, distorting both the plots and the
# model inputs.
bank_e = bank.copy()
for column_name in bank.select_dtypes(exclude="number").columns:
    bank_e[column_name] = preprocessing.LabelEncoder().fit_transform(bank[column_name])
In [7]:
#6. do some exploratory data analysis
# Preview the encoded frame to confirm every column is now numeric.
bank_e.head()
Out[7]:
In [8]:
# Scatter plot: balance column of bank_e on x, deposit column on y
plt.scatter(x=bank_e["balance"], y=bank_e["deposit"], s=150)
plt.xlabel('balance')
plt.ylabel('deposit')
Out[8]:
In [9]:
#Plot duration vs deposit
plt.scatter(bank_e.duration,
            bank_e.deposit,
            s=150)
# Label the x-axis with the column that is actually plotted.
plt.xlabel('duration')
plt.ylabel('deposit')
Out[9]:
In [10]:
#7. Build logistic regression model
from sklearn.linear_model import LogisticRegression
In [11]:
#8. Build L2 logistic regression model
# Constructed with library defaults only; no penalty or solver is passed
# explicitly, so the L2 behaviour relies on the scikit-learn default.
model_logistic_l2 = LogisticRegression()
In [12]:
# Fit on every row: the first 16 columns (by position) are the predictors and
# the column at position 16 is the response.
model_logistic_l2.fit(X=bank_e.iloc[:, :16], y=bank_e.iloc[:, 16])
Out[12]:
In [13]:
# Inspect the coefficients learned by the L2 model fitted above.
model_logistic_l2.coef_
Out[13]:
In [14]:
#9. Build L1 logistic regression model
# The solver is named explicitly because only some solvers (e.g. liblinear,
# saga) support the L1 penalty. Relying on the library default raises a
# ValueError on scikit-learn >= 0.22, where the default solver became lbfgs.
model_logistic_l1 = LogisticRegression(penalty="l1", solver="liblinear")
In [15]:
# Train the L1 model on all rows (predictors = first 16 columns, response =
# column at position 16), then display its coefficients.
model_logistic_l1.fit(X=bank_e.iloc[:, :16], y=bank_e.iloc[:, 16])
model_logistic_l1.coef_
Out[15]:
In [16]:
#10. Build L1 logistic regression model with different values of C
# C is the inverse of the regularization strength, so a small value such as
# 0.001 regularizes heavily and tends to push more coefficients to exactly zero.
# liblinear is requested explicitly because the default solver on
# scikit-learn >= 0.22 (lbfgs) rejects the L1 penalty with a ValueError.
model_logistic_l1 = LogisticRegression(penalty="l1", C=0.001, solver="liblinear")
model_logistic_l1.fit(bank_e.iloc[:,:16], bank_e.iloc[:,16])
model_logistic_l1.coef_
Out[16]:
In [18]:
#10. Find generalization error. Use 80/20 split
# train_test_split lives in sklearn.model_selection; the older
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
In [19]:
# Hold out 20% of the rows for testing. random_state is fixed so that the split,
# and every metric derived from it, is reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(bank_e.iloc[:,:16],
                                                    bank_e.iloc[:,16],
                                                    test_size=0.2,
                                                    random_state=42)
In [22]:
# Inspect the type of the object train_test_split returns. This runs a
# separate throw-away split whose result is discarded; the x_train / x_test /
# y_train / y_test variables are not reassigned here.
type(train_test_split(bank_e.iloc[:,:16], bank_e.iloc[:,16],test_size=0.2))
Out[22]:
In [25]:
# Sanity-check the shapes of the full frame and of each piece of the split.
bank_e.shape, x_train.shape, y_train.shape, x_test.shape, y_test.shape
Out[25]:
In [26]:
# L2 logistic regression (library defaults), trained on the training split only;
# the last line displays the fitted coefficients.
model_logistic_l2 = LogisticRegression().fit(x_train, y_train)
model_logistic_l2.coef_
Out[26]:
In [23]:
#11. report mis-classification rate
# Predict a class label for every row of the held-out test features.
l2_predict = model_logistic_l2.predict(X=x_test)
In [24]:
# Display the estimator object itself (its repr) for reference.
model_logistic_l2
Out[24]:
In [40]:
# Side-by-side table of predicted and true labels for the test rows, paired by
# position. y_test is reduced to its raw values so its original row index
# plays no part in the pairing.
report = pd.DataFrame({"Prediction": l2_predict, "Actual": y_test.values})
report.head()
Out[40]:
In [46]:
# Flag the rows where the prediction agrees with the true label. Bracket
# assignment is used so the flag becomes a real column of the frame; plain
# attribute assignment would only attach a Python attribute to the object.
report["match"] = (report.Prediction == report.Actual)
In [51]:
# Misclassification rate in percent: 100 minus the percentage of test rows
# whose prediction matched the actual label.
misclassification_rate = 100 - report.match.sum() / len(report) * 100
In [52]:
# Show the test-set misclassification rate (expressed as a percentage).
print(misclassification_rate)