Project 2
In [571]:
from google.colab import drive
drive.mount('/content/drive')
In [572]:
ls
In [573]:
cd drive/'My Drive'/'Colab Notebooks'/MLP2/
In [574]:
ls
In [1]:
import pandas as pd
import numpy as np
from pandas import read_csv
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import os
# Where to save the figures and data files
PROJECT_ROOT_DIR = "Results"
FIGURE_ID = "Results/FigureFiles"
DATA_ID = "DataFiles/"
if not os.path.exists(PROJECT_ROOT_DIR):
    os.mkdir(PROJECT_ROOT_DIR)
if not os.path.exists(FIGURE_ID):
    os.makedirs(FIGURE_ID)
if not os.path.exists(DATA_ID):
    os.makedirs(DATA_ID)

def image_path(fig_id):
    return os.path.join(FIGURE_ID, fig_id)

def data_path(dat_id):
    return os.path.join(DATA_ID, dat_id)

def save_fig(fig_id):
    plt.savefig(image_path(fig_id) + ".png", format='png')
print("--1--data")
linkname = 'bank-additional-full.csv'
print("--1.2-read file")
dataset1 = pd.read_csv(linkname, sep = ';')
print("--1.2.2--readed")
# View the first rows of the dataset
print(dataset1.shape)
display(dataset1.head(70))
In [2]:
data1 = dataset1[dataset1['y'] == 'yes']
data2 = dataset1[dataset1['y'] == 'no']
print("*")
fig, ax = plt.subplots(2, 2, figsize=(12,10))
# Use the value_counts() order for the x positions so labels and heights stay aligned
order_dow = data1['day_of_week'].value_counts().index
b1 = ax[0, 0].bar(order_dow, height=data1['day_of_week'].value_counts(), color='#FF0006')
b2 = ax[0, 0].bar(order_dow, height=data2['day_of_week'].value_counts().reindex(order_dow, fill_value=0),
                  bottom=data1['day_of_week'].value_counts(), color='#00B9FF')
ax[0, 0].title.set_text('Day of week')
#ax[0, 0].legend((b1[0], b2[0]), ('Yes', 'No'))
order_month = data1['month'].value_counts().index
ax[0, 1].bar(order_month, height=data1['month'].value_counts(), color='#FF0006')
ax[0, 1].bar(order_month, height=data2['month'].value_counts().reindex(order_month, fill_value=0),
             bottom=data1['month'].value_counts(), color='#00B9FF')
ax[0, 1].title.set_text('Month')
order_job = data1['job'].value_counts().index
ax[1, 0].bar(order_job, height=data1['job'].value_counts(), color='#FF0006')
ax[1, 0].bar(order_job, height=data2['job'].value_counts().reindex(order_job, fill_value=0),
             bottom=data1['job'].value_counts(), color='#00B9FF')
ax[1, 0].title.set_text('Type of Job')
ax[1, 0].tick_params(axis='x', rotation=90)
order_edu = data1['education'].value_counts().index
ax[1, 1].bar(order_edu, height=data1['education'].value_counts(), color='#FF0006')
ax[1, 1].bar(order_edu, height=data2['education'].value_counts().reindex(order_edu, fill_value=0),
             bottom=data1['education'].value_counts(), color='#00B9FF')
ax[1, 1].title.set_text('Education')
ax[1, 1].tick_params(axis='x', rotation=90)
#ax[0, 1].xticks(rotation=90)
plt.figlegend((b1[0], b2[0]), ('Yes', 'No'),loc="right",title = "Term deposit")
save_fig('DataVisual-dow_mth_job_edu')
plt.show()
print(dataset1.shape)
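A less error-prone way to build these stacked bars (a sketch, not part of the original run) is to let pandas derive both the category order and the heights from a single crosstab:

# Sketch: stacked bars straight from a crosstab, so category order and
# counts stay aligned automatically (column names as in dataset1 above)
counts = pd.crosstab(dataset1['day_of_week'], dataset1['y'])
counts[['yes', 'no']].plot(kind='bar', stacked=True,
                           color=['#FF0006', '#00B9FF'], title='Day of week')
plt.show()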
In [3]:
fig, ax = plt.subplots(2, 3, figsize=(15,10))
# Use the value_counts() order for the x positions so labels and heights stay aligned
order_marital = data1['marital'].value_counts().index
b1 = ax[0, 0].bar(order_marital, height=data1['marital'].value_counts(), color='#FF0006')
b2 = ax[0, 0].bar(order_marital, height=data2['marital'].value_counts().reindex(order_marital, fill_value=0),
                  bottom=data1['marital'].value_counts(), color='#00B9FF')
ax[0, 0].title.set_text('Marital Status')
#ax[0, 0].legend((b1[0], b2[0]), ('Yes', 'No'))
order_housing = data1['housing'].value_counts().index
ax[0, 1].bar(order_housing, height=data1['housing'].value_counts(), color='#FF0006')
ax[0, 1].bar(order_housing, height=data2['housing'].value_counts().reindex(order_housing, fill_value=0),
             bottom=data1['housing'].value_counts(), color='#00B9FF')
ax[0, 1].title.set_text('(Has) housing (loan)')
order_loan = data1['loan'].value_counts().index
ax[0, 2].bar(order_loan, height=data1['loan'].value_counts(), color='#FF0006')
ax[0, 2].bar(order_loan, height=data2['loan'].value_counts().reindex(order_loan, fill_value=0),
             bottom=data1['loan'].value_counts(), color='#00B9FF')
ax[0, 2].title.set_text('(Has personal) loan')
order_contact = data1['contact'].value_counts().index
ax[1, 0].bar(order_contact, height=data1['contact'].value_counts(), color='#FF0006')
ax[1, 0].bar(order_contact, height=data2['contact'].value_counts().reindex(order_contact, fill_value=0),
             bottom=data1['contact'].value_counts(), color='#00B9FF')
ax[1, 0].title.set_text('Contact')
order_default = data1['default'].value_counts().index
ax[1, 1].bar(order_default, height=data1['default'].value_counts(), color='#FF0006')
ax[1, 1].bar(order_default, height=data2['default'].value_counts().reindex(order_default, fill_value=0),
             bottom=data1['default'].value_counts(), color='#00B9FF')
ax[1, 1].title.set_text('(Has credit in) Default')
order_pout = data1['poutcome'].value_counts().index
ax[1, 2].bar(order_pout, height=data1['poutcome'].value_counts(), color='#FF0006')
ax[1, 2].bar(order_pout, height=data2['poutcome'].value_counts().reindex(order_pout, fill_value=0),
             bottom=data1['poutcome'].value_counts(), color='#00B9FF')
ax[1, 2].title.set_text('Outcome of the previous marketing campaign')
ax[1, 2].title.set_text('Outcome of the previous marketing campaign')
plt.figlegend((b1[0], b2[0]), ('Yes', 'No'),loc="right",title = "Term deposit")
save_fig('DataVisual-marit_hous_loan_tcont_def_prevmc')
plt.show()
print(dataset1.shape)
In [4]:
count_day_of_week_response_pct = pd.crosstab(dataset1['y'],dataset1['day_of_week']).apply(lambda x: x/x.sum() * 100)
count_day_of_week_response_pct = count_day_of_week_response_pct.transpose()
print("+a")
plot_day_of_the_week = count_day_of_week_response_pct['yes'].sort_values(ascending = True).plot(kind ='bar',
figsize = (5,5))
plt.title('Subscription Rate by Day of the week')
plt.xlabel('Day of the week')
plt.ylabel('Subscription Rate')
# Label each bar
for rec, label in zip(plot_day_of_the_week.patches,
                      count_day_of_week_response_pct['yes'].sort_values(ascending=True).round(1).astype(str)):
    plot_day_of_the_week.text(rec.get_x() + rec.get_width() / 2,  # x: bar center
                              rec.get_height(),                   # y: bar top
                              label + '%',
                              ha='center',
                              va='bottom')
print("++a")
save_fig('DataVisual-Sr_week')
plt.show()
print(dataset1.shape)
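The cells In [5] through In [13] below repeat this exact pattern for other columns; a small helper (a sketch, not used in the original run) could factor it out:

# Sketch: reusable version of the per-category subscription-rate plot above
def plot_subscription_rate(df, column, title, xlabel, fig_id):
    pct = pd.crosstab(df['y'], df[column]).apply(lambda x: x / x.sum() * 100)
    rates = pct.transpose()['yes'].sort_values(ascending=True)
    axis = rates.plot(kind='bar', figsize=(5, 5))
    axis.set_title(title)
    axis.set_xlabel(xlabel)
    axis.set_ylabel('Subscription Rate')
    # Center a percentage label on top of each bar
    for rec, label in zip(axis.patches, rates.round(1).astype(str)):
        axis.text(rec.get_x() + rec.get_width() / 2, rec.get_height(),
                  label + '%', ha='center', va='bottom')
    save_fig(fig_id)
    plt.show()

# e.g.: plot_subscription_rate(dataset1, 'month', 'Subscription Rate by Month',
#                              'Month', 'DataVisual-Sr_month')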
In [5]:
count_month_response_pct = pd.crosstab(dataset1['y'],dataset1['month']).apply(lambda x: x/x.sum() * 100)
count_month_response_pct = count_month_response_pct.transpose()
print("+b")
plot_month = count_month_response_pct['yes'].sort_values(ascending = True).plot(kind ='bar',
figsize = (5,5))
plt.title('Subscription Rate by Month')
plt.xlabel('Month')
plt.ylabel('Subscription Rate')
# Label each bar
for rec, label in zip(plot_month.patches,
                      count_month_response_pct['yes'].sort_values(ascending=True).round(1).astype(str)):
    plot_month.text(rec.get_x() + rec.get_width() / 2,  # x: bar center
                    rec.get_height(),                   # y: bar top
                    label + '%', fontsize=8,
                    ha='center',
                    va='bottom')
print("++b")
save_fig('DataVisual-Sr_month')
plt.show()
print(dataset1.shape)
In [6]:
count_job_response_pct = pd.crosstab(dataset1['y'],dataset1['job']).apply(lambda x: x/x.sum() * 100)
count_job_response_pct = count_job_response_pct.transpose()
print("+c")
plot_job = count_job_response_pct['yes'].sort_values(ascending = True).plot(kind ='bar',
figsize = (5,5))
plt.title('Subscription Rate by Job')
plt.ylabel('Subscription Rate')
plt.xlabel('Job Category')
# Label each bar
for rec, label in zip(plot_job.patches,
                      count_job_response_pct['yes'].sort_values(ascending=True).round(1).astype(str)):
    plot_job.text(rec.get_x() + rec.get_width() / 2,  # x: bar center
                  rec.get_height(),                   # y: bar top
                  label + '%', fontsize=8,
                  ha='center',
                  va='bottom')
print("++c")
save_fig('DataVisual-Sr_job')
plt.show()
print(dataset1.shape)
In [7]:
count_education_response_pct = pd.crosstab(dataset1['y'],dataset1['education']).apply(lambda x: x/x.sum() * 100)
count_education_response_pct = count_education_response_pct.transpose()
print("+d")
plot_education = count_education_response_pct['yes'].sort_values(ascending = True).plot(kind ='bar',
figsize = (5,5))
plt.title('Subscription Rate by Education')
plt.ylabel('Subscription Rate')
plt.xlabel('Education Category')
# Label each bar
for rec, label in zip(plot_education.patches,
                      count_education_response_pct['yes'].sort_values(ascending=True).round(1).astype(str)):
    plot_education.text(rec.get_x() + rec.get_width() / 2,  # x: bar center
                        rec.get_height(),                   # y: bar top
                        label + '%',
                        ha='center',
                        va='bottom')
print("++d")
save_fig('DataVisual-Sr_edu')
plt.show()
print(dataset1.shape)
In [8]:
count_marital_response_pct = pd.crosstab(dataset1['y'],dataset1['marital']).apply(lambda x: x/x.sum() * 100)
count_marital_response_pct = count_marital_response_pct.transpose()
print("+")
plot_marital = count_marital_response_pct['yes'].sort_values(ascending = True).plot(kind ='bar',
figsize = (5,5))
plt.title('Subscription Rate by Marital Status')
plt.ylabel('Subscription Rate')
plt.xlabel('Marital Status')
# Label each bar
for rec, label in zip(plot_marital.patches,
                      count_marital_response_pct['yes'].sort_values(ascending=True).round(1).astype(str)):
    plot_marital.text(rec.get_x() + rec.get_width() / 2,  # x: bar center
                      rec.get_height(),                   # y: bar top
                      label + '%',
                      ha='center',
                      va='bottom')
print("++")
save_fig('DataVisual-Sr_marital')
plt.show()
print(dataset1.shape)
In [9]:
count_housing_response_pct = pd.crosstab(dataset1['y'],dataset1['housing']).apply(lambda x: x/x.sum() * 100)
count_housing_response_pct = count_housing_response_pct.transpose()
print("+1")
plot_housing = count_housing_response_pct['yes'].sort_values(ascending = True).plot(kind ='bar',
figsize = (5,5))
plt.title('Subscription Rate by Housing loan')
plt.ylabel('Subscription Rate')
plt.xlabel('Housing loan')
# Label each bar
for rec, label in zip(plot_housing.patches,
                      count_housing_response_pct['yes'].sort_values(ascending=True).round(1).astype(str)):
    plot_housing.text(rec.get_x() + rec.get_width() / 2,  # x: bar center
                      rec.get_height(),                   # y: bar top
                      label + '%',
                      ha='center',
                      va='bottom')
print("++1")
save_fig('DataVisual-Sr_housing')
plt.show()
print(dataset1.shape)
In [10]:
count_loan_response_pct = pd.crosstab(dataset1['y'],dataset1['loan']).apply(lambda x: x/x.sum() * 100)
count_loan_response_pct = count_loan_response_pct.transpose()
print("+2")
plot_loan = count_loan_response_pct['yes'].sort_values(ascending = True).plot(kind ='bar',
figsize = (5,5))
plt.title('Subscription Rate by Loan(personal)')
plt.ylabel('Subscription Rate')
plt.xlabel('Loan Category')
# Label each bar
for rec, label in zip(plot_loan.patches,
                      count_loan_response_pct['yes'].sort_values(ascending=True).round(1).astype(str)):
    plot_loan.text(rec.get_x() + rec.get_width() / 2,  # x: bar center
                   rec.get_height(),                   # y: bar top
                   label + '%',
                   ha='center',
                   va='bottom')
print("++2")
save_fig('DataVisual-Sr_loan')
plt.show()
print(dataset1.shape)
In [11]:
count_contact_response_pct = pd.crosstab(dataset1['y'],dataset1['contact']).apply(lambda x: x/x.sum() * 100)
count_contact_response_pct = count_contact_response_pct.transpose()
print("+3")
plot_contact = count_contact_response_pct['yes'].sort_values(ascending = True).plot(kind ='bar',
figsize = (5,5))
plt.title('Subscription Rate by Contact')
plt.ylabel('Subscription Rate')
plt.xlabel('Contact Category')
# Label each bar
for rec, label in zip(plot_contact.patches,
                      count_contact_response_pct['yes'].sort_values(ascending=True).round(1).astype(str)):
    plot_contact.text(rec.get_x() + rec.get_width() / 2,  # x: bar center
                      rec.get_height(),                   # y: bar top
                      label + '%',
                      ha='center',
                      va='bottom')
print("++3")
save_fig('DataVisual-Sr_contact')
plt.show()
print(dataset1.shape)
In [12]:
count_default_response_pct = pd.crosstab(dataset1['y'],dataset1['default']).apply(lambda x: x/x.sum() * 100)
count_default_response_pct = count_default_response_pct.transpose()
print("+4")
plot_default = count_default_response_pct['yes'].sort_values(ascending = True).plot(kind ='bar',
figsize = (5,5))
plt.title('Subscription Rate by Default')
plt.ylabel('Subscription Rate')
plt.xlabel('Default Category')
# Label each bar
for rec, label in zip(plot_default.patches,
                      count_default_response_pct['yes'].sort_values(ascending=True).round(1).astype(str)):
    plot_default.text(rec.get_x() + rec.get_width() / 2,  # x: bar center
                      rec.get_height(),                   # y: bar top
                      label + '%',
                      ha='center',
                      va='bottom')
print("++4")
save_fig('DataVisual-Sr_default')
plt.show()
print(dataset1.shape)
In [13]:
count_poutcome_response_pct = pd.crosstab(dataset1['y'],dataset1['poutcome']).apply(lambda x: x/x.sum() * 100)
count_poutcome_response_pct = count_poutcome_response_pct.transpose()
print("+5")
plot_poutcome = count_poutcome_response_pct['yes'].sort_values(ascending = True).plot(kind ='bar',
figsize = (5,5))
plt.title('Subscription Rate by Previous outcome')
plt.ylabel('Subscription Rate')
plt.xlabel('Previous outcome Category')
# Label each bar
for rec, label in zip(plot_poutcome.patches,
                      count_poutcome_response_pct['yes'].sort_values(ascending=True).round(1).astype(str)):
    plot_poutcome.text(rec.get_x() + rec.get_width() / 2,  # x: bar center
                       rec.get_height(),                   # y: bar top
                       label + '%',
                       ha='center',
                       va='bottom')
print("++5")
save_fig('DataVisual-Sr_poutcome')
plt.show()
print(dataset1.shape)
In [14]:
#################
print("--2Clean")
# Step 1: Delete the rows which column 'poutcome' contains 'other'
condition0 = dataset1.poutcome == 'other'
#print("--2.1")
#print("--2.1.2 dataset2")
datasetcondition0 = dataset1.drop(dataset1[condition0].index, axis = 0, inplace = False)
#Does not affect the data _?
#Delete the rows with 'unknown' in marital housing and loan
condition0 = datasetcondition0.marital == 'unknown'
dataset02 = datasetcondition0.drop(datasetcondition0[condition0].index, axis = 0, inplace = False)
condition00 = dataset02.housing == 'unknown'
dataset5 = dataset02.drop(dataset02[condition00].index, axis = 0, inplace = False)
#condition000 = dataset002.loan == 'unknown'
#dataset0002 = dataset002.drop(dataset002[condition000].index, axis = 0, inplace = False)
# We found out an "unknown" in edu
#condition0000 = dataset0002.education == 'unknown'
#dataset5 = dataset0002.drop(dataset0002[condition0000].index, axis = 0, inplace = False)
#erase calls with duration 0
#condition000 = dataset002.loan == 'unknown'
#dataset2 = dataset002.drop(dataset002[condition000].index, axis = 0, inplace = False)
#print("--2.1")
#print("--2.1.2 dataset2")
#display(dataset2.head())
# Step 2: Replace 'unknown' in default housing and loan with 'other'
#print("--2.1.3 to replace")
dataset5['loan'] = dataset5['loan'].replace(['unknown'],'no')
#print("--2.1.4 replaced")
print(dataset5.shape)
display(dataset5.head(70))
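The same result can be reached in one pass with a boolean mask (a sketch equivalent to the drops above, assuming dataset1 as loaded earlier):

# Sketch: one-pass equivalent of the row drops above
mask = ((dataset1['poutcome'] != 'other')
        & (dataset1['marital'] != 'unknown')
        & (dataset1['housing'] != 'unknown'))
dataset5_alt = dataset1[mask].copy()
dataset5_alt['loan'] = dataset5_alt['loan'].replace(['unknown'], 'no')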
In [0]:
# Step 2: Replace 'unknown' in default housing and loan with 'other'
#print("--2.1.3 to replace")
#dataset2[['job','education',]] = dataset2[['job','education']].replace(['unknown'],'other')
#print("--2.1.4 replaced")
In [0]:
############################ 2.2 Drop outliers in the column 'balance' (no 'balance' column in this file; kept for reference)
#from scipy.stats import zscore
#dataset2[['balance']].mean()
#dataset2[['balance']].mean()
#dataset2['balance_outliers'] = dataset2['balance']
#dataset2['balance_outliers']= zscore(dataset2['balance_outliers'])
#condition1 = (dataset2['balance_outliers']>3) | (dataset2['balance_outliers']<-3 )
#dataset3 = dataset2.drop(dataset2[condition1].index, axis = 0, inplace = False)
#dataset4 = dataset3.drop('balance_outliers', axis=1)
In [15]:
############################## 2.3 Creating and transforming data
# Step 1: Change column name: 'y' to 'response'
#dataset4.rename(index=str, columns={'y': 'response'}, inplace = True)
#dataset2.rename(index=str, columns={'y': 'response'}, inplace = True)
dataset5.rename(index=str, columns={'y': 'response'}, inplace=True)

def convert(df, new_column, old_column):
    # Map 'no' -> 0 and anything else ('yes') -> 1
    df[new_column] = df[old_column].apply(lambda x: 0 if x == 'no' else 1)
    return df[new_column].value_counts()

# Add a 'response_binary' column (22nd column) so the change is visible in the table
convert(dataset5, "response_binary", "response")
#convert(dataset5, "response", "response")
print("-----2.3.1")
print(dataset5.shape)
display(dataset5.head(150))
In [16]:
# Step 2: Drop column "response_binary" which is useless
#datasetNEW = dataset5.drop('response_binary', axis=1)
#dataset4 = dataset3.drop('balance_outliers', axis=1)
#print("-----2.3.2")
# Step 2: Drop column "contact" which is useless
#dataset5 = dataset4.drop('contact', axis=1)
####dataset5 = dataset2.drop('contact', axis=1)
#print("-----2.3.2")
# Step 3: Change the unit of 'duration' from seconds to minutes
dataset5['duration'] = dataset5['duration'].apply(lambda n:n/60).round(2)
print("-----2.3.3")
print(dataset5.shape)
display(dataset5.head(150))
# NOTE: these cells are not idempotent. Re-running this cell divides 'duration' by 60
# again, and an in-place convert (writing back into 'response') would turn every value
# into 1 on a second run, since the column then holds numbers and x == 'no' never matches.
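A defensive sketch of the in-place variant: convert only while the column still holds strings, so re-running the cell is a no-op instead of silently relabeling everything.

# Sketch (not part of the run): idempotent in-place conversion.
# .map() would also yield NaN on bad input, which is easier to notice than all-1s.
if dataset5['response'].dtype == object:
    dataset5['response'] = dataset5['response'].map({'no': 0, 'yes': 1})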
In [17]:
# Step 4: Encode 'month' as numbers for easier analysis
month_map = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
             'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}
dataset5['month'] = dataset5['month'].replace(month_map)
print("-----2.3.4")
# Step 4.1: Encode 'day_of_week' as numbers for easier analysis
day_map = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5}
dataset5['day_of_week'] = dataset5['day_of_week'].replace(day_map)
print("-----2.3.5")
print(dataset5.shape)
display(dataset5.head())
In [594]:
# Step 4.2: Encode 'education' as ordinal levels
#'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown'
education_map = {'illiterate': 0, 'unknown': 0,
                 'basic.4y': 1, 'basic.6y': 1, 'basic.9y': 1, 'high.school': 1,
                 'professional.course': 2, 'university.degree': 2}
dataset5['education'] = dataset5['education'].replace(education_map)
print("-----2.3.5")
print(dataset5.shape)
display(dataset5.head())
# Encode 'marital' as numbers
marital_map = {'single': 0, 'divorced': 1, 'married': 2}
dataset5['marital'] = dataset5['marital'].replace(marital_map)
print("-----2.3.x")
print(dataset5.shape)
display(dataset5.head(100))
In [0]:
#'''Convert call duration into 5 categories (kept for reference, not used)'''
#def duration(data):
#    data.loc[data['duration'] <= 102, 'duration'] = 1
#    data.loc[(data['duration'] > 102) & (data['duration'] <= 180), 'duration'] = 2
#    data.loc[(data['duration'] > 180) & (data['duration'] <= 319), 'duration'] = 3
#    data.loc[(data['duration'] > 319) & (data['duration'] <= 645), 'duration'] = 4
#    data.loc[data['duration'] > 645, 'duration'] = 5
#    return data
#duration(data);
In [18]:
####################### 2.4 Filtering
# Step 1: Drop rows where 'duration' is under 5 seconds ('duration' is in minutes now, hence 5/60)
condition2 = (dataset5['duration']<5/60)
#dataset6 = dataset5.drop(dataset5[condition2].index, axis = 0, inplace = False)
dataset7 = dataset5.drop(dataset5[condition2].index, axis = 0, inplace = False)
# Step 2: Drop customer values with 'other' education
#condition3 = (dataset6['education'] == 'other')
#dataset7 = dataset6.drop(dataset6[condition3].index, axis = 0, inplace = False)
plt.show()
print(dataset7.shape)
In [19]:
###################### 3. Exploration
### 3.1 Visualize the distributions of 'age' and 'duration'
dist_age_duration = plt.figure(figsize = (7,2.5))
ra1 = dist_age_duration.add_subplot(1,2,1)
#ra2 = dist_age_balance.add_subplot(1,2,2)
ra3 = dist_age_duration.add_subplot(1,2,2)
ra1.hist(dataset7['age'])
ra1.set_title('The Distribution of Age')
#ra2.hist(dataset7['balance'], color = 'skyblue')
#ra2.set_title('The Distribution of Balance')
ra3.hist(dataset7['duration'], color = 'skyblue')
ra3.set_title('The Distribution of Duration')
plt.tight_layout()
save_fig('Distribution-age-duration')
plt.show()
print(dataset7.shape)
In [20]:
############################ 3.2 Visualize the relationship between 'age' and 'duration'
scatter_age_duration = dataset7.plot.scatter('age', 'duration', figsize=(7, 2.5))
plt.title('The Relationship between Age and Duration')
plt.show()
print(dataset7.shape)
In [22]:
############### 3.3 Visualize 'age' vs 'duration' of calls, with response result
import seaborn as sns
dur_cam = sns.lmplot(x='age', y='duration',data = dataset7,
hue = 'response',
fit_reg = False,
scatter_kws={'alpha':0.3}, height =3)
plt.axis([0, 100, 0, 90])  # x: age (years), y: duration (min)
plt.ylabel('Duration of Calls (min)')
plt.xlabel('Age')
plt.title('The Relationship between the Age and Duration of calls')
# Annotation
#plt.axhline(y=5, linewidth=2, color="k", linestyle='--')
#plt.annotate('Higher subscription rate when calls <5',xytext = (35,13),
# arrowprops=dict(color = 'k', width=1),xy=(30,6))
plt.show()
print(dataset7.shape)
In [23]:
############### 3.4 Visualize the relationship between 'duration' and 'campaign', with response result
import seaborn as sns
dur_cam = sns.lmplot(x='duration', y='campaign',data = dataset7,
hue = 'response',
fit_reg = False,
scatter_kws={'alpha':0.6}, height =4)
plt.axis([0,90,0,90])
plt.ylabel('Number of Calls')#campaign
plt.xlabel('Duration of Calls (Minutes)')
plt.title('The Relationship between the Duration and Number of Calls (with Response Result)')
# Annotation
plt.axhline(y=5, linewidth=2, color="k", linestyle='--')
plt.annotate('Higher subscription rate when calls <5',xytext = (35,13),
arrowprops=dict(color = 'k', width=1),xy=(30,6))
save_fig('DataVisual-Durat_campaign')
plt.show()
print(dataset7.shape)
In [24]:
# Scatter matrix
from pandas.plotting import scatter_matrix
matrix = scatter_matrix(dataset7[['age','duration','education','campaign']], figsize=(8,6))
plt.suptitle('The Scatter Matrix of Age, Duration, Education and Campaign')
plt.show()
print(dataset7.shape)
In [25]:
print(dataset7.shape)
display(dataset7.head())
In [0]:
The main objective of this project is to identify the most responsive customers before the marketing campaign so that the bank will be able to efficiently reach out to them, saving time and marketing resources. To achieve this objective, classification algorithms will be employed. By analyzing customer statistics, a classification model will be built to classify all clients into two groups: "yes" to term deposits and "no" to term deposits.
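A minimal sketch of that setup (one-hot encode the predictors, hold out a test set, fit a baseline model; the cells below do the same with a battery of classifiers — variable names here are illustrative):

# Sketch: baseline classification pipeline for the objective stated above
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_base = pd.get_dummies(dataset7.drop(columns=['response', 'response_binary']))
y_base = dataset7['response_binary']
X_tr, X_te, y_tr, y_te = train_test_split(X_base, y_base, test_size=0.3, random_state=7)
baseline = LogisticRegression(solver='lbfgs', max_iter=5000).fit(X_tr, y_tr)
print("baseline accuracy: %.3f" % accuracy_score(y_te, baseline.predict(X_te)))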
In [26]:
print(dataset7.shape)
display(dataset7.head(3))
In [27]:
#dataset7 = pd.get_dummies(dataset7, columns = ['job'])
#dataset7 = pd.get_dummies(dataset7, columns = ['education'])
dataset7['housing'] = dataset7['housing'].map({'yes': 1, 'no': 0})
# Note: 'default' also has a handful of 'yes' rows in this file; .map() leaves those as NaN
dataset7['default'] = dataset7['default'].map({'no': 1, 'unknown': 0})
dataset7['loan'] = dataset7['loan'].map({'yes': 1, 'no': 0})
#dataset7_response = pd.DataFrame(dataset['response_binary'])
#dataset7 = pd.merge(dataset7, dataset_response, left_index = True, right_index = True)
print(dataset7.shape)
display(dataset7.head(10))
In [28]:
# To create a column for each variable (one-hot encoding)
#dataset7 = pd.get_dummies(dataset7, columns = ['job'])
#dataset7 = pd.get_dummies(dataset7, columns = ['education'])
dataset7['contact'] = dataset7['contact'].map({'telephone': 1, 'cellular': 0})
#dataset7['default'] = dataset7['default'].map({'no': 1, 'unknown': 0})
#dataset7['loan'] = dataset7['loan'].map({'yes': 1, 'no': 0})
#dataset7_response = pd.DataFrame(dataset['response_binary'])
#dataset7 = pd.merge(dataset7, dataset_response, left_index = True, right_index = True)
display(dataset7.head(10))
print(dataset7.shape)
In [29]:
display(dataset7.head(1))
corr_data = dataset7[['age','job','marital','education','default','housing','loan','month','day_of_week','duration','campaign','pdays','previous','poutcome','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed','response_binary']]
corr = corr_data.corr()
#housing loan contact month day_of_week duration campaign pdays previous
#poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
cor_plot = sns.heatmap(corr,annot=True,cmap='BuPu',linewidths=0.2,annot_kws={'size':10})
fig=plt.gcf()
fig.set_size_inches(15,10)
plt.xticks(fontsize=10,rotation=-60)
plt.yticks(fontsize=10)
plt.title('Correlation Matrix')
save_fig('Datav-CORREMat')
plt.show()
print(dataset7.shape)
In [0]:
The data set is highly imbalanced: the number of samples in the 'no' category is far higher than in the 'yes' category.
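Two standard mitigations, sketched here (not applied in the original run; assumes the X and y built in the cells below): stratify the split so both halves keep the class ratio, and re-weight the classes in the loss.

# Sketch: stratified split plus class re-weighting for the imbalance noted above
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_s_tr, X_s_te, y_s_tr, y_s_te = train_test_split(X, y, test_size=0.3,
                                                  stratify=y, random_state=7)
weighted_lr = LogisticRegression(solver='lbfgs', max_iter=5000,
                                 class_weight='balanced').fit(X_s_tr, y_s_tr)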
In [0]:
print("____sataset7 shape") print(dataset7.shape) predictors = dataset7.iloc[:,0:20]#21 response #22 response binary
Y = dataset7.iloc[:,21] #reponse binary X = pd.get_dummies(predictors) print("predictors shape") print(predictors.shape) print("y shape") print(y.shape) print("table predictors") display(predictors.head(300)) print("y show") display(y.head(3))
In [30]:
#linkname = '../input/bank_cleaned.csv'
#dataset = read_csv(linkname)
#dataset = dataset.drop(['Unnamed: 0'], axis=1)
print("--1--data")
linkname = 'bank-additional-full.csv'
print("--1.2-read file")
datasetO = pd.read_csv(linkname, sep = ';')
print("--1.2.2--readed")
print(datasetO.shape)
display(datasetO.head(1))
################
# Step 1: Drop rows whose 'poutcome' column contains 'other'
conditionO = datasetO.poutcome == 'other'
datasetconditionO0 = datasetO.drop(datasetO[conditionO].index, axis = 0, inplace = False)
conditionO0 = datasetconditionO0.marital == 'unknown'
datasetO02 = datasetconditionO0.drop(datasetconditionO0[conditionO0].index, axis = 0, inplace = False)
conditionO00 = datasetO02.housing == 'unknown'
datasetO002 = datasetO02.drop(datasetO02[conditionO00].index, axis = 0, inplace = False)
conditionO000 = datasetO002.loan == 'unknown'
datasetO0002 = datasetO002.drop(datasetO002[conditionO000].index, axis = 0, inplace = False)
# We found out an "unknown" in edu
conditionO0000 = datasetO0002.education == 'unknown'
datasetO5 = datasetO0002.drop(datasetO0002[conditionO0000].index, axis = 0, inplace = False)
#erase calls with duration 0
#condition000 = dataset002.loan == 'unknown'
#dataset2 = dataset002.drop(dataset002[condition000].index, axis = 0, inplace = False)
print(datasetO5.shape)
display(datasetO5.head(1))
#datasetO5.rename(index=str, columns={'y': 'response'}, inplace = True)
#def convert(datasetO5, new_column, old_column):
# datasetO5[new_column] = datasetO5[old_column].apply(lambda x: 0 if x == 'no' else 1)
# return datasetO5[new_column].value_counts()
#To appreciate the change when displaying the table 22 col
#convert(datasetO5, "response_binary", "response")
#convert(dataset5, "response", "response")
#print("-----2.3.1")
#print(datasetO5.shape)
#display(datasetO5.head(1))
In [608]:
datasetO5.info()
In [31]:
predictors = datasetO5.iloc[:,0:20]
predictors = predictors.drop(['pdays'],axis=1)
y = datasetO5.iloc[:,20]
X = pd.get_dummies(predictors)
print(datasetO5.shape)
display(datasetO5.head())
print(predictors.shape)
display(predictors.head())
display(y.head())
display(X.head())
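One caveat with pd.get_dummies: the resulting columns depend on which categories happen to be present, so any new batch of data has to be re-aligned to the training-time columns before prediction. A sketch (new_batch is a hypothetical stand-in):

# Sketch: align the dummies of unseen data to the columns of X above
new_batch = predictors.sample(5, random_state=0)  # stand-in for new data
X_new = pd.get_dummies(new_batch).reindex(columns=X.columns, fill_value=0)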
In [610]:
from pylab import rcParams
import matplotlib.ticker as mtick # For specifying the axes tick format
df = datasetO5
rcParams['figure.figsize']=10,6
ax = (df['loan'].value_counts() * 100.0 / len(df)).plot(kind='bar', stacked=True, rot=0)
ax.yaxis.set_major_formatter(mtick.PercentFormatter())
ax.set_xlabel('Personal loan')
ax.set_ylabel('% Customers')
ax.set_title('Personal loan distribution')
# Label each bar; the bar heights are already percentages of all customers
for i in ax.patches:
    # get_x() shifts the label into the bar; get_height() is the bar's percentage
    ax.text(i.get_x() + .15, i.get_height() - 3.5,
            str(round(i.get_height(), 1)) + '%', color='white', weight='bold')
In [32]:
pd.Series(y).value_counts()
Out[32]:
In [33]:
#from sklearn.model_selection import train_test_split
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
#print("libraries")
In [34]:
# 20% of the data will be used for testing
#test_size= 0.20
#seed = 7
#X_train, X_test, Y_train, Y_test= train_test_split(X, y, test_size=test_size, random_state=seed)
#print("____data splited")
In [39]:
# Time for Classification Models
import time
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
dict_classifiers = {
"Logistic Regression": LogisticRegression(solver='lbfgs', max_iter=5000),
"Nearest Neighbors": KNeighborsClassifier(),
"Linear SVM": SVC(gamma = 'auto'),
"Gradient Boosting Classifier": GradientBoostingClassifier(),
"Decision Tree": tree.DecisionTreeClassifier(),
"Random Forest": RandomForestClassifier(n_estimators=18),
"Neural Net": MLPClassifier(alpha=1),
"Naive Bayes": GaussianNB()
}
In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
In [41]:
no_classifiers = len(dict_classifiers.keys())

def batch_classify(X_train, Y_train, verbose=True):
    df_results = pd.DataFrame(data=np.zeros(shape=(no_classifiers, 3)),
                              columns=['classifier', 'train_score', 'training_time'])
    count = 0
    for key, classifier in dict_classifiers.items():
        t_start = time.process_time()
        classifier.fit(X_train, Y_train)
        t_end = time.process_time()
        t_diff = t_end - t_start
        train_score = classifier.score(X_train, Y_train)
        df_results.loc[count, 'classifier'] = key
        df_results.loc[count, 'train_score'] = train_score
        df_results.loc[count, 'training_time'] = t_diff
        if verbose:
            print("trained {c} in {f:.2f} s".format(c=key, f=t_diff))
        count += 1
    return df_results
In [617]:
df_results = batch_classify(X_train, y_train)
print(df_results.sort_values(by='train_score', ascending=False))
In [42]:
# Use Cross-validation.
from sklearn.model_selection import cross_val_score
# Logistic Regression
log_reg = LogisticRegression(solver='lbfgs', max_iter=5000)
log_scores = cross_val_score(log_reg, X_train, y_train, cv=3)
log_reg_mean = log_scores.mean()
# SVC
svc_clf = SVC(gamma='auto')
svc_scores = cross_val_score(svc_clf, X_train, y_train, cv=3)
svc_mean = svc_scores.mean()
# KNearestNeighbors
knn_clf = KNeighborsClassifier()
knn_scores = cross_val_score(knn_clf, X_train, y_train, cv=3)
knn_mean = knn_scores.mean()
# Decision Tree
tree_clf = tree.DecisionTreeClassifier()
tree_scores = cross_val_score(tree_clf, X_train, y_train, cv=3)
tree_mean = tree_scores.mean()
# Gradient Boosting Classifier
grad_clf = GradientBoostingClassifier()
grad_scores = cross_val_score(grad_clf, X_train, y_train, cv=3)
grad_mean = grad_scores.mean()
# Random Forest Classifier
rand_clf = RandomForestClassifier(n_estimators=18)
rand_scores = cross_val_score(rand_clf, X_train, y_train, cv=3)
rand_mean = rand_scores.mean()
# NeuralNet Classifier
neural_clf = MLPClassifier(alpha=1)
neural_scores = cross_val_score(neural_clf, X_train, y_train, cv=3)
neural_mean = neural_scores.mean()
# Naive Bayes
nav_clf = GaussianNB()
nav_scores = cross_val_score(nav_clf, X_train, y_train, cv=3)
nav_mean = nav_scores.mean()
# Create a Dataframe with the results.
d = {'Classifiers': ['Logistic Reg.', 'SVC', 'KNN', 'Dec Tree', 'Grad B CLF', 'Rand FC', 'Neural Classifier', 'Naive Bayes'],
'Crossval Mean Scores': [log_reg_mean, svc_mean, knn_mean, tree_mean, grad_mean, rand_mean, neural_mean, nav_mean]}
result_df = pd.DataFrame(data=d)
In [59]:
result_df = result_df.sort_values(by=['Crossval Mean Scores'], ascending=False)
result_df
Out[59]:
In [43]:
# Use Cross-validation.
from sklearn.model_selection import cross_val_score
# Gradient Boosting Classifier
grad_clf = GradientBoostingClassifier()
grad_scores = cross_val_score(grad_clf, X_train, y_train, cv=3)
grad_mean = grad_scores.mean()
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(grad_clf, X_train, y_train, cv=5)
from sklearn.metrics import accuracy_score
grad_clf.fit(X_train, y_train)
print("Gradient Boosting Classifier cross-validated accuracy is %2.2f" % accuracy_score(y_train, y_train_pred))
predicted_probas = grad_clf.predict_proba(X_test)
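Because the classes are imbalanced, accuracy alone is a weak summary; a threshold-free score such as ROC AUC can be computed from the probabilities above (a sketch, not in the original run):

# Sketch: ROC AUC from the positive-class ('yes') probabilities
from sklearn.metrics import roc_auc_score
print("ROC AUC: %.3f" % roc_auc_score(y_test, predicted_probas[:, 1]))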
In [621]:
from yellowbrick.classifier import ClassificationReport
# Specify the target classes
classes = ["yes", "no"]
visualizer = ClassificationReport(grad_clf, classes=classes, support=True, force_model=True)
visualizer.fit(X_train, y_train) # Fit the visualizer and the model
visualizer.score(X_test, y_test) # Evaluate the model on the test data
visualizer.poof() # Draw/show/poof the data
In [54]:
from sklearn.metrics import classification_report
y_pred = grad_clf.predict(X_test)
print(classification_report(y_test,y_pred,digits=2))
In [55]:
accuracy_score(y_test,y_pred)
Out[55]:
In [56]:
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_train, y_train_pred)
f, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", linewidths=.5, ax=ax)
plt.title("Confusion Matrix", fontsize=12)
plt.subplots_adjust(left=0.15, right=0.99, bottom=0.15, top=0.99)
ax.set_yticks(np.arange(conf_matrix.shape[0]) + 0.5, minor=False)
ax.set_xticklabels("")
ax.set_yticklabels(['Refused T. Deposits', 'Accepted T. Deposits'], fontsize=12, rotation=0)
plt.show()
In [57]:
import sklearn.metrics
import pandas as pd

def calc_cumulative_gains(df: pd.DataFrame, actual_col: str, predicted_col: str, probability_col: str):
    # Sort by predicted probability, then score each tenth of the predicted positives
    df.sort_values(by=probability_col, ascending=False, inplace=True)
    subset = df[df[predicted_col] == True]
    rows = []
    for group in np.array_split(subset, 10):
        score = sklearn.metrics.accuracy_score(group[actual_col].tolist(),
                                               group[predicted_col].tolist(),
                                               normalize=False)
        rows.append({'NumCases': len(group), 'NumCorrectPredictions': score})
    lift = pd.DataFrame(rows)
    return lift

print("done")
In [58]:
from scikitplot.metrics import plot_lift_curve
import scikitplot as skplt
skplt.metrics.plot_cumulative_gain(y_test, predicted_probas)
plt.show()
In [0]:
In [553]:
y_scores = cross_val_predict(grad_clf, X_train, y_train, cv=5, method="decision_function")
# hack to work around issue #9589 introduced in Scikit-Learn 0.19.0
if y_scores.ndim == 2:
y_scores = y_scores[:, 1]
y_scores.shape
Out[553]: