In [ ]:
"""
Inspired from the "Introduction to Statistical Learning" of James et al.
where we see a scatter plot of two variables and the corresponding boxplots.
It can be particularly useful for binary classification, where the binary
target variable has to be passed as well.
"""
In [1]:
# necessary imports
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../lib_plot")
#import db_to_pd as ddd
import scatter_boxplot as sbp
%matplotlib inline
In [2]:
# Download the credit-card default data.
# The dataset is provided by James et al., "Introduction to Statistical Learning".
# A sample was found online via http://www.datarobot.com/blog/classification-with-scikit-learn/
# or maybe here:
# http://www-bcf.usc.edu/~gareth/ISL/Credit.csv
df = pd.read_csv('https://d1pqsl2386xqi9.cloudfront.net/notebooks/Default.csv', index_col=0)
# Build the boolean target: True where 'default' equals 'Yes', False otherwise.
# A vectorized comparison gives the same column as a per-row lambda, without
# calling a Python function for every row.
df['target'] = df['default'] == 'Yes'
# Show the first rows as a quick check of the loaded frame and the new column.
df.head()
Out[2]:
In [3]:
# Draw 'balance' against 'income' as a scatter plot together with the
# corresponding boxplots, passing the boolean 'target' column as well.
# filename=None and the empty title/labels are passed through unchanged.
sbp.plot_2d_data(
    df,
    'balance',
    'income',
    'target',
    filename=None,
    title='',
    xlabel='',
    ylabel='',
)
In [ ]: