In [ ]:
"""
Inspired by "An Introduction to Statistical Learning" by James et al.,
where we see a scatter plot of two variables and the corresponding boxplots.

It can be particularly useful for a binary classification case, where we also
have to pass the binary target variable.
"""

In [1]:
# necessary imports
import pandas as pd
import numpy as np
%load_ext autoreload
%autoreload 2
import sys
sys.path.append("../lib_plot")
#import db_to_pd as ddd
import scatter_boxplot as sbp

%matplotlib inline


/Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages/pytz/__init__.py:29: UserWarning: Module argparse was already imported from /Users/charilaostsarouchas/anaconda/lib/python2.7/argparse.pyc, but /Users/charilaostsarouchas/anaconda/lib/python2.7/site-packages is being added to sys.path
  from pkg_resources import resource_stream

In [2]:
# Load the credit-card default data set provided with
# James et al., "An Introduction to Statistical Learning".
#
# Sample found online via this blog post:
#   http://www.datarobot.com/blog/classification-with-scikit-learn/
# NOTE(review): the original notes also list
#   http://www-bcf.usc.edu/~gareth/ISL/Credit.csv
# as a possible source, but that URL names Credit.csv rather than Default.csv --
# confirm it is the same data before relying on it.

# The first CSV column is used as the row index.
df = pd.read_csv('https://d1pqsl2386xqi9.cloudfront.net/notebooks/Default.csv', index_col=0)

# Derive a boolean target from the 'default' column: True where the value is 'Yes'.
# A vectorized comparison gives the same result as a per-row lambda, without the
# Python-level loop.
df['target'] = df['default'] == 'Yes'
df.head()


Out[2]:
default student balance income target
1 No No 729.526495 44361.625074 False
2 No Yes 817.180407 12106.134700 False
3 No No 1073.549164 31767.138947 False
4 No No 529.250605 35704.493935 False
5 No No 785.655883 38463.495879 False

In [3]:
# Show the scatter plot together with the related boxplots for the
# 'balance' and 'income' columns, passing 'target' as the binary variable.
# No output file is requested, and title/axis labels are passed as empty strings.
sbp.plot_2d_data(
    df,
    'balance',
    'income',
    'target',
    filename=None,
    title='',
    xlabel='',
    ylabel='',
)



In [ ]: