In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn import *

plt.style.use("seaborn")


%matplotlib inline

In [2]:
plt.rcParams["figure.figsize"] = (12,5)

In [3]:
# Insurance dataset (age, gender, bmi, children, smoker, region, charges).
# TODO(review): hardcoded absolute path — consider a configurable DATA_DIR.
DATA_PATH = "/data/insurance.csv"
df = pd.read_csv(DATA_PATH)
df.head()


Out[3]:
age gender bmi children smoker region charges
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male 33.770 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520

In [4]:
target = "charges"

In [5]:
y = df[target]

In [6]:
from scipy.stats import norm

In [7]:
def gaussian(x):
    """Fit a normal distribution to `x` and return points for plotting.

    Parameters
    ----------
    x : array-like of float
        Sample values (any order; sorted internally).

    Returns
    -------
    (ndarray, ndarray)
        The sorted sample values and the fitted normal pdf evaluated at
        each of them — ready to be passed straight to ``plt.plot``.

    Notes
    -----
    Uses the population standard deviation (``np.std`` default, ddof=0),
    matching the original implementation. The unused ``n = len(x)``
    local from the original was removed.
    """
    x = np.sort(x)
    x_mean, x_std = np.mean(x), np.std(x)
    rv = norm(loc=x_mean, scale=x_std)
    return x, rv.pdf(x)

In [8]:
# Top panel: empirical distribution of charges with a fitted normal overlay.
ax_hist = plt.subplot(211)
y.plot.hist(bins=50, density=True, ax=ax_hist)
ax_hist.set_title("Histogram of charges")

grid, density = gaussian(y)
ax_hist.plot(grid, density)

# Bottom panel: horizontal box plot to highlight the heavy right tail.
ax_box = plt.subplot(212)
y.plot.box(vert=False, ax=ax_box)


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x109b89cf8>

In [9]:
y = np.log(df[target])

In [10]:
plt.subplot(211)
y.plot.hist(bins=50, density=True)
# Fixed title: after the np.log transform above this is log(charges),
# not raw charges — the previous title was misleading.
plt.title("Histogram of log(charges)")

x_vals, y_vals = gaussian(y)
plt.plot(x_vals, y_vals)


plt.subplot(212)
y.plot.box(vert=False)


Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a1d92bf98>

In [11]:
X = df.drop(columns=[target])

In [12]:
# Split features by dtype; pandas stores strings as "object" columns.
# select_dtypes preserves column order, matching the original dict-based scan.
cat_columns = X.select_dtypes(include="object").columns.tolist()
num_columns = X.select_dtypes(exclude="object").columns.tolist()
print("categorical columns:", cat_columns)
print("continuous columns:", num_columns)


categorical columns: ['gender', 'smoker', 'region']
continuous columns: ['age', 'bmi', 'children']

In [13]:
X[num_columns].dtypes


Out[13]:
age           int64
bmi         float64
children      int64
dtype: object

In [14]:
# Cast every continuous column to float64 so the scaler sees uniform dtypes.
X = X.astype(dict.fromkeys(num_columns, "float64"))
X[num_columns].dtypes


Out[14]:
age         float64
bmi         float64
children    float64
dtype: object

In [15]:
# One-hot encode the categoricals; categories unseen during fit are
# ignored at transform time rather than raising.
cat_pipe = pipeline.make_pipeline(
    preprocessing.OneHotEncoder(handle_unknown="ignore")
)

# Standardize the continuous features to zero mean / unit variance.
num_pipe = pipeline.make_pipeline(preprocessing.StandardScaler())

# Per-column preprocessing. "age" and "bmi" are both scaled (via "num")
# and additionally quantile-bucketized into 5 one-hot bins, so the model
# gets linear and binned versions of those two features.
column_transformers = compose.ColumnTransformer([
    ("cat", cat_pipe, cat_columns),
    ("num", num_pipe, num_columns),
    ("bucketizers", preprocessing.KBinsDiscretizer(n_bins=5), ["age", "bmi"]),
])

column_transformers


Out[15]:
ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('cat', Pipeline(memory=None,
     steps=[('onehotencoder', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='ignore',
       n_values=None, sparse=True))]), ['gender', 'smoker', 'region']), ('num', Pipeline(memory=None,
   ..., ('bucketizers', KBinsDiscretizer(encode='onehot', n_bins=5, strategy='quantile'), ['age', 'bmi'])])

In [16]:
# Hold out 30% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [17]:
# Full model: column preprocessing followed by ordinary least squares
# on the log-transformed charges.
steps = [
    ("column_transformers", column_transformers),
    ("lr", linear_model.LinearRegression()),
]
pipe = pipeline.Pipeline(steps)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)  # R^2 on the held-out split


Out[17]:
0.7883913140398686

In [ ]: