In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder

In [19]:
df = pd.read_csv("./bank-additional-full.csv", delimiter = ";")

features = ["age",
            "job",
            "marital",
            "education",
            "default",
            "housing",
            "loan",
            "contact",
            "month",
            "day_of_week",
            "campaign",
            "pdays",
            "previous",
            "poutcome",
            "emp.var.rate",
            "cons.price.idx",
            "cons.conf.idx",
            "euribor3m",
            "nr.employed"]


df[features].head()


Out[19]:
age job marital education default housing loan contact month day_of_week campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
0 56 housemaid married basic.4y no no no telephone may mon 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0
1 57 services married high.school unknown no no telephone may mon 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0
2 37 services married high.school no yes no telephone may mon 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0
3 40 admin. married basic.6y no no no telephone may mon 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0
4 56 services married high.school no no yes telephone may mon 1 999 0 nonexistent 1.1 93.994 -36.4 4.857 5191.0

In [18]:



Out[18]:
age job marital education default housing loan contact month day_of_week ... campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
0 56 3 1 0 0 0 0 1 6 1 ... 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 no
1 57 7 1 3 1 0 0 1 6 1 ... 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 no
2 37 7 1 3 0 2 0 1 6 1 ... 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 no
3 40 0 1 1 0 0 0 1 6 1 ... 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 no
4 56 7 1 3 0 0 2 1 6 1 ... 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 no

5 rows × 21 columns


In [14]:
outcomes=  ["y"]

df[outcomes].head()


Out[14]:
y
0 no
1 no
2 no
3 no
4 no

In [15]:
# convert a string variable to a categorical one
#types = list(set(df["Type"]))
#to_categorical = {types[i]:i for i in range(len(types))}
#df["Type"] = df["Type"].apply(lambda x: to_categorical[x])

df[["job"]] = df[["job"]].apply(LabelEncoder().fit_transform)
df[["marital"]] = df[["marital"]].apply(LabelEncoder().fit_transform)
df[["education"]] = df[["education"]].apply(LabelEncoder().fit_transform)
df[["default"]] = df[["default"]].apply(LabelEncoder().fit_transform)
df[["housing"]] = df[["housing"]].apply(LabelEncoder().fit_transform)
df[["loan"]] = df[["loan"]].apply(LabelEncoder().fit_transform)
df[["contact"]] = df[["contact"]].apply(LabelEncoder().fit_transform)
df[["month"]] = df[["month"]].apply(LabelEncoder().fit_transform)
df[["day_of_week"]] = df[["day_of_week"]].apply(LabelEncoder().fit_transform)
df[["poutcome"]] = df[["poutcome"]].apply(LabelEncoder().fit_transform)

df.head()


Out[15]:
age job marital education default housing loan contact month day_of_week ... campaign pdays previous poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
0 56 3 1 0 0 0 0 1 6 1 ... 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 no
1 57 7 1 3 1 0 0 1 6 1 ... 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 no
2 37 7 1 3 0 2 0 1 6 1 ... 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 no
3 40 0 1 1 0 0 0 1 6 1 ... 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 no
4 56 7 1 3 0 0 2 1 6 1 ... 1 999 0 1 1.1 93.994 -36.4 4.857 5191.0 no

5 rows × 21 columns


In [20]:
df_dummies = pd.get_dummies(df)
df_dummies.head()


Out[20]:
age duration campaign pdays previous emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed ... day_of_week_fri day_of_week_mon day_of_week_thu day_of_week_tue day_of_week_wed poutcome_failure poutcome_nonexistent poutcome_success y_no y_yes
0 56 261 1 999 0 1.1 93.994 -36.4 4.857 5191.0 ... 0 1 0 0 0 0 1 0 1 0
1 57 149 1 999 0 1.1 93.994 -36.4 4.857 5191.0 ... 0 1 0 0 0 0 1 0 1 0
2 37 226 1 999 0 1.1 93.994 -36.4 4.857 5191.0 ... 0 1 0 0 0 0 1 0 1 0
3 40 151 1 999 0 1.1 93.994 -36.4 4.857 5191.0 ... 0 1 0 0 0 0 1 0 1 0
4 56 307 1 999 0 1.1 93.994 -36.4 4.857 5191.0 ... 0 1 0 0 0 0 1 0 1 0

5 rows × 65 columns


In [31]:
plt.hist(df_dummies["y_yes"].values)
plt.title("Yes Histogram")
plt.xlabel("Value")
plt.ylabel("Occurance")
plt.plot()


Out[31]:
[]

In [36]:
X_df = df[features].copy()
y_df = df[outcomes].copy() 
X_df.head()

X = X_df.values
y = y_df.values.T[0]
y = (y-y.min())/(y.max() - y.min())
clf.fit(X,y)
print mse(y,clf.predict(X))


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-36-2f135b34c969> in <module>()
      7 X = X_df.values
      8 y = y_df.values.T[0]
----> 9 y = (y-y.min())/(y.max() - y.min())
     10 clf.fit(X,y)
     11 print mse(y,clf.predict(X))

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [ ]: