In [1]:
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder
In [19]:
# Read the semicolon-separated CSV that sits next to the notebook.
df = pd.read_csv("./bank-additional-full.csv", delimiter=";")

# Predictor columns, listed in the order they are handed to the model.
features = [
    "age", "job", "marital", "education",
    "default", "housing", "loan",
    "contact", "month", "day_of_week",
    "campaign", "pdays", "previous", "poutcome",
    "emp.var.rate", "cons.price.idx", "cons.conf.idx",
    "euribor3m", "nr.employed",
]

# Preview only the selected predictors.
df[features].head()
Out[19]:
In [18]:
Out[18]:
In [14]:
# Target column name(s); kept as a list so that selecting with it yields a DataFrame.
outcomes = ["y"]

# Preview the target column.
df.loc[:, outcomes].head()
Out[14]:
In [15]:
# Replace every string-valued predictor with integer codes.
# The target column "y" is deliberately not in this list and is left as-is.
categorical_features = [
    "job", "marital", "education", "default", "housing",
    "loan", "contact", "month", "day_of_week", "poutcome",
]

# DataFrame.apply passes one column at a time to fit_transform, and
# fit_transform re-fits on each call, so every column gets its own mapping.
df[categorical_features] = df[categorical_features].apply(LabelEncoder().fit_transform)

df.head()
Out[15]:
In [20]:
# One-hot expand the columns of df that pandas still treats as categorical/string;
# numeric columns pass through unchanged. The result is a new frame, df is not modified.
df_dummies = pd.get_dummies(data=df)

df_dummies.head()
Out[20]:
In [31]:
# Histogram of the one-hot "y_yes" indicator column.
# Depending on the pandas version, get_dummies may produce boolean columns;
# cast to int so the histogram always bins plain 0/1 values.
plt.hist(df_dummies["y_yes"].astype(int).values)
plt.title("Yes Histogram")
plt.xlabel("Value")
plt.ylabel("Occurrence")
# Render the figure. The previous bare plt.plot() drew nothing and only
# left an empty list as the cell's output.
plt.show()
Out[31]:
In [36]:
# Assemble the design matrix and the target from the prepared frame.
X_df = df[features].copy()
y_df = df[outcomes].copy()

# Fail early with a readable message if the predictors still contain
# non-numeric columns (e.g. df was reloaded after the label-encoding cell ran).
non_numeric = list(X_df.select_dtypes(exclude=["number"]).columns)
assert not non_numeric, "label-encode these feature columns first: %s" % non_numeric

X = X_df.values

# "y" is not among the label-encoded columns, so it is presumably still
# string-valued here; arithmetic such as y - y.min() would fail on it.
# Encode the first outcome column to integer codes and cast to float.
# For a two-class target this already spans [0, 1], so no further
# min-max rescaling is applied.
y = LabelEncoder().fit_transform(y_df[outcomes[0]]).astype(float)

# NOTE(review): the original cell called clf.fit without clf being defined
# anywhere in the notebook. A decision-tree regressor (already imported) is
# used as a stand-in -- confirm the intended estimator and hyperparameters.
clf = DecisionTreeRegressor(random_state=0)
clf.fit(X, y)

# Mean squared error on the same data the model was fitted on (training error,
# not a held-out estimate). The parenthesised print works on Python 2 and 3.
print(mse(y, clf.predict(X)))
In [ ]: