notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyRegressor
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder



In [19]:

    
df = pd.read_csv("./bank-additional-full.csv", delimiter = ";")

features = ["age",
            "job",
            "marital",
            "education",
            "default",
            "housing",
            "loan",
            "contact",
            "month",
            "day_of_week",
            "campaign",
            "pdays",
            "previous",
            "poutcome",
            "emp.var.rate",
            "cons.price.idx",
            "cons.conf.idx",
            "euribor3m",
            "nr.employed"]


df[features].head()









    Out[19]:






  
    
      
      age
      job
      marital
      education
      default
      housing
      loan
      contact
      month
      day_of_week
      campaign
      pdays
      previous
      poutcome
      emp.var.rate
      cons.price.idx
      cons.conf.idx
      euribor3m
      nr.employed
    
  
  
    
      0
      56
      housemaid
      married
      basic.4y
      no
      no
      no
      telephone
      may
      mon
      1
      999
      0
      nonexistent
      1.1
      93.994
      -36.4
      4.857
      5191.0
    
    
      1
      57
      services
      married
      high.school
      unknown
      no
      no
      telephone
      may
      mon
      1
      999
      0
      nonexistent
      1.1
      93.994
      -36.4
      4.857
      5191.0
    
    
      2
      37
      services
      married
      high.school
      no
      yes
      no
      telephone
      may
      mon
      1
      999
      0
      nonexistent
      1.1
      93.994
      -36.4
      4.857
      5191.0
    
    
      3
      40
      admin.
      married
      basic.6y
      no
      no
      no
      telephone
      may
      mon
      1
      999
      0
      nonexistent
      1.1
      93.994
      -36.4
      4.857
      5191.0
    
    
      4
      56
      services
      married
      high.school
      no
      no
      yes
      telephone
      may
      mon
      1
      999
      0
      nonexistent
      1.1
      93.994
      -36.4
      4.857
      5191.0



In [18]:









    Out[18]:






  
    
      
      age
      job
      marital
      education
      default
      housing
      loan
      contact
      month
      day_of_week
      ...
      campaign
      pdays
      previous
      poutcome
      emp.var.rate
      cons.price.idx
      cons.conf.idx
      euribor3m
      nr.employed
      y
    
  
  
    
      0
      56
      3
      1
      0
      0
      0
      0
      1
      6
      1
      ...
      1
      999
      0
      1
      1.1
      93.994
      -36.4
      4.857
      5191.0
      no
    
    
      1
      57
      7
      1
      3
      1
      0
      0
      1
      6
      1
      ...
      1
      999
      0
      1
      1.1
      93.994
      -36.4
      4.857
      5191.0
      no
    
    
      2
      37
      7
      1
      3
      0
      2
      0
      1
      6
      1
      ...
      1
      999
      0
      1
      1.1
      93.994
      -36.4
      4.857
      5191.0
      no
    
    
      3
      40
      0
      1
      1
      0
      0
      0
      1
      6
      1
      ...
      1
      999
      0
      1
      1.1
      93.994
      -36.4
      4.857
      5191.0
      no
    
    
      4
      56
      7
      1
      3
      0
      0
      2
      1
      6
      1
      ...
      1
      999
      0
      1
      1.1
      93.994
      -36.4
      4.857
      5191.0
      no
    
  

5 rows × 21 columns



In [14]:

    
outcomes=  ["y"]

df[outcomes].head()



In [15]:

    
# convert a string variable to a categorical one
#types = list(set(df["Type"]))
#to_categorical = {types[i]:i for i in range(len(types))}
#df["Type"] = df["Type"].apply(lambda x: to_categorical[x])

df[["job"]] = df[["job"]].apply(LabelEncoder().fit_transform)
df[["marital"]] = df[["marital"]].apply(LabelEncoder().fit_transform)
df[["education"]] = df[["education"]].apply(LabelEncoder().fit_transform)
df[["default"]] = df[["default"]].apply(LabelEncoder().fit_transform)
df[["housing"]] = df[["housing"]].apply(LabelEncoder().fit_transform)
df[["loan"]] = df[["loan"]].apply(LabelEncoder().fit_transform)
df[["contact"]] = df[["contact"]].apply(LabelEncoder().fit_transform)
df[["month"]] = df[["month"]].apply(LabelEncoder().fit_transform)
df[["day_of_week"]] = df[["day_of_week"]].apply(LabelEncoder().fit_transform)
df[["poutcome"]] = df[["poutcome"]].apply(LabelEncoder().fit_transform)

df.head()









    Out[15]:






  
    
      
      age
      job
      marital
      education
      default
      housing
      loan
      contact
      month
      day_of_week
      ...
      campaign
      pdays
      previous
      poutcome
      emp.var.rate
      cons.price.idx
      cons.conf.idx
      euribor3m
      nr.employed
      y
    
  
  
    
      0
      56
      3
      1
      0
      0
      0
      0
      1
      6
      1
      ...
      1
      999
      0
      1
      1.1
      93.994
      -36.4
      4.857
      5191.0
      no
    
    
      1
      57
      7
      1
      3
      1
      0
      0
      1
      6
      1
      ...
      1
      999
      0
      1
      1.1
      93.994
      -36.4
      4.857
      5191.0
      no
    
    
      2
      37
      7
      1
      3
      0
      2
      0
      1
      6
      1
      ...
      1
      999
      0
      1
      1.1
      93.994
      -36.4
      4.857
      5191.0
      no
    
    
      3
      40
      0
      1
      1
      0
      0
      0
      1
      6
      1
      ...
      1
      999
      0
      1
      1.1
      93.994
      -36.4
      4.857
      5191.0
      no
    
    
      4
      56
      7
      1
      3
      0
      0
      2
      1
      6
      1
      ...
      1
      999
      0
      1
      1.1
      93.994
      -36.4
      4.857
      5191.0
      no
    
  

5 rows × 21 columns



In [20]:

    
df_dummies = pd.get_dummies(df)
df_dummies.head()









    Out[20]:






  
    
      
      age
      duration
      campaign
      pdays
      previous
      emp.var.rate
      cons.price.idx
      cons.conf.idx
      euribor3m
      nr.employed
      ...
      day_of_week_fri
      day_of_week_mon
      day_of_week_thu
      day_of_week_tue
      day_of_week_wed
      poutcome_failure
      poutcome_nonexistent
      poutcome_success
      y_no
      y_yes
    
  
  
    
      0
      56
      261
      1
      999
      0
      1.1
      93.994
      -36.4
      4.857
      5191.0
      ...
      0
      1
      0
      0
      0
      0
      1
      0
      1
      0
    
    
      1
      57
      149
      1
      999
      0
      1.1
      93.994
      -36.4
      4.857
      5191.0
      ...
      0
      1
      0
      0
      0
      0
      1
      0
      1
      0
    
    
      2
      37
      226
      1
      999
      0
      1.1
      93.994
      -36.4
      4.857
      5191.0
      ...
      0
      1
      0
      0
      0
      0
      1
      0
      1
      0
    
    
      3
      40
      151
      1
      999
      0
      1.1
      93.994
      -36.4
      4.857
      5191.0
      ...
      0
      1
      0
      0
      0
      0
      1
      0
      1
      0
    
    
      4
      56
      307
      1
      999
      0
      1.1
      93.994
      -36.4
      4.857
      5191.0
      ...
      0
      1
      0
      0
      0
      0
      1
      0
      1
      0
    
  

5 rows × 65 columns



In [31]:

    
plt.hist(df_dummies["y_yes"].values)
plt.title("Yes Histogram")
plt.xlabel("Value")
plt.ylabel("Occurance")
plt.plot()









    Out[31]:





[]



In [36]:

    
X_df = df[features].copy()
y_df = df[outcomes].copy() 
X_df.head()

X = X_df.values
y = y_df.values.T[0]
y = (y-y.min())/(y.max() - y.min())
clf.fit(X,y)
print mse(y,clf.predict(X))









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-36-2f135b34c969> in <module>()
      7 X = X_df.values
      8 y = y_df.values.T[0]
----> 9 y = (y-y.min())/(y.max() - y.min())
     10 clf.fit(X,y)
     11 print mse(y,clf.predict(X))

TypeError: unsupported operand type(s) for -: 'str' and 'str'



In [ ]:

	age	job	marital	education	default	housing	loan	contact	month	day_of_week	campaign	pdays	poutcome	emp.var.rate	cons.price.idx	cons.conf.idx	euribor3m	nr.employed
0	56	housemaid	married	basic.4y	no	no	no	telephone	may	mon	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0
1	57	services	married	high.school	unknown	no	no	telephone	may	mon	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0
2	37	services	married	high.school	no	yes	no	telephone	may	mon	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0
3	40	admin.	married	basic.6y	no	no	no	telephone	may	mon	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0
4	56	services	married	high.school	no	no	yes	telephone	may	mon	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0