Supervised learning

Data Representations

Dataset Split


In [1]:
from preamble import *
%matplotlib notebook

In [5]:
# read data.
# you can find a description in bank/bank-campaign-desc.txt
data = pd.read_csv("data/bank-campaign.csv")

In [6]:
data.shape


Out[6]:
(41188, 64)

In [7]:
data.columns


Out[7]:
Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'day_of_week_fri', 'day_of_week_mon', 'day_of_week_thu',
       'day_of_week_tue', 'day_of_week_wed', 'poutcome_failure',
       'poutcome_nonexistent', 'poutcome_success', 'target'],
      dtype='object')

In [8]:
data.head()


Out[8]:
age duration campaign pdays previous emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed ... month_sep day_of_week_fri day_of_week_mon day_of_week_thu day_of_week_tue day_of_week_wed poutcome_failure poutcome_nonexistent poutcome_success target
0 56 261 1 999 0 1.1 93.994 -36.4 4.857 5191.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 no
1 57 149 1 999 0 1.1 93.994 -36.4 4.857 5191.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 no
2 37 226 1 999 0 1.1 93.994 -36.4 4.857 5191.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 no
3 40 151 1 999 0 1.1 93.994 -36.4 4.857 5191.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 no
4 56 307 1 999 0 1.1 93.994 -36.4 4.857 5191.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 no

5 rows × 64 columns


In [9]:
y = data.target.values

In [10]:
X = data.drop("target", axis=1).values

In [11]:
X.shape


Out[11]:
(41188, 63)

In [12]:
y.shape


Out[12]:
(41188,)

The data is always a NumPy array (or SciPy sparse matrix) of shape (n_samples, n_features).
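
A quick check of this convention for the arrays created above; the sparse conversion is only an illustration and is not used later in the notebook:


In [ ]:
from scipy import sparse

# dense case: a 2D numpy array of shape (n_samples, n_features)
print(type(X), X.shape)
# a scipy sparse matrix of the same shape would also be accepted by scikit-learn estimators
print(sparse.csr_matrix(X).shape)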

Splitting the data:


In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [16]:
# import model
from sklearn.linear_model import LogisticRegression
# instantiate model, set parameters
lr = LogisticRegression()
# fit model
lr.fit(X_train, y_train)


Out[16]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Make predictions:


In [17]:
lr.predict(X_train)[:10]


Out[17]:
array(['no', 'no', 'no', 'no', 'no', 'no', 'no', 'yes', 'no', 'no'], dtype=object)

In [18]:
lr.score(X_train, y_train)


Out[18]:
0.90890550645819168

In [19]:
lr.score(X_test, y_test)


Out[19]:
0.91201320773040695

In [ ]:

``model.fit(X_train, [y_train])``

``model.predict(X_test)``: Classification, Regression, Clustering

``model.transform(X_test)``: Preprocessing, Dimensionality Reduction, Feature Extraction, Feature selection
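
The ``fit``/``predict`` side is what we just used with LogisticRegression; as a sketch of the ``fit``/``transform`` side (not demonstrated elsewhere in this notebook), a preprocessing estimator such as StandardScaler follows the same pattern:


In [ ]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# learn the per-feature mean and standard deviation on the training set
scaler.fit(X_train)
# apply the same transformation to training and test data
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)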

Additional methods

Model evaluation: score(X, [y])

Uncertainties from Classifiers: decision_function(X) and predict_proba(X)

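A sketch of what these look like for the LogisticRegression model fitted above (the column order of predict_proba follows lr.classes_):


In [ ]:
# estimated class probabilities for the first five test samples
print(lr.predict_proba(X_test)[:5])
# signed distance from the decision boundary; positive values favour the second class in lr.classes_
print(lr.decision_function(X_test)[:5])
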
Method chaining

Shorter, but maybe less readable.


In [20]:
# this is short, but we never stored the model
LogisticRegression().fit(X_train, y_train).score(X_test, y_test)


Out[20]:
0.91201320773040695

Exercise

Load the dataset data/bike_day_raw.csv, which has the regression target cnt. This dataset contains daily bike rentals from a city bike-sharing platform. The cnt column is the number of rentals, which we want to predict from date and weather features.

Split the data into a training and a test set using train_test_split. Use the LinearRegression class to learn a regression model on this data. You can evaluate it with the score method, which computes $R^2$, or with the mean_squared_error function from sklearn.metrics (or write it yourself in numpy). A sketch of one possible solution appears at the end of this section.


In [21]:
pd.read_csv("data/bike_day_raw.csv")


Out[21]:
season mnth holiday weekday workingday weathersit temp atemp hum windspeed cnt
0 1 1 0 6 0 2 0.344167 0.363625 0.805833 0.160446 985
1 1 1 0 0 0 2 0.363478 0.353739 0.696087 0.248539 801
2 1 1 0 1 1 1 0.196364 0.189405 0.437273 0.248309 1349
3 1 1 0 2 1 1 0.200000 0.212122 0.590435 0.160296 1562
4 1 1 0 3 1 1 0.226957 0.229270 0.436957 0.186900 1600
5 1 1 0 4 1 1 0.204348 0.233209 0.518261 0.089565 1606
6 1 1 0 5 1 2 0.196522 0.208839 0.498696 0.168726 1510
7 1 1 0 6 0 2 0.165000 0.162254 0.535833 0.266804 959
8 1 1 0 0 0 1 0.138333 0.116175 0.434167 0.361950 822
9 1 1 0 1 1 1 0.150833 0.150888 0.482917 0.223267 1321
10 1 1 0 2 1 2 0.169091 0.191464 0.686364 0.122132 1263
11 1 1 0 3 1 1 0.172727 0.160473 0.599545 0.304627 1162
12 1 1 0 4 1 1 0.165000 0.150883 0.470417 0.301000 1406
13 1 1 0 5 1 1 0.160870 0.188413 0.537826 0.126548 1421
14 1 1 0 6 0 2 0.233333 0.248112 0.498750 0.157963 1248
15 1 1 0 0 0 1 0.231667 0.234217 0.483750 0.188433 1204
16 1 1 1 1 0 2 0.175833 0.176771 0.537500 0.194017 1000
17 1 1 0 2 1 2 0.216667 0.232333 0.861667 0.146775 683
18 1 1 0 3 1 2 0.292174 0.298422 0.741739 0.208317 1650
19 1 1 0 4 1 2 0.261667 0.255050 0.538333 0.195904 1927
20 1 1 0 5 1 1 0.177500 0.157833 0.457083 0.353242 1543
21 1 1 0 6 0 1 0.059130 0.079070 0.400000 0.171970 981
22 1 1 0 0 0 1 0.096522 0.098839 0.436522 0.246600 986
23 1 1 0 1 1 1 0.097391 0.117930 0.491739 0.158330 1416
24 1 1 0 2 1 2 0.223478 0.234526 0.616957 0.129796 1985
25 1 1 0 3 1 3 0.217500 0.203600 0.862500 0.293850 506
26 1 1 0 4 1 1 0.195000 0.219700 0.687500 0.113837 431
27 1 1 0 5 1 2 0.203478 0.223317 0.793043 0.123300 1167
28 1 1 0 6 0 1 0.196522 0.212126 0.651739 0.145365 1098
29 1 1 0 0 0 1 0.216522 0.250322 0.722174 0.073983 1096
... ... ... ... ... ... ... ... ... ... ... ...
701 4 12 0 0 0 2 0.347500 0.359208 0.823333 0.124379 4649
702 4 12 0 1 1 1 0.452500 0.455796 0.767500 0.082721 6234
703 4 12 0 2 1 1 0.475833 0.469054 0.733750 0.174129 6606
704 4 12 0 3 1 1 0.438333 0.428012 0.485000 0.324021 5729
705 4 12 0 4 1 1 0.255833 0.258204 0.508750 0.174754 5375
706 4 12 0 5 1 2 0.320833 0.321958 0.764167 0.130600 5008
707 4 12 0 6 0 2 0.381667 0.389508 0.911250 0.101379 5582
708 4 12 0 0 0 2 0.384167 0.390146 0.905417 0.157975 3228
709 4 12 0 1 1 2 0.435833 0.435575 0.925000 0.190308 5170
710 4 12 0 2 1 2 0.353333 0.338363 0.596667 0.296037 5501
711 4 12 0 3 1 2 0.297500 0.297338 0.538333 0.162937 5319
712 4 12 0 4 1 1 0.295833 0.294188 0.485833 0.174129 5532
713 4 12 0 5 1 1 0.281667 0.294192 0.642917 0.131229 5611
714 4 12 0 6 0 1 0.324167 0.338383 0.650417 0.106350 5047
715 4 12 0 0 0 2 0.362500 0.369938 0.838750 0.100742 3786
716 4 12 0 1 1 2 0.393333 0.401500 0.907083 0.098258 4585
717 4 12 0 2 1 1 0.410833 0.409708 0.666250 0.221404 5557
718 4 12 0 3 1 1 0.332500 0.342162 0.625417 0.184092 5267
719 4 12 0 4 1 2 0.330000 0.335217 0.667917 0.132463 4128
720 1 12 0 5 1 2 0.326667 0.301767 0.556667 0.374383 3623
721 1 12 0 6 0 1 0.265833 0.236113 0.441250 0.407346 1749
722 1 12 0 0 0 1 0.245833 0.259471 0.515417 0.133083 1787
723 1 12 0 1 1 2 0.231304 0.258900 0.791304 0.077230 920
724 1 12 1 2 0 2 0.291304 0.294465 0.734783 0.168726 1013
725 1 12 0 3 1 3 0.243333 0.220333 0.823333 0.316546 441
726 1 12 0 4 1 2 0.254167 0.226642 0.652917 0.350133 2114
727 1 12 0 5 1 2 0.253333 0.255046 0.590000 0.155471 3095
728 1 12 0 6 0 2 0.253333 0.242400 0.752917 0.124383 1341
729 1 12 0 0 0 1 0.255833 0.231700 0.483333 0.350754 1796
730 1 12 0 1 1 2 0.215833 0.223487 0.577500 0.154846 2729

731 rows × 11 columns


In [22]:
import pandas as pd
pd.__version__


Out[22]:
'0.18.0'

In [ ]:
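
One possible solution sketch for the exercise, assuming only the file path and the cnt target named in the exercise text; the variable names are arbitrary:


In [ ]:
# one possible approach to the exercise
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

bike = pd.read_csv("data/bike_day_raw.csv")
X_bike = bike.drop("cnt", axis=1).values
y_bike = bike.cnt.values

X_bike_train, X_bike_test, y_bike_train, y_bike_test = train_test_split(
    X_bike, y_bike, random_state=0)

linreg = LinearRegression().fit(X_bike_train, y_bike_train)
# R^2 on the test set
print(linreg.score(X_bike_test, y_bike_test))
# mean squared error on the test set
print(mean_squared_error(y_bike_test, linreg.predict(X_bike_test)))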