This notebook is designed to be a starting point, the hello world scenario, of a regression problem We try to keep this up to date with best practices, such that it is easy to get an initial model trained, with sufficient endpoints to then start customizing and tuning the pipeline to your specific situations.
The notebook is tailored for the case where you get a labelled trainingset and an unlabeled 'challenge' set. This is slightly different from a testset, because usually you do have the labels of the test set, but you just use them to determine a quality measure.
In [1]:
import os
print os.getcwd()
/Users/gerben/Projects/bdr-analytics-py/notebooks
We here import bdr-analytics -py from a local checkout. This is not necessary if it is installed in your (conda) environment.
In [2]:
import sys
sys.path.append("../../bdr-analytics-py")
%load_ext autoreload
%autoreload 2
In [3]:
# import generic packages
import numpy as np
import pandas as pd
from IPython.display import display
# visualization packages
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
plt.style.use('fivethirtyeight')
%matplotlib inline
In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, Imputer, StandardScaler
from bdranalytics.pipeline.encoders import WeightOfEvidenceEncoder
from bdranalytics.pipeline.encoders import ColumnSelector, StringIndexer
from bdranalytics.pandaspipeline.transformers import PdFeatureUnion
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer, r2_score
In [5]:
from sklearn.datasets import load_boston
raw = load_boston()
df = pd.DataFrame(raw['data'], columns=raw['feature_names'])
df['TARGET'] = raw['target']
# for educational purpose, let us add a date
ts = pd.date_range('1/1/2011', periods=len(df), freq='H').strftime('%Y-%m-%d %H:%M:%S')
np.random.shuffle(ts)
df['example_date'] = ts
# for educational purposes, let us add a category
cat = np.floor(df['AGE'] % 6)
df['example_cat'] = pd.Series(cat).map({0:'ZERO', 1:'UNO', 2:'TWO', 3:'TRES', 4:'QUAT', 5:'SIX'})
# for educational purposes, let us add a very high cardinality category
df['example_cardinalcat'] = pd.Series(np.floor(df.index.values**2)).astype(str)+'_str'
print("{:d} samples".format(len(df)))
df.index.name = 'INDEX'
df.head()
506 samples
Out[5]:
CRIM
ZN
INDUS
CHAS
NOX
RM
AGE
DIS
RAD
TAX
PTRATIO
B
LSTAT
TARGET
example_date
example_cat
example_cardinalcat
INDEX
0
0.00632
18.0
2.31
0.0
0.538
6.575
65.2
4.0900
1.0
296.0
15.3
396.90
4.98
24.0
2011-01-03 02:00:00
SIX
0.0_str
1
0.02731
0.0
7.07
0.0
0.469
6.421
78.9
4.9671
2.0
242.0
17.8
396.90
9.14
21.6
2011-01-15 00:00:00
ZERO
1.0_str
2
0.02729
0.0
7.07
0.0
0.469
7.185
61.1
4.9671
2.0
242.0
17.8
392.83
4.03
34.7
2011-01-18 03:00:00
UNO
4.0_str
3
0.03237
0.0
2.18
0.0
0.458
6.998
45.8
6.0622
3.0
222.0
18.7
394.63
2.94
33.4
2011-01-18 02:00:00
TRES
9.0_str
4
0.06905
0.0
2.18
0.0
0.458
7.147
54.2
6.0622
3.0
222.0
18.7
396.90
5.33
36.2
2011-01-18 20:00:00
ZERO
16.0_str
In [6]:
labelled = df.iloc[:-100,:]
challenge = df.iloc[-100:,:]
Note: if you have a dataset without labels, assign it to challenge
, the labelled
set is the set using which we train (and where we extract a testset from).
Remove the target column from the train dataset
In [7]:
y_col = 'TARGET'
X = labelled.reset_index().drop([y_col, labelled.index.name], 1).copy()
y = labelled[y_col].copy()
# assign items to groups, train and test set will be stratified selection from groups
groups = np.random.randint(1, 10, len(y)) # here we define random groups, could also be the class index taken from X
In [8]:
def root_mean_squared_error(y_true, y_pred, sample_weight=None, multioutput='uniform_average'):
return np.sqrt(mean_squared_error(y_true, y_pred, sample_weight, multioutput))
In [9]:
model_score = root_mean_squared_error
model_scorer = make_scorer(root_mean_squared_error, greater_is_better=False)
Print some values to get some shallow information about the data set (that's why we haven't extracted a test set yet)
In [10]:
display(X.head())
print(X.columns.values)
display(y.head())
CRIM
ZN
INDUS
CHAS
NOX
RM
AGE
DIS
RAD
TAX
PTRATIO
B
LSTAT
example_date
example_cat
example_cardinalcat
0
0.00632
18.0
2.31
0.0
0.538
6.575
65.2
4.0900
1.0
296.0
15.3
396.90
4.98
2011-01-03 02:00:00
SIX
0.0_str
1
0.02731
0.0
7.07
0.0
0.469
6.421
78.9
4.9671
2.0
242.0
17.8
396.90
9.14
2011-01-15 00:00:00
ZERO
1.0_str
2
0.02729
0.0
7.07
0.0
0.469
7.185
61.1
4.9671
2.0
242.0
17.8
392.83
4.03
2011-01-18 03:00:00
UNO
4.0_str
3
0.03237
0.0
2.18
0.0
0.458
6.998
45.8
6.0622
3.0
222.0
18.7
394.63
2.94
2011-01-18 02:00:00
TRES
9.0_str
4
0.06905
0.0
2.18
0.0
0.458
7.147
54.2
6.0622
3.0
222.0
18.7
396.90
5.33
2011-01-18 20:00:00
ZERO
16.0_str
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
'B' 'LSTAT' 'example_date' 'example_cat' 'example_cardinalcat']
INDEX
0 24.0
1 21.6
2 34.7
3 33.4
4 36.2
Name: TARGET, dtype: float64
In [11]:
from pandas_profiling import ProfileReport
profile = ProfileReport(X, correlation_overrides=list(X.columns))
profile
/Users/gerben/anaconda3/envs/bdranalytics/lib/python2.7/site-packages/matplotlib/__init__.py:1405: UserWarning:
This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.
warnings.warn(_use_error_msg)
Out[11]:
Overview
Dataset info
Number of variables
16
Number of observations
406
Total Missing (%)
0.0%
Total size in memory
50.8 KiB
Average record size in memory
128.2 B
Variables types
Numeric
13
Categorical
1
Date
0
Text (Unique)
2
Rejected
0
Warnings
CHAS
has 371 / 91.4% zerosZN
has 272 / 67.0% zeros
Variables
AGE
Numeric
Distinct count
309
Unique (%)
76.1%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
64.567
Minimum
2.9
Maximum
100
Zeros (%)
0.0%
Quantile statistics
Minimum
2.9
5-th percentile
16.475
Q1
38.15
Median
69.65
Q3
92.35
95-th percentile
100
Maximum
100
Range
97.1
Interquartile range
54.2
Descriptive statistics
Standard deviation
29.054
Coef of variation
0.44999
Kurtosis
-1.2261
Mean
64.567
MAD
25.781
Skewness
-0.37336
Sum
26214
Variance
844.15
Memory size
3.2 KiB
Value
Count
Frequency (%)
100.0
29
7.1%
96.0
4
1.0%
21.4
3
0.7%
97.0
3
0.7%
98.2
3
0.7%
98.9
3
0.7%
96.2
3
0.7%
97.9
3
0.7%
98.8
3
0.7%
32.2
3
0.7%
Other values (299)
349
86.0%
Minimum 5 values
Value
Count
Frequency (%)
2.9
1
0.2%
6.0
1
0.2%
6.2
1
0.2%
6.5
1
0.2%
6.6
2
0.5%
Maximum 5 values
Value
Count
Frequency (%)
98.5
1
0.2%
98.8
3
0.7%
98.9
3
0.7%
99.1
1
0.2%
100.0
29
7.1%
B
Numeric
Distinct count
279
Unique (%)
68.7%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
379.86
Minimum
70.8
Maximum
396.9
Zeros (%)
0.0%
Quantile statistics
Minimum
70.8
5-th percentile
329.6
Q1
380.92
Median
392.52
Q3
396.9
95-th percentile
396.9
Maximum
396.9
Range
326.1
Interquartile range
15.977
Descriptive statistics
Standard deviation
40.495
Coef of variation
0.10661
Kurtosis
28.334
Mean
379.86
MAD
20.215
Skewness
-4.925
Sum
154220
Variance
1639.9
Memory size
3.2 KiB
Value
Count
Frequency (%)
396.9
103
25.4%
395.24
3
0.7%
393.74
3
0.7%
394.12
2
0.5%
395.63
2
0.5%
392.78
2
0.5%
391.34
2
0.5%
393.23
2
0.5%
395.62
2
0.5%
377.07
2
0.5%
Other values (269)
283
69.7%
Minimum 5 values
Value
Count
Frequency (%)
70.8
1
0.2%
88.01
1
0.2%
88.63
1
0.2%
131.42
1
0.2%
169.27
1
0.2%
Maximum 5 values
Value
Count
Frequency (%)
396.28
1
0.2%
396.3
1
0.2%
396.33
1
0.2%
396.42
1
0.2%
396.9
103
25.4%
CHAS
Numeric
Distinct count
2
Unique (%)
0.5%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.086207
Minimum
0
Maximum
1
Zeros (%)
91.4%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0
Descriptive statistics
Standard deviation
0.28102
Coef of variation
3.2598
Kurtosis
6.7924
Mean
0.086207
MAD
0.15755
Skewness
2.9596
Sum
35
Variance
0.07897
Memory size
3.2 KiB
Value
Count
Frequency (%)
0.0
371
91.4%
1.0
35
8.6%
Minimum 5 values
Value
Count
Frequency (%)
0.0
371
91.4%
1.0
35
8.6%
Maximum 5 values
Value
Count
Frequency (%)
0.0
371
91.4%
1.0
35
8.6%
CRIM
Numeric
Distinct count
405
Unique (%)
99.8%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
2.176
Minimum
0.00632
Maximum
88.976
Zeros (%)
0.0%
Quantile statistics
Minimum
0.00632
5-th percentile
0.022648
Q1
0.066475
Median
0.16831
Q3
0.78533
95-th percentile
11.46
Maximum
88.976
Range
88.97
Interquartile range
0.71885
Descriptive statistics
Standard deviation
7.2834
Coef of variation
3.3471
Kurtosis
69.363
Mean
2.176
MAD
3.1605
Skewness
7.2965
Sum
883.47
Variance
53.048
Memory size
3.2 KiB
Value
Count
Frequency (%)
0.01501
2
0.5%
0.08265
1
0.2%
0.1
1
0.2%
0.537
1
0.2%
0.97617
1
0.2%
1.35472
1
0.2%
0.14103
1
0.2%
0.03502
1
0.2%
0.03615
1
0.2%
0.66351
1
0.2%
Other values (395)
395
97.3%
Minimum 5 values
Value
Count
Frequency (%)
0.00632
1
0.2%
0.00906
1
0.2%
0.01096
1
0.2%
0.01301
1
0.2%
0.01311
1
0.2%
Maximum 5 values
Value
Count
Frequency (%)
25.0461
1
0.2%
38.3518
1
0.2%
41.5292
1
0.2%
67.9208
1
0.2%
88.9762
1
0.2%
DIS
Numeric
Distinct count
315
Unique (%)
77.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
4.1611
Minimum
1.1296
Maximum
12.127
Zeros (%)
0.0%
Quantile statistics
Minimum
1.1296
5-th percentile
1.4448
Q1
2.2741
Median
3.8182
Q3
5.6014
95-th percentile
8.0136
Maximum
12.127
Range
10.997
Interquartile range
3.3273
Descriptive statistics
Standard deviation
2.184
Coef of variation
0.52486
Kurtosis
0.0044272
Mean
4.1611
MAD
1.7959
Skewness
0.73058
Sum
1689.4
Variance
4.7699
Memory size
3.2 KiB
Value
Count
Frequency (%)
3.4952
5
1.2%
5.4007
4
1.0%
6.8147
4
1.0%
5.7209
4
1.0%
5.2873
4
1.0%
7.8278
3
0.7%
3.9454
3
0.7%
7.309
3
0.7%
3.6519
3
0.7%
4.8122
3
0.7%
Other values (305)
370
91.1%
Minimum 5 values
Value
Count
Frequency (%)
1.1296
1
0.2%
1.137
1
0.2%
1.1691
1
0.2%
1.1742
1
0.2%
1.2024
1
0.2%
Maximum 5 values
Value
Count
Frequency (%)
9.2203
2
0.5%
9.2229
1
0.2%
10.5857
2
0.5%
10.7103
2
0.5%
12.1265
1
0.2%
INDUS
Numeric
Distinct count
73
Unique (%)
18.0%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
9.5447
Minimum
0.46
Maximum
25.65
Zeros (%)
0.0%
Quantile statistics
Minimum
0.46
5-th percentile
2.0125
Q1
4.39
Median
7.38
Q3
17.335
95-th percentile
21.89
Maximum
25.65
Range
25.19
Interquartile range
12.945
Descriptive statistics
Standard deviation
6.5479
Coef of variation
0.68602
Kurtosis
-0.76646
Mean
9.5447
MAD
5.5528
Skewness
0.72425
Sum
3875.1
Variance
42.875
Memory size
3.2 KiB
Value
Count
Frequency (%)
18.1
50
12.3%
19.58
30
7.4%
8.14
22
5.4%
6.2
18
4.4%
21.89
15
3.7%
3.97
12
3.0%
9.9
12
3.0%
10.59
11
2.7%
8.56
11
2.7%
5.86
10
2.5%
Other values (63)
215
53.0%
Minimum 5 values
Value
Count
Frequency (%)
0.46
1
0.2%
0.74
1
0.2%
1.21
1
0.2%
1.22
1
0.2%
1.25
2
0.5%
Maximum 5 values
Value
Count
Frequency (%)
15.04
3
0.7%
18.1
50
12.3%
19.58
30
7.4%
21.89
15
3.7%
25.65
7
1.7%
LSTAT
Numeric
Distinct count
368
Unique (%)
90.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
11.423
Minimum
1.73
Maximum
37.97
Zeros (%)
0.0%
Quantile statistics
Minimum
1.73
5-th percentile
3.5325
Q1
6.36
Median
9.685
Q3
14.685
95-th percentile
26.808
Maximum
37.97
Range
36.24
Interquartile range
8.325
Descriptive statistics
Standard deviation
6.8769
Coef of variation
0.60202
Kurtosis
1.2681
Mean
11.423
MAD
5.3388
Skewness
1.2112
Sum
4637.8
Variance
47.292
Memory size
3.2 KiB
Value
Count
Frequency (%)
8.05
3
0.7%
7.79
3
0.7%
6.36
3
0.7%
9.5
2
0.5%
4.45
2
0.5%
12.67
2
0.5%
13.15
2
0.5%
5.49
2
0.5%
30.81
2
0.5%
3.76
2
0.5%
Other values (358)
383
94.3%
Minimum 5 values
Value
Count
Frequency (%)
1.73
1
0.2%
1.92
1
0.2%
1.98
1
0.2%
2.47
1
0.2%
2.87
1
0.2%
Maximum 5 values
Value
Count
Frequency (%)
30.81
2
0.5%
31.99
1
0.2%
34.41
1
0.2%
34.77
1
0.2%
37.97
1
0.2%
NOX
Numeric
Distinct count
67
Unique (%)
16.5%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.53253
Minimum
0.385
Maximum
0.871
Zeros (%)
0.0%
Quantile statistics
Minimum
0.385
5-th percentile
0.405
Q1
0.44223
Median
0.507
Q3
0.605
95-th percentile
0.77
Maximum
0.871
Range
0.486
Interquartile range
0.16277
Descriptive statistics
Standard deviation
0.11477
Coef of variation
0.21552
Kurtosis
1.071
Mean
0.53253
MAD
0.089102
Skewness
1.2084
Sum
216.21
Variance
0.013173
Memory size
3.2 KiB
Value
Count
Frequency (%)
0.538
23
5.7%
0.437
17
4.2%
0.871
16
3.9%
0.624
15
3.7%
0.489
15
3.7%
0.605
14
3.4%
0.693
13
3.2%
0.544
12
3.0%
0.52
11
2.7%
0.7
11
2.7%
Other values (57)
259
63.8%
Minimum 5 values
Value
Count
Frequency (%)
0.385
1
0.2%
0.389
1
0.2%
0.392
2
0.5%
0.394
1
0.2%
0.398
2
0.5%
Maximum 5 values
Value
Count
Frequency (%)
0.693
13
3.2%
0.7
11
2.7%
0.718
3
0.7%
0.77
8
2.0%
0.871
16
3.9%
PTRATIO
Numeric
Distinct count
45
Unique (%)
11.1%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
18.037
Minimum
12.6
Maximum
22
Zeros (%)
0.0%
Quantile statistics
Minimum
12.6
5-th percentile
14.7
Q1
16.6
Median
18.4
Q3
20.2
95-th percentile
21
Maximum
22
Range
9.4
Interquartile range
3.6
Descriptive statistics
Standard deviation
2.22
Coef of variation
0.12308
Kurtosis
-0.6161
Mean
18.037
MAD
1.8232
Skewness
-0.47677
Sum
7323
Variance
4.9282
Memory size
3.2 KiB
Value
Count
Frequency (%)
20.2
58
14.3%
14.7
34
8.4%
17.8
23
5.7%
21.0
22
5.4%
17.4
18
4.4%
19.1
17
4.2%
18.6
17
4.2%
16.6
16
3.9%
18.4
16
3.9%
21.2
15
3.7%
Other values (35)
170
41.9%
Minimum 5 values
Value
Count
Frequency (%)
12.6
3
0.7%
13.0
12
3.0%
13.6
1
0.2%
14.4
1
0.2%
14.7
34
8.4%
Maximum 5 values
Value
Count
Frequency (%)
20.9
11
2.7%
21.0
22
5.4%
21.1
1
0.2%
21.2
15
3.7%
22.0
2
0.5%
RAD
Numeric
Distinct count
9
Unique (%)
2.2%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
6.8744
Minimum
1
Maximum
24
Zeros (%)
0.0%
Quantile statistics
Minimum
1
5-th percentile
2
Q1
4
Median
5
Q3
6
95-th percentile
24
Maximum
24
Range
23
Interquartile range
2
Descriptive statistics
Standard deviation
6.5994
Coef of variation
0.96
Kurtosis
2.7241
Mean
6.8744
MAD
4.3617
Skewness
2.0709
Sum
2791
Variance
43.552
Memory size
3.2 KiB
Value
Count
Frequency (%)
5.0
115
28.3%
4.0
105
25.9%
24.0
50
12.3%
3.0
38
9.4%
8.0
24
5.9%
2.0
24
5.9%
6.0
18
4.4%
7.0
17
4.2%
1.0
15
3.7%
Minimum 5 values
Value
Count
Frequency (%)
1.0
15
3.7%
2.0
24
5.9%
3.0
38
9.4%
4.0
105
25.9%
5.0
115
28.3%
Maximum 5 values
Value
Count
Frequency (%)
5.0
115
28.3%
6.0
18
4.4%
7.0
17
4.2%
8.0
24
5.9%
24.0
50
12.3%
RM
Numeric
Distinct count
374
Unique (%)
92.1%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
6.3295
Minimum
3.561
Maximum
8.78
Zeros (%)
0.0%
Quantile statistics
Minimum
3.561
5-th percentile
5.3522
Q1
5.8883
Median
6.2245
Q3
6.677
95-th percentile
7.7927
Maximum
8.78
Range
5.219
Interquartile range
0.78875
Descriptive statistics
Standard deviation
0.73191
Coef of variation
0.11563
Kurtosis
1.6116
Mean
6.3295
MAD
0.54106
Skewness
0.43551
Sum
2569.8
Variance
0.53569
Memory size
3.2 KiB
Value
Count
Frequency (%)
6.127
3
0.7%
6.405
3
0.7%
6.431
2
0.5%
6.727
2
0.5%
6.417
2
0.5%
6.108
2
0.5%
5.404
2
0.5%
5.713
2
0.5%
6.004
2
0.5%
5.813
2
0.5%
Other values (364)
384
94.6%
Minimum 5 values
Value
Count
Frequency (%)
3.561
1
0.2%
3.863
1
0.2%
4.138
1
0.2%
4.368
1
0.2%
4.652
1
0.2%
Maximum 5 values
Value
Count
Frequency (%)
8.375
1
0.2%
8.398
1
0.2%
8.704
1
0.2%
8.725
1
0.2%
8.78
1
0.2%
TAX
Numeric
Distinct count
63
Unique (%)
15.5%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
354.45
Minimum
187
Maximum
666
Zeros (%)
0.0%
Quantile statistics
Minimum
187
5-th percentile
216
Q1
276
Median
307
Q3
403
95-th percentile
666
Maximum
666
Range
479
Interquartile range
127
Descriptive statistics
Standard deviation
133.05
Coef of variation
0.37537
Kurtosis
0.99077
Mean
354.45
MAD
100.87
Skewness
1.3722
Sum
143910
Variance
17703
Memory size
3.2 KiB
Value
Count
Frequency (%)
666.0
50
12.3%
307.0
40
9.9%
403.0
30
7.4%
437.0
15
3.7%
304.0
14
3.4%
398.0
12
3.0%
264.0
12
3.0%
384.0
11
2.7%
277.0
11
2.7%
224.0
10
2.5%
Other values (53)
201
49.5%
Minimum 5 values
Value
Count
Frequency (%)
187.0
1
0.2%
188.0
7
1.7%
193.0
8
2.0%
198.0
1
0.2%
216.0
5
1.2%
Maximum 5 values
Value
Count
Frequency (%)
430.0
3
0.7%
432.0
9
2.2%
437.0
15
3.7%
469.0
1
0.2%
666.0
50
12.3%
ZN
Numeric
Distinct count
26
Unique (%)
6.4%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
14.163
Minimum
0
Maximum
100
Zeros (%)
67.0%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
20
95-th percentile
80
Maximum
100
Range
100
Interquartile range
20
Descriptive statistics
Standard deviation
25.269
Coef of variation
1.7842
Kurtosis
2.4529
Mean
14.163
MAD
19.058
Skewness
1.8715
Sum
5750
Variance
638.5
Memory size
3.2 KiB
Value
Count
Frequency (%)
0.0
272
67.0%
20.0
21
5.2%
80.0
15
3.7%
12.5
10
2.5%
22.0
10
2.5%
25.0
10
2.5%
40.0
7
1.7%
45.0
6
1.5%
30.0
6
1.5%
90.0
5
1.2%
Other values (16)
44
10.8%
Minimum 5 values
Value
Count
Frequency (%)
0.0
272
67.0%
12.5
10
2.5%
17.5
1
0.2%
18.0
1
0.2%
20.0
21
5.2%
Maximum 5 values
Value
Count
Frequency (%)
82.5
2
0.5%
85.0
2
0.5%
90.0
5
1.2%
95.0
4
1.0%
100.0
1
0.2%
example_cardinalcat
Categorical, Unique
First 3 values
114244.0_str
136900.0_str
44944.0_str
Last 3 values
55225.0_str
128164.0_str
43681.0_str
First 10 values
Value
Count
Frequency (%)
0.0_str
1
0.2%
1.0_str
1
0.2%
100.0_str
1
0.2%
10000.0_str
1
0.2%
100489.0_str
1
0.2%
Last 10 values
Value
Count
Frequency (%)
97969.0_str
1
0.2%
9801.0_str
1
0.2%
98596.0_str
1
0.2%
99225.0_str
1
0.2%
99856.0_str
1
0.2%
example_cat
Categorical
Distinct count
6
Unique (%)
1.5%
Missing (%)
0.0%
Missing (n)
0
QUAT
97
UNO
72
TWO
62
Other values (3)
175
Value
Count
Frequency (%)
QUAT
97
23.9%
UNO
72
17.7%
TWO
62
15.3%
ZERO
60
14.8%
SIX
59
14.5%
TRES
56
13.8%
example_date
Categorical, Unique
First 3 values
2011-01-10 07:00:00
2011-01-01 05:00:00
2011-01-14 17:00:00
Last 3 values
2011-01-20 18:00:00
2011-01-01 22:00:00
2011-01-01 14:00:00
First 10 values
Value
Count
Frequency (%)
2011-01-01 00:00:00
1
0.2%
2011-01-01 04:00:00
1
0.2%
2011-01-01 05:00:00
1
0.2%
2011-01-01 06:00:00
1
0.2%
2011-01-01 07:00:00
1
0.2%
Last 10 values
Value
Count
Frequency (%)
2011-01-21 18:00:00
1
0.2%
2011-01-21 19:00:00
1
0.2%
2011-01-21 20:00:00
1
0.2%
2011-01-21 21:00:00
1
0.2%
2011-01-22 00:00:00
1
0.2%
Sample
CRIM
ZN
INDUS
CHAS
NOX
RM
AGE
DIS
RAD
TAX
PTRATIO
B
LSTAT
example_date
example_cat
example_cardinalcat
0
0.00632
18.0
2.31
0.0
0.538
6.575
65.2
4.0900
1.0
296.0
15.3
396.90
4.98
2011-01-03 02:00:00
SIX
0.0_str
1
0.02731
0.0
7.07
0.0
0.469
6.421
78.9
4.9671
2.0
242.0
17.8
396.90
9.14
2011-01-15 00:00:00
ZERO
1.0_str
2
0.02729
0.0
7.07
0.0
0.469
7.185
61.1
4.9671
2.0
242.0
17.8
392.83
4.03
2011-01-18 03:00:00
UNO
4.0_str
3
0.03237
0.0
2.18
0.0
0.458
6.998
45.8
6.0622
3.0
222.0
18.7
394.63
2.94
2011-01-18 02:00:00
TRES
9.0_str
4
0.06905
0.0
2.18
0.0
0.458
7.147
54.2
6.0622
3.0
222.0
18.7
396.90
5.33
2011-01-18 20:00:00
ZERO
16.0_str
Feature generation can be divided into two different types, the row based which can be applied row by row, and the model based for which first all rows need to be analyzed.
An example of a row based feature generation is changing a string column containing a date time string, into columns containing the year, month, day, hour and minute, as this can be determined using only information within the row.
An example of a model based feature generation is changing a string column containing a users response, into a bag of words, as one needs to scan the full dataset to know which words are possible.
Applying a one-hot encoding can be either one. If one is quite sure all possible values are in the dataset, or one knows the possible values in advance, it is row based. A good example is the gender field with only Male or Female. If the values can be practically anything, it needs to be regarded model based. A good example is the city field of an address, or the postal code. Both are theoretically limited, but usually not all values are used in the dataset. As a general rule, if you really need to check which values are actually possible, it is model based.
The importance of this is the fact that row based feature generation can be applied before the train/test split, while the model based feature generation needs to be part of the model train pipeline.
Note : we first define transforms, and only at the end we apply them In this way we force ourselves to have a repeatable function, which we can also apply on the challenge dataset.
Based on the pandas dtypes we can divide the columns in different sets, and have a closer look into the categoricals. Especially the number of unique values per categorical column:
A note about identifiers: If a single value of a column can only map to a very small number of rows, these will trick the model in learning a map from id to value. Typical of such columns are the user id, address, but also timestamp.
In [12]:
booleans = []
numericals = X.select_dtypes(include=['float']).columns.values.tolist()
print("Numericals:")
display(numericals)
print("Categoricals with # unique values:")
categoricals = X.select_dtypes(include=['object']).columns.values.tolist()
display(X[categoricals].apply(lambda s: len(s.unique())))
unknowns = list(set(X.columns.values.tolist())-set(numericals)-set(categoricals))
if(len(unknowns)>0):
print("Columns of unhandled type:")
print(X.loc[:,unknowns].dtypes)
assert(len(unknowns)==0)
Numericals:
['CRIM',
'ZN',
'INDUS',
'CHAS',
'NOX',
'RM',
'AGE',
'DIS',
'RAD',
'TAX',
'PTRATIO',
'B',
'LSTAT']
Categoricals with # unique values:
example_date 406
example_cat 6
example_cardinalcat 406
dtype: int64
Dates just by themselves are often too unique, but can hold valueable information. As the parts are circular variables (24H is equal to 0H), we can tranform them while maintaining that aspect. Otherwise, parts can be one-hot encoded. As these are decisions to be made, we postpone them to after the train-test split.
What we do define now, is how to parse the dates we want to keep, and determine which dates we would remove unused.
In [13]:
dates = ['example_date']
display(X[dates].head(5))
example_date
0
2011-01-03 02:00:00
1
2011-01-15 00:00:00
2
2011-01-18 03:00:00
3
2011-01-18 02:00:00
4
2011-01-18 20:00:00
Now we define the format to be able to parse them. Check http://strftime.org/ for more details about formatting
In [14]:
date_formats = {
'example_date' : '%Y-%m-%d %H:%M:%S'
}
# verify parsing of all defined date_formats:
for col_name, col_format in date_formats.items():
display(
pd.to_datetime(
X[col_name],
format=col_format
).head(5))
0 2011-01-03 02:00:00
1 2011-01-15 00:00:00
2 2011-01-18 03:00:00
3 2011-01-18 02:00:00
4 2011-01-18 20:00:00
Name: example_date, dtype: datetime64[ns]
Now we remove from the categorical columns the ones we regard identifiers or highlycategoricals
In [15]:
identifiers = ['example_date']
identifiers = list( set(identifiers) - set(dates)) # remove the dates that will be transformed into lower cardinality
And we also remove from the categorical columns the ones we regard highlycategoricals
In [16]:
highlycategoricals = ['example_cardinalcat']
In [17]:
categoricals = list(set(categoricals) - set(highlycategoricals) - set(identifiers) - set(dates))
Now we've got the columns divided into the following groups. These can then be used to apply different model pipelines:
In [18]:
print("The identifiers (we will remove them):")
display(identifiers)
print("Dates (to parse and further process):")
display(dates)
print("Numericals:")
display(numericals)
print("The selected highly categoricals")
display(highlycategoricals)
print("The remaining regular categoricals")
display(categoricals)
The identifiers (we will remove them):
[]
Dates (to parse and further process):
['example_date']
Numericals:
['CRIM',
'ZN',
'INDUS',
'CHAS',
'NOX',
'RM',
'AGE',
'DIS',
'RAD',
'TAX',
'PTRATIO',
'B',
'LSTAT']
The selected highly categoricals
['example_cardinalcat']
The remaining regular categoricals
['example_cat']
In [19]:
def featurize(input_dataframe):
"""Applies all the model-less / row based feature generation on the provided pandas dataframe"""
df = input_dataframe.copy()
# handling identifiers
print "Removing identifier columns: {}".format(", ".join(identifiers))
df.drop(identifiers, axis=1, inplace=True)
# handling dates
for col_name in dates:
print "Parsing dates in column {}. Will show some examples when done:".format(col_name)
col_format = date_formats[col_name]
df[col_name] = pd.to_datetime(
df[col_name],
format=col_format
)
display(X[col_name].head(5))
return df
In [20]:
X = featurize(X)
challenge = featurize(challenge)
Removing identifier columns:
Parsing dates in column example_date. Will show some examples when done:
0 2011-01-03 02:00:00
1 2011-01-15 00:00:00
2 2011-01-18 03:00:00
3 2011-01-18 02:00:00
4 2011-01-18 20:00:00
Name: example_date, dtype: object
Removing identifier columns:
Parsing dates in column example_date. Will show some examples when done:
0 2011-01-03 02:00:00
1 2011-01-15 00:00:00
2 2011-01-18 03:00:00
3 2011-01-18 02:00:00
4 2011-01-18 20:00:00
Name: example_date, dtype: datetime64[ns]
We start with two datasets: define the following datasets:
To develop the final model, we divide the labelled dataset into three parts:
In the 3 split way, there is only one validation set, which is only part of the full dataset. A different approach is to use cross validation, with multiple validation sets, in total covering the full dataset, such that a more accurate estimate of the performance can be determined. Therefore we also define :
Let's now divide the datasets and create a cross fold generator
In [21]:
#from bdranalytics.model_selection.growingwindow import IntervalGrowingWindow
from sklearn.model_selection import GroupShuffleSplit, GridSearchCV
cv_dev = GroupShuffleSplit(n_splits=5, test_size=0.1, train_size=0.3, random_state=11)
def train_test_split(X, y, groups):
"""Splits the labelled set X (by y) in two parts, one for training, one for testing.
The function centralizes the decisions related to the specific dataset in this notebook.
It takes care of required ordering (if applicable) and stratification (if applicable)"""
cv_test = GroupShuffleSplit(n_splits=1, test_size=0.1, train_size=None, random_state=22)
return next(cv_test.split(X, y, groups=groups)) # take the first (and only) split
We just created cv_dev , which creates k fold cross validation sets within the dev dataset.
First we divide the full labelled dataset into a dev and a test set
In [22]:
dev_indices, test_indices = train_test_split(X, y, groups)
X_dev = X.iloc[dev_indices,:]
y_dev = y[dev_indices]
groups_dev = groups[dev_indices]
X_test = X.iloc[test_indices]
y_test = y[test_indices]
groups_test = groups[test_indices]
Next we divide the dev set into a train and validate set
In [23]:
train_indices, validate_indices = train_test_split(X_dev, y_dev, groups_dev)
X_train = X_dev.iloc[train_indices,]
y_train = y_dev[train_indices]
groups_train = groups_dev[train_indices]
X_validate = X_dev.iloc[validate_indices,]
y_validate = y_dev[validate_indices]
groups_validate = groups_dev[validate_indices]
In [24]:
from bdranalytics.pandaspipeline.transformers import PdFeatureUnion
from bdranalytics.pdpreprocessing.encoders import DateOneHotEncoding, DateCyclicalEncoding
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import VarianceThreshold
In [25]:
transformer_steps = [
# Weight of evidence needs to be BEFORE FeatureUnion, because it needs a dataframe
# It also works on ALL columns, and replaces the transformed columns (so no columnselector necessary)
WeightOfEvidenceEncoder(cols=highlycategoricals),
PdFeatureUnion([ # Union of features with retention of column names
# one hot encoding of dates
('dates1', Pipeline([
('selector', ColumnSelector(columns=dates)),
('dateencoder', DateOneHotEncoding(dates, drop=True, parts=['DAY_OF_WEEK']))
])),
# cyclical encoding of dates
('dates2', Pipeline([
('selector', ColumnSelector(columns=dates)),
('dateencoder', DateCyclicalEncoding(dates, drop=True, parts=['DAY'])), # drops the original dates
])),
# ('booleans', Pipeline([
# ('selector', ColumnSelector(columns=booleans)),
# ])), # booleans close
('categoricals', Pipeline([
('selector', ColumnSelector(columns=categoricals)),
('labeler', StringIndexer()),
# ('labeler', LabelEncoder()),
('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])), # categoricals close
('numericals', Pipeline([
('selector', ColumnSelector(columns=numericals)),
('imputer', Imputer()),
('scaler', StandardScaler())
])), # numericals close
]),
# TODO: unfortunately the Imputer kills the column names
Imputer(missing_values='NaN', strategy='mean'), # you probably want something better
VarianceThreshold(threshold=0.0), # remove constant columns
MaxAbsScaler() # scale the features while preserving sparsity to allow regularization
]
transformer = make_pipeline(*transformer_steps)
In [26]:
transformer = transformer.fit(X_dev, y_dev)
In [27]:
X_dev_transformed = pd.DataFrame(transformer.transform(X_dev)) #.iloc[:,:])
print(X_dev_transformed.columns)
RangeIndex(start=0, stop=28, step=1)
In [28]:
assert len(X_dev)==len(X_dev_transformed), "length shouldnt change, means concat went wrong"
In [29]:
profile_dev = ProfileReport(X_dev_transformed, correlation_overrides=list(X_dev_transformed.columns))
profile_dev
Out[29]:
Overview
Dataset info
Number of variables
28
Number of observations
356
Total Missing (%)
0.0%
Total size in memory
77.9 KiB
Average record size in memory
224.2 B
Variables types
Numeric
28
Categorical
0
Date
0
Text (Unique)
0
Rejected
0
Warnings
0
has 274 / 77.0% zeros1
has 265 / 74.4% zeros2
has 267 / 75.0% zeros3
has 272 / 76.4% zeros4
has 271 / 76.1% zeros5
has 263 / 73.9% zeros6
has 266 / 74.7% zeros9
has 276 / 77.5% zeros10
has 300 / 84.3% zeros11
has 309 / 86.8% zeros12
has 301 / 84.6% zeros13
has 294 / 82.6% zeros14
has 300 / 84.3% zeros
Variables
0
Numeric
Distinct count
3
Unique (%)
0.8%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.1246
Minimum
0
Maximum
1
Zeros (%)
77.0%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0
Descriptive statistics
Standard deviation
0.31011
Coef of variation
2.4889
Kurtosis
4.0894
Mean
0.1246
MAD
0.1918
Skewness
2.4347
Sum
44.358
Variance
0.096171
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
274
77.0%
0.124600638978
43
12.1%
1.0
39
11.0%
Minimum 5 values
Value
Count
Frequency (%)
0.0
274
77.0%
0.124600638978
43
12.1%
1.0
39
11.0%
Maximum 5 values
Value
Count
Frequency (%)
0.0
274
77.0%
0.124600638978
43
12.1%
1.0
39
11.0%
1
Numeric
Distinct count
3
Unique (%)
0.8%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.15335
Minimum
0
Maximum
1
Zeros (%)
74.4%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0.15335
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0.15335
Descriptive statistics
Standard deviation
0.33834
Coef of variation
2.2063
Kurtosis
2.3983
Mean
0.15335
MAD
0.22831
Skewness
2.0607
Sum
54.594
Variance
0.11448
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
265
74.4%
1.0
48
13.5%
0.153354632588
43
12.1%
Minimum 5 values
Value
Count
Frequency (%)
0.0
265
74.4%
0.153354632588
43
12.1%
1.0
48
13.5%
Maximum 5 values
Value
Count
Frequency (%)
0.0
265
74.4%
0.153354632588
43
12.1%
1.0
48
13.5%
2
Numeric
Distinct count
3
Unique (%)
0.8%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.14696
Minimum
0
Maximum
1
Zeros (%)
75.0%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0.036741
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0.036741
Descriptive statistics
Standard deviation
0.33247
Coef of variation
2.2622
Kurtosis
2.7151
Mean
0.14696
MAD
0.22045
Skewness
2.1357
Sum
52.319
Variance
0.11053
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
267
75.0%
1.0
46
12.9%
0.14696485623
43
12.1%
Minimum 5 values
Value
Count
Frequency (%)
0.0
267
75.0%
0.14696485623
43
12.1%
1.0
46
12.9%
Maximum 5 values
Value
Count
Frequency (%)
0.0
267
75.0%
0.14696485623
43
12.1%
1.0
46
12.9%
3
Numeric
Distinct count
3
Unique (%)
0.8%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.13099
Minimum
0
Maximum
1
Zeros (%)
76.4%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0
Descriptive statistics
Standard deviation
0.3168
Coef of variation
2.4185
Kurtosis
3.6474
Mean
0.13099
MAD
0.20017
Skewness
2.3427
Sum
46.633
Variance
0.10036
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
272
76.4%
0.130990415335
43
12.1%
1.0
41
11.5%
Minimum 5 values
Value
Count
Frequency (%)
0.0
272
76.4%
0.130990415335
43
12.1%
1.0
41
11.5%
Maximum 5 values
Value
Count
Frequency (%)
0.0
272
76.4%
0.130990415335
43
12.1%
1.0
41
11.5%
4
Numeric
Distinct count
3
Unique (%)
0.8%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.13419
Minimum
0
Maximum
1
Zeros (%)
76.1%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0
Descriptive statistics
Standard deviation
0.32005
Coef of variation
2.3852
Kurtosis
3.4427
Mean
0.13419
MAD
0.20429
Skewness
2.2989
Sum
47.77
Variance
0.10243
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
271
76.1%
0.134185303514
43
12.1%
1.0
42
11.8%
Minimum 5 values
Value
Count
Frequency (%)
0.0
271
76.1%
0.134185303514
43
12.1%
1.0
42
11.8%
Maximum 5 values
Value
Count
Frequency (%)
0.0
271
76.1%
0.134185303514
43
12.1%
1.0
42
11.8%
5
Numeric
Distinct count
3
Unique (%)
0.8%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.15974
Minimum
0
Maximum
1
Zeros (%)
73.9%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0.15974
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0.15974
Descriptive statistics
Standard deviation
0.34401
Coef of variation
2.1535
Kurtosis
2.1078
Mean
0.15974
MAD
0.23603
Skewness
1.9893
Sum
56.869
Variance
0.11835
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
263
73.9%
1.0
50
14.0%
0.159744408946
43
12.1%
Minimum 5 values
Value
Count
Frequency (%)
0.0
263
73.9%
0.159744408946
43
12.1%
1.0
50
14.0%
Maximum 5 values
Value
Count
Frequency (%)
0.0
263
73.9%
0.159744408946
43
12.1%
1.0
50
14.0%
6
Numeric
Distinct count
3
Unique (%)
0.8%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.15016
Minimum
0
Maximum
1
Zeros (%)
74.7%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0.15016
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0.15016
Descriptive statistics
Standard deviation
0.33543
Coef of variation
2.2338
Kurtosis
2.5532
Mean
0.15016
MAD
0.2244
Skewness
2.0977
Sum
53.457
Variance
0.11251
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
266
74.7%
1.0
47
13.2%
0.150159744409
43
12.1%
Minimum 5 values
Value
Count
Frequency (%)
0.0
266
74.7%
0.150159744409
43
12.1%
1.0
47
13.2%
Maximum 5 values
Value
Count
Frequency (%)
0.0
266
74.7%
0.150159744409
43
12.1%
1.0
47
13.2%
7
Numeric
Distinct count
22
Unique (%)
6.2%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.48843
Minimum
0.045551
Maximum
1
Zeros (%)
0.0%
Quantile statistics
Minimum
0.045551
5-th percentile
0.091101
Q1
0.26188
Median
0.5008
Q3
0.728
95-th percentile
0.90943
Maximum
1
Range
0.95445
Interquartile range
0.46612
Descriptive statistics
Standard deviation
0.27165
Coef of variation
0.55616
Kurtosis
-1.1746
Mean
0.48843
MAD
0.23408
Skewness
0.040184
Sum
173.88
Variance
0.073792
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0911010720924
21
5.9%
0.182192539203
20
5.6%
0.682594879339
20
5.6%
0.455313282094
19
5.3%
0.500798398573
18
5.1%
0.591727832524
18
5.1%
0.227731669583
18
5.1%
0.273264797363
17
4.8%
0.864103497917
17
4.8%
0.318790722373
17
4.8%
Other values (12)
171
48.0%
Minimum 5 values
Value
Count
Frequency (%)
0.0455511363694
16
4.5%
0.0911010720924
21
5.9%
0.136648606554
14
3.9%
0.182192539203
20
5.6%
0.227731669583
18
5.1%
Maximum 5 values
Value
Count
Frequency (%)
0.818757518805
17
4.8%
0.864103497917
17
4.8%
0.909426700807
14
3.9%
0.954725932837
13
3.7%
1.0
1
0.3%
8
Numeric
Distinct count
22
Unique (%)
6.2%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.99803
Minimum
0.99364
Maximum
1
Zeros (%)
0.0%
Quantile statistics
Minimum
0.99364
5-th percentile
0.99475
Q1
0.99664
Median
0.99842
Q3
0.99957
95-th percentile
0.99996
Maximum
1
Range
0.0063588
Interquartile range
0.0029338
Descriptive statistics
Standard deviation
0.0017609
Coef of variation
0.0017644
Kurtosis
-0.73504
Mean
0.99803
MAD
0.0015017
Skewness
-0.68239
Sum
355.3
Variance
3.1008e-06
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.999960462511
21
5.9%
0.997049300241
20
5.6%
0.999802317764
20
5.6%
0.998695537952
19
5.3%
0.997786702927
18
5.1%
0.999683714674
18
5.1%
0.998418906817
18
5.1%
0.995259220554
17
4.8%
0.998945845259
17
4.8%
0.999367462696
17
4.8%
Other values (12)
171
48.0%
Minimum 5 values
Value
Count
Frequency (%)
0.993641172776
1
0.3%
0.994206732284
13
3.7%
0.994746086282
14
3.9%
0.995259220554
17
4.8%
0.995746121574
17
4.8%
Maximum 5 values
Value
Count
Frequency (%)
0.999683714674
18
5.1%
0.999802317764
20
5.6%
0.999894567853
14
3.9%
0.999960462511
21
5.9%
1.0
16
4.5%
9
Numeric
Distinct count
2
Unique (%)
0.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.22472
Minimum
0
Maximum
1
Zeros (%)
77.5%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0
Descriptive statistics
Standard deviation
0.41798
Coef of variation
1.86
Kurtosis
-0.24679
Mean
0.22472
MAD
0.34844
Skewness
1.3246
Sum
80
Variance
0.17471
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
276
77.5%
1.0
80
22.5%
Minimum 5 values
Value
Count
Frequency (%)
0.0
276
77.5%
1.0
80
22.5%
Maximum 5 values
Value
Count
Frequency (%)
0.0
276
77.5%
1.0
80
22.5%
10
Numeric
Distinct count
2
Unique (%)
0.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.1573
Minimum
0
Maximum
1
Zeros (%)
84.3%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0
Descriptive statistics
Standard deviation
0.3646
Coef of variation
2.3178
Kurtosis
1.5828
Mean
0.1573
MAD
0.26512
Skewness
1.8905
Sum
56
Variance
0.13293
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
300
84.3%
1.0
56
15.7%
Minimum 5 values
Value
Count
Frequency (%)
0.0
300
84.3%
1.0
56
15.7%
Maximum 5 values
Value
Count
Frequency (%)
0.0
300
84.3%
1.0
56
15.7%
11
Numeric
Distinct count
2
Unique (%)
0.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.13202
Minimum
0
Maximum
1
Zeros (%)
86.8%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0
Descriptive statistics
Standard deviation
0.33899
Coef of variation
2.5677
Kurtosis
2.7823
Mean
0.13202
MAD
0.22919
Skewness
2.1833
Sum
47
Variance
0.11492
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
309
86.8%
1.0
47
13.2%
Minimum 5 values
Value
Count
Frequency (%)
0.0
309
86.8%
1.0
47
13.2%
Maximum 5 values
Value
Count
Frequency (%)
0.0
309
86.8%
1.0
47
13.2%
12
Numeric
Distinct count
2
Unique (%)
0.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.15449
Minimum
0
Maximum
1
Zeros (%)
84.6%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0
Descriptive statistics
Standard deviation
0.36193
Coef of variation
2.3427
Kurtosis
1.696
Mean
0.15449
MAD
0.26125
Skewness
1.92
Sum
55
Variance
0.13099
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
301
84.6%
1.0
55
15.4%
Minimum 5 values
Value
Count
Frequency (%)
0.0
301
84.6%
1.0
55
15.4%
Maximum 5 values
Value
Count
Frequency (%)
0.0
301
84.6%
1.0
55
15.4%
13
Numeric
Distinct count
2
Unique (%)
0.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.17416
Minimum
0
Maximum
1
Zeros (%)
82.6%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0
Descriptive statistics
Standard deviation
0.37978
Coef of variation
2.1807
Kurtosis
0.98338
Mean
0.17416
MAD
0.28765
Skewness
1.7257
Sum
62
Variance
0.14423
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
294
82.6%
1.0
62
17.4%
Minimum 5 values
Value
Count
Frequency (%)
0.0
294
82.6%
1.0
62
17.4%
Maximum 5 values
Value
Count
Frequency (%)
0.0
294
82.6%
1.0
62
17.4%
14
Numeric
Distinct count
2
Unique (%)
0.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0.1573
Minimum
0
Maximum
1
Zeros (%)
84.3%
Quantile statistics
Minimum
0
5-th percentile
0
Q1
0
Median
0
Q3
0
95-th percentile
1
Maximum
1
Range
1
Interquartile range
0
Descriptive statistics
Standard deviation
0.3646
Coef of variation
2.3178
Kurtosis
1.5828
Mean
0.1573
MAD
0.26512
Skewness
1.8905
Sum
56
Variance
0.13293
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0
300
84.3%
1.0
56
15.7%
Minimum 5 values
Value
Count
Frequency (%)
0.0
300
84.3%
1.0
56
15.7%
Maximum 5 values
Value
Count
Frequency (%)
0.0
300
84.3%
1.0
56
15.7%
15
Numeric
Distinct count
355
Unique (%)
99.7%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
2.4949e-18
Minimum
-0.025035
Maximum
1
Zeros (%)
0.0%
Quantile statistics
Minimum
-0.025035
5-th percentile
-0.024861
Q1
-0.024315
Median
-0.023148
Q3
-0.016268
95-th percentile
0.092574
Maximum
1
Range
1.025
Interquartile range
0.008047
Descriptive statistics
Standard deviation
0.08708
Coef of variation
3.4903e+16
Kurtosis
68.083
Mean
2.4949e-18
MAD
0.036628
Skewness
7.347
Sum
8.8818e-16
Variance
0.0075829
Memory size
2.9 KiB
Value
Count
Frequency (%)
-0.0249350663306
2
0.6%
0.151027317027
1
0.3%
-0.0244113149272
1
0.3%
-0.0245527946288
1
0.3%
0.188012507422
1
0.3%
0.00311339972164
1
0.3%
-0.0211901170633
1
0.3%
-0.0213080936223
1
0.3%
-0.0243402294419
1
0.3%
-0.0240121071372
1
0.3%
Other values (345)
345
96.9%
Minimum 5 values
Value
Count
Frequency (%)
-0.0250351851097
1
0.3%
-0.0250036171632
1
0.3%
-0.0249817269814
1
0.3%
-0.0249581086273
1
0.3%
-0.0249513111498
1
0.3%
Maximum 5 values
Value
Count
Frequency (%)
0.263451834626
1
0.3%
0.416748778069
1
0.3%
0.453356074799
1
0.3%
0.75741761328
1
0.3%
1.0
1
0.3%
16
Numeric
Distinct count
26
Unique (%)
7.3%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
-9.9795e-18
Minimum
-0.1588
Maximum
1
Zeros (%)
0.0%
Quantile statistics
Minimum
-0.1588
5-th percentile
-0.1588
Q1
-0.1588
Median
-0.1588
Q3
0.072962
95-th percentile
0.76824
Maximum
1
Range
1.1588
Interquartile range
0.23176
Descriptive statistics
Standard deviation
0.28723
Coef of variation
-2.8782e+16
Kurtosis
2.6651
Mean
-9.9795e-18
MAD
0.21644
Skewness
1.9105
Sum
-3.5527e-15
Variance
0.082501
Memory size
2.9 KiB
Value
Count
Frequency (%)
-0.158797584753
242
68.0%
0.0729619321973
18
5.1%
0.768240483049
11
3.1%
0.0961378838924
9
2.5%
0.130901811435
8
2.2%
0.304721449148
7
2.0%
-0.0139478866592
7
2.0%
0.188841690673
6
1.7%
0.362661328386
6
1.7%
0.884120241525
4
1.1%
Other values (16)
38
10.7%
Minimum 5 values
Value
Count
Frequency (%)
-0.158797584753
242
68.0%
-0.0139478866592
7
2.0%
0.0439919925785
1
0.3%
0.0497859805023
1
0.3%
0.0729619321973
18
5.1%
Maximum 5 values
Value
Count
Frequency (%)
0.797210422668
2
0.6%
0.826180362287
2
0.6%
0.884120241525
4
1.1%
0.942060120762
3
0.8%
1.0
1
0.3%
17
Numeric
Distinct count
72
Unique (%)
20.2%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
3.4928e-17
Minimum
-0.56937
Maximum
1
Zeros (%)
0.0%
Quantile statistics
Minimum
-0.56937
5-th percentile
-0.47156
Q1
-0.3183
Median
-0.1479
Q3
0.38664
95-th percentile
0.76575
Maximum
1
Range
1.5694
Interquartile range
0.70494
Descriptive statistics
Standard deviation
0.40788
Coef of variation
1.1678e+16
Kurtosis
-0.71341
Mean
3.4928e-17
MAD
0.34473
Skewness
0.74768
Sum
1.2434e-14
Variance
0.16636
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.529623828566
43
12.1%
0.621830018463
25
7.0%
-0.0908989088491
18
5.1%
-0.21176377939
17
4.8%
0.765746436478
14
3.9%
0.0187516953528
12
3.4%
-0.350696079032
11
3.1%
-0.0647322873918
10
2.8%
-0.232946282474
9
2.5%
-0.167529728831
9
2.5%
Other values (62)
188
52.8%
Minimum 5 values
Value
Count
Frequency (%)
-0.569374272639
1
0.3%
-0.551929858334
1
0.3%
-0.522648162894
1
0.3%
-0.520156103707
2
0.6%
-0.515795000131
1
0.3%
Maximum 5 values
Value
Count
Frequency (%)
0.338981300806
2
0.6%
0.529623828566
43
12.1%
0.621830018463
25
7.0%
0.765746436478
14
3.9%
1.0
7
2.0%
18
Numeric
Distinct count
2
Unique (%)
0.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
9.9795e-18
Minimum
-0.098765
Maximum
1
Zeros (%)
0.0%
Quantile statistics
Minimum
-0.098765
5-th percentile
-0.098765
Q1
-0.098765
Median
-0.098765
Q3
-0.098765
95-th percentile
1
Maximum
1
Range
1.0988
Interquartile range
0
Descriptive statistics
Standard deviation
0.31471
Coef of variation
3.1536e+16
Kurtosis
6.3291
Mean
9.9795e-18
MAD
0.17978
Skewness
2.8799
Sum
3.5527e-15
Variance
0.099044
Memory size
2.9 KiB
Value
Count
Frequency (%)
-0.0987654320988
324
91.0%
1.0
32
9.0%
Minimum 5 values
Value
Count
Frequency (%)
-0.0987654320988
324
91.0%
1.0
32
9.0%
Maximum 5 values
Value
Count
Frequency (%)
-0.0987654320988
324
91.0%
1.0
32
9.0%
19
Numeric
Distinct count
67
Unique (%)
18.8%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
4.9898e-17
Minimum
-0.43461
Maximum
1
Zeros (%)
0.0%
Quantile statistics
Minimum
-0.43461
5-th percentile
-0.36672
Q1
-0.26636
Median
-0.074484
Q3
0.2148
95-th percentile
0.70186
Maximum
1
Range
1.4346
Interquartile range
0.48116
Descriptive statistics
Standard deviation
0.33825
Coef of variation
6.7789e+15
Kurtosis
1.1214
Mean
4.9898e-17
MAD
0.26237
Skewness
1.2253
Sum
1.7764e-14
Variance
0.11442
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0170247062218
19
5.3%
-0.281114947447
17
4.8%
0.270886193504
14
3.9%
1.0
14
3.9%
-0.127617303974
13
3.7%
0.474565758881
12
3.4%
0.0347359727764
12
3.4%
0.214800516081
11
3.1%
-0.0361090934419
10
2.8%
-0.0744835043101
10
2.8%
Other values (57)
224
62.9%
Minimum 5 values
Value
Count
Frequency (%)
-0.434612590919
1
0.3%
-0.422805079883
1
0.3%
-0.413949446606
1
0.3%
-0.408045691088
1
0.3%
-0.396238180051
2
0.6%
Maximum 5 values
Value
Count
Frequency (%)
0.474565758881
12
3.4%
0.495228903195
7
2.0%
0.548362702859
2
0.6%
0.701860346332
8
2.2%
1.0
14
3.9%
20
Numeric
Distinct count
337
Unique (%)
94.7%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
-1.5219e-16
Minimum
-1
Maximum
0.885
Zeros (%)
0.0%
Quantile statistics
Minimum
-1
5-th percentile
-0.34193
Q1
-0.15953
Median
-0.042691
Q3
0.11758
95-th percentile
0.53926
Maximum
0.885
Range
1.885
Interquartile range
0.27712
Descriptive statistics
Standard deviation
0.26563
Coef of variation
-1.7454e+15
Kurtosis
1.7699
Mean
-1.5219e-16
MAD
0.19524
Skewness
0.52665
Sum
-5.4179e-14
Variance
0.07056
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.230541659015
2
0.6%
0.224401592035
2
0.6%
0.0271970878308
2
0.6%
0.110268582276
2
0.6%
-0.133167014489
2
0.6%
0.308917808122
2
0.6%
-0.0750169683774
2
0.6%
-0.0800734941263
2
0.6%
0.0315312527583
2
0.6%
-0.222739756325
2
0.6%
Other values (327)
336
94.4%
Minimum 5 values
Value
Count
Frequency (%)
-1.0
1
0.3%
-0.89092351599
1
0.3%
-0.791598903067
1
0.3%
-0.605952172003
1
0.3%
-0.515295888935
1
0.3%
Maximum 5 values
Value
Count
Frequency (%)
0.738722496771
1
0.3%
0.747029646216
1
0.3%
0.857550851868
1
0.3%
0.865135640492
1
0.3%
0.885000563076
1
0.3%
21
Numeric
Distinct count
278
Unique (%)
78.1%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
1.447e-16
Minimum
-1
Maximum
0.59138
Zeros (%)
0.0%
Quantile statistics
Minimum
-1
5-th percentile
-0.78899
Q1
-0.42433
Median
0.07922
Q3
0.4574
95-th percentile
0.59138
Maximum
0.59138
Range
1.5914
Interquartile range
0.88173
Descriptive statistics
Standard deviation
0.47766
Coef of variation
3.3009e+15
Kurtosis
-1.2288
Mean
1.447e-16
MAD
0.42404
Skewness
-0.35962
Sum
5.1514e-14
Variance
0.22816
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.591378246738
23
6.5%
0.525821985287
3
0.8%
-0.696802290786
3
0.8%
0.529099798359
3
0.8%
0.54221105065
3
0.8%
-0.519800384867
3
0.8%
0.573350274839
3
0.8%
0.306208509424
2
0.6%
0.470099163053
2
0.6%
-0.537828356766
2
0.6%
Other values (268)
309
86.8%
Minimum 5 values
Value
Count
Frequency (%)
-1.0
1
0.3%
-0.949193897375
1
0.3%
-0.945916084302
1
0.3%
-0.940999364694
1
0.3%
-0.939360458157
2
0.6%
Maximum 5 values
Value
Count
Frequency (%)
0.566794648694
1
0.3%
0.571711368303
2
0.6%
0.573350274839
3
0.8%
0.576628087912
1
0.3%
0.591378246738
23
6.5%
22
Numeric
Distinct count
287
Unique (%)
80.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
-5.4887e-17
Minimum
-0.37867
Maximum
1
Zeros (%)
0.0%
Quantile statistics
Minimum
-0.37867
5-th percentile
-0.33654
Q1
-0.23354
Median
-0.03851
Q3
0.17854
95-th percentile
0.4813
Maximum
1
Range
1.3787
Interquartile range
0.41207
Descriptive statistics
Standard deviation
0.27112
Coef of variation
-4.9395e+15
Kurtosis
0.032494
Mean
-5.4887e-17
MAD
0.22299
Skewness
0.73243
Sum
-1.954e-14
Variance
0.073504
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.142579037862
4
1.1%
0.196938864915
4
1.1%
0.461077978427
3
0.8%
0.158701440443
3
0.8%
0.0715953338087
3
0.8%
0.334067044876
3
0.8%
0.168204380689
3
0.8%
0.397064768802
3
0.8%
-0.0820940388496
3
0.8%
0.292081099119
3
0.8%
Other values (277)
324
91.0%
Minimum 5 values
Value
Count
Frequency (%)
-0.37866601043
1
0.3%
-0.377738282754
1
0.3%
-0.373713950539
1
0.3%
-0.373074570655
1
0.3%
-0.369539175999
1
0.3%
Maximum 5 values
Value
Count
Frequency (%)
0.63565376065
1
0.3%
0.635979719022
1
0.3%
0.806832053681
2
0.6%
0.822452981843
1
0.3%
1.0
1
0.3%
23
Numeric
Distinct count
9
Unique (%)
2.5%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
0
Minimum
-0.34098
Maximum
1
Zeros (%)
0.0%
Quantile statistics
Minimum
-0.34098
5-th percentile
-0.28267
Q1
-0.16607
Median
-0.10776
Q3
-0.04946
95-th percentile
1
Maximum
1
Range
1.341
Interquartile range
0.11661
Descriptive statistics
Standard deviation
0.38178
Coef of variation
Kurtosis
2.8388
Mean
0
MAD
0.25099
Skewness
2.0909
Sum
0
Variance
0.14576
Memory size
2.9 KiB
Value
Count
Frequency (%)
-0.107762856207
99
27.8%
-0.166066164428
91
25.6%
1.0
43
12.1%
-0.22436947265
34
9.6%
0.0671470684573
23
6.5%
-0.282672780871
20
5.6%
-0.0494595479856
17
4.8%
0.00884376023583
15
4.2%
-0.340976089093
14
3.9%
Minimum 5 values
Value
Count
Frequency (%)
-0.340976089093
14
3.9%
-0.282672780871
20
5.6%
-0.22436947265
34
9.6%
-0.166066164428
91
25.6%
-0.107762856207
99
27.8%
Maximum 5 values
Value
Count
Frequency (%)
-0.107762856207
99
27.8%
-0.0494595479856
17
4.8%
0.00884376023583
15
4.2%
0.0671470684573
23
6.5%
1.0
43
12.1%
24
Numeric
Distinct count
62
Unique (%)
17.4%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
-1.9959e-17
Minimum
-0.53225
Maximum
1
Zeros (%)
0.0%
Quantile statistics
Minimum
-0.53225
5-th percentile
-0.43948
Q1
-0.24755
Median
-0.14839
Q3
0.1587
95-th percentile
1
Maximum
1
Range
1.5322
Interquartile range
0.40625
Descriptive statistics
Standard deviation
0.42332
Coef of variation
-2.1209e+16
Kurtosis
1.0596
Mean
-1.9959e-17
MAD
0.31984
Skewness
1.3871
Sum
-7.1054e-15
Variance
0.1792
Memory size
2.9 KiB
Value
Count
Frequency (%)
1.0
43
12.1%
-0.148387096774
35
9.8%
0.158702488993
25
7.0%
-0.157983646329
14
3.9%
0.267463383952
14
3.9%
0.142708239734
12
3.4%
-0.285937640399
11
3.1%
0.0979243418097
10
2.8%
-0.244352592326
9
2.5%
-0.0748135501842
9
2.5%
Other values (52)
174
48.9%
Minimum 5 values
Value
Count
Frequency (%)
-0.532249078983
1
0.3%
-0.529050229131
7
2.0%
-0.513055979872
7
2.0%
-0.497061730614
1
0.3%
-0.439482433282
4
1.1%
Maximum 5 values
Value
Count
Frequency (%)
0.24507143499
2
0.6%
0.251469134693
8
2.2%
0.267463383952
14
3.9%
0.369826579207
1
0.3%
1.0
43
12.1%
25
Numeric
Distinct count
45
Unique (%)
12.6%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
4.4908e-17
Minimum
-1
Maximum
0.71857
Zeros (%)
0.0%
Quantile statistics
Minimum
-1
5-th percentile
-0.61606
Q1
-0.26869
Median
0.060394
Q3
0.38948
95-th percentile
0.53574
Maximum
0.71857
Range
1.7186
Interquartile range
0.65818
Descriptive statistics
Standard deviation
0.40157
Coef of variation
8.942e+15
Kurtosis
-0.61801
Mean
4.4908e-17
MAD
0.33011
Skewness
-0.47182
Sum
1.5987e-14
Variance
0.16126
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.389482333607
50
14.0%
-0.61606409203
29
8.1%
-0.0493015612161
21
5.9%
0.535743631882
18
5.1%
-0.122432210353
17
4.8%
0.18837304848
16
4.5%
0.0603944124897
15
4.2%
-0.268693508628
15
4.2%
0.0969597370583
14
3.9%
0.57230895645
14
3.9%
Other values (35)
147
41.3%
Minimum 5 values
Value
Count
Frequency (%)
-1.0
1
0.3%
-0.926869350863
11
3.1%
-0.817173377157
1
0.3%
-0.670912078882
1
0.3%
-0.61606409203
29
8.1%
Maximum 5 values
Value
Count
Frequency (%)
0.517460969597
10
2.8%
0.535743631882
18
5.1%
0.554026294166
1
0.3%
0.57230895645
14
3.9%
0.718570254725
2
0.6%
26
Numeric
Distinct count
245
Unique (%)
68.8%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
2.7444e-17
Minimum
-1
Maximum
0.053632
Zeros (%)
0.0%
Quantile statistics
Minimum
-1
5-th percentile
-0.13502
Q1
0.0028647
Median
0.04011
Q3
0.053632
95-th percentile
0.053632
Maximum
0.053632
Range
1.0536
Interquartile range
0.050767
Descriptive statistics
Standard deviation
0.13231
Coef of variation
4.8211e+15
Kurtosis
30.498
Mean
2.7444e-17
MAD
0.063585
Skewness
-5.1699
Sum
9.77e-15
Variance
0.017506
Memory size
2.9 KiB
Value
Count
Frequency (%)
0.0536319305703
90
25.3%
0.0482684561614
3
0.8%
0.0494962394599
2
0.6%
0.0403847949821
2
0.6%
-0.0104389715554
2
0.6%
0.047848425033
2
0.6%
0.0304009781607
2
0.6%
0.0493023789391
2
0.6%
-0.0180641520404
2
0.6%
0.0403201748085
2
0.6%
Other values (235)
247
69.4%
Minimum 5 values
Value
Count
Frequency (%)
-1.0
1
0.3%
-0.944394340616
1
0.3%
-0.942391115234
1
0.3%
-0.804136253814
1
0.3%
-0.681842575274
1
0.3%
Maximum 5 values
Value
Count
Frequency (%)
0.0514994648415
1
0.3%
0.0516287051887
1
0.3%
0.0516933253623
1
0.3%
0.0520810464039
1
0.3%
0.0536319305703
90
25.3%
27
Numeric
Distinct count
329
Unique (%)
92.4%
Missing (%)
0.0%
Missing (n)
0
Infinite (%)
0.0%
Infinite (n)
0
Mean
-5.2393e-17
Minimum
-0.36082
Maximum
1
Zeros (%)
0.0%
Quantile statistics
Minimum
-0.36082
5-th percentile
-0.29511
Q1
-0.18696
Median
-0.064173
Q3
0.12677
95-th percentile
0.54545
Maximum
1
Range
1.3608
Interquartile range
0.31373
Descriptive statistics
Standard deviation
0.2541
Coef of variation
-4.8499e+15
Kurtosis
1.363
Mean
-5.2393e-17
MAD
0.19815
Skewness
1.2053
Sum
-1.8652e-14
Variance
0.064566
Memory size
2.9 KiB
Value
Count
Frequency (%)
-0.123502210821
3
0.8%
-0.186962061633
3
0.8%
-0.173443986904
2
0.6%
0.0409676983832
2
0.6%
-0.277458061903
2
0.6%
0.0623713167044
2
0.6%
-0.121624700442
2
0.6%
-0.146407837446
2
0.6%
-0.148285347825
2
0.6%
-0.201231140514
2
0.6%
Other values (319)
334
93.8%
Minimum 5 values
Value
Count
Frequency (%)
-0.360819522733
1
0.3%
-0.351431970837
1
0.3%
-0.333032369123
1
0.3%
-0.31801228609
1
0.3%
-0.317636784015
1
0.3%
Maximum 5 values
Value
Count
Frequency (%)
0.731140513721
1
0.3%
0.775449758666
1
0.3%
0.866321261012
1
0.3%
0.879839335741
1
0.3%
1.0
1
0.3%
Sample
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.136649
0.999895
0.0
1.0
0.0
0.0
0.0
0.0
-0.025035
0.049786
-0.454117
-0.098765
0.017025
0.088598
0.021039
-0.007525
-0.340976
-0.183574
-0.506368
0.053632
-0.238781
1
0.0
0.0
0.0
0.0
0.0
1.0
0.0
0.682595
0.997049
0.0
0.0
0.0
0.0
0.0
1.0
-0.024793
-0.158798
-0.157561
-0.098765
-0.186655
0.032976
0.245569
0.102436
-0.282673
-0.356312
-0.049302
0.053632
-0.082572
2
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.818758
0.995746
0.0
0.0
0.0
0.0
1.0
0.0
-0.024794
-0.158798
-0.157561
-0.098765
-0.186655
0.308918
-0.046156
0.102436
-0.282673
-0.356312
-0.049302
0.040482
-0.274454
3
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.818758
0.995746
0.0
0.0
1.0
0.0
0.0
0.0
-0.024735
-0.158798
-0.462216
-0.098765
-0.219126
0.241377
-0.296909
0.239727
-0.224369
-0.420289
0.115242
0.046298
-0.315384
4
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.136649
0.999895
1.0
0.0
0.0
0.0
0.0
0.0
-0.024764
-0.158798
-0.462216
-0.098765
-0.219126
0.036227
-0.085490
0.239727
-0.224369
-0.420289
0.115242
0.044650
-0.230145
Create a cross-validation routine for parameter tuning
Let's define our search space
In [30]:
from sklearn.linear_model import SGDRegressor
from sklearn.dummy import DummyRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
# specification of different model types and their defaults
# We use factory methods to prevent having to hold all models in memory
model_instances = {'lr': lambda : SGDRegressor(loss='squared_loss'),
'xgb': lambda: XGBRegressor(n_estimators=100,
objective='reg:linear', #'reg:gamma'
# max_depth=9,
# reg_alpha=15,
# reg_lambda=5,
# gamma=0.5
),
'gb': lambda : GradientBoostingRegressor(n_estimators=10),
'rf' : lambda : RandomForestRegressor(),
'dummy': lambda : DummyRegressor(strategy='mean')
}
# specification of the different model hyperparameters and tuning space
model_params_grid = {'lr': {'alpha': [1e-5, 1e-4, 1e-3]},
'xgb': {
'max_depth': [3,6,9],
'reg_alpha': [0,5,15],
'reg_lambda': [0,5,15],
'gamma' : [0,0.5,5,50],
},
'gb' : {'n_estimators' : [10, 50, 100],
'max_depth' : [3, 5, 7, 10]},
'rf' : {'n_estimators' : [10],
'max_depth' : [3, 5, None]},
'dummy': {'strategy':['mean','median']}
}
/Users/gerben/anaconda3/envs/bdranalytics/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)
Now we have pipeline steps for feature generation, and different final steps for model choices, each with a different parameter tuning grid
In [31]:
n_jobs = 1 # note that this also affects used memory
def change_param_keys(named_step, param_dict):
return {'regressor__{}'.format(key):val for key,val in param_dict.items()}
def grid_search(model_name, model_instance):
print "=== Tuning parameters for {} ===".format(model_name)
estimator = Pipeline([('featurizer', make_pipeline(*transformer_steps)), ('regressor', model_instance)])
estimator_param_grid = change_param_keys('regressor',model_params_grid[model_name])
gridsearch = GridSearchCV(estimator,
cv=cv_dev, n_jobs=n_jobs, refit=False,
param_grid=estimator_param_grid,
scoring=model_scorer,
verbose=1)
gridsearch.fit(X_dev, y_dev, groups=groups_dev)
# show grid search results
display(pd.DataFrame(gridsearch.cv_results_).sort_values('rank_test_score').head())
print gridsearch.best_params_
return gridsearch
In [32]:
model_tune = {}
for (model_name, model_instance_create) in model_instances.items():
gridsearch = grid_search(model_name, model_instance_create())
model_tune[model_name] = {
'best_params' : gridsearch.best_params_,
'best_score' : gridsearch.best_score_
}
# set best parameters for estimator
#estimator.set_params(**grid_search.best_params_)
=== Tuning parameters for xgb ===
Fitting 5 folds for each of 108 candidates, totalling 540 fits
[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed: 49.6s finished
mean_fit_time
mean_score_time
mean_test_score
mean_train_score
param_regressor__gamma
param_regressor__max_depth
param_regressor__reg_alpha
param_regressor__reg_lambda
params
rank_test_score
...
split2_test_score
split2_train_score
split3_test_score
split3_train_score
split4_test_score
split4_train_score
std_fit_time
std_score_time
std_test_score
std_train_score
55
0.042671
0.013428
-4.388511
-1.433428
5
3
0
5
{u'regressor__gamma': 5, u'regressor__reg_lamb...
1
...
-4.048043
-1.243772
-3.245698
-1.481906
-4.663778
-1.481906
0.001404
0.000545
0.677906
0.099037
64
0.050766
0.013455
-4.395936
-1.284148
5
6
0
5
{u'regressor__gamma': 5, u'regressor__reg_lamb...
2
...
-3.999686
-1.140479
-3.301801
-1.354247
-4.595167
-1.354247
0.004127
0.000614
0.679038
0.082119
73
0.060989
0.013164
-4.413423
-1.270211
5
9
0
5
{u'regressor__gamma': 5, u'regressor__reg_lamb...
3
...
-3.999686
-1.140479
-3.344815
-1.313139
-4.603852
-1.313139
0.002312
0.000450
0.672446
0.075671
1
0.045600
0.014454
-4.429830
-1.178690
0
3
0
5
{u'regressor__gamma': 0, u'regressor__reg_lamb...
4
...
-4.070631
-0.638417
-3.398927
-1.308871
-4.702658
-1.308871
0.001869
0.001225
0.625460
0.275317
0
0.046628
0.022545
-4.438206
-0.289423
0
3
0
0
{u'regressor__gamma': 0, u'regressor__reg_lamb...
5
...
-4.141463
-0.231305
-3.504854
-0.281575
-4.353119
-0.281575
0.005129
0.017709
0.696072
0.035388
5 rows × 24 columns
{'regressor__gamma': 5, 'regressor__reg_lambda': 5, 'regressor__reg_alpha': 0, 'regressor__max_depth': 3}
=== Tuning parameters for rf ===
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[Parallel(n_jobs=1)]: Done 15 out of 15 | elapsed: 1.7s finished
mean_fit_time
mean_score_time
mean_test_score
mean_train_score
param_regressor__max_depth
param_regressor__n_estimators
params
rank_test_score
split0_test_score
split0_train_score
...
split2_test_score
split2_train_score
split3_test_score
split3_train_score
split4_test_score
split4_train_score
std_fit_time
std_score_time
std_test_score
std_train_score
2
0.059041
0.018021
-4.759744
-2.512890
None
10
{u'regressor__n_estimators': 10, u'regressor__...
1
-4.771597
-2.572566
...
-3.951797
-1.882513
-4.288435
-2.685051
-4.984095
-2.570741
0.002747
0.000303
0.656705
0.331659
1
0.059661
0.020213
-5.004089
-2.413691
5
10
{u'regressor__n_estimators': 10, u'regressor__...
2
-4.545730
-2.077193
...
-4.036492
-2.551206
-5.263249
-2.223789
-4.973571
-2.502379
0.003040
0.001467
0.756992
0.230732
0
0.067759
0.021241
-5.046437
-3.129457
3
10
{u'regressor__n_estimators': 10, u'regressor__...
3
-5.288711
-3.345931
...
-3.988998
-2.579392
-4.545092
-3.010700
-4.960754
-3.475108
0.010575
0.002391
0.868137
0.314450
3 rows × 22 columns
{'regressor__n_estimators': 10, 'regressor__max_depth': None}
=== Tuning parameters for lr ===
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[Parallel(n_jobs=1)]: Done 15 out of 15 | elapsed: 1.1s finished
mean_fit_time
mean_score_time
mean_test_score
mean_train_score
param_regressor__alpha
params
rank_test_score
split0_test_score
split0_train_score
split1_test_score
...
split2_test_score
split2_train_score
split3_test_score
split3_train_score
split4_test_score
split4_train_score
std_fit_time
std_score_time
std_test_score
std_train_score
1
0.031482
0.012905
-8.023576
-7.825222
0.0001
{u'regressor__alpha': 0.0001}
1
-8.674259
-7.073379
-6.216955
...
-7.918029
-7.533113
-8.397907
-8.336502
-8.646544
-8.362686
0.001680
0.000876
0.882133
0.490080
2
0.032978
0.013738
-8.028749
-7.852340
0.001
{u'regressor__alpha': 0.001}
2
-8.604921
-7.071523
-6.198867
...
-7.974245
-7.646163
-8.392506
-8.285071
-8.700496
-8.370655
0.001991
0.000936
0.886364
0.471038
0
0.033265
0.013458
-8.056269
-7.887549
1e-05
{u'regressor__alpha': 1e-05}
3
-8.635788
-7.112510
-6.299914
...
-8.036709
-7.668393
-8.478160
-8.402212
-8.579684
-8.340494
0.001869
0.001030
0.842324
0.473172
3 rows × 21 columns
{'regressor__alpha': 0.0001}
=== Tuning parameters for gb ===
Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=1)]: Done 60 out of 60 | elapsed: 5.6s finished
mean_fit_time
mean_score_time
mean_test_score
mean_train_score
param_regressor__max_depth
param_regressor__n_estimators
params
rank_test_score
split0_test_score
split0_train_score
...
split2_test_score
split2_train_score
split3_test_score
split3_train_score
split4_test_score
split4_train_score
std_fit_time
std_score_time
std_test_score
std_train_score
2
0.057706
0.014301
-4.179133
-0.289423
3
100
{u'regressor__n_estimators': 100, u'regressor_...
1
-4.049145
-0.322062
...
-4.177120
-0.231304
-3.317813
-0.281575
-3.949781
-0.281575
0.003311
0.001230
0.721470
0.035388
1
0.043864
0.013911
-4.284440
-0.747355
3
50
{u'regressor__n_estimators': 50, u'regressor__...
2
-4.198106
-0.792572
...
-4.361917
-0.569113
-3.402843
-0.755770
-4.058877
-0.755770
0.003294
0.001229
0.684400
0.097425
5
0.067837
0.013007
-4.802713
-0.013278
5
100
{u'regressor__n_estimators': 100, u'regressor_...
3
-3.773874
-0.014812
...
-4.621112
-0.006901
-4.030835
-0.012840
-5.256018
-0.012840
0.003344
0.000230
0.939355
0.003901
4
0.050138
0.012658
-4.869981
-0.132217
5
50
{u'regressor__n_estimators': 50, u'regressor__...
4
-4.096214
-0.161193
...
-4.547366
-0.118021
-4.101961
-0.116804
-5.265640
-0.116804
0.000880
0.000364
0.871849
0.018834
3
0.035742
0.013151
-5.551529
-3.392144
5
10
{u'regressor__n_estimators': 10, u'regressor__...
5
-5.596091
-3.359890
...
-5.094204
-3.165504
-5.306697
-3.427350
-5.962615
-3.427350
0.001424
0.000693
0.320141
0.134470
5 rows × 22 columns
{'regressor__n_estimators': 100, 'regressor__max_depth': 3}
=== Tuning parameters for dummy ===
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[Parallel(n_jobs=1)]: Done 10 out of 10 | elapsed: 0.8s finished
mean_fit_time
mean_score_time
mean_test_score
mean_train_score
param_regressor__strategy
params
rank_test_score
split0_test_score
split0_train_score
split1_test_score
...
split2_test_score
split2_train_score
split3_test_score
split3_train_score
split4_test_score
split4_train_score
std_fit_time
std_score_time
std_test_score
std_train_score
0
0.036664
0.014562
-9.390203
-9.062925
mean
{u'regressor__strategy': u'mean'}
1
-9.847559
-8.957147
-7.898198
...
-9.691174
-8.719776
-9.634746
-8.976379
-9.678355
-8.976379
0.003712
0.001609
0.696170
0.325830
1
0.033785
0.014115
-9.616864
-9.261099
median
{u'regressor__strategy': u'median'}
2
-9.930053
-9.095508
-8.254184
...
-10.067688
-8.919286
-9.787151
-9.133221
-9.861083
-9.133221
0.001245
0.000881
0.639253
0.389711
2 rows × 21 columns
{'regressor__strategy': 'mean'}
In [33]:
model_tune
Out[33]:
{'dummy': {'best_params': {'regressor__strategy': 'mean'},
'best_score': -9.3902034558460716},
'gb': {'best_params': {'regressor__max_depth': 3,
'regressor__n_estimators': 100},
'best_score': -4.1791334314564068},
'lr': {'best_params': {'regressor__alpha': 0.0001},
'best_score': -8.0235759616222158},
'rf': {'best_params': {'regressor__max_depth': None,
'regressor__n_estimators': 10},
'best_score': -4.75974358276535},
'xgb': {'best_params': {'regressor__gamma': 5,
'regressor__max_depth': 3,
'regressor__reg_alpha': 0,
'regressor__reg_lambda': 5},
'best_score': -4.3885109365170702}}
In [34]:
best_model_tune = max(model_tune.items(), key=lambda x:x[1]['best_score'])
display(best_model_tune)
('gb',
{'best_params': {'regressor__max_depth': 3, 'regressor__n_estimators': 100},
'best_score': -4.1791334314564068})
In [35]:
model_name = best_model_tune[0]
regressor = model_instances[model_name]() #the dict contains a function, need to instantiate it
estimator = Pipeline([('featurizer', make_pipeline(*transformer_steps)), ('regressor', regressor)])
estimator.set_params(**best_model_tune[1]['best_params'])
Out[35]:
Pipeline(steps=[('featurizer', Pipeline(steps=[('weightofevidenceencoder', WeightOfEvidenceEncoder(cols=['example_cardinalcat'],
dependent_variable_values=None, fillna=0, return_df=True,
smooth=0.5, verbose=0)), ('pdfeatureunion', PdFeatureUnion(debug=False, n_jobs=None,
trans... presort='auto', random_state=None, subsample=1.0, verbose=0,
warm_start=False))])
In [36]:
estimator=estimator.fit(X_dev, y_dev)
In [37]:
from sklearn.model_selection import cross_val_predict
y_actual = y_dev
y_pred = cross_val_predict(estimator, X_dev, y_dev, groups=groups_dev, cv=5) #cannot use cv_dev as cross_val_predict doesn't like it
In [38]:
def plot_residualplot(y_actual, y_pred):
g = sns.jointplot(y_actual, y_pred-y_actual)
ax = g.ax_joint
axes = ax.axes
axes.set_xlim(np.percentile(y_actual, 1), np.percentile(y_actual, 99)) # remove outliers
axes.set_ylabel('residuals (predicted-actual)')
axes.set_xlabel('actual {}'.format(y_col))
logscale = False
if logscale:
g.ax_joint
ax = g.ax_joint
ax.set_xscale('log')
ax.set_yscale('log')
g.ax_marg_x.set_xscale('log')
g.ax_marg_y.set_yscale('log')
In [39]:
plot_residualplot(y_actual, y_pred)
print('Target cost function {}:'.format(model_score))
print(model_score(y_actual, y_pred))
Target cost function <function root_mean_squared_error at 0x10e7e9848>:
5.15814921025
In [40]:
estimator = estimator.fit(X_dev, y_dev)
estimator_columns = pd.DataFrame(transformer.transform(X_dev)).columns
y_actual = y_test
y_pred = estimator.predict(X_test)
plot_residualplot(y_actual, y_pred)
results = pd.DataFrame(y_actual)
results['ACTUAL'] = results[[y_col]]
results['PREDICTION']=y_pred
results['ABS_ERROR'] = (results['ACTUAL'] - results['PREDICTION']).abs()
results['REL_ERROR'] = results['ABS_ERROR'] / results['ACTUAL']
print('Mean errors:')
display(results.filter(regex="ERROR").mean())
print('Median errors:')
display(results.filter(regex="ERROR").median())
print('Target cost function {}:'.format(model_score))
print(model_score(y_actual, y_pred))
Mean errors:
ABS_ERROR 1.985708
REL_ERROR 0.099894
dtype: float64
Median errors:
ABS_ERROR 1.408716
REL_ERROR 0.064722
dtype: float64
Target cost function <function root_mean_squared_error at 0x10e7e9848>:
3.25083236728
In [41]:
estimator._final_estimator
Out[41]:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
max_leaf_nodes=None, min_impurity_split=1e-07,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
presort='auto', random_state=None, subsample=1.0, verbose=0,
warm_start=False)
In [42]:
feature_labels = estimator_columns
n_features = len(estimator_columns)
if model_name is 'lr':
weights = estimator._final_estimator.coef_[0]
elif model_name in ['rf','gb']:
weights = estimator._final_estimator.feature_importances_
#stdevs = np.std([tree.feature_importances_ for tree in estimator._final_estimator.estimators_], axis=0)
elif model_name is 'xgb':
importance_dictionary = estimator._final_estimator.booster().get_score(importance_type='weight')
weights = [value for (key, value) in sorted(importance_dictionary.items())]
elif model_name is 'dummy':
print('DummyRegressor does not have weights')
weights = np.zeros(n_features)
feature_weights = pd.Series(weights, index=feature_labels).sort_values()
feature_weights.plot.barh(title='Feature importance ({})'.format(model_name), fontsize=8, figsize=(10,6), grid=True);
In [43]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence
if model_name is 'gb':
X_transformed = Pipeline(steps=estimator.steps[:-1]).fit_transform(X.iloc[dev_indices,:], y[dev_indices])
plot_partial_dependence(estimator._final_estimator, X_transformed,
features=range(n_features), feature_names=feature_labels,
figsize=(10,24), n_cols=3, percentiles=(0.1,0.9));
else:
print("No partial dependence plots available for model type {}".format(model_name))
In [44]:
estimator_challenge=estimator.fit(X, y)
In [45]:
#import pickle
#import dill
#pickle.dump(estimator_challenge, open('../estimator.p', "wb" ), protocol=2)
In [46]:
y_challenge = estimator_challenge.predict(challenge)
challenge_out=pd.DataFrame({y_col:y_challenge}, index=challenge.index)
display(challenge_out.head())
challenge_out.to_csv('../challenge_out.csv',header=False,index=False)
TARGET
INDEX
406
12.089819
407
29.091209
408
11.100659
409
15.671439
410
28.523922
In [47]:
n_bins = 100
range_ratio=(n_bins*(max(y)-min(y))/(max(challenge_out.values)-min(challenge_out.values)))
print range_ratio
[ 169.13111865]
In [48]:
ax=sns.distplot(challenge_out, bins=n_bins, label='predicted challenge')
ax=sns.distplot(y, bins=int(range_ratio), label='train')
axes = ax.axes
axes.set_xlim(np.percentile(y, 1), np.percentile(y, 99))
print axes.get_xlim()
plt.legend();
(7.2000000000000002, 50.0)
In [ ]:
Content source: BigDataRepublic/bdr-analytics-py
Similar notebooks: