In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib
%matplotlib inline

from sklearn import preprocessing

Numerical Variable Transforms

Min-Max Scaling


In [3]:
a = np.array([[1., -1., 2.],
              [2., 0., 0.],
              [0., 1., -1.]])

min_max_scaler = preprocessing.MinMaxScaler()
a_scaled = min_max_scaler.fit_transform(a)
a_scaled


Out[3]:
array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])
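
Min-max scaling maps each column to [0, 1] via x' = (x - min) / (max - min). A quick sanity check against the manual formula, computed per column:

manual = (a - a.min(axis=0)) / (a.max(axis=0) - a.min(axis=0))
np.allclose(manual, a_scaled)  # True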

In [4]:
min_max_scaler.min_


Out[4]:
array([ 0.        ,  0.5       ,  0.33333333])

In [5]:
min_max_scaler.scale_


Out[5]:
array([ 0.5       ,  0.5       ,  0.33333333])
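
The fitted attributes define the affine map the scaler applies: transform(X) = X * scale_ + min_, with scale_ = 1 / (max - min) and min_ = -min * scale_, both per column. A quick check:

np.allclose(a * min_max_scaler.scale_ + min_max_scaler.min_, a_scaled)  # True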

In [6]:
a_test = np.array([[-3., -1., 4.]])
a_test_scaled = min_max_scaler.transform(a_test)
a_test_scaled


Out[6]:
array([[-1.5       ,  0.        ,  1.66666667]])
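
Note that test values outside the training range map outside [0, 1] (here -3. lies below the training minimum of 0. in the first column). If that is undesirable, clip the result manually; newer scikit-learn versions (0.24+) also accept MinMaxScaler(clip=True):

np.clip(a_test_scaled, 0., 1.)  # array([[ 0.,  0.,  1.]])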

Z-score Standardization


In [7]:
zscore_scaler = preprocessing.StandardScaler()
a_scaled = zscore_scaler.fit_transform(a)
a_scaled


Out[7]:
array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [8]:
zscore_scaler.mean_


Out[8]:
array([ 1.        ,  0.        ,  0.33333333])

In [9]:
zscore_scaler.scale_


Out[9]:
array([ 0.81649658,  0.81649658,  1.24721913])
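
StandardScaler computes z = (x - mean) / std per column, using the population standard deviation (ddof=0); mean_ and scale_ above hold exactly those statistics. A manual check:

manual = (a - a.mean(axis=0)) / a.std(axis=0)
np.allclose(manual, a_scaled)  # True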

In [10]:
a_test_scaled = zscore_scaler.transform([[-1., 1., 0.]])
a_test_scaled


Out[10]:
array([[-2.44948974,  1.22474487, -0.26726124]])
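
The mapping is invertible; inverse_transform recovers the original values:

zscore_scaler.inverse_transform(a_test_scaled)  # array([[-1.,  1.,  0.]])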

Normalization


In [11]:
normalizer = preprocessing.Normalizer().fit(a)

a_normalized = normalizer.transform(a)

In [12]:
normalizer.transform([[-1., 1., 0.]])


Out[12]:
array([[-0.70710678,  0.70710678,  0.        ]])
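
Unlike the scalers above, Normalizer works row-wise: each sample is divided by its L2 norm (the default), so [-1., 1., 0.] with norm sqrt(2) becomes (-1/sqrt(2), 1/sqrt(2), 0). Every row of the output therefore has unit length:

np.linalg.norm(a_normalized, axis=1)  # array([ 1.,  1.,  1.])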

Logarithm and Power Transforms


In [13]:
from sklearn.preprocessing import FunctionTransformer

a = np.array([[0, 1], [2, 3]])

transformer = FunctionTransformer(np.log1p)
transformer.transform(a)


Out[13]:
array([[ 0.        ,  0.69314718],
       [ 1.09861229,  1.38629436]])
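
np.log1p computes log(1 + x), so zeros map to zero and the transform is defined for all non-negative inputs. FunctionTransformer also takes an inverse_func; np.expm1 undoes log1p (a small sketch):

transformer = FunctionTransformer(np.log1p, inverse_func=np.expm1)
transformer.inverse_transform(transformer.transform(a))  # recovers a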

In [14]:
transformer = FunctionTransformer(np.sqrt)
transformer.transform(a)


Out[14]:
array([[ 0.        ,  1.        ],
       [ 1.41421356,  1.73205081]])

In [15]:
def cube_root(x):
    # Note: np.power with a fractional exponent returns nan for negative floats;
    # np.cbrt handles negative inputs as well.
    return np.power(x, 1.0 / 3)
transformer = FunctionTransformer(cube_root)
transformer.transform(a)


Out[15]:
array([[ 0.        ,  1.        ],
       [ 1.25992105,  1.44224957]])

In [16]:
transformer = FunctionTransformer(np.square)
transformer.transform(a)


Out[16]:
array([[0, 1],
       [4, 9]])

In [17]:
def cube(x):
    return np.power(x, 3)
transformer = FunctionTransformer(cube)
transformer.transform(a)


Out[17]:
array([[ 0,  1],
       [ 8, 27]])

One-Hot Encoder


In [18]:
# Two important parameters of OneHotEncoder:
#   handle_unknown: 'error' (raise on unseen categories) or 'ignore' (encode them as all zeros)
#   n_values: 'auto', or the number of values per feature
enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])


Out[18]:
OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [19]:
enc.transform([[0, 1, 3]]).toarray()


Out[19]:
array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
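
The nine output columns come from 2 + 3 + 4 categories across the three input features, so [0, 1, 3] sets one indicator in each group: [1, 0 | 0, 1, 0 | 0, 0, 0, 1]. In this older (pre-0.20) API the fitted encoder exposes the per-feature category counts (an illustrative check):

enc.n_values_  # array([2, 3, 4])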

Discretization and Binning


In [20]:
import pandas as pd
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [21]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats


Out[21]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [22]:
cats.codes


Out[22]:
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
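
codes holds the zero-based bin index of each age; a value falling outside every bin becomes NaN with code -1. A small illustration:

pd.cut([15, 20], bins).codes  # array([-1,  0], dtype=int8)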

In [23]:
pd.value_counts(cats)


Out[23]:
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [24]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)


Out[24]:
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [25]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)


Out[25]:
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [26]:
data = np.random.rand(20)
pd.cut(data, 4, precision=2)


Out[26]:
[(0.26, 0.49], (0.26, 0.49], (0.49, 0.73], (0.49, 0.73], (0.018, 0.26], ..., (0.018, 0.26], (0.49, 0.73], (0.26, 0.49], (0.26, 0.49], (0.73, 0.97]]
Length: 20
Categories (4, object): [(0.018, 0.26] < (0.26, 0.49] < (0.49, 0.73] < (0.73, 0.97]]
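
Passing an integer instead of explicit edges makes cut compute that many equal-width bins from the data's min and max (widened slightly so the extremes are included); precision only controls the decimals shown in the labels. retbins=True also returns the computed edges (a quick look):

_, edges = pd.cut(data, 4, retbins=True, precision=2)
edges  # five edges delimiting the four equal-width bins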

In [27]:
data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 4) # Cut into quartiles
cats


Out[27]:
[(-0.014, 0.625], (-0.663, -0.014], (-0.014, 0.625], [-2.997, -0.663], (-0.663, -0.014], ..., (-0.663, -0.014], (-0.014, 0.625], (0.625, 3.677], (-0.663, -0.014], (-0.014, 0.625]]
Length: 1000
Categories (4, object): [[-2.997, -0.663] < (-0.663, -0.014] < (-0.014, 0.625] < (0.625, 3.677]]

In [28]:
pd.value_counts(cats)


Out[28]:
(0.625, 3.677]      250
(-0.014, 0.625]     250
(-0.663, -0.014]    250
[-2.997, -0.663]    250
dtype: int64

In [29]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])


Out[29]:
[(-0.014, 1.201], (-1.269, -0.014], (-0.014, 1.201], [-2.997, -1.269], (-1.269, -0.014], ..., (-1.269, -0.014], (-0.014, 1.201], (-0.014, 1.201], (-1.269, -0.014], (-0.014, 1.201]]
Length: 1000
Categories (4, object): [[-2.997, -1.269] < (-1.269, -0.014] < (-0.014, 1.201] < (1.201, 3.677]]
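
Like cut, qcut accepts a labels argument for naming the resulting categories (an illustrative sketch with made-up names):

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.], labels=['bottom10', 'low', 'high', 'top10'])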