In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib
%matplotlib inline

from sklearn import preprocessing

Numerical Variable Transforms

Min-Max Scaling


In [3]:
a = np.array([[1., -1., 2.],
              [2., 0., 0.],
              [0., 1., -1.]])

min_max_scaler = preprocessing.MinMaxScaler()
a_scaled = min_max_scaler.fit_transform(a)
a_scaled


Out[3]:
array([[ 0.5       ,  0.        ,  1.        ],
       [ 1.        ,  0.5       ,  0.33333333],
       [ 0.        ,  1.        ,  0.        ]])
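
Min-max scaling maps each column to [0, 1] via x' = (x - min) / (max - min). A quick sanity check against the manual formula, computed per column:

manual = (a - a.min(axis=0)) / (a.max(axis=0) - a.min(axis=0))
np.allclose(manual, a_scaled)  # True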

In [4]:
min_max_scaler.min_


Out[4]:
array([ 0.        ,  0.5       ,  0.33333333])

In [5]:
min_max_scaler.scale_


Out[5]:
array([ 0.5       ,  0.5       ,  0.33333333])
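
The fitted attributes define the affine map the scaler applies: transform(X) = X * scale_ + min_, with scale_ = 1 / (max - min) and min_ = -min * scale_, both per column. A quick check:

np.allclose(a * min_max_scaler.scale_ + min_max_scaler.min_, a_scaled)  # True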

In [6]:
a_test = np.array([[-3., -1., 4.]])
a_test_scaled = min_max_scaler.transform(a_test)
a_test_scaled


Out[6]:
array([[-1.5       ,  0.        ,  1.66666667]])
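
Note that test values outside the training range map outside [0, 1] (here -3. lies below the training minimum of 0. in the first column). If that is undesirable, clip the result manually; newer scikit-learn versions (0.24+) also accept MinMaxScaler(clip=True):

np.clip(a_test_scaled, 0., 1.)  # array([[ 0.,  0.,  1.]])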

Z-score Standardization


In [7]:
zscore_scaler = preprocessing.StandardScaler()
a_scaled = zscore_scaler.fit_transform(a)
a_scaled


Out[7]:
array([[ 0.        , -1.22474487,  1.33630621],
       [ 1.22474487,  0.        , -0.26726124],
       [-1.22474487,  1.22474487, -1.06904497]])

In [8]:
zscore_scaler.mean_


Out[8]:
array([ 1.        ,  0.        ,  0.33333333])

In [9]:
zscore_scaler.scale_


Out[9]:
array([ 0.81649658,  0.81649658,  1.24721913])
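
StandardScaler computes z = (x - mean) / std per column, using the population standard deviation (ddof=0); mean_ and scale_ above hold exactly those statistics. A manual check:

manual = (a - a.mean(axis=0)) / a.std(axis=0)
np.allclose(manual, a_scaled)  # True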

In [10]:
a_test_scaled = zscore_scaler.transform([[-1., 1., 0.]])
a_test_scaled


Out[10]:
array([[-2.44948974,  1.22474487, -0.26726124]])
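
The mapping is invertible; inverse_transform recovers the original values:

zscore_scaler.inverse_transform(a_test_scaled)  # array([[-1.,  1.,  0.]])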

Normalization


In [11]:
normalizer = preprocessing.Normalizer().fit(a)

a_normalized = normalizer.transform(a)

In [12]:
normalizer.transform([[-1., 1., 0.]])


Out[12]:
array([[-0.70710678,  0.70710678,  0.        ]])
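
Unlike the scalers above, Normalizer works row-wise: each sample is divided by its L2 norm (the default), so [-1., 1., 0.] with norm sqrt(2) becomes (-1/sqrt(2), 1/sqrt(2), 0). Every row of the output therefore has unit length:

np.linalg.norm(a_normalized, axis=1)  # array([ 1.,  1.,  1.])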

Logarithm and Power Transforms


In [13]:
from sklearn.preprocessing import FunctionTransformer

a = np.array([[0, 1], [2, 3]])

transformer = FunctionTransformer(np.log1p)
transformer.transform(a)


Out[13]:
array([[ 0.        ,  0.69314718],
       [ 1.09861229,  1.38629436]])
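
np.log1p computes log(1 + x), so zeros map to zero and the transform is defined for all non-negative inputs. FunctionTransformer also takes an inverse_func; np.expm1 undoes log1p (a small sketch):

transformer = FunctionTransformer(np.log1p, inverse_func=np.expm1)
transformer.inverse_transform(transformer.transform(a))  # recovers a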

In [14]:
transformer = FunctionTransformer(np.sqrt)
transformer.transform(a)


Out[14]:
array([[ 0.        ,  1.        ],
       [ 1.41421356,  1.73205081]])

In [15]:
def cube_root(x):
    # Note: np.power with a fractional exponent returns nan for negative floats;
    # np.cbrt handles negative inputs as well.
    return np.power(x, 1.0 / 3)
transformer = FunctionTransformer(cube_root)
transformer.transform(a)


Out[15]:
array([[ 0.        ,  1.        ],
       [ 1.25992105,  1.44224957]])

In [16]:
transformer = FunctionTransformer(np.square)
transformer.transform(a)


Out[16]:
array([[0, 1],
       [4, 9]])

In [17]:
def cube(x):
    return np.power(x, 3)
transformer = FunctionTransformer(cube)
transformer.transform(a)


Out[17]:
array([[ 0,  1],
       [ 8, 27]])

One-Hot Encoder


In [18]:
# Two important parameters of OneHotEncoder:
#   handle_unknown: 'error' (raise on unseen categories) or 'ignore' (encode them as all zeros)
#   n_values: 'auto', or the number of values per feature
enc = preprocessing.OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])


Out[18]:
OneHotEncoder(categorical_features='all', dtype=<type 'numpy.float64'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [19]:
enc.transform([[0, 1, 3]]).toarray()


Out[19]:
array([[ 1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.]])
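
The nine output columns come from 2 + 3 + 4 categories across the three input features, so [0, 1, 3] sets one indicator in each group: [1, 0 | 0, 1, 0 | 0, 0, 0, 1]. In this older (pre-0.20) API the fitted encoder exposes the per-feature category counts (an illustrative check):

enc.n_values_  # array([2, 3, 4])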

Discretization and Binning


In [20]:
import pandas as pd
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [21]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats


Out[21]:
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [22]:
cats.codes


Out[22]:
array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)
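
codes holds the zero-based bin index of each age; a value falling outside every bin becomes NaN with code -1. A small illustration:

pd.cut([15, 20], bins).codes  # array([-1,  0], dtype=int8)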

In [23]:
pd.value_counts(cats)


Out[23]:
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [24]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)


Out[24]:
[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, object): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [25]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)


Out[25]:
[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [26]:
data = np.random.rand(20)
pd.cut(data, 4, precision=2)


Out[26]:
[(0.26, 0.49], (0.26, 0.49], (0.49, 0.73], (0.49, 0.73], (0.018, 0.26], ..., (0.018, 0.26], (0.49, 0.73], (0.26, 0.49], (0.26, 0.49], (0.73, 0.97]]
Length: 20
Categories (4, object): [(0.018, 0.26] < (0.26, 0.49] < (0.49, 0.73] < (0.73, 0.97]]
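
Passing an integer instead of explicit edges makes cut compute that many equal-width bins from the data's min and max (widened slightly so the extremes are included); precision only controls the decimals shown in the labels. retbins=True also returns the computed edges (a quick look):

_, edges = pd.cut(data, 4, retbins=True, precision=2)
edges  # five edges delimiting the four equal-width bins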

In [27]:
data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 4) # Cut into quartiles
cats


Out[27]:
[(-0.014, 0.625], (-0.663, -0.014], (-0.014, 0.625], [-2.997, -0.663], (-0.663, -0.014], ..., (-0.663, -0.014], (-0.014, 0.625], (0.625, 3.677], (-0.663, -0.014], (-0.014, 0.625]]
Length: 1000
Categories (4, object): [[-2.997, -0.663] < (-0.663, -0.014] < (-0.014, 0.625] < (0.625, 3.677]]

In [28]:
pd.value_counts(cats)


Out[28]:
(0.625, 3.677]      250
(-0.014, 0.625]     250
(-0.663, -0.014]    250
[-2.997, -0.663]    250
dtype: int64

In [29]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])


Out[29]:
[(-0.014, 1.201], (-1.269, -0.014], (-0.014, 1.201], [-2.997, -1.269], (-1.269, -0.014], ..., (-1.269, -0.014], (-0.014, 1.201], (-0.014, 1.201], (-1.269, -0.014], (-0.014, 1.201]]
Length: 1000
Categories (4, object): [[-2.997, -1.269] < (-1.269, -0.014] < (-0.014, 1.201] < (1.201, 3.677]]
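
Like cut, qcut accepts a labels argument for naming the resulting categories (an illustrative sketch with made-up names):

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.], labels=['bottom10', 'low', 'high', 'top10'])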