In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
%matplotlib inline
from sklearn import preprocessing
In [3]:
# Toy feature matrix: one row per sample, one column per feature.
a = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])

# Learn each column's min/max from `a`, then rescale `a` with the fitted scaler.
min_max_scaler = preprocessing.MinMaxScaler().fit(a)
a_scaled = min_max_scaler.transform(a)
a_scaled
Out[3]:
In [4]:
# Show the per-feature offset the fitted min-max scaler adds after scaling.
min_max_scaler.min_
Out[4]:
In [5]:
# Show the per-feature multiplier the fitted min-max scaler applies.
min_max_scaler.scale_
Out[5]:
In [6]:
# Apply the already-fitted scaler (no refit) to a new sample. Its first and
# last values lie outside the range seen in `a`, so the scaled values are not
# confined to the interval the training data was mapped onto.
a_test = np.array([[-3., -1., 4.]])
a_test_scaled = min_max_scaler.transform(a_test)
a_test_scaled
Out[6]:
In [7]:
# Z-score standardisation: learn per-column statistics from `a`, then
# standardise `a` with the fitted scaler.
zscore_scaler = preprocessing.StandardScaler().fit(a)
a_scaled = zscore_scaler.transform(a)
a_scaled
Out[7]:
In [8]:
# Show the per-feature mean learned by the standard scaler during fit.
zscore_scaler.mean_
Out[8]:
In [9]:
# Show the per-feature scale (the divisor used when standardising) learned during fit.
zscore_scaler.scale_
Out[9]:
In [10]:
# Standardise a new sample with the statistics learned earlier (no refit).
a_test_scaled = zscore_scaler.transform([[-1., 1., 0.]])
a_test_scaled
Out[10]:
In [11]:
# Row-wise normalisation: build the normalizer, then fit and apply it to `a`
# in a single call.
normalizer = preprocessing.Normalizer()
a_normalized = normalizer.fit_transform(a)
In [12]:
# Normalise one new sample; each row is rescaled on its own
# (to unit L2 norm with the default settings).
normalizer.transform([[-1., 1., 0.]])
Out[12]:
In [13]:
from sklearn.preprocessing import FunctionTransformer

# Small non-negative matrix used as input for the function transformers.
a = np.array([[0, 1], [2, 3]])

# Wrap log(1 + x) as a transformer and apply it element-wise to `a`.
transformer = FunctionTransformer(func=np.log1p)
transformer.transform(a)
Out[13]:
In [14]:
# Square-root transform, applied element-wise to `a`.
transformer = FunctionTransformer(func=np.sqrt)
transformer.transform(a)
Out[14]:
In [15]:
def cube_root(x):
    """Return the element-wise real cube root of ``x``.

    Uses ``np.cbrt`` instead of ``np.power(x, 1.0 / 3)``: a fractional power
    of a negative number evaluates to NaN, whereas ``np.cbrt`` returns the
    real root (for example -8 -> -2).

    Parameters
    ----------
    x : array_like
        Values to transform.

    Returns
    -------
    ndarray or scalar
        The cube root of each element of ``x``.
    """
    return np.cbrt(x)
# Wrap the custom cube-root function as a transformer and apply it to `a`.
transformer = FunctionTransformer(func=cube_root)
transformer.transform(a)
Out[15]:
In [16]:
# Square transform, applied element-wise to `a`.
transformer = FunctionTransformer(func=np.square)
transformer.transform(a)
Out[16]:
In [17]:
def cube(x):
    """Raise ``x`` element-wise to the third power using ``np.power``."""
    exponent = 3
    return np.power(x, exponent)
# Wrap the custom cube function as a transformer and apply it to `a`.
transformer = FunctionTransformer(func=cube)
transformer.transform(a)
Out[17]:
In [18]:
# Two OneHotEncoder parameters worth knowing:
#   handle_unknown: 'error' (the default) raises on a category that was not
#                   seen during fit; 'ignore' encodes it as all zeros.
#   categories:     'auto' (the default) infers each column's categories from
#                   the training data; explicit per-column lists may be given
#                   instead. This replaces the old `n_values` parameter, which
#                   has been removed from scikit-learn.
enc = preprocessing.OneHotEncoder()
# Four samples with three integer-coded categorical columns.
enc.fit([
    [0, 0, 3],
    [1, 1, 0],
    [0, 2, 1],
    [1, 0, 2],
])
Out[18]:
In [19]:
# One-hot encode a single sample with the fitted encoder; `.toarray()`
# converts the result to a dense array so it displays in full.
enc.transform([[0, 1, 3]]).toarray()
Out[19]:
In [20]:
import pandas as pd
# Sample ages used by the binning (discretisation) examples.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
In [21]:
# Explicit bin edges; with pd.cut's defaults each interval is open on the
# left and closed on the right, e.g. (18, 25].
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
Out[21]:
In [22]:
# Integer code of the bin assigned to each value, in input order.
cats.codes
Out[22]:
In [23]:
# Count how many values fall in each bin. The top-level `pd.value_counts`
# function is deprecated, so go through a Series, which keeps the same
# count-descending ordering.
pd.Series(cats).value_counts()
Out[23]:
In [24]:
# right=False flips which side is closed: intervals become [18, 26), [26, 36), ...
pd.cut(ages, [18, 26, 36, 61, 100], right=False)
Out[24]:
In [25]:
# Replace the interval notation with one readable label per bin
# (four labels for the four intervals defined by `bins`).
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages, bins, labels=group_names)
Out[25]:
In [26]:
# Draw the sample from a locally seeded generator so this cell shows the same
# bins on every run (Restart & Run All stays reproducible) without touching
# NumPy's global random state.
rng = np.random.RandomState(0)
data = rng.rand(20)  # 20 uniform draws from [0, 1)

# Passing an integer asks for that many equal-width bins over the data range;
# `precision` sets the number of decimals used for the bin labels.
pd.cut(data, 4, precision=2)
Out[26]:
In [27]:
# Seeded local generator: the quartile edges shown below are then identical
# on every run, and NumPy's global random state is left untouched.
rng = np.random.RandomState(0)
data = rng.randn(1000)  # 1000 draws from the standard normal distribution

# qcut bins by sample quantiles, so 4 bins means quartiles with
# (roughly) equal counts per bin.
cats = pd.qcut(data, 4)
cats
Out[27]:
In [28]:
# Count the members of each bin. The top-level `pd.value_counts` function is
# deprecated, so go through a Series, which keeps the same count-descending
# ordering.
pd.Series(cats).value_counts()
Out[28]:
In [29]:
# Custom quantile edges (fractions between 0 and 1): cut at the 10th, 50th
# and 90th percentiles instead of into equal-sized groups.
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
Out[29]: