SciKit Learn Preprocessing Overview


In [1]:
import numpy as np

In [2]:
# Importing MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

In [3]:
data = np.random.randint(0,100,(10,2))

In [4]:
data


Out[4]:
array([[84, 14],
       [14, 40],
       [81, 60],
       [90, 63],
       [32, 84],
       [82, 34],
       [48, 84],
       [32, 88],
       [ 0, 13],
       [17, 29]])

In [5]:
scaler_model = MinMaxScaler()

In [6]:
scaler_model.fit(data)


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:429: DataConversionWarning: Data with input dtype int32 was converted to float64 by MinMaxScaler.
  warnings.warn(msg, _DataConversionWarning)
Out[6]:
MinMaxScaler(copy=True, feature_range=(0, 1))

In [7]:
scaler_model.transform(data)


Out[7]:
array([[0.93333333, 0.01333333],
       [0.15555556, 0.36      ],
       [0.9       , 0.62666667],
       [1.        , 0.66666667],
       [0.35555556, 0.94666667],
       [0.91111111, 0.28      ],
       [0.53333333, 0.94666667],
       [0.35555556, 1.        ],
       [0.        , 0.        ],
       [0.18888889, 0.21333333]])

In [8]:
# In one step
result = scaler_model.fit_transform(data)


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:429: DataConversionWarning: Data with input dtype int32 was converted to float64 by MinMaxScaler.
  warnings.warn(msg, _DataConversionWarning)

In [9]:
result


Out[9]:
array([[0.93333333, 0.01333333],
       [0.15555556, 0.36      ],
       [0.9       , 0.62666667],
       [1.        , 0.66666667],
       [0.35555556, 0.94666667],
       [0.91111111, 0.28      ],
       [0.53333333, 0.94666667],
       [0.35555556, 1.        ],
       [0.        , 0.        ],
       [0.18888889, 0.21333333]])

In [10]:
import pandas as pd

In [11]:
data = pd.DataFrame(data = np.random.randint(0, 101, size = (50,4)),
                    columns = ['f1','f2','f3','label'])

In [12]:
data.head()


Out[12]:
f1 f2 f3 label
0 48 55 73 37
1 7 88 83 86
2 5 52 21 16
3 46 8 7 36
4 96 86 35 41

In [13]:
x = data[['f1', 'f2', 'f3']] # Alternatively x = data.drop('label',axis=1)
y = data['label']

In [14]:
# Train-test split
from sklearn.model_selection import train_test_split

In [15]:
# Splitting the dataset into train and test dataset with given test size and random state (data will be shuffled)
X_train, X_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size = 0.3, 
                                                    random_state = 101)

In [16]:
X_train.shape


Out[16]:
(35, 3)

In [17]:
X_train[0:5]


Out[17]:
f1 f2 f3
3 46 8 7
41 42 54 31
30 35 61 99
15 58 12 48
20 38 3 79

In [18]:
X_test.shape


Out[18]:
(15, 3)

In [19]:
y_train.shape


Out[19]:
(35,)

In [20]:
y_test.shape


Out[20]:
(15,)

Great Job!