MinMaxScaler


In [2]:
from pandas import Series
from sklearn.preprocessing import MinMaxScaler

In [6]:
data = [10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0]
series = Series(data)
print(series)


0     10.0
1     20.0
2     30.0
3     40.0
4     50.0
5     60.0
6     70.0
7     80.0
8     90.0
9    100.0
dtype: float64

In [8]:
values = series.values
values = values.reshape((len(values), 1))

In [11]:
print(values)
print(values.shape)


[[  10.]
 [  20.]
 [  30.]
 [  40.]
 [  50.]
 [  60.]
 [  70.]
 [  80.]
 [  90.]
 [ 100.]]
(10, 1)

In [22]:
scaler = MinMaxScaler(feature_range=(0, 1))
print(scaler)


MinMaxScaler(copy=True, feature_range=(0, 1))

In [26]:
scaler = scaler.fit(values)
print('Min: %f, Max: %f' % (scaler.data_min_, scaler.data_max_))


Min: 10.000000, Max: 100.000000

In [27]:
normalized = scaler.transform(values)
print(normalized)


[[ 0.        ]
 [ 0.11111111]
 [ 0.22222222]
 [ 0.33333333]
 [ 0.44444444]
 [ 0.55555556]
 [ 0.66666667]
 [ 0.77777778]
 [ 0.88888889]
 [ 1.        ]]

In [29]:
inversed = scaler.inverse_transform(normalized)
inversed


Out[29]:
array([[  10.],
       [  20.],
       [  30.],
       [  40.],
       [  50.],
       [  60.],
       [  70.],
       [  80.],
       [  90.],
       [ 100.]])

StandardScaler


In [31]:
from pandas import Series
from sklearn.preprocessing import StandardScaler
from math import sqrt

In [32]:
data = [1.0, 5.5, 9.0, 2.6, 8.8, 3.0, 4.1, 7.9, 6.3]
series = Series(data)
print(series)


0    1.0
1    5.5
2    9.0
3    2.6
4    8.8
5    3.0
6    4.1
7    7.9
8    6.3
dtype: float64

In [36]:
values = series.values
values = values.reshape((len(values), 1))
print(values.shape)


(9, 1)

In [39]:
scaler = StandardScaler()
scaler = scaler.fit(values)
print('Mean: %f, StandardDeviation: %f' % (scaler.mean_, sqrt(scaler.var_)))
standardized = scaler.transform(values)
print(standardized)
inversed = scaler.inverse_transform(standardized)
print(inversed)


Mean: 5.355556, StandardDeviation: 2.712568
[[-1.60569456]
 [ 0.05325007]
 [ 1.34354035]
 [-1.01584758]
 [ 1.26980948]
 [-0.86838584]
 [-0.46286604]
 [ 0.93802055]
 [ 0.34817357]]
[[ 1. ]
 [ 5.5]
 [ 9. ]
 [ 2.6]
 [ 8.8]
 [ 3. ]
 [ 4.1]
 [ 7.9]
 [ 6.3]]

OneHotEncoder


In [55]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [56]:
data = ['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold', 'warm', 'hot']
values = array(data)

In [57]:
values


Out[57]:
array(['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold',
       'warm', 'hot'],
      dtype='<U4')

In [72]:
# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print(integer_encoded)


[0 0 2 0 1 1 2 0 2 1]

In [73]:
# onehot encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)


[[ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 0.  1.  0.]]

In [69]:
inverted = label_encoder.inverse_transform([argmax(onehot_encoded[0, :])])
inverted


Out[69]:
array(['cold'],
      dtype='<U4')

Sequence Padding


In [81]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

In [92]:
sequences = [
    [1, 2, 3, 4],
    [1, 2, 3],
    [1]
]

In [93]:
sequences


Out[93]:
[[1, 2, 3, 4], [1, 2, 3], [1]]

In [94]:
padded = pad_sequences(sequences)
print(padded)


[[1 2 3 4]
 [0 1 2 3]
 [0 0 0 1]]

In [95]:
padded = pad_sequences(sequences, padding='post')
print(padded)


[[1 2 3 4]
 [1 2 3 0]
 [1 0 0 0]]

Sequence Truncation


In [96]:
from keras.preprocessing.sequence import pad_sequences

In [97]:
sequences = [
    [1, 2, 3, 4],
    [1, 2, 3],
    [1]
]

In [100]:
truncated = pad_sequences(sequences, maxlen=2)
print(truncated)


[[3 4]
 [2 3]
 [0 1]]

In [101]:
truncated = pad_sequences(sequences, maxlen=2, truncating='post')
print(truncated)


[[1 2]
 [1 2]
 [0 1]]

Sequence vs Supervised Learning


In [104]:
from pandas import DataFrame
df = DataFrame()
df['t'] = [x for x in range(10)]
df['t-1'] = df['t'].shift(-1)
print(df)


   t  t-1
0  0  1.0
1  1  2.0
2  2  3.0
3  3  4.0
4  4  5.0
5  5  6.0
6  6  7.0
7  7  8.0
8  8  9.0
9  9  NaN