MinMaxScaler



In [2]:

    
from pandas import Series
from sklearn.preprocessing import MinMaxScaler



In [6]:

    
# Ten evenly spaced sample readings, 10.0 through 100.0 in steps of 10,
# wrapped in a pandas Series so the index and dtype are visible when printed.
data = [float(reading) for reading in range(10, 101, 10)]
series = Series(data=data)
print(series)









    



0     10.0
1     20.0
2     30.0
3     40.0
4     50.0
5     60.0
6     70.0
7     80.0
8     90.0
9    100.0
dtype: float64



In [8]:

    
# Turn the 1-D series into a single-column 2-D array (one row per
# observation), which is the layout passed to the scaler below.
values = series.values.reshape(len(series), 1)



In [11]:

    
# Show the column array followed by its shape, each on its own line.
print(values, values.shape, sep='\n')









    



[[  10.]
 [  20.]
 [  30.]
 [  40.]
 [  50.]
 [  60.]
 [  70.]
 [  80.]
 [  90.]
 [ 100.]]
(10, 1)



In [22]:

    
# Configure a min-max scaler with a target output range of [0, 1].
# Nothing is fitted at this point; this only creates the estimator.
scaler = MinMaxScaler(feature_range=(0, 1))
# Show the scaler's configuration.
print(scaler)









    



MinMaxScaler(copy=True, feature_range=(0, 1))



In [26]:

    
# Learn the minimum and maximum of the single feature column.
scaler = scaler.fit(values)
# data_min_ / data_max_ hold one entry per feature. Index the single feature
# explicitly rather than relying on implicit array-to-scalar conversion in
# '%f', which NumPy has deprecated (1.25+) for arrays with ndim > 0.
print('Min: %f, Max: %f' % (scaler.data_min_[0], scaler.data_max_[0]))









    



Min: 10.000000, Max: 100.000000



In [27]:

    
# Apply the fitted min-max scaling to the column of values and show the result.
normalized = scaler.transform(values)
print(normalized)









    



[[ 0.        ]
 [ 0.11111111]
 [ 0.22222222]
 [ 0.33333333]
 [ 0.44444444]
 [ 0.55555556]
 [ 0.66666667]
 [ 0.77777778]
 [ 0.88888889]
 [ 1.        ]]



In [29]:

    
# Map the normalized values back through the scaler's inverse transform;
# the displayed result should match the original inputs.
inversed = scaler.inverse_transform(normalized)
inversed









    Out[29]:





array([[  10.],
       [  20.],
       [  30.],
       [  40.],
       [  50.],
       [  60.],
       [  70.],
       [  80.],
       [  90.],
       [ 100.]])

StandardScaler



In [31]:

    
from pandas import Series
from sklearn.preprocessing import StandardScaler
from math import sqrt



In [32]:

    
# Nine sample observations used to demonstrate standardization.
data = [
    1.0, 5.5, 9.0,
    2.6, 8.8, 3.0,
    4.1, 7.9, 6.3,
]
series = Series(data=data)
print(series)









    



0    1.0
1    5.5
2    9.0
3    2.6
4    8.8
5    3.0
6    4.1
7    7.9
8    6.3
dtype: float64



In [36]:

    
# Reshape the series into a single-column 2-D array (one row per
# observation) and confirm the resulting shape.
values = series.values.reshape(len(series), 1)
print(values.shape)









    



(9, 1)



In [39]:

    
# Fit a standard scaler on the single feature column.
scaler = StandardScaler()
scaler = scaler.fit(values)
# mean_ and var_ hold one entry per feature. Index the single feature
# explicitly: passing a 1-element array to '%f' or math.sqrt relies on
# implicit array-to-scalar conversion, which NumPy has deprecated (1.25+).
print('Mean: %f, StandardDeviation: %f' % (scaler.mean_[0], sqrt(scaler.var_[0])))
# Standardize the values, then invert the transform to show that the
# original inputs are recovered.
standardized = scaler.transform(values)
print(standardized)
inversed = scaler.inverse_transform(standardized)
print(inversed)









    



Mean: 5.355556, StandardDeviation: 2.712568
[[-1.60569456]
 [ 0.05325007]
 [ 1.34354035]
 [-1.01584758]
 [ 1.26980948]
 [-0.86838584]
 [-0.46286604]
 [ 0.93802055]
 [ 0.34817357]]
[[ 1. ]
 [ 5.5]
 [ 9. ]
 [ 2.6]
 [ 8.8]
 [ 3. ]
 [ 4.1]
 [ 7.9]
 [ 6.3]]

OneHotEncoder



In [55]:

    
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder



In [56]:

    
# Ten categorical temperature labels drawn from three classes,
# held as a NumPy string array for the encoders below.
data = [
    'cold', 'cold', 'warm', 'cold', 'hot',
    'hot', 'warm', 'cold', 'warm', 'hot',
]
values = array(data)



In [57]:

    
# Display the array of category labels.
values









    Out[57]:





array(['cold', 'cold', 'warm', 'cold', 'hot', 'hot', 'warm', 'cold',
       'warm', 'hot'],
      dtype='<U4')



In [72]:

    
# integer encode: learn the label vocabulary, then map each label to its
# integer code.
label_encoder = LabelEncoder().fit(values)
integer_encoded = label_encoder.transform(values)
print(integer_encoded)









    



[0 0 2 0 1 1 2 0 2 1]



In [73]:

    
# onehot encode
# The integer codes form a 1-D array; reshape them into a single column so
# the encoder receives a 2-D (samples, features) input.
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
# Use the encoder's default sparse output and densify explicitly. The
# `sparse=` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# whereas `.toarray()` on the returned sparse matrix works on every release.
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(integer_encoded).toarray()
print(onehot_encoded)









    



[[ 1.  0.  0.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  1.  0.]
 [ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]
 [ 0.  0.  1.]
 [ 0.  1.  0.]]



In [69]:

    
# Decode the first one-hot row: the position of its largest entry is the
# integer code, which the label encoder maps back to the original label.
inverted = label_encoder.inverse_transform([onehot_encoded[0].argmax()])
inverted









    Out[69]:





array(['cold'],
      dtype='<U4')

Sequence Padding



In [81]:

    
import numpy as np
from keras.preprocessing.sequence import pad_sequences



In [92]:

    
# Three integer sequences of differing lengths (4, 3 and 1) to be padded.
sequences = [[1, 2, 3, 4], [1, 2, 3], [1]]



In [93]:

    
# Display the ragged list of sequences.
sequences









    Out[93]:





[[1, 2, 3, 4], [1, 2, 3], [1]]



In [94]:

    
# Pad every sequence to the length of the longest one. 'pre' is the Keras
# default and is spelled out here: fill zeros go in front of the values.
padded = pad_sequences(sequences, padding='pre')
print(padded)



In [95]:

    
# Pad again, this time requesting padding='post' so the fill values are
# appended after each sequence's own values instead of before them.
padded = pad_sequences(sequences, padding='post')
print(padded)

Sequence Truncation



In [96]:

    
from keras.preprocessing.sequence import pad_sequences



In [97]:

    
# Same three variable-length sequences, redefined for the truncation examples.
sequences = [[1, 2, 3, 4], [1, 2, 3], [1]]



In [100]:

    
# Cap every sequence at two entries. 'pre' is the Keras default and is
# spelled out here: values are dropped from the front, keeping the tail.
truncated = pad_sequences(sequences, maxlen=2, truncating='pre')
print(truncated)









    



[[3 4]
 [2 3]
 [0 1]]



In [101]:

    
# Cap each sequence at two entries, this time dropping values from the end
# (truncating='post') so the leading values are kept.
truncated = pad_sequences(sequences, maxlen=2, truncating='post')
print(truncated)









    



[[1 2]
 [1 2]
 [0 1]]

Sequence vs Supervised Learning



In [104]:

    
from pandas import DataFrame

# Frame a univariate sequence as a supervised-learning table: each row pairs
# the current observation `t` with the observation one step earlier (`t-1`).
df = DataFrame()
df['t'] = list(range(10))
# shift(1) moves values down one row, so each row holds the previous row's
# value (the lag); the first row has no predecessor and becomes NaN.
# shift(-1) would produce the *next* value (t+1), contradicting the label.
df['t-1'] = df['t'].shift(1)
print(df)