In [4]:
from sklearn.datasets import load_iris

# 导入 IRIS 数据集
iris = load_iris()


Out[4]:
(150, 4)

In [7]:
# 特征矩阵
iris.data
# 目标向量
iris.target

print('特征 shape: ', iris.data.shape)
print('目标 shape: ', iris.target.shape)


特征 shape:  (150, 4)
目标 shape:  (150,)

数据预处理  

通过特征提取,我们能得到未经处理的特征,这时的特征可能有以下问题:

  • 不属于同一量纲:即特征的规格不一样,不能够放在一起比较。无量纲化可以解决这一问题。
  • 信息冗余:对于某些定量特征,其包含的有效信息为区间划分,例如学习成绩,假若只关心“及格”或不“及格”,那么需要将定量的考分,转换成“1”和“0”表示及格和未及格。二值化可以解决这一问题。
  • 定性特征不能直接使用:某些机器学习算法和模型只能接受定量特征的输入,那么需要将定性特征转换为定量特征。最简单的方式是为每一种定性值指定一个定量值,但是这种方式过于灵活,增加了调参的工作。通常使用哑编码的方式将定性特征转换为定量特征:假设有N种定性值,则将这一个特征扩展为N种特征,当原始特征值为第i种定性值时,第i个扩展特征赋值为1,其他扩展特征赋值为0。哑编码的方式相比直接指定的方式,不用增加调参的工作,对于线性模型来说,使用哑编码后的特征可达到非线性的效果。
  • 存在缺失值:缺失值需要补充。
  • 信息利用率低:不同的机器学习算法和模型对数据中信息的利用是不同的,之前提到在线性模型中,使用对定性特征哑编码可以达到非线性的效果。类似地,对定量变量多项式化,或者进行其他的转换,都能达到非线性的效果。  

我们使用sklearn中的preproccessing库来进行数据预处理,可以覆盖以上问题的解决方案。

无量钢化

无量纲化使不同规格的数据转换到同一规格。常见的无量纲化方法有标准化和区间缩放法。标准化的前提是特征值服从正态分布,标准化后,其转换成标准正态分布。区间缩放法利用了边界值信息,将特征的取值区间缩放到某个特点的范围,例如[0, 1]等。

标准化

标准化需要计算特征的均值和标准差,公式表达为:  使用preproccessing库的StandardScaler类对数据进行标准化的代码如下:


In [8]:
iris.data


Out[8]:
array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2],
       [ 5. ,  3.6,  1.4,  0.2],
       [ 5.4,  3.9,  1.7,  0.4],
       [ 4.6,  3.4,  1.4,  0.3],
       [ 5. ,  3.4,  1.5,  0.2],
       [ 4.4,  2.9,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5.4,  3.7,  1.5,  0.2],
       [ 4.8,  3.4,  1.6,  0.2],
       [ 4.8,  3. ,  1.4,  0.1],
       [ 4.3,  3. ,  1.1,  0.1],
       [ 5.8,  4. ,  1.2,  0.2],
       [ 5.7,  4.4,  1.5,  0.4],
       [ 5.4,  3.9,  1.3,  0.4],
       [ 5.1,  3.5,  1.4,  0.3],
       [ 5.7,  3.8,  1.7,  0.3],
       [ 5.1,  3.8,  1.5,  0.3],
       [ 5.4,  3.4,  1.7,  0.2],
       [ 5.1,  3.7,  1.5,  0.4],
       [ 4.6,  3.6,  1. ,  0.2],
       [ 5.1,  3.3,  1.7,  0.5],
       [ 4.8,  3.4,  1.9,  0.2],
       [ 5. ,  3. ,  1.6,  0.2],
       [ 5. ,  3.4,  1.6,  0.4],
       [ 5.2,  3.5,  1.5,  0.2],
       [ 5.2,  3.4,  1.4,  0.2],
       [ 4.7,  3.2,  1.6,  0.2],
       [ 4.8,  3.1,  1.6,  0.2],
       [ 5.4,  3.4,  1.5,  0.4],
       [ 5.2,  4.1,  1.5,  0.1],
       [ 5.5,  4.2,  1.4,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 5. ,  3.2,  1.2,  0.2],
       [ 5.5,  3.5,  1.3,  0.2],
       [ 4.9,  3.1,  1.5,  0.1],
       [ 4.4,  3. ,  1.3,  0.2],
       [ 5.1,  3.4,  1.5,  0.2],
       [ 5. ,  3.5,  1.3,  0.3],
       [ 4.5,  2.3,  1.3,  0.3],
       [ 4.4,  3.2,  1.3,  0.2],
       [ 5. ,  3.5,  1.6,  0.6],
       [ 5.1,  3.8,  1.9,  0.4],
       [ 4.8,  3. ,  1.4,  0.3],
       [ 5.1,  3.8,  1.6,  0.2],
       [ 4.6,  3.2,  1.4,  0.2],
       [ 5.3,  3.7,  1.5,  0.2],
       [ 5. ,  3.3,  1.4,  0.2],
       [ 7. ,  3.2,  4.7,  1.4],
       [ 6.4,  3.2,  4.5,  1.5],
       [ 6.9,  3.1,  4.9,  1.5],
       [ 5.5,  2.3,  4. ,  1.3],
       [ 6.5,  2.8,  4.6,  1.5],
       [ 5.7,  2.8,  4.5,  1.3],
       [ 6.3,  3.3,  4.7,  1.6],
       [ 4.9,  2.4,  3.3,  1. ],
       [ 6.6,  2.9,  4.6,  1.3],
       [ 5.2,  2.7,  3.9,  1.4],
       [ 5. ,  2. ,  3.5,  1. ],
       [ 5.9,  3. ,  4.2,  1.5],
       [ 6. ,  2.2,  4. ,  1. ],
       [ 6.1,  2.9,  4.7,  1.4],
       [ 5.6,  2.9,  3.6,  1.3],
       [ 6.7,  3.1,  4.4,  1.4],
       [ 5.6,  3. ,  4.5,  1.5],
       [ 5.8,  2.7,  4.1,  1. ],
       [ 6.2,  2.2,  4.5,  1.5],
       [ 5.6,  2.5,  3.9,  1.1],
       [ 5.9,  3.2,  4.8,  1.8],
       [ 6.1,  2.8,  4. ,  1.3],
       [ 6.3,  2.5,  4.9,  1.5],
       [ 6.1,  2.8,  4.7,  1.2],
       [ 6.4,  2.9,  4.3,  1.3],
       [ 6.6,  3. ,  4.4,  1.4],
       [ 6.8,  2.8,  4.8,  1.4],
       [ 6.7,  3. ,  5. ,  1.7],
       [ 6. ,  2.9,  4.5,  1.5],
       [ 5.7,  2.6,  3.5,  1. ],
       [ 5.5,  2.4,  3.8,  1.1],
       [ 5.5,  2.4,  3.7,  1. ],
       [ 5.8,  2.7,  3.9,  1.2],
       [ 6. ,  2.7,  5.1,  1.6],
       [ 5.4,  3. ,  4.5,  1.5],
       [ 6. ,  3.4,  4.5,  1.6],
       [ 6.7,  3.1,  4.7,  1.5],
       [ 6.3,  2.3,  4.4,  1.3],
       [ 5.6,  3. ,  4.1,  1.3],
       [ 5.5,  2.5,  4. ,  1.3],
       [ 5.5,  2.6,  4.4,  1.2],
       [ 6.1,  3. ,  4.6,  1.4],
       [ 5.8,  2.6,  4. ,  1.2],
       [ 5. ,  2.3,  3.3,  1. ],
       [ 5.6,  2.7,  4.2,  1.3],
       [ 5.7,  3. ,  4.2,  1.2],
       [ 5.7,  2.9,  4.2,  1.3],
       [ 6.2,  2.9,  4.3,  1.3],
       [ 5.1,  2.5,  3. ,  1.1],
       [ 5.7,  2.8,  4.1,  1.3],
       [ 6.3,  3.3,  6. ,  2.5],
       [ 5.8,  2.7,  5.1,  1.9],
       [ 7.1,  3. ,  5.9,  2.1],
       [ 6.3,  2.9,  5.6,  1.8],
       [ 6.5,  3. ,  5.8,  2.2],
       [ 7.6,  3. ,  6.6,  2.1],
       [ 4.9,  2.5,  4.5,  1.7],
       [ 7.3,  2.9,  6.3,  1.8],
       [ 6.7,  2.5,  5.8,  1.8],
       [ 7.2,  3.6,  6.1,  2.5],
       [ 6.5,  3.2,  5.1,  2. ],
       [ 6.4,  2.7,  5.3,  1.9],
       [ 6.8,  3. ,  5.5,  2.1],
       [ 5.7,  2.5,  5. ,  2. ],
       [ 5.8,  2.8,  5.1,  2.4],
       [ 6.4,  3.2,  5.3,  2.3],
       [ 6.5,  3. ,  5.5,  1.8],
       [ 7.7,  3.8,  6.7,  2.2],
       [ 7.7,  2.6,  6.9,  2.3],
       [ 6. ,  2.2,  5. ,  1.5],
       [ 6.9,  3.2,  5.7,  2.3],
       [ 5.6,  2.8,  4.9,  2. ],
       [ 7.7,  2.8,  6.7,  2. ],
       [ 6.3,  2.7,  4.9,  1.8],
       [ 6.7,  3.3,  5.7,  2.1],
       [ 7.2,  3.2,  6. ,  1.8],
       [ 6.2,  2.8,  4.8,  1.8],
       [ 6.1,  3. ,  4.9,  1.8],
       [ 6.4,  2.8,  5.6,  2.1],
       [ 7.2,  3. ,  5.8,  1.6],
       [ 7.4,  2.8,  6.1,  1.9],
       [ 7.9,  3.8,  6.4,  2. ],
       [ 6.4,  2.8,  5.6,  2.2],
       [ 6.3,  2.8,  5.1,  1.5],
       [ 6.1,  2.6,  5.6,  1.4],
       [ 7.7,  3. ,  6.1,  2.3],
       [ 6.3,  3.4,  5.6,  2.4],
       [ 6.4,  3.1,  5.5,  1.8],
       [ 6. ,  3. ,  4.8,  1.8],
       [ 6.9,  3.1,  5.4,  2.1],
       [ 6.7,  3.1,  5.6,  2.4],
       [ 6.9,  3.1,  5.1,  2.3],
       [ 5.8,  2.7,  5.1,  1.9],
       [ 6.8,  3.2,  5.9,  2.3],
       [ 6.7,  3.3,  5.7,  2.5],
       [ 6.7,  3. ,  5.2,  2.3],
       [ 6.3,  2.5,  5. ,  1.9],
       [ 6.5,  3. ,  5.2,  2. ],
       [ 6.2,  3.4,  5.4,  2.3],
       [ 5.9,  3. ,  5.1,  1.8]])

In [9]:
from sklearn.preprocessing import StandardScaler

# 标准化, 返回值为标准化后的数据
iris_standard = StandardScaler().fit_transform(iris.data)

区间缩放法

区间缩放法的思路有多种,常见的一种为利用两个最值进行缩放,公式表达为: 使用preproccessing库的MinMaxScaler类对数据进行区间缩放的代码如下:


In [11]:
from sklearn.preprocessing import MinMaxScaler

#区间缩放,返回值为缩放到[0, 1]区间的数据
iris_minmax = MinMaxScaler().fit_transform(iris.data)

标准化与归一化的区别  

简单来说,标准化是依照特征矩阵的列处理数据,其通过求z-score的方法,将样本的特征值转换到同一量纲下。归一化是依照特征矩阵的行处理数据,其目的在于样本向量在点乘运算或其他核函数计算相似性时,拥有统一的标准,也就是说都转化为“单位向量”。规则为l2的归一化公式如下:   使用preproccessing库的Normalizer类对数据进行归一化的代码如下:


In [13]:
from sklearn.preprocessing import Normalizer

#归一化,返回值为归一化后的数据
iris_norm = Normalizer().fit_transform(iris.data)

In [14]:
iris_norm


Out[14]:
array([[ 0.80377277,  0.55160877,  0.22064351,  0.0315205 ],
       [ 0.82813287,  0.50702013,  0.23660939,  0.03380134],
       [ 0.80533308,  0.54831188,  0.2227517 ,  0.03426949],
       [ 0.80003025,  0.53915082,  0.26087943,  0.03478392],
       [ 0.790965  ,  0.5694948 ,  0.2214702 ,  0.0316386 ],
       [ 0.78417499,  0.5663486 ,  0.2468699 ,  0.05808704],
       [ 0.78010936,  0.57660257,  0.23742459,  0.0508767 ],
       [ 0.80218492,  0.54548574,  0.24065548,  0.0320874 ],
       [ 0.80642366,  0.5315065 ,  0.25658935,  0.03665562],
       [ 0.81803119,  0.51752994,  0.25041771,  0.01669451],
       [ 0.80373519,  0.55070744,  0.22325977,  0.02976797],
       [ 0.786991  ,  0.55745196,  0.26233033,  0.03279129],
       [ 0.82307218,  0.51442011,  0.24006272,  0.01714734],
       [ 0.8025126 ,  0.55989251,  0.20529392,  0.01866308],
       [ 0.81120865,  0.55945424,  0.16783627,  0.02797271],
       [ 0.77381111,  0.59732787,  0.2036345 ,  0.05430253],
       [ 0.79428944,  0.57365349,  0.19121783,  0.05883625],
       [ 0.80327412,  0.55126656,  0.22050662,  0.04725142],
       [ 0.8068282 ,  0.53788547,  0.24063297,  0.04246464],
       [ 0.77964883,  0.58091482,  0.22930848,  0.0458617 ],
       [ 0.8173379 ,  0.51462016,  0.25731008,  0.03027177],
       [ 0.78591858,  0.57017622,  0.23115252,  0.06164067],
       [ 0.77577075,  0.60712493,  0.16864581,  0.03372916],
       [ 0.80597792,  0.52151512,  0.26865931,  0.07901744],
       [ 0.776114  ,  0.54974742,  0.30721179,  0.03233808],
       [ 0.82647451,  0.4958847 ,  0.26447184,  0.03305898],
       [ 0.79778206,  0.5424918 ,  0.25529026,  0.06382256],
       [ 0.80641965,  0.54278246,  0.23262105,  0.03101614],
       [ 0.81609427,  0.5336001 ,  0.21971769,  0.03138824],
       [ 0.79524064,  0.54144043,  0.27072022,  0.03384003],
       [ 0.80846584,  0.52213419,  0.26948861,  0.03368608],
       [ 0.82225028,  0.51771314,  0.22840286,  0.06090743],
       [ 0.76578311,  0.60379053,  0.22089897,  0.0147266 ],
       [ 0.77867447,  0.59462414,  0.19820805,  0.02831544],
       [ 0.81803119,  0.51752994,  0.25041771,  0.01669451],
       [ 0.82512295,  0.52807869,  0.19802951,  0.03300492],
       [ 0.82699754,  0.52627116,  0.19547215,  0.03007264],
       [ 0.81803119,  0.51752994,  0.25041771,  0.01669451],
       [ 0.80212413,  0.54690282,  0.23699122,  0.03646019],
       [ 0.80779568,  0.53853046,  0.23758697,  0.03167826],
       [ 0.80033301,  0.56023311,  0.20808658,  0.04801998],
       [ 0.86093857,  0.44003527,  0.24871559,  0.0573959 ],
       [ 0.78609038,  0.57170209,  0.23225397,  0.03573138],
       [ 0.78889479,  0.55222635,  0.25244633,  0.09466737],
       [ 0.76693897,  0.57144472,  0.28572236,  0.06015208],
       [ 0.82210585,  0.51381615,  0.23978087,  0.05138162],
       [ 0.77729093,  0.57915795,  0.24385598,  0.030482  ],
       [ 0.79594782,  0.55370283,  0.24224499,  0.03460643],
       [ 0.79837025,  0.55735281,  0.22595384,  0.03012718],
       [ 0.81228363,  0.5361072 ,  0.22743942,  0.03249135],
       [ 0.76701103,  0.35063361,  0.51499312,  0.15340221],
       [ 0.74549757,  0.37274878,  0.52417798,  0.17472599],
       [ 0.75519285,  0.33928954,  0.53629637,  0.16417236],
       [ 0.75384916,  0.31524601,  0.54825394,  0.17818253],
       [ 0.7581754 ,  0.32659863,  0.5365549 ,  0.17496355],
       [ 0.72232962,  0.35482858,  0.57026022,  0.16474184],
       [ 0.72634846,  0.38046824,  0.54187901,  0.18446945],
       [ 0.75916547,  0.37183615,  0.51127471,  0.15493173],
       [ 0.76301853,  0.33526572,  0.53180079,  0.15029153],
       [ 0.72460233,  0.37623583,  0.54345175,  0.19508524],
       [ 0.76923077,  0.30769231,  0.53846154,  0.15384615],
       [ 0.73923462,  0.37588201,  0.52623481,  0.187941  ],
       [ 0.78892752,  0.28927343,  0.52595168,  0.13148792],
       [ 0.73081412,  0.34743622,  0.56308629,  0.16772783],
       [ 0.75911707,  0.3931142 ,  0.48800383,  0.17622361],
       [ 0.76945444,  0.35601624,  0.50531337,  0.16078153],
       [ 0.70631892,  0.37838513,  0.5675777 ,  0.18919257],
       [ 0.75676497,  0.35228714,  0.53495455,  0.13047672],
       [ 0.76444238,  0.27125375,  0.55483721,  0.18494574],
       [ 0.76185188,  0.34011245,  0.53057542,  0.14964948],
       [ 0.6985796 ,  0.37889063,  0.56833595,  0.21312598],
       [ 0.77011854,  0.35349703,  0.50499576,  0.16412362],
       [ 0.74143307,  0.29421947,  0.57667016,  0.17653168],
       [ 0.73659895,  0.33811099,  0.56754345,  0.14490471],
       [ 0.76741698,  0.34773582,  0.51560829,  0.15588157],
       [ 0.76785726,  0.34902603,  0.51190484,  0.16287881],
       [ 0.76467269,  0.31486523,  0.53976896,  0.15743261],
       [ 0.74088576,  0.33173989,  0.55289982,  0.18798594],
       [ 0.73350949,  0.35452959,  0.55013212,  0.18337737],
       [ 0.78667474,  0.35883409,  0.48304589,  0.13801311],
       [ 0.76521855,  0.33391355,  0.52869645,  0.15304371],
       [ 0.77242925,  0.33706004,  0.51963422,  0.14044168],
       [ 0.76434981,  0.35581802,  0.51395936,  0.15814134],
       [ 0.70779525,  0.31850786,  0.60162596,  0.1887454 ],
       [ 0.69333409,  0.38518561,  0.57777841,  0.1925928 ],
       [ 0.71524936,  0.40530797,  0.53643702,  0.19073316],
       [ 0.75457341,  0.34913098,  0.52932761,  0.16893434],
       [ 0.77530021,  0.28304611,  0.54147951,  0.15998258],
       [ 0.72992443,  0.39103094,  0.53440896,  0.16944674],
       [ 0.74714194,  0.33960997,  0.54337595,  0.17659719],
       [ 0.72337118,  0.34195729,  0.57869695,  0.15782644],
       [ 0.73260391,  0.36029701,  0.55245541,  0.1681386 ],
       [ 0.76262994,  0.34186859,  0.52595168,  0.1577855 ],
       [ 0.76986879,  0.35413965,  0.5081134 ,  0.15397376],
       [ 0.73544284,  0.35458851,  0.55158213,  0.1707278 ],
       [ 0.73239618,  0.38547167,  0.53966034,  0.15418867],
       [ 0.73446047,  0.37367287,  0.5411814 ,  0.16750853],
       [ 0.75728103,  0.3542121 ,  0.52521104,  0.15878473],
       [ 0.78258054,  0.38361791,  0.4603415 ,  0.16879188],
       [ 0.7431482 ,  0.36505526,  0.5345452 ,  0.16948994],
       [ 0.65387747,  0.34250725,  0.62274045,  0.25947519],
       [ 0.69052512,  0.32145135,  0.60718588,  0.22620651],
       [ 0.71491405,  0.30207636,  0.59408351,  0.21145345],
       [ 0.69276796,  0.31889319,  0.61579374,  0.1979337 ],
       [ 0.68619022,  0.31670318,  0.61229281,  0.232249  ],
       [ 0.70953708,  0.28008043,  0.61617694,  0.1960563 ],
       [ 0.67054118,  0.34211284,  0.61580312,  0.23263673],
       [ 0.71366557,  0.28351098,  0.61590317,  0.17597233],
       [ 0.71414125,  0.26647062,  0.61821183,  0.19185884],
       [ 0.69198788,  0.34599394,  0.58626751,  0.24027357],
       [ 0.71562645,  0.3523084 ,  0.56149152,  0.22019275],
       [ 0.71576546,  0.30196356,  0.59274328,  0.21249287],
       [ 0.71718148,  0.31640359,  0.58007326,  0.22148252],
       [ 0.6925518 ,  0.30375079,  0.60750157,  0.24300063],
       [ 0.67767924,  0.32715549,  0.59589036,  0.28041899],
       [ 0.69589887,  0.34794944,  0.57629125,  0.25008866],
       [ 0.70610474,  0.3258945 ,  0.59747324,  0.1955367 ],
       [ 0.69299099,  0.34199555,  0.60299216,  0.19799743],
       [ 0.70600618,  0.2383917 ,  0.63265489,  0.21088496],
       [ 0.72712585,  0.26661281,  0.60593821,  0.18178146],
       [ 0.70558934,  0.32722984,  0.58287815,  0.23519645],
       [ 0.68307923,  0.34153961,  0.59769433,  0.24395687],
       [ 0.71486543,  0.25995106,  0.62202576,  0.18567933],
       [ 0.73122464,  0.31338199,  0.56873028,  0.20892133],
       [ 0.69595601,  0.3427843 ,  0.59208198,  0.21813547],
       [ 0.71529453,  0.31790868,  0.59607878,  0.17882363],
       [ 0.72785195,  0.32870733,  0.56349829,  0.21131186],
       [ 0.71171214,  0.35002236,  0.57170319,  0.21001342],
       [ 0.69594002,  0.30447376,  0.60894751,  0.22835532],
       [ 0.73089855,  0.30454106,  0.58877939,  0.1624219 ],
       [ 0.72766159,  0.27533141,  0.59982915,  0.18683203],
       [ 0.71578999,  0.34430405,  0.5798805 ,  0.18121266],
       [ 0.69417747,  0.30370264,  0.60740528,  0.2386235 ],
       [ 0.72366005,  0.32162669,  0.58582004,  0.17230001],
       [ 0.69385414,  0.29574111,  0.63698085,  0.15924521],
       [ 0.73154399,  0.28501714,  0.57953485,  0.21851314],
       [ 0.67017484,  0.36168166,  0.59571097,  0.2553047 ],
       [ 0.69804799,  0.338117  ,  0.59988499,  0.196326  ],
       [ 0.71066905,  0.35533453,  0.56853524,  0.21320072],
       [ 0.72415258,  0.32534391,  0.56672811,  0.22039426],
       [ 0.69997037,  0.32386689,  0.58504986,  0.25073566],
       [ 0.73337886,  0.32948905,  0.54206264,  0.24445962],
       [ 0.69052512,  0.32145135,  0.60718588,  0.22620651],
       [ 0.69193502,  0.32561648,  0.60035539,  0.23403685],
       [ 0.68914871,  0.33943145,  0.58629069,  0.25714504],
       [ 0.72155725,  0.32308533,  0.56001458,  0.24769876],
       [ 0.72965359,  0.28954508,  0.57909015,  0.22005426],
       [ 0.71653899,  0.3307103 ,  0.57323119,  0.22047353],
       [ 0.67467072,  0.36998072,  0.58761643,  0.25028107],
       [ 0.69025916,  0.35097923,  0.5966647 ,  0.21058754]])

In [ ]: