notebook.community

Edit and run



In [11]:

    
import numpy as np
import pandas as pd
# rawArray = np.loadtxt('../data/vehicleinfo/imports-85.data', dtype=object ,delimiter=',')



In [12]:

    
df = pd.read_csv('../data/vehicleinfo/imports-85.data',delimiter=',', sep='\t', header=None)



In [13]:

    
df.head()









    Out[13]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
    
  
  
    
      0
      3
      ?
      alfa-romero
      gas
      std
      two
      convertible
      rwd
      front
      88.6
      ...
      130
      mpfi
      3.47
      2.68
      9.0
      111
      5000
      21
      27
      13495
    
    
      1
      3
      ?
      alfa-romero
      gas
      std
      two
      convertible
      rwd
      front
      88.6
      ...
      130
      mpfi
      3.47
      2.68
      9.0
      111
      5000
      21
      27
      16500
    
    
      2
      1
      ?
      alfa-romero
      gas
      std
      two
      hatchback
      rwd
      front
      94.5
      ...
      152
      mpfi
      2.68
      3.47
      9.0
      154
      5000
      19
      26
      16500
    
    
      3
      2
      164
      audi
      gas
      std
      four
      sedan
      fwd
      front
      99.8
      ...
      109
      mpfi
      3.19
      3.40
      10.0
      102
      5500
      24
      30
      13950
    
    
      4
      2
      164
      audi
      gas
      std
      four
      sedan
      4wd
      front
      99.4
      ...
      136
      mpfi
      3.19
      3.40
      8.0
      115
      5500
      18
      22
      17450
    
  

5 rows × 26 columns



In [14]:

    
# 将？ 转换为None
df.replace(to_replace='?',value=np.nan,inplace=True)



In [15]:

    
df.head()









    Out[15]:







  
    
      
      0
      1
      2
      3
      4
      5
      6
      7
      8
      9
      ...
      16
      17
      18
      19
      20
      21
      22
      23
      24
      25
    
  
  
    
      0
      3
      NaN
      alfa-romero
      gas
      std
      two
      convertible
      rwd
      front
      88.6
      ...
      130
      mpfi
      3.47
      2.68
      9.0
      111
      5000
      21
      27
      13495
    
    
      1
      3
      NaN
      alfa-romero
      gas
      std
      two
      convertible
      rwd
      front
      88.6
      ...
      130
      mpfi
      3.47
      2.68
      9.0
      111
      5000
      21
      27
      16500
    
    
      2
      1
      NaN
      alfa-romero
      gas
      std
      two
      hatchback
      rwd
      front
      94.5
      ...
      152
      mpfi
      2.68
      3.47
      9.0
      154
      5000
      19
      26
      16500
    
    
      3
      2
      164
      audi
      gas
      std
      four
      sedan
      fwd
      front
      99.8
      ...
      109
      mpfi
      3.19
      3.40
      10.0
      102
      5500
      24
      30
      13950
    
    
      4
      2
      164
      audi
      gas
      std
      four
      sedan
      4wd
      front
      99.4
      ...
      136
      mpfi
      3.19
      3.40
      8.0
      115
      5500
      18
      22
      17450
    
  

5 rows × 26 columns



In [16]:

    
# 表头建立
df.columns = ['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors','body-style','drive-wheels','engine-location','wheel-base','length','width','height','curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']



In [17]:

    
df.head()









    Out[17]:







  
    
      
      symboling
      normalized-losses
      make
      fuel-type
      aspiration
      num-of-doors
      body-style
      drive-wheels
      engine-location
      wheel-base
      ...
      engine-size
      fuel-system
      bore
      stroke
      compression-ratio
      horsepower
      peak-rpm
      city-mpg
      highway-mpg
      price
    
  
  
    
      0
      3
      NaN
      alfa-romero
      gas
      std
      two
      convertible
      rwd
      front
      88.6
      ...
      130
      mpfi
      3.47
      2.68
      9.0
      111
      5000
      21
      27
      13495
    
    
      1
      3
      NaN
      alfa-romero
      gas
      std
      two
      convertible
      rwd
      front
      88.6
      ...
      130
      mpfi
      3.47
      2.68
      9.0
      111
      5000
      21
      27
      16500
    
    
      2
      1
      NaN
      alfa-romero
      gas
      std
      two
      hatchback
      rwd
      front
      94.5
      ...
      152
      mpfi
      2.68
      3.47
      9.0
      154
      5000
      19
      26
      16500
    
    
      3
      2
      164
      audi
      gas
      std
      four
      sedan
      fwd
      front
      99.8
      ...
      109
      mpfi
      3.19
      3.40
      10.0
      102
      5500
      24
      30
      13950
    
    
      4
      2
      164
      audi
      gas
      std
      four
      sedan
      4wd
      front
      99.4
      ...
      136
      mpfi
      3.19
      3.40
      8.0
      115
      5500
      18
      22
      17450
    
  

5 rows × 26 columns



In [18]:

    
# 选择部分特征值作为实验数据
data = df[['wheel-base','length','width','height','curb-weight','engine-size','bore','stroke','compression-ratio','horsepower','peak-rpm','highway-mpg']]



In [39]:

    
data_after=df[['wheel-base','length','width','height','curb-weight','engine-size','stroke','compression-ratio','peak-rpm']]
data_after.head()









    Out[39]:







  
    
      
      wheel-base
      length
      width
      height
      curb-weight
      engine-size
      stroke
      compression-ratio
      peak-rpm
    
  
  
    
      0
      88.6
      168.8
      64.1
      48.8
      2548
      130
      2.68
      9.0
      5000
    
    
      1
      88.6
      168.8
      64.1
      48.8
      2548
      130
      2.68
      9.0
      5000
    
    
      2
      94.5
      171.2
      65.5
      52.4
      2823
      152
      3.47
      9.0
      5000
    
    
      3
      99.8
      176.6
      66.2
      54.3
      2337
      109
      3.40
      10.0
      5500
    
    
      4
      99.4
      176.6
      66.4
      54.3
      2824
      136
      3.40
      8.0
      5500



In [19]:

    
data.shape









    Out[19]:





(205, 12)



In [20]:

    
# 新增两条随机数据作为实验
data2 = data.copy()
# 生成随机数据
random01 = np.random.rand(data.shape[0])
random02 = np.random.rand(data.shape[0])
data2['random01']=random01
data2['random02']=random02
data2.head()









    Out[20]:







  
    
      
      wheel-base
      length
      width
      height
      curb-weight
      engine-size
      bore
      stroke
      compression-ratio
      horsepower
      peak-rpm
      highway-mpg
      random01
      random02
    
  
  
    
      0
      88.6
      168.8
      64.1
      48.8
      2548
      130
      3.47
      2.68
      9.0
      111
      5000
      27
      0.100502
      0.246372
    
    
      1
      88.6
      168.8
      64.1
      48.8
      2548
      130
      3.47
      2.68
      9.0
      111
      5000
      27
      0.753619
      0.172148
    
    
      2
      94.5
      171.2
      65.5
      52.4
      2823
      152
      2.68
      3.47
      9.0
      154
      5000
      26
      0.787117
      0.673282
    
    
      3
      99.8
      176.6
      66.2
      54.3
      2337
      109
      3.19
      3.40
      10.0
      102
      5500
      30
      0.199307
      0.009581
    
    
      4
      99.4
      176.6
      66.4
      54.3
      2824
      136
      3.19
      3.40
      8.0
      115
      5500
      22
      0.444355
      0.247614



In [21]:

    
data2= data2.values
data2 = data2.astype(np.float)
# 去除Nan的行
data2=np.delete(data2,np.where(np.isnan(data2))[0],axis=0)
Y= data2[:,-3]
X = np.delete(data2, -3, 1)



In [22]:

    
# 归一化处理
X1 = (X-np.mean(X,axis=0))/np.std(X,axis=0)
Y = (Y-np.mean(Y))/np.std(Y)
X1.shape,Y.shape









    Out[22]:





((199, 13), (199,))



In [23]:

    
Y.reshape((199,1))
Y.shape









    Out[23]:





(199,)



In [24]:

    
X1-np.std(X1,axis=0)









    Out[24]:





array([[-2.68891279, -1.42539008, -1.8317241 , ..., -1.2311026 ,
        -2.22590236, -1.77142243],
       [-2.68891279, -1.42539008, -1.8317241 , ..., -1.2311026 ,
        -0.05562281, -2.02037631],
       [-1.71702017, -1.23274234, -1.18680806, ..., -1.2311026 ,
         0.05569089, -0.3395264 ],
       ...,
       [ 0.68800224,  0.18000771,  0.37941663, ..., -0.15908823,
        -1.17343659, -1.73044764],
       [ 0.68800224,  0.18000771,  0.37941663, ..., -1.65990834,
        -0.21819131, -2.21695705],
       [ 0.68800224,  0.18000771,  0.37941663, ..., -0.3734911 ,
        -1.59882625, -1.38431351]])

r = \frac{\sum (x - m_x) (y - m_y)} {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}



In [25]:

    
from scipy.stats import pearsonr
def pearsonrs(x,y):
    pears = []
    # 循环列处理
    for arr in x.T:
        pears.append(pearsonr(arr,Y)[0])
    return np.array(pears)



In [26]:

    
pears = np.abs(pearsonrs(X1,Y))
index = np.argsort(-pears)
pears,index









    Out[26]:





(array([0.56658047, 0.72508504, 0.68858184, 0.15241849, 0.8148516 ,
        0.72867957, 0.59532071, 0.04577927, 0.26468372, 0.77903191,
        0.0136203 , 0.05790304, 0.05676424]),
 array([ 4,  9,  5,  1,  2,  6,  0,  8,  3, 11, 12,  7, 10], dtype=int64))



In [38]:

    
# 选取相关系数大于0.1的，即前9个特征
X2 = X1[:,index[0:9]]
X2.shape









    Out[38]:





(199, 9)



In [28]:

    
# 1，分别将数据分为训练集和测试集,随机分配
X2Y = np.insert(X2,0,Y,axis=1)
X1Y= np.insert(X1,0,Y,axis=1)
# 过滤后
np.array(np.random.shuffle(X2Y))

trp = X2Y[:140,:]
tep =X2Y[140:,:]
# 原始
np.random.shuffle(X1Y)
trraw= X1Y[:140,:]
teraw= X1Y[140:,:]
trp.shape,tep.shape,trraw.shape,teraw.shape









    Out[28]:





((140, 10), (59, 10), (140, 14), (59, 14))

分别使用svr训练模型



In [29]:

    
# 三种不同的实现方式： SVR，NuSVR和LinearSVR。LinearSVR 提供的实现速度比SVR线性内核快，但仅考虑线性内核，而NuSVR实现的方式与SVR和略有不同LinearSVR。
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# C：一个浮点数，惩罚参数
# loss：字符串。表示损失函数。可取值为
# 'hinge'：合页损失函数；
# 'squared_hinge'：合页损失函数的平方
# max_iter:一个整数，指定最大的迭代次数
# svc = svm.SVR(C=1, kernel='rbf')
regr1 = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.1))
regr1.fit(trp[:,1:], trp[:,0])
print(regr1.score(tep[:,1:],tep[:,0]))# 预测准确度
# 
regr2 = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.1))
regr2.fit(trraw[:,1:], trraw[:,0])
print(regr2.score(teraw[:,1:],teraw[:,0]))# 预测准确度









    



0.8260793187195429
0.7176690210157035



In [30]:

    
# 比较预测准确度可以看到
# 去除权重较小特征值后，两者的准确度非常接近，证明效果不错
# 随机挑选预测：Yba 和Y
regr1.predict([X2[3,:]]),Y[3]









    Out[30]:





(array([0.05418867]), -0.131404085206265)



In [31]:

    
# 引入pca压缩，再次压缩
# PCA的算法相当简单。 在确保数据被归一化之后，输出仅仅是原始数据的协方差矩阵的奇异值分解
def pca(X):
    X = np.matrix(X)
    cov = (X.T * X) / X.shape[0]
    # perform SVD
    U, S, V = np.linalg.svd(cov)
    return U, S, V



In [32]:

    
# 有效地减少了维数。
def project_data(X, U, k):
    U_reduced = U[:,:k]
    return np.dot(X, U_reduced)



In [33]:

    
# 划分k=4,训练集
z=trp[:,1:]
U, S, V = pca(z)
Z = project_data(z, U, 4)
yz=trp[:,0]
# 测试集
tz=tep[:,1:]
U1, S1, V1 = pca(tz)
tz = project_data(tz, U1, 4)
tyz=tep[:,0]



In [34]:

    
regr3 = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.1))
regr3.fit(Z, yz)
print(regr3.score(tz,tyz))# 预测准确度









    



-0.003784079367260107

皮尔逊相关系数定义为两个变量之间的协方差和标准差的商：

总体相关系数，常用希腊小写字母ρ 作为代表符号。估算样本的协方差和标准差，可得到皮尔逊相关系数，常用英文小写字母 r

皮尔逊相关系数有一个重要的数学特性是，因两个变量的位置和尺度的变化并不会引起该系数的改变，即它该变化的不变量(由符号确定)。

svd



In [35]:

    
求相关系数矩阵 rij,当0<i,j<n 其中n为原数据集类别数量

求得相关矩阵后分别计算每个特征值对整体影响带入函数计算整理权重值w

得到相关权重W矩阵后，适当去除特征值相关权重较小的特征集合

剩下的数据集分离要预测的结果集，将剩下特征集做PCA（主要成分分析），得到训练需要的维度K

利用PCA降维后车辆数据后通过SVR回归训练

训练后测试集验证是否满足









    



  File "<ipython-input-35-a1bd9b6c7617>", line 1
    求相关系数矩阵 rij,当0<i,j<n 其中n为原数据集类别数量
            ^
SyntaxError: invalid syntax

	0	1	2	3	4	5	6	7	8	9	...	16	17	18	19	20	21	22	23	24	25
0	3	?	alfa-romero	gas	std	two	convertible	rwd	front	88.6	...	130	mpfi	3.47	2.68	9.0	111	5000	21	27	13495
1	3	?	alfa-romero	gas	std	two	convertible	rwd	front	88.6	...	130	mpfi	3.47	2.68	9.0	111	5000	21	27	16500
2	1	?	alfa-romero	gas	std	two	hatchback	rwd	front	94.5	...	152	mpfi	2.68	3.47	9.0	154	5000	19	26	16500
3	2	164	audi	gas	std	four	sedan	fwd	front	99.8	...	109	mpfi	3.19	3.40	10.0	102	5500	24	30	13950
4	2	164	audi	gas	std	four	sedan	4wd	front	99.4	...	136	mpfi	3.19	3.40	8.0	115	5500	18	22	17450

	wheel-base	length	width	height	curb-weight	engine-size	stroke	compression-ratio	peak-rpm
0	88.6	168.8	64.1	48.8	2548	130	2.68	9.0	5000
1	88.6	168.8	64.1	48.8	2548	130	2.68	9.0	5000
2	94.5	171.2	65.5	52.4	2823	152	3.47	9.0	5000
3	99.8	176.6	66.2	54.3	2337	109	3.40	10.0	5500
4	99.4	176.6	66.4	54.3	2824	136	3.40	8.0	5500