In [11]:
import numpy as np
import pandas as pd
# rawArray = np.loadtxt('../data/vehicleinfo/imports-85.data', dtype=object ,delimiter=',')

In [12]:
df = pd.read_csv('../data/vehicleinfo/imports-85.data',delimiter=',', sep='\t', header=None)

In [13]:
df.head()


Out[13]:
0 1 2 3 4 5 6 7 8 9 ... 16 17 18 19 20 21 22 23 24 25
0 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 1 ? alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500
3 2 164 audi gas std four sedan fwd front 99.8 ... 109 mpfi 3.19 3.40 10.0 102 5500 24 30 13950
4 2 164 audi gas std four sedan 4wd front 99.4 ... 136 mpfi 3.19 3.40 8.0 115 5500 18 22 17450

5 rows × 26 columns


In [14]:
# 将? 转换为None
df.replace(to_replace='?',value=np.nan,inplace=True)

In [15]:
df.head()


Out[15]:
0 1 2 3 4 5 6 7 8 9 ... 16 17 18 19 20 21 22 23 24 25
0 3 NaN alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 3 NaN alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 1 NaN alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500
3 2 164 audi gas std four sedan fwd front 99.8 ... 109 mpfi 3.19 3.40 10.0 102 5500 24 30 13950
4 2 164 audi gas std four sedan 4wd front 99.4 ... 136 mpfi 3.19 3.40 8.0 115 5500 18 22 17450

5 rows × 26 columns


In [16]:
# 表头建立
df.columns = ['symboling','normalized-losses','make','fuel-type','aspiration','num-of-doors','body-style','drive-wheels','engine-location','wheel-base','length','width','height','curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']

In [17]:
df.head()


Out[17]:
symboling normalized-losses make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base ... engine-size fuel-system bore stroke compression-ratio horsepower peak-rpm city-mpg highway-mpg price
0 3 NaN alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 3 NaN alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 1 NaN alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500
3 2 164 audi gas std four sedan fwd front 99.8 ... 109 mpfi 3.19 3.40 10.0 102 5500 24 30 13950
4 2 164 audi gas std four sedan 4wd front 99.4 ... 136 mpfi 3.19 3.40 8.0 115 5500 18 22 17450

5 rows × 26 columns


In [18]:
# 选择部分特征值作为实验数据
data = df[['wheel-base','length','width','height','curb-weight','engine-size','bore','stroke','compression-ratio','horsepower','peak-rpm','highway-mpg']]

In [39]:
data_after=df[['wheel-base','length','width','height','curb-weight','engine-size','stroke','compression-ratio','peak-rpm']]
data_after.head()


Out[39]:
wheel-base length width height curb-weight engine-size stroke compression-ratio peak-rpm
0 88.6 168.8 64.1 48.8 2548 130 2.68 9.0 5000
1 88.6 168.8 64.1 48.8 2548 130 2.68 9.0 5000
2 94.5 171.2 65.5 52.4 2823 152 3.47 9.0 5000
3 99.8 176.6 66.2 54.3 2337 109 3.40 10.0 5500
4 99.4 176.6 66.4 54.3 2824 136 3.40 8.0 5500

In [19]:
data.shape


Out[19]:
(205, 12)

In [20]:
# 新增两条随机数据作为实验
data2 = data.copy()
# 生成随机数据
random01 = np.random.rand(data.shape[0])
random02 = np.random.rand(data.shape[0])
data2['random01']=random01
data2['random02']=random02
data2.head()


Out[20]:
wheel-base length width height curb-weight engine-size bore stroke compression-ratio horsepower peak-rpm highway-mpg random01 random02
0 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111 5000 27 0.100502 0.246372
1 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111 5000 27 0.753619 0.172148
2 94.5 171.2 65.5 52.4 2823 152 2.68 3.47 9.0 154 5000 26 0.787117 0.673282
3 99.8 176.6 66.2 54.3 2337 109 3.19 3.40 10.0 102 5500 30 0.199307 0.009581
4 99.4 176.6 66.4 54.3 2824 136 3.19 3.40 8.0 115 5500 22 0.444355 0.247614

In [21]:
data2= data2.values
data2 = data2.astype(np.float)
# 去除Nan的行
data2=np.delete(data2,np.where(np.isnan(data2))[0],axis=0)
Y= data2[:,-3]
X = np.delete(data2, -3, 1)

In [22]:
# 归一化处理
X1 = (X-np.mean(X,axis=0))/np.std(X,axis=0)
Y = (Y-np.mean(Y))/np.std(Y)
X1.shape,Y.shape


Out[22]:
((199, 13), (199,))

In [23]:
Y.reshape((199,1))
Y.shape


Out[23]:
(199,)

In [24]:
X1-np.std(X1,axis=0)


Out[24]:
array([[-2.68891279, -1.42539008, -1.8317241 , ..., -1.2311026 ,
        -2.22590236, -1.77142243],
       [-2.68891279, -1.42539008, -1.8317241 , ..., -1.2311026 ,
        -0.05562281, -2.02037631],
       [-1.71702017, -1.23274234, -1.18680806, ..., -1.2311026 ,
         0.05569089, -0.3395264 ],
       ...,
       [ 0.68800224,  0.18000771,  0.37941663, ..., -0.15908823,
        -1.17343659, -1.73044764],
       [ 0.68800224,  0.18000771,  0.37941663, ..., -1.65990834,
        -0.21819131, -2.21695705],
       [ 0.68800224,  0.18000771,  0.37941663, ..., -0.3734911 ,
        -1.59882625, -1.38431351]])

r = \frac{\sum (x - m_x) (y - m_y)} {\sqrt{\sum (x - m_x)^2 \sum (y - m_y)^2}}


In [25]:
from scipy.stats import pearsonr
def pearsonrs(x,y):
    pears = []
    # 循环列处理
    for arr in x.T:
        pears.append(pearsonr(arr,Y)[0])
    return np.array(pears)

In [26]:
pears = np.abs(pearsonrs(X1,Y))
index = np.argsort(-pears)
pears,index


Out[26]:
(array([0.56658047, 0.72508504, 0.68858184, 0.15241849, 0.8148516 ,
        0.72867957, 0.59532071, 0.04577927, 0.26468372, 0.77903191,
        0.0136203 , 0.05790304, 0.05676424]),
 array([ 4,  9,  5,  1,  2,  6,  0,  8,  3, 11, 12,  7, 10], dtype=int64))

In [38]:
# 选取相关系数大于0.1的,即前9个特征
X2 = X1[:,index[0:9]]
X2.shape


Out[38]:
(199, 9)

In [28]:
# 1,分别将数据分为训练集和测试集,随机分配
X2Y = np.insert(X2,0,Y,axis=1)
X1Y= np.insert(X1,0,Y,axis=1)
# 过滤后
np.array(np.random.shuffle(X2Y))

trp = X2Y[:140,:]
tep =X2Y[140:,:]
# 原始
np.random.shuffle(X1Y)
trraw= X1Y[:140,:]
teraw= X1Y[140:,:]
trp.shape,tep.shape,trraw.shape,teraw.shape


Out[28]:
((140, 10), (59, 10), (140, 14), (59, 14))

分别使用svr训练模型


In [29]:
# 三种不同的实现方式: SVR,NuSVR和LinearSVR。LinearSVR 提供的实现速度比SVR线性内核快,但仅考虑线性内核,而NuSVR实现的方式与SVR和略有不同LinearSVR。
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# C:一个浮点数,惩罚参数
# loss:字符串。表示损失函数。可取值为
# 'hinge':合页损失函数;
# 'squared_hinge':合页损失函数的平方
# max_iter:一个整数,指定最大的迭代次数
# svc = svm.SVR(C=1, kernel='rbf')
regr1 = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.1))
regr1.fit(trp[:,1:], trp[:,0])
print(regr1.score(tep[:,1:],tep[:,0]))# 预测准确度
# 
regr2 = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.1))
regr2.fit(trraw[:,1:], trraw[:,0])
print(regr2.score(teraw[:,1:],teraw[:,0]))# 预测准确度


0.8260793187195429
0.7176690210157035

In [30]:
# 比较预测准确度可以看到
# 去除权重较小特征值后,两者的准确度非常接近,证明效果不错
# 随机挑选预测:Yba 和Y
regr1.predict([X2[3,:]]),Y[3]


Out[30]:
(array([0.05418867]), -0.131404085206265)

In [31]:
# 引入pca压缩,再次压缩
# PCA的算法相当简单。 在确保数据被归一化之后,输出仅仅是原始数据的协方差矩阵的奇异值分解
def pca(X):
    X = np.matrix(X)
    cov = (X.T * X) / X.shape[0]
    # perform SVD
    U, S, V = np.linalg.svd(cov)
    return U, S, V

In [32]:
# 有效地减少了维数。
def project_data(X, U, k):
    U_reduced = U[:,:k]
    return np.dot(X, U_reduced)

In [33]:
# 划分k=4,训练集
z=trp[:,1:]
U, S, V = pca(z)
Z = project_data(z, U, 4)
yz=trp[:,0]
# 测试集
tz=tep[:,1:]
U1, S1, V1 = pca(tz)
tz = project_data(tz, U1, 4)
tyz=tep[:,0]

In [34]:
regr3 = make_pipeline(StandardScaler(), SVR(C=1.0, epsilon=0.1))
regr3.fit(Z, yz)
print(regr3.score(tz,tyz))# 预测准确度


-0.003784079367260107

皮尔逊相关系数定义为两个变量之间的协方差和标准差的商:

总体相关系数,常用希腊小写字母ρ 作为代表符号。估算样本的协方差和标准差,可得到皮尔逊相关系数,常用英文小写字母 r

皮尔逊相关系数有一个重要的数学特性是,因两个变量的位置和尺度的变化并不会引起该系数的改变,即它该变化的不变量(由符号确定)。

svd


In [35]:
求相关系数矩阵 rij,当0<i,j<n 其中n为原数据集类别数量

求得相关矩阵后分别计算每个特征值对整体影响带入函数计算整理权重值w

得到相关权重W矩阵后适当去除特征值相关权重较小的特征集合

剩下的数据集分离要预测的结果集将剩下特征集做PCA主要成分分析),得到训练需要的维度K

利用PCA降维后车辆数据后通过SVR回归训练

训练后测试集验证是否满足


  File "<ipython-input-35-a1bd9b6c7617>", line 1
    求相关系数矩阵 rij,当0<i,j<n 其中n为原数据集类别数量
            ^
SyntaxError: invalid syntax