In [16]:
# Tabular data loading and manipulation
import pandas as pd
# Array and matrix handling
import numpy as np
# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# Decision trees
from sklearn import tree
# Linear models
from sklearn import linear_model
# Legacy cross-validation module: deprecated in scikit-learn 0.18 and removed
# in 0.20, so tolerate its absence instead of failing the whole import cell.
try:
    from sklearn import cross_validation
except ImportError:
    cross_validation = None
from sklearn.linear_model import LinearRegression
# Supported location of train_test_split (scikit-learn >= 0.18)
from sklearn.model_selection import train_test_split
# 在代码中画图
%matplotlib inline
In [17]:
# Load the CSV file into a DataFrame.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
BOOK_CSV_PATH = "/Users/page/data/book/newbook.csv"
book = pd.read_csv(BOOK_CSV_PATH)
In [18]:
# Preview the first five rows to sanity-check the load.
book.head(n=5)
Out[18]:
In [48]:
# Predictor columns fed to the regression (order is preserved in the design matrix).
feature_columns = [
    'CBS_XS_MY',
    'CBS_DXPZ',
    'CBS_XSCS',
    'CBS_CRTAX',
    'CBS_BK_Average_price',
    'CBS_BK_MY_Order',
    'CBS_BK_MY_Rate',
    'CBS_BK_DXPZ',
    'User_Area',
    'User_Get_Price',
    'FXS_Discount',
    'FXS_BK_MY_Order',
    'FXS_BK_MY_Rate',
    'FXS_BK_MY',
    'FXS_BK_DXPZ',
    'FXS_BK_XSCS',
    'FXS_BK_Average_Price',
]
# Column the model is trained to predict.
target_column = 'CBS_BK_XSCS'

X = book[feature_columns]
# Double brackets keep y as a one-column DataFrame rather than a Series.
y = book[[target_column]]
In [49]:
# Split features and target into train and test partitions; random_state=1
# makes the shuffle reproducible.
# train_test_split is taken from sklearn.model_selection (imported in the
# top import cell): the old sklearn.cross_validation module was removed in
# scikit-learn 0.20.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
In [50]:
# Report the shape of each partition, in the order: X_train, X_test, y_train, y_test.
for partition in (X_train, X_test, y_train, y_test):
    print(partition.shape)
In [51]:
# Create the linear regression estimator and fit it on the training partition.
# The fit call is left as the cell's last expression so the fitted estimator is displayed.
linreg = LinearRegression()
linreg.fit(X=X_train, y=y_train)
Out[51]:
In [52]:
# Show the fitted intercept followed by the coefficient array.
for fitted_parameter in (linreg.intercept_, linreg.coef_):
    print(fitted_parameter)
In [53]:
# Evaluate the fitted model on the held-out test partition.
from sklearn import metrics

# Predictions for the test features.
y_pred = linreg.predict(X_test)
# Mean squared error, computed once and reused for the RMSE below.
test_mse = metrics.mean_squared_error(y_test, y_pred)
print("MSE:", test_mse)
# Root mean squared error.
print("RMSE:", np.sqrt(test_mse))
In [54]:
# 10-fold cross-validated predictions over the full data set, scored with MSE and RMSE.
from sklearn.model_selection import cross_val_predict

predicted = cross_val_predict(linreg, X, y, cv=10)
# Mean squared error of the cross-validated predictions, reused for the RMSE.
cv_mse = metrics.mean_squared_error(y, predicted)
print("MSE:", cv_mse)
# Root mean squared error.
print("RMSE:", np.sqrt(cv_mse))
In [55]:
# Measured-vs-predicted scatter with a dashed reference line running from
# (min, min) to (max, max) of the measured target.
fig, ax = plt.subplots()
ax.scatter(y, predicted)
y_low, y_high = y.min(), y.max()
ax.plot([y_low, y_high], [y_low, y_high], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
In [35]:
# Scatter a single predictor (FXS_BK_DXPZ) against the regression target.
# Distinct names are used on purpose: reassigning X here would silently replace
# the 17-column feature DataFrame that the modelling cells depend on, so any
# re-run of those cells would train on one column only.
dxpz_values = np.array(book[['FXS_BK_DXPZ']])
target_values = np.array(y)
# Print both shapes; a bare expression in the middle of a cell displays nothing.
print(dxpz_values.shape, target_values.shape)
# Use the STXihei font at size 5 for figures (this changes matplotlib's global rc settings).
plt.rc('font', family='STXihei', size=5)
# Red circular markers of size 30, slightly transparent.
plt.scatter(dxpz_values, target_values, s=30, color='red', marker='o', linewidth=2, alpha=0.8)
# NOTE(review): the x-axis label says 'Average_price' but the plotted column is
# FXS_BK_DXPZ — confirm which of the two was intended.
plt.xlabel('Average_price')
# y-axis label
plt.ylabel('sell')
# Figure title
plt.title('ana')
# Light dashed grid on both axes.
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='both', alpha=0.4)
# Render the figure.
plt.show()
In [37]:
# Fit a second LinearRegression estimator on the training partition
# (same estimator class and training data as `linreg`).
from sklearn import linear_model

clf = linear_model.LinearRegression()
# Left as the last expression so the fitted estimator is displayed.
clf.fit(X=X_train, y=y_train)
In [ ]: