In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
In [57]:
# Read the training and test tables from CSV files in the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./regression_train.csv', './regression_test.csv')
)
# Stack both frames row-wise. ignore_index is not passed, so every row keeps
# the label it had in its source frame.
merge = pd.concat((train, test))
In [58]:
# Preview the first rows of the training frame (head() defaults to n=5).
train.head()
Out[58]:
In [59]:
# Per-column summary statistics of the training frame, using describe()'s defaults.
train.describe()
Out[59]:
发现特征10的std与mean相差约十倍,这个特征应该有特别之处,我们来看看它到底特别在哪里。
In [60]:
# Scatter plot with the target 'y' on the horizontal axis and column '10' on the vertical axis.
train.plot(x='y', y='10', kind='scatter')
Out[60]:
可以看到,上图中y在0附近时,特征10的值都比较大。
In [61]:
# 3D scatter of the target 'y' against columns '1' and '2'.
fig = plt.figure()
# Create the 3D axes through the figure so they are actually attached to it.
# Constructing Axes3D(fig) directly is deprecated since Matplotlib 3.4 and on
# current releases no longer adds the axes to the figure, leaving it empty.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(train['y'], train['1'], train['2'], c='r')
ax.set_xlabel('y')
ax.set_ylabel('1')
ax.set_zlabel('2')
plt.show()
In [62]:
# 3D scatter of columns '4', '5' and '6' against each other.
fig = plt.figure()
# Create the 3D axes through the figure so they are actually attached to it.
# Constructing Axes3D(fig) directly is deprecated since Matplotlib 3.4 and on
# current releases no longer adds the axes to the figure, leaving it empty.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(train['4'], train['5'], train['6'], c='r')
ax.set_xlabel('4')
ax.set_ylabel('5')
ax.set_zlabel('6')
plt.show()
In [65]:
# Keep only the training rows whose target 'y' is strictly greater than 1,
# then preview the first rows of that subset.
tmptrain = train.loc[train['y'].gt(1)]
tmptrain.head()
Out[65]:
In [82]:
# Order the filtered rows by column '1' so the line is drawn left to right,
# then plot column '2' against column '1' on a very wide canvas.
(
    tmptrain
    .sort_values(by='1')
    .plot(x='1', y='2', kind='line', figsize=(100, 10))
)
Out[82]:
In [ ]: