In [56]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

In [57]:
# Read the train and test splits from CSV files located next to the notebook.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./regression_train.csv', './regression_test.csv')
)
# Stack both splits row-wise; each split keeps its own original row labels.
merge = pd.concat(objs=[train, test])

In [58]:
# Show the first five rows of the training frame as a quick sanity check.
train.head(n=5)


Out[58]:
y 1 2 3 4 5 6 7 8 9 10 11 12
0 90 6 2 1036 103 114 1.00 1.00 172076 355965 2.0 6527 1851864
1 88 1 0 2165 205 101 0.40 1.20 43107 44139 3.0 130 1131931
2 85 62 77 3806 258 166 1.40 1.40 492142 268706 5.2 256 1314590
3 81 5 0 4721 256 177 0.99 2.58 524787 174964 1.0 233 972606
4 79 42 55 3949 249 244 2.60 4.60 197289 529200 3.4 331 1013805

In [59]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
train.describe(percentiles=[0.25, 0.5, 0.75])


Out[59]:
y 1 2 3 4 5 6 7 8 9 10 11 12
count 5885.000000 5885.000000 5885.000000 5885.000000 5885.000000 5885.000000 5885.000000 5885.000000 5.885000e+03 5.885000e+03 5885.000000 5885.000000 5.885000e+03
mean 83.882243 18.482923 12.758369 2317.068649 209.980799 149.201869 1.910194 2.832087 1.966200e+05 9.465236e+04 20.375412 1773.934919 1.329557e+06
std 18.503814 43.389243 28.283702 1649.238358 184.784991 142.088398 2.529381 5.314881 2.405632e+05 1.385465e+05 132.611086 2486.094990 4.238756e+05
min 0.000000 0.000000 0.000000 125.000000 6.000000 7.000000 0.000000 0.000000 4.160000e+02 1.498000e+03 1.000000 55.000000 2.000000e+00
25% 81.000000 2.000000 0.000000 998.000000 85.000000 62.000000 0.400000 0.200000 3.439000e+04 2.288400e+04 1.200000 232.000000 1.042893e+06
50% 89.000000 7.000000 1.000000 2057.000000 166.000000 117.000000 0.800000 1.200000 1.234760e+05 4.666500e+04 2.000000 579.000000 1.293494e+06
75% 94.000000 19.000000 10.000000 3346.000000 280.000000 186.000000 2.400000 2.800000 2.653190e+05 1.053070e+05 3.000000 2029.000000 1.732208e+06
max 99.000000 825.000000 543.000000 12493.000000 2503.000000 2485.000000 20.120000 59.560000 2.526649e+06 1.801623e+06 2823.000000 12012.000000 1.892571e+06

从上表可以发现,特征10的标准差(约132.6)是均值(约20.4)的6.5倍左右,而且最大值高达2823,75%分位数却只有3.0,说明这个特征存在少量极端大值,应该有特别之处。下面我们来看看它到底有什么特别。


In [60]:
# Scatter plot of the target 'y' (x-axis) against feature '10' (y-axis).
train.plot(x='y', y='10', kind='scatter')


Out[60]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fcae069e8>

从上图可以看到,当 y 接近 0 时,特征10的取值都比较大。


In [61]:
# 3D scatter of the target 'y' against features '1' and '2'.
fig = plt.figure()
# Create the 3D axes through the figure so they are actually attached to it.
# Constructing Axes3D(fig) directly no longer adds the axes to the figure in
# recent Matplotlib releases, which leaves the rendered plot empty.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(train.loc[:, 'y'], train.loc[:, '1'], train.loc[:, '2'], c='r')

ax.set_xlabel('y')
ax.set_ylabel('1')
ax.set_zlabel('2')

plt.show()



In [62]:
# 3D scatter of features '4', '5' and '6' against each other.
fig = plt.figure()
# Create the 3D axes through the figure so they are actually attached to it.
# Constructing Axes3D(fig) directly no longer adds the axes to the figure in
# recent Matplotlib releases, which leaves the rendered plot empty.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(train.loc[:, '4'], train.loc[:, '5'], train.loc[:, '6'], c='r')

ax.set_xlabel('4')
ax.set_ylabel('5')
ax.set_zlabel('6')

plt.show()



In [65]:
# Keep only the rows whose target 'y' is strictly greater than 1,
# then preview the first five rows of the filtered frame.
y_above_one = train['y'].gt(1)
tmptrain = train[y_above_one]
tmptrain.head(n=5)


Out[65]:
y 1 2 3 4 5 6 7 8 9 10 11 12
0 90 6 2 1036 103 114 1.00 1.00 172076 355965 2.0 6527 1851864
1 88 1 0 2165 205 101 0.40 1.20 43107 44139 3.0 130 1131931
2 85 62 77 3806 258 166 1.40 1.40 492142 268706 5.2 256 1314590
3 81 5 0 4721 256 177 0.99 2.58 524787 174964 1.0 233 972606
4 79 42 55 3949 249 244 2.60 4.60 197289 529200 3.4 331 1013805

In [82]:
# Line plot of feature '2' against feature '1', with rows ordered by feature '1'.
# The very wide figure (100 x 10 inches) spreads the points out horizontally.
tmptrain.sort_values(by='1').plot(x='1', y='2', kind='line', figsize=(100, 10))


Out[82]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f9fab644f60>

In [ ]: