In [1]:
# Add the project checkout to the import search path (presumably where the
# `init` module imported further down lives — TODO confirm).
# NOTE(review): this is a hard-coded, user-specific absolute path; the notebook
# will not import its helpers on any machine with a different layout.
from sys import path
path.append('/home/bingnan/ecworkspace/HFT1')
In [2]:
# Switch seaborn to its large 'poster' plotting context.
# NOTE(review): no cell above this one imports `sns`; presumably it is provided
# by `from init import *`, which appears in a LATER cell — confirm the cell
# order works on a fresh kernel.
sns.set_context('poster')
In [3]:
%matplotlib inline
In [4]:
from init import *
In [5]:
# Scale all three feature frames with the column mean/std computed on `xin`
# only, so `xout` and `xtest` are expressed in the units of the `xin` sample.
_xin_mean, _xin_std = xin.mean(axis=0), xin.std(axis=0)
xin_stdzd, xout_stdzd, xtest_stdzd = (
    (split - _xin_mean) / _xin_std for split in (xin, xout, xtest)
)
In [5]:
In [45]:
# Thin the targets by position: every 50th row of `yin`, every 15th row of `yout`.
# `.iloc` replaces the `.ix` indexer (deprecated in pandas 0.20, removed in 1.0);
# for a bare strided slice both pick exactly the same rows.
yin2 = yin.iloc[::50]
yout2 = yout.iloc[::15]
In [46]:
# Ridge regression on all standardized features (alpha=6000).
modr = linear_model.Ridge(alpha=6000.)
# MyRgrs is a project helper; judging from how its result is unpacked and used
# here it returns (fitted model, in-sample R^2, out-of-sample R^2) — TODO confirm.
res2, rsq_in, rsq_out = MyRgrs(xin_stdzd, xout_stdzd, yin2, yout2, modr, align=True)
mycoef2 = res2.coef_
# Share of coefficients that are non-zero.
nonzero_len = (mycoef2 != 0).sum() * 1. / len(mycoef2)
# Score on every 15th test row. `.iloc` replaces the removed `.ix` indexer and
# selects the same rows for a bare strided slice.
rsq_test = res2.score(xtest_stdzd.iloc[::15], ytest.iloc[::15])
print('rsq_test: %f, nonzero_len: %.3f' % (rsq_test, nonzero_len))
In [47]:
# Sweep the Lasso penalty and keep one row of coefficients per alpha.
lasso_alphas = np.arange(5e-3, 10e-3, 1e-4)
coef3_rows = []
for i in lasso_alphas:
    modr = linear_model.Lasso(alpha=i)
    # MyRgrs is a project helper; from its use here it returns
    # (fitted model, in-sample R^2, out-of-sample R^2) — TODO confirm.
    res3, rsq_in, rsq_out = MyRgrs(xin_stdzd, xout_stdzd, yin2, yout2, modr, align=True)
    mycoef3 = res3.coef_
    coef3_rows.append(mycoef3)
    # Sparsity of THIS Lasso fit. The original read `mycoef2` (the Ridge
    # coefficients from an earlier cell), so the reported value never changed.
    nonzero_len = (mycoef3 != 0).sum() * 1. / len(mycoef3)
    # `.iloc` replaces the removed `.ix` indexer; same rows for a strided slice.
    rsq_test = res3.score(xtest_stdzd.iloc[::15], ytest.iloc[::15])
    # Placeholders are {0}=alpha, {1}=rsq_test, {2}=nonzero_len; the arguments
    # were previously passed in a different order, mislabeling every number.
    print('rsq_test: {1}, nonzero_len: {2:.3f}\n====i={0:.4f}==\n'.format(i, rsq_test, nonzero_len))
# Stack once after the loop: rows are alphas, columns are the features of x0.
# This replaces the dummy `np.arange(80)` first row that had to be sliced off.
coef3_mat = pd.DataFrame(data=np.vstack(coef3_rows), columns=x0.columns, index=lasso_alphas)
In [44]:
# Wrap the raw stacked coefficient array (dummy first row dropped) in a DataFrame.
# The Lasso-sweep cell already ends with this same statement, and `coef3_mat[1:, :]`
# raises on a DataFrame, so only convert when the array is still raw. This keeps
# the cell safe to re-run.
if not isinstance(coef3_mat, pd.DataFrame):
    coef3_mat = pd.DataFrame(data=coef3_mat[1:, :], columns=x0.columns, index=np.arange(5e-3, 10e-3, 1e-4))
In [49]:
%matplotlib inline
In [29]:
# Per feature: True when its coefficient is non-zero in every row of coef3_mat.
temp1 = (coef3_mat.all(axis=0) != 0)
In [30]:
# Per feature: True when its coefficient is non-zero in at least one row of coef3_mat.
temp2 = (coef3_mat.any(axis=0) != 0)
In [47]:
# Number of features with a non-zero coefficient in the 4th row (position 3) of coef3_mat.
coef3_mat.iloc[3].ne(0).sum()
Out[47]:
In [48]:
# Absolute coefficient of every feature (one line each) against the row index of coef3_mat.
coef3_mat.abs().plot()
Out[48]:
In [52]:
# Box plot of the absolute coefficients in the first row of coef3_mat.
sns.boxplot(coef3_mat.iloc[0, :].abs())
Out[52]:
In [53]:
# Distribution of the absolute coefficients in the first row of coef3_mat.
sns.distplot(coef3_mat.iloc[0, :].abs())
Out[53]:
In [48]:
# Column labels whose coefficient is non-zero in row position 3 of coef3_mat.
# `.loc` with a boolean mask replaces the `.ix` indexer (removed in pandas 1.0).
selected_index = coef3_mat.loc[:, coef3_mat.iloc[3, :] != 0].columns
In [49]:
# Complement of `selected_index` within x0's columns, keeping x0's column order.
is_selected = x0.columns.isin(selected_index)
unselected_index = pd.Index(x0.columns.values[~is_selected])
In [50]:
# Display the x0 columns that are NOT in `selected_index`.
unselected_index
Out[50]:
In [51]:
# Display the columns with a non-zero coefficient in row position 3 of coef3_mat.
selected_index
Out[51]:
In [69]:
# Rows of x0_prop for the features with a non-zero coefficient in row position 3
# of coef3_mat, ordered by their 'std' column, largest first.
# `.loc` with the boolean mask replaces the `.ix` indexer (removed in pandas 1.0).
# NOTE(review): assumes x0_prop is indexed by the same labels as coef3_mat's columns — confirm.
x0_prop.loc[coef3_mat.iloc[3, :] != 0, :].sort_values('std', ascending=False)
Out[69]:
In [68]:
# Full x0_prop table ordered by its 'std' column, largest first.
x0_prop.sort_values(by='std', ascending=False)
Out[68]:
In [52]:
# Pairwise correlation matrices between columns (rowvar=0: each column is a variable),
# computed separately for the unselected and the selected feature groups.
# `.loc` replaces the `.ix` indexer (removed in pandas 1.0) for label-based column selection.
corr_mat_unselected = np.corrcoef(xin_stdzd.loc[:, unselected_index].values, rowvar=0)
corr_mat_selected = np.corrcoef(xin_stdzd.loc[:, selected_index].values, rowvar=0)
In [53]:
def _upper_triangle(square_mat):
    """Return the entries strictly above the diagonal of a square matrix, row by row.

    The order matches a nested `for i / for j in range(i+1, n)` scan, so each
    distinct pair appears exactly once.
    """
    return square_mat[np.triu_indices(square_mat.shape[1], k=1)]


# One correlation per distinct feature pair. A single fancy-index replaces
# np.append inside a double loop, which re-copied the whole array on every pair.
corr_arr_selected = _upper_triangle(corr_mat_selected)
corr_arr_unselected = _upper_triangle(corr_mat_unselected)
In [7]:
# all features
# Correlation between every pair of columns of xin_stdzd (rowvar=0: columns are variables).
# `.ix[:, :]` selected everything, so the frame's values are used directly.
corr_mat = np.corrcoef(xin_stdzd.values, rowvar=0)
# Entries strictly above the diagonal, row by row: one value per distinct pair.
# Replaces np.append in a double loop, which re-copied the array on every pair.
corr_arr = corr_mat[np.triu_indices(corr_mat.shape[1], k=1)]
In [12]:
# Line plot of the pairwise correlations in the order they were flattened from corr_mat.
plt.plot(corr_arr)
Out[12]:
In [15]:
%matplotlib auto
In [21]:
def CorrHeatmap(df, cols=None, savepath='Corr_Matrix'):
    """Draw, and optionally save, a heatmap of the correlations between columns of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are correlated with one another.
    cols : list-like of column labels, optional
        Columns to include, in plotting order. Defaults to all of `df.columns`.
    savepath : str or None, optional
        File name handed to `plt.savefig`; pass None to skip saving.
        The default 'Corr_Matrix' is the name this function always wrote to.

    Notes
    -----
    Calls `sns.set(font_scale=1.5)`, which changes seaborn's global settings and
    therefore also affects plots drawn after this function returns.
    """
    # `is None`, not `== None`: `==` on an Index/array is element-wise and its
    # truth value is ambiguous, so passing `cols` as an Index used to raise.
    if cols is None:
        cols = df.columns
    # Column-by-column correlation matrix, scaled to percent.
    cm = np.corrcoef(df[cols].values.T) * 100
    sns.set(font_scale=1.5)
    fig = plt.figure(figsize=(50, 30))
    ax1 = fig.add_subplot(111)
    sns.heatmap(cm,
                cbar=True,
                annot=False,
                square=True,
                fmt='.2f',
                annot_kws={'size': 15},
                yticklabels=cols,
                xticklabels=cols, ax=ax1)
    plt.title('Coef. of corr. Matrix (unit: percent)')
    if savepath is not None:
        plt.savefig(savepath)
In [22]:
# Heatmap of the correlations between all columns of xin_stdzd; CorrHeatmap also
# writes the figure to 'Corr_Matrix' in the current working directory.
CorrHeatmap(xin_stdzd)
In [127]:
# Number of distinct feature pairs in each group (unselected first, then selected).
for pair_corrs in (corr_arr_unselected, corr_arr_selected):
    print(pair_corrs.shape)
In [129]:
# Free the flattened all-feature correlations, which nothing below uses.
# `corr_mat` is deliberately kept: later cells still read it (`corr_mat[2, 1]`
# and the `uniqueness` computation), so deleting it here broke a top-to-bottom run.
del corr_arr
In [54]:
# Overlay the distributions of pairwise correlations for the two feature groups
# on one figure: the selected group first, then the unselected group.
plt.figure(figsize=(16, 8))
for pair_corrs, group_label in ((corr_arr_selected, 'selected'),
                                (corr_arr_unselected, 'unselected')):
    sns.distplot(pair_corrs, label=group_label, norm_hist=True)
plt.legend()
Out[54]:
In [137]:
# Two stacked box plots of |pairwise correlation| sharing one x axis:
# selected group on top (ax1), unselected group below (ax2).
fig = plt.figure(figsize=(16, 8))
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212, sharex=ax1)
for panel, pair_corrs in zip((ax1, ax2), (corr_arr_selected, corr_arr_unselected)):
    sns.boxplot(np.abs(pair_corrs), ax=panel)
Out[137]:
In [28]:
# Spot-check one entry: the correlation between the columns at positions 2 and 1.
# NOTE(review): an earlier cell runs `del corr_arr, corr_mat`, so in top-to-bottom
# order this name no longer exists here — confirm the intended cell order.
corr_mat[2, 1]
Out[28]:
In [24]:
# Show a slice of the x0_prop table.
# NOTE(review): `.ix` is deprecated (removed in pandas 1.0) and ambiguous here:
# 40:60 is label-based (inclusive of 60) on an integer index but positional
# (exclusive of 60) otherwise. Replace with `.loc` or `.iloc` once the index
# type of x0_prop is confirmed.
x0_prop.ix[40:60, :]
Out[24]:
In [34]:
# Per-feature "uniqueness": one minus the mean absolute correlation with every
# OTHER feature. Subtracting 1 removes the diagonal self-correlation before
# averaging. The divisor is taken from the matrix size instead of a hard-coded 79.
n_other_features = corr_mat.shape[0] - 1
uniqueness = 1 - (np.abs(corr_mat).sum(axis=0) - 1.) / n_other_features
In [36]:
# Persist the uniqueness vector to 'uniqueness.npy' (relative path, so it lands in the current working directory).
np.save('uniqueness.npy', uniqueness)
In [138]:
from sklearn.decomposition import PCA
In [293]:
# Keep one principal component per input feature (a full decomposition).
# The count comes from the data instead of a hard-coded 80, so the cell still
# works if the feature set changes.
pca = PCA(n_components=xin_stdzd.shape[1])
In [294]:
# Fit the principal axes on the standardized xin features only.
pca.fit(xin_stdzd.values)
Out[294]:
In [297]:
# Persist the fitted principal axes (rows of pca.components_) to a .npy file in the current working directory.
np.save('pca_components_calc_from_xinstdzd.npy', pca.components_)
In [295]:
# Inspect the decomposition: shape of the axes, then the share of variance per component.
variance_share = pca.explained_variance_ratio_
print(pca.components_.shape)
print(variance_share)
# Histogram of the shares on a log scale (scaled by 1e5 before the log) ...
sns.distplot(np.log(variance_share * 1e5), kde=False)
# ... and on the raw scale, in a separate figure.
plt.figure()
sns.distplot(variance_share, kde=False)
Out[295]:
In [303]:
# Number of leading principal components kept for the projections below.
n_PCA = 20
In [304]:
# Scores of the xin_stdzd rows on the first n_PCA principal axes, as a frame
# that keeps the original row index and numbers the components 0..n_PCA-1.
projected_in = pd.DataFrame(
    data=xin_stdzd.values.dot(pca.components_[:n_PCA, :].T),
    index=xin_stdzd.index,
    columns=np.arange(n_PCA),
)
In [305]:
# Scores of the xout_stdzd rows on the same first n_PCA axes (axes come from the
# fitted `pca`), keeping the row index and numbering the components 0..n_PCA-1.
projected_out = pd.DataFrame(
    data=xout_stdzd.values.dot(pca.components_[:n_PCA, :].T),
    index=xout_stdzd.index,
    columns=np.arange(n_PCA),
)
In [306]:
# Scores of the xtest_stdzd rows on the same first n_PCA axes (axes come from the
# fitted `pca`), keeping the row index and numbering the components 0..n_PCA-1.
projected_test = pd.DataFrame(
    data=xtest_stdzd.values.dot(pca.components_[:n_PCA, :].T),
    index=xtest_stdzd.index,
    columns=np.arange(n_PCA),
)
In [307]:
# Ridge regression (alpha=3000) on the n_PCA principal-component scores instead of the raw features.
modr = linear_model.Ridge(alpha=3000.)
# MyRgrs is a project helper; from its use here it returns
# (fitted model, in-sample R^2, out-of-sample R^2) — TODO confirm.
res2, rsq_in, rsq_out = MyRgrs(projected_in, projected_out, yin2, yout2, modr, align=True)
mycoef2 = res2.coef_
# Share of coefficients that are non-zero.
nonzero_len = (mycoef2 != 0).sum() * 1. / len(mycoef2)
# Score on every 15th test row. `.iloc` replaces the removed `.ix` indexer and
# selects the same rows for a bare strided slice.
rsq_test = res2.score(projected_test.iloc[::15], ytest.iloc[::15])
print('rsq_test: %f, nonzero_len: %.3f' % (rsq_test, nonzero_len))
In [292]:
# Same Ridge penalty (alpha=3000) on all standardized features, for comparison with the PCA-based fit.
modr = linear_model.Ridge(alpha=3000.)
# MyRgrs is a project helper; from its use here it returns
# (fitted model, in-sample R^2, out-of-sample R^2) — TODO confirm.
res2, rsq_in, rsq_out = MyRgrs(xin_stdzd, xout_stdzd, yin2, yout2, modr, align=True)
mycoef2 = res2.coef_
# Share of coefficients that are non-zero.
nonzero_len = (mycoef2 != 0).sum() * 1. / len(mycoef2)
# Score on every 15th test row. `.iloc` replaces the removed `.ix` indexer and
# selects the same rows for a bare strided slice.
rsq_test = res2.score(xtest_stdzd.iloc[::15], ytest.iloc[::15])
print('rsq_test: %f, nonzero_len: %.3f' % (rsq_test, nonzero_len))
In [235]:
from sklearn.manifold import TSNE
In [236]:
# 2-D t-SNE embedding; random_state is fixed so repeated runs give the same layout.
tsne_mod = TSNE(n_components=2, random_state=0)
In [260]:
# Embed every 100th row of the PCA scores. `.iloc` replaces the `.ix` indexer
# (removed in pandas 1.0) and selects the same rows for a bare strided slice.
tsne_in = tsne_mod.fit_transform(projected_in.iloc[::100].values)
In [261]:
# (number of embedded rows, 2) — one 2-D point per subsampled row.
tsne_in.shape
Out[261]:
In [262]:
Blues = plt.get_cmap('Blues')
# Colour each embedded point by its target value, min-max scaled to [0, 1].
# Two fixes versus the original:
#  * it assigned to `yin2`, silently replacing the regression target used by the
#    Ridge/Lasso cells; a dedicated name is used instead;
#  * it took every 500th target while `tsne_in` was built from every 100th row,
#    so colours and points did not correspond. The stride now matches.
# NOTE(review): assumes the rows of `yin` line up with the rows of `projected_in` — confirm.
yin_tsne = yin.iloc[::100]
assert len(yin_tsne) == len(tsne_in), 'target subsample and t-SNE embedding must have the same number of rows'
yin_tsne_norm = (yin_tsne - yin_tsne.min()) / (yin_tsne.max() - yin_tsne.min())
mycolor = Blues(yin_tsne_norm)
In [254]:
%matplotlib auto
In [263]:
# Scatter of the 2-D embedding, one colour per point taken from `mycolor`.
# NOTE(review): `mycolor` must have exactly one entry per row of `tsne_in` — confirm the two were built from the same subsample.
plt.scatter(tsne_in[:, 0], tsne_in[:, 1], color=mycolor)
Out[263]:
In [270]:
# Same embedding drawn in a single fixed shade of the 'Blues' colormap (value 0.8).
plt.scatter(tsne_in[:, 0], tsne_in[:, 1], color=Blues(0.8))
Out[270]:
In [ ]: