In [1]:
from sys import path
path.append('/home/bingnan/ecworkspace/HFT1')

In [2]:
from init import *

In [3]:
sns.set_context('talk')

In [4]:
%matplotlib inline

benchmark

In [83]:
rgs2.half_life/5


Out[83]:
0.011569049498593637

In [84]:
rgs_bcmk_eusq = myRgr()
rgs_bcmk_eusq.dataGo(xin_stdzd, yin2, xout_stdzd, yout2, xtest_stdzd, ytest2, align=True)
rgs_bcmk_eusq.disPrep(my_distance, metric='eu', w=None, squared=True)

In [85]:
rgs_bcmk_eusq.kernelGo(my_kernel_exp, gamma=rgs_bcmk_eusq.half_life/5)
rgs_bcmk_eusq.regressorGo('svr', epsilon=0.2, C=.1)
rgs_bcmk_eusq.fit()
rgs_bcmk_eusq.rsqGo()
rgs_bcmk_eusq.test()


in-in
'fit' 6.67 sec
in-in
'predict' 4.61 sec
out-in
'predict' 5.11 sec
'residualGo' 9.72 sec
using residual
using residual
					  ---rsq_in: 0.091258
					 ---rsq_out: 0.090886
'rsqGo' 9.73 sec
re-calc!!!
					---rsq_test: 0.057090 
'test' 8.85 sec

In [49]:
print rgs_bcmk_eusq.result.support_.shape
plt.figure(figsize=(15, 5))
sns.distplot(rgs_bcmk_eusq.result.dual_coef_)


(4473,)
Out[49]:
<matplotlib.axes.AxesSubplot at 0x7f115c808950>

In [26]:
Corr2D(rgs_bcmk_eu._yin_predict, rgs1._yin_predict)


Out[26]:
0.9478216007454503

Above is the SVR benchmark result.

Below is the boosting result.
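
The setup below is a one-step residual boost: a ridge model is fit on the raw target, kernel SVRs are then fit on its residuals (with the features projected onto PCA components), and the final prediction adds the averaged residual predictions back to the linear prediction. A minimal self-contained sketch of that idea, using plain scikit-learn objects and made-up data rather than the myRgr wrapper and real inputs used in this notebook:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

# toy standardized features and a noisy target (hypothetical stand-ins)
rng = np.random.RandomState(0)
X = rng.randn(2000, 10)
y = 0.1 * X[:, 0] + 0.05 * np.sin(X[:, 1]) + 0.3 * rng.randn(2000)

# stage 1: linear fit on the raw target
lin = Ridge(alpha=10.).fit(X, y)
resid = y - lin.predict(X)

# stage 2: kernel learners fit on the residual; average their predictions
svrs = [SVR(kernel='rbf', C=0.1, epsilon=0.2, gamma=g).fit(X, resid)
        for g in (0.05, 0.5)]
resid_hat = np.mean([s.predict(X) for s in svrs], axis=0)

# combined prediction = linear part + averaged residual part
y_hat = lin.predict(X) + resid_hat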


In [73]:
def tst_bst(rgs_linear, rgs_svr_list, x, y, w):
    # combine a linear model with one or more residual SVRs and report the out-of-sample rsq
    ypre_linear = rgs_linear.predict(x)

    # project the features onto the PCA components the SVRs were trained on
    x = np.dot(x, w.T)
    try:
        ypre_list = []

        for i, rgs_svr in enumerate(rgs_svr_list):
            ypre_list.append(rgs_svr.predict(x))
            print '\t\t\t\tNo.{0} Residual_rsq = {1}, Combined_rsq = {2}'.format(i, 
                    rsquare(rgs_svr.yout, rgs_svr._yout_predict), rsquare(y, yhat=(ypre_linear + ypre_list[i])))

        # average the residual predictions of the individual SVRs
        ypre_svr = np.zeros_like(ypre_linear)
        for ypre in ypre_list:
            ypre_svr = ypre_svr + ypre
        ypre_svr = ypre_svr / len(ypre_list)

    except (TypeError, AttributeError):
        # rgs_svr_list is a single (non-iterable) regressor, e.g. a BaggingRegressor like bager
        ypre_svr = rgs_svr_list.predict(x)
    rsq = rsquare(y, yhat=(ypre_linear + ypre_svr))
    print '\t\t\t\tAll in all rsq = {0}'.format(rsq)

In [82]:
tst_bst(rgs1, [rgs2, rgs3, rgs4], 
        xtest_stdzd.ix[ytest2.index], 
        ytest2, 
        pca_mat[:7, :])


'predict' 0.06 sec
out-in
'predict' 5.22 sec
using yhat
using yhat
				No.0 Residual_rsq = 0.000827642050765, Combined_rsq = 0.0588711216764
out-in
'predict' 16.13 sec
using yhat
using yhat
				No.1 Residual_rsq = 0.000294289392113, Combined_rsq = 0.0585375809833
'predict' 0.79 sec
using yhat
using yhat
				No.2 Residual_rsq = 0.00110030516693, Combined_rsq = 0.0591965912826
using yhat
				All in all rsq = 0.0589918952466

In [47]:
tst_bst(rgs1, bager, 
        xtest_stdzd.ix[ytest2.index], 
        ytest2, 
        pca_mat[:7, :])


'predict' 0.01 sec
using yhat
				All in all rsq = 0.0593426340871

In [200]:
print Corr2D(rgs2._yout_predict, rgs4._yout_predict)
# print rsquare(rgs2.yout, rgs2._yout_predict)
# print rsquare(rgs3.yout, rgs3._yout_predict)
# print rsquare(rgs2.yout, (rgs2._yout_predict + rgs3._yout_predict)/2)


0.941109399373

boosting


In [56]:
yin2 = yin.ix[::50]
yout2 = yout.ix[::15]
ytest2 = ytest.ix[::15]
print yin2.shape, yout2.shape


(10448,) (11609,)

In [86]:
rgs1 = myRgr()
rgs1.dataGo(xin_stdzd, yin2, xout_stdzd, yout2, xtest_stdzd, ytest2, align=True)
rgs1.regressorGo('Ridge', alpha=6000)
rgs1.fit()
rgs1.rsqGo()
rgs1.test()
#---------------------


'fit' 0.07 sec
'predict' 0.01 sec
'predict' 0.00 sec
'residualGo' 0.02 sec
using residual
using residual
					  ---rsq_in: 0.092371
					 ---rsq_out: 0.095636
'rsqGo' 0.02 sec
					---rsq_test: 0.058292 
'test' 0.01 sec


In [44]:
rgs_stack = myRgr()
rgs_stack.dataGo(pd.DataFrame([rgs2._yin_predict, rgs3._yin_predict]).T, rgs1.rzdu_in, 
                 pd.DataFrame([rgs2._yout_predict, rgs3._yout_predict]).T, rgs1.rzdu_out, 
                 xtest_stdzd,#pd.DataFrame([rgs2.predict(rgs2.xtest), rgs3.predict(rgs3.xtest)]).T, 
                 ytest2, align=False)
rgs_stack.regressorGo('Ridge', alpha=.5)
rgs_stack.fit()
rgs_stack.rsqGo()
#rgs_stack.test()
#---------------------
print rgs_stack.result.coef_


'fit' 0.00 sec
'predict' 0.01 sec
'predict' 0.00 sec
'residualGo' 0.02 sec
using residual
using residual
					  ---rsq_in: 0.001023
					 ---rsq_out: 0.000711
'rsqGo' 0.03 sec
[ 0.96915858 -0.07995365]

PCA

$X_{\text{projected}} = X_{\text{original}}\,W_k^{\top}$, where $W_k$ = pca.components_[:k, :] is the first $k$ component rows; in code: X_projected = np.dot(X_original, pca.components_[:k, :].T)
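
A minimal sketch of this projection with standard scikit-learn PCA and made-up data (the actual notebook loads precomputed components from disk in the next cell); it also checks that the hand-rolled matrix product agrees with pca.transform once the data is centered:

import numpy as np
from sklearn.decomposition import PCA

# hypothetical stand-in for the standardized feature matrix
X = np.random.randn(1000, 80)

pca = PCA().fit(X)

k = 7                                   # number of components to keep
W = pca.components_[:k, :]              # shape (k, n_features)
X_proj = np.dot(X - pca.mean_, W.T)     # shape (n_samples, k)

# agrees with the library transform restricted to the first k columns;
# subtracting pca.mean_ makes the check exact (a no-op for already-centered data)
print np.allclose(X_proj, pca.transform(X)[:, :k])   # True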


In [20]:
pca_mat = np.load('pca_components_calc_from_xinstdzd.npy')
explain_var = np.load('pca_explain_ratio_calc_from_xinstdzd.npy')

In [21]:
plt.figure(figsize=(15,10))
plt.plot(explain_var, marker='d')
plt.show()



In [61]:
from sklearn.ensemble import BaggingRegressor

In [62]:
temp = svm.SVR(kernel='linear', 
               gamma=rgs2.half_life*.2,  # gamma is ignored by the linear kernel
               epsilon=.17, C=.0004)

In [63]:
bager = BaggingRegressor(temp, n_estimators=100, max_samples=0.5, n_jobs=7)
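
With these settings each of the 100 base SVRs is fit on a bootstrap sample of half the training rows (max_samples=0.5, bootstrap=True by default), and bager.predict returns the average of the individual predictions.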

In [64]:
bager.fit(rgs4.xin, rgs4.yin)


Out[64]:
BaggingRegressor(base_estimator=SVR(C=0.0004, cache_size=200, coef0=0.0, degree=3, epsilon=0.17,
  gamma=0.011521131276865269, kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.5, n_estimators=100, n_jobs=7, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [66]:
print bager.score(rgs4.xin, rgs4.yin)
print bager.score(rgs4.xout, rgs4.yout)


-0.000270303553432
0.00106919910111

In [75]:
rgs2 = myRgr()
rgs2.dataGo(xin_stdzd, rgs1.rzdu_in, xout_stdzd, rgs1.rzdu_out, xtest_stdzd, ytest2, align=True, w=pca_mat[:7, :])
rgs2.disPrep(my_distance, metric='eu', w=None, squared=True)


PCA!

In [76]:
# the step sizes exceed the ranges, so each arange yields a single value (one parameter combination)
for myepsilon in np.arange(.17, .2, .2):
    for myc in np.arange(.015, .09, 9.01):
        for mygamma in np.arange(.2, 2.04, 2.001):
            print '==================== C: {0} epsilon: {1} gamma: {2:.4f}'.format(myc, myepsilon, mygamma)
            rgs2.kernelGo(my_kernel_exp, gamma=rgs2.half_life * mygamma)
            rgs2.regressorGo('svr', epsilon=myepsilon, C=myc)
            rgs2.fit()
            rgs2.rsqGo()


==================== C: 0.015 epsilon: 0.17 gamma: 0.2000
in-in
'fit' 6.42 sec
in-in
'predict' 4.51 sec
out-in
'predict' 5.02 sec
'residualGo' 9.53 sec
using residual
using residual
					  ---rsq_in: 0.001018
					 ---rsq_out: 0.000828
'rsqGo' 9.54 sec

In [68]:
def my_kernel_spherical(dis, gamma=.1):
    # note: gamma is unused and dis is modified in place
    dis /= 20.
    dis[dis > 1.] = 1.
    f_s = lambda x: 1 - x * 3 / 2. + x**3 / 2.
    return f_s(dis)
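
For reference, clipping dis/20 at 1 makes this exactly the spherical covariance function with range $\theta = 20$:

$k(d) = 1 - \frac{3}{2}\,\frac{d}{\theta} + \frac{1}{2}\left(\frac{d}{\theta}\right)^3$ for $d \le \theta$, and $k(d) = 0$ for $d > \theta$ (the value at $d = \theta$ is already 0, so the clip gives the zero tail).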

In [69]:
rgs3 = myRgr()
rgs3.dataGo(xin_stdzd, rgs1.rzdu_in, xout_stdzd, rgs1.rzdu_out, xtest_stdzd, ytest2, align=True, w=pca_mat[:7, :])
rgs3.disPrep(my_distance, metric='eu', w=None, squared=True)
#rgs3.partial_kernel = pairwise.cosine_similarity


PCA!

In [71]:
for myepsilon in np.arange(.17, .19, 1.002):
    for myc in np.arange(10e-5, 100e-5, 200e-5):
        for mygamma in np.arange(.2, 6., 9.3):
            print '==================== C: {0} epsilon: {1} gamma: {2:.4f}'.format(myc, myepsilon, mygamma)
            rgs3.kernelGo(my_kernel_spherical)
            rgs3.regressorGo('svr', epsilon=myepsilon, C=myc)
            rgs3.fit()
            rgs3.rsqGo()


==================== C: 0.0001 epsilon: 0.17 gamma: 0.2000
in-in
'fit' 16.11 sec
in-in
'predict' 14.27 sec
out-in
'predict' 15.96 sec
'residualGo' 30.23 sec
using residual
using residual
					  ---rsq_in: 0.000377
					 ---rsq_out: 0.000294
'rsqGo' 30.23 sec

In [77]:
rgs4 = myRgr()
rgs4.dataGo(xin_stdzd, rgs1.rzdu_in, xout_stdzd, rgs1.rzdu_out, xtest_stdzd, ytest2, align=True, w=pca_mat[:7, :])
#rgs4.disPrep(my_distance, metric='mh', w=pca_mat[:6, :], squared=False)


PCA!

In [97]:
sns.distplot(np.dot(xin_stdzd.ix[::1000], xin_stdzd.ix[::1000].T).ravel() * 1e-2)


Out[97]:
<matplotlib.axes.AxesSubplot at 0x7f6e35c23e10>

In [138]:
sns.distplot(pairwise.sigmoid_kernel(xin_stdzd.ix[::300], xin_stdzd.ix[::300], gamma=5e-2, coef0=0.).ravel())


Out[138]:
<matplotlib.axes.AxesSubplot at 0x7f6dcf9408d0>

In [78]:
for myepsilon in np.arange(.17,.18, 1.003):
    for myc in np.arange(2e-2, 20e-2, 20e-2):
        for mygamma in np.arange(10e-2, 16., 19.3):
            print '==================== C: {0} epsilon: {1} gamma: {2:.4f}'.format(myc, myepsilon, mygamma)
            rgs4.kernelGo('linear')
            rgs4.regressorGo('svr', epsilon=myepsilon, C=myc, 
                             gamma=mygamma, coef0=1.
                             )
            rgs4.fit()
            rgs4.rsqGo()


==================== C: 0.02 epsilon: 0.17 gamma: 0.1000
'fit' 5.27 sec
'predict' 0.71 sec
'predict' 0.79 sec
'residualGo' 1.50 sec
using residual
using residual
					  ---rsq_in: 0.000325
					 ---rsq_out: 0.001100
'rsqGo' 1.50 sec

In [81]:
Corr2D(rgs2._yout_predict, rgs4._yout_predict)


Out[81]:
0.89093919137807143

In [79]:
print rgs3.result.support_.shape
print rgs3.result.intercept_
sns.distplot(rgs3.result.dual_coef_)


(4474,)
[ 0.10437228]
Out[79]:
<matplotlib.axes.AxesSubplot at 0x7fa931c5d190>

In [75]:
np.median(rgs3.partial_kernel(rgs3.xin, rgs3.xin))


in-in
Out[75]:
0.98931190872007368

In [121]:
%matplotlib inline

In [120]:
sns.distplot(rgs3.yin, bins=300, kde=False, norm_hist=True)
sns.distplot(rgs1.yin.values, bins=300, kde=False, norm_hist=True)


Out[120]:
<matplotlib.axes.AxesSubplot at 0x7f7e3a74f790>

In [52]:
temp = rgs3.partial_kernel(rgs3.xin, rgs3.xin).ravel()


in-in


In [53]:
sns.distplot(temp[::10])
print np.median(temp)


0.870550563296

In [47]:
rgs3.result.support_.shape


Out[47]:
(6608,)

In [169]:
rgs2.result.dual_coef_


Out[169]:
array([[-0.5, -0.5,  0.5, ..., -0.5,  0.5,  0.5]])

In [84]:
for myepsilon in np.arange(.17, .9, .9):
    for myc in np.arange(.8, 2., 2.2):
        for mygamma in np.arange(.2, 2., 2.2):
            print '==================== C: {0} epsilon: {1} gamma: {2}'.format(myc, myepsilon, mygamma)
            rgs2 = myRgr()
            rgs2.modelGo('svr', regressor_kws={'epsilon': myepsilon, 'C': myc}, 
                         kernel=my_kernel_exp, 
                         kernel_kws={'gamma': half_life*mygamma, 'metric': 'eu', 'squared': True, 'w': None, 'full_output': False})
            rgs2.dataGo(xin_stdzd.ix[:, feature], rgs1.rzdu_in, xout_stdzd.ix[:, feature], rgs1.rzdu_out, xtest_stdzd, ytest2, align=True)
            rgs2.fit()
            rgs2.rsqGo()


==================== C: 0.8 epsilon: 0.17 gamma: 0.2
'fit' 9.42 sec
'predict' 7.10 sec
'predict' 7.86 sec
'residualGo' 14.97 sec
using residual
using residual
					  ---rsq_in: 1.580003
					 ---rsq_out: -0.408104
'rsqGo' 14.97 sec

In [28]:
rsquare(yout2, rgs2.predict(xout_stdzd.ix[yout2.index]))


'predict' 2.96 sec
using yhat
Out[28]:
-0.18474196731434001

In [33]:
sns.set_context('poster')

In [37]:
sns.distplot(rgs1.rzdu_in, kde=False, bins=100)


Out[37]:
<matplotlib.axes.AxesSubplot at 0x7f117f267350>

In [339]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 5))
sns.distplot(rgs_bcmk.yin, norm_hist=True, ax=ax1)
sns.distplot(rgs_bcmk.yout, norm_hist=True, ax=ax2)
sns.distplot(rgs_bcmk.rzdu_in, norm_hist=True, ax=ax3)
sns.distplot(rgs_bcmk.rzdu_out, norm_hist=True, ax=ax4)


Out[339]:
<matplotlib.axes.AxesSubplot at 0x7f2ef30eb690>

In [380]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 5))
sns.distplot(rgs1.rzdu_in, norm_hist=True, ax=ax1)
sns.distplot(rgs1.rzdu_out, norm_hist=True, ax=ax2)
# sns.distplot(rgs_bcmk.rzdu_in, norm_hist=True, ax=ax3)
# sns.distplot(rgs_bcmk.rzdu_out, norm_hist=True, ax=ax4)


Out[380]:
<matplotlib.axes.AxesSubplot at 0x7f2ef22e6510>

In [250]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 3))
sns.distplot(rgs1.rzdu_in, ax=ax1)
sns.distplot(rgs1.rzdu_out, ax=ax2)


Out[250]:
<matplotlib.axes.AxesSubplot at 0x7f2efaf11c50>

Feature selection


In [242]:
rgs_lasso = myRgr()
rgs_lasso.dataGo(xin_stdzd, yin2, xout_stdzd, yout2, xtest_stdzd, ytest2, align=True)
rgs_lasso.regressorGo('Lasso', alpha=5e-3)
rgs_lasso.fit()
rgs_lasso.rsqGo()
rgs_lasso.test()
#---------------------


'fit' 0.54 sec
'predict' 0.00 sec
'predict' 0.01 sec
'residualGo' 0.01 sec
using residual
using residual
					  ---rsq_in: 0.109058
					 ---rsq_out: 0.092796
'rsqGo' 0.02 sec
					---rsq_test: 0.055742 
'test' 0.01 sec

In [244]:
feature = rgs_lasso.result.coef_ != 0

In [245]:
feature


Out[245]:
array([False, False, False,  True, False, False, False, False, False,
       False,  True,  True, False,  True,  True, False, False,  True,
        True, False, False, False, False, False, False, False, False,
       False,  True, False,  True, False, False, False, False,  True,
       False, False,  True, False, False, False, False,  True, False,
       False, False,  True,  True,  True,  True, False, False,  True,
        True, False, False, False, False,  True, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True, False,
       False, False, False, False, False,  True,  True,  True], dtype=bool)
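
This boolean mask of the non-zero Lasso coefficients is what restricts the SVR inputs to the selected columns in In [84] above (xin_stdzd.ix[:, feature]).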

KNN


In [17]:
from sklearn.neighbors import KNeighborsRegressor

In [35]:
def sigmaEst(y, yhat):
    err = y - yhat
    N = len(err)
    sns.distplot(err)
    #err /= np.sqrt(N)
    return err.std()

In [36]:
def noiseEst(y, yhat, nnn):
    # noise-variance estimate from a k-NN fit: residual sum of squares divided by
    # the sample size minus a heuristic effective-degrees-of-freedom term
    res = y - yhat
    res = res**2
    res = res.sum()
    N = len(y)
    ddof = N / (N**(1./5) * nnn)
    print 'ddof = {0}'.format(ddof)
    res /= N - ddof
    return res

In [37]:
def epsilonEst(x, y, nnn, w=None):
    if w is not None:
        x = np.dot(x, w.T)
    neigh = KNeighborsRegressor(n_neighbors=nnn)
    neigh.fit(x, y)
    ypre = neigh.predict(x)
    print 'rsq: {0}'.format(rsquare(y, ypre))
#     sigma2 = noiseEst(y, ypre, nnn)
#     print sigma2
#     n = len(y)
#     print 'n = {0}'.format(n)
#     return 3 * np.sqrt(sigma2 * np.log(n) / n)
    return noiseEst(y, ypre, nnn), sigmaEst(y, ypre)
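
The commented-out return value, and the computation in the cells further down (3 * np.sqrt(sigma2 * np.log(n) / n)), appears to be the Cherkassky-Ma rule of thumb for the SVR tube width, $\epsilon \approx 3\,\sigma\sqrt{\ln n / n}$, with $\sigma^2$ the noise variance estimated from the k-NN fit (noiseEst above) and $n$ the sample size.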

In [38]:
epsilonEst(xin_stdzd.ix[rgs1.rzdu_in.ix[::].index], rgs1.rzdu_in.ix[::], 3, w=pca_mat[:5, :])


using yhat
rsq: 0.336963260836
ddof = 547.148579973
Out[38]:
(0.12054365010053021, 0.33799127931216399)

In [26]:
max(abs(yin2.mean() - 3*(yin2.std())), abs(yin2.mean() + 3*(yin2.std())))


Out[26]:
1.3118310100563189

In [148]:
neigh = KNeighborsRegressor(n_neighbors=6)

In [149]:
temp_resample = 10
neigh.fit(xin_stdzd.ix[yin2.ix[::temp_resample].index], yin2.ix[::temp_resample])


Out[149]:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=6, p=2,
          weights='uniform')

In [150]:
temp = neigh.predict(xin_stdzd.ix[yin2.ix[::temp_resample].index])

In [153]:
sigma2 = noiseEst(yin2.ix[::temp_resample], temp)
print sigma2


180.491719911
0.147849172851

In [156]:
n = len(yin2.ix[::temp_resample])
3 * np.sqrt(sigma2 * np.log(n) / n)


Out[156]:
0.063310063176152453

In [146]:
rsquare(yin2.ix[::temp_resample], temp)


using yhat
Out[146]:
0.60423117608552124

In [147]:
plt.scatter(yin2.ix[::temp_resample], temp, facecolor='')


Out[147]:
<matplotlib.collections.PathCollection at 0x7f6b28d17f10>
