In [1]:
from sys import path
path.append('/home/bingnan/ecworkspace/HFT1')

In [2]:
from init import *

In [3]:
sns.set_context('talk')

In [4]:
%matplotlib inline

benchmark

In [83]:
rgs2.half_life/5


Out[83]:
0.011569049498593637

In [84]:
rgs_bcmk_eusq = myRgr()
rgs_bcmk_eusq.dataGo(xin_stdzd, yin2, xout_stdzd, yout2, xtest_stdzd, ytest2, align=True)
rgs_bcmk_eusq.disPrep(my_distance, metric='eu', w=None, squared=True)

In [85]:
rgs_bcmk_eusq.kernelGo(my_kernel_exp, gamma=rgs_bcmk_eusq.half_life/5)
rgs_bcmk_eusq.regressorGo('svr', epsilon=0.2, C=.1)
rgs_bcmk_eusq.fit()
rgs_bcmk_eusq.rsqGo()
rgs_bcmk_eusq.test()


in-in
'fit' 6.67 sec
in-in
'predict' 4.61 sec
out-in
'predict' 5.11 sec
'residualGo' 9.72 sec
using residual
using residual
					  ---rsq_in: 0.091258
					 ---rsq_out: 0.090886
'rsqGo' 9.73 sec
re-calc!!!
					---rsq_test: 0.057090 
'test' 8.85 sec

In [49]:
print rgs_bcmk_eusq.result.support_.shape
plt.figure(figsize=(15, 5))
sns.distplot(rgs_bcmk_eusq.result.dual_coef_)


(4473,)
Out[49]:
<matplotlib.axes.AxesSubplot at 0x7f115c808950>

In [26]:
Corr2D(rgs_bcmk_eu._yin_predict, rgs1._yin_predict)


Out[26]:
0.9478216007454503

Above is the SVR benchmark result.

Below is the boosting result.
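
The setup below is a one-step residual boost: a ridge model is fit on the raw target, kernel SVRs are then fit on its residuals (with the features projected onto PCA components), and the final prediction adds the averaged residual predictions back to the linear prediction. A minimal self-contained sketch of that idea, using plain scikit-learn objects and made-up data rather than the myRgr wrapper and real inputs used in this notebook:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

# toy standardized features and a noisy target (hypothetical stand-ins)
rng = np.random.RandomState(0)
X = rng.randn(2000, 10)
y = 0.1 * X[:, 0] + 0.05 * np.sin(X[:, 1]) + 0.3 * rng.randn(2000)

# stage 1: linear fit on the raw target
lin = Ridge(alpha=10.).fit(X, y)
resid = y - lin.predict(X)

# stage 2: kernel learners fit on the residual; average their predictions
svrs = [SVR(kernel='rbf', C=0.1, epsilon=0.2, gamma=g).fit(X, resid)
        for g in (0.05, 0.5)]
resid_hat = np.mean([s.predict(X) for s in svrs], axis=0)

# combined prediction = linear part + averaged residual part
y_hat = lin.predict(X) + resid_hat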


In [73]:
def tst_bst(rgs_linear, rgs_svr_list, x, y, w):
    # combine a linear model with one or more residual SVRs and report the out-of-sample rsq
    ypre_linear = rgs_linear.predict(x)

    # project the features onto the PCA components the SVRs were trained on
    x = np.dot(x, w.T)
    try:
        ypre_list = []

        for i, rgs_svr in enumerate(rgs_svr_list):
            ypre_list.append(rgs_svr.predict(x))
            print '\t\t\t\tNo.{0} Residual_rsq = {1}, Combined_rsq = {2}'.format(i, 
                    rsquare(rgs_svr.yout, rgs_svr._yout_predict), rsquare(y, yhat=(ypre_linear + ypre_list[i])))

        # average the residual predictions of the individual SVRs
        ypre_svr = np.zeros_like(ypre_linear)
        for ypre in ypre_list:
            ypre_svr = ypre_svr + ypre
        ypre_svr = ypre_svr / len(ypre_list)

    except (TypeError, AttributeError):
        # rgs_svr_list is a single (non-iterable) regressor, e.g. a BaggingRegressor like bager
        ypre_svr = rgs_svr_list.predict(x)
    rsq = rsquare(y, yhat=(ypre_linear + ypre_svr))
    print '\t\t\t\tAll in all rsq = {0}'.format(rsq)

In [82]:
tst_bst(rgs1, [rgs2, rgs3, rgs4], 
        xtest_stdzd.ix[ytest2.index], 
        ytest2, 
        pca_mat[:7, :])


'predict' 0.06 sec
out-in
'predict' 5.22 sec
using yhat
using yhat
				No.0 Residual_rsq = 0.000827642050765, Combined_rsq = 0.0588711216764
out-in
'predict' 16.13 sec
using yhat
using yhat
				No.1 Residual_rsq = 0.000294289392113, Combined_rsq = 0.0585375809833
'predict' 0.79 sec
using yhat
using yhat
				No.2 Residual_rsq = 0.00110030516693, Combined_rsq = 0.0591965912826
using yhat
				All in all rsq = 0.0589918952466

In [47]:
tst_bst(rgs1, bager, 
        xtest_stdzd.ix[ytest2.index], 
        ytest2, 
        pca_mat[:7, :])


'predict' 0.01 sec
using yhat
				All in all rsq = 0.0593426340871

In [200]:
print Corr2D(rgs2._yout_predict, rgs4._yout_predict)
# print rsquare(rgs2.yout, rgs2._yout_predict)
# print rsquare(rgs3.yout, rgs3._yout_predict)
# print rsquare(rgs2.yout, (rgs2._yout_predict + rgs3._yout_predict)/2)


0.941109399373

boosting


In [56]:
yin2 = yin.ix[::50]
yout2 = yout.ix[::15]
ytest2 = ytest.ix[::15]
print yin2.shape, yout2.shape


(10448,) (11609,)

In [86]:
rgs1 = myRgr()
rgs1.dataGo(xin_stdzd, yin2, xout_stdzd, yout2, xtest_stdzd, ytest2, align=True)
rgs1.regressorGo('Ridge', alpha=6000)
rgs1.fit()
rgs1.rsqGo()
rgs1.test()
#---------------------


'fit' 0.07 sec
'predict' 0.01 sec
'predict' 0.00 sec
'residualGo' 0.02 sec
using residual
using residual
					  ---rsq_in: 0.092371
					 ---rsq_out: 0.095636
'rsqGo' 0.02 sec
					---rsq_test: 0.058292 
'test' 0.01 sec


In [44]:
rgs_stack = myRgr()
rgs_stack.dataGo(pd.DataFrame([rgs2._yin_predict, rgs3._yin_predict]).T, rgs1.rzdu_in, 
                 pd.DataFrame([rgs2._yout_predict, rgs3._yout_predict]).T, rgs1.rzdu_out, 
                 xtest_stdzd,#pd.DataFrame([rgs2.predict(rgs2.xtest), rgs3.predict(rgs3.xtest)]).T, 
                 ytest2, align=False)
rgs_stack.regressorGo('Ridge', alpha=.5)
rgs_stack.fit()
rgs_stack.rsqGo()
#rgs_stack.test()
#---------------------
print rgs_stack.result.coef_


'fit' 0.00 sec
'predict' 0.01 sec
'predict' 0.00 sec
'residualGo' 0.02 sec
using residual
using residual
					  ---rsq_in: 0.001023
					 ---rsq_out: 0.000711
'rsqGo' 0.03 sec
[ 0.96915858 -0.07995365]

PCA

$X_{\text{projected}} = X_{\text{original}}\,W_k^{\top}$, where $W_k$ = pca.components_[:k, :] is the first $k$ component rows; in code: X_projected = np.dot(X_original, pca.components_[:k, :].T)
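
A minimal sketch of this projection with standard scikit-learn PCA and made-up data (the actual notebook loads precomputed components from disk in the next cell); it also checks that the hand-rolled matrix product agrees with pca.transform once the data is centered:

import numpy as np
from sklearn.decomposition import PCA

# hypothetical stand-in for the standardized feature matrix
X = np.random.randn(1000, 80)

pca = PCA().fit(X)

k = 7                                   # number of components to keep
W = pca.components_[:k, :]              # shape (k, n_features)
X_proj = np.dot(X - pca.mean_, W.T)     # shape (n_samples, k)

# agrees with the library transform restricted to the first k columns;
# subtracting pca.mean_ makes the check exact (a no-op for already-centered data)
print np.allclose(X_proj, pca.transform(X)[:, :k])   # True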


In [20]:
pca_mat = np.load('pca_components_calc_from_xinstdzd.npy')
explain_var = np.load('pca_explain_ratio_calc_from_xinstdzd.npy')

In [21]:
plt.figure(figsize=(15,10))
plt.plot(explain_var, marker='d')
plt.show()



In [61]:
from sklearn.ensemble import BaggingRegressor

In [62]:
temp = svm.SVR(kernel='linear', 
               gamma=rgs2.half_life*.2,  # gamma is ignored by the linear kernel
               epsilon=.17, C=.0004)

In [63]:
bager = BaggingRegressor(temp, n_estimators=100, max_samples=0.5, n_jobs=7)
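
With these settings each of the 100 base SVRs is fit on a bootstrap sample of half the training rows (max_samples=0.5, bootstrap=True by default), and bager.predict returns the average of the individual predictions.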

In [64]:
bager.fit(rgs4.xin, rgs4.yin)


Out[64]:
BaggingRegressor(base_estimator=SVR(C=0.0004, cache_size=200, coef0=0.0, degree=3, epsilon=0.17,
  gamma=0.011521131276865269, kernel='linear', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.5, n_estimators=100, n_jobs=7, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [66]:
print bager.score(rgs4.xin, rgs4.yin)
print bager.score(rgs4.xout, rgs4.yout)


-0.000270303553432
0.00106919910111

In [75]:
rgs2 = myRgr()
rgs2.dataGo(xin_stdzd, rgs1.rzdu_in, xout_stdzd, rgs1.rzdu_out, xtest_stdzd, ytest2, align=True, w=pca_mat[:7, :])
rgs2.disPrep(my_distance, metric='eu', w=None, squared=True)


PCA!

In [76]:
# the step sizes exceed the ranges, so each arange yields a single value (one parameter combination)
for myepsilon in np.arange(.17, .2, .2):
    for myc in np.arange(.015, .09, 9.01):
        for mygamma in np.arange(.2, 2.04, 2.001):
            print '==================== C: {0} epsilon: {1} gamma: {2:.4f}'.format(myc, myepsilon, mygamma)
            rgs2.kernelGo(my_kernel_exp, gamma=rgs2.half_life * mygamma)
            rgs2.regressorGo('svr', epsilon=myepsilon, C=myc)
            rgs2.fit()
            rgs2.rsqGo()


==================== C: 0.015 epsilon: 0.17 gamma: 0.2000
in-in
'fit' 6.42 sec
in-in
'predict' 4.51 sec
out-in
'predict' 5.02 sec
'residualGo' 9.53 sec
using residual
using residual
					  ---rsq_in: 0.001018
					 ---rsq_out: 0.000828
'rsqGo' 9.54 sec

In [68]:
def my_kernel_spherical(dis, gamma=.1):
    # note: gamma is unused and dis is modified in place
    dis /= 20.
    dis[dis > 1.] = 1.
    f_s = lambda x: 1 - x * 3 / 2. + x**3 / 2.
    return f_s(dis)
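
For reference, clipping dis/20 at 1 makes this exactly the spherical covariance function with range $\theta = 20$:

$k(d) = 1 - \frac{3}{2}\,\frac{d}{\theta} + \frac{1}{2}\left(\frac{d}{\theta}\right)^3$ for $d \le \theta$, and $k(d) = 0$ for $d > \theta$ (the value at $d = \theta$ is already 0, so the clip gives the zero tail).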

In [69]:
rgs3 = myRgr()
rgs3.dataGo(xin_stdzd, rgs1.rzdu_in, xout_stdzd, rgs1.rzdu_out, xtest_stdzd, ytest2, align=True, w=pca_mat[:7, :])
rgs3.disPrep(my_distance, metric='eu', w=None, squared=True)
#rgs3.partial_kernel = pairwise.cosine_similarity


PCA!

In [71]:
for myepsilon in np.arange(.17, .19, 1.002):
    for myc in np.arange(10e-5, 100e-5, 200e-5):
        for mygamma in np.arange(.2, 6., 9.3):
            print '==================== C: {0} epsilon: {1} gamma: {2:.4f}'.format(myc, myepsilon, mygamma)
            rgs3.kernelGo(my_kernel_spherical)
            rgs3.regressorGo('svr', epsilon=myepsilon, C=myc)
            rgs3.fit()
            rgs3.rsqGo()


==================== C: 0.0001 epsilon: 0.17 gamma: 0.2000
in-in
'fit' 16.11 sec
in-in
'predict' 14.27 sec
out-in
'predict' 15.96 sec
'residualGo' 30.23 sec
using residual
using residual
					  ---rsq_in: 0.000377
					 ---rsq_out: 0.000294
'rsqGo' 30.23 sec

In [77]:
rgs4 = myRgr()
rgs4.dataGo(xin_stdzd, rgs1.rzdu_in, xout_stdzd, rgs1.rzdu_out, xtest_stdzd, ytest2, align=True, w=pca_mat[:7, :])
#rgs4.disPrep(my_distance, metric='mh', w=pca_mat[:6, :], squared=False)


PCA!

In [97]:
sns.distplot(np.dot(xin_stdzd.ix[::1000], xin_stdzd.ix[::1000].T).ravel() * 1e-2)


Out[97]:
<matplotlib.axes.AxesSubplot at 0x7f6e35c23e10>

In [138]:
sns.distplot(pairwise.sigmoid_kernel(xin_stdzd.ix[::300], xin_stdzd.ix[::300], gamma=5e-2, coef0=0.).ravel())


Out[138]:
<matplotlib.axes.AxesSubplot at 0x7f6dcf9408d0>

In [78]:
for myepsilon in np.arange(.17,.18, 1.003):
    for myc in np.arange(2e-2, 20e-2, 20e-2):
        for mygamma in np.arange(10e-2, 16., 19.3):
            print '==================== C: {0} epsilon: {1} gamma: {2:.4f}'.format(myc, myepsilon, mygamma)
            rgs4.kernelGo('linear')
            rgs4.regressorGo('svr', epsilon=myepsilon, C=myc, 
                             gamma=mygamma, coef0=1.
                             )
            rgs4.fit()
            rgs4.rsqGo()


==================== C: 0.02 epsilon: 0.17 gamma: 0.1000
'fit' 5.27 sec
'predict' 0.71 sec
'predict' 0.79 sec
'residualGo' 1.50 sec
using residual
using residual
					  ---rsq_in: 0.000325
					 ---rsq_out: 0.001100
'rsqGo' 1.50 sec

In [81]:
Corr2D(rgs2._yout_predict, rgs4._yout_predict)


Out[81]:
0.89093919137807143

In [79]:
print rgs3.result.support_.shape
print rgs3.result.intercept_
sns.distplot(rgs3.result.dual_coef_)


(4474,)
[ 0.10437228]
Out[79]:
<matplotlib.axes.AxesSubplot at 0x7fa931c5d190>

In [75]:
np.median(rgs3.partial_kernel(rgs3.xin, rgs3.xin))


in-in
Out[75]:
0.98931190872007368

In [121]:
%matplotlib inline

In [120]:
sns.distplot(rgs3.yin, bins=300, kde=False, norm_hist=True)
sns.distplot(rgs1.yin.values, bins=300, kde=False, norm_hist=True)


Out[120]:
<matplotlib.axes.AxesSubplot at 0x7f7e3a74f790>

In [52]:
temp = rgs3.partial_kernel(rgs3.xin, rgs3.xin).ravel()


in-in


In [53]:
sns.distplot(temp[::10])
print np.median(temp)


0.870550563296

In [47]:
rgs3.result.support_.shape


Out[47]:
(6608,)

In [169]:
rgs2.result.dual_coef_


Out[169]:
array([[-0.5, -0.5,  0.5, ..., -0.5,  0.5,  0.5]])

In [84]:
for myepsilon in np.arange(.17, .9, .9):
    for myc in np.arange(.8, 2., 2.2):
        for mygamma in np.arange(.2, 2., 2.2):
            print '==================== C: {0} epsilon: {1} gamma: {2}'.format(myc, myepsilon, mygamma)
            rgs2 = myRgr()
            rgs2.modelGo('svr', regressor_kws={'epsilon': myepsilon, 'C': myc}, 
                         kernel=my_kernel_exp, 
                         kernel_kws={'gamma': half_life*mygamma, 'metric': 'eu', 'squared': True, 'w': None, 'full_output': False})
            rgs2.dataGo(xin_stdzd.ix[:, feature], rgs1.rzdu_in, xout_stdzd.ix[:, feature], rgs1.rzdu_out, xtest_stdzd, ytest2, align=True)
            rgs2.fit()
            rgs2.rsqGo()


==================== C: 0.8 epsilon: 0.17 gamma: 0.2
'fit' 9.42 sec
'predict' 7.10 sec
'predict' 7.86 sec
'residualGo' 14.97 sec
using residual
using residual
					  ---rsq_in: 1.580003
					 ---rsq_out: -0.408104
'rsqGo' 14.97 sec

In [28]:
rsquare(yout2, rgs2.predict(xout_stdzd.ix[yout2.index]))


'predict' 2.96 sec
using yhat
Out[28]:
-0.18474196731434001

In [33]:
sns.set_context('poster')

In [37]:
sns.distplot(rgs1.rzdu_in, kde=False, bins=100)


Out[37]:
<matplotlib.axes.AxesSubplot at 0x7f117f267350>

In [339]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 5))
sns.distplot(rgs_bcmk.yin, norm_hist=True, ax=ax1)
sns.distplot(rgs_bcmk.yout, norm_hist=True, ax=ax2)
sns.distplot(rgs_bcmk.rzdu_in, norm_hist=True, ax=ax3)
sns.distplot(rgs_bcmk.rzdu_out, norm_hist=True, ax=ax4)


Out[339]:
<matplotlib.axes.AxesSubplot at 0x7f2ef30eb690>

In [380]:
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 5))
sns.distplot(rgs1.rzdu_in, norm_hist=True, ax=ax1)
sns.distplot(rgs1.rzdu_out, norm_hist=True, ax=ax2)
# sns.distplot(rgs_bcmk.rzdu_in, norm_hist=True, ax=ax3)
# sns.distplot(rgs_bcmk.rzdu_out, norm_hist=True, ax=ax4)


Out[380]:
<matplotlib.axes.AxesSubplot at 0x7f2ef22e6510>

In [250]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 3))
sns.distplot(rgs1.rzdu_in, ax=ax1)
sns.distplot(rgs1.rzdu_out, ax=ax2)


Out[250]:
<matplotlib.axes.AxesSubplot at 0x7f2efaf11c50>

Feature selection


In [242]:
rgs_lasso = myRgr()
rgs_lasso.dataGo(xin_stdzd, yin2, xout_stdzd, yout2, xtest_stdzd, ytest2, align=True)
rgs_lasso.regressorGo('Lasso', alpha=5e-3)
rgs_lasso.fit()
rgs_lasso.rsqGo()
rgs_lasso.test()
#---------------------


'fit' 0.54 sec
'predict' 0.00 sec
'predict' 0.01 sec
'residualGo' 0.01 sec
using residual
using residual
					  ---rsq_in: 0.109058
					 ---rsq_out: 0.092796
'rsqGo' 0.02 sec
					---rsq_test: 0.055742 
'test' 0.01 sec

In [244]:
feature = rgs_lasso.result.coef_ != 0

In [245]:
feature


Out[245]:
array([False, False, False,  True, False, False, False, False, False,
       False,  True,  True, False,  True,  True, False, False,  True,
        True, False, False, False, False, False, False, False, False,
       False,  True, False,  True, False, False, False, False,  True,
       False, False,  True, False, False, False, False,  True, False,
       False, False,  True,  True,  True,  True, False, False,  True,
        True, False, False, False, False,  True, False, False, False,
       False, False,  True,  True,  True,  True,  True,  True, False,
       False, False, False, False, False,  True,  True,  True], dtype=bool)
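
This boolean mask of the non-zero Lasso coefficients is what restricts the SVR inputs to the selected columns in In [84] above (xin_stdzd.ix[:, feature]).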

KNN


In [17]:
from sklearn.neighbors import KNeighborsRegressor

In [35]:
def sigmaEst(y, yhat):
    err = y - yhat
    N = len(err)
    sns.distplot(err)
    #err /= np.sqrt(N)
    return err.std()

In [36]:
def noiseEst(y, yhat, nnn):
    # noise-variance estimate from a k-NN fit: residual sum of squares divided by
    # the sample size minus a heuristic effective-degrees-of-freedom term
    res = y - yhat
    res = res**2
    res = res.sum()
    N = len(y)
    ddof = N / (N**(1./5) * nnn)
    print 'ddof = {0}'.format(ddof)
    res /= N - ddof
    return res

In [37]:
def epsilonEst(x, y, nnn, w=None):
    if w is not None:
        x = np.dot(x, w.T)
    neigh = KNeighborsRegressor(n_neighbors=nnn)
    neigh.fit(x, y)
    ypre = neigh.predict(x)
    print 'rsq: {0}'.format(rsquare(y, ypre))
#     sigma2 = noiseEst(y, ypre, nnn)
#     print sigma2
#     n = len(y)
#     print 'n = {0}'.format(n)
#     return 3 * np.sqrt(sigma2 * np.log(n) / n)
    return noiseEst(y, ypre, nnn), sigmaEst(y, ypre)
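
The commented-out return value, and the computation in the cells further down (3 * np.sqrt(sigma2 * np.log(n) / n)), appears to be the Cherkassky-Ma rule of thumb for the SVR tube width, $\epsilon \approx 3\,\sigma\sqrt{\ln n / n}$, with $\sigma^2$ the noise variance estimated from the k-NN fit (noiseEst above) and $n$ the sample size.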

In [38]:
epsilonEst(xin_stdzd.ix[rgs1.rzdu_in.ix[::].index], rgs1.rzdu_in.ix[::], 3, w=pca_mat[:5, :])


using yhat
rsq: 0.336963260836
ddof = 547.148579973
Out[38]:
(0.12054365010053021, 0.33799127931216399)

In [26]:
max(abs(yin2.mean() - 3*(yin2.std())), abs(yin2.mean() + 3*(yin2.std())))


Out[26]:
1.3118310100563189

In [148]:
neigh = KNeighborsRegressor(n_neighbors=6)

In [149]:
temp_resample = 10
neigh.fit(xin_stdzd.ix[yin2.ix[::temp_resample].index], yin2.ix[::temp_resample])


Out[149]:
KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=6, p=2,
          weights='uniform')

In [150]:
temp = neigh.predict(xin_stdzd.ix[yin2.ix[::temp_resample].index])

In [153]:
sigma2 = noiseEst(yin2.ix[::temp_resample], temp)
print sigma2


180.491719911
0.147849172851

In [156]:
n = len(yin2.ix[::temp_resample])
3 * np.sqrt(sigma2 * np.log(n) / n)


Out[156]:
0.063310063176152453

In [146]:
rsquare(yin2.ix[::temp_resample], temp)


using yhat
Out[146]:
0.60423117608552124

In [147]:
plt.scatter(yin2.ix[::temp_resample], temp, facecolor='')


Out[147]:
<matplotlib.collections.PathCollection at 0x7f6b28d17f10>
