In [1]:
from sys import path
path.append('/home/bingnan/ecworkspace/HFT1')

In [2]:
sns.set_context('poster')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-a0be4f61f81d> in <module>()
----> 1 sns.set_context('poster')

NameError: name 'sns' is not defined

In [3]:
%matplotlib inline

In [4]:
from init import *

In [5]:
_xin_mean = xin.mean(axis=0)
_xin_std = xin.std(axis=0)
xin_stdzd = (xin - _xin_mean) / _xin_std
xout_stdzd = (xout - _xin_mean) / _xin_std
xtest_stdzd = (xtest - _xin_mean) / _xin_std

In [5]:


In [45]:
yin2 = yin.ix[::50]
yout2 = yout.ix[::15]

Feature Selection with Lasso


In [46]:
modr = linear_model.Ridge(alpha=6000.)
res2, rsq_in, rsq_out = MyRgrs(xin_stdzd, xout_stdzd, yin2, yout2, modr, align=True)
mycoef2 = res2.coef_
nonzero_len = (mycoef2 != 0).sum() * 1. / len(mycoef2)
rsq_test = res2.score(xtest_stdzd.ix[::15], ytest.ix[::15])
print ('rsq_test: %f, nonzero_len: %.3f' %(rsq_test, nonzero_len))


rsq_in: 0.092371
rsq_out: 0.095636
rsq_test: 0.058292, nonzero_len: 1.000

In [47]:
coef3_mat = np.arange(80)
for i in np.arange(5e-3, 10e-3, 1e-4):
    modr = linear_model.Lasso(alpha=i)
    res3, rsq_in, rsq_out = MyRgrs(xin_stdzd, xout_stdzd, yin2, yout2, modr, align=True)
    mycoef3 = res3.coef_
    coef3_mat = np.vstack((coef3_mat, mycoef3))
    nonzero_len = (mycoef2 != 0).sum() * 1. / len(mycoef2)
    rsq_test = res3.score(xtest_stdzd.ix[::15], ytest.ix[::15])
    print 'rsq_test: {1}, nonzero_len: {2:.3f}\n====i={0:.4f}==\n'.format(rsq_test, nonzero_len, i)
coef3_mat = pd.DataFrame(data=coef3_mat[1:, :], columns=x0.columns, index=np.arange(5e-3, 10e-3, 1e-4))


rsq_in: 0.093357
rsq_out: 0.095050
rsq_test: 1.0, nonzero_len: 0.005
====i=0.0570==

rsq_in: 0.093311
rsq_out: 0.095033
rsq_test: 1.0, nonzero_len: 0.005
====i=0.0570==

rsq_in: 0.093264
rsq_out: 0.095016
rsq_test: 1.0, nonzero_len: 0.005
====i=0.0569==

rsq_in: 0.093216
rsq_out: 0.094997
rsq_test: 1.0, nonzero_len: 0.005
====i=0.0569==

rsq_in: 0.093167
rsq_out: 0.094978
rsq_test: 1.0, nonzero_len: 0.005
====i=0.0568==

rsq_in: 0.093117
rsq_out: 0.094958
rsq_test: 1.0, nonzero_len: 0.006
====i=0.0568==

rsq_in: 0.093066
rsq_out: 0.094937
rsq_test: 1.0, nonzero_len: 0.006
====i=0.0568==

rsq_in: 0.093014
rsq_out: 0.094915
rsq_test: 1.0, nonzero_len: 0.006
====i=0.0567==

rsq_in: 0.092961
rsq_out: 0.094892
rsq_test: 1.0, nonzero_len: 0.006
====i=0.0567==

rsq_in: 0.092908
rsq_out: 0.094869
rsq_test: 1.0, nonzero_len: 0.006
====i=0.0566==

rsq_in: 0.092853
rsq_out: 0.094845
rsq_test: 1.0, nonzero_len: 0.006
====i=0.0566==

rsq_in: 0.092798
rsq_out: 0.094820
rsq_test: 1.0, nonzero_len: 0.006
====i=0.0565==

rsq_in: 0.092742
rsq_out: 0.094794
rsq_test: 1.0, nonzero_len: 0.006
====i=0.0565==

rsq_in: 0.092684
rsq_out: 0.094768
rsq_test: 1.0, nonzero_len: 0.006
====i=0.0564==

rsq_in: 0.092626
rsq_out: 0.094741
rsq_test: 1.0, nonzero_len: 0.006
====i=0.0564==

rsq_in: 0.092567
rsq_out: 0.094713
rsq_test: 1.0, nonzero_len: 0.007
====i=0.0563==

rsq_in: 0.092507
rsq_out: 0.094684
rsq_test: 1.0, nonzero_len: 0.007
====i=0.0563==

rsq_in: 0.092446
rsq_out: 0.094654
rsq_test: 1.0, nonzero_len: 0.007
====i=0.0562==

rsq_in: 0.092385
rsq_out: 0.094624
rsq_test: 1.0, nonzero_len: 0.007
====i=0.0562==

rsq_in: 0.092322
rsq_out: 0.094592
rsq_test: 1.0, nonzero_len: 0.007
====i=0.0561==

rsq_in: 0.092258
rsq_out: 0.094560
rsq_test: 1.0, nonzero_len: 0.007
====i=0.0561==

rsq_in: 0.092194
rsq_out: 0.094527
rsq_test: 1.0, nonzero_len: 0.007
====i=0.0560==

rsq_in: 0.092128
rsq_out: 0.094494
rsq_test: 1.0, nonzero_len: 0.007
====i=0.0560==

rsq_in: 0.092062
rsq_out: 0.094459
rsq_test: 1.0, nonzero_len: 0.007
====i=0.0559==

rsq_in: 0.091995
rsq_out: 0.094424
rsq_test: 1.0, nonzero_len: 0.007
====i=0.0559==

rsq_in: 0.091927
rsq_out: 0.094388
rsq_test: 1.0, nonzero_len: 0.008
====i=0.0558==

rsq_in: 0.091857
rsq_out: 0.094351
rsq_test: 1.0, nonzero_len: 0.008
====i=0.0557==

rsq_in: 0.091787
rsq_out: 0.094314
rsq_test: 1.0, nonzero_len: 0.008
====i=0.0557==

rsq_in: 0.091717
rsq_out: 0.094275
rsq_test: 1.0, nonzero_len: 0.008
====i=0.0556==

rsq_in: 0.091645
rsq_out: 0.094236
rsq_test: 1.0, nonzero_len: 0.008
====i=0.0556==

rsq_in: 0.091572
rsq_out: 0.094196
rsq_test: 1.0, nonzero_len: 0.008
====i=0.0555==

rsq_in: 0.091498
rsq_out: 0.094156
rsq_test: 1.0, nonzero_len: 0.008
====i=0.0555==

rsq_in: 0.091424
rsq_out: 0.094114
rsq_test: 1.0, nonzero_len: 0.008
====i=0.0554==

rsq_in: 0.091348
rsq_out: 0.094072
rsq_test: 1.0, nonzero_len: 0.008
====i=0.0553==

rsq_in: 0.091272
rsq_out: 0.094029
rsq_test: 1.0, nonzero_len: 0.008
====i=0.0553==

rsq_in: 0.091194
rsq_out: 0.093985
rsq_test: 1.0, nonzero_len: 0.009
====i=0.0552==

rsq_in: 0.091116
rsq_out: 0.093940
rsq_test: 1.0, nonzero_len: 0.009
====i=0.0552==

rsq_in: 0.091037
rsq_out: 0.093895
rsq_test: 1.0, nonzero_len: 0.009
====i=0.0551==

rsq_in: 0.090957
rsq_out: 0.093848
rsq_test: 1.0, nonzero_len: 0.009
====i=0.0550==

rsq_in: 0.090876
rsq_out: 0.093801
rsq_test: 1.0, nonzero_len: 0.009
====i=0.0550==

rsq_in: 0.090794
rsq_out: 0.093753
rsq_test: 1.0, nonzero_len: 0.009
====i=0.0549==

rsq_in: 0.090711
rsq_out: 0.093705
rsq_test: 1.0, nonzero_len: 0.009
====i=0.0548==

rsq_in: 0.090627
rsq_out: 0.093655
rsq_test: 1.0, nonzero_len: 0.009
====i=0.0548==

rsq_in: 0.090543
rsq_out: 0.093605
rsq_test: 1.0, nonzero_len: 0.009
====i=0.0547==

rsq_in: 0.090457
rsq_out: 0.093554
rsq_test: 1.0, nonzero_len: 0.009
====i=0.0546==

rsq_in: 0.090371
rsq_out: 0.093502
rsq_test: 1.0, nonzero_len: 0.010
====i=0.0546==

rsq_in: 0.090283
rsq_out: 0.093449
rsq_test: 1.0, nonzero_len: 0.010
====i=0.0545==

rsq_in: 0.090195
rsq_out: 0.093396
rsq_test: 1.0, nonzero_len: 0.010
====i=0.0544==

rsq_in: 0.090111
rsq_out: 0.093353
rsq_test: 1.0, nonzero_len: 0.010
====i=0.0544==

rsq_in: 0.090026
rsq_out: 0.093309
rsq_test: 1.0, nonzero_len: 0.010
====i=0.0543==


In [44]:
coef3_mat = pd.DataFrame(data=coef3_mat[1:, :], columns=x0.columns, index=np.arange(5e-3, 10e-3, 1e-4))


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-44-ce36caf6ff5a> in <module>()
----> 1 coef3_mat = pd.DataFrame(data=coef3_mat[1:, :], columns=x0.columns, index=np.arange(5e-3, 10e-3, 1e-4))

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    253             else:
    254                 mgr = self._init_ndarray(data, index, columns, dtype=dtype,
--> 255                                          copy=copy)
    256         elif isinstance(data, (list, types.GeneratorType)):
    257             if isinstance(data, types.GeneratorType):

/usr/local/lib/python2.7/dist-packages/pandas/core/frame.pyc in _init_ndarray(self, values, index, columns, dtype, copy)
    430             values = _possibly_infer_to_datetimelike(values)
    431 
--> 432         return create_block_manager_from_blocks([values], [columns, index])
    433 
    434     @property

/usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in create_block_manager_from_blocks(blocks, axes)
   3991         blocks = [getattr(b, 'values', b) for b in blocks]
   3992         tot_items = sum(b.shape[0] for b in blocks)
-> 3993         construction_error(tot_items, blocks[0].shape[1:], axes, e)
   3994 
   3995 

/usr/local/lib/python2.7/dist-packages/pandas/core/internals.pyc in construction_error(tot_items, block_shape, axes, e)
   3968         raise ValueError("Empty data passed with indices specified.")
   3969     raise ValueError("Shape of passed values is {0}, indices imply {1}".format(
-> 3970         passed, implied))
   3971 
   3972 

ValueError: Shape of passed values is (80, 22), indices imply (80, 50)

In [49]:
%matplotlib inline

In [29]:
temp1 = coef3_mat.all(axis=0, ) != 0

In [30]:
temp2 = coef3_mat.any(axis=0, ) != 0

In [47]:
(coef3_mat.iloc[3, :] != 0).sum()


Out[47]:
23

In [48]:
np.abs(coef3_mat).plot()


Out[48]:
<matplotlib.axes.AxesSubplot at 0x7fcd98dcb2d0>

In [52]:
sns.boxplot(np.abs(coef3_mat).iloc[0,:])


Out[52]:
<matplotlib.axes.AxesSubplot at 0x7fcd987d92d0>

In [53]:
sns.distplot(np.abs(coef3_mat).iloc[0,:])


Out[53]:
<matplotlib.axes.AxesSubplot at 0x7fcd9872a810>

In [48]:
selected_index = coef3_mat.ix[:, coef3_mat.iloc[3, :] != 0].columns

In [49]:
unselected_index = np.array([i for i in x0.columns if i not in selected_index])
unselected_index = pd.Index(unselected_index)

In [50]:
unselected_index


Out[50]:
Index([u'x0', u'x1', u'x2', u'x4', u'x6', u'x7', u'x8', u'x9', u'x14', u'x15',
       u'x16', u'x19', u'x20', u'x21', u'x22', u'x23', u'x24', u'x25', u'x26',
       u'x27', u'x28', u'x29', u'x31', u'x32', u'x34', u'x36', u'x37', u'x39',
       u'x40', u'x41', u'x42', u'x44', u'x45', u'x46', u'x47', u'x48', u'x51',
       u'x52', u'x55', u'x56', u'x57', u'x58', u'x60', u'x61', u'x62', u'x63',
       u'x64', u'x65', u'x70', u'x71', u'x72', u'x73', u'x74', u'x75', u'x76',
       u'x77', u'x79'],
      dtype='object')

In [51]:
selected_index


Out[51]:
Index([u'x3', u'x5', u'x10', u'x11', u'x12', u'x13', u'x17', u'x18', u'x30',
       u'x33', u'x35', u'x38', u'x43', u'x49', u'x50', u'x53', u'x54', u'x59',
       u'x66', u'x67', u'x68', u'x69', u'x78'],
      dtype='object')

In [69]:
(x0_prop.ix[coef3_mat.iloc[3, :] != 0, :]).sort_values('std', ascending=False)


Out[69]:
rsq_in rsq_out slope_P slope const_P const corr with Y mean std min 25% 50% 75% max
x33 0.037485 0.028196 0.000000e+00 0.111786 1.624675e-08 -0.003360 0.176089 1.335332e-02 0.756463 -1.000000 -0.800000 0.100000 0.900000 1.000000
x78 0.050408 0.041583 0.000000e+00 0.136487 4.788693e-07 -0.002975 0.201318 7.192254e-03 0.715330 -1.000000 -0.733333 0.033333 0.733333 1.000000
x35 0.056139 0.054750 0.000000e+00 0.158412 4.814137e-06 -0.002693 0.213798 2.897277e-03 0.649677 -1.000000 -0.600000 0.000000 0.600000 1.000000
x3 0.061652 0.060021 0.000000e+00 0.190966 2.283902e-01 -0.000707 0.223430 -7.706089e-03 0.552837 -3.645521 -0.314283 -0.009804 0.292229 3.176236
x5 0.035753 0.037509 0.000000e+00 0.170821 1.999827e-01 -0.000763 0.167552 -7.884684e-03 0.463596 -3.343546 -0.287744 -0.009429 0.268107 2.743771
x30 0.024310 0.013377 0.000000e+00 0.154902 7.704244e-06 -0.002679 0.137133 5.770656e-03 0.451264 -1.934732 -0.294130 0.000139 0.307959 1.904741
x50 0.003707 0.002613 0.000000e+00 0.068587 1.291578e-03 -0.001947 0.064548 4.212534e-03 0.379452 -1.600814 -0.231839 0.005940 0.242013 1.601556
x49 0.002829 0.002180 0.000000e+00 0.069968 9.567692e-04 -0.002000 0.056658 7.278583e-03 0.359988 -1.643423 -0.208904 0.007022 0.225924 1.716797
x17 0.014828 0.006880 0.000000e+00 0.158919 1.011068e-03 -0.001978 0.097267 2.353525e-03 0.349963 -1.996780 -0.187755 0.000000 0.196142 1.808665
x13 0.029680 0.021189 0.000000e+00 -0.302267 1.416186e-06 -0.002880 -0.146688 -3.353766e-03 0.256696 -1.566101 -0.158266 -0.003313 0.149536 1.466759
x18 0.010673 0.001653 0.000000e+00 0.198981 7.718421e-04 -0.002028 0.072950 2.258953e-03 0.236151 -1.439729 -0.129921 0.004698 0.138178 1.292773
x59 0.015737 0.007763 0.000000e+00 -0.343559 1.211072e-16 -0.004990 -0.104851 -8.333821e-03 0.159596 -0.500000 -0.100000 0.000000 0.100000 0.500000
x54 0.001812 -0.002550 5.057587e-208 0.120247 3.227178e-03 -0.001784 0.017994 7.571190e-04 0.136164 -0.982159 -0.067521 0.000615 0.070477 0.970479
x43 0.003796 0.000315 0.000000e+00 0.252876 3.454921e-03 -0.001769 0.056615 1.646775e-03 0.111206 -0.738249 -0.060106 0.000624 0.064521 0.802034
x38 0.013086 0.018332 0.000000e+00 1.816383 4.194454e-03 -0.001724 0.120450 1.454746e-05 0.028221 -0.586053 -0.014412 0.000000 0.014462 0.364065
x10 0.041243 0.037459 0.000000e+00 -3.292849 2.738606e-03 -0.001778 -0.183341 -9.176858e-07 0.027310 -0.195754 -0.005484 -0.000033 0.005461 0.305588
x66 0.007501 0.008411 0.000000e+00 2.018742 8.510263e-03 -0.001589 0.091926 -1.498705e-04 0.019132 -0.430980 -0.009493 0.000000 0.009427 0.340267
x67 0.011284 0.014833 0.000000e+00 2.636956 4.996012e-03 -0.001692 0.110822 8.890214e-06 0.017892 -0.294545 -0.009204 0.000000 0.009276 0.233266
x69 0.008807 0.014105 0.000000e+00 3.419984 9.739654e-05 -0.002352 0.100755 8.100167e-05 0.012853 -0.137953 -0.006610 0.000000 0.006578 0.233424
x68 0.013171 0.013859 0.000000e+00 4.229836 1.110893e-04 -0.002328 0.112625 1.430658e-04 0.012812 -0.131110 -0.006728 0.000000 0.006820 0.240946
x11 0.056412 0.055133 0.000000e+00 -9.278240 2.721224e-03 -0.001765 -0.213299 2.963735e-06 0.011194 -0.060948 -0.004582 -0.000010 0.004663 0.111311
x53 0.020334 0.020879 0.000000e+00 6.319268 5.187640e-03 -0.001677 0.131005 -2.529367e-05 0.009505 -0.159443 -0.005908 0.000000 0.005884 0.100136
x12 0.059658 0.060138 0.000000e+00 -17.241108 2.511319e-03 -0.001777 -0.219229 1.028488e-06 0.006148 -0.029745 -0.003208 0.000013 0.003258 0.059767

In [68]:
x0_prop.sort_values('std', ascending=False)


Out[68]:
rsq_in rsq_out slope_P slope const_P const corr with Y mean std min 25% 50% 75% max
x33 0.037485 0.028196 0.000000e+00 0.111786 1.624675e-08 -0.003360 0.176089 1.335332e-02 0.756463 -1.000000 -8.000000e-01 1.000000e-01 9.000000e-01 1.000000
x76 0.037044 0.028533 0.000000e+00 0.111544 1.056480e-07 -0.003163 0.175400 1.156028e-02 0.753917 -1.000000 -8.000000e-01 1.000000e-01 8.000000e-01 1.000000
x78 0.050408 0.041583 0.000000e+00 0.136487 4.788693e-07 -0.002975 0.201318 7.192254e-03 0.715330 -1.000000 -7.333333e-01 3.333333e-02 7.333333e-01 1.000000
x79 0.052589 0.046925 0.000000e+00 0.145412 8.123284e-07 -0.002911 0.205832 5.300457e-03 0.684711 -1.000000 -6.666667e-01 0.000000e+00 6.666667e-01 1.000000
x34 0.054657 0.051355 0.000000e+00 0.148977 2.921350e-07 -0.003023 0.211363 5.916476e-03 0.681527 -1.000000 -6.500000e-01 0.000000e+00 6.666667e-01 1.000000
x77 0.052336 0.048130 0.000000e+00 0.151946 2.897205e-05 -0.002468 0.204056 2.145526e-03 0.653816 -1.000000 -6.000000e-01 0.000000e+00 6.000000e-01 1.000000
x35 0.056139 0.054750 0.000000e+00 0.158412 4.814137e-06 -0.002693 0.213798 2.897277e-03 0.649677 -1.000000 -6.000000e-01 0.000000e+00 6.000000e-01 1.000000
x3 0.061652 0.060021 0.000000e+00 0.190966 2.283902e-01 -0.000707 0.223430 -7.706089e-03 0.552837 -3.645521 -3.142827e-01 -9.803903e-03 2.922287e-01 3.176236
x73 0.008162 0.014496 0.000000e+00 -0.077270 7.717035e-04 -0.002030 -0.090872 1.332508e-03 0.524056 -0.991667 -3.750000e-01 0.000000e+00 3.833333e-01 0.991667
x32 0.012593 0.005025 0.000000e+00 0.098536 1.563721e-04 -0.002278 0.094170 5.331118e-03 0.516706 -2.026272 -3.207682e-01 4.421758e-03 3.341454e-01 1.981084
x57 0.010267 0.019412 0.000000e+00 -0.099494 1.534660e-03 -0.001911 -0.102227 2.383332e-04 0.491779 -7.278310 -4.547474e-13 0.000000e+00 6.821210e-13 7.147906
x55 0.002279 0.003065 3.287589e-261 -0.046948 3.012538e-03 -0.001796 -0.045715 -3.930620e-04 0.483335 -3.664548 0.000000e+00 0.000000e+00 0.000000e+00 4.001388
x63 0.011538 0.004577 0.000000e+00 0.104184 1.346315e-03 -0.001932 0.085859 2.468522e-03 0.472604 -2.300416 -2.644439e-01 0.000000e+00 2.723456e-01 2.061524
x5 0.035753 0.037509 0.000000e+00 0.170821 1.999827e-01 -0.000763 0.167552 -7.884684e-03 0.463596 -3.343546 -2.877436e-01 -9.428620e-03 2.681067e-01 2.743771
x23 0.010344 0.020013 0.000000e+00 -0.107451 2.644953e-04 -0.002200 -0.105212 -2.091302e-03 0.459595 -7.460999 -1.213049e-01 2.469312e-07 1.244239e-01 7.040525
x75 0.009496 0.017773 0.000000e+00 -0.097513 9.872748e-04 -0.001988 -0.098996 2.318970e-04 0.455041 -0.983333 -2.000000e-01 0.000000e+00 2.000000e-01 0.983333
x30 0.024310 0.013377 0.000000e+00 0.154902 7.704244e-06 -0.002679 0.137133 5.770656e-03 0.451264 -1.934732 -2.941304e-01 1.392676e-04 3.079595e-01 1.904741
x15 0.034786 0.027140 0.000000e+00 -0.199427 1.669192e-05 -0.002564 -0.163590 -3.709938e-03 0.424584 -2.300416 -2.616417e-01 0.000000e+00 2.460898e-01 2.061524
x50 0.003707 0.002613 0.000000e+00 0.068587 1.291578e-03 -0.001947 0.064548 4.212534e-03 0.379452 -1.600814 -2.318389e-01 5.939958e-03 2.420133e-01 1.601556
x4 0.011227 0.013212 0.000000e+00 0.117066 7.862083e-02 -0.001060 0.092922 -8.002861e-03 0.377618 -2.400846 -2.451890e-01 -8.557804e-03 2.281868e-01 2.139698
x49 0.002829 0.002180 0.000000e+00 0.069968 9.567692e-04 -0.002000 0.056658 7.278583e-03 0.359988 -1.643423 -2.089041e-01 7.022118e-03 2.259238e-01 1.716797
x17 0.014828 0.006880 0.000000e+00 0.158919 1.011068e-03 -0.001978 0.097267 2.353525e-03 0.349963 -1.996780 -1.877545e-01 0.000000e+00 1.961415e-01 1.808665
x56 0.010004 0.014869 0.000000e+00 -0.135979 2.074905e-03 -0.001857 -0.093692 -5.494510e-05 0.348561 -6.465667 -2.273737e-13 0.000000e+00 2.273737e-13 7.159021
x74 0.010983 0.016889 0.000000e+00 -0.147436 1.175300e-03 -0.001956 -0.096922 -3.470990e-04 0.330196 -0.950000 0.000000e+00 0.000000e+00 0.000000e+00 0.950000
x31 0.011676 0.002065 0.000000e+00 0.148323 3.960418e-05 -0.002477 0.084001 4.925928e-03 0.328768 -1.612741 -1.980224e-01 6.868703e-03 2.120769e-01 1.473627
x60 0.023344 0.014200 0.000000e+00 -0.205115 1.266251e-10 -0.003855 -0.135739 -9.042491e-03 0.327733 -0.600000 -2.000000e-01 0.000000e+00 2.000000e-01 0.600000
x47 0.001207 0.000931 3.365239e-139 0.045301 4.219238e-03 -0.001734 0.039955 2.446335e-03 0.327621 -1.448128 -1.971294e-01 3.432586e-03 2.035180e-01 1.494480
x6 0.008152 0.013381 0.000000e+00 0.140943 1.744301e-03 -0.001890 0.090779 -2.772702e-03 0.302895 -1.871802 -2.231436e-01 0.000000e+00 2.231436e-01 2.014903
x0 0.038710 0.037445 0.000000e+00 0.301682 6.933415e-01 -0.000234 0.177106 -6.355550e-03 0.285360 -1.428135 -2.068259e-01 -7.534418e-03 1.945143e-01 1.133379
x28 0.022628 0.009541 0.000000e+00 0.236784 3.381633e-07 -0.003058 0.125917 5.352567e-03 0.283429 -1.352308 -1.776066e-01 6.692083e-03 1.908653e-01 1.240911
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
x59 0.015737 0.007763 0.000000e+00 -0.343559 1.211072e-16 -0.004990 -0.104851 -8.333821e-03 0.159596 -0.500000 -1.000000e-01 0.000000e+00 1.000000e-01 0.500000
x16 0.010742 0.005133 0.000000e+00 -0.305840 1.780839e-06 -0.002881 -0.086633 -3.182501e-03 0.152147 -0.790803 -9.742458e-02 -4.273688e-03 8.942915e-02 0.898926
x54 0.001812 -0.002550 5.057587e-208 0.120247 3.227178e-03 -0.001784 0.017994 7.571190e-04 0.136164 -0.982159 -6.752076e-02 6.151493e-04 7.047668e-02 0.970479
x61 0.005880 0.001086 0.000000e+00 -0.285581 3.124004e-06 -0.002820 -0.061891 -3.151358e-03 0.120707 -0.561393 -7.919910e-02 -4.758540e-03 7.154116e-02 0.609120
x43 0.003796 0.000315 0.000000e+00 0.252876 3.454921e-03 -0.001769 0.056615 1.646775e-03 0.111206 -0.738249 -6.010550e-02 6.241311e-04 6.452087e-02 0.802034
x25 0.009645 0.014705 0.000000e+00 -0.882108 1.438389e-03 -0.001923 -0.092378 -9.516011e-05 0.053487 -1.116149 0.000000e+00 0.000000e+00 0.000000e+00 1.240896
x27 0.006803 0.014233 0.000000e+00 -0.880876 7.009102e-04 -0.002048 -0.089560 -1.488330e-04 0.044885 -0.400000 0.000000e+00 0.000000e+00 0.000000e+00 0.375000
x21 0.027413 0.037533 0.000000e+00 -2.265631 2.260869e-04 -0.002205 -0.156247 -9.894859e-05 0.034130 -0.510929 -7.699879e-03 1.998058e-05 7.937268e-03 0.482764
x71 0.005105 0.010812 0.000000e+00 -1.094911 3.908556e-04 -0.002145 -0.079721 -1.586643e-04 0.031134 -0.361702 0.000000e+00 0.000000e+00 0.000000e+00 0.250000
x52 0.015780 0.017094 0.000000e+00 1.775136 3.594354e-03 -0.001751 0.114030 -1.481753e-05 0.030728 -0.669406 -5.555669e-03 0.000000e+00 5.504655e-03 0.381391
x37 0.007539 0.009775 0.000000e+00 1.293470 5.466588e-03 -0.001678 0.097439 -1.333870e-04 0.030172 -0.819886 -1.371007e-02 0.000000e+00 1.381232e-02 0.456009
x26 0.009735 0.018922 0.000000e+00 -1.683781 5.375760e-04 -0.002088 -0.101550 -1.007999e-04 0.028525 -0.462359 0.000000e+00 0.000000e+00 0.000000e+00 0.441531
x38 0.013086 0.018332 0.000000e+00 1.816383 4.194454e-03 -0.001724 0.120450 1.454746e-05 0.028221 -0.586053 -1.441199e-02 0.000000e+00 1.446215e-02 0.364065
x36 0.006972 0.009163 0.000000e+00 1.660526 4.251162e-03 -0.001727 0.104972 -1.041741e-05 0.028097 -0.531218 -1.312341e-02 0.000000e+00 1.337027e-02 0.491824
x10 0.041243 0.037459 0.000000e+00 -3.292849 2.738606e-03 -0.001778 -0.183341 -9.176858e-07 0.027310 -0.195754 -5.484018e-03 -3.301880e-05 5.460508e-03 0.305588
x41 0.010877 0.017862 0.000000e+00 2.145162 1.440139e-03 -0.001921 0.115319 7.885086e-05 0.023440 -0.445933 -1.205707e-02 0.000000e+00 1.208543e-02 0.343716
x20 0.010344 0.020013 0.000000e+00 -2.271979 2.644953e-04 -0.002200 -0.105212 -9.890630e-05 0.021736 -0.352861 -5.737010e-03 1.167837e-08 5.884519e-03 0.332975
x39 0.014025 0.019103 0.000000e+00 2.741665 2.797050e-04 -0.002187 0.119286 1.695179e-04 0.020478 -0.227473 -1.038361e-02 0.000000e+00 1.047093e-02 0.324944
x40 0.010363 0.019659 0.000000e+00 2.408771 2.541182e-04 -0.002206 0.112021 9.624993e-05 0.019996 -0.288067 -9.662981e-03 0.000000e+00 9.612246e-03 0.306738
x66 0.007501 0.008411 0.000000e+00 2.018742 8.510263e-03 -0.001589 0.091926 -1.498705e-04 0.019132 -0.430980 -9.492595e-03 0.000000e+00 9.427441e-03 0.340267
x67 0.011284 0.014833 0.000000e+00 2.636956 4.996012e-03 -0.001692 0.110822 8.890214e-06 0.017892 -0.294545 -9.203905e-03 0.000000e+00 9.276418e-03 0.233266
x65 0.006100 0.006809 0.000000e+00 2.414170 5.307385e-03 -0.001685 0.095014 -1.934351e-05 0.017879 -0.239056 -8.596676e-03 0.000000e+00 8.706528e-03 0.233725
x22 0.024140 0.035720 0.000000e+00 -4.378153 1.697967e-05 -0.002576 -0.148418 -8.992390e-05 0.016817 -0.271283 -7.263073e-03 8.838458e-05 7.296406e-03 0.199517
x70 0.010373 0.013222 0.000000e+00 3.304626 1.303462e-03 -0.001939 0.106919 5.380037e-05 0.014837 -0.242700 -7.855872e-03 0.000000e+00 7.828223e-03 0.235795
x69 0.008807 0.014105 0.000000e+00 3.419984 9.739654e-05 -0.002352 0.100755 8.100167e-05 0.012853 -0.137953 -6.609845e-03 0.000000e+00 6.577795e-03 0.233424
x68 0.013171 0.013859 0.000000e+00 4.229836 1.110893e-04 -0.002328 0.112625 1.430658e-04 0.012812 -0.131110 -6.728199e-03 0.000000e+00 6.819887e-03 0.240946
x72 0.006143 0.011195 0.000000e+00 -3.009261 1.415605e-04 -0.002300 -0.083019 -8.556277e-05 0.012678 -0.237460 -6.508649e-03 0.000000e+00 6.558969e-03 0.149889
x11 0.056412 0.055133 0.000000e+00 -9.278240 2.721224e-03 -0.001765 -0.213299 2.963735e-06 0.011194 -0.060948 -4.582049e-03 -9.964929e-06 4.662572e-03 0.111311
x53 0.020334 0.020879 0.000000e+00 6.319268 5.187640e-03 -0.001677 0.131005 -2.529367e-05 0.009505 -0.159443 -5.908440e-03 0.000000e+00 5.883632e-03 0.100136
x12 0.059658 0.060138 0.000000e+00 -17.241108 2.511319e-03 -0.001777 -0.219229 1.028488e-06 0.006148 -0.029745 -3.207732e-03 1.325267e-05 3.258262e-03 0.059767

80 rows × 14 columns


In [52]:
corr_mat_unselected = np.corrcoef(xin_stdzd.ix[:, unselected_index].values, rowvar=0)
corr_mat_selected = np.corrcoef(xin_stdzd.ix[:, selected_index].values, rowvar=0)

In [53]:
corr_arr_selected = np.array([])
ran = corr_mat_selected.shape[1]
for i in range(ran):
    for j in range(i+1, ran, 1):
        corr_arr_selected = np.append(corr_arr_selected, corr_mat_selected[i, j])

corr_arr_unselected = np.array([])
ran = corr_mat_unselected.shape[1]
for i in range(ran):
    for j in range(i+1, ran, 1):
        corr_arr_unselected = np.append(corr_arr_unselected, corr_mat_unselected[i, j])

In [7]:
# all features
corr_mat = np.corrcoef(xin_stdzd.ix[:, :].values, rowvar=0)
corr_arr = np.array([])
ran = corr_mat.shape[1]
for i in range(ran):
    for j in range(i+1, ran, 1):
        corr_arr = np.append(corr_arr, corr_mat[i, j])

In [12]:
plt.plot(corr_arr)


Out[12]:
[<matplotlib.lines.Line2D at 0x7fde23ba69d0>]

In [15]:
%matplotlib auto


Using matplotlib backend: TkAgg

In [21]:
def CorrHeatmap(df, cols=None):
    if cols == None:
        cols = df.columns
    cm = np.corrcoef(df[cols].values.T) * 100
    sns.set(font_scale=1.5)
    fig = plt.figure(figsize=(50, 30))
    ax1 = fig.add_subplot(111)
    hm = sns.heatmap(cm, 
    cbar=True,
    annot=False, 
    square=True,
    fmt='.2f',
    annot_kws={'size': 15},
    yticklabels=cols,
    xticklabels=cols, ax=ax1)
    plt.title('Coef. of corr. Matrix (unit: percent)')
    plt.savefig('Corr_Matrix')

In [22]:
CorrHeatmap(xin_stdzd)

In [127]:
print corr_arr_unselected.shape
print corr_arr_selected.shape


(1596,)
(253,)

In [129]:
del corr_arr, corr_mat

In [54]:
plt.figure(figsize=(16,8))
# all x
sns.distplot(corr_arr_selected, label='selected', norm_hist=True)
# selected
sns.distplot(corr_arr_unselected, label='unselected', norm_hist=True)
plt.legend()


Out[54]:
<matplotlib.legend.Legend at 0x7fde239451d0>

In [137]:
fig = plt.figure(figsize=(16,8))
# all x
ax1 = fig.add_subplot(211)
ax2 = fig.add_subplot(212, sharex=ax1)
sns.boxplot(np.abs(corr_arr_selected), ax=ax1)
# selected
sns.boxplot(np.abs(corr_arr_unselected), ax=ax2)


Out[137]:
<matplotlib.axes.AxesSubplot at 0x7fcd732f5d50>

In [28]:
corr_mat[2, 1]


Out[28]:
0.9242512195801823

In [24]:
x0_prop.ix[40:60, :]


Out[24]:
rsq_in rsq_out slope_P slope const_P const corr with Y mean std min 25% 50% 75% max
x40 0.010363 0.019659 0.000000e+00 2.408771 2.541182e-04 -0.002206 0.112021 0.000096 0.019996 -0.288067 -9.662981e-03 0.000000 9.612246e-03 0.306738
x41 0.010877 0.017862 0.000000e+00 2.145162 1.440139e-03 -0.001921 0.115319 0.000079 0.023440 -0.445933 -1.205707e-02 0.000000 1.208543e-02 0.343716
x42 0.001084 0.000958 3.006864e-125 0.063534 4.505705e-03 -0.001721 0.038397 0.002384 0.221671 -1.212832 -1.318046e-01 0.003954 1.384451e-01 1.138966
x43 0.003796 0.000315 0.000000e+00 0.252876 3.454921e-03 -0.001769 0.056615 0.001647 0.111206 -0.738249 -6.010550e-02 0.000624 6.452087e-02 0.802034
x44 0.002922 0.002213 0.000000e+00 0.104812 4.222059e-04 -0.002135 0.056048 0.007355 0.247733 -1.365750 -1.402246e-01 0.007971 1.572438e-01 1.412043
x45 0.004384 0.003251 0.000000e+00 0.108536 6.183663e-04 -0.002071 0.066516 0.004282 0.261694 -1.339704 -1.574684e-01 0.005359 1.697673e-01 1.268719
x46 0.000554 0.000003 6.972247e-65 0.062691 5.447910e-04 -0.002097 0.046762 0.005244 0.190200 -1.107632 -9.547786e-02 0.006735 1.090240e-01 1.115071
x47 0.001207 0.000931 3.365239e-139 0.045301 4.219238e-03 -0.001734 0.039955 0.002446 0.327621 -1.448128 -1.971294e-01 0.003433 2.035180e-01 1.494480
x48 0.003236 0.000512 0.000000e+00 0.149624 3.405888e-03 -0.001773 0.057154 0.001683 0.173333 -1.177285 -8.605302e-02 0.000000 9.031007e-02 1.182892
x49 0.002829 0.002180 0.000000e+00 0.069968 9.567692e-04 -0.002000 0.056658 0.007279 0.359988 -1.643423 -2.089041e-01 0.007022 2.259238e-01 1.716797
x50 0.003707 0.002613 0.000000e+00 0.068587 1.291578e-03 -0.001947 0.064548 0.004213 0.379452 -1.600814 -2.318389e-01 0.005940 2.420133e-01 1.601556
x51 0.000396 0.000144 5.990529e-47 0.036182 1.271207e-03 -0.001954 0.040979 0.005262 0.280778 -1.351381 -1.411197e-01 0.003868 1.539879e-01 1.439747
x52 0.015780 0.017094 0.000000e+00 1.775136 3.594354e-03 -0.001751 0.114030 -0.000015 0.030728 -0.669406 -5.555669e-03 0.000000 5.504655e-03 0.381391
x53 0.020334 0.020879 0.000000e+00 6.319268 5.187640e-03 -0.001677 0.131005 -0.000025 0.009505 -0.159443 -5.908440e-03 0.000000 5.883632e-03 0.100136
x54 0.001812 -0.002550 5.057587e-208 0.120247 3.227178e-03 -0.001784 0.017994 0.000757 0.136164 -0.982159 -6.752076e-02 0.000615 7.047668e-02 0.970479
x55 0.002279 0.003065 3.287589e-261 -0.046948 3.012538e-03 -0.001796 -0.045715 -0.000393 0.483335 -3.664548 0.000000e+00 0.000000 0.000000e+00 4.001388
x56 0.010004 0.014869 0.000000e+00 -0.135979 2.074905e-03 -0.001857 -0.093692 -0.000055 0.348561 -6.465667 -2.273737e-13 0.000000 2.273737e-13 7.159021
x57 0.010267 0.019412 0.000000e+00 -0.099494 1.534660e-03 -0.001911 -0.102227 0.000238 0.491779 -7.278310 -4.547474e-13 0.000000 6.821210e-13 7.147906
x58 0.021352 0.012330 0.000000e+00 -0.310760 3.067238e-15 -0.004736 -0.125622 -0.008492 0.205834 -0.533333 -1.333333e-01 0.000000 1.333333e-01 0.533333
x59 0.015737 0.007763 0.000000e+00 -0.343559 1.211072e-16 -0.004990 -0.104851 -0.008334 0.159596 -0.500000 -1.000000e-01 0.000000 1.000000e-01 0.500000

mean corr


In [34]:
uniqueness = 1 - (np.abs(corr_mat).sum(axis=0) - 1.) / 79

In [36]:
np.save('uniqueness.npy', uniqueness)

PCA


In [138]:
from sklearn.decomposition import PCA

In [293]:
pca = PCA(n_components=80)

In [294]:
pca.fit(xin_stdzd.values)


Out[294]:
PCA(copy=True, iterated_power=4, n_components=80, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [297]:
np.save('pca_components_calc_from_xinstdzd.npy', pca.components_)

In [295]:
print pca.components_.shape
print pca.explained_variance_ratio_
sns.distplot(np.log(pca.explained_variance_ratio_*1e5), kde=False)
plt.figure()
sns.distplot(pca.explained_variance_ratio_, kde=False)


(80, 80)
[  3.10079911e-01   1.79803751e-01   6.38052440e-02   5.60083872e-02
   4.31263642e-02   3.65849051e-02   2.52459064e-02   2.28141479e-02
   2.14943597e-02   2.09108616e-02   1.91954684e-02   1.78685344e-02
   1.52328825e-02   1.39333834e-02   1.22715636e-02   1.08922639e-02
   1.00337962e-02   8.77874225e-03   7.75691522e-03   7.27610299e-03
   6.88231226e-03   6.69181428e-03   6.37011420e-03   5.59077685e-03
   5.38082837e-03   5.00953744e-03   4.52415001e-03   4.13580056e-03
   3.51818001e-03   3.35938385e-03   2.78174220e-03   2.55703617e-03
   2.43504065e-03   2.31391339e-03   2.14117872e-03   2.01524843e-03
   1.93978405e-03   1.88145051e-03   1.83608725e-03   1.76662052e-03
   1.72240270e-03   1.68196275e-03   1.61666532e-03   1.52380170e-03
   1.43085768e-03   1.36798617e-03   1.17322505e-03   1.10782723e-03
   1.01629413e-03   9.36547887e-04   8.75835626e-04   8.43010673e-04
   8.13711628e-04   7.42681471e-04   6.79593948e-04   6.23831285e-04
   5.74918822e-04   5.46386037e-04   5.31034024e-04   4.85659033e-04
   4.75898469e-04   4.34518640e-04   3.83282779e-04   3.52657742e-04
   2.97174783e-04   2.82919677e-04   2.43424386e-04   2.15160665e-04
   1.66894712e-04   1.45938446e-04   1.41420171e-04   9.35231376e-05
   6.78524283e-05   5.78430750e-05   4.84545157e-05   2.18466282e-05
   1.53129325e-05   9.83124393e-06   7.32132217e-06   1.53267529e-25]
Out[295]:
<matplotlib.axes.AxesSubplot at 0x7fcd718be550>

In [303]:
n_PCA = 20

In [304]:
projected_in = np.dot(xin_stdzd, pca.components_[:n_PCA, :].T)
projected_in = pd.DataFrame(data=projected_in, index=xin_stdzd.index, columns=np.arange(n_PCA))

In [305]:
projected_out = np.dot(xout_stdzd, pca.components_[:n_PCA, :].T)
projected_out = pd.DataFrame(data=projected_out, index=xout_stdzd.index, columns=np.arange(n_PCA))

In [306]:
projected_test = np.dot(xtest_stdzd, pca.components_[:n_PCA, :].T)
projected_test = pd.DataFrame(data=projected_test, index=xtest_stdzd.index, columns=np.arange(n_PCA))

In [307]:
modr = linear_model.Ridge(alpha=3000.)
res2, rsq_in, rsq_out = MyRgrs(projected_in, projected_out, yin2, yout2, modr, align=True)
mycoef2 = res2.coef_
nonzero_len = (mycoef2 != 0).sum() * 1. / len(mycoef2)
rsq_test = res2.score(projected_test.ix[::15], ytest.ix[::15])
print ('rsq_test: %f, nonzero_len: %.3f' %(rsq_test, nonzero_len))


rsq_in: 0.088802
rsq_out: 0.094422
rsq_test: 0.057422, nonzero_len: 1.000

In [292]:
modr = linear_model.Ridge(alpha=3000.)
res2, rsq_in, rsq_out = MyRgrs(xin_stdzd, xout_stdzd, yin2, yout2, modr, align=True)
mycoef2 = res2.coef_
nonzero_len = (mycoef2 != 0).sum() * 1. / len(mycoef2)
rsq_test = res2.score(xtest_stdzd.ix[::15], ytest.ix[::15])
print ('rsq_test: %f, nonzero_len: %.3f' %(rsq_test, nonzero_len))


rsq_in: 0.094886
rsq_out: 0.095837
rsq_test: 0.058611, nonzero_len: 1.000

t-SNE


In [235]:
from sklearn.manifold import TSNE

In [236]:
tsne_mod = TSNE(n_components=2, random_state=0)

In [260]:
tsne_in = tsne_mod.fit_transform(projected_in.ix[::100].values)

In [261]:
tsne_in.shape


Out[261]:
(5224, 2)

In [262]:
Blues = plt.get_cmap('Blues')
yin2 = yin.ix[::500]
yin2_norm = (yin2 - yin2.min()) / (yin2.max() - yin2.min())
mycolor = Blues(yin2_norm)

In [254]:
%matplotlib auto


Using matplotlib backend: TkAgg

In [263]:
plt.scatter(tsne_in[:, 0], tsne_in[:, 1], color=mycolor)


Out[263]:
<matplotlib.collections.PathCollection at 0x7fcd7185f390>

In [270]:
plt.scatter(tsne_in[:, 0], tsne_in[:, 1], color=Blues(0.8))


Out[270]:
<matplotlib.collections.PathCollection at 0x7fcd71935e10>

In [ ]: