In [38]:
%matplotlib inline

import glob
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('ggplot')

In [2]:
## handle bottom-up data
#csvs1 = glob.glob('./bottomup/*csv')
#data1 = {}
#for f in csvs1:
#    content = open(f).read().splitlines()[1:]
#    for line in content:
#        tmp = line.split(',', 1)[0]
#        if tmp not in data1:
#            data1[tmp] = []
#        data1[tmp].append('%s,%s' % (os.path.basename(f).replace('.csv', ''), line))
#for k, v in data1.iteritems():
#    with open('./stock_data/bu_%s.csv' % k, 'wb') as fout:
#        fout.write('\n'.join(v))

# handle top-down data
#csvs2 = glob.glob('./topdown/*csv')
#data2 = {}
#for f in csvs2:
#    content = open(f).read().splitlines()[1:]
#    for line in content:
#        d, v = line.split(',', 1)
#        d = d.replace('-', '')
#        if d not in data2:
#            data2[d] = []
#        data2[d].append(v)
#with open('./stock_data/td.csv', 'wb') as fout:
#    for k, v in data2.iteritems():
#        fout.write('%s,%s\n' % (k, ','.join(v)))

In [39]:
# load top-down data
data = np.loadtxt('./stocks_data/td.csv', delimiter=',')
tdDate = data[:, 0].astype(np.float64).reshape(-1, 1)
tdMat = data[:, 1:]
data = None
print tdMat.shape


(249, 14)

In [86]:
from sklearn import linear_model
import sklearn.feature_selection as fs
from sklearn.pipeline import Pipeline

def predict_stock_returns(stockId, k):
    # load stock data
    data = np.loadtxt('./stocks_data/bu_%s.csv' % stockId, delimiter=',', dtype=str, ndmin=2)
    stockDate = data[:, 0].astype(np.float64)
    X = data[:, 3:-1].astype(np.float64)
    Y = data[:, 2].astype(np.float64)
    
    if stockDate.shape[0] == 1:
        return np.hstack((stockDate.reshape(-1, 1), Y.reshape(-1, 1)))
    
    # combine top-down data into individual stock data as new factors
    sel = np.in1d(tdDate, stockDate)
    X_extended = np.hstack((X, tdMat[sel]))
    
    # build model 
    ret = np.hstack((stockDate.reshape(-1, 1), np.zeros((stockDate.shape[0], 1), dtype=np.float64)))
    ret[0, 1] = Y[0]
    ret[-1, 1] = Y[-1]
    for i in range(len(stockDate)-1):
        testDate = stockDate[i+1]
        sel_train = stockDate < testDate
        sel_test = stockDate == testDate
        X_train, Y_train = X_extended[sel_train], Y[sel_train]
        X_test = X_extended[sel_test]
        model = Pipeline([
                ('fs', fs.SelectKBest(fs.f_regression, k=k)),
                ('lr', linear_model.LinearRegression())
            ]).fit(X_train, Y_train)
        ret[i, 1] = model.predict(X_test)
    return ret

In [95]:
stockIds = [re.findall(r'bu_(.*?)\.csv', f)[0] for f in glob.glob('./stocks_data/bu*csv')]

mat = np.zeros((tdDate.shape[0], len(stockIds)), dtype=np.float64)
def sum_forcast_returns(k):
    for i, sid in enumerate(stockIds):
        ret = predict_stock_returns(sid, k)
        tmpsel = np.in1d(tdDate, ret[:, 0])
        mat[tmpsel, i] = ret[:, 1]

K = 50
sum_forcast_returns(K)
mat[mat==0] = np.nan
# np.savetxt('all.csv', mat, delimiter=',')

In [90]:
# matGroundTrue = np.zeros_like(mat, dtype=np.float64)
# for i, sid in enumerate(stockIds):
#     csv = './stocks_data/bu_%s.csv' % sid
#     data = np.loadtxt(csv, delimiter=',', dtype=str, ndmin=2)
#     d1 = data[:, 0].astype(np.float64)
#     r1 = data[:, 2].astype(np.float64)
#     tmpsel = np.in1d(tdDate, d1)
#     matGroundTrue[tmpsel, i] = r1
# matGroundTrue[matGroundTrue==0] = np.nan
# np.savetxt('all.gt.csv', matGroundTrue, delimiter=',')
# np.savetxt('baseline.csv', np.nanmean(matGroundTrue, axis=1), delimiter=',')

In [4]:
mat = np.loadtxt('all.csv', delimiter=',', dtype=np.float64)
matGroundTrue = np.loadtxt('all.gt.csv', delimiter=',', dtype=np.float64)

In [96]:
N = 200
argMat = np.argsort(mat, axis=1)
ret = np.zeros(argMat.shape[0], dtype=np.float64)
for r in range(argMat.shape[0]):
    tmp = matGroundTrue[r, argMat[r]]
    tmp = tmp[~np.isnan(tmp)]
    ret[r] = (tmp[-N:].sum() - tmp[:N].sum()) / N
np.savetxt('k.3.%sn200.csv' % K, ret, delimiter=',')

In [97]:
for f in glob.glob('./k*csv'):
    d = np.loadtxt(f, delimiter=',')
    print f, d.mean(), d.std()


./k.2.10n200.csv 0.0717015518455 0.0584279056591
./k.2.50n200.csv 0.0498094232063 0.0442850893565
./k.3.10n200.csv 0.0738408345396 0.061262843522
./k.3.20n200.csv 0.0617398089292 0.0531703111846
./k.3.30n200.csv 0.0558338400471 0.0483758520259
./k.3.40n200.csv 0.0528939087895 0.0454621143018
./k.3.50n200.csv 0.0512564372835 0.0459066917543
./k10n200.csv 0.0582069608748 0.0457759089132
./k20n200.csv 0.05343363998 0.0447622060246
./k30n200.csv 0.0496470433707 0.0426373178659
./k40n200.csv 0.0486198427458 0.0424227090715
./k50n100.csv 0.0546127582505 0.0563057035686
./k50n200.csv 0.0471803425917 0.0424847757347

In [ ]: