In [38]:
%matplotlib inline
import glob
import os
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
plt.style.use('ggplot')
In [2]:
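# Handle bottom-up data (one-off preprocessing, kept commented out for reference):
# regroup the rows of ./bottomup/*csv by their first field into one bu_<key>.csv per key,
# prefixing every row with the base name of the file it came from.
# NOTE: this code writes to ./stock_data/, whereas the cells below read ./stocks_data/.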
#csvs1 = glob.glob('./bottomup/*csv')
#data1 = {}
#for f in csvs1:
# content = open(f).read().splitlines()[1:]
# for line in content:
# tmp = line.split(',', 1)[0]
# if tmp not in data1:
# data1[tmp] = []
# data1[tmp].append('%s,%s' % (os.path.basename(f).replace('.csv', ''), line))
#for k, v in data1.iteritems():
# with open('./stock_data/bu_%s.csv' % k, 'wb') as fout:
# fout.write('\n'.join(v))
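# Handle top-down data (one-off preprocessing, kept commented out for reference):
# merge the rows of ./topdown/*csv that share the same first field (with '-' removed)
# into a single row of td.csv.
# NOTE: this code writes to ./stock_data/, whereas the cells below read ./stocks_data/.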
#csvs2 = glob.glob('./topdown/*csv')
#data2 = {}
#for f in csvs2:
# content = open(f).read().splitlines()[1:]
# for line in content:
# d, v = line.split(',', 1)
# d = d.replace('-', '')
# if d not in data2:
# data2[d] = []
# data2[d].append(v)
#with open('./stock_data/td.csv', 'wb') as fout:
# for k, v in data2.iteritems():
# fout.write('%s,%s\n' % (k, ','.join(v)))
In [39]:
# Load the merged top-down table. The first column becomes the (n, 1) key
# column `tdDate`; every remaining column is kept as a factor in `tdMat`.
# ndmin=2 keeps the array two-dimensional even when td.csv holds a single
# row, so the column slicing below cannot fail on a 1-D result.
data = np.loadtxt('./stocks_data/td.csv', delimiter=',', ndmin=2)
tdDate = data[:, 0].astype(np.float64).reshape(-1, 1)
tdMat = data[:, 1:]
data = None  # drop the reference to the full table; only the two views above are used
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(tdMat.shape)
In [86]:
from sklearn import linear_model
import sklearn.feature_selection as fs
from sklearn.pipeline import Pipeline
def predict_stock_returns(stockId, k):
    """Walk-forward forecast of one stock's return column.

    Reads ``./stocks_data/bu_<stockId>.csv``. Column 0 is used as the date
    key, column 2 as the target ``Y`` and columns 3 up to (but excluding) the
    last one as features. Column 1 and the last column are not used here
    (presumably an id and a trailing field -- confirm against the data).
    The module-level top-down factors ``tdMat`` are appended to the features
    of the row with the same key in ``tdDate``.

    For every row after the first, a ``SelectKBest(f_regression, k)`` +
    ``LinearRegression`` pipeline is fitted on all rows with a strictly
    earlier date and used to predict that row.

    Parameters
    ----------
    stockId : value formatted into the file name with ``%s``.
    k : number of features kept by ``SelectKBest``.

    Returns
    -------
    numpy.ndarray of shape (n_rows, 2): column 0 is the date key, column 1
    the forecast for that same date. Row 0 (and the only row of a one-row
    file) has no earlier data to train on and carries the value of ``Y``.

    Raises
    ------
    ValueError
        If a date of the stock file has no matching row in ``tdDate``.
    """
    # load stock data (as strings first: not every column is numeric-converted)
    data = np.loadtxt('./stocks_data/bu_%s.csv' % stockId, delimiter=',', dtype=str, ndmin=2)
    stockDate = data[:, 0].astype(np.float64)
    X = data[:, 3:-1].astype(np.float64)
    Y = data[:, 2].astype(np.float64)
    if stockDate.shape[0] == 1:
        return np.hstack((stockDate.reshape(-1, 1), Y.reshape(-1, 1)))

    # combine top-down data into individual stock data as new factors.
    # Rows are looked up by key instead of with a boolean membership mask: a
    # mask returns the rows in tdDate's order, which silently pairs the wrong
    # factors with a date whenever the two files are ordered differently.
    td_dates = np.asarray(tdDate, dtype=np.float64).ravel()
    if td_dates.shape[0] == 0:
        raise ValueError('top-down table is empty; cannot extend stock %s' % stockId)
    td_order = np.argsort(td_dates, kind='mergesort')
    slot = np.searchsorted(td_dates[td_order], stockDate)
    slot = np.minimum(slot, td_dates.shape[0] - 1)  # keep out-of-range lookups indexable
    td_rows = td_order[slot]
    if not np.array_equal(td_dates[td_rows], stockDate):
        raise ValueError('stock %s has dates that are missing from the top-down table' % stockId)
    X_extended = np.hstack((X, tdMat[td_rows]))

    # build model
    ret = np.hstack((stockDate.reshape(-1, 1), np.zeros((stockDate.shape[0], 1), dtype=np.float64)))
    ret[0, 1] = Y[0]  # nothing earlier to train on for the first row
    for i in range(1, stockDate.shape[0]):
        in_past = stockDate < stockDate[i]
        model = Pipeline([
            ('fs', fs.SelectKBest(fs.f_regression, k=k)),
            ('lr', linear_model.LinearRegression())
        ]).fit(X_extended[in_past], Y[in_past])
        # Store the forecast on the row it was made for, so the date in
        # column 0 and the value in column 1 describe the same date.
        ret[i, 1] = model.predict(X_extended[i:i + 1])[0]
    return ret
In [95]:
# Stock ids are recovered from the bu_<id>.csv file names. Files that match
# the glob but not the bu_<id>.csv pattern are skipped instead of raising an
# IndexError, and the ids are sorted so that column i means the same stock on
# every run (glob returns names in arbitrary order, and the matrices built
# from these columns are exchanged through CSV files).
stockIds = sorted(
    found.group(1)
    for found in (
        re.search(r'bu_(.*?)\.csv', os.path.basename(f))
        for f in glob.glob('./stocks_data/bu*csv')
    )
    if found
)
# One row per top-down date, one column per stock; filled by sum_forcast_returns.
mat = np.zeros((tdDate.shape[0], len(stockIds)), dtype=np.float64)
def sum_forcast_returns(k):
    """Fill the module-level matrix ``mat`` with every stock's forecasts.

    For each id in ``stockIds`` (column ``i`` of ``mat``) this calls
    ``predict_stock_returns(sid, k)`` and writes the forecast of each date
    into the row of ``mat`` whose ``tdDate`` entry equals that date. ``mat``
    is modified in place; nothing is returned.

    Parameters
    ----------
    k : forwarded unchanged to ``predict_stock_returns``.

    Raises
    ------
    ValueError
        If a forecast date has no matching entry in ``tdDate``.
    """
    # Rows are resolved by key rather than with a boolean membership mask: a
    # mask assigns values in tdDate's order, which scatters forecasts onto the
    # wrong dates when tdDate is not ordered like the forecast rows.
    td_dates = np.asarray(tdDate, dtype=np.float64).ravel()
    td_order = np.argsort(td_dates, kind='mergesort')
    td_sorted = td_dates[td_order]
    for i, sid in enumerate(stockIds):
        ret = predict_stock_returns(sid, k)
        if td_sorted.shape[0] == 0:
            raise ValueError('top-down table is empty; cannot place forecasts for %s' % sid)
        slot = np.minimum(np.searchsorted(td_sorted, ret[:, 0]), td_sorted.shape[0] - 1)
        rows = td_order[slot]
        if not np.array_equal(td_dates[rows], ret[:, 0]):
            raise ValueError('forecast dates of %s are missing from the top-down table' % sid)
        mat[rows, i] = ret[:, 1]
K = 50  # number of features kept by SelectKBest (passed on as k)
# Mark every cell as "no forecast" before filling. Using NaN up front instead
# of converting zeros afterwards keeps a forecast that is exactly 0.0 from
# being mistaken for a missing value, and clears values left by an earlier run.
mat.fill(np.nan)
sum_forcast_returns(K)
# np.savetxt('all.csv', mat, delimiter=',')
In [90]:
# matGroundTrue = np.zeros_like(mat, dtype=np.float64)
# for i, sid in enumerate(stockIds):
# csv = './stocks_data/bu_%s.csv' % sid
# data = np.loadtxt(csv, delimiter=',', dtype=str, ndmin=2)
# d1 = data[:, 0].astype(np.float64)
# r1 = data[:, 2].astype(np.float64)
# tmpsel = np.in1d(tdDate, d1)
# matGroundTrue[tmpsel, i] = r1
# matGroundTrue[matGroundTrue==0] = np.nan
# np.savetxt('all.gt.csv', matGroundTrue, delimiter=',')
# np.savetxt('baseline.csv', np.nanmean(matGroundTrue, axis=1), delimiter=',')
In [4]:
# Reload the forecast matrix and the ground-truth matrix from disk.
# NOTE(review): this rebinds `mat`, replacing any forecasts computed earlier
# in the session, while the np.savetxt call that would write all.csv is
# currently commented out in this notebook -- confirm all.csv matches the K in use.
# ndmin=2 keeps both arrays two-dimensional even for a one-row file.
mat = np.loadtxt('all.csv', delimiter=',', dtype=np.float64, ndmin=2)
matGroundTrue = np.loadtxt('all.gt.csv', delimiter=',', dtype=np.float64, ndmin=2)
# Both matrices are indexed with the same row/column indices afterwards, so a
# shape mismatch is reported here rather than surfacing later as an indexing error.
if mat.shape != matGroundTrue.shape:
    raise ValueError('all.csv has shape %s but all.gt.csv has shape %s'
                     % (mat.shape, matGroundTrue.shape))
In [96]:
# Per row: rank the stocks by forecast, then take the sum of the ground-truth
# values of the N highest-ranked stocks minus the sum of the N lowest-ranked,
# divided by N.
N = 200  # number of stocks in each of the two legs
argMat = np.argsort(mat, axis=1)
ret = np.zeros(argMat.shape[0], dtype=np.float64)
for r in range(argMat.shape[0]):
    rankedForecast = mat[r, argMat[r]]
    tmp = matGroundTrue[r, argMat[r]]
    # argsort places NaN forecasts last, i.e. at the "best" end of the
    # ranking, so a stock without a forecast must be dropped together with
    # stocks without ground truth or it would be counted in the top leg.
    tmp = tmp[~(np.isnan(rankedForecast) | np.isnan(tmp))]
    # NOTE: with fewer than 2*N usable stocks the two legs overlap, and the
    # divisor stays N even if a leg holds fewer than N stocks.
    ret[r] = (tmp[-N:].sum() - tmp[:N].sum()) / N
# The leg size in the file name now follows N (same name as before for N = 200).
np.savetxt('k.3.%sn%s.csv' % (K, N), ret, delimiter=',')
In [97]:
# Print the mean and standard deviation of every saved result file matching
# ./k*csv. Files are visited in sorted order so the listing is stable between
# runs, and the line is built with %-formatting so the output has the same
# "name mean std" shape on Python 2 and Python 3.
for f in sorted(glob.glob('./k*csv')):
    d = np.loadtxt(f, delimiter=',')
    print('%s %s %s' % (f, d.mean(), d.std()))
In [ ]: