In [1]:
#!/Tsan/bin/python
# -*- coding: utf-8 -*-

In [125]:
# Libraries to use
from __future__ import division 
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

import mysql.connector 
import json
import collections

In [127]:
import talib as tb

In [4]:
# Import My own library for factor testing
from SingleFactorTest import factorFilterFunctions as ff
#from config import *

In [5]:
%matplotlib inline
%load_ext line_profiler

In [ ]:


In [6]:
# {'600036':u'招商银行','002142':u'宁波银行','601939':u'建设银行','000709':u'河钢股份','000807':u'云铝股份','600111':u'北方稀土',
#  '002001':u'新和成','002562':u'兄弟科技','0300121':u'阳谷华泰','600352':u'浙江龙盛','600409':u'三友化工','000636':u'风华高科',
#  '000962':u'东方钽业','300228':u'富瑞特装','600681':u'百川能源','600789':u'鲁抗医药','002166':u'莱茵生物','002166':u'万华化学',
# '000933':u'神火股份','600977':u'中国电影','000802':u'北京文化','600809':u'山西汾酒','600438':u'通威股份','000970':u'中科三环'}

In [7]:
# Stock recommendation dictionary: date string 'YYYYMMDD' -> list of
# recommended A-share stock codes picked on that day.
stkdict = {'20170815':['600036','002142','601939'], '20170814':['000709','000807','600111','002001','002562'],\
          '20170810':['300121','600352','600409'], '20170809':['000636','000962'], '20170808':['600803','300228','600681'],
          '20170807':['600789','002166','600309'],'20170803':['000933','600977','000802','600809'],
          '20170801':['600438'], '20170731':['600438','600111','000970']}

In [8]:
sortedstkdict = collections.OrderedDict(sorted(stkdict.items()))

In [9]:
fileNameClose = 'LZ_CN_STKA_QUOTE_TCLOSE.h5.csv'
fileNameOpen = 'LZ_CN_STKA_QUOTE_TOPEN.h5.csv'

In [10]:
path = ff.data_path
# Constants
startTime =  datetime.strptime('20170731', '%Y%m%d')
endTime = datetime.strptime('20170816', '%Y%m%d')

In [11]:
# Open prices: load the CSV (first column parsed as the datetime index) and
# keep only the rows between startTime and endTime.
openPrice= pd.read_csv(path+fileNameOpen,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]

# Strip the exchange suffix ("600036.SH" -> "600036") from the column names.
# NOTE(review): the names come from columns[1:] (drops the FIRST column) but
# the data kept below is iloc[:, :-1] (drops the LAST column) -- unless the
# file has an extra leading/trailing column this shifts every name by one
# position; confirm the CSV layout.
# NOTE: Python-2 map() returns a list; Python 3 would need list(map(...)).
stkList = map(lambda x: x.split('.')[0],openPrice.columns[1:])

openPrice = openPrice.iloc[:,:-1]

openPrice.columns = stkList


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-11-a8e42093d534> in <module>()
      1 # 开盘价
----> 2 openPrice= pd.read_csv(path+fileNameOpen,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
      3 
      4 stkList = map(lambda x: x.split('.')[0],openPrice.columns[1:])
      5 

c:\python27\lib\site-packages\pandas\io\parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    653                     skip_blank_lines=skip_blank_lines)
    654 
--> 655         return _read(filepath_or_buffer, kwds)
    656 
    657     parser_f.__name__ = name

c:\python27\lib\site-packages\pandas\io\parsers.pyc in _read(filepath_or_buffer, kwds)
    403 
    404     # Create the parser.
--> 405     parser = TextFileReader(filepath_or_buffer, **kwds)
    406 
    407     if chunksize or iterator:

c:\python27\lib\site-packages\pandas\io\parsers.pyc in __init__(self, f, engine, **kwds)
    762             self.options['has_index_names'] = kwds['has_index_names']
    763 
--> 764         self._make_engine(self.engine)
    765 
    766     def close(self):

c:\python27\lib\site-packages\pandas\io\parsers.pyc in _make_engine(self, engine)
    983     def _make_engine(self, engine='c'):
    984         if engine == 'c':
--> 985             self._engine = CParserWrapper(self.f, **self.options)
    986         else:
    987             if engine == 'python':

c:\python27\lib\site-packages\pandas\io\parsers.pyc in __init__(self, src, **kwds)
   1603         kwds['allow_leading_cols'] = self.index_col is not False
   1604 
-> 1605         self._reader = parsers.TextReader(src, **kwds)
   1606 
   1607         # XXX

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__ (pandas\_libs\parsers.c:4209)()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source (pandas\_libs\parsers.c:8873)()

IOError: File D:/cStrategy/Factor/LZ_CN_STKA_QUOTE_TOPEN.h5.csv does not exist

In [ ]:
# Close prices: same loading/renaming pattern as the open-price cell.
closePrice= pd.read_csv(path+fileNameClose,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]

# NOTE(review): as with openPrice, names from columns[1:] are assigned to
# the iloc[:, :-1] data -- verify the intended one-column offset.
stkList = map(lambda x: x.split('.')[0],closePrice.columns[1:])

closePrice = closePrice.iloc[:,:-1]

closePrice.columns = stkList

In [ ]:
closePrice

In [ ]:
InitiCap = 100000  # initial capital
capital = InitiCap  
feeRate = 0.0003  # commission + market-impact cost
stampTaxRate = 0.001  # stamp tax (charged on the sell side)
buyList = []
availableCash = {}  # cash balance recorded after each buy date
# Build the buy orders: on every recommendation date, buy a fixed 100 shares
# of each recommended stock at that day's open price (Python-2 iteritems /
# print-statement syntax).
for key,item in sortedstkdict.iteritems():
    print key
    date = datetime.strptime(key, '%Y%m%d')
    totalCostToday = 0  # total cost of all stocks bought today
    for stk in item:        
        buyPrice = round(openPrice.loc[date][stk],2)
        amount = 100
        cost = round(buyPrice*amount*(1+feeRate),2)
        buyList.append([date,stk,buyPrice,amount,cost,1]) # date, code, buy price, quantity, total cost, direction (1 = buy)
        totalCostToday+= cost
    # Fail fast if the account cannot cover today's purchases.
    assert capital > totalCostToday
    capital  = capital  - totalCostToday
    availableCash[date] = capital  # cash remaining after the day's buys

In [ ]:


In [ ]:
availableCash

In [ ]:
dateList = sorted([datetime.strptime(key, '%Y%m%d') for key in stkdict.keys()])

dateList = closePrice.loc[dateList[0]:dateList[-1]].index

dateList

In [ ]:
availableCash

In [ ]:
availableCashSeries = pd.Series(index = dateList,data = availableCash )

availableCashSeries = availableCashSeries.ffill()

availableCashSeries

In [ ]:
availableCashSeries

In [ ]:
buyList

In [ ]:
# Simple backtest: for each buy order, walk the daily closes from the entry
# date and exit at the close of the first day the pnl crosses the
# take-profit or stop-loss threshold; otherwise mark the position at the
# second-to-last close.
cutwinthreshold = 0.05
cutlossthreshold  = -0.05
pnldict = {}
sellList = []
for tradeitem in buyList:
    # Daily pnl relative to the entry price (fees excluded from pnl itself).
    pnl = (closePrice.loc[tradeitem[0]:][tradeitem[1]] - tradeitem[2]) / tradeitem[2]  # per-day pnl series
    cutwin = pnl[pnl >= cutwinthreshold]
    cutloss = pnl[pnl <= cutlossthreshold]
    # NOTE(review): if BOTH thresholds are ever hit, the take-profit branch
    # wins even when the stop-loss date came first chronologically -- confirm
    # this is intended.
    # NOTE(review): sell proceeds apply stampTaxRate but not feeRate, while
    # buys applied feeRate only -- confirm the asymmetry is intended.
    if len(cutwin) > 0:    # take profit at that day's close
        pnldict[tradeitem[1]] = round(cutwin .iloc[0],2)
        selldate = cutwin.index[0]
        sellPrice = closePrice.loc[selldate][tradeitem[1]]
        cost = round(sellPrice*tradeitem[3]*(1-stampTaxRate),2)
        sellList.append([selldate,tradeitem[1],round(sellPrice,2),tradeitem[3],cost,-1]) # sell record, same layout as buys
        availableCashSeries.loc[selldate:]+= cost   # credit proceeds to the cash account
    elif  len(cutloss) > 0:     # otherwise stop loss
        pnldict[tradeitem[1]] = round(cutloss .iloc[0],2)
        selldate = cutloss.index[0]
        sellPrice = closePrice.loc[selldate][tradeitem[1]]
        cost = round(sellPrice*tradeitem[3]*(1-stampTaxRate),2)
        sellList.append([selldate,tradeitem[1],round(sellPrice,2),tradeitem[3],cost,-1]) # sell record, same layout as buys
        availableCashSeries.loc[selldate:]+= cost 
    else:                                   # no exit triggered: mark pnl at the second-to-last close
        pnldict[tradeitem[1]] = round(pnl.iloc[-2],2)

In [ ]:
sellList

In [ ]:
holdingList = list(set([i[1] for i in buyList]) - set([i[1] for i in sellList]))

In [ ]:
holdingList

In [ ]:


In [ ]:
selldatepair = {i[1]:i[0] for i in buyList if i[1] in holdingList}
selldatepair

In [ ]:
tempo = closePrice[holdingList]

In [ ]:
position = tempo.copy()
# For each still-held stock, set the position to 100 shares from its buy
# date onward and 0 before it (Python-2 iteritems).
for stk,date in selldatepair.iteritems():
    # tempo[tempo[stk].index >= date] masks whole ROWS by the shared
    # DatetimeIndex -- the particular column used is irrelevant since every
    # column has the same index.
    filterCondition = tempo[tempo[stk].index >= date]
    falseCondition= tempo[tempo[stk].index  < date]
    position.loc[filterCondition.index,stk]= 100
    position.loc[falseCondition.index,stk]=  0
position

In [ ]:
marketValue = position*tempo

In [ ]:
marketValue.sum(axis=1)+availableCashSeries

In [ ]:
# 计算卖出时得到的资金
for sellitem in sellList:
    availableCashSeries.loc[sellitem[0]]+= sellitem[-2]
availableCashSeries

In [ ]:
# 合并买卖订单
tradingList = buyList + sellList
tradingCode = list(set([i[1] for i in buyList]) | set([i[1] for i in sellList]))

In [ ]:
tradingList

In [ ]:
# 创建tradingsheet
tradingSheet = pd.DataFrame(index = dateList, columns = tradingCode, data=0, dtype = float)

for item in tradingList:
    tradingSheet .loc[item[0],item[1]]+= item[-1] * item[-3]
positionchangeDF = tradingSheet .cumsum().ffill()

In [ ]:
tradingSheet

In [ ]:
positionchangeDF

In [ ]:
# 总资产
totalCapital= (positionchangeDF * closePrice[positionchangeDF.columns]).sum(axis=1)+ availableCashSeries

In [ ]:
totalCapital

In [ ]:
# 年化收益率
annualizedRet = ((totalCapital.iloc[-2] - InitiCap)/ InitiCap +1)**(250/(len(totalCapital)-1)) -1

In [ ]:
annualizedRet

In [ ]:
availableCashSeries

In [ ]:
pnlseries = pd.Series(pnldict)

In [ ]:
pnlseries.describe()

In [ ]:
plt.hist(pnlseries)

In [ ]:
#-----------------------------------------------------Market Timing------------------------------------

In [ ]:
filenameHS300Member = 'LZ_CN_STKA_INDEX_HS300MEMBER.h5'
filenameCSI500Member = 'LZ_CN_STKA_INDEX_CSI500MEMBER.h5'

# 复权后价格
filenameAdjClose = 'OwnfactorAdjustedClose.h5'

In [ ]:
teststartTime =  datetime.strptime('20120731', '%Y%m%d')
testendTime = datetime.strptime('20171016', '%Y%m%d')

In [ ]:
def getMemberList(path,filename):
    """Return the codes that are index members (flag == 1) on the latest row.

    Reads the membership matrix via ff.readh5data, takes its last row, and
    keeps the column labels whose flag equals 1.
    """
    latest = ff.readh5data(path,filename).iloc[-1]
    members = latest[latest == 1]
    return list(members.index)

In [ ]:
hs300MemberList = getMemberList(path, filenameHS300Member)
csi500MemberList = getMemberList(path, filenameCSI500Member)

In [ ]:
priceData =  ff.readh5data(path,filenameAdjClose).pct_change().dropna(how='all').loc[teststartTime:testendTime]

In [ ]:
priceData[hs300MemberList].loc[datetime.strptime('20170810', '%Y%m%d')].name

In [ ]:
priceData[hs300MemberList].loc[datetime.strptime('20170810', '%Y%m%d')].hist()

In [ ]:
priceData[csi500MemberList].iloc[-2].name

In [ ]:
#-----------------------------------------------------CTA Backtest Engine------------------------------------

In [ ]:
def getNewMatrix(inputArray, t, m):
    """Build the (t-m+1) x m trajectory matrix of a length-t series.

    Row i holds the length-m sliding window inputArray[i:i+m]
    (Hankel-style layout used by SSA-type smoothing).
    """
    windowCount = t - m + 1
    rows = [list(inputArray[start:start + m]) for start in range(windowCount)]
    return np.array(rows)

def recreateArray(newMatrix, t, m):
    """Collapse a (t-m+1) x m trajectory matrix back to a length-t series.

    Averages the entries on each anti-diagonal (the standard diagonal
    averaging step of SSA reconstruction).
    """
    n = t - m + 1
    series = []
    for p in range(1, t + 1):
        # Number of matrix entries lying on this anti-diagonal.
        if p < m:
            weight = p
        elif p > n:
            weight = t - p + 1
        else:
            weight = m
        total = 0
        for j in range(1, m + 1):
            i = p - j + 1
            if 0 < i <= n:
                total += newMatrix[i - 1][j - 1]
        series.append(total / weight)
    return np.array(series)

def getSVD(inputArray, t, m):
    """SVD-based smoothing of a length-t series with window length m.

    Builds the trajectory matrix, keeps the top one or two singular
    components (one if the leading singular value carries >99% of the total
    singular mass), and diagonal-averages the low-rank matrix back into a
    smoothed length-t series.
    """
    trajectory = getNewMatrix(inputArray, t, m)
    u, s, v = np.linalg.svd(trajectory)
    keep = 1 if s[0] / s.sum() > 0.99 else 2
    diag = np.zeros((keep, keep))
    np.fill_diagonal(diag, s[:keep])
    lowRank = np.dot(np.dot(u[:, :keep].reshape(u.shape[0], keep), diag), v[:keep])
    return recreateArray(lowRank, t, m)

In [12]:
old_path = 'C:/Users/LZJF_02/Desktop/original_data/'
new_path = 'C:/Users/LZJF_02/Desktop/modified_data/'
filename = 'rb000_1min.csv'
#newname = 'j9000_1min_modi.csv'

In [13]:
startTime =  datetime.strptime('20160110', '%Y%m%d') # 上涨趋势
endTime = datetime.strptime('20161107', '%Y%m%d')

startTrainTime = datetime.strptime('20150530', '%Y%m%d') 
endTimeTrain =  datetime.strptime('20160530', '%Y%m%d')

startTimeVal = datetime.strptime('20151016', '%Y%m%d')  # 下跌趋势
endTimeVal = datetime.strptime('20151123', '%Y%m%d')

startTimeSurge =  datetime.strptime('20170606', '%Y%m%d') # 上涨趋势
endTimeSurge = datetime.strptime('20170809', '%Y%m%d')

startTimePlunge =  datetime.strptime('20170906', '%Y%m%d') # 下跌趋势
endTimePlunge = datetime.strptime('20170929', '%Y%m%d')

In [14]:
# svd滤波参数
shapeNum = 30
svdShort = 7
svdLong = 20

In [15]:
# 期货合约相关参数
Leverage = 10
Slipage = 2
ContractSize = 10
IniCapital = 100000
FeeRate = 0.003
PriceTick = 1

In [16]:
data = pd.read_csv(old_path+filename,infer_datetime_format=True,header=None,names = ['Open', 'High', 'Low', 'Close','TotalVolume','OpenInterest'])
data.index = data.index.map(lambda x : pd.to_datetime(x))

In [17]:
data['OpenRatio'] = (data['OpenInterest'] - data['OpenInterest'].shift(1)) / data['TotalVolume']

In [18]:
data.loc[startTime:].describe()
data.loc[startTime:].quantile(0.85)


Out[18]:
Open            3.454000e+03
High            3.457000e+03
Low             3.452000e+03
Close           3.454000e+03
TotalVolume     3.324800e+04
OpenInterest    4.524026e+06
OpenRatio       1.723304e-01
Name: 0.85, dtype: float64

In [19]:
data.tail(20)


Out[19]:
Open High Low Close TotalVolume OpenInterest OpenRatio
2017-09-18 11:16:00 3786 3789 3770 3776 56594 4057618 0.105559
2017-09-18 11:17:00 3775 3779 3771 3771 25718 4061640 0.156389
2017-09-18 11:18:00 3771 3780 3771 3776 15230 4065086 0.226264
2017-09-18 11:19:00 3776 3777 3772 3773 9056 4066862 0.196113
2017-09-18 11:20:00 3773 3773 3764 3766 38662 4072204 0.138172
2017-09-18 11:21:00 3767 3770 3766 3768 15876 4075438 0.203704
2017-09-18 11:22:00 3767 3768 3764 3764 18014 4077740 0.127789
2017-09-18 11:23:00 3763 3765 3759 3763 32302 4083090 0.165624
2017-09-18 11:24:00 3762 3768 3760 3768 15844 4085922 0.178743
2017-09-18 11:25:00 3769 3769 3762 3762 12436 4087190 0.101962
2017-09-18 11:26:00 3762 3766 3761 3763 12268 4089582 0.194979
2017-09-18 11:27:00 3763 3764 3761 3763 12170 4093248 0.301233
2017-09-18 11:28:00 3763 3763 3754 3754 27592 4096376 0.113366
2017-09-18 11:29:00 3754 3764 3754 3764 24468 4101462 0.207863
2017-09-18 13:30:00 3763 3773 3760 3771 35206 4110392 0.253650
2017-09-18 13:31:00 3771 3771 3761 3762 14936 4115096 0.314944
2017-09-18 13:32:00 3762 3768 3762 3766 11528 4118020 0.253643
2017-09-18 13:33:00 3766 3775 3766 3775 17158 4119222 0.070055
2017-09-18 13:34:00 3775 3784 3775 3783 27988 4119790 0.020294
2017-09-18 13:35:00 3783 3790 3780 3790 23382 4119394 -0.016936

In [20]:
data.describe()


Out[20]:
Open High Low Close TotalVolume OpenInterest OpenRatio
count 576590.000000 576590.000000 576590.000000 576590.000000 576590.000000 5.765900e+05 576589.000000
mean 3279.472292 3280.876907 3278.048208 3279.472297 11638.035467 2.440778e+06 -0.003634
std 949.560688 949.722005 949.397871 949.559562 15181.314867 1.293012e+06 0.224045
min 1617.000000 1618.000000 1616.000000 1617.000000 2.000000 8.376000e+03 -1.638679
25% 2422.000000 2423.000000 2421.000000 2422.000000 2578.000000 1.345436e+06 -0.130831
50% 3409.000000 3410.000000 3407.000000 3409.000000 6876.000000 2.478721e+06 0.004251
75% 4074.000000 4076.000000 4072.000000 4074.000000 14788.000000 3.501908e+06 0.133173
max 5186.000000 5187.000000 5182.000000 5185.000000 583962.000000 6.040226e+06 1.000000

In [21]:
d = {'x': {'b': 10, 'c': 20}, 'y': {'b': '33', 'c': 44,'a':1}}

In [22]:
data['TotalVolume'].idxmax()


Out[22]:
Timestamp('2016-03-08 10:05:00')

In [23]:
dd=[1,2,3,4.5,-2.3]
s =pd.Series(dd)
s[s<0].shape


Out[23]:
(1L,)

In [24]:
#pd.DataFrame.from_dict(f,orient='index')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-24-19c7529072bd> in <module>()
----> 1 pd.DataFrame.from_dict(f,orient='index')

NameError: name 'f' is not defined

In [176]:
c = u'1,231'.encode('utf-8').split(',')
float(c[0] +c[1])


Out[176]:
1231.0

In [179]:
float("123,456.908".replace(',',''))


Out[179]:
123456.908

In [25]:
dataIndice = data.loc[startTime:endTime]
# Timestamps on 5-minute boundaries; list() keeps len()/indexing working on
# Python 3, where filter() returns a lazy iterator (no-op on Python 2).
indexby5MIN = list(filter(lambda x : x.minute % 5 ==0, dataIndice.index))

# Aggregate the 1-minute bars between consecutive 5-minute marks into one
# 5-minute bar: standard OHLCV aggregation plus a sqrt(volume)-weighted
# OpenRatio ('OpenRatioModi').
resultList = []
for index, time in enumerate(indexby5MIN):
    if index < len(indexby5MIN) - 1:
        # Rows from this 5-minute mark up to (excluding) the next mark.
        tempdata = dataIndice.loc[time:indexby5MIN[index+1]].iloc[:-1]
        resultdict = {}
        resultdict['Open'] = tempdata['Open'].values[0]
        resultdict['High'] = tempdata['High'].max()
        resultdict['Low'] = tempdata['Low'].min()
        resultdict['Close'] = tempdata['Close'].values[-1]
        resultdict['TotalVolume'] = tempdata['TotalVolume'].sum()
        resultdict['OpenInterest'] = tempdata['OpenInterest'].values[-1]
        resultdict['OpenRatio'] = tempdata['OpenRatio'].values[-1]
        resultdict['OpenRatioModi'] = (np.sqrt(tempdata['TotalVolume']) * tempdata['OpenRatio'] / \
                                       (np.sqrt(tempdata['TotalVolume']).sum())).mean()
        resultdict['time'] = time
        resultList.append(resultdict)
df = pd.DataFrame(resultList,columns = ['Open','High','Low','Close','TotalVolume','OpenInterest','OpenRatio','time','OpenRatioModi'])  

dataClean = df.set_index('time')
dataClean['pnl'] = dataClean['Close'].pct_change()
# BUGFIX: 'nextpnl' must be the NEXT period's return; the original used
# shift(1) (the previous return), inconsistent with the later cells that
# compute nextpnl as pnl.shift(-1). shift(1) would make the indicator
# "predict" a return that already happened.
dataClean['nextpnl'] = dataClean['pnl'].shift(-1)

In [133]:
np_real_data = np.array(dataClean['Close'].values,dtype='f8')

In [140]:
alpha1 = (np.cos(0.707 * np.pi /48) + np.sin(0.707 * np.pi /48) - 1 ) / np.cos(0.707 * np.pi /48)
alpha1


Out[140]:
0.045234543450991919

In [146]:
LPPeriod = 20
a1 = np.exp(-1.414 * np.pi / LPPeriod) 
b1 = 2 * a1 * np.cos(1.414 * np.pi /  LPPeriod) 
c2 = b1
c3 = - a1 * a1
c1 = 1 - c2 - c3

In [ ]:
# FIXME(review): this cell cannot run as written --
#  * np.zeros(close): `close` is undefined in this notebook (np.zeros also
#    expects a size, not a price array);
#  * `Close` (capital C) is undefined as well;
#  * the recurrence reads HP[-1] / HP[-2] (previous filter outputs), so it
#    must be evaluated bar-by-bar in a loop, not as one vectorized line.
HP = np.zeros(close)
HP = (1 - alpha1 / 2) ** 2 * (Close - 2 * Close[-1] + Close[-2]) + 2 * (1 - alpha1) * HP[-1] - (1 - alpha1) ** 2 * HP[-2]

In [154]:
# Stay long if price is trading above HT_TRENDLINE and stay short if price is trading below HT_TRENDLINE.
tb.HT_TRENDLINE(np_real_data[-200:])[np.isnan(tb.HT_TRENDLINE(np_real_data[-500:]))].__len__()


c:\python27\lib\site-packages\ipykernel\__main__.py:2: VisibleDeprecationWarning: boolean index did not match indexed array along dimension 0; dimension is 200 but corresponding boolean dimension is 500
  from ipykernel import kernelapp as app
Out[154]:
63

In [136]:
np_real_data / tb.HT_TRENDLINE(np_real_data)


Out[136]:
array([        nan,         nan,         nan, ...,  1.00360515,
        1.00545756,  1.00777331])

In [165]:
n


Out[165]:
True

In [43]:
dataClean.Close.values


Out[43]:
array([1759, 1758, 1755, ..., 2731, 2737, 2745], dtype=int64)

In [27]:
dataClean[dataClean['OpenRatioModi'] > 0.023]['nextpnl'].hist()


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x3c314f98>

In [28]:
dataClean['pnl'].mean()


Out[28]:
3.199237800532039e-05

In [29]:
# Check the monotonicity of pnl filtered by the OpenRatioModi indicator in a
# bullish market: sweep the threshold and plot the mean pnl of the surviving
# bars together with the surviving fraction of the sample.
variation = []
bullpnlMeanList = []
# NOTE(review): despite the name, this list holds the FRACTION of rows that
# pass the threshold (a coverage ratio), not a median -- the 'pnlMedian'
# legend label below is misleading.
bullpnlMedianList = []
x_axis = np.linspace(0,0.03,50)
for i in x_axis:
    variation.append(i)
    bullpnlMeanList.append(dataClean[dataClean['OpenRatioModi'] >= i]['pnl'].mean())
    bullpnlMedianList.append(dataClean[dataClean['OpenRatioModi'] >= i]['pnl'].shape[0] / dataClean.shape[0])
fig, (ax1,ax2) = plt.subplots(nrows=2,sharex=True,figsize=(16,10))
ax1.plot(x_axis,bullpnlMeanList,'blue',label='pnlMean')
ax2.plot(x_axis,bullpnlMedianList,'red',label='pnlMedian')
ax1.legend()
ax2.legend()


Out[29]:
<matplotlib.legend.Legend at 0x3a43ada0>

In [ ]:

新指标做多阈值大概为0.02(0.85左右的分位数),做空为0.024(0.8的分位数),也可取(0.85的分位数0.0275)


In [30]:
np.sqrt(np.array([1,2,3,4]))/np.sqrt(np.array([1,2,3,4])).sum()


Out[30]:
array([ 0.16270045,  0.23009319,  0.28180545,  0.32540091])

In [31]:
dataClean['OpenRatio'].hist()


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x36867780>

In [32]:
dataClean['OpenRatioModi'].hist()


Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x3a785e10>

In [33]:
a,b,c = 10000,3000,7000
print np.sqrt(a) / (np.sqrt(a)+np.sqrt(b)+np.sqrt(c))
print np.sqrt(b) / (np.sqrt(a)+np.sqrt(b)+np.sqrt(c))
print np.sqrt(c) / (np.sqrt(a)+np.sqrt(b)+np.sqrt(c))


0.419395782663
0.229712530687
0.350891686651

In [34]:
print 3/(1+2+3)
print 1/(1+2+3)
print 2/(1+2+3)


0.5
0.166666666667
0.333333333333

In [35]:
print np.log10(a) / (np.log10(a)+np.log10(b)+np.log10(c))
print np.log10(b) / (np.log10(a)+np.log10(b)+np.log10(c))
print np.log10(c) / (np.log10(a)+np.log10(b)+np.log10(c))


0.353287628147
0.307105980215
0.339606391638

In [36]:
print np.log(a) / (np.log(a)+np.log(b)+np.log(c))
print np.log(b) / (np.log(a)+np.log(b)+np.log(c))
print np.log(c) / (np.log(a)+np.log(b)+np.log(c))


0.353287628147
0.307105980215
0.339606391638

In [ ]:


In [ ]:


In [37]:
#c = data.loc[startTime:].groupby(pd.TimeGrouper(freq='5Min'))

In [38]:
#nlarge = c['TotalVolume'].nlargest()

In [39]:
#nlarge.iloc[:50]

In [40]:
# Rescale openratio by the rank of Volume corresponding trading minute. e.g.Weighing the Open ratio by the order of total volume.
# FIXME(review): `c` was meant to be the 5-minute groupby from In[37]
# (`data.loc[startTime:].groupby(pd.TimeGrouper(freq='5Min'))`), which is
# commented out. By this point `c` is the integer 7000 assigned in In[33]
# (`a,b,c = 10000,3000,7000`), which is why this cell raises
# "TypeError: 'int' object has no attribute '__getitem__'".
a = c['TotalVolume'].rank()/15 * data['OpenRatio']  
dataslice = data.loc[a.index]
dataslice['ModiOR'] = a
dataslice.groupby(pd.TimeGrouper(freq='5Min'))['OpenRatio'].mean().dropna().describe()


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-40-be87cdd29d01> in <module>()
      1 # Rescale openratio by the rank of Volume corresponding trading minute. e.g.Weighing the Open ratio by the order of total volume.
----> 2 a = c['TotalVolume'].rank()/15 * data['OpenRatio']
      3 dataslice = data.loc[a.index]
      4 dataslice['ModiOR'] = a
      5 dataslice.groupby(pd.TimeGrouper(freq='5Min'))['OpenRatio'].mean().dropna().describe()

TypeError: 'int' object has no attribute '__getitem__'

In [ ]:
dataslice.groupby(pd.TimeGrouper(freq='5Min'))['OpenRatio'].mean().dropna().hist()

In [ ]:
dataModi = dataslice.groupby(pd.TimeGrouper(freq='5Min'))['OpenRatio'].mean().dropna()
dataModi.quantile(0.85)

In [ ]:
def resample_data(data,period='5Min'):
    """Aggregate 1-minute OHLCV bars into `period` bars.

    Parameters
    ----------
    data : DataFrame with a DatetimeIndex and columns
        Open/High/Low/Close/TotalVolume/OpenInterest.
    period : pandas offset alias for the target bar size (default '5Min').

    Returns
    -------
    DataFrame of resampled bars; intervals with no trades (all-NaN rows from
    empty groups) are dropped.
    """
    # pd.TimeGrouper was deprecated and later removed from pandas;
    # pd.Grouper(freq=...) is the drop-in replacement with the same
    # grouping semantics.
    groupgenerator = data.groupby(pd.Grouper(freq=period))
    databy5min = pd.DataFrame()
    databy5min['Open'] = groupgenerator['Open'].first()
    databy5min['High'] = groupgenerator['High'].max()
    databy5min['Low'] = groupgenerator['Low'].min()
    databy5min['Close'] = groupgenerator['Close'].last()
    databy5min['TotalVolume'] = groupgenerator['TotalVolume'].sum()
    databy5min['OpenInterest'] = groupgenerator['OpenInterest'].last()
    databy5min.dropna(axis=0,inplace = True)
    return databy5min

In [ ]:
dfResampled = resample_data(data)

In [ ]:
dfResampled['OpenRatio'] = (dfResampled['OpenInterest'] - dfResampled['OpenInterest'].shift(1)) / dfResampled['TotalVolume']
dfResampled['WeightedOpenRatio'] = (dfResampled['OpenInterest'] - dfResampled['OpenInterest'].shift(1)) / np.sqrt(dfResampled['TotalVolume'])
dfResampled['pnl'] = dfResampled['Close'].pct_change()

In [ ]:
# Uptrend sample window. BUGFIX: .copy() makes this an independent frame so
# the column assignment below does not hit SettingWithCopyWarning / silently
# fail on a view of dfResampled.
sampleSurge= dfResampled.loc[startTimeSurge:endTimeSurge].copy()
sampleSurge['nextpnl'] = sampleSurge['pnl'].shift(-1)

In [ ]:
sampleSurge

In [ ]:
# Downtrend sample window. BUGFIX: .copy() prevents SettingWithCopyWarning
# when assigning the 'nextpnl' column on a slice of dfResampled.
samplePlunge = dfResampled.loc[startTimePlunge:endTimePlunge].copy()
samplePlunge['nextpnl'] = samplePlunge['pnl'].shift(-1)

In [ ]:
samplePlunge['OpenRatio'].hist()

In [ ]:
sampleSurge['nextpnl'].describe()

In [ ]:
sampleSurge['OpenRatio'].hist()

In [ ]:
sampleSurge[sampleSurge['OpenRatio'] > .15]['nextpnl'].hist()

In [ ]:
sampleSurge[sampleSurge['OpenRatio'] > 0.03]['nextpnl'].describe()

In [ ]:


In [ ]:
# check the monotonity of pnl filtered by openRatio indicator in bullish market
variation = []
bullpnlMeanList = []
bullpnlMedianList = []
for i in np.linspace(0,0.2,40):
    variation.append(i)
    bullpnlMeanList.append(sampleSurge[sampleSurge['OpenRatio'] >= i]['pnl'].mean())
    bullpnlMedianList.append(sampleSurge[sampleSurge['OpenRatio'] >= i].shape[0] / sampleSurge.shape[0])
fig, (ax1,ax2) = plt.subplots(nrows=2,sharex=True,figsize=(16,10))
ax1.plot(np.linspace(0,0.2,40),bullpnlMeanList,'blue',label='pnlMean')
ax2.plot(np.linspace(0,0.2,40),bullpnlMedianList,'red',label='pnlMedian')
ax1.legend()
ax2.legend()

In [ ]:
# check the monotonity of pnl filtered by openRatio indicator in bearrih market
variation = []
bearpnlMeanList = []
bearpnlMedianList = []
for i in np.linspace(0,0.2,40):
    variation.append(i)
    bearpnlMeanList.append(samplePlunge[samplePlunge['OpenRatio'] >= i]['nextpnl'].mean())
    bearpnlMedianList.append(samplePlunge[samplePlunge['OpenRatio'] >= i].shape[0] / samplePlunge.shape[0])
fig, (ax1,ax2) = plt.subplots(nrows=2,sharex=True,figsize=(16,10))
ax1.plot(np.linspace(0,0.2,40),bearpnlMeanList,'blue',label='pnlMean')
ax2.plot(np.linspace(0,0.2,40),bearpnlMedianList,'red',label='pnlMedian')
ax1.legend()
ax2.legend()

In [ ]:


In [ ]:
sampleSurge[sampleSurge['OpenRatio'] >= 0.1]['nextpnl'].hist()

In [ ]:
pnlMedianList

In [ ]:
dfResampled['OpenRatio'].loc[startTime:].hist()

In [ ]:
dfResampled['WeightedOpenRatio'].quantile(0.996)

In [ ]:
dfResampled[dfResampled['OpenRatio'] > 0].mean()

In [ ]:
dfResampled['OpenRatio'].tail(500).plot()

In [ ]:
dfResampled['Close'].tail(30).plot()

In [ ]:
dfResampled[['pnl','OpenRatio']].tail(50)

In [ ]:
len(dfResampled[dfResampled['OpenRatio'] < 0.15]) / len(dfResampled)

In [ ]:
len(dfResampled[dfResampled['OpenRatio'] < 0]) / len(dfResampled)

In [ ]:
np.sqrt(0.33*225/5)

In [ ]:
datatrain = dfResampled.loc[startTrainTime:endTimeTrain]

In [ ]:
startTime

In [ ]:
datatrain['EMAOpenRatio'] = talib.EMA(datatrain['OpenRatio'].values, timeperiod=4)

datatrain['EMAOpenRatio'].describe()

In [ ]:
datatrain['OpenRatio'].describe()

In [ ]:
datatrain['EMAOpenRatio'].quantile(0.92)

In [ ]:
datatrain['OpenRatio'].quantile(0.9)

In [ ]:
# show size distribution(quantile plot)
fig = plt.figure(figsize=(16,10))
# Add a subplot
ax = fig.add_subplot(111)
datatrain['EMAOpenRatio'].hist(alpha =0.8)
datatrain['OpenRatio'].hist(alpha = 0.3)
ax.set_title(ax.get_title(),alpha=0.7, fontsize=25)

In [ ]:
datatrain['EMAOpenRatio'].hist()
datatrain['OpenRatio'].hist(alpha = 0.3)

In [ ]:
datatrain['OpenRatio'].describe()

In [ ]:


In [ ]:
dfResampled['Close'].plot(figsize=(18,12))

In [ ]:
shapeNum = 20

In [ ]:
svdindicator = getSVD(datatrain['Close'].values[-shapeNum:], shapeNum, svdShort)

In [ ]:


In [ ]:
svdindicator1 = getSVD(datatrain['Close'].values[-shapeNum:], shapeNum, svdLong)
svdindicator1

In [ ]:
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True,figsize=(16,10))
ax1.plot(datatrain.index[-shapeNum:],datatrain['Close'].values[-shapeNum:],'blue',label='Close')
ax2.plot(datatrain.index[-shapeNum:],svdindicator,'green',)
ax2.plot(datatrain.index[-shapeNum:],svdindicator1,'black')
#ax1.title('SVD')

In [ ]:
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True,figsize=(22,16))
ax1.plot(datatrain['Close'].values[-shapeNum:],'blue',label='Close')
ax2.plot(svdindicator,'red',label='svd-'+str(svdShort))
ax2.plot(svdindicator1,'yellow',label='svd-'+str(svdLong))
#ax1.title('SVD')
ax1.legend()
ax2.legend()

In [ ]:
datatrain['Close'].plot(figsize=(18,12))

In [ ]:
#kvol.median()

In [ ]:
kvol = (datatrain['Close'] - datatrain['Close'].shift(1)).rolling(min_periods=60,window=60,center=False).std()

In [ ]:
kvol.iloc[-100:].plot(figsize=(18,12))
((7*datatrain['Close']/datatrain['Close'].iloc[0]).iloc[-100:]).plot(sharex=True)

In [ ]:
datatrain['Close'].iloc[-500:].plot(figsize=(18,12))

In [ ]:
datatrain['Impact'] = (datatrain['Close'] - datatrain['Open']) / datatrain['TotalVolume']

In [ ]:
datatrain['Impact'].iloc[-55:].plot(figsize=(18,12))

In [ ]:


In [ ]:
datatrain['TotalVolume'].iloc[-55:].mean()

In [ ]:
datatrain[datatrain['Impact']>0]

In [ ]:
filtered = datatrain[(datatrain['TotalVolume']<datatrain['TotalVolume'].quantile(0.2)) & (datatrain['Impact']>((datatrain[datatrain['Impact']>0]['Impact']).quantile(0.7)))]

In [ ]:
filtered

In [ ]:
datatrain.iloc[:2]

In [ ]:
filtered['Close'].plot(figsize=(18,10))

In [ ]:
trackBackPeriod = 50
code = 'rb000' 
initialNum = 100

In [ ]:


In [ ]:
futurebuyList = []
for i in range(initialNum,len(datatrain)-1):
    svdshortArray = getSVD(datatrain['Close'].values[i+1-trackBackPeriod:i+1], trackBackPeriod, svdShort)
    svdlongArray = getSVD(datatrain['Close'].values[i+1-trackBackPeriod:i+1], trackBackPeriod, svdLong)
    if svdshortArray[-2] < svdlongArray[-2] and svdshortArray[-1] > svdlongArray[-1]:
        dfslice = datatrain.iloc[i+1]
        amount = 1
        price = dfslice['Open']
        cost = price * amount
        direction = 1
        futurebuyList.append([dfslice.name,code,price,amount,cost,direction])

In [ ]:
futurebuyList

In [ ]:


In [105]:
filefullname = 'C:/Users/LZJF_02/Desktop/temp.xlsx'

In [106]:
xls = pd.ExcelFile(filefullname)
df1 = xls.parse('file',header=0,index_col=0,parse_dates=True)

In [116]:
df = df1.dropna(how='all',axis=1)
df=df.dropna(axis=0)
df.columns = ['open','high','low',u'贵州茅台','amount','volume','hs300',u'白酒指数']
df = df[[u'贵州茅台',u'白酒指数']]
df=df.dropna(axis=0)
df.index.name  = 'time'

In [117]:
df


Out[117]:
贵州茅台 白酒指数
time
2004-12-31 00:00:00 8.38 1000.00
2005-01-04 00:00:00 8.34 983.44
2005-01-05 00:00:00 8.52 998.04
2005-01-06 00:00:00 8.37 989.37
2005-01-07 00:00:00 8.44 997.57
2005-01-10 00:00:00 8.75 1014.30
2005-01-11 00:00:00 8.76 1017.21
2005-01-12 00:00:00 8.78 1017.29
2005-01-13 00:00:00 8.83 1015.72
2005-01-14 00:00:00 8.67 1010.15
2005-01-17 00:00:00 8.67 994.76
2005-01-18 00:00:00 8.80 1007.14
2005-01-19 00:00:00 8.67 1002.66
2005-01-20 00:00:00 8.67 990.12
2005-01-21 00:00:00 8.67 1009.99
2005-01-24 00:00:00 8.75 1019.32
2005-01-25 00:00:00 9.01 1032.88
2005-01-26 00:00:00 9.01 1028.75
2005-01-27 00:00:00 8.97 1012.42
2005-01-28 00:00:00 8.94 1014.37
2005-01-31 00:00:00 9.00 1013.07
2005-02-01 00:00:00 8.92 1007.23
2005-02-02 00:00:00 9.11 1042.80
2005-02-03 00:00:00 9.00 1026.61
2005-02-04 00:00:00 9.07 1036.95
2005-02-16 00:00:00 9.11 1038.59
2005-02-17 00:00:00 8.92 1038.30
2005-02-18 00:00:00 8.98 1026.77
2005-02-21 00:00:00 9.22 1045.06
2005-02-22 00:00:00 9.24 1057.40
... ... ...
2017-10-25 00:00:00 565.67 24653.24
2017-10-26 00:00:00 605.09 25671.48
2017-10-27 00:00:00 649.63 26899.15
2017-10-30 00:00:00 622.08 26230.02
2017-10-31 00:00:00 618.03 26102.88
2017-11-01 00:00:00 623.01 26328.94
2017-11-02 00:00:00 626.92 26555.80
2017-11-03 00:00:00 639.17 26860.32
2017-11-06 00:00:00 653.06 27615.56
2017-11-07 00:00:00 642.07 27370.96
2017-11-08 00:00:00 650.38 27400.20
2017-11-09 00:00:00 650.07 27509.60
2017-11-10 00:00:00 677.95 28532.22
2017-11-13 00:00:00 687.88 28617.41
2017-11-14 00:00:00 678.75 27983.82
2017-11-15 00:00:00 688.08 28553.16
2017-11-16 00:00:00 719.11 29538.21
2017-11-17 00:00:00 690.25 28611.32
2017-11-20 00:00:00 679.15 28129.15
2017-11-21 00:00:00 677.25 28473.68
2017-11-22 00:00:00 650.52 27438.76
2017-11-23 00:00:00 633.73 26310.06
2017-11-24 00:00:00 630.04 26292.58
2017-11-27 00:00:00 621.29 25912.52
2017-11-28 00:00:00 648.23 26599.25
2017-11-29 00:00:00 638.12 26031.74
2017-11-30 00:00:00 631.00 25735.33
2017-12-01 00:00:00 622.35 25598.49
2017-12-04 00:00:00 637.79 26386.29
2017-12-05 00:00:00 643.02 26366.45

3142 rows × 2 columns


In [118]:
df.index = pd.to_datetime(df.index)

In [119]:
df.index


Out[119]:
DatetimeIndex(['2004-12-31', '2005-01-04', '2005-01-05', '2005-01-06',
               '2005-01-07', '2005-01-10', '2005-01-11', '2005-01-12',
               '2005-01-13', '2005-01-14',
               ...
               '2017-11-22', '2017-11-23', '2017-11-24', '2017-11-27',
               '2017-11-28', '2017-11-29', '2017-11-30', '2017-12-01',
               '2017-12-04', '2017-12-05'],
              dtype='datetime64[ns]', name=u'time', length=3142, freq=None)

In [120]:
endOfWeekList = sorted(list(set(df.iloc[df.resample('W').size().cumsum().sub(1)].index)))

In [121]:
vol = df.loc[endOfWeekList].pct_change().rolling(window= 20, min_periods=20).std().round(3)

In [122]:
vol


Out[122]:
贵州茅台 白酒指数
time
2004-12-31 NaN NaN
2005-01-07 NaN NaN
2005-01-14 NaN NaN
2005-01-21 NaN NaN
2005-01-28 NaN NaN
2005-02-04 NaN NaN
2005-02-18 NaN NaN
2005-02-25 NaN NaN
2005-03-04 NaN NaN
2005-03-11 NaN NaN
2005-03-18 NaN NaN
2005-03-25 NaN NaN
2005-04-01 NaN NaN
2005-04-08 NaN NaN
2005-04-15 NaN NaN
2005-04-22 NaN NaN
2005-04-29 NaN NaN
2005-05-13 NaN NaN
2005-05-20 NaN NaN
2005-05-27 NaN NaN
2005-06-03 0.049 0.034
2005-06-10 0.055 0.040
2005-06-17 0.055 0.040
2005-06-24 0.057 0.041
2005-07-01 0.057 0.042
2005-07-08 0.057 0.042
2005-07-15 0.057 0.042
2005-07-22 0.054 0.040
2005-07-29 0.054 0.040
2005-08-05 0.053 0.041
... ... ...
2017-05-12 0.022 0.020
2017-05-19 0.024 0.021
2017-05-26 0.022 0.021
2017-06-02 0.022 0.019
2017-06-09 0.023 0.021
2017-06-16 0.021 0.021
2017-06-23 0.021 0.021
2017-06-30 0.021 0.021
2017-07-07 0.024 0.024
2017-07-14 0.024 0.023
2017-07-21 0.023 0.023
2017-07-28 0.023 0.024
2017-08-04 0.024 0.025
2017-08-11 0.024 0.025
2017-08-18 0.024 0.025
2017-08-25 0.024 0.025
2017-09-01 0.024 0.025
2017-09-08 0.027 0.027
2017-09-15 0.027 0.028
2017-09-22 0.027 0.028
2017-09-29 0.027 0.029
2017-10-13 0.028 0.029
2017-10-20 0.028 0.029
2017-10-27 0.038 0.033
2017-11-03 0.039 0.033
2017-11-10 0.039 0.034
2017-11-17 0.039 0.034
2017-11-24 0.046 0.040
2017-12-01 0.044 0.039
2017-12-05 0.045 0.040

658 rows × 2 columns


In [123]:
# make sure that matplotib and seaborn can show Chinese
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']
sns.set_style("darkgrid",{"font.sans-serif":['simhei', 'Arial']})

In [124]:
# show size distribution(quantile plot)
fig = plt.figure(figsize=(16,10))
# Add a subplot
ax = fig.add_subplot(111)
vol.plot(figsize=(22,14),ax=ax,title ='Volatility Difference',fontsize =13)
ax.set_title(ax.get_title(),alpha=0.7, fontsize=25)


Out[124]:
<matplotlib.text.Text at 0x466aea90>