In [583]:
#!/Tsan/bin/python
# -*- coding: utf-8 -*-
In [584]:
# Libraries to use
from __future__ import division
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.cluster import KMeans
In [585]:
# Import My own library for factor testing
from SingleFactorTest import factorFilterFunctions as ff
#from config import *
In [586]:
%matplotlib inline
In [587]:
%load_ext line_profiler
In [588]:
# make sure that matplotlib and seaborn can display Chinese characters
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']
sns.set_style("darkgrid",{"font.sans-serif":['simhei', 'Arial']})
In [589]:
# Files to use
filenamePrice = 'Own_Factor_AdjustedPriceForward-1d.csv'
filenameST = 'LZ_GPA_SLCIND_ST_FLAG.csv'
filenameTradeday = 'LZ_GPA_SLCIND_TRADEDAYCOUNT.csv'
filenameStopFlag = 'LZ_GPA_SLCIND_STOP_FLAG.csv'
filenameIndu = 'LZ_GPA_INDU_ZX.csv'
filenameFCAP = 'LZ_GPA_VAL_A_FCAP.csv'
filenameAdjustFactor = 'LZ_GPA_CMFTR_CUM_FACTOR.csv'
filenameHS300 = 'LZ_GPA_INDXQUOTE_CLOSE.csv'
filenameZXIndustry = 'LZ_GPA_INDU_ZX.csv'
filenameZXExplanation = 'LZ_GPA_TMP_INDU_ZX.csv'
In [590]:
# Constants
startTime = datetime.strptime('20161201', '%Y%m%d')
endTime = datetime.strptime('20170429', '%Y%m%d')
path = ff.data_path
timeStampNum = 2500
thresholdNum = 0.2
HS300Index ='000300.SH' # HS300 index code
ZZ500Index = '000905.SH' # ZZ500 index code
In [591]:
stDF = pd.read_csv(path+filenameST,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
tradeDayDF = pd.read_csv(path+filenameTradeday,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
stopFlagDF = pd.read_csv(path+filenameStopFlag,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
indusDF = pd.read_csv(path+filenameZXIndustry,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
induExplanation = pd.read_csv(path+filenameZXExplanation,infer_datetime_format=True,parse_dates=[0],encoding='gb2312')
In [670]:
# Uqer factor
filenameUQAD20 = 'Uqer_factor_AD20.csv' # group_9 is best, with roughly 2% excess return while the other groups have none; the long-short spread works well and monotonicity is fairly good (although it reverses: over the long run group_0 is the smallest), suggesting momentum/price-type factors still perform reasonably well
filenameUQADTM = 'Uqer_factor_ADTM.csv' # factor is decaying (no excess return), monotonicity lost
filenameUQATR6 = 'Uqer_factor_ATR6.csv' # group_1 is best, with roughly 0.2% excess return while the other groups have none; the long-short spread works well and monotonicity is fairly good
filenameUQAroon = 'Uqer_factor_Aroon.csv' # factor is decaying (almost no excess return), monotonicity lost
filenameUQBias10 = 'Uqer_factor_BIAS10.csv' # factor is decaying (almost no excess return), monotonicity lost
filenameUQCCI10 = 'Uqer_factor_CCI10.csv' # factor is decaying (almost no excess return), monotonicity lost
filenameUQKDJ_K = 'Uqer_factor_KDJ_K.csv' # factor is decaying (almost no excess return), monotonicity lost
filenameROC6 = 'Uqer_factor_ROC6.csv' # ineffective factor
filenameCMO = 'Uqer_factor_CMO.csv' # factor has lost monotonicity
filenameRVI = 'Uqer_factor_RVI.csv' # ineffective factor
filenameRSI = 'Uqer_factor_RSI.csv' # price-volatility strategies all seem to be decaying at the moment (probably)
In [593]:
# Own factor
filenameOWNILLIQ ='Own_Factor_ILLQ-1d.csv'
filenameSharpe = 'Own_Factor_sharpe_ratio_20D.csv'
filenameTOV20 = 'Own_Factor_Turnover_Volatility_20D.csv' # group_0 is best, with about 10% annualized return; monotonicity is very good and the long-short spread works well,
# but market cap decreases monotonically across groups and group_0 has the largest caps, so it may simply be picking SSE 50-style names (large caps trade steadily, hence lower turnover volatility)
filenameADJTOV20 = 'Own_Factor_ADJ_Turnover_Volatility_20D.csv' # fairly good monotonicity, but no excess return
filenameTOVD20 = 'Own_Factor_Turnover_Volatility_deviation_20D.csv' # no monotonicity, no excess return
filenameADJTOVD20 = 'Own_Factor_ADJ_Turnover_Volatility_Deviation_20D.csv' # no monotonicity, no excess return
filenameMOM1M = 'LZ_GPA_DERI_Momentum_1M.csv' # one-month reversal
In [594]:
# yield/growth
filenameYOYGR = 'LZ_GPA_FIN_IND_QFA_YOYGR.csv'
filenameYOYROE = 'LZ_GPA_FIN_IND_YOYROE.csv'
filenameROE = 'LZ_GPA_FIN_IND_ROE.csv'
In [595]:
# value
filenamePS = 'LZ_GPA_VAL_PS.csv' # group_0 is best, with about 5% annualized return; monotonicity is very good and the long-short spread works well
filenamePB ='LZ_GPA_VAL_PB.csv' # group_0 is best, with about 12% annualized return; monotonicity is very good and the long-short spread works well (an exceptionally strong factor)
filenamePE ='LZ_GPA_VAL_PE.csv' # group_0 is best, with about 10% annualized return; monotonicity is not perfect but decent, and the long-short spread works well
In [596]:
# profitability
filenameROE = 'LZ_GPA_FIN_IND_ROE.csv'
filenameRoeVol250 = 'Own_Factor_ROE_Volatility_250D.csv'
In [671]:
sololist = [filenameRSI]
In [672]:
sparedata = pd.read_csv(path+sololist[0],infer_datetime_format=True,parse_dates=[0],index_col=0)
In [673]:
# Align the Uqer data's column names with those of the own-factor data
if 'Uqer' in sololist[0]:
    uqercolumnList = sparedata.columns.tolist()
    uqercolumnName = [x.split('.')[0] for x in uqercolumnList]
    newcolumnList = stDF.columns.tolist()
    newcolumnName = [x.split('.')[0] for x in newcolumnList]
    columndict = dict(zip(newcolumnName, newcolumnList))
    finalcolumns = []
    for stk in uqercolumnName:
        if stk in newcolumnName:
            stk = columndict[stk]
        finalcolumns.append(stk)
    sparedata.columns = finalcolumns
    addNanColumns = list(set(newcolumnList) - set(sparedata.columns))
    addData = pd.DataFrame(index=sparedata.index, columns=addNanColumns, dtype=float)
    sparedata = pd.concat([sparedata, addData], axis=1)
    sparedata = sparedata[newcolumnList]
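A minimal sanity check of the alignment above (only meaningful after that cell has run on a Uqer file): the renamed frame should now carry exactly the reference tickers in the reference order, and `addNanColumns` tells how many tickers had to be padded with NaN.

# Hedged sanity check on the column alignment (uses variables defined in the cell above)
if 'Uqer' in sololist[0]:
    assert sparedata.columns.tolist() == stDF.columns.tolist()  # same tickers, same order
    print 'columns padded with NaN:', len(addNanColumns)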
In [674]:
priceData = pd.read_csv(path+ filenamePrice ,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
benchMarkData = pd.read_csv(path+filenameHS300,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime][ZZ500Index]
In [675]:
LFCAPDF = np.log10(ff.getData(thresholdNum, startTime, endTime,filename = filenameFCAP))
In [676]:
LFCAPDF
Out[676]:
In [677]:
endOfWeekList = sorted(list(set(LFCAPDF.iloc[LFCAPDF.resample('W').size().cumsum().sub(1)].index)))
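The line above picks, for each calendar week, the last row present in LFCAPDF: it counts rows per week, turns the cumulative counts into 0-based positions, and feeds them to `iloc`. A small self-contained sketch of the same idea on a synthetic business-day index (it assumes every weekly bin contains at least one row, which holds for contiguous trading-day data):

# Minimal sketch: last available date of each calendar week from a daily-indexed frame
import numpy as np
import pandas as pd

demo = pd.DataFrame(np.random.randn(20, 2),
                    index=pd.bdate_range('2017-01-02', periods=20))  # business days only
week_ends = demo.iloc[demo.resample('W').size().cumsum().sub(1)].index
print sorted(set(week_ends))  # one Friday per week in this synthetic example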
In [678]:
endOfWeekList
Out[678]:
In [679]:
# calculate correlation between two factors
if 'Uqer' in sololist[0]:
    factor1 = sparedata.loc[startTime:endTime]
else:
    factor1 = pd.read_csv(path+sololist[0],infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
#correlationDF = ff.showCorrelation(factor1,LFCAPDF, endOfWeekList, filterdic = None).astype(float)
#correlationDF.describe()
In [680]:
activeReturnData = ff.calcReturn(priceData, endOfWeekList, benchmark = benchMarkData,activeReturn = True,logReturn = False)
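`ff.calcReturn` is the author's helper, so its exact behaviour is not shown here. The sketch below is an assumption about what the weekly active return amounts to: a simple return per stock over each interval between consecutive dates in `endOfWeekList`, forward-aligned to the date on which the groups are formed, with the benchmark's return over the same interval subtracted. Both the function name and the forward alignment are hypothetical.

# Hedged sketch (assumed semantics, not the library's actual implementation)
def weekly_active_return(price_df, bench_series, date_list):
    px = price_df.loc[date_list]                        # prices on the period-end dates
    stk_ret = px.pct_change().shift(-1)                 # return realised over the *next* period
    bench_ret = bench_series.loc[date_list].pct_change().shift(-1)
    return stk_ret.sub(bench_ret, axis=0).iloc[:-1]     # last date has no forward return

# e.g. weekly_active_return(priceData, benchMarkData, endOfWeekList)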
In [681]:
# Generate the list of stocks to exclude (ST, newly listed, suspended)
filterdict = {}
for i in endOfWeekList:
    suspendList = ff.GetSTNewSuspend(i, stDF, tradeDayDF, stopFlagDF)
    filterdict[i] = suspendList
    print i, len(filterdict[i])
In [682]:
### Calculate the return of each group for the given (solo) factor
totalGroupDict = {}
factorData = ff.getData(thresholdNum, startTime, endTime, availableData=factor1)
for date in endOfWeekList:
    factorDataTemp = factorData.loc[:date].tail()
    factorDataTemp = factorDataTemp[list(set(factorDataTemp.columns.tolist()) - set(filterdict[date]))]
    #print factorDataTemp
    totalGroupDict[date] = ff.getStockGroup(factorDataTemp, groupNum=10, Mean_Num=1)
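`ff.getStockGroup` lives in the author's factorFilterFunctions, so its internals are not shown here. The presumed idea is plain quantile bucketing of the factor cross-section into 10 groups; a self-contained sketch with a hypothetical helper name:

# Hedged sketch of decile grouping on a cross-section of factor values (hypothetical helper)
import numpy as np
import pandas as pd

def group_by_decile(factor_row, group_num=10):
    ranks = factor_row.dropna().rank(method='first')       # break ties deterministically
    labels = pd.cut(ranks, bins=group_num, labels=False)   # integer labels 0 .. group_num-1
    return {'group_%d' % g: labels[labels == g].index.tolist() for g in range(group_num)}

demo = pd.Series(np.random.randn(100), index=['stk_%d' % k for k in range(100)])
groups = group_by_decile(demo)
print len(groups['group_0']), len(groups['group_9'])  # 10 stocks in each bucket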
In [683]:
factorData
Out[683]:
In [684]:
ReturnDF = pd.DataFrame(index=endOfWeekList[:-1], columns=totalGroupDict.values()[0].keys(), data=None, dtype=float)
sizeDistribution = pd.DataFrame(index=endOfWeekList[:-1], columns=totalGroupDict.values()[0].keys(), data=None, dtype=float)
for group in ReturnDF.columns.tolist():
    for time in ReturnDF.index:
        ReturnDF.loc[time, group] = activeReturnData.loc[time][totalGroupDict[time][group]].mean()
        sizeDistribution.loc[time, group] = LFCAPDF.loc[time][totalGroupDict[time][group]].quantile()
ReturnDF.sort_index(axis=1, inplace=True)
sizeDistribution.sort_index(axis=1, inplace=True)
In [685]:
# Show the size distribution of each group (bar plot of the median log market cap)
fig = plt.figure(figsize=(16,10))
# Add a subplot
ax = fig.add_subplot(111)
sizeDistribution.median().plot(kind='bar',ax = ax, fontsize =13,title ='Size Distribution of each group',alpha =0.8)
ax.set_title(ax.get_title(),alpha=0.7, fontsize=25)
Out[685]:
In [686]:
# Calc Spearman correlation to investigate monotonicity
referSeries = pd.Series(index=ReturnDF.columns, data=range(len(ReturnDF.columns)))
monoDF = pd.DataFrame(index=ReturnDF.index, columns=['Spearman_Cor'], dtype=float)
for date in ReturnDF.index:
    monoDF.loc[date] = ReturnDF.loc[date].corr(referSeries, method='spearman')
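The Spearman correlation against the 0..9 reference series is +1 when group returns increase strictly from group_0 to group_9, -1 when they decrease strictly, and near 0 when there is no monotone pattern. A tiny self-contained illustration:

# Self-contained illustration of the monotonicity measure
import pandas as pd

ref = pd.Series(range(5), index=['group_%d' % g for g in range(5)])
increasing = pd.Series([0.01, 0.02, 0.03, 0.05, 0.08], index=ref.index)
scrambled = pd.Series([0.03, 0.01, 0.08, 0.02, 0.05], index=ref.index)
print increasing.corr(ref, method='spearman')   # 1.0  -> perfectly monotonic
print scrambled.corr(ref, method='spearman')    # 0.3  -> weak / no monotone pattern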
In [687]:
# Plot
fig = plt.figure(figsize=(18,14))
# Add a subplot
ax = fig.add_subplot(111)
monoDF.plot(figsize=(22,14),ax=ax,fontsize =13,title ='Monotonicity')
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30)
Out[687]:
In [688]:
# Long the top group, short the bottom group
sortGroups = ReturnDF[['group_0','group_9']].cumsum().iloc[-1].sort_values(ascending = False).index
top, bot = sortGroups[0], sortGroups[-1]
# Add another column
ReturnDF['top_bot_comb'] = ReturnDF[top] - ReturnDF[bot]
In [689]:
ReturnDF
Out[689]:
In [690]:
# Net worth plot
netWorthDF = (ReturnDF.astype(float)+1).cumprod()
fig = plt.figure(figsize=(18,14))
# Add a subplot
ax = fig.add_subplot(111)
netWorthDF.plot(figsize=(22,14),ax=ax,color=sns.color_palette("Paired",11),title ='Net Worth',fontsize =13)
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30, ha='right')
Out[690]:
In [691]:
# Calc Spearman correlation to investigate monotonicity
referSeries = pd.Series(index=netWorthDF.iloc[:,:-1].columns, data=range(len(netWorthDF.iloc[:,:-1].columns)))
monoDF = pd.DataFrame(index=netWorthDF.index, columns=['Spearman_Cor'], dtype=float)
for date in netWorthDF.index:
    monoDF.loc[date] = netWorthDF.iloc[:,:-1].loc[date].corr(referSeries, method='spearman')
In [692]:
# Plot
fig = plt.figure(figsize=(18,14))
# Add a subplot
ax = fig.add_subplot(111)
monoDF.plot(figsize=(22,14),ax=ax,fontsize =13,title ='Monotonicity')
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30)
Out[692]:
In [693]:
# Basic performance indicators (assuming ~50 weekly periods per year)
annualizedReturn = (1+ReturnDF.mean())**50 - 1
annualizedVol = ReturnDF.std()* np.sqrt(50)
sharpeRatio = annualizedReturn / annualizedVol
print 'Annual Return:','\n',annualizedReturn,'\n\n','Annual Volatility:','\n',annualizedVol,'\n\n','Sharpe Ratio:','\n',sharpeRatio
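The annualization above uses geometric compounding over roughly 50 weekly periods for the return and square-root-of-time scaling for the volatility. A quick self-contained check of the arithmetic:

# Self-contained check of the annualization arithmetic (50 weekly periods assumed)
import numpy as np

weekly_mean = 0.002                         # 0.2% average weekly active return
weekly_std = 0.01                           # 1% weekly volatility
ann_ret = (1 + weekly_mean) ** 50 - 1       # ~0.105, i.e. ~10.5% a year
ann_vol = weekly_std * np.sqrt(50)          # ~0.071
print ann_ret, ann_vol, ann_ret / ann_vol   # Sharpe ~1.49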
In [694]:
# Downside risk: zero out above-mean returns, then annualize the std of the clipped series (50 weekly periods)
copyReturn = ReturnDF.copy()
copyReturn[copyReturn > copyReturn.mean()] = 0
downsideRisk = copyReturn.std(skipna = True) * np.sqrt(50)
downsideRisk
Out[694]:
In [695]:
sortinoRatio = annualizedReturn / downsideRisk
sortinoRatio
Out[695]:
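For reference, one common Sortino convention uses the downside semideviation (root mean square of below-target returns) rather than the std of a series with above-mean values zeroed out. A minimal self-contained sketch, assuming a 0% weekly target and 50 periods per year; the helper name is hypothetical and this is not the notebook's exact formula:

# Minimal sketch: annualized downside semideviation with a 0% target (assumption)
import numpy as np
import pandas as pd

def downside_deviation(returns, target=0.0, periods_per_year=50):
    downside = np.minimum(returns - target, 0.0)        # keep only below-target part
    return np.sqrt((downside ** 2).mean()) * np.sqrt(periods_per_year)

demo = pd.Series([0.01, -0.02, 0.015, -0.005, 0.03, -0.01])
print downside_deviation(demo)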
In [696]:
# Max drawdown
maxdd = netWorthDF.copy()
maxdd.iloc[0] = 0
for date in netWorthDF.index[1:]:
    maxdd.loc[date] = 1 - netWorthDF.loc[date] / netWorthDF.loc[:date].max()
maxddInfo = pd.concat([maxdd.max(),maxdd.idxmax()],axis=1)
maxddInfo.columns = ['Max_drawdown','Time']
maxddInfo
Out[696]:
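The drawdown loop above can also be written with a running maximum, which avoids recomputing `.loc[:date].max()` on every date; a small equivalent sketch using the same definition:

# Equivalent vectorized max drawdown using a running maximum
import pandas as pd

def max_drawdown(net_worth):
    dd = 1 - net_worth / net_worth.cummax()   # drawdown at every date
    return pd.concat([dd.max(), dd.idxmax()], axis=1, keys=['Max_drawdown', 'Time'])

demo = pd.DataFrame({'group_0': [1.0, 1.1, 1.05, 1.2, 0.9],
                     'group_9': [1.0, 0.95, 1.0, 1.1, 1.15]})
print max_drawdown(demo)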
In [697]:
# calmar Ratio
calmarRatio = annualizedReturn/ maxddInfo['Max_drawdown']
calmarRatio
Out[697]:
In [698]:
# Show the behaviour of a risk factor across different market-capitalization sizes
# capdata should not contain NaN values
# Return: DICTIONARY, the KEY is the date and the VALUE is a tuple of the three cap groups
# Inputs:
# capdata: DATAFRAME, the LFCAP data
# datelist: LIST, the list of period-end dates
def getGroupsbyCap(capdata, datelist):
    capdict = {}
    for date in datelist:
        capdataindice = capdata.loc[date]
        lower = capdataindice.quantile(1/3)
        upper = capdataindice.quantile(2/3)
        smallcap = capdataindice[capdataindice <= lower].index
        midcap = capdataindice[(lower < capdataindice) & (capdataindice <= upper)].index
        hugecap = capdataindice[capdataindice > upper].index
        capdict[date] = (smallcap, midcap, hugecap)
    return capdict
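A usage sketch in the context of this notebook; dropping NaN columns up front is an assumption here, since the function expects a NaN-free cross-section:

# Usage sketch: split stocks into small/mid/large cap terciles on each period-end date
capdict = getGroupsbyCap(LFCAPDF.dropna(axis=1), endOfWeekList)
smallcap, midcap, hugecap = capdict[endOfWeekList[0]]
print len(smallcap), len(midcap), len(hugecap)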
In [699]:
# Show the behaviour of a risk factor across different industries
# Return: DICTIONARY, the KEY is the date and the VALUE is a dictionary mapping each industry label to its stocks on that day
# Inputs:
# datelist: LIST, the list of period-end dates
# grouplabel: LIST, the industry labels to use; usually a few are fixed and the rest chosen at random, e.g. [2, 3, 6, 15, 18, 25]
# industryDF: DATAFRAME, the industry classification dataframe
def getIndustryDict(datelist, grouplabel, industryDF):
    industrydict = {}
    industryDF = industryDF.loc[datelist]
    for date in datelist:
        industryDFindice = industryDF.loc[date]
        industrydict[date] = {label: industryDFindice[industryDFindice == label].index for label in grouplabel}
    return industrydict
In [700]:
# Randomly pick 10 of the 29 industry codes
grouplabel = np.random.choice(29,10,replace=False)
grouplabel
Out[700]:
In [701]:
wholeIndList = np.array(range(29))
wholeIndDict = getIndustryDict(endOfWeekList,wholeIndList,indusDF)
In [702]:
multindexList = [endOfWeekList,wholeIndList]
induReturnDF = pd.DataFrame(data=None, columns=totalGroupDict.values()[0].keys(),\
index=pd.MultiIndex.from_product(multindexList,names=['time','industry']),dtype=float)
In [703]:
groupNumberThrottle = 10
for i, j in wholeIndDict.iteritems():
    print i
    factorIndice = factorData.loc[:i].tail()
    factorIndice = factorIndice[list(set(factorIndice.columns.tolist()) - set(filterdict[i]))]  # Remove ST, newly listed and suspended stocks
    for ind, stk in j.iteritems():
        intersection = list(set(factorIndice.columns.tolist()) & set(stk))
        if len(intersection) < groupNumberThrottle:
            induReturnDF.loc[(i, ind)] = 0
            continue
        else:
            stkgroup = ff.getStockGroup(factorIndice[intersection], groupNum=10, Mean_Num=1)
            for p, q in stkgroup.iteritems():
                try:
                    induReturnDF.loc[(i, ind), p] = activeReturnData.loc[i][q].mean()
                except:
                    induReturnDF.loc[(i, ind), p] = np.NaN
In [704]:
finalReturn = induReturnDF.mean(level = 'time')
finalReturn.sort_index(axis=1,inplace=True)
In [705]:
sortGroups = finalReturn[['group_0','group_9']].cumsum().iloc[-1].sort_values(ascending = False).index
top, bot = sortGroups[0], sortGroups[-1]
# Add another column
finalReturn['top_bot_comb'] = finalReturn[top] - finalReturn[bot]
In [706]:
indNetWorth = (finalReturn+1).cumprod()
fig = plt.figure(figsize=(14,9))
# Add a subplot
ax = fig.add_subplot(111)
indNetWorth.plot(figsize=(22,14),ax=ax,color=sns.color_palette("Paired",11),title ='Net Worth',fontsize =13)
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30, ha='right')
Out[706]: