In [1]:
#!/Tsan/bin/python
# -*- coding: utf-8 -*-
In [2]:
# Libraries to use
from __future__ import division
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
from datetime import datetime
In [4]:
# Import My own library for factor testing
from SingleFactorTest import factorFilterFunctions as ff
#from config import *
In [5]:
%matplotlib inline
In [6]:
# assert np.any([[6,2]])
In [7]:
# Files to use: price data, universe-filter flags, industry, size and index quotes.
# Each name is joined onto `path` when it is read.
filenamePrice = 'Own_Factor_AdjustedPriceForward-1d.csv'
filenameST = 'LZ_CN_STKA_SLCIND_ST_FLAG.h5'
filenameTradeday = 'LZ_CN_STKA_SLCIND_TRADEDAYCOUNT.h5'
filenameStopFlag = 'LZ_CN_STKA_SLCIND_STOP_FLAG.h5'
filenameIndu = 'LZ_GPA_INDU_ZX.csv'
filenameFCAP = 'LZ_CN_STKA_VAL_A_FCAP.h5'
filenameAdjustFactor = 'LZ_CN_STKA_CMFTR_CUM_FACTOR.h5'
# NOTE(review): despite the variable name, the benchmark cell reads the ZZ500Index
# column from this file, not the HS300 one.
filenameHS300 = 'LZ_CN_STKA_INDXQUOTE_CLOSE.h5'
filenameZXIndustry = 'LZ_CN_STKA_INDU_ZX.h5' # industry assigned to each stock
filenameZXExplanation = 'LZ_GPA_TMP_INDU_ZX.csv'
In [8]:
# Factors
# Value factor
filenamePE='LZ_GPA_VAL_PE.csv' # price-to-earnings ratio (P/E)
filenamePB='LZ_GPA_VAL_PB.csv' # price-to-book ratio (P/B)
filenamePS = 'LZ_GPA_VAL_PS.csv' # price-to-sales ratio (P/S)
filenamePCF = 'LZ_GPA_VAL_PC.csv' # price-to-cash-flow ratio (P/CF)
filenameADJPB ='Own_Factor_AdjustedPB-1d.csv' # market-cap-adjusted P/B
# Growth factor
filenameYOYGR = 'LZ_GPA_FIN_IND_QFA_YOYGR.csv' # single quarter: total operating revenue, year-on-year growth (%)
filenameYOYNETPROFIT = 'LZ_GPA_FIN_IND_QFA_YOYNETPROFIT.csv' # single quarter: net profit attributable to parent-company shareholders, year-on-year growth (%)
filenameYOYOCF = 'LZ_GPA_FIN_IND_YOYOCF.csv' # year-on-year growth: net cash flow from operating activities (%)
filenameYOYROE = 'LZ_GPA_FIN_IND_YOYROE.csv' # year-on-year growth: return on equity (diluted) (%)
filenameYOYBPS = 'LZ_GPA_FIN_IND_YOYBPS.csv' # growth relative to start of year: book value per share (%)
# Financial factor
filenameCAPITALIZEDTODA = 'LZ_GPA_FIN_IND_CAPITALIZEDTODA.csv' # capital expenditure / depreciation and amortization
filenameCASHRATIO = 'LZ_GPA_FIN_IND_CASHRATIO.csv' # conservative quick ratio
filenameCASHTOLIQDEBT = 'LZ_GPA_FIN_IND_CASHTOLIQDEBT.csv' # cash and equivalents / current liabilities
filenameOCFTODEBT = 'LZ_GPA_FIN_IND_OCFTODEBT.csv' # net cash flow from operating activities / total liabilities
filenamePROFITTOOP = 'LZ_GPA_FIN_IND_PROFITTOOP.csv' # total profit / operating revenue
filenamePROFITTOOPTTM ='LZ_GPA_FIN_DERI_COMBO_EBTTOOR_TTM.csv' # total profit / operating revenue (TTM)
# Momentum factor
filenameTRUEMOM = 'LZ_GPA_USR_MOM.csv' # true momentum
filenameMOM1M = 'LZ_GPA_DERI_Momentum_1M.csv' # one-month reversal
# NOTE(review): described as a momentum factor, but the file name is a 3M turnover average -- confirm the file
filenameMOM3M = 'LZ_GPA_TURNOVER_TurnoverAvg_3M.csv' # three-month reversal
# NOTE(review): described as skewness, but the file name says PVO -- confirm the description
filenamePVO = 'LZ_GPA_DERI_PVO.csv' # one-year skewness
filenameABNORMALVOLUME = 'LZ_GPA_DERI_NormalizedAbormalVolume.csv' # normalized abnormal trading volume
filenameSKEW = 'LZ_GPA_DERI_TSKEW.csv'# skewness
filenameMACD = 'LZ_GPA_DERI_MACD.csv' # MACD
# Liquidity factor
filenameTURNOVER1M = 'LZ_GPA_TURNOVER_TurnoverAvg_1M.csv' # one-month average turnover rate
# NOTE(review): described as volume, but the file name says "Amount" -- confirm which one it holds
filenameAMOUNTAVG1M = 'LZ_GPA_DERI_AmountAvg_1M.csv' # average daily trading volume
filenameILLIQ = 'LZ_GPA_DERI_ILLIQ.csv' # illiquidity factor
filenameVOLUME = 'LZ_GPA_QUOTE_TVOLUME.csv' # trading volume
filenameOWNILLIQ ='Own_Factor_ILLQ-1d.csv' # illiquidity factor (self-computed)
filenameADJTURNOVER = 'LZ_GPA_DERI_adjustedTurnOver_20.csv' # market-cap-adjusted turnover rate
filenameDDA = 'Own_Factor_DDA-1d.csv' # daily traded value per stock (forward-adjusted)
# Volatility factor
filenameRV1Y = 'LZ_GPA_DERI_RealizedVolatility_1Y.csv' # one-year return volatility
filenameOwnVol = 'Own_Factor_Volatility_90d.csv' # 90-day return volatility
filenameAbove20 = 'Own_Factor_Above20MA_20d.csv' # average of prices above the 20-day moving average
filenameTOV20 = 'Own_Factor_Turnover_Volatility_20D.csv' # 20-day turnover-rate volatility
filenameADJTOV20 = 'Own_Factor_ADJ_Turnover_Volatility_20D.csv' # 20-day turnover-rate volatility (market-cap adjusted)
filenameADJTOVD20 = 'Own_Factor_ADJ_Turnover_Volatility_Deviation_20D.csv' # 20-day mean turnover / 500-day mean turnover - 1 (market-cap adjusted)
# SIZE
filenameSIZE = 'LZ_GPA_VAL_A_FCAP.csv'
In [9]:
# Additional candidate factor files
filenameAroon = 'Aroon_Allstocks.csv'
filenameDDA20 = 'Own_Factor_DDA-20d.csv' # 20-day mean of DDA; 2012-2016 cum. return 2.2 (group0), -0.6 (group9); monotonicity: perfect
filenameIDIVOL = 'Own_Factor_Idiosyncratic_Volatility.csv' # idiosyncratic volatility
filenamePPO ='LZ_GPA_DERI_PPO.csv' # (no description recorded)
In [10]:
# NOTE(review): the description says 250-day, the file name says 120d -- confirm the window
filenameOwnSkewness = 'Own_Factor_Skewness_120d.csv' # 250-day skewness
In [ ]:
In [11]:
# Constants: analysis window and parameters shared by the cells below
startTime = datetime(2010, 1, 1)    # first date kept when tables are sliced
endTime = datetime(2017, 2, 28)     # last date kept when tables are sliced
path = ff.data_path                 # data directory exposed by factorFilterFunctions
timeStampNum, thresholdNum = 2500, 0.2  # thresholdNum is forwarded to ff.getData
HSIndex, ZZ500Index = '000300.SH', '000905.SH'  # HS300 / ZZ500 index codes
In [12]:
# Factor-name -> file-name maps, one dictionary per factor family
filenameDictValue = {
    'PE': filenamePE,
    'PB': filenamePB,
    'PS': filenamePS,
    'PCF': filenamePCF,
    'ADJPB': filenameADJPB,
}
filenameDictGrowth = {
    'YOYGR': filenameYOYGR,
    'YOYGRPROFIT': filenameYOYNETPROFIT,
    'YOYOCF': filenameYOYOCF,
    'YOYROE': filenameYOYROE,
    'YOYBPS': filenameYOYBPS,
}
filenameDicFinance = {
    'CAPITALIZEDTODA': filenameCAPITALIZEDTODA,
    'CASHRATIO': filenameCASHRATIO,
    'CASHTOLIQDEBT': filenameCASHTOLIQDEBT,
    'OCFTODEBT': filenameOCFTODEBT,
    'PROFITTOOPTTM': filenamePROFITTOOPTTM,
}
filenameDictMomentum = {
    'TRUE_MOM': filenameTRUEMOM,
    'MOM_1M': filenameMOM1M,
    'MOM3M': filenameMOM3M,
    'PVO': filenamePVO,
    'RV1Y': filenameRV1Y,
    'ABORMALVOLUME': filenameABNORMALVOLUME,
    'SKEW': filenameSKEW,
    'MACD': filenameMACD,
}
filenameDictLiq = {
    'TURNOVER_1M': filenameTURNOVER1M,
    'AMOUNTAVG1M': filenameAMOUNTAVG1M,
    'ILLIQ': filenameILLIQ,
    'VOLUME': filenameVOLUME,
    'OWNILLIQ': filenameOWNILLIQ,
    'ADJTURNOVER': filenameADJTURNOVER,
    'DDA': filenameDDA,
}
filenameVolatility = {'VOLATILITY': filenameOwnVol}
In [13]:
# Mixed selection of factors used for trial runs
filenameTest = {
    'PB': filenamePB,
    'YOYGR': filenameYOYGR,
    'ILLIQ': filenameILLIQ,
    'TURNOVER_1M': filenameTURNOVER1M,
    'OWNILLIQ': filenameOWNILLIQ,
    'MOM_1M': filenameMOM1M,
    'CASHRATIO': filenameCASHRATIO,
    'ABOVE20MA': filenameAbove20,
    'OWNVOL': filenameOwnVol,
}
In [14]:
# Factor set under test (the dictionary currently holds nine entries despite its name)
filename7factor = {
    'PB': filenamePB,
    'YOYGR': filenameYOYGR,
    'OCFTODEBT': filenameOCFTODEBT,
    'MOM_1M': filenameMOM1M,
    'VOLATILITY': filenameOwnVol,
    'DDA20': filenameDDA20,
    'OWNILLIQ': filenameOWNILLIQ,
    'IDIVOL': filenameIDIVOL,
    'ADJTOV20': filenameADJTOV20,
}
In [15]:
# Active factor set: its keys label the result-table columns and its items drive the factor loop
filenameDict = filename7factor
In [16]:
# total filename dictionary
#filenameDict = {'PE':filenamePE,'PB':filenamePB, 'PS':filenamePS, 'PCF':filenamePCF, 'YOYGR':filenameYOYGR,'YOYGRPROFIT':filenameYOYNETPROFIT, \
# 'TRUE_MOM':filenameTRUEMOM, 'MOM_1M':filenameMOM1M , 'MOM3M':filenameMOM3M, 'TURNOVER_1M':filenameTURNOVER1M }
# CSV variant of the flag-table load, kept for reference only. The three
# statements had been fused onto a single line (a SyntaxError when executed),
# and filenameST / filenameTradeday / filenameStopFlag now name .h5 files,
# which are loaded with ff.readh5data instead.
#stDF = pd.read_csv(path+filenameST,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
#tradeDayDF = pd.read_csv(path+filenameTradeday,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
#stopFlagDF = pd.read_csv(path+filenameStopFlag,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
In [17]:
# for h5 file: load the three universe-filter flag tables and clip each one
# to the [startTime, endTime] window
stDF, tradeDayDF, stopFlagDF = (
    ff.readh5data(path, flagFile).loc[startTime:endTime]
    for flagFile in (filenameST, filenameTradeday, filenameStopFlag)
)
In [ ]:
In [18]:
# Price table from CSV (first column parsed as the date index), clipped to the window
priceData = pd.read_csv(
    path + filenamePrice,
    infer_datetime_format=True,
    parse_dates=[0],
    index_col=0,
).loc[startTime:endTime]
#benchMarkData = pd.read_csv(path+filenameHS300,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime][ZZ500Index]
# for h5 file: benchmark series = the ZZ500Index column of the index-quote table
benchMarkData = ff.readh5data(path, filenameHS300).loc[startTime:endTime, ZZ500Index]
In [19]:
# Base-10 log of the table ff.getData returns for filenameFCAP (h5 file)
LFCAPDF = np.log10(
    ff.getData(thresholdNum, startTime, endTime, filename=filenameFCAP)
)
In [20]:
# Element [1] of the helper's result is used as the list of month-end dates
# (what element [0] holds is not visible here -- confirm in factorFilterFunctions).
endOfMonthList = ff.getLastDayOfMonth(LFCAPDF.index)[1]
In [21]:
# Show the month-end dates as YYYYMMDD strings
[monthEnd.date().strftime("%Y%m%d") for monthEnd in endOfMonthList]
Out[21]:
In [22]:
# Returns between consecutive month-ends, with the benchmark-relative
# (active) and log-return options of ff.calcReturn both switched on
activeReturnData = ff.calcReturn(
    priceData,
    endOfMonthList,
    benchmark=benchMarkData,
    activeReturn=True,
    logReturn=True,
)
In [23]:
# resample('M').size() counts the rows in each calendar month; the running total
# minus one is the integer position of each month's last row, so this shows the
# final observation of every month.
activeReturnData.iloc[activeReturnData.resample('M').size().cumsum().sub(1)]
Out[23]:
In [24]:
# Industry table restricted to the month-end dates (CSV variant kept commented for reference)
#IndustryDF = pd.read_csv(path+filenameIndu,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[endOfMonthList] # for csv file
IndustryDF = ff.readh5data(path,filenameZXIndustry).loc[endOfMonthList] # for h5 file
In [25]:
# Preview only the first rows; dumping the whole table bloats the saved notebook
IndustryDF.head()
Out[25]:
In [26]:
# Generate the useless-stock list for every month-end date; the list comes from
# ff.GetSTNewSuspend, which is given the ST, trade-day-count and stop-flag tables
filterdict = {}
for monthEnd in endOfMonthList:
    filterdict[monthEnd] = ff.GetSTNewSuspend(monthEnd, stDF, tradeDayDF, stopFlagDF)
    # progress log: the date and the number of excluded stocks
    print("%s %s" % (monthEnd, len(filterdict[monthEnd])))
In [27]:
# Log market cap after ff.winsorAndnorm (presumably winsorised and standardised on
# each month-end with the filterdict stocks excluded -- confirm in factorFilterFunctions);
# used as the size exposure when factors are neutralised.
normalizedLFCAP = ff.winsorAndnorm(LFCAPDF, filterdict, endOfMonthList)
In [28]:
# Create new dataframes to save the outcome.
# Every table has one row per month-end in endOfMonthList[:-1] and one float
# column per factor key of filenameDict; all five are handed to
# ff.calReturnAndIC in the factor loop. (The former module-level `global`
# statements were no-ops and have been dropped.)
def _emptyFactorTable():
    """Return an all-NaN float frame: rows = month-ends, columns = factor names."""
    # list(...) yields flat column labels; the earlier `[filenameDict.keys()]`
    # nested the keys inside a second list, which pandas turns into a MultiIndex.
    return pd.DataFrame(index=endOfMonthList[:-1],
                        columns=list(filenameDict.keys()),
                        data=None, dtype=float)

returnofFactor = _emptyFactorTable()
tValueofFactor = _emptyFactorTable()
pValueofFactor = _emptyFactorTable()
ICFactor = _emptyFactorTable()
ICpValue = _emptyFactorTable()
In [29]:
# Preview the last rows of the p-value table
pValueofFactor.tail()
Out[29]:
In [ ]:
In [30]:
# ---------All above are global variables --------
In [31]:
def modifyUQdata(filename):
    """Load a factor CSV and align its columns with those of ``stDF``.

    Column labels are matched on the part before the first '.', so a file
    column whose code also appears in ``stDF`` is renamed to the ``stDF``
    label. Labels without a match keep their name and are dropped by the
    final column selection.

    Parameters
    ----------
    filename : str
        File name appended to the module-level ``path``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the parsed first CSV column, holding exactly the
        columns of ``stDF`` in the same order. Columns that are missing
        from the file are filled with NaN (float).
    """
    # `infer_datetime_format` is left out on purpose: it is deprecated in
    # current pandas and `parse_dates` alone produces the same date index.
    sparedata = pd.read_csv(path + filename, parse_dates=[0], index_col=0)

    newcolumnList = stDF.columns.tolist()
    # code prefix -> full stDF label, giving O(1) lookups instead of list scans
    columndict = {label.split('.')[0]: label for label in newcolumnList}
    sparedata.columns = [columndict.get(label.split('.')[0], label.split('.')[0])
                         for label in sparedata.columns.tolist()]

    # Append all-NaN float columns for labels the file does not provide.
    present = set(sparedata.columns)
    addNanColumns = [label for label in newcolumnList if label not in present]
    addData = pd.DataFrame(index=sparedata.index, columns=addNanColumns, dtype=float)
    sparedata = pd.concat([sparedata, addData], axis=1)

    # Keep only the stDF columns, in stDF order.
    return sparedata[newcolumnList]
In [32]:
# Run the single-factor test for every (factor name, file name) pair:
# load -> winsorise/normalise -> neutralise against size and industry ->
# write return / t / p / IC statistics into the result tables.
for factorName, factorFile in filenameDict.items():
    print(factorName)
    print(factorFile)
    # files whose name contains 'Uqer' need their columns realigned first
    factorData = (modifyUQdata(factorFile).loc[startTime:endTime]
                  if 'Uqer' in factorFile
                  else ff.getData(thresholdNum, startTime, endTime, filename=factorFile))
    normalizedData = ff.winsorAndnorm(factorData, filterdict, endOfMonthList)
    neutralizedData = ff.neutralizeFactor(normalizedData, normalizedLFCAP,
                                          IndustryDF, endOfMonthList)
    ff.calReturnAndIC(returnofFactor, tValueofFactor, pValueofFactor,
                      ICFactor, ICpValue,
                      neutralizedData, activeReturnData, factorName)
In [101]:
# Bar chart: for each factor, the number of months whose |IC| is above the
# grand mean of |IC| taken over all factors and months
(
    ICFactor
    .where(ICFactor.abs() > ICFactor.abs().mean().mean())
    .count()
    .plot(figsize=(16, 10), kind='bar', color=sns.color_palette("GnBu_d", 2))
)
Out[101]:
In [ ]:
In [34]:
# Summary statistics of the ICFactor table, one column per factor
ICFactor.astype(float).describe()
Out[34]:
In [35]:
# Factors with positive IC: summary statistics over the strictly positive
# IC values only (all other entries are masked to NaN first)
Above0df = ICFactor.astype(float).pipe(lambda ic: ic.where(ic > 0)).describe()
Above0df
Out[35]:
In [36]:
# Factors with negative IC: summary statistics over the strictly negative
# IC values only (all other entries are masked to NaN first)
Below0df = ICFactor.astype(float).pipe(lambda ic: ic.where(ic < 0)).describe()
Below0df
Out[36]:
In [37]:
# show the amount of Negative IC and Positive IC in same figure
fig, ax = plt.subplots(figsize=(14, 9))
# counts of positive / negative IC months per factor, taken from the 'count'
# rows of the two describe() tables
totaldf = pd.DataFrame(
    {
        'Above_0': Above0df.loc['count'].values,
        'Below_0': Below0df.loc['count'].values,
    },
    index=Above0df.columns,
)
totaldf.plot(kind='bar', ax=ax, stacked=True, alpha=0.84,
             title='IC Distribution', fontsize=13)
# re-apply the title with a larger font
ax.set_title(ax.get_title(), alpha=0.88, fontsize=30)
Out[37]:
In [ ]:
In [38]:
# Last rows of the factor-return table
returnofFactor.tail()
Out[38]:
In [39]:
# Per factor: is the value in the last row above that factor's mean over all rows?
ICFactor.iloc[-1] > ICFactor.mean()
Out[39]:
In [40]:
# Scale every row so its absolute values sum to 1, then shift the rows down
# by one period
ICFactorTosave = ICFactor.div(ICFactor.abs().sum(axis=1), axis=0).shift(1)
#ICFactorTosave.to_csv((path+'ICfactorWeight8factorsPB.csv'))
In [41]:
# Scale every row so its absolute values sum to 1, then shift the rows down
# by one period
ReturnTosave = returnofFactor.div(returnofFactor.abs().sum(axis=1), axis=0).shift(1)
In [42]:
# Last rows of the row-normalised, one-period-shifted return table
ReturnTosave.tail()
Out[42]:
In [43]:
# calculate correlation between two factors
# NOTE(review): the variable names do not match their contents -- `turnOver`
# holds the file named by filenameADJTOV20 and `illq` the one named by filenameOwnVol.
turnOver = pd.read_csv(
    path + filenameADJTOV20,
    infer_datetime_format=True, parse_dates=[0], index_col=0,
).loc[startTime:endTime]
illq = pd.read_csv(
    path + filenameOwnVol,
    infer_datetime_format=True, parse_dates=[0], index_col=0,
).loc[startTime:endTime]
# correlation on the month-end dates, with no stock filter applied
correlationDF = ff.showCorrelation(
    turnOver, illq, endOfMonthList, filterdic=None
).astype(float)
correlationDF.describe()
Out[43]:
In [ ]:
In [44]:
# Last rows of the running sum of factor returns
returnofFactor.cumsum().tail()
Out[44]:
In [45]:
# Last rows of the factor-return table (same view as shown earlier in the notebook)
returnofFactor.tail()
Out[45]:
In [ ]:
In [46]:
# simple cumulative return: running sum of each factor's returns
fig, ax = plt.subplots(figsize=(14, 9))
returnofFactor.cumsum().plot(
    figsize=(20, 12),
    ax=ax,
    color=sns.color_palette("Paired", 10),
    title='Cumulative Return',
    fontsize=13,
)
# re-apply the title with a larger font
ax.set_title(ax.get_title(), alpha=0.7, fontsize=30)
Out[46]:
In [47]:
# Net worth plot for each factor
# NOTE(review): the compounding uses |return| + 1, so every period counts as a
# gain whatever the sign of the factor return -- confirm this is intended.
fig, ax = plt.subplots(figsize=(18, 14))
(returnofFactor.abs() + 1).cumprod().plot(
    figsize=(22, 14),
    ax=ax,
    color=sns.color_palette("Paired", 10),
    title='Net Worth',
    fontsize=13,
)
# re-apply the title with a larger font, right-aligned
ax.set_title(ax.get_title(), alpha=0.7, fontsize=30, ha='right')
Out[47]:
In [48]:
# Summary statistics of the factor returns, one column per factor
returnofFactor.astype(float).describe()
Out[48]:
In [49]:
# show how two exponential weightings decay on [0, 1]:
# exp(-sqrt(x)) versus exp(-10 * x)
fig = plt.figure(figsize=(16, 9))
x = np.linspace(0, 1, num=40)
y, y1 = np.exp(-np.sqrt(x)), np.exp(-10 * x)
plt.plot(x, y)
plt.plot(x, y1)
Out[49]:
In [ ]:
In [50]:
# Weight from the p-values: exp(-7 * p), shifted down by one period.
# The factor 7 is a hard-coded decay rate.
pWeight = pValueofFactor.astype(float).mul(-7).pipe(np.exp).shift(1)
In [51]:
# Last rows of the p-value based weights
pWeight.tail()
Out[51]:
In [52]:
# Last rows of the factor-return table (same view as shown earlier in the notebook)
returnofFactor.tail()
Out[52]:
In [53]:
# Element-wise product of the normalised, shifted IC weights and the p-value weights
ICweighted = ICFactorTosave * pWeight
#ICweighted.to_csv((path+'IC_Weighted_By_P_Value.csv'))
ICweighted.tail()
Out[53]:
In [54]:
# Combined weight: p-value-weighted return weights plus the IC weights.
# NOTE(review): only the return term is multiplied by pWeight; the IC term is
# added unweighted -- confirm this asymmetry is intended.
totalWeighted = (ReturnTosave * pWeight + ICFactorTosave)
#totalWeighted .to_csv((path+'total_Weighted_By_P_Value.csv'))
In [55]:
# Summary statistics of the p-values, one column per factor
pValueofFactor.astype(float).describe()
Out[55]:
In [108]:
# Share of months in which each factor's p-value is below 0.05, as a bar chart.
# The palette is built here instead of being read from `my_colors`: that name is
# only assigned further down the notebook, so this cell raised a NameError when
# the notebook was run top to bottom on a fresh kernel.
significantShare = pValueofFactor[pValueofFactor < 0.05].count() / len(pValueofFactor)
# one colour per bar; min(...) keeps every RGB channel inside [0, 1]
barColors = [(min(k / 10.0, 1.0), min(k / 20.0, 1.0), 0.75)
             for k in range(len(significantShare))]
pd.DataFrame(significantShare).plot(figsize=(18, 12), kind='bar', color=barColors)
Out[108]:
In [ ]:
In [104]:
#color_set=sns.light_palette((210, 90, 60), input="husl")
# One colour per bar, i.e. per factor column. The palette was previously sized
# by shape[0] (the number of months), which pushes the red/green channels
# above 1.0 once there are more than ten entries; min(...) keeps every channel
# inside [0, 1] for any number of factors.
my_colors = [(min(k / 10.0, 1.0), min(k / 20.0, 1.0), 0.75)
             for k in range(pValueofFactor.shape[1])]
# share of months in which each factor's p-value is below 0.05
(pValueofFactor[pValueofFactor<0.05].count()/len(pValueofFactor)).plot(figsize=(18,12),kind='bar',color=my_colors )
Out[104]:
In [114]:
from itertools import cycle, islice
# Scratch demo of per-bar colours: ten one-key records with random values in [1, 5)
x = []
for i in range(10):
    x.append({i: np.random.randint(1, 5)})
df = pd.DataFrame(x)
# Alternatives tried before settling on the gradient below:
#   ['g', 'b']*5                                   -- two named colours, repeated 5 times
#   [(0.5,0.4,0.5), (0.75, 0.75, 0.25)]*5          -- two custom RGBs alternating over the bars
my_colors = [(k / 10.0, k / 20.0, 0.75) for k in range(len(df))]
# Specify this list of colors as the `color` option to `plot`.
df.plot(kind='bar', stacked=True, color=my_colors)
Out[114]:
In [ ]:
In [57]:
ReturnForPlot = returnofFactor.copy()
# Relabel each row with the YYYYMM code of the month AFTER its date;
# December rolls over to January of the following year.
ReturnForPlot.index = ReturnForPlot.index.map(
    lambda x: 100 * (x.year + x.month // 12) + (x.month % 12 + 1)
)
In [58]:
# Annotated heatmap of the last 40 rows of factor returns
ReturnForPlot = ReturnForPlot.astype(float)
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(ReturnForPlot.iloc[-40:], ax=ax, annot=True)
ax.set_title('Monthly Return of Each Factor', fontsize=18, fontweight='bold')
plt.show()
In [ ]: