In [239]:
#!/Tsan/bin/python
# -*- coding: utf-8 -*-
In [240]:
# Libraries to use
from __future__ import division
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.cluster import KMeans
In [241]:
# Import my own library for factor testing
from SingleFactorTest import factorFilterFunctions as ff
#from config import *
In [242]:
%matplotlib inline
In [243]:
%load_ext line_profiler
In [244]:
# make sure that matplotlib and seaborn can display Chinese characters
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']
sns.set_style("darkgrid",{"font.sans-serif":['simhei', 'Arial']})
In [245]:
# Files to use
filenamePrice = 'Own_Factor_AdjustedPriceForward-1d.csv'
filenameST = 'LZ_CN_STKA_SLCIND_ST_FLAG.h5'
filenameTradeday = 'LZ_CN_STKA_SLCIND_TRADEDAYCOUNT.h5'
filenameStopFlag = 'LZ_CN_STKA_SLCIND_STOP_FLAG.h5'
filenameIndu = 'LZ_GPA_INDU_ZX.csv'
filenameFCAP = 'LZ_CN_STKA_VAL_A_FCAP.h5'
filenameAdjustFactor = 'LZ_CN_STKA_CMFTR_CUM_FACTOR.h5'
filenameHS300 = 'LZ_CN_STKA_INDXQUOTE_CLOSE.h5'
filenameZXIndustry = 'LZ_CN_STKA_INDU_ZX.h5' # the industry each stock belongs to
filenameZXExplanation = 'LZ_GPA_TMP_INDU_ZX.csv'
In [246]:
# Factors
# Value factor
filenamePE = 'LZ_GPA_VAL_PE.csv' # P/E ratio 2012-2016 Cum return 0.9(group1),1.4(group8) monotonicity: Normal
filenamePB = 'LZ_GPA_VAL_PB.csv' # P/B ratio 2012-2016 Cum return 1.1(group2),0.1(group9) monotonicity: Good+
filenamePS = 'LZ_GPA_VAL_PS.csv' # P/S ratio 2012-2016 Cum return 0.9(group0),0.3(group9) monotonicity: Good+
filenamePCF = 'LZ_GPA_VAL_PC.csv' # price-to-cash-flow ratio 2012-2016 Cum return 0.95(group0),0.35(group8) monotonicity: Normal
filenameADJPB = 'Own_Factor_AdjustedPB-1d.csv' # market-cap-adjusted P/B 2012-2016 Cum return 1.1(group2),0.2(group9) monotonicity: Normal
# Growth factor
filenameYOYGR = 'LZ_GPA_FIN_IND_QFA_YOYGR.csv' # single-quarter YoY growth of total operating revenue (%) 2012-2016 Cum return 1.1(group8),0.41(group1) monotonicity: Good+
filenameYOYNETPROFIT = 'LZ_GPA_FIN_IND_QFA_YOYNETPROFIT.csv' # single-quarter YoY growth of net profit attributable to parent-company shareholders (%) 2012-2016 Cum return 1.2(group9),0.2(group1) monotonicity: Good
filenameYOYOCF = 'LZ_GPA_FIN_IND_YOYOCF.csv' # YoY growth of net cash flow from operating activities (%) 2012-2016 Cum return 0.98(group8),0.4(group2) monotonicity: Good
filenameYOYROE = 'LZ_GPA_FIN_IND_YOYROE.csv' # YoY growth of ROE (diluted) (%) 2012-2016 Cum return 1.17(group7),0.18(group1) monotonicity: Normal
filenameYOYBPS = 'LZ_GPA_FIN_IND_YOYBPS.csv' # growth of book value per share relative to the start of the year (%) 2012-2016 Cum return 0.85(group4),0.5(group0) monotonicity: Normal
# Financial factor
filenameCAPITALIZEDTODA = 'LZ_GPA_FIN_IND_CAPITALIZEDTODA.csv' # capital expenditure / depreciation and amortization 2012-2016 Cum return 0.95(group0),0.6(group9) monotonicity: Normal+
filenameCASHRATIO = 'LZ_GPA_FIN_IND_CASHRATIO.csv' # conservative quick ratio 2012-2016 Cum return 0.85(group0),0.53(group2) monotonicity: Normal+
filenameCASHTOLIQDEBT = 'LZ_GPA_FIN_IND_CASHTOLIQDEBT.csv' # cash / current liabilities 2012-2016 Cum return 0.8(group7),0.6(group8) monotonicity: Normal
filenameOCFTODEBT = 'LZ_GPA_FIN_IND_OCFTODEBT.csv' # net operating cash flow / total liabilities 2012-2016 Cum return 0.8(group0),0.55(group4) monotonicity: Normal
filenamePROFITTOOP = 'LZ_GPA_FIN_IND_PROFITTOOP.csv' # total profit / operating revenue 2012-2016 Cum return 0.84(group3),0.23(group9) monotonicity: Normal
filenamePROFITTOOPTTM = 'LZ_GPA_FIN_DERI_COMBO_EBTTOOR_TTM.csv' # total profit / operating revenue (TTM) 2012-2016 Cum return 0.72(group8),0.41(group1) monotonicity: Good
filenameBERYRATIO = 'LZ_GPA_DERI_BerryRatio.csv' # gross margin / operating expenses (Berry ratio) 2012-2016 monotonicity: bad, not useful at all
# Momentum factor
filenameTRUEMOM = 'LZ_GPA_USR_MOM.csv' # true momentum 2012-2016 Cum return 1.0(group3), -0.3(group9) monotonicity: Normal+
filenameMOM1M = 'LZ_GPA_DERI_Momentum_1M.csv' # 1-month reversal 2012-2016 Cum return 0.85(group0), -0.2(group9) monotonicity: Good
filenameMOM3M = 'LZ_GPA_TURNOVER_TurnoverAvg_3M.csv' # 3-month reversal 2012-2016 Cum return 0.65(group7), 0.4(group9) monotonicity: Normal
filenamePVO = 'LZ_GPA_DERI_PVO.csv' # 1-year skewness 2012-2016 Cum return 0.47(group0), -0.3(group9) monotonicity: Good
filenameABORMALVOLUME = 'LZ_GPA_DERI_NormalizedAbormalVolume.csv' # normalized abnormal volume 2012-2016 Cum return 0.7(group0), 0.1(group9) monotonicity: Normal
filenameSKEW = 'LZ_GPA_DERI_TSKEW.csv' # slope 2012-2016 Cum return 1.1(group0), 0.25(group8) monotonicity: Good+
filenameMACD = 'LZ_GPA_DERI_MACD.csv' # MACD 2012-2016 Cum return 0.8(group3), -0.4(group9) monotonicity: Normal
finenameBB = 'LZ_GPA_DERI_BB_20.csv' # Bollinger Bands 2012-2016 Cum return 0.75(group3), 0.05(group9) monotonicity: Normal
# Liquidity factor
filenameTURNOVER1M = 'LZ_GPA_TURNOVER_TurnoverAvg_1M.csv' # 1-month average turnover 2012-2016 Cum return 0.7(group4), 0.1(group9) monotonicity: Normal
filenameAMOUNTAVG1M = 'LZ_GPA_DERI_AmountAvg_1M.csv' # average daily trading value 2012-2016 Cum return 1.52(group0), -0.27(group9) monotonicity: Perfect
filenameILLIQ = 'LZ_GPA_DERI_ILLIQ.csv' # illiquidity factor 2012-2016 Cum return 1.5(group9), -0.07(group1) monotonicity: Perfect
filenameTURNOVER = 'LZ_GPA_VAL_TURN.csv' # turnover 2012-2016 Cum return 0.78(group6), -0.25(group9) monotonicity: Normal (groups roughly similar except group 9)
filenameOWNILLIQ = 'Own_Factor_ILLQ-1d.csv' # illiquidity 2012-2016 Cum return 1.5(group9), -0.25(group0) monotonicity: Perfect
filenameADJILLIQ = 'Own_Factor_ADJ_ILLQ_1D.csv' # illiquidity (market-cap adjusted)
filenameADJTURNOVER = 'LZ_GPA_DERI_adjustedTurnOver_20.csv' # market-cap-adjusted average daily trading value 2012-2016 Cum return 1.25(group0), -0.5(group9) monotonicity: Perfect
# Volatility factor
filenameRV1Y = 'LZ_GPA_DERI_RealizedVolatility_1Y.csv' # 1-year return volatility 2012-2016 Cum return 0.65(group8), 0.4(group1) monotonicity: Normal
filenameOwnVol = 'Own_Factor_Volatility_90d.csv' # 90-day return volatility 2012-2016 Cum return 0.65(group1), -0.26(group9) monotonicity: Good
filenameAbove20 = 'Own_Factor_Above20MA_20d.csv' # average of prices above the 20-day MA 2012-2016 Cum return 0.8(group1), -0.4(group9) monotonicity: Good
filenameTOV20 = 'Own_Factor_Turnover_Volatility_20D.csv' # 20-day turnover volatility
filenameADJTOV20 = 'Own_Factor_ADJ_Turnover_Volatility_20D.csv' # 20-day turnover volatility (market-cap adjusted) 2012-2016 Cum return 1.8(group0), -0.3(group9) monotonicity: Perfect
filenameTOVD20 = 'Own_Factor_Turnover_Volatility_deviation_20D.csv' # 20-day average turnover divided by 500-day average turnover, minus 1
filenameADJTOVD20 = 'Own_Factor_ADJ_Turnover_Volatility_Deviation_20D.csv' # 20-day average turnover divided by 500-day average turnover, minus 1 (market-cap adjusted) Cum return 0.7(group0), -0.2(group9) monotonicity: Good
In [247]:
#
#filenameSpecificVol = 'Own_Factor_Specific_Volatility.csv' # idiosyncratic volatility
filenameAroon = 'Aroon_Allstocks.csv'
filenameAdjTOTrue = 'Own_Factor_AdjustedTurnOver-1d.csv' # true market-cap-adjusted turnover 2012-2016 Cum return 1.0(group7), -0.4(group9) monotonicity: Normal
filenameDDA = 'Own_Factor_DDA-1d.csv' # daily trading value per stock (forward-adjusted) 2012-2016 Cum return 2.2(group0), -0.6(group9) monotonicity: Perfect
filennameQFAEPS = 'LZ_GPA_FIN_IND_QFA_EPS.csv' # single-quarter EPS
filenameDDA20 = 'Own_Factor_DDA-20d.csv' # 20-day mean of DDA 2012-2016 Cum return 2.2(group0), -0.6(group9) monotonicity: Perfect
filenameADJDDA20 = 'Own_Factor_ADJ_DDA_20D.csv' # DDA20 after neutralization (market cap removed only)
filenameIDIVOL = 'Own_Factor_Idiosyncratic_Volatility.csv' # idiosyncratic volatility 2012-2016 Cum return 0.9(group1), 0(group9) monotonicity: Good+
filenameOwnSkewness = 'Own_Factor_Skewness_250d.csv' # 250-day skewness
filenameOwnReturnSkew = 'Own_Factor_Return_Skew_250D.csv' # 250-day return skewness
filenamePPO = 'LZ_GPA_DERI_PPO.csv'
filename_5_20_deviation = 'Own_Factor_5_20_price_deviation_1D.csv' # 5-day average price divided by 20-day average price
filename_5_20_return_deviation = 'Own_Factor_5_20_return_deviation_1D.csv' # 5-day average return divided by 20-day average return
filenameSharpe = 'Own_Factor_sharpe_ratio_20D.csv' # 20-day Sharpe ratio
filenameDownsideRisk = 'Own_Factor_downside_risk_252D.csv' # 252-day downside volatility; not a great factor
filenameSortinoRatio = 'Own_Factor_sortino_ratio_20D.csv' # 20-day Sortino ratio; not a great factor
In [248]:
# Uqer factors
filenameUQAD20 = 'Uqer_factor_AD20.csv' # 20-day moving average of the Accumulation/Distribution Line
filenameUQADTM = 'Uqer_factor_ADTM.csv' # ADTM, a sentiment indicator that gauges buying/selling pressure from the gap between upward and downward swings relative to the open; a sentiment-type factor
filenameUQATR6 = 'Uqer_factor_ATR6.csv' # 6-day Average True Range: moving average of the price trading range over a fixed window (perfect!)
filenameUQAroon = 'Uqer_factor_Aroon.csv' # Aroon: counts the number of periods elapsed since the recent high and the recent low
filenameUQBias10 = 'Uqer_factor_BIAS10.csv' # 10-day BIAS (deviation rate), a moving-average-derived indicator measuring the percentage deviation of the price from its trend
filenameUQCCI10 = 'Uqer_factor_CCI10.csv' # 10-day Commodity Channel Index (CCI); measures whether the price has moved outside its normal range (results not great)
filenameUQKDJ_K = 'Uqer_factor_KDJ_K.csv' # KDJ stochastic K: combines momentum, relative strength and moving averages to measure how far the price has strayed from its normal range (not great)
filenameUQROC6 = 'Uqer_factor_ROC6.csv' # 6-day Price Rate of Change (ROC): compares today's close with the close N days ago
filenameUQRVI = 'Uqer_factor_RVI.csv' # Relative Volatility Index (RVI)
filenameUQCMO = 'Uqer_factor_CMO.csv' # Chande Momentum Oscillator (CMO); quite a good factor with very good monotonicity, especially within industries!
filenameUQRSI = 'Uqer_factor_RSI.csv' # Relative Strength Index (RSI); maybe usable?
filenameUQSkewness = 'Uqer_factor_Skewness.csv' # skewness of the stock price over the past 20 trading days; decent monotonicity within industries
filenameUQOBV20 = 'Uqer_factor_OBV20.csv' # 20-day On Balance Volume (OBV); an excellent factor (but too highly correlated with the size factor)
filenameUQMTM = 'Uqer_factor_MTM.csv' # Momentum Index (MTM); the middle groups (group4, group5) do best, perhaps usable as a screening factor?
filenameUQPVT6 = 'Uqer_factor_PVT6.csv' # Price and Volume Trend (PVT); the middle groups (group3, group4) do best, could serve as a screening factor
filenameUQREC = 'Uqer_factor_REC.csv' # analyst recommendation rating score; no monotonicity, group 9 is best, can serve as a stock-selection screen
filenameUQDAREC = 'Uqer_factor_DAREC.csv' # change in analyst recommendation rating vs. 60 trading days ago; no monotonicity, but group 9 is best
filenameUQGREC = 'Uqer_factor_GREC.csv' # trend of analyst rating changes: sum of DAREC signs over the past 60 trading days; group 5 is best, weak monotonicity
filenameUQREVS20 = 'Uqer_factor_REVS20.csv' # 20-day stock return; perfect, a very good factor, and not strongly correlated with the size factor!
filenameUQREVS5 = 'Uqer_factor_REVS5.csv' # 5-day stock return; decent, but not as good as the 20-day return
filenameUQMA10RegressCoeff12 = 'Uqer_factor_MA10RegressCoeff12.csv' # 12-day linear regression coefficient of the 10-day price moving average; decent monotonicity, roughly monotone from group 3 onward (group 3 is best)
filenameUQWVAD = 'Uqer_factor_WVAD.csv' # William's Variable Accumulation Distribution (WVAD), a volume-weighted price-volume indicator; quite a good factor
# monotone from group 1 onward (best groups are group2 and group3), especially within industries
filenameUQHurst = 'Uqer_factor_Hurst.csv' # Hurst exponent; a good factor (little correlation with size) with decent monotonicity (group9 is best), even clearer within industries
filenameUQMassIndex = 'Uqer_factor_MassIndex.csv' # Mass Index, an oscillator designed by Donald Dorsey that accumulates the width of the price range; mainly used to spot trend-reversal points of surging or extremely weak stocks; a common technical-indicator factor
# a volatility indicator with decent monotonicity: group1 on top, group9 at the bottom
filenameUQKlingerOscillator = 'Uqer_factor_KlingerOscillator.csv' # Klinger Oscillator (volume oscillator); so-so monotonicity, the two ends are worse than the middle, perhaps usable for stock selection
In [249]:
# Weaker factors that may still be useful
filenameOVERVOL = 'Over_Heat_Volume.csv'
filenameSIZE = 'LZ_GPA_VAL_A_FCAP.csv'
filenameExcessReturn = 'Own_Factor_excess_return_20D.csv'
filenameROE = 'LZ_GPA_FIN_IND_ROE.csv'
In [250]:
# ROE volatility over x days
filenameRoeVol60 = 'Own_Factor_ROE_Volatility_60D.csv' # does not match expectations
filenameRoeVol120 = 'Own_Factor_ROE_Volatility_120D.csv' # matches expectations, i.e. groups with low ROE volatility earn better returns, but monotonicity is weak
filenameRoeVol250 = 'Own_Factor_ROE_Volatility_250D.csv' # matches expectations with very good monotonicity; use this factor to describe ROE volatility! good+!
In [251]:
# Non-linear factors
filenameNLFCAP = 'Own_Factor_Non_Linear_Size.1D.csv' # works almost perfectly, but its correlation with the size factor is close to -1, so it is probably still driven by small caps?
filenameBeta = 'Own_Factor_Beta_1D.csv' # beta factor
In [252]:
# Consensus estimates
filenameNetProfitTTM = 'LZ_CN_STKA_CRD_NET_PRFT_FTTM.h5'
In [253]:
# h5 files here
filenameOwnFactorILLIQh5 = 'OwnFactorILLIQ.h5'
filenameOwnFactorADJILLIQh5 = 'OwnFactorADJILLIQ.h5' # a poor factor, works badly
filenameLZBPLR = 'LZ_CN_STKA_DERI_BP_LR.h5' # shareholders' equity to total market cap (total equity / total assets)
filenameOwnFactorSmartMoney = 'OwnfactorSmartMoney1min.h5'
filenameOwnFactorLogSmartMoney = 'OwnfactorlogSmartMoneyCSIall.h5'
filenameOwnMassIndex = 'OwnfactorMassIndex.h5' # Mass Index: a fairly strong factor, not very correlated with size; the first few groups (group1-group3) do not separate well, but the later groups separate nicely
filenameMassIndex800 = 'OwnfactorMassIndex800.h5'
filenameOwnNetIn = 'OwnfactorNetIn.h5' # net money inflow (built from minute bars); mediocre results, poor monotonicity
filenameOwnNetInRatio = 'OwnfactorNetInRatio.h5' # net money inflow / free-float market cap
filenameOwnNetInRatio5d = 'OwnfactorNetInRatio5d.h5' # 5-day average net money inflow / free-float market cap
filenameOwnNetInRatio20d = 'OwnfactorNetInRatio20d.h5' # 20-day average net money inflow / free-float market cap
filenameVolQuantileRatio = 'VolQuantileRatio.h5' # volume-distribution index; overall monotonicity is not that strong, but the first few groups are monotone, so it can be used for stock selection; tick data might work even better
filenameInduSpreadBias = 'OwnFactorInduSpreadBias.h5' # industry spread bias; a good reversal factor with strong monotonicity, mean and median IC of -0.07, but little excess return in 2017
In [254]:
# Adjusted prices
filenameAdjClose = 'OwnfactorAdjustedClose.h5'
In [255]:
# Factors not created by myself
filenameTrueSpreadBias = 'OthersSpreadBias222_1D.csv' # Orient Securities spread-volatility factor; mediocre in the plain test, very good after splitting by industry
filenameIdiosycraticVol = 'Othersid2_std_3m_1D.csv'
In [256]:
# Constants
startTime = datetime.strptime('20120504', '%Y%m%d')
endTime = datetime.strptime('20170228', '%Y%m%d')
path = ff.data_path
timeStampNum = 2500
thresholdNum = 0.2
HS300Index ='000300.SH' # HS300 index code
ZZ500Index = '000905.SH' # ZZ500 index code
In [257]:
# for csv file
'''
stDF = pd.read_csv(path+filenameST,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
tradeDayDF = pd.read_csv(path+filenameTradeday,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
stopFlagDF = pd.read_csv(path+filenameStopFlag,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
indusDF = pd.read_csv(path+filenameZXIndustry,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
induExplanation = pd.read_csv(path+filenameZXExplanation,infer_datetime_format=True,parse_dates=[0],encoding='gb2312')
'''
Out[257]:
In [ ]:
In [ ]:
In [258]:
# for h5 file
stDF = ff.readh5data(path,filenameST).loc[startTime:endTime]
tradeDayDF = ff.readh5data(path,filenameTradeday).loc[startTime:endTime]
stopFlagDF = ff.readh5data(path,filenameStopFlag).loc[startTime:endTime]
indusDF = ff.readh5data(path,filenameZXIndustry).loc[startTime:endTime]
induExplanation = pd.read_csv(path+filenameZXExplanation,infer_datetime_format=True,parse_dates=[0],encoding='gb2312')
In [ ]:
In [350]:
#
sololist = [filenameOwnMassIndex]
#
filenameDict = {'PE':filenamePE,'PB':filenamePB, 'PS':filenamePS, 'PCF':filenamePCF, 'YOYGR':filenameYOYGR,'YOYGRPROFIT':filenameYOYNETPROFIT, \
'TRUE_MOM':filenameTRUEMOM, 'MOM_1M':filenameMOM1M , 'TURNOVER_1M':filenameTURNOVER1M }
In [351]:
induExplanation.tail()
Out[351]:
In [352]:
explanationDict = induExplanation.iloc[:,0].to_dict()
In [353]:
explanationDict
Out[353]:
In [354]:
sololist[0]
Out[354]:
In [ ]:
In [355]:
if 'Uqer' in sololist[0] or 'Others' in sololist[0]:
sparedata = pd.read_csv(path+sololist[0],infer_datetime_format=True,parse_dates=[0],index_col=0)
else:
sparedata = ff.readh5data(path,sololist[0])
In [ ]:
In [356]:
# adjust the column names of the Uqer data to match the own-factor data
if 'Uqer' in sololist[0] or '1min' in sololist[0]:
uqercolumnList = sparedata.columns.tolist()
uqercolumnName = [x.split('.')[0] for x in uqercolumnList]
newcolumnList = stDF.columns.tolist()
newcolumnName = [x.split('.')[0] for x in newcolumnList]
columndict = dict(zip(newcolumnName,newcolumnList))
finalcolumns = []
for stk in uqercolumnName:
if stk in newcolumnName:
stk = columndict[stk]
else:
pass
finalcolumns.append(stk)
sparedata.columns = finalcolumns
addNanColumns = list(set(newcolumnList) - set(sparedata.columns))
addData = pd.DataFrame(index = sparedata.index,columns = addNanColumns,dtype =float)
sparedata = pd.concat([sparedata,addData],axis=1)
sparedata = sparedata[newcolumnList]
In [ ]:
In [ ]:
In [357]:
## easy way to calculate the last day of the month
#stDF['label'] = stDF.index.map(lambda x: (x.year,x.month))
#locationList=(stDF.groupby(['label']).size().cumsum()-1).values
#stDF.iloc[locationList].index
#stDF.groupby(['label'])
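In [ ]:
# A minimal, runnable sketch of the month-end trick described in the commented code above,
# assuming only stDF's DatetimeIndex (ff.getLastDayOfMonth below remains the method actually used).
# Grouping by (year, month) and taking the cumulative group sizes minus one gives the
# positional index of the last trading day of each month.
monthLabel = stDF.index.map(lambda x: (x.year, x.month))
lastDayLoc = (pd.Series(1, index=stDF.index).groupby(monthLabel).size().cumsum() - 1).values
print stDF.index[lastDayLoc][:5]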
In [358]:
# The commented line reads the adjusted price from csv with pd.read_csv; the active line below reads the adjusted close from h5
#priceData = pd.read_csv(path+ filenamePrice ,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
priceData = ff.readh5data(path,filenameAdjClose).loc[startTime:endTime]
#benchMarkData = pd.read_csv(path+filenameHS300,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime][ZZ500Index]
In [ ]:
In [359]:
# Read h5data since the benchmarkData is the h5 file
#priceData = ff.readh5data(path, filenamePrice).loc[startTime:endTime]
benchMarkData = ff.readh5data(path, filenameHS300).loc[startTime:endTime][ZZ500Index]
In [ ]:
In [360]:
LFCAPDF = np.log10(ff.getData(thresholdNum, startTime, endTime,filename = filenameFCAP))
In [ ]:
In [ ]:
In [361]:
#fs = np.log10(pd.read_csv(path+ filenameFCAP ,infer_datetime_format=True,parse_dates=[0],index_col=0))
#fs[fs>fs.quantile(0.9,axis=1)] = np.NaN
#fs.to_csv(path+'Own_factor_Nonlinear_FCAP.csv',na_rep='NaN',date_format='%Y%m%d')
In [362]:
#fs[fs<fs.quantile(0.4,axis=1)]
In [ ]:
In [363]:
endOfMonthList = ff.getLastDayOfMonth(LFCAPDF.index)[1]
In [364]:
#sorted(list(set(endOfMonthList) & set(endOfMonthList1)))
In [ ]:
In [365]:
# calculate correlation between two factors
#if 'Uqer' or '1min' in sololist[0]:
factor1 = sparedata.loc[startTime:endTime]
#else:
#factor1 = pd.read_csv(path+sololist[0],infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
#factor1 = ff.readh5data(path,sololist[0])
correlationDF = ff.showCorrelation(factor1,LFCAPDF, endOfMonthList, filterdic = None).astype(float)
correlationDF.describe()
Out[365]:
In [ ]:
In [ ]:
In [366]:
#%lprun -f ff.getLastDayOfMonth ff.getLastDayOfMonth(LFCAPDF.index)
In [367]:
activeReturnData = ff.calcReturn(priceData, endOfMonthList, benchmark = benchMarkData,activeReturn = True,logReturn = False)
In [ ]:
In [368]:
# Generate the list of stocks to exclude (ST, newly listed, or suspended)
filterdict={}
for i in endOfMonthList:
suspendList = ff.GetSTNewSuspend(i,stDF,tradeDayDF,stopFlagDF)
filterdict[i] = suspendList
print i, len(filterdict[i])
In [369]:
### Calculate the return of each group for the given (solo) factor
totalGroupDict ={}
factorData = ff.getData(thresholdNum, startTime, endTime,availableData = factor1)
for date in endOfMonthList:
factorDataTemp = factorData.loc[:date].tail()
factorDataTemp = factorDataTemp[list(set(factorDataTemp.columns.tolist())- set(filterdict[date]))]
#print factorDataTemp
totalGroupDict[date] = ff.getStockGroup(factorDataTemp,groupNum=10,Mean_Num=20)
In [370]:
#totalGroupDict
In [371]:
ReturnDF = pd.DataFrame(index=endOfMonthList[:-1],columns=totalGroupDict.values()[0].keys(),data=None, dtype =float)
sizeDistribution = pd.DataFrame(index=endOfMonthList[:-1],columns=totalGroupDict.values()[0].keys(),data=None, dtype =float)
for group in ReturnDF.columns.tolist():
for time in ReturnDF.index:
ReturnDF.loc[time, group] = activeReturnData.loc[time][totalGroupDict[time][group]].mean()
sizeDistribution.loc[time, group] = LFCAPDF.loc[time][totalGroupDict[time][group]].quantile()
ReturnDF.sort_index(axis=1,inplace=True)
sizeDistribution.sort_index(axis=1,inplace=True)
In [372]:
# show the size distribution of each group (median of log market cap)
fig = plt.figure(figsize=(16,10))
# Add a subplot
ax = fig.add_subplot(111)
sizeDistribution.median().plot(kind='bar',ax = ax, fontsize =13,title ='Size Distribution of each group',alpha =0.8)
ax.set_title(ax.get_title(),alpha=0.7, fontsize=25)
Out[372]:
In [373]:
sizeQuantile = sizeDistribution.quantile()
sizeMono = sizeQuantile.corr(pd.Series(index = sizeQuantile.index, data = range(len(sizeQuantile))), method = 'spearman')
sizeMono
Out[373]:
In [374]:
# Calc spearman correlation to investigate monotonicity
referSeries = pd.Series(index = ReturnDF.columns, data=range(len(ReturnDF.columns)))
monoDF = pd.DataFrame(index = ReturnDF.index, columns=['Spearman_Cor'], dtype = float)
for date in ReturnDF.index:
monoDF.loc[date] = ReturnDF.loc[date].corr(referSeries,method='spearman')
In [375]:
# Plot
fig = plt.figure(figsize=(18,14))
# Add a subplot
ax = fig.add_subplot(111)
monoDF.plot(figsize=(22,14),ax=ax,fontsize =13,title ='Monotonicity')
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30)
Out[375]:
In [ ]:
In [376]:
### Method to calculate moving max drawdown
from numpy.lib.stride_tricks import as_strided
def windowed_view(x, window_size):
"""Creat a 2d windowed view of a 1d array.
`x` must be a 1d numpy array.
`numpy.lib.stride_tricks.as_strided` is used to create the view.
The data is not copied.
Example:
>>> x = np.array([1, 2, 3, 4, 5, 6])
>>> windowed_view(x, 3)
array([[1, 2, 3],
[2, 3, 4],
[3, 4, 5],
[4, 5, 6]])
"""
y = as_strided(x, shape=(x.size - window_size + 1, window_size),
strides=(x.strides[0], x.strides[0]))
return y
In [377]:
###
def rolling_max_dd(x, window_size, min_periods=1):
"""Compute the rolling maximum drawdown of `x`.
`x` must be a 1d numpy array.
`min_periods` should satisfy `1 <= min_periods <= window_size`.
Returns an 1d array with length `len(x) - min_periods + 1`.
"""
if min_periods < window_size:
pad = np.empty(window_size - min_periods)
pad.fill(x[0])
x = np.concatenate((pad, x))
y = windowed_view(x, window_size)
#print y
rolling_max_y = np.maximum.accumulate(y, axis=1)
#print rolling_max_y
dd = 1-y/rolling_max_y
return np.abs(dd).max(axis=1)
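In [ ]:
# Quick sanity check of rolling_max_dd on a toy series (hypothetical numbers, not from the data).
# For [1.0, 1.2, 0.9, 1.1] the running peaks are [1.0, 1.2, 1.2, 1.2], the drawdowns are
# [0, 0, 0.25, ~0.083], so the rolling maximum drawdown should print [0., 0., 0.25, 0.25].
print rolling_max_dd(np.array([1.0, 1.2, 0.9, 1.1]), window_size=4, min_periods=1)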
In [378]:
max(rolling_max_dd(ReturnDF['group_0'].values,4, min_periods=1))
Out[378]:
In [379]:
# long the top group, short the bottom group
sortGroups = ReturnDF[['group_0','group_9']].cumsum().iloc[-1].sort_values(ascending = False).index
top, bot = sortGroups[0], sortGroups[-1]
# Add another column
ReturnDF['top_bot_comb'] = ReturnDF[top] - ReturnDF[bot]
In [380]:
top, bot
Out[380]:
In [381]:
ReturnDF.tail(10)
Out[381]:
In [382]:
# simple cumulative return
fig = plt.figure(figsize=(18,14))
# Add a subplot
ax = fig.add_subplot(111)
ReturnDF.astype(float).cumsum().plot(figsize=(22,14),ax=ax, color=sns.color_palette("Paired",11),fontsize =13,title ='Cumulative Return')
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30)
Out[382]:
In [ ]:
In [383]:
# Net worth plot
netWorthDF = (ReturnDF.astype(float)+1).cumprod()
fig = plt.figure(figsize=(18,14))
# Add a subplot
ax = fig.add_subplot(111)
netWorthDF.plot(figsize=(22,14),ax=ax,color=sns.color_palette("Paired",11),title ='Net Worth',fontsize =13)
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30, ha='right')
Out[383]:
In [384]:
# Calc spearman correlation to investigate monotonicity
referSeries = pd.Series(index = netWorthDF.iloc[:,:-1].columns, data=range(len(netWorthDF.iloc[:,:-1].columns)))
monoDF = pd.DataFrame(index = netWorthDF.index, columns=['Spearman_Cor'], dtype = float)
for date in netWorthDF.index:
monoDF.loc[date] = netWorthDF.iloc[:,:-1].loc[date].corr(referSeries,method='spearman')
In [ ]:
In [385]:
# Plot
fig = plt.figure(figsize=(18,14))
# Add a subplot
ax = fig.add_subplot(111)
monoDF.plot(figsize=(22,14),ax=ax,fontsize =13,title ='Monotonicity')
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30)
Out[385]:
In [386]:
# monotonicity information
print monoDF.std()
print monoDF.median()
In [387]:
# basic indicator
annualizedReturn = (1+ReturnDF.mean())**12 - 1
annualizedVol = ReturnDF.std()* np.sqrt(12)
sharpeRatio = annualizedReturn / annualizedVol
print 'Annual Return:','\n',annualizedReturn,'\n\n','Annual Volatility:','\n',annualizedVol,'\n\n','Sharpe Ratio:','\n',sharpeRatio
In [388]:
# downside risk: zero out above-mean returns, then take the annualized std of what remains
copyReturn = ReturnDF.copy()
copyReturn[copyReturn > copyReturn.mean()] = 0
downsideRisk = copyReturn.std(skipna = True) * np.sqrt(12)
downsideRisk
Out[388]:
In [ ]:
In [389]:
sortinoRatio = annualizedReturn / downsideRisk
sortinoRatio
Out[389]:
In [390]:
# Max drawdown
maxdd = netWorthDF.copy()
maxdd.iloc[0] = 0
for date in netWorthDF.index[1:]:
maxdd.loc[date] = 1-netWorthDF.loc[date]/netWorthDF.loc[:date].max()
maxddInfo = pd.concat([maxdd.max(),maxdd.idxmax()],axis=1)
maxddInfo.columns = ['Max_drawdown','Time']
maxddInfo
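In [ ]:
# Optional cross-check of the drawdown loop above (not part of the original analysis):
# the same running drawdown can be computed in a vectorized way, since
# netWorthDF.cummax() is the running peak of each net-worth series.
maxddCheck = 1 - netWorthDF / netWorthDF.cummax()
print (maxddCheck.max() - maxdd.max()).abs().max()  # should be ~0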
Out[390]:
In [391]:
# calmar Ratio
calmarRatio = annualizedReturn/ maxddInfo['Max_drawdown']
calmarRatio
Out[391]:
In [ ]:
In [ ]:
In [ ]:
In [392]:
ReturnForPlot =ReturnDF.copy()
In [393]:
ReturnForPlot.index = ReturnForPlot.index.map(lambda x:100*x.year+(1+x.month) if x.month < 12 else\
100*(x.year+1)+1) ### shift each monthly label to the following month, i.e. the month over which the return is realized
In [394]:
### The following part saves the return data of each factor into one DataFrame
In [395]:
#---------------- The following part shows the difference of a risk factor across different market capitalization sizes ----#
#------------- and across different industries ------#
In [396]:
# show the difference of a risk factor across different market capitalization sizes
# capdata should not contain NaN values
# Return: DICTIONARY, the KEY is the date and the VALUE is a tuple of the (small, mid, huge) cap groups
# Inputs:
# capdata: DATAFRAME, the log free-float cap (LFCAP) data
# datelist: LIST, the list of month-end dates
def getGroupsbyCap(capdata, datelist):
capdict ={}
for date in datelist:
capdataindice = capdata.loc[date]
lower = capdataindice.quantile(1/3)
upper = capdataindice.quantile(2/3)
smallcap = capdataindice[capdataindice<=lower].index
midcap = capdataindice[(lower<capdataindice) & (capdataindice<=upper)].index
hugecap = capdataindice[capdataindice>upper].index
capdict[date] = (smallcap,midcap,hugecap)
return capdict
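In [ ]:
# Tiny illustration of getGroupsbyCap with made-up data (hypothetical tickers and a single date),
# just to show the returned structure: {date: (small-cap index, mid-cap index, large-cap index)}.
demoDates = pd.to_datetime(['2012-05-31'])
demoCap = pd.DataFrame(np.arange(9.0).reshape(1, 9), index=demoDates,
                       columns=['stk%d' % i for i in range(9)])
print getGroupsbyCap(demoCap, demoDates)[demoDates[0]]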
In [397]:
# randomly pick 10 of the 29 industry codes
grouplabel = np.random.choice(29,10,replace=False)
grouplabel
Out[397]:
In [398]:
# show the difference of a risk factor across different industries
# industryDF should not contain NaN values
# Return: DICTIONARY, the KEY is the date and the VALUE is a DICTIONARY mapping each industry label to its stocks on that day
# Inputs:
# datelist: LIST, the list of month-end dates
# grouplabel: LIST, the industry labels; usually 3 of them are fixed and the other 3 are chosen at random, e.g. [2,3,6,15,18,25]
# industryDF: DATAFRAME, the industry classification dataframe
def getIndustryDict(datelist,grouplabel,industryDF):
industrydict = {}
industryDF = industryDF.loc[datelist]
for date in datelist:
industryDFindice = industryDF.loc[date]
industrydict[date] = {label:industryDFindice[industryDFindice == label].index for label in grouplabel}
return industrydict
In [ ]:
In [399]:
#------------------------------ Following part is to group stocks within industry --------------------------
In [400]:
wholeIndList = np.array(range(29))
wholeIndDict = getIndustryDict(endOfMonthList,wholeIndList,indusDF)
In [401]:
multindexList = [endOfMonthList,wholeIndList]
induReturnDF = pd.DataFrame(data=None, columns=totalGroupDict.values()[0].keys(),\
index=pd.MultiIndex.from_product(multindexList,names=['time','industry']),dtype=float)
In [402]:
# Group the stocks
# To-do: add a weighting option that uses the same industry weights as the benchmark (i.e. industry-neutralized); see the sketch after this cell
groupNumberThrottle = 10
for i,j in wholeIndDict.iteritems():
print i
factorIndice = factorData.loc[:i].tail()
factorIndice = factorIndice[list(set(factorIndice.columns.tolist())- set(filterdict[i]))] # Remove ST, newly listed and suspended stocks
for ind, stk in j.iteritems():
intersection = list(set(factorIndice.columns.tolist()) & set(stk))
if len(intersection) < groupNumberThrottle:
induReturnDF.loc[i,ind] = 0
continue
else:
stkgroup = ff.getStockGroup(factorIndice[intersection], groupNum=10, Mean_Num=20)
for p,q in stkgroup.iteritems():
try:
induReturnDF.loc[(i, ind), p] = activeReturnData.loc[i][q].mean()
except:
induReturnDF.loc[(i, ind), p] = np.NaN
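In [ ]:
# Hedged sketch of the to-do above: instead of the plain mean over industries taken in the next cell,
# weight each industry's group return by the benchmark's industry weight.
# benchIndWeight is only a placeholder here (equal weights); the real benchmark industry weights
# are not loaded in this notebook.
benchIndWeight = pd.Series(1.0 / len(wholeIndList), index=wholeIndList)
induNeutralReturn = induReturnDF.mul(benchIndWeight, axis=0, level='industry').sum(level='time')
induNeutralReturn.tail()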
In [ ]:
In [403]:
finalReturn = induReturnDF.mean(level = 'time')
finalReturn.sort_index(axis=1,inplace=True)
In [404]:
sortGroups = finalReturn[['group_0','group_9']].cumsum().iloc[-1].sort_values(ascending = False).index
top, bot = sortGroups[0], sortGroups[-1]
# Add another column
finalReturn['top_bot_comb'] = finalReturn[top] - finalReturn[bot]
In [405]:
indNetWorth = (finalReturn+1).cumprod()
fig = plt.figure(figsize=(14,9))
# Add a subplot
ax = fig.add_subplot(111)
indNetWorth.plot(figsize=(22,14),ax=ax,color=sns.color_palette("Paired",11),title ='Net Worth',fontsize =13)
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30, ha='right')
Out[405]:
In [406]:
# basic indicator
annualizedReturnNew = (1+finalReturn.mean())**12 - 1
annualizedVolNew = finalReturn.std()* np.sqrt(12)
sharpeRatioNew = annualizedReturnNew / annualizedVolNew
print 'Annual Return:','\n',annualizedReturnNew,'\n\n','Annual Volatility:','\n',annualizedVolNew,'\n\n','Sharpe Ratio:','\n',sharpeRatioNew
In [407]:
copyReturn = finalReturn.copy()
copyReturn[copyReturn > copyReturn.mean()] = 0
downsideRiskNew = copyReturn.std(skipna = True) * np.sqrt(12)
downsideRiskNew
Out[407]:
In [ ]:
In [408]:
sortinoRatioNew = annualizedReturnNew / downsideRiskNew
sortinoRatioNew
Out[408]:
In [409]:
# Max drawdown
maxdd1 = indNetWorth.copy()
maxdd1.iloc[0] = 0
for date in indNetWorth.index[1:]:
maxdd1.loc[date] = 1-indNetWorth.loc[date]/indNetWorth.loc[:date].max()
maxddInfo1 = pd.concat([maxdd1.max(),maxdd1.idxmax()],axis=1)
maxddInfo1.columns = ['Max_drawdown','Time']
maxddInfo1
Out[409]:
In [ ]:
In [410]:
# calmar Ratio
calmarRatioNew = annualizedReturnNew/ maxddInfo1['Max_drawdown']
calmarRatioNew
Out[410]:
In [411]:
def str_to_datetime_format(string):
return '%Y/%m/%d' if '/' in string else '%Y-%m-%d'
In [412]:
print downsideRisk['top_bot_comb'],sortinoRatio['top_bot_comb']
In [413]:
# save factor performance summary into csv file
if np.isnan(monoDF.median().values[0]) and np.isnan(sharpeRatio['top_bot_comb']):
raise Exception( 'Error! Please check the original data!')
savepath = 'C:/Users/LZJF_02/Desktop/myownliarbry'
infodata = np.array([[startTime,endTime,monoDF.median().values[0],monoDF.std().values[0],annualizedReturn['top_bot_comb'],annualizedVol['top_bot_comb'],sharpeRatio['top_bot_comb'],\
downsideRisk['top_bot_comb'],sortinoRatio['top_bot_comb'], maxddInfo['Max_drawdown'].loc['top_bot_comb'],calmarRatio['top_bot_comb']]])
totalInfo = pd.DataFrame(index =[sololist[0].split('.')[0]],columns = ['Start_time','End_time','Mono_median','Mono_std','Annualized_return','Annualized_volatility','Sharpe_ratio','Downside_Risk',
'Sortino_ratio','Max_drawdown','Calmar_Ratio'],data= infodata)
totalInfo.index.name = 'Factor_Name'
try:
readfacInfo = pd.read_csv(savepath+'/'+'factorInfo.csv',infer_datetime_format=True,parse_dates=[0],index_col=0)
except:
readfacInfo = totalInfo
readfacInfo.to_csv(savepath+'/'+'factorInfo.csv',na_rep='NaN')
factorName = sololist[0].split('.')[0]
if factorName in readfacInfo.index:
print factorName+ ' '+'already in the file!'
try:
begin = datetime.strptime(readfacInfo.loc[factorName]['Start_time'].split()[0],str_to_datetime_format(readfacInfo.loc[factorName]['Start_time']))
end = datetime.strptime(readfacInfo.loc[factorName]['End_time'].split()[0],str_to_datetime_format(readfacInfo.loc[factorName]['End_time']))
except:
print 'No conversion needed! The time type is already a python datetime!'
begin = readfacInfo.loc[factorName]['Start_time']
end = readfacInfo.loc[factorName]['End_time']
print begin,startTime, end, endTime
if begin > startTime or end < endTime:
print 'Update needed'
print readfacInfo.loc[factorName].values
print infodata.flatten()
readfacInfo.loc[factorName] = infodata.flatten()
updatedInfo = readfacInfo
else:
updatedInfo = pd.concat([readfacInfo,totalInfo])
updatedInfo = updatedInfo.rename(columns = {'End_Time': 'End_time'})
updatedInfo.to_csv(savepath+'/'+'factorInfo.csv',na_rep='NaN')
In [414]:
updatedInfo
Out[414]:
In [ ]:
In [415]:
'''Define the risk-preference weights before performing k-means clustering'''
Out[415]:
In [416]:
updatedInfo['Mono_median'] = np.abs(updatedInfo['Mono_median'])
mat =updatedInfo.dropna(axis=0).iloc[:,2:].as_matrix()
# Using sklearn
km = KMeans(n_clusters=4)
result = km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
results = pd.DataFrame(data=labels, columns=['cluster'],index = updatedInfo.index)
In [417]:
results.groupby(['cluster'])
Out[417]:
In [418]:
for name,group in results.groupby(['cluster']):
print group
In [419]:
(maxddInfo1-maxddInfo)
Out[419]:
In [420]:
#-------------------------------------- Section End -------------------------------------
In [421]:
industryDict = getIndustryDict(endOfMonthList,grouplabel,indusDF)
In [422]:
capDict=getGroupsbyCap(LFCAPDF,endOfMonthList)
In [423]:
templist=[endOfMonthList,['mean','median','std']]
capGroupsDF = pd.DataFrame(data=None, columns=['small','mid','huge'],index=pd.MultiIndex.from_product(templist,names=['time','stats']),dtype=float)
indusGroupDF = pd.DataFrame(data=None, columns=grouplabel,index=pd.MultiIndex.from_product(templist,names=['time','stats']),dtype=float)
In [424]:
for date in endOfMonthList:
factorindice = factorData.loc[date]
smallindice = factorindice.loc[list(set(factorindice.index) & set(capDict[date][0]))]
midindice = factorindice.loc[list(set(factorindice.index) & set(capDict[date][1]))]
hugeindice = factorindice.loc[list(set(factorindice.index) & set(capDict[date][2]))]
capGroupsDF.loc[date,'small'] = [smallindice.mean(),smallindice.median(),smallindice.std()]
capGroupsDF.loc[date,'mid'] = [midindice.mean(),midindice.median(),midindice.std()]
capGroupsDF.loc[date,'huge'] = [hugeindice.mean(),hugeindice.median(),hugeindice.std()]
for i in grouplabel:
#print grouplabel
inDFIndice = factorindice.loc[list(set(factorindice.index) & set(industryDict[date][i]))]
indusGroupDF.loc[date,i] = [inDFIndice.mean(),inDFIndice.median(),inDFIndice.std()]
indusGroupDF.rename(columns = {i:u''+explanationDict[i]+'' for i in grouplabel},inplace=True)
In [425]:
indusGroupDF
Out[425]:
In [426]:
capGroupsDF.head()
Out[426]:
In [ ]:
In [ ]:
In [ ]:
In [427]:
newstack = capGroupsDF.stack().unstack('stats').reset_index()
newstack = newstack.rename(columns = {'level_1':'cap'})
newstack = newstack.pivot_table(index='time',columns='cap')
In [428]:
newstack.head()
Out[428]:
In [429]:
fig, axs = plt.subplots(3,1, figsize=(16, 10), facecolor='w', edgecolor='k',sharex=True)
fig.subplots_adjust(hspace = .5, wspace=.001)
for label,num in zip(newstack.columns.levels[0],range(len(newstack.columns.levels[0]))):
newstack[label].plot(ax=axs[num],legend=False)
axs[num].set_title(label)
axs[0].legend()
Out[429]:
In [430]:
newstackInd = indusGroupDF.stack().unstack('stats').reset_index()
newstackInd = newstackInd.rename(columns = {'level_1':'industry'})
newstackInd = newstackInd.pivot_table(index='time',columns='industry')
In [431]:
fig, axs = plt.subplots(3,1, figsize=(22, 16), facecolor='w', edgecolor='k',sharex=True)
fig.subplots_adjust(hspace = .5, wspace=.001)
for label,num in zip(newstackInd.columns.levels[0],range(len(newstackInd.columns.levels[0]))):
newstackInd[label].plot(ax=axs[num],color=sns.color_palette("Paired",10),legend=False)
axs[num].set_title(label)
axs[0].legend()
Out[431]:
In [432]:
capcopy=capGroupsDF.copy()
copydata=capcopy.reset_index()
In [433]:
copydata.pivot_table(index='time',columns='stats').head()
Out[433]:
In [434]:
fig, axs = plt.subplots(3,1, figsize=(16, 10), facecolor='w', edgecolor='k',sharex=True)
fig.subplots_adjust(hspace = .5, wspace=.001)
for label,num in zip(set(copydata['stats']),range(len(set(copydata['stats'])))):
#print label,num
dataslice = copydata[copydata['stats']==label]
dataslice.plot(ax=axs[num])
axs[num].set_title(label)
In [ ]: