In [239]:
#!/Tsan/bin/python
# -*- coding: utf-8 -*-

In [240]:
# Libraries to use
from __future__ import division 
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.cluster import KMeans

In [241]:
# Import My own library for factor testing
from SingleFactorTest import factorFilterFunctions as ff
#from config import *

In [242]:
%matplotlib inline

In [243]:
%load_ext line_profiler


The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler

In [244]:
# make sure that matplotib and seaborn can show Chinese
import matplotlib as mpl
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['font.serif'] = ['SimHei']
sns.set_style("darkgrid",{"font.sans-serif":['simhei', 'Arial']})

In [245]:
# Files to use 
filenamePrice = 'Own_Factor_AdjustedPriceForward-1d.csv'
filenameST = 'LZ_CN_STKA_SLCIND_ST_FLAG.h5'
filenameTradeday = 'LZ_CN_STKA_SLCIND_TRADEDAYCOUNT.h5'
filenameStopFlag = 'LZ_CN_STKA_SLCIND_STOP_FLAG.h5'
filenameIndu = 'LZ_GPA_INDU_ZX.csv'
filenameFCAP =  'LZ_CN_STKA_VAL_A_FCAP.h5'
filenameAdjustFactor = 'LZ_CN_STKA_CMFTR_CUM_FACTOR.h5'
filenameHS300 = 'LZ_CN_STKA_INDXQUOTE_CLOSE.h5'
filenameZXIndustry = 'LZ_CN_STKA_INDU_ZX.h5'  # 各股票对应的行业
filenameZXExplanation = 'LZ_GPA_TMP_INDU_ZX.csv'

In [246]:
# Factors
# Value factor
filenamePE ='LZ_GPA_VAL_PE.csv'  # 市盈率   012-2016 Cum return 0.9(group1),1.4(group8) monotonicity: Normal
filenamePB ='LZ_GPA_VAL_PB.csv'  # 市净率   012-2016 Cum return 1.1(group2),0.1(group9) monotonicity: Good+
filenamePS = 'LZ_GPA_VAL_PS.csv' # 市销率   012-2016 Cum return 0.9(group0),0.3(group9) monotonicity: Good+
filenamePCF = 'LZ_GPA_VAL_PC.csv' # 市现率   2012-2016 Cum return 0.95(group0),0.35(group8) monotonicity: Normal
filenameADJPB ='Own_Factor_AdjustedPB-1d.csv' # 市值调整市净率 2012-2016 Cum return 1.1(group2),0.2(group9) monotonicity: Normal

# Growth factor
filenameYOYGR = 'LZ_GPA_FIN_IND_QFA_YOYGR.csv'  # 单季度.营业总收入同比增长率(%) 2012-2016 Cum return 1.1(group8),0.41(group1) monotonicity: Good+
filenameYOYNETPROFIT = 'LZ_GPA_FIN_IND_QFA_YOYNETPROFIT.csv' # 单季度.归属母公司股东的净利润同比增长率(%) 2012-2016 Cum return 1.2(group9),0.2(group1) monotonicity: Good
filenameYOYOCF = 'LZ_GPA_FIN_IND_YOYOCF.csv' # 同比增长率-经营活动产生的现金流量净额(%) 2012-2016 Cum return 0.98(group8),0.4(group2) monotonicity: Good
filenameYOYROE = 'LZ_GPA_FIN_IND_YOYROE.csv' # 同比增长率-净资产收益率(摊薄)(%) 2012-2016 Cum return 1.17(group7),0.18(group1) monotonicity: Normal
filenameYOYBPS = 'LZ_GPA_FIN_IND_YOYBPS.csv' # 相对年初增长率-每股净资产(%)  2012-2016 Cum return 0.85(group4),0.5(group0) monotonicity: Normal

# Financial factor
filenameCAPITALIZEDTODA = 'LZ_GPA_FIN_IND_CAPITALIZEDTODA.csv'  # 资本支出/折旧和摊销 2012-2016 Cum return 0.95(group0),0.6(group9) monotonicity: Normal+
filenameCASHRATIO = 'LZ_GPA_FIN_IND_CASHRATIO.csv' # 保守速动比率 2012-2016 Cum return 0.85(group0),0.53(group2) monotonicity: Normal+
filenameCASHTOLIQDEBT = 'LZ_GPA_FIN_IND_CASHTOLIQDEBT.csv' # 货币资金/流动负债 2012-2016 Cum return 0.8(group7),0.6(group8) monotonicity: Normal
filenameOCFTODEBT = 'LZ_GPA_FIN_IND_OCFTODEBT.csv' # 经营活动产生的现金流量净额/负债合计 2012-2016 Cum return 0.8(group0),0.55(group4) monotonicity: Normal
filenamePROFITTOOP = 'LZ_GPA_FIN_IND_PROFITTOOP.csv' # 利润总额/营业收入 2012-2016 Cum return 0.84(group3),0.23(group9) monotonicity: Normal
filenamePROFITTOOPTTM ='LZ_GPA_FIN_DERI_COMBO_EBTTOOR_TTM.csv' # # 利润总额/营业收入(TTM) 2012-2016 Cum return 0.72(group8),0.41(group1) monotonicity: Good

filenameBERYRATIO = 'LZ_GPA_DERI_BerryRatio.csv' #边际毛利/营运开支 2012-2016  monotonicity: bad ,not useful at all

# Momentum factor
filenameTRUEMOM = 'LZ_GPA_USR_MOM.csv' # 真动量 2012-2016 Cum return 1.0(group3), - 0.3(group9) monotonicity: Normal+
filenameMOM1M = 'LZ_GPA_DERI_Momentum_1M.csv' # 一月反转 2012-2016 Cum return 0.85(group0), - 0.2(group9) monotonicity: Good
filenameMOM3M = 'LZ_GPA_TURNOVER_TurnoverAvg_3M.csv' # 三月反转 2012-2016 Cum return 0.65(group7), 0.4(group9) monotonicity: Normal
filenamePVO = 'LZ_GPA_DERI_PVO.csv' # 一年偏度  2012-2016 Cum return 0.47(group0), -0.3(group9) monotonicity: Good
filenameABORMALVOLUME = 'LZ_GPA_DERI_NormalizedAbormalVolume.csv' # 归一化异常交易量  2012-2016 Cum return 0.7(group0), 0.1(group9) monotonicity: Normal
filenameSKEW = 'LZ_GPA_DERI_TSKEW.csv' # 斜率   2012-2016 Cum return 1.1(group0), 0.25(group8) monotonicity: Good+
filenameMACD = 'LZ_GPA_DERI_MACD.csv'  # MACD   2012-2016 Cum return 0.8(group3), -0.4(group9) monotonicity: Normal
finenameBB = 'LZ_GPA_DERI_BB_20.csv'  # Bolling_bands  2012-2016 Cum return 0.75(group3) , 0.05(group9)  monotonicity: Normal 

# Liquidity factor
filenameTURNOVER1M = 'LZ_GPA_TURNOVER_TurnoverAvg_1M.csv' # 一月换手率均值 2012-2016 Cum return 0.7(group4) , 0.1(group9)  monotonicity: Normal 
filenameAMOUNTAVG1M = 'LZ_GPA_DERI_AmountAvg_1M.csv'    #  日均成交额   2012-2016 Cum return 1.52(group0) , -0.27(group9)  monotonicity: Perfect
filenameILLIQ = 'LZ_GPA_DERI_ILLIQ.csv'         # 非流动性因子  2012-2016 Cum return 1.5(group9), - 0.07(group1) monotonicity: Perfect
filenameTURNOVER = 'LZ_GPA_VAL_TURN.csv'    # 换手率 2012-2016 Cum return  0.78(group6), - 0.25(group9) monotonicity: Normal(除了第九组外其他差不多)
filenameOWNILLIQ ='Own_Factor_ILLQ-1d.csv' # 非流动性 2012-2016 Cum return 1.5(group9), - 0.25(group0) monotonicity: Perfect
filenameADJILLIQ = 'Own_Factor_ADJ_ILLQ_1D.csv' # 非流动性(市值调整) 
filenameADJTURNOVER = 'LZ_GPA_DERI_adjustedTurnOver_20.csv' # 市值调整日均成交额 2012-2016 Cum return 1.25(group0), - 0.5(group9) monotonicity: Perfect

# Volatility factor
filenameRV1Y = 'LZ_GPA_DERI_RealizedVolatility_1Y.csv' # 一年收益波动   2012-2016 Cum return  0.65(group8), 0.4(group1) monotonicity : Normal
filenameOwnVol = 'Own_Factor_Volatility_90d.csv'   # 90天收益波动率     2012-2016 Cum return 0.65(group1), - 0.26(group9) monotonicity: Good
filenameAbove20 = 'Own_Factor_Above20MA_20d.csv'  # 高于20天MA的价格平均 2012-2016 Cum return 0.8(group1) , -0.4(group9)  monotonicity:Good

filenameTOV20 = 'Own_Factor_Turnover_Volatility_20D.csv' # 20天换手率波动率
filenameADJTOV20 = 'Own_Factor_ADJ_Turnover_Volatility_20D.csv' # 20天换手率波动率(市值调整) 2012-2016 Cum return 1.8(group0), -0.3(group9) monotonicity: perfect
filenameTOVD20 = 'Own_Factor_Turnover_Volatility_deviation_20D.csv' # 20天换手率均值除500天换手率均值-1
filenameADJTOVD20 = 'Own_Factor_ADJ_Turnover_Volatility_Deviation_20D.csv' # 20天换手率均值除500天换手率均值-1(市值调整)  Cum return 0.7(group0), -0.2(group9)Good

In [247]:
#
#filenameSpecificVol = 'Own_Factor_Specific_Volatility.csv' # 特质波动率
filenameAroon = 'Aroon_Allstocks.csv' 
filenameAdjTOTrue = 'Own_Factor_AdjustedTurnOver-1d.csv'# 真市值调整换手率 2012-2016 Cum return 1.0(group7), - 0.4(group9) monotonicity: Normal
filenameDDA = 'Own_Factor_DDA-1d.csv'  # 股票每日成交额(前复权)  2012-2016 Cum return 2.2(group0), - 0.6(group9) monotonicity: Perfect
filennameQFAEPS = 'LZ_GPA_FIN_IND_QFA_EPS.csv' # 单季度每股收益
filenameDDA20 = 'Own_Factor_DDA-20d.csv' # DDA二十天均值   2012-2016 Cum return 2.2(group0), - 0.6(group9) monotonicity: Perfect
filenameADJDDA20 ='Own_Factor_ADJ_DDA_20D.csv' # 中性化(仅去市值)后的DDA20
filenameIDIVOL = 'Own_Factor_Idiosyncratic_Volatility.csv' # 特异波动率  2012-2016 Cum return 0.9(group1), 0(group9) monotonicity: Good+
filenameOwnSkewness = 'Own_Factor_Skewness_250d.csv'  # 250日偏度
filenameOwnReturnSkew = 'Own_Factor_Return_Skew_250D.csv' # 250日收益率偏度
filenamePPO ='LZ_GPA_DERI_PPO.csv' #
filename_5_20_deviation = 'Own_Factor_5_20_price_deviation_1D.csv'  #  5日价格平均除以20日价格平均
filename_5_20_return_deviation = 'Own_Factor_5_20_return_deviation_1D.csv'  # 5日平均收益除以20日平均收益
filenameSharpe = 'Own_Factor_sharpe_ratio_20D.csv'  # 20日夏普比率
filenameDownsideRisk = 'Own_Factor_downside_risk_252D.csv' # 252日下行波动率  不太好的因子
filenameSortinoRatio = 'Own_Factor_sortino_ratio_20D.csv'  # 20日sortino比率   不太好的因子

In [248]:
# Uqer factor
filenameUQAD20 = 'Uqer_factor_AD20.csv'    #累积/派发线(Accumulation / Distribution Line)的20日均线
filenameUQADTM = 'Uqer_factor_ADTM.csv'    #动态买卖气指标,用开盘价的向上波动幅度和向下波动幅度的距离差值来描述人气高低的指标。属于情绪类因子
filenameUQATR6 = 'Uqer_factor_ATR6.csv'    #6日均幅指标(Average TRUE Ranger),取一定时间周期内的股价波动幅度的移动平均值 (# perfect!)
filenameUQAroon = 'Uqer_factor_Aroon.csv'   #Aroon通过计算自价格达到近期最高值和最低值以来所经过的期间数
filenameUQBias10 = 'Uqer_factor_BIAS10.csv' # 10日乖离率,简称Y值,是移动平均原理派生的一项技术指标,表示股价偏离趋向指标斩百分比值
filenameUQCCI10 = 'Uqer_factor_CCI10.csv'   # 10日顺势指标(Commodity Channel Index),专门测量股价是否已超出常态分布范围(效果不太理想)
filenameUQKDJ_K = 'Uqer_factor_KDJ_K.csv'   #随机指标。它综合了动量观念、强弱指标及移动平均线的优点,用来度量股价脱离价格正常范围的变异程度。(不理想0
filenameUQROC6 = 'Uqer_factor_ROC6.csv' #6日变动速率(Price Rate of Change),以当日的收盘价和N天前的收盘价比较
filenameUQRVI = 'Uqer_factor_RVI.csv' # 相对离散指数(Relative Volatility Index)
filenameUQCMO = 'Uqer_factor_CMO.csv' # 钱德动量摆动指标(Chande Momentum Osciliator)  # 相当不错的因子,单调性相当好!特别是分行业之后!
filenameUQRSI = 'Uqer_factor_RSI.csv'  # 相对强弱指标(Relative Strength Index)   或许还可以?
filenameUQSkewness = 'Uqer_factor_Skewness.csv' # 过去20个交易日股价的偏度,分行业后又不错的单调性
filenameUQOBV20 = 'Uqer_factor_OBV20.csv'   # 20日能量潮指标(On Balance Volume,OBV) 非常棒的因子(但与市值因子相关性太高)
filenameUQMTM ='Uqer_factor_MTM.csv'   # 动量指标(Momentom Index) 中间组(group4,group5)比较好,或许可以当作检验因子?
filenameUQPVT6 = 'Uqer_factor_PVT6.csv'  # 价量趋势(Price and Volume Trend)指标    中间组效果较好(可当作检验因子,3组,4组)
filenameUQREC = 'Uqer_factor_REC.csv'     # 析师推荐评级(Recommended rating score by analyst  没有单调性,第九组最好,可以当作一个选股的检验因子
filenameUQDAREC = 'Uqer_factor_DAREC.csv'   # 分析师推荐评级变化,相比于60个交易日前。 没有单调性,但第九组最好
filenameUQGREC = 'Uqer_factor_GREC.csv'    # 分析师推荐评级变化趋势,过去60个交易日内的DAREC 符号加和. 第五组最好,单调性不太好
filenameUQREVS20 = 'Uqer_factor_REVS20.csv'  # 股票的20日收益   perfect! 很好的因子!并且与市值因子相关性不大!
filenameUQREVS5 ='Uqer_factor_REVS5.csv'  # 股票的5日收益  不错,但不如20日收益
filenameUQMA10RegressCoeff12 = 'Uqer_factor_MA10RegressCoeff12.csv' # 10日价格平均线12日线性回归系数  单调性还不错,从第三组开始基本单调(第三组最好)
filenameUQWVAD = 'Uqer_factor_WVAD.csv' # 威廉变异离散量(William's variable accumulation distribution),是一种将成交量加权的量价指标 相当不错的因子
                                        # 从group 1 开始单调性明显(最好的组为group2 ,group3),特别是分了行业之后的
filenameUQHurst = 'Uqer_factor_Hurst.csv'  # 赫斯特指数   不错的因子(和市值因子没啥相关性),单调性不错(group9在最好),特别是分行业之后效果更明显
filenameUQMassIndex = 'Uqer_factor_MassIndex.csv'  # 梅斯线(Mass Index),本指标是Donald Dorsey累积股价波幅宽度之后所设计的震荡曲线。其最主要的作用,在于寻找飙涨股或者极度弱势股的重要趋势反转点。属于常用技术指标类因子
                                                   # 波动率指标,单调性不错,group1在最上面,group9在最下面。
filenameUQKlingerOscillator = 'Uqer_factor_KlingerOscillator.csv'  # 成交量摆动指标.单调性一般,俩头差中间好,或许可用来选股。

In [249]:
# Not good factors but maybe useful
filenameOVERVOL = 'Over_Heat_Volume.csv' 
filenameSIZE = 'LZ_GPA_VAL_A_FCAP.csv'
filenameExcessReturn = 'Own_Factor_excess_return_20D.csv'
filenameROE = 'LZ_GPA_FIN_IND_ROE.csv'

In [250]:
# ROE x日 波动率
filenameRoeVol60 = 'Own_Factor_ROE_Volatility_60D.csv'   # 和预期不符
filenameRoeVol120 = 'Own_Factor_ROE_Volatility_120D.csv'  # 符合预期,即ROE波动小的组收益较好,单调性不明显
filenameRoeVol250 = 'Own_Factor_ROE_Volatility_250D.csv'   # 符合预期,单调性非常好,描述ROE波动率可用此因子!good+!

In [251]:
# Non_linear factor
filenameNLFCAP ='Own_Factor_Non_Linear_Size.1D.csv'  # 效果很完美,但是与市值因子相关性接近-1,可能还是受小市值影响?
filenameBeta  = 'Own_Factor_Beta_1D.csv'   # 贝塔因子

In [252]:
# 一致预期
filenameNetProfitTTM = 'LZ_CN_STKA_CRD_NET_PRFT_FTTM.h5'

In [253]:
# h5 files here
filenameOwnFactorILLIQh5 = 'OwnFactorILLIQ.h5' 
filenameOwnFactorADJILLIQh5 = 'OwnFactorADJILLIQ.h5' # 垃圾因子效果很差
filenameLZBPLR = 'LZ_CN_STKA_DERI_BP_LR.h5' # 股东权益总市值比 (股东权益合计/总资产)
filenameOwnFactorSmartMoney = 'OwnfactorSmartMoney1min.h5'
filenameOwnFactorLogSmartMoney = 'OwnfactorlogSmartMoneyCSIall.h5'
filenameOwnMassIndex = 'OwnfactorMassIndex.h5'   # mass index  比较优秀的因子,和市值不太相关,前几组区分的不算开(group1-group3),但后面几组区分的很好。
filenameMassIndex800 = 'OwnfactorMassIndex800.h5' 
filenameOwnNetIn = 'OwnfactorNetIn.h5'  # 资金净流入(分钟线合成) 效果一般,单调性不好
filenameOwnNetInRatio = 'OwnfactorNetInRatio.h5' # 资金净流入 / 流动市值
filenameOwnNetInRatio5d = 'OwnfactorNetInRatio5d.h5' # 资金净流入5日平均 / 流动市值
filenameOwnNetInRatio20d = 'OwnfactorNetInRatio20d.h5' # 资金净流入20日平均 / 流动市值
filenameVolQuantileRatio = 'VolQuantileRatio.h5'  # 成交量分布指数,整体单调性不是那么强,但前几组单调性不错,可用来选股。用tick数据做可能效果更好。
filenameInduSpreadBias = 'OwnFactorInduSpreadBias.h5' # 行业偏差指数,很好的反转因子,单调性强,IC均值及中位数为-0.07,但17年没有什么超额收益。

In [254]:
# 复权后价格
filenameAdjClose = 'OwnfactorAdjustedClose.h5'

In [255]:
# factors not creat by myself
filenameTrueSpreadBias = 'OthersSpreadBias222_1D.csv' #东方证券价差波动率因子,正常测试效果一般,分行业后效果非常好
filenameIdiosycraticVol = 'Othersid2_std_3m_1D.csv'

In [256]:
# Constants
startTime =  datetime.strptime('20120504', '%Y%m%d')
endTime = datetime.strptime('20170228', '%Y%m%d')

path = ff.data_path
timeStampNum = 2500
thresholdNum = 0.2
HS300Index ='000300.SH' # HS300 index code
ZZ500Index = '000905.SH' #   ZZ500 index code

In [257]:
# for csv file
'''
stDF = pd.read_csv(path+filenameST,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
tradeDayDF = pd.read_csv(path+filenameTradeday,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
stopFlagDF = pd.read_csv(path+filenameStopFlag,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
indusDF = pd.read_csv(path+filenameZXIndustry,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
induExplanation = pd.read_csv(path+filenameZXExplanation,infer_datetime_format=True,parse_dates=[0],encoding='gb2312')
'''


Out[257]:
"\nstDF = pd.read_csv(path+filenameST,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]\ntradeDayDF = pd.read_csv(path+filenameTradeday,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]\nstopFlagDF = pd.read_csv(path+filenameStopFlag,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]\nindusDF = pd.read_csv(path+filenameZXIndustry,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]\ninduExplanation = pd.read_csv(path+filenameZXExplanation,infer_datetime_format=True,parse_dates=[0],encoding='gb2312')\n"

In [ ]:


In [ ]:


In [258]:
# for h5 file
stDF = ff.readh5data(path,filenameST).loc[startTime:endTime]
tradeDayDF = ff.readh5data(path,filenameTradeday).loc[startTime:endTime]
stopFlagDF = ff.readh5data(path,filenameStopFlag).loc[startTime:endTime]
indusDF = ff.readh5data(path,filenameZXIndustry).loc[startTime:endTime]
induExplanation = pd.read_csv(path+filenameZXExplanation,infer_datetime_format=True,parse_dates=[0],encoding='gb2312')

In [ ]:


In [350]:
#
sololist = [filenameOwnMassIndex]
#
filenameDict = {'PE':filenamePE,'PB':filenamePB, 'PS':filenamePS, 'PCF':filenamePCF, 'YOYGR':filenameYOYGR,'YOYGRPROFIT':filenameYOYNETPROFIT, \
                'TRUE_MOM':filenameTRUEMOM, 'MOM_1M':filenameMOM1M , 'TURNOVER_1M':filenameTURNOVER1M }

In [351]:
induExplanation.tail()


Out[351]:
LZ_GPA_TMP_INDU_ZX-t
24 电子元器件
25 通信
26 计算机
27 传媒
28 综合

In [352]:
explanationDict = induExplanation.iloc[:,0].to_dict()

In [353]:
explanationDict


Out[353]:
{0: u'\u77f3\u6cb9\u77f3\u5316',
 1: u'\u7164\u70ad',
 2: u'\u6709\u8272\u91d1\u5c5e',
 3: u'\u7535\u529b\u53ca\u516c\u7528\u4e8b\u4e1a',
 4: u'\u94a2\u94c1',
 5: u'\u57fa\u7840\u5316\u5de5',
 6: u'\u5efa\u7b51',
 7: u'\u5efa\u6750',
 8: u'\u8f7b\u5de5\u5236\u9020',
 9: u'\u673a\u68b0',
 10: u'\u7535\u529b\u8bbe\u5907',
 11: u'\u56fd\u9632\u519b\u5de5',
 12: u'\u6c7d\u8f66',
 13: u'\u5546\u8d38\u96f6\u552e',
 14: u'\u9910\u996e\u65c5\u6e38',
 15: u'\u5bb6\u7535',
 16: u'\u7eba\u7ec7\u670d\u88c5',
 17: u'\u533b\u836f',
 18: u'\u98df\u54c1\u996e\u6599',
 19: u'\u519c\u6797\u7267\u6e14',
 20: u'\u94f6\u884c',
 21: u'\u975e\u94f6\u884c\u91d1\u878d',
 22: u'\u623f\u5730\u4ea7',
 23: u'\u4ea4\u901a\u8fd0\u8f93',
 24: u'\u7535\u5b50\u5143\u5668\u4ef6',
 25: u'\u901a\u4fe1',
 26: u'\u8ba1\u7b97\u673a',
 27: u'\u4f20\u5a92',
 28: u'\u7efc\u5408'}

In [354]:
sololist[0]


Out[354]:
'OwnfactorMassIndex.h5'

In [ ]:


In [355]:
if ('Uqer' or 'Others') in sololist[0]:
    sparedata = pd.read_csv(path+sololist[0],infer_datetime_format=True,parse_dates=[0],index_col=0)
else:
    sparedata = ff.readh5data(path,sololist[0])

In [ ]:


In [356]:
# adjust the columns name of the Uqer data same as the own factor
if 'Uqer' or '1min' in sololist[0]: 
    uqercolumnList = sparedata.columns.tolist()
    uqercolumnName = [x.split('.')[0] for x in uqercolumnList]
    newcolumnList = stDF.columns.tolist()
    newcolumnName = [x.split('.')[0] for x in newcolumnList]
    columndict = dict(zip(newcolumnName,newcolumnList))
    finalcolumns = []
    for stk in uqercolumnName:
        if stk in newcolumnName:
            stk = columndict[stk]
        else:
            pass
        finalcolumns.append(stk)
    sparedata.columns = finalcolumns
    addNanColumns = list(set(newcolumnList) - set(sparedata.columns))
    addData = pd.DataFrame(index = sparedata.index,columns = addNanColumns,dtype =float)
    sparedata = pd.concat([sparedata,addData],axis=1)
    sparedata = sparedata[newcolumnList]

In [ ]:


In [ ]:


In [357]:
## easy way to calculate the last day of the month
#stDF['label'] = stDF.index.map(lambda x: (x.year,x.month))
#locationList=(stDF.groupby(['label']).size().cumsum()-1).values
#stDF.iloc[locationList].index
#stDF.groupby(['label'])

In [358]:
# Using pd.read_csv method since the adjusted price data is in csv format
#priceData =  pd.read_csv(path+ filenamePrice ,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
priceData =  ff.readh5data(path,filenameAdjClose).loc[startTime:endTime]
#benchMarkData = pd.read_csv(path+filenameHS300,infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime][ZZ500Index]

In [ ]:


In [359]:
# Read h5data since the benchmarkData is the h5 file

#priceData = ff.readh5data(path, filenamePrice).loc[startTime:endTime]
benchMarkData = ff.readh5data(path, filenameHS300).loc[startTime:endTime][ZZ500Index]

In [ ]:


In [360]:
LFCAPDF = np.log10(ff.getData(thresholdNum, startTime, endTime,filename = filenameFCAP))

In [ ]:


In [ ]:


In [361]:
#fs = np.log10(pd.read_csv(path+ filenameFCAP ,infer_datetime_format=True,parse_dates=[0],index_col=0))

#fs[fs>fs.quantile(0.9,axis=1)] = np.NaN
#fs.to_csv(path+'Own_factor_Nonlinear_FCAP.csv',na_rep='NaN',date_format='%Y%m%d')

In [362]:
#fs[fs<fs.quantile(0.4,axis=1)]

In [ ]:


In [363]:
endOfMonthList = ff.getLastDayOfMonth(LFCAPDF.index)[1]

In [364]:
#sorted(list(set(endOfMonthList) & set(endOfMonthList1)))

In [ ]:


In [365]:
# calculate correlation between two factors
#if  'Uqer' or '1min' in sololist[0]:
factor1 = sparedata.loc[startTime:endTime]
#else:
    #factor1 = pd.read_csv(path+sololist[0],infer_datetime_format=True,parse_dates=[0],index_col=0).loc[startTime:endTime]
    #factor1 = ff.readh5data(path,sololist[0])
correlationDF = ff.showCorrelation(factor1,LFCAPDF, endOfMonthList, filterdic = None).astype(float)
correlationDF.describe()


Out[365]:
Pearson Spearman
count 58.000000 58.000000
mean 0.063635 0.026797
std 0.069319 0.120121
min -0.079835 -0.192784
25% 0.011941 -0.049422
50% 0.068115 0.027325
75% 0.106348 0.082577
max 0.230051 0.411534

In [ ]:


In [ ]:


In [366]:
#%lprun -f ff.getLastDayOfMonth ff.getLastDayOfMonth(LFCAPDF.index)

In [367]:
activeReturnData = ff.calcReturn(priceData, endOfMonthList, benchmark = benchMarkData,activeReturn = True,logReturn = False)

In [ ]:


In [368]:
# Generate the useles stock list
filterdict={}
for i in endOfMonthList:
    suspendList = ff.GetSTNewSuspend(i,stDF,tradeDayDF,stopFlagDF)
    filterdict[i] = suspendList
    print i, len(filterdict[i])


2012-05-31 00:00:00 1366
2012-06-29 00:00:00 1352
2012-07-31 00:00:00 1269
2012-08-31 00:00:00 1243
2012-09-28 00:00:00 1211
2012-10-31 00:00:00 1186
2012-11-30 00:00:00 1180
2012-12-31 00:00:00 1168
2013-01-31 00:00:00 1161
2013-02-28 00:00:00 1161
2013-03-29 00:00:00 1169
2013-04-26 00:00:00 1163
2013-05-31 00:00:00 1158
2013-06-28 00:00:00 1166
2013-07-31 00:00:00 1176
2013-08-30 00:00:00 1174
2013-09-30 00:00:00 1176
2013-10-31 00:00:00 1183
2013-11-29 00:00:00 1198
2013-12-31 00:00:00 1184
2014-01-30 00:00:00 1178
2014-02-28 00:00:00 1196
2014-03-31 00:00:00 1232
2014-04-30 00:00:00 1213
2014-05-30 00:00:00 1216
2014-06-30 00:00:00 1227
2014-07-31 00:00:00 1220
2014-08-29 00:00:00 1236
2014-09-30 00:00:00 1216
2014-10-31 00:00:00 1225
2014-11-28 00:00:00 1250
2014-12-31 00:00:00 1219
2015-01-30 00:00:00 1218
2015-02-27 00:00:00 1206
2015-03-31 00:00:00 1264
2015-04-30 00:00:00 1234
2015-05-29 00:00:00 1285
2015-06-30 00:00:00 1304
2015-07-31 00:00:00 1336
2015-08-31 00:00:00 1290
2015-09-30 00:00:00 1178
2015-10-30 00:00:00 1142
2015-11-30 00:00:00 1056
2015-12-31 00:00:00 1014
2016-01-29 00:00:00 1007
2016-02-29 00:00:00 1027
2016-03-31 00:00:00 1012
2016-04-29 00:00:00 1037
2016-05-31 00:00:00 1011
2016-06-30 00:00:00 982
2016-07-29 00:00:00 932
2016-08-31 00:00:00 903
2016-09-30 00:00:00 893
2016-10-31 00:00:00 861
2016-11-30 00:00:00 842
2016-12-30 00:00:00 812
2017-01-26 00:00:00 783
2017-02-28 00:00:00 779

In [369]:
### Calculate the return of each group of given factor(solo factor)
totalGroupDict ={}
factorData = ff.getData(thresholdNum, startTime, endTime,availableData = factor1)

for date in endOfMonthList:
    factorDataTemp = factorData.loc[:date].tail()
    factorDataTemp = factorDataTemp[list(set(factorDataTemp.columns.tolist())- set(filterdict[date]))]
    #print factorDataTemp
    totalGroupDict[date] = ff.getStockGroup(factorDataTemp,groupNum=10,Mean_Num=20)

In [370]:
#totalGroupDict

In [371]:
ReturnDF = pd.DataFrame(index=endOfMonthList[:-1],columns=totalGroupDict.values()[0].keys(),data=None, dtype =float)
sizeDistribution = pd.DataFrame(index=endOfMonthList[:-1],columns=totalGroupDict.values()[0].keys(),data=None, dtype =float)
for group in ReturnDF.columns.tolist():
    for time in ReturnDF.index:
        ReturnDF.loc[time][group] = activeReturnData.loc[time][totalGroupDict[time][group]].mean()
        sizeDistribution.loc[time][group] = LFCAPDF.loc[time][totalGroupDict[time][group]].quantile()
ReturnDF.sort_index(axis=1,inplace=True)
sizeDistribution.sort_index(axis=1,inplace=True)

In [372]:
# show size distribution(quantile plot)
fig = plt.figure(figsize=(16,10))
# Add a subplot
ax = fig.add_subplot(111)
sizeDistribution.median().plot(kind='bar',ax = ax, fontsize =13,title ='Size Distribution of each group',alpha =0.8)
ax.set_title(ax.get_title(),alpha=0.7, fontsize=25)


Out[372]:
<matplotlib.text.Text at 0x5313c2b0>

In [373]:
sizeDistribution.quantile()
sizeMono = sizeDistribution.quantile().corr(pd.Series(index = sizeDistribution.quantile().index,data = range(len(sizeDistribution.quantile()))),\
                                           method = 'spearman')
sizeMono


Out[373]:
0.30909090909090903

In [374]:
# Calc spearman correlation to investigate monotonicity
referSeries = pd.Series(index = ReturnDF.columns, data=range(len(ReturnDF.columns)))
monoDF = pd.DataFrame(index = ReturnDF.index, columns=['Spearman_Cor'], dtype = float)
for date in ReturnDF.index:
    monoDF.loc[date] = ReturnDF.loc[date].corr(referSeries,method='spearman')

In [375]:
# Plot
fig = plt.figure(figsize=(18,14))
# Add a subplot
ax = fig.add_subplot(111)
monoDF.plot(figsize=(22,14),ax=ax,fontsize =13,title ='Monotonicity')
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30)


Out[375]:
<matplotlib.text.Text at 0x3966e8d0>

In [ ]:


In [376]:
### Method to calculate moving max drawdown
from numpy.lib.stride_tricks import as_strided
def windowed_view(x, window_size):
    """Creat a 2d windowed view of a 1d array.

    `x` must be a 1d numpy array.

    `numpy.lib.stride_tricks.as_strided` is used to create the view.
    The data is not copied.

    Example:

    >>> x = np.array([1, 2, 3, 4, 5, 6])
    >>> windowed_view(x, 3)
    array([[1, 2, 3],
           [2, 3, 4],
           [3, 4, 5],
           [4, 5, 6]])
    """
    y = as_strided(x, shape=(x.size - window_size + 1, window_size),
                   strides=(x.strides[0], x.strides[0]))
    return y

In [377]:
###
def rolling_max_dd(x, window_size, min_periods=1):
    """Compute the rolling maximum drawdown of `x`.

    `x` must be a 1d numpy array.
    `min_periods` should satisfy `1 <= min_periods <= window_size`.

    Returns an 1d array with length `len(x) - min_periods + 1`.
    """
    if min_periods < window_size:
        pad = np.empty(window_size - min_periods)
        pad.fill(x[0])
        x = np.concatenate((pad, x))
    y = windowed_view(x, window_size)
    #print y
    rolling_max_y = np.maximum.accumulate(y, axis=1)
    #print rolling_max_y
    dd = 1-y/rolling_max_y
    return np.abs(dd).max(axis=1)

In [378]:
max(rolling_max_dd(ReturnDF['group_0'].values,4, min_periods=1))


Out[378]:
9.4090168655254303

In [379]:
# long top group short bot group 
sortGroups = ReturnDF[['group_0','group_9']].cumsum().iloc[-1].sort_values(ascending = False).index
top, bot = sortGroups[0], sortGroups[-1]

# Add another column
ReturnDF['top_bot_comb'] = ReturnDF[top] - ReturnDF[bot]

In [380]:
top, bot


Out[380]:
('group_0', 'group_9')

In [381]:
ReturnDF.tail(10)


Out[381]:
group_0 group_1 group_2 group_3 group_4 group_5 group_6 group_7 group_8 group_9 top_bot_comb
2016-04-29 -0.014786 -0.012027 0.004326 -0.008509 0.002305 -0.007659 -0.010035 -0.008823 -0.021024 -0.026645 0.011859
2016-05-31 0.033453 0.015366 0.025428 0.018033 0.024678 0.032415 0.029878 0.034312 0.027137 0.019690 0.013763
2016-06-30 -0.002798 0.016313 0.018038 0.001802 -0.000997 -0.003439 -0.018455 -0.030696 -0.053449 -0.064684 0.061887
2016-07-29 0.014565 0.005168 0.008527 0.008308 0.021788 0.018760 0.010603 0.017110 0.001674 0.018260 -0.003696
2016-08-31 -0.012339 -0.006737 0.011002 0.017276 0.015889 0.025597 0.014195 0.022748 0.014206 0.000898 -0.013238
2016-09-30 0.020434 0.018856 0.019749 0.017414 0.014997 0.016332 0.010510 0.012242 0.002624 0.003860 0.016574
2016-10-31 0.002239 -0.001973 0.002343 0.006996 0.005440 0.006260 0.014275 0.012441 0.015380 0.000811 0.001428
2016-11-30 0.006055 -0.004464 -0.007019 0.002998 0.002857 0.016680 0.005462 0.012217 0.009900 0.014976 -0.008921
2016-12-30 -0.008760 -0.015727 -0.013924 -0.009137 -0.004085 -0.009040 -0.016822 -0.005369 -0.011491 -0.019497 0.010736
2017-01-26 -0.009173 -0.004230 0.001045 0.007732 0.003869 0.010781 0.004113 0.002603 0.008428 -0.000319 -0.008854

In [382]:
# simple cumulative return 
fig = plt.figure(figsize=(18,14))
# Add a subplot
ax = fig.add_subplot(111)
ReturnDF.astype(float).cumsum().plot(figsize=(22,14),ax=ax,  color=sns.color_palette("Paired",11),fontsize =13,title ='Cumulative Return')
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30)


Out[382]:
<matplotlib.text.Text at 0x3946e518>

In [ ]:


In [383]:
# Net worth plot 
netWorthDF = (ReturnDF.astype(float)+1).cumprod()
fig = plt.figure(figsize=(18,14))
# Add a subplot
ax = fig.add_subplot(111)
netWorthDF.plot(figsize=(22,14),ax=ax,color=sns.color_palette("Paired",11),title ='Net Worth',fontsize =13)
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30, ha='right')


Out[383]:
<matplotlib.text.Text at 0x3970ed68>

In [384]:
# Calc spearman correlation to investigate monotonicity
referSeries = pd.Series(index = netWorthDF.iloc[:,:-1].columns, data=range(len(netWorthDF.iloc[:,:-1].columns)))
monoDF = pd.DataFrame(index = netWorthDF.index, columns=['Spearman_Cor'], dtype = float)
for date in netWorthDF.index:
    monoDF.loc[date] = netWorthDF.iloc[:,:-1].loc[date].corr(referSeries,method='spearman')

In [ ]:


In [385]:
# Plot
fig = plt.figure(figsize=(18,14))
# Add a subplot
ax = fig.add_subplot(111)
monoDF.plot(figsize=(22,14),ax=ax,fontsize =13,title ='Monotonicity')
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30)


Out[385]:
<matplotlib.text.Text at 0x39a17860>

In [386]:
# monotonicity information 
print monoDF.std()
print monoDF.median()


Spearman_Cor    0.277084
dtype: float64
Spearman_Cor   -0.769697
dtype: float64

In [387]:
# basic indicator
annualizedReturn = (1+ReturnDF.mean())**12 - 1
annualizedVol = ReturnDF.std()* np.sqrt(12)
sharpeRatio = annualizedReturn / annualizedVol 
print 'Annual Return:','\n',annualizedReturn,'\n\n','Annual Volatility:','\n',annualizedVol,'\n\n','Sharpe Ratio:''\n',sharpeRatio


Annual Return: 
group_0         0.156024
group_1         0.153781
group_2         0.153068
group_3         0.134514
group_4         0.154951
group_5         0.161712
group_6         0.143073
group_7         0.091823
group_8         0.053177
group_9         0.052519
top_bot_comb    0.098778
dtype: float64 

Annual Volatility: 
group_0         0.097661
group_1         0.089834
group_2         0.092746
group_3         0.078567
group_4         0.087283
group_5         0.081228
group_6         0.079780
group_7         0.080485
group_8         0.091390
group_9         0.109763
top_bot_comb    0.125205
dtype: float64 

Sharpe Ratio:
group_0         1.597611
group_1         1.711838
group_2         1.650407
group_3         1.712104
group_4         1.775277
group_5         1.990852
group_6         1.793356
group_7         1.140877
group_8         0.581872
group_9         0.478479
top_bot_comb    0.788935
dtype: float64

In [388]:
copyReturn = ReturnDF.copy()
copyReturn [copyReturn > copyReturn .mean()] =0
downsideRisk = ReturnDF.std(skipna = True) * np.sqrt(12)
downsideRisk


Out[388]:
group_0         0.097661
group_1         0.089834
group_2         0.092746
group_3         0.078567
group_4         0.087283
group_5         0.081228
group_6         0.079780
group_7         0.080485
group_8         0.091390
group_9         0.109763
top_bot_comb    0.125205
dtype: float64

In [ ]:


In [389]:
sortinoRatio = annualizedReturn / downsideRisk
sortinoRatio


Out[389]:
group_0         1.597611
group_1         1.711838
group_2         1.650407
group_3         1.712104
group_4         1.775277
group_5         1.990852
group_6         1.793356
group_7         1.140877
group_8         0.581872
group_9         0.478479
top_bot_comb    0.788935
dtype: float64

In [390]:
# Max drawdown
maxdd = netWorthDF.copy()
maxdd.iloc[0] = 0
for date in netWorthDF.index[1:]:
    maxdd.loc[date] = 1-netWorthDF.loc[date]/netWorthDF.loc[:date].max()
maxddInfo = pd.concat([maxdd.max(),maxdd.idxmax()],axis=1)
maxddInfo.columns = ['Max_drawdown','Time']
maxddInfo


Out[390]:
Max_drawdown Time
group_0 0.072089 2014-11-28
group_1 0.065952 2014-11-28
group_2 0.078445 2014-11-28
group_3 0.067686 2014-11-28
group_4 0.069246 2014-11-28
group_5 0.061807 2015-06-30
group_6 0.071651 2015-06-30
group_7 0.090931 2015-06-30
group_8 0.133007 2015-06-30
group_9 0.121273 2015-06-30
top_bot_comb 0.129742 2014-11-28

In [391]:
# calmar Ratio
calmarRatio = annualizedReturn/ maxddInfo['Max_drawdown']
calmarRatio


Out[391]:
group_0         2.164318
group_1         2.331718
group_2         1.951274
group_3         1.987335
group_4         2.237691
group_5         2.616400
group_6         1.996809
group_7         1.009811
group_8         0.399806
group_9         0.433069
top_bot_comb    0.761344
dtype: float64

In [ ]:


In [ ]:


In [ ]:


In [392]:
ReturnForPlot =ReturnDF.copy()

In [393]:
ReturnForPlot.index = ReturnForPlot.index.map(lambda x:100*x.year+(1+x.month) if x.month < 12 else\
                                             100*(x.year+1)+1) ### this shows the real time
plt.figure(figsize=(24,13)) ax = plt.axes() sns.heatmap(ReturnForPlot.iloc[-40:].astype(float),ax=ax, annot=True) ax.set_title('Monthly Return of Each Group',fontsize=18, fontweight='bold') plt.show()

In [394]:
### the following part is to save return data of every factor into one Dataframe

In [395]:
#---------------- the following part shows the show the difference of a risk factor across different market capitalzation size----#
#-------------  and different industries ------#

In [396]:
# show the difference of a risk factor through different market capitalzation size
# capdata should not contain Nan value
# Return: DICTIONARY, the KEY is the date and the Value is the tuple of the groups
# Inputs: 
# capdata: DATAFRAME ,the LFCAP DATA
# datelist: LIST, the datelist of the end month
def getGroupsbyCap(capdata, datelist):
    capdict ={}
    for date in datelist:
        capdataindice = capdata.loc[date]
        lower = capdataindice.quantile(1/3)
        upper = capdataindice.quantile(2/3)
        smallcap = capdataindice[capdataindice<=lower].index
        midcap = capdataindice[(lower<capdataindice) & (capdataindice<=upper)].index
        hugecap = capdataindice[capdataindice>upper].index
        capdict[date] = (smallcap,midcap,hugecap)
    return capdict

In [397]:
# industry number
grouplabel = np.random.choice(29,10,replace=False)
grouplabel


Out[397]:
array([19, 26, 18,  4, 12, 16,  3, 25, 22, 27])

In [398]:
# show the difference of a risk factor through different industry
# capdata should not contain Nan value
# Return: DICTIONARY, the KEY is the date and the Value is the DICTIONARY of the groups of each industry on that day
# Inputs: 
# datelist: LIST, the datelist of the end month
# grouplable: LIST, the industry label,usually 3 of them is constant and other 3 is randomed  alike [2,3,6,15,18,25]
# industryDF: DATAFRAME,the industry dataframe(could )
def getIndustryDict(datelist,grouplabel,industryDF):
    industrydict = {}
    industryDF = industryDF.loc[datelist]
    for date in datelist:
        industryDFindice = industryDF.loc[date]
        industrydict[date] = {label:industryDFindice[industryDFindice == label].index for label in grouplabel}
    return industrydict

In [ ]:


In [399]:
#------------------------------ Following part is to group  stocks within industry --------------------------

In [400]:
wholeIndList = np.array(range(29))

wholeIndDict = getIndustryDict(endOfMonthList,wholeIndList,indusDF)

In [401]:
multindexList = [endOfMonthList,wholeIndList]
induReturnDF  = pd.DataFrame(data=None, columns=totalGroupDict.values()[0].keys(),\
                             index=pd.MultiIndex.from_product(multindexList,names=['time','industry']),dtype=float)

In [402]:
# Group the stocks
# To-Dos: Add new weighting option for return,Same weight as the benchmark inter the different industries.(Namely industry-neutralized) 
groupNumberThrottle = 10
for i,j in wholeIndDict.iteritems():
    print i
    factorIndice = factorData.loc[:i].tail()
    factorIndice = factorIndice[list(set(factorIndice.columns.tolist())- set(filterdict[date]))]  # Remove ST ,new and suspending stk
    for ind, stk in j.iteritems():
        intersection = list(set(factorIndice.columns.tolist()) & set(stk))
        if len(intersection) < groupNumberThrottle:
            induReturnDF.loc[i,ind] = 0
            continue
        else:
            stkgroup = ff.getStockGroup(factorIndice[intersection], groupNum=10, Mean_Num=20)
            for p,q in stkgroup.iteritems():
                try:
                    induReturnDF.loc[i,ind][p] = activeReturnData.loc[i][q].mean()
                except:
                    induReturnDF.loc[i,ind][p] = np.NaN


2015-01-30 00:00:00
2016-02-29 00:00:00
2014-01-30 00:00:00
2016-12-30 00:00:00
2016-06-30 00:00:00
2013-02-28 00:00:00
2015-03-31 00:00:00
2017-01-26 00:00:00
2015-10-30 00:00:00
2013-04-26 00:00:00
2016-03-31 00:00:00
2016-11-30 00:00:00
2013-12-31 00:00:00
2013-03-29 00:00:00
2014-02-28 00:00:00
2017-02-28 00:00:00
2016-10-31 00:00:00
2016-01-29 00:00:00
2014-07-31 00:00:00
2012-06-29 00:00:00
2015-11-30 00:00:00
2012-11-30 00:00:00
2016-09-30 00:00:00
2012-05-31 00:00:00
2013-07-31 00:00:00
2014-06-30 00:00:00
2012-08-31 00:00:00
2012-12-31 00:00:00
2013-05-31 00:00:00
2014-04-30 00:00:00
2012-10-31 00:00:00
2016-07-29 00:00:00
2013-10-31 00:00:00
2013-11-29 00:00:00
2012-09-28 00:00:00
2014-12-31 00:00:00
2016-08-31 00:00:00
2014-11-28 00:00:00
2012-07-31 00:00:00
2015-04-30 00:00:00
2014-03-31 00:00:00
2013-06-28 00:00:00
2015-08-31 00:00:00
2016-05-31 00:00:00
2014-10-31 00:00:00
2015-05-29 00:00:00
2016-04-29 00:00:00
2014-05-30 00:00:00
2014-09-30 00:00:00
2015-06-30 00:00:00
2013-09-30 00:00:00
2013-08-30 00:00:00
2015-07-31 00:00:00
2015-12-31 00:00:00
2015-09-30 00:00:00
2014-08-29 00:00:00
2015-02-27 00:00:00
2013-01-31 00:00:00

In [ ]:


In [403]:
finalReturn = induReturnDF.mean(level = 'time')
finalReturn.sort_index(axis=1,inplace=True)

In [404]:
sortGroups = finalReturn[['group_0','group_9']].cumsum().iloc[-1].sort_values(ascending = False).index
top, bot = sortGroups[0], sortGroups[-1]

# Add another column
finalReturn['top_bot_comb'] = finalReturn[top] - finalReturn[bot]

In [405]:
indNetWorth = (finalReturn+1).cumprod()
fig = plt.figure(figsize=(14,9))
# Add a subplot
ax = fig.add_subplot(111)
indNetWorth.plot(figsize=(22,14),ax=ax,color=sns.color_palette("Paired",11),title ='Net Worth',fontsize =13)
ax.set_title(ax.get_title(),alpha=0.7, fontsize=30, ha='right')


Out[405]:
<matplotlib.text.Text at 0x3e0b6550>

In [406]:
# basic indicator
annualizedReturnNew = (1+finalReturn.mean())**12 - 1
annualizedVolNew = finalReturn.std()* np.sqrt(12)
sharpeRatioNew = annualizedReturnNew / annualizedVolNew 
print 'Annual Return:','\n',annualizedReturnNew,'\n\n','Annual Volatility:','\n',annualizedVolNew,'\n\n','Sharpe Ratio:''\n',sharpeRatioNew


Annual Return: 
group_0         0.238441
group_1         0.116152
group_2         0.114914
group_3         0.119476
group_4         0.135069
group_5         0.143708
group_6         0.121505
group_7         0.101965
group_8         0.044896
group_9         0.019274
top_bot_comb    0.215397
dtype: float64 

Annual Volatility: 
group_0         0.153136
group_1         0.055010
group_2         0.061030
group_3         0.059027
group_4         0.067602
group_5         0.059314
group_6         0.053708
group_7         0.065812
group_8         0.061020
group_9         0.081083
top_bot_comb    0.185337
dtype: float64 

Sharpe Ratio:
group_0         1.557052
group_1         2.111452
group_2         1.882927
group_3         2.024107
group_4         1.998007
group_5         2.422836
group_6         2.262310
group_7         1.549350
group_8         0.735754
group_9         0.237709
top_bot_comb    1.162193
dtype: float64

In [407]:
copyReturn = finalReturn.copy()
copyReturn[copyReturn > copyReturn.mean()] = 0
downsideRiskNew = copyReturn.std(skipna = True) * np.sqrt(12)
downsideRiskNew


Out[407]:
group_0         0.071487
group_1         0.018493
group_2         0.024701
group_3         0.021934
group_4         0.029448
group_5         0.027667
group_6         0.023831
group_7         0.035313
group_8         0.030831
group_9         0.045844
top_bot_comb    0.103966
dtype: float64

In [ ]:


In [408]:
sortinoRatioNew = annualizedReturnNew / downsideRiskNew
sortinoRatioNew


Out[408]:
group_0         3.335459
group_1         6.280990
group_2         4.652199
group_3         5.447072
group_4         4.586683
group_5         5.194258
group_6         5.098672
group_7         2.887437
group_8         1.456195
group_9         0.420424
top_bot_comb    2.071800
dtype: float64

In [409]:
# Max drawdown
maxdd1 = indNetWorth.copy()
maxdd1.iloc[0] = 0
for date in indNetWorth.index[1:]:
    maxdd1.loc[date] = 1-indNetWorth.loc[date]/indNetWorth.loc[:date].max()
maxddInfo1 = pd.concat([maxdd1.max(),maxdd1.idxmax()],axis=1)
maxddInfo1.columns = ['Max_drawdown','Time']
maxddInfo1


Out[409]:
Max_drawdown Time
group_0 0.156435 2015-09-30
group_1 0.024299 2014-11-28
group_2 0.035784 2014-11-28
group_3 0.032926 2013-06-28
group_4 0.050773 2015-06-30
group_5 0.050719 2015-06-30
group_6 0.036702 2015-06-30
group_7 0.063952 2015-06-30
group_8 0.081955 2015-07-31
group_9 0.106494 2015-06-30
top_bot_comb 0.258448 2015-10-30

In [ ]:


In [410]:
# calmar Ratio
calmarRatioNew = annualizedReturnNew/ maxddInfo1['Max_drawdown']
calmarRatioNew


Out[410]:
group_0         1.524224
group_1         4.780080
group_2         3.211374
group_3         3.628654
group_4         2.660243
group_5         2.833399
group_6         3.310557
group_7         1.594401
group_8         0.547812
group_9         0.180987
top_bot_comb    0.833423
dtype: float64

In [411]:
def str_to_datetime_format(string):
    return '%Y/%m/%d' if '/'in string else '%Y-%m-%d'

In [412]:
print  downsideRisk['top_bot_comb'],sortinoRatio['top_bot_comb']


0.125204756957 0.788934956205

In [413]:
# save factor performance summary into csv file
if np.isnan(monoDF.median().values[0]) and np.isnan(sharpeRatio['top_bot_comb']):
    raise  Exception( 'Error! Please check the original data!')
savepath = 'C:/Users/LZJF_02/Desktop/myownliarbry'
infodata = np.array([[startTime,endTime,monoDF.median().values[0],monoDF.std().values[0],annualizedReturn['top_bot_comb'],annualizedVol['top_bot_comb'],sharpeRatio['top_bot_comb'],\
    downsideRisk['top_bot_comb'],sortinoRatio['top_bot_comb'], maxddInfo['Max_drawdown'].loc['top_bot_comb'],calmarRatio['top_bot_comb']]])
totalInfo = pd. DataFrame(index =[sololist[0].split('.')[0]],columns = ['Start_time','End_time','Mono_median','Mono_std','Annualized_return','Annualized_volatility','Sharpe_ratio','Downside_Risk',
                                                     'Sortino_ratio','Max_drawdown','Calmar_Ratio'],data= infodata)
totalInfo.index.name = 'Factor_Name'
try:
    readfacInfo = pd.read_csv(savepath+'/'+'factorInfo.csv',infer_datetime_format=True,parse_dates=[0],index_col=0)
except:  
    readfacInfo = totalInfo
    readfacInfo.to_csv(savepath+'/'+'factorInfo.csv',na_rep='NaN')
factorName = sololist[0].split('.')[0]
if  factorName in readfacInfo.index:
    print factorName+ ' '+'already in the file!'
    try:
        begin = datetime.strptime(readfacInfo.loc[factorName]['Start_time'].split()[0],str_to_datetime_format(readfacInfo.loc[factorName]['Start_time']))
        end = datetime.strptime(readfacInfo.loc[factorName]['End_time'].split()[0],str_to_datetime_format(readfacInfo.loc[factorName]['End_time']))
    except:
        print 'No convert needed!The time type is already python.datetime!'
        begin = readfacInfo.loc[factorName]['Start_time']
        end = readfacInfo.loc[factorName]['End_time']
    print begin,startTime, end, endTime
    if  begin > startTime or  end < endTime:
        print  'Update needed'
        print  readfacInfo.loc[factorName].values 
        print infodata.flatten()
        readfacInfo.loc[factorName] = infodata.flatten()
    updatedInfo = readfacInfo
else:
    updatedInfo = pd.concat([readfacInfo,totalInfo])
    updatedInfo = updatedInfo.rename(columns = {'End_Time': 'End_time'})
updatedInfo.to_csv(savepath+'/'+'factorInfo.csv',na_rep='NaN')


OwnfactorMassIndex already in the file!
2012-01-04 00:00:00 2012-05-04 00:00:00 2017-09-28 00:00:00 2017-02-28 00:00:00

In [414]:
updatedInfo


Out[414]:
Start_time End_time Mono_median Mono_std Annualized_return Annualized_volatility Sharpe_ratio Downside_Risk Sortino_ratio Max_drawdown Calmar_Ratio
Factor_Name
Own_Factor_Turnover_Volatility_deviation_20D 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.927273 0.252882 0.195560 0.168607 1.159855 0.168607 1.159855 0.257361 0.759865
Own_Factor_ILLQ-1d 2010-01-01 00:00:00 2017-02-28 00:00:00 0.987879 0.157828 0.324461 0.261079 1.242771 0.261079 1.242771 0.397588 0.816074
Own_Factor_Idiosyncratic_Volatility 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.903030 0.281116 0.189600 0.158699 1.194719 0.158699 1.194719 0.152969 1.239467
Own_Factor_ADJ_Turnover_Volatility_20D 2010-01-01 00:00:00 2017-02-28 00:00:00 -1.000000 0.220559 0.466077 0.297559 1.566334 0.297559 1.566334 0.624890 0.745854
LZ_GPA_VAL_PE 2010-01-01 00:00:00 2017-02-28 00:00:00 0.757576 0.224231 0.071443 0.207247 0.344724 0.207247 0.344724 0.301485 0.236970
LZ_GPA_VAL_PB 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.115152 0.450342 0.030133 0.230448 0.130760 0.230448 0.130760 0.355927 0.084661
LZ_GPA_VAL_PS 2010-01-01 00:00:00 2017-02-28 00:00:00 0.103030 0.522217 0.007019 0.190525 0.036841 0.190525 0.036841 0.355447 0.019747
LZ_GPA_FIN_IND_QFA_YOYGR 2010-01-01 00:00:00 2017-02-28 00:00:00 0.939394 0.160919 0.087862 0.070906 1.239132 0.070906 1.239132 0.081457 1.078637
LZ_GPA_FIN_IND_YOYBPS 2010-01-01 00:00:00 2017-02-28 00:00:00 0.200000 0.221057 0.016883 0.089912 0.187772 0.089912 0.187772 0.167963 0.100516
LZ_GPA_FIN_IND_QFA_YOYNETPROFIT 2010-01-01 00:00:00 2017-02-28 00:00:00 0.890909 0.102845 0.088731 0.072936 1.216553 0.072936 1.216553 0.057036 1.555711
LZ_GPA_FIN_IND_CAPITALIZEDTODA 2010-01-01 00:00:00 2017-02-28 00:00:00 0.248485 0.478620 0.013869 0.110005 0.126078 0.110005 0.126078 0.279676 0.049590
LZ_GPA_FIN_IND_CASHRATIO 2010-01-01 00:00:00 2017-02-28 00:00:00 0.842424 0.482296 0.072803 0.200644 0.362847 0.200644 0.362847 0.274937 0.264799
LZ_GPA_DERI_Momentum_1M 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.975758 0.219044 0.234231 0.179390 1.305707 0.179390 1.305707 0.211129 1.109419
LZ_GPA_TURNOVER_TurnoverAvg_3M 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.503030 0.424059 0.102717 0.205064 0.500900 0.205064 0.500900 0.284812 0.360647
LZ_GPA_VAL_TURN 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.660606 0.282483 0.334626 0.202912 1.649118 0.202912 1.649118 0.170676 1.960596
Own_Factor_Volatility_90d 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.369697 0.391101 0.065162 0.204425 0.318759 0.204425 0.318759 0.265979 0.244990
Own_Factor_DDA-20d 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.987879 0.057228 0.586765 0.233574 2.512117 0.233574 2.512117 0.277902 2.111408
Own_Factor_Skewness_250d 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.951515 0.312497 0.079775 0.134299 0.594016 0.134299 0.594016 0.255741 0.311939
LZ_GPA_FIN_IND_OCFTODEBT 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.151515 0.343190 0.037943 0.072903 0.520460 0.072903 0.520460 0.121523 0.312229
Own_Factor_Return_Skew_250D 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.915152 0.125775 0.132202 0.113174 1.168130 0.113174 1.168130 0.186287 0.709667
Own_Factor_5_20_return_deviation_1D 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.115152 0.343066 0.034217 0.068324 0.500804 0.068324 0.500804 0.113482 0.301520
Own_Factor_downside_risk_252D 2010-01-01 00:00:00 2017-02-28 00:00:00 0.115152 0.312213 0.063775 0.244886 0.260429 0.244886 0.260429 0.247510 0.257668
Own_Factor_sortino_ratio_20D 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.842424 0.412710 0.148444 0.220652 0.672752 0.220652 0.672752 0.373013 0.397959
Own_Factor_excess_return_20D 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.066667 0.191054 0.021491 0.103385 0.207876 0.103385 0.207876 0.191390 0.112290
Uqer_factor_AD20 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.842424 0.330977 0.131389 0.187849 0.699441 0.187849 0.699441 0.196628 0.668212
Uqer_factor_ADTM 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.903030 0.272842 0.124757 0.162724 0.766680 0.162724 0.766680 0.211954 0.588605
Uqer_factor_ATR6 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.987879 0.079624 0.319641 0.183263 1.744170 0.183263 1.744170 0.129518 2.467921
Uqer_factor_Aroon 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.866667 0.207764 0.151488 0.155398 0.974840 0.155398 0.974840 0.260018 0.582608
Uqer_factor_BIAS10 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.212121 0.255514 0.135854 0.162293 0.837090 0.162293 0.837090 0.236858 0.573566
LZ_GPA_FIN_IND_ROE 2010-01-01 00:00:00 2017-02-28 00:00:00 0.248485 0.541320 0.026381 0.123334 0.213895 0.123334 0.213895 0.223166 0.118211
... ... ... ... ... ... ... ... ... ... ... ...
Uqer_factor_ROC6 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.272727 0.159828 0.125337 0.155992 0.803486 0.155992 0.803486 0.229675 0.545716
Uqer_factor_RVI 2010-01-01 00:00:00 2017-02-28 00:00:00 0.296970 0.269994 0.023267 0.100626 0.231226 0.100626 0.231226 0.176433 0.131877
Uqer_factor_CMO 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.757576 0.314749 0.175174 0.180025 0.973054 0.180025 0.973054 0.238347 0.734953
Uqer_factor_RSI 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.284848 0.419446 0.126249 0.152102 0.830029 0.152102 0.830029 0.151151 0.835251
Uqer_factor_Skewness 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.090909 0.532514 0.055360 0.093224 0.593839 0.093224 0.593839 0.139466 0.396942
Uqer_factor_OBV20 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.963636 0.242911 0.182649 0.232356 0.786073 0.232356 0.786073 0.276910 0.659595
Uqer_factor_MTM 2010-01-01 00:00:00 2017-02-28 00:00:00 0.345455 0.294705 0.114816 0.139492 0.823100 0.139492 0.823100 0.162813 0.705202
Uqer_factor_PVT6 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.478788 0.169562 0.138363 0.163633 0.845574 0.163633 0.845574 0.371262 0.372684
Own_Factor_Non_Linear_Size 2010-01-01 00:00:00 2017-02-28 00:00:00 0.987879 0.103554 0.365225 0.260086 1.404250 0.260086 1.404250 0.363877 1.003706
Own_Factor_Beta_1D 2014-01-01 00:00:00 2017-02-28 00:00:00 -0.321212 0.205558 0.060245 0.280683 0.214638 0.280683 0.214638 0.319325 0.188664
Uqer_factor_REC 2010-01-01 00:00:00 2017-02-28 00:00:00 0.684848 0.222095 0.101425 0.104614 0.969514 0.104614 0.969514 0.117914 0.860165
Uqer_factor_DAREC 2010-01-01 00:00:00 2017-02-28 00:00:00 0.696970 0.190183 0.071891 0.077365 0.929251 0.077365 0.929251 0.086857 0.827696
Uqer_factor_GREC 2010-01-01 00:00:00 2017-02-28 00:00:00 0.321212 0.085382 0.025729 0.059282 0.434019 0.059282 0.434019 0.098773 0.260490
Uqer_factor_REVS20 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.963636 0.293734 0.246413 0.183685 1.341496 0.183685 1.341496 0.215362 1.144179
Uqer_factor_MA10RegressCoeff12 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.503030 0.349971 0.137653 0.177725 0.774531 0.177725 0.774531 0.223596 0.615632
Uqer_factor_REVS5 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.418182 0.195611 0.128630 0.159985 0.804012 0.159985 0.804012 0.233563 0.550731
Uqer_factor_WVAD 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.709091 0.235787 0.187525 0.187516 1.000048 0.187516 1.000048 0.377979 0.496125
Uqer_factor_Hurst 2010-01-01 00:00:00 2017-02-28 00:00:00 0.903030 0.315446 0.047758 0.100909 0.473280 0.100909 0.473280 0.157058 0.304082
Uqer_factor_MassIndex 2010-01-01 00:00:00 2017-02-28 00:00:00 -0.745455 0.103745 0.137922 0.160887 0.857265 0.160887 0.857265 0.218426 0.631438
LZ_CN_STKA_CRD_NET_PRFT_FTTM 2012-01-04 00:00:00 2017-09-28 00:00:00 -0.527273 0.191070 0.187691 0.294209 0.637952 0.294209 0.637952 0.433025 0.216721
OwnfactorMassIndex 2012-01-04 00:00:00 2017-09-28 00:00:00 -0.806061 0.107720 0.085841 0.119966 0.715548 0.119966 0.715548 0.130660 0.328490
OwnfactorNetIn 2012-01-04 00:00:00 2017-09-28 00:00:00 0.090909 0.101865 0.062391 0.144794 0.430898 0.144794 0.430898 0.304817 0.102342
OwnfactorNetInRatio 2012-01-04 00:00:00 2017-09-28 00:00:00 -0.515152 0.215181 0.156945 0.131480 1.193675 0.131480 1.193675 0.140680 0.557807
OwnfactorNetInRatio5d 2012-01-04 00:00:00 2017-09-28 00:00:00 -0.157576 0.190452 0.103416 0.132851 0.778440 0.132851 0.778440 0.160737 0.321694
OwnfactorNetInRatio20d 2012-01-04 00:00:00 2017-09-28 00:00:00 -0.800000 0.198371 0.114064 0.132060 0.863725 0.132060 0.863725 0.211442 0.269729
VolQuantileRatio 2012-01-04 00:00:00 2017-09-28 00:00:00 -0.709091 0.214110 0.094508 0.081041 1.166166 0.081041 1.166166 0.095243 0.496142
OwnFactorInduSpreadBias 2012-05-04 00:00:00 2017-09-28 00:00:00 -0.939394 0.137225 0.234363 0.143745 1.630408 0.143745 1.630408 0.128056 0.915081
OthersSpreadBias222_1D 2012-05-04 00:00:00 2017-09-28 00:00:00 -0.848485 0.192570 0.152392 0.131752 1.156657 0.131752 1.156657 0.128779 0.591682
Othersid2_std_3m_1D 2012-05-04 00:00:00 2017-09-28 00:00:00 -0.872727 0.446710 0.133595 0.207055 0.645215 0.207055 0.645215 0.238415 0.280173
OwnFactorILLIQ 2012-05-04 00:00:00 2017-02-28 00:00:00 0.987879 0.025434 0.415396 0.303977 1.366539 0.303977 1.366539 0.400209 1.037947

65 rows × 11 columns


In [ ]:


In [415]:
'''define the risk preference weight indice before perform kmeans clustering'''


Out[415]:
'define the risk preference weight indice before perform kmeans clustering'

In [416]:
updatedInfo['Mono_median'] = np.abs(updatedInfo['Mono_median'])
mat =updatedInfo.dropna(axis=0).iloc[:,2:].as_matrix()
# Using sklearn
km = KMeans(n_clusters=4)
result = km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
results = pd.DataFrame(data=labels, columns=['cluster'],index = updatedInfo.index)

In [417]:
results.groupby(['cluster'])


Out[417]:
<pandas.core.groupby.DataFrameGroupBy object at 0x000000006360C128>

In [418]:
for name,group in results.groupby(['cluster']):
    print group


                                     cluster
Factor_Name                                 
LZ_GPA_VAL_PE                              0
LZ_GPA_VAL_PB                              0
LZ_GPA_VAL_PS                              0
LZ_GPA_FIN_IND_YOYBPS                      0
LZ_GPA_FIN_IND_CAPITALIZEDTODA             0
LZ_GPA_FIN_IND_CASHRATIO                   0
LZ_GPA_TURNOVER_TurnoverAvg_3M             0
Own_Factor_Volatility_90d                  0
LZ_GPA_FIN_IND_OCFTODEBT                   0
Own_Factor_5_20_return_deviation_1D        0
Own_Factor_downside_risk_252D              0
Own_Factor_excess_return_20D               0
LZ_GPA_FIN_IND_ROE                         0
Own_Factor_ROE_Volatility_60D              0
Own_Factor_ROE_Volatility_120D             0
Uqer_factor_KDJ_K                          0
Uqer_factor_RVI                            0
Uqer_factor_Skewness                       0
Own_Factor_Beta_1D                         0
Uqer_factor_GREC                           0
OwnfactorNetIn                             0
                                              cluster
Factor_Name                                          
Own_Factor_Turnover_Volatility_deviation_20D        1
Own_Factor_ILLQ-1d                                  1
Own_Factor_Idiosyncratic_Volatility                 1
Own_Factor_ADJ_Turnover_Volatility_20D              1
LZ_GPA_FIN_IND_QFA_YOYGR                            1
LZ_GPA_FIN_IND_QFA_YOYNETPROFIT                     1
LZ_GPA_DERI_Momentum_1M                             1
Own_Factor_Return_Skew_250D                         1
Own_Factor_ROE_Volatility_250D                      1
Own_Factor_Non_Linear_Size                          1
Uqer_factor_REVS20                                  1
OwnFactorInduSpreadBias                             1
OthersSpreadBias222_1D                              1
OwnFactorILLIQ                                      1
                    cluster
Factor_Name                
LZ_GPA_VAL_TURN           2
Own_Factor_DDA-20d        2
Uqer_factor_ATR6          2
                                cluster
Factor_Name                            
Own_Factor_Skewness_250d              3
Own_Factor_sortino_ratio_20D          3
Uqer_factor_AD20                      3
Uqer_factor_ADTM                      3
Uqer_factor_Aroon                     3
Uqer_factor_BIAS10                    3
Uqer_factor_CCI10                     3
Uqer_factor_ROC6                      3
Uqer_factor_CMO                       3
Uqer_factor_RSI                       3
Uqer_factor_OBV20                     3
Uqer_factor_MTM                       3
Uqer_factor_PVT6                      3
Uqer_factor_REC                       3
Uqer_factor_DAREC                     3
Uqer_factor_MA10RegressCoeff12        3
Uqer_factor_REVS5                     3
Uqer_factor_WVAD                      3
Uqer_factor_Hurst                     3
Uqer_factor_MassIndex                 3
LZ_CN_STKA_CRD_NET_PRFT_FTTM          3
OwnfactorMassIndex                    3
OwnfactorNetInRatio                   3
OwnfactorNetInRatio5d                 3
OwnfactorNetInRatio20d                3
VolQuantileRatio                      3
Othersid2_std_3m_1D                   3

In [419]:
(maxddInfo1-maxddInfo)


Out[419]:
Max_drawdown Time
group_0 0.084346 306 days
group_1 -0.041652 0 days
group_2 -0.042662 0 days
group_3 -0.034760 -518 days
group_4 -0.018473 214 days
group_5 -0.011088 0 days
group_6 -0.034949 0 days
group_7 -0.026979 0 days
group_8 -0.051053 31 days
group_9 -0.014778 0 days
top_bot_comb 0.128706 336 days

In [420]:
#--------------------------------------  Section End   -------------------------------------

In [421]:
industryDict = getIndustryDict(endOfMonthList,grouplabel,indusDF)

In [422]:
capDict=getGroupsbyCap(LFCAPDF,endOfMonthList)

In [423]:
templist=[endOfMonthList,['mean','median','std']]
capGroupsDF = pd.DataFrame(data=None, columns=['small','mid','huge'],index=pd.MultiIndex.from_product(templist,names=['time','stats']),dtype=float)
indusGroupDF  = pd.DataFrame(data=None, columns=grouplabel,index=pd.MultiIndex.from_product(templist,names=['time','stats']),dtype=float)

In [424]:
for date in endOfMonthList:
    factorindice = factorData.loc[date]
    smallindice = factorindice.loc[list(set(factorindice.index) & set(capDict[date][0]))]
    midindice = factorindice.loc[list(set(factorindice.index) & set(capDict[date][1]))]
    hugeindice = factorindice.loc[list(set(factorindice.index) & set(capDict[date][2]))]
    capGroupsDF.loc[date,'small'] = [smallindice.mean(),smallindice.median(),smallindice.std()]
    capGroupsDF.loc[date,'mid'] = [midindice.mean(),midindice.median(),midindice.std()]
    capGroupsDF.loc[date,'huge'] = [hugeindice.mean(),hugeindice.median(),hugeindice.std()]
    for i in grouplabel:
        #print grouplabel
        inDFIndice = factorindice.loc[list(set(factorindice.index) & set(industryDict[date][i]))]
        indusGroupDF.loc[date,i] = [inDFIndice.mean(),inDFIndice.median(),inDFIndice.std()]
indusGroupDF.rename(columns = {i:u''+explanationDict[i]+'' for i in grouplabel},inplace=True)

In [425]:
indusGroupDF


Out[425]:
农林牧渔 计算机 食品饮料 钢铁 汽车 纺织服装 电力及公用事业 通信 房地产 传媒
time stats
2012-05-31 mean 24.337617 23.686263 24.735306 24.344537 23.219667 24.362898 24.885194 23.726326 24.181000 23.469804
median 25.033000 23.794000 24.629000 24.819500 24.082000 24.407000 24.648000 23.914500 24.255000 23.739000
std 3.425975 3.316913 1.341010 2.993172 3.138850 1.337348 1.478817 2.867441 1.504675 2.484472
2012-06-29 mean 24.049033 24.438169 25.337385 24.971494 25.417000 24.465388 24.410838 24.453775 23.692250 24.802733
median 24.467500 24.609000 25.207000 25.354000 24.460000 24.093000 24.452000 24.620000 24.495000 24.951000
std 3.294212 2.884764 1.331104 3.300744 3.099510 1.295921 0.906842 2.464229 4.068524 2.950744
2012-07-31 mean 25.012683 24.397364 25.376737 24.065238 24.932600 24.764660 24.500087 25.053746 24.046125 24.606566
median 25.323500 24.914000 25.371000 24.348500 24.820000 24.870000 24.838000 25.233500 25.125000 24.930500
std 3.411653 3.219743 1.034335 3.021236 0.927553 1.452712 3.037387 2.394036 4.635993 2.755424
2012-08-31 mean 23.826983 24.902727 24.390797 24.147048 24.584467 24.670240 24.657214 25.243840 22.160875 25.151645
median 24.031000 25.130000 24.567000 24.325000 24.711000 24.539000 24.778000 25.317000 23.498500 25.026000
std 3.202584 3.012399 1.455852 3.377661 0.931028 1.121221 2.906055 1.441953 5.106984 1.244328
2012-09-28 mean 24.203968 25.234064 24.617407 24.377282 26.866933 25.371580 25.570243 25.273444 24.881917 25.525898
median 24.557000 25.669500 24.636000 25.000000 26.845000 25.333000 25.864500 25.157500 25.981500 25.468500
std 3.244645 3.419265 1.316226 3.663541 1.162081 0.942134 3.136475 1.309546 5.067581 1.176676
2012-10-31 mean 24.261524 23.417013 24.514966 24.467659 22.876240 24.536120 23.411740 24.001241 24.408042 23.831798
median 24.644000 23.830000 24.532000 24.549000 22.718000 24.426500 23.661000 23.862000 24.544500 23.740000
std 3.267764 3.371744 1.505734 2.987165 0.936655 1.235272 2.731695 1.095639 1.130011 0.892371
2012-11-30 mean 24.361556 24.189640 24.542000 24.871176 24.981360 24.713080 24.174877 24.524931 24.558875 24.399798
median 24.946000 24.384000 24.599000 25.051000 24.969000 24.675500 24.471000 24.498000 24.482000 24.382000
std 3.437210 3.100903 1.977920 2.901626 0.889478 0.998118 3.094199 1.095534 0.548979 1.359754
2012-12-31 mean 24.966540 24.961000 25.319017 24.811588 25.455280 25.850740 25.409534 25.508648 26.849542 25.415780
median 25.414000 25.603000 25.464000 25.160000 25.456000 25.838500 26.133000 25.764000 26.809000 25.805000
std 4.277904 3.869975 2.054867 3.036613 0.902596 1.081051 4.016038 2.548526 0.837630 2.732331
2013-01-31 mean 24.786484 25.003946 25.911335 25.108321 28.097080 25.897360 24.447689 25.490152 25.613708 25.894560
median 24.948000 25.799000 26.018000 25.884000 27.883000 25.889000 24.825000 25.612000 25.471000 25.802000
std 3.406462 4.783559 2.254950 4.246043 1.238551 0.971628 3.747251 2.600675 0.612375 1.759975
2013-02-28 mean 24.839391 24.062257 25.629199 24.567321 24.141360 24.645040 24.133622 24.319752 25.475250 24.313073
median 24.907500 24.483500 25.593500 25.080000 24.562000 24.715000 24.166500 24.601000 25.363500 24.599000
std 3.376781 3.943873 1.025312 3.835331 3.017310 1.129440 3.118832 2.776262 1.071369 2.857305
... ... ... ... ... ... ... ... ... ... ... ...
2016-05-31 mean 23.787231 21.382703 22.737551 23.129189 23.395295 23.991833 21.687901 22.232957 23.092567 22.489236
median 24.712000 23.846500 23.702000 23.830000 24.000500 24.444000 23.612000 24.141000 23.631500 23.728500
std 5.423114 7.332413 5.125924 4.871237 3.799454 3.669611 6.989874 6.603666 4.118131 5.834218
2016-06-30 mean 23.992615 23.460230 23.649915 23.374766 24.207591 24.100625 22.501531 23.200700 24.446600 24.164358
median 24.407000 24.965000 24.605500 24.327000 25.345000 24.447000 25.001000 25.379500 25.670000 25.287500
std 4.140543 6.052281 4.866484 5.077175 5.033465 3.763871 7.346662 7.502333 4.785742 5.381772
2016-07-29 mean 24.126385 23.260284 24.434751 23.809126 23.978591 24.140104 22.787634 21.358836 23.766300 23.803689
median 24.509000 24.625500 25.065000 24.769000 24.760500 24.622500 24.454000 23.662000 24.279000 24.474500
std 3.712723 5.699705 4.485169 4.620684 4.776274 3.562405 6.830782 6.856474 4.442911 4.522847
2016-08-31 mean 24.222308 21.860541 24.075667 23.962144 22.785500 23.101479 22.136268 22.402657 24.936000 23.352321
median 23.824000 23.634000 24.459000 24.802000 23.613000 24.006000 22.760500 23.598000 24.612000 23.525000
std 2.633080 6.396071 3.702042 4.757423 4.650881 3.961364 5.039416 6.656483 1.302665 3.141290
2016-09-30 mean 24.203169 22.406608 23.671198 23.403649 22.536773 23.770854 22.815768 22.708579 22.781567 23.561708
median 23.921000 23.831500 24.105000 24.437000 23.807000 24.166000 23.974000 23.987500 22.629000 23.733500
std 1.570329 6.293223 3.656099 5.466778 5.198556 3.621806 5.581251 5.984522 1.407915 2.759976
2016-10-31 mean 25.366877 22.433054 24.050729 23.376550 22.848864 24.073042 22.709902 22.693736 24.242333 24.333019
median 25.589000 24.227000 24.374000 24.159000 24.286500 24.530000 24.601000 23.908500 25.041000 24.624000
std 1.778419 6.633661 3.149959 4.927928 5.524703 4.421399 6.207208 6.222161 2.974933 3.116715
2016-11-30 mean 24.622846 22.657243 24.518282 24.592441 22.562364 24.324979 24.016012 23.236343 25.287500 24.403189
median 24.899000 24.377000 24.769000 24.970000 24.297500 24.905000 25.783000 24.499500 26.748000 25.214000
std 2.928304 6.654210 3.338642 3.651973 6.042042 3.837744 6.756742 6.144480 5.788998 3.700156
2016-12-30 mean 25.236538 23.290703 24.629362 24.620405 22.666318 24.899729 21.487159 23.620271 22.459200 23.786047
median 24.936000 24.716000 24.987000 24.917000 25.770000 24.962000 23.701000 24.753000 23.348000 24.976000
std 1.391776 6.578757 2.972297 3.189430 7.730322 2.398340 6.989612 5.147242 4.182209 4.891178
2017-01-26 mean 24.119062 22.919933 24.282326 24.382124 23.686864 23.940191 22.584321 23.255316 23.740581 23.496514
median 24.451000 24.507000 24.651000 24.684000 25.136000 24.432000 24.692000 24.663500 23.736000 24.791000
std 2.121641 5.817083 2.942083 2.491236 5.965606 3.212742 6.485829 5.874814 2.487030 5.425072
2017-02-28 mean 23.561785 22.102133 23.692247 23.728991 22.031614 23.206617 22.395494 22.320890 25.365844 22.823075
median 24.092000 23.837000 24.177000 23.932000 23.199000 24.173000 24.530000 23.787000 25.058500 23.876500
std 4.016014 5.699899 3.504035 2.929973 5.716032 4.368083 7.261793 5.437966 2.883582 5.130999

174 rows × 10 columns


In [426]:
capGroupsDF.head()


Out[426]:
small mid huge
time stats
2012-05-31 mean 23.884430 23.723938 24.270578
median 23.929000 24.031000 24.438500
std 2.029463 3.270327 2.561568
2012-06-29 mean 24.587646 24.126187 24.553639
median 24.747000 24.447000 24.631000

In [ ]:


In [ ]:


In [ ]:


In [427]:
newstack = capGroupsDF.stack().unstack('stats').reset_index()
newstack = newstack.rename(columns = {'level_1':'cap'})
newstack = newstack.pivot_table(index='time',columns='cap')

In [428]:
newstack.head()


Out[428]:
stats mean median std
cap huge mid small huge mid small huge mid small
time
2012-05-31 24.270578 23.723938 23.884430 24.4385 24.0310 23.9290 2.561568 3.270327 2.029463
2012-06-29 24.553639 24.126187 24.587646 24.6310 24.4470 24.7470 2.221958 3.070402 2.145529
2012-07-31 24.999228 24.994384 24.846234 25.1550 25.2250 24.9595 2.439171 2.793392 2.156882
2012-08-31 24.251235 24.438690 24.539629 24.3110 24.6590 24.7230 2.091846 2.986278 2.277619
2012-09-28 25.232045 24.778303 25.052839 25.2830 25.0935 25.0930 2.250390 3.119188 2.021150

In [429]:
fig, axs = plt.subplots(3,1, figsize=(16, 10), facecolor='w', edgecolor='k',sharex=True)
fig.subplots_adjust(hspace = .5, wspace=.001)
for label,num in zip(newstack.columns.levels[0],range(len(newstack.columns.levels[0]))):
    newstack[label].plot(ax=axs[num],legend=False)
    axs[num].set_title(label)
axs[0].legend()


Out[429]:
<matplotlib.legend.Legend at 0x38008f98>

In [430]:
newstackInd = indusGroupDF.stack().unstack('stats').reset_index()
newstackInd = newstackInd.rename(columns = {'level_1':'industry'})
newstackInd = newstackInd.pivot_table(index='time',columns='industry')

In [431]:
fig, axs = plt.subplots(3,1, figsize=(22, 16), facecolor='w', edgecolor='k',sharex=True)
fig.subplots_adjust(hspace = .5, wspace=.001)
for label,num in zip(newstackInd.columns.levels[0],range(len(newstackInd.columns.levels[0]))):
    newstackInd[label].plot(ax=axs[num],color=sns.color_palette("Paired",10),legend=False)
    axs[num].set_title(label)
axs[0].legend()


Out[431]:
<matplotlib.legend.Legend at 0x6360c438>

In [432]:
capcopy=capGroupsDF.copy()
copydata=capcopy.reset_index()

In [433]:
copydata.pivot_table(index='time',columns='stats').head()


Out[433]:
huge mid small
stats mean median std mean median std mean median std
time
2012-05-31 24.270578 24.4385 2.561568 23.723938 24.0310 3.270327 23.884430 23.9290 2.029463
2012-06-29 24.553639 24.6310 2.221958 24.126187 24.4470 3.070402 24.587646 24.7470 2.145529
2012-07-31 24.999228 25.1550 2.439171 24.994384 25.2250 2.793392 24.846234 24.9595 2.156882
2012-08-31 24.251235 24.3110 2.091846 24.438690 24.6590 2.986278 24.539629 24.7230 2.277619
2012-09-28 25.232045 25.2830 2.250390 24.778303 25.0935 3.119188 25.052839 25.0930 2.021150

In [434]:
fig, axs = plt.subplots(3,1, figsize=(16, 10), facecolor='w', edgecolor='k',sharex=True)
fig.subplots_adjust(hspace = .5, wspace=.001)
for label,num in zip(set(copydata['stats']),range(len(set(copydata['stats'])))):
    #print label,num
    dataslice = copydata[copydata['stats']==label]
    dataslice.plot(ax=axs[num])
    axs[num].set_title(label)



In [ ]: