The environment variable DB_URI points to the database that SqlEngine connects to.
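If DB_URI is not already set in the shell, it can be assigned before building the engine; the URI below is only a placeholder (assuming a SQLAlchemy-style connection string), not a real credential:

import os
os.environ['DB_URI'] = 'postgresql+psycopg2://user:password@localhost:5432/alpha'  # placeholder only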
In [1]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
from PyFin.api import *
from alphamind.api import *

factor = 'CFO2EV'             # the alpha factor under study
universe = Universe('zz800')  # CSI 800 universe
start_date = '2010-01-01'
end_date = '2018-04-26'
freq = '20b'                  # rebalance every 20 business days
category = 'sw_adj'           # adjusted Shenwan (SW) industry classification
level = 1
horizon = map_freq(freq)      # forward-return horizon implied by the rebalance frequency

engine = SqlEngine(os.environ['DB_URI'])
ref_dates = makeSchedule(start_date, end_date, freq, 'china.sse')  # Shanghai exchange calendar

sample_date = '2018-01-04'
sample_codes = engine.fetch_codes(sample_date, universe)
sample_industry = engine.fetch_industry(sample_date, sample_codes, category=category, level=level)
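As a quick sanity check (not shown in the original), the rebalance schedule and return horizon can be inspected directly:

print(len(ref_dates), ref_dates[0], ref_dates[-1])  # number of rebalance dates and their range
print(horizon)                                      # forward-return window in business days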
In [2]:
sample_industry.head()
Out[2]:
In [3]:
# Raw cross-sectional quantiles over the whole universe
factor1 = {'f1': CSQuantiles(factor)}
sample_factor1 = engine.fetch_factor(sample_date, factor1, sample_codes)
sample_factor1 = pd.merge(sample_factor1, sample_industry[['code', 'industry']], on='code')
In [4]:
sample_factor1.sort_values('f1', ascending=False).head(15)
Out[4]:
For the raw factor, without any industry-level processing, we find that stocks with large values of our chosen alpha factor CFO2EV are concentrated in banks and the broader financials sector.
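One way to quantify this concentration (a quick check, not in the original) is to count industries among the top-ranked names:

# industry counts among the 50 highest-ranked stocks under the raw quantile
sample_factor1.sort_values('f1', ascending=False).head(50)['industry'].value_counts()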
Here we use the adjusted Shenwan (SW) industry classification as the industry label:
In [5]:
# Quantiles computed within each adjusted SW level-1 industry
factor2 = {'f2': CSQuantiles(factor, groups='sw1_adj')}
sample_factor2 = engine.fetch_factor(sample_date, factor2, sample_codes)
sample_factor2 = pd.merge(sample_factor2, sample_industry[['code', 'industry']], on='code')
sample_factor2.sort_values('f2', ascending=False).head(15)
Out[5]:
With quantile ranking within each industry, the top-ranked names are spread much more evenly across industries.
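The same count on the within-industry quantile confirms the flatter distribution:

# industry counts among the top 50 under the within-industry quantile
sample_factor2.sort_values('f2', ascending=False).head(50)['industry'].value_counts()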
Another approach is linear regression: treat industries as dummy variables and use the regression residuals as the factor's replacement values, making it industry neutral:
In [6]:
# Fetch the raw factor values plus the risk-model industry exposures
factor3 = {'f3': factor}
sample_factor3 = engine.fetch_factor(sample_date, factor3, sample_codes)
risk_cov, risk_exp = engine.fetch_risk_model(sample_date, sample_codes)
sample_factor3 = pd.merge(sample_factor3, sample_industry[['code', 'industry']], on='code')
sample_factor3 = pd.merge(sample_factor3, risk_exp, on='code')
In [7]:
raw_factors = sample_factor3['f3'].values
# Industry dummies plus the country factor from the risk model
industry_exp = sample_factor3[industry_styles + ['COUNTRY']].values.astype(float)
# Regress out the industry exposures, then map the residuals to percentile ranks
processed_values = factor_processing(raw_factors,
                                     pre_process=[],
                                     risk_factors=industry_exp,
                                     post_process=[percentile])
sample_factor3['f3'] = processed_values
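Under the hood, neutralizing against industry exposures is just an OLS projection: regress the factor on the dummies and keep the residuals. A minimal numpy sketch of that step (a hypothetical helper, not alpha-mind's actual implementation):

import numpy as np
import pandas as pd

def neutralize(factor_values: np.ndarray, industries: pd.Series) -> np.ndarray:
    """Regress factor values on industry dummy variables and return the residuals."""
    dummies = pd.get_dummies(industries).values.astype(float)
    beta, *_ = np.linalg.lstsq(dummies, factor_values, rcond=None)
    return factor_values - dummies @ beta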
In [8]:
sample_factor3 = sample_factor3[['code', 'f3', 'industry']]
sample_factor3.sort_values('f3', ascending=False).head(15)
Out[8]:
We find that this method does not work very well here: the adjustment is small, and the concentration in the financials sector persists.
To compare the three approaches, we use a simple equal-weight scheme that goes long the top 20% of stocks and shorts the bottom 20%.
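The weighting rule, isolated as a small standalone helper for clarity (a sketch; the backtest loop below inlines the same logic):

import pandas as pd

def long_short_weights(quantiles: pd.Series) -> pd.Series:
    """+1 on the top 20%, -1 on the bottom 20%, scaled to unit gross exposure."""
    w = pd.Series(0., index=quantiles.index)
    w[quantiles >= 0.8] = 1.
    w[quantiles <= 0.2] = -1.
    return w / w.abs().sum()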
In [9]:
factors = {
    'raw': CSQuantiles(factor),
    'peer quantile': CSQuantiles(factor, groups='sw1_adj'),  # matching the adjusted SW labels used above
    'risk neutral': LAST(factor)  # raw values; neutralized inside the loop below
}
In [10]:
df_ret = pd.DataFrame(columns=['raw', 'peer quantile', 'risk neutral'])
df_ic = pd.DataFrame(columns=['raw', 'peer quantile', 'risk neutral'])

for date in ref_dates:
    ref_date = date.strftime('%Y-%m-%d')
    codes = engine.fetch_codes(ref_date, universe)
    total_factor = engine.fetch_factor(ref_date, factors, codes)
    risk_cov, risk_exp = engine.fetch_risk_model(ref_date, codes)
    industry = engine.fetch_industry(ref_date, codes, category=category, level=level)
    # Forward returns over the horizon, offset one day to avoid look-ahead
    rets = engine.fetch_dx_return(ref_date, codes, horizon=horizon, offset=1)

    total_factor = pd.merge(total_factor, industry[['code', 'industry']], on='code')
    total_factor = pd.merge(total_factor, risk_exp, on='code')
    total_factor = pd.merge(total_factor, rets, on='code').dropna()

    # Neutralize the raw values against industry dummies + country, as before
    raw_factors = total_factor['risk neutral'].values
    industry_exp = total_factor[industry_styles + ['COUNTRY']].values.astype(float)
    processed_values = factor_processing(raw_factors,
                                         pre_process=[],
                                         risk_factors=industry_exp,
                                         post_process=[percentile])
    total_factor['risk neutral'] = processed_values

    # Long the top 20%, short the bottom 20%, equal weight, unit gross exposure
    total_factor[['f1_d', 'f2_d', 'f3_d']] = (total_factor[['raw', 'peer quantile', 'risk neutral']] >= 0.8) * 1.
    total_factor.loc[total_factor['raw'] <= 0.2, 'f1_d'] = -1.
    total_factor.loc[total_factor['peer quantile'] <= 0.2, 'f2_d'] = -1.
    total_factor.loc[total_factor['risk neutral'] <= 0.2, 'f3_d'] = -1.
    total_factor[['f1_d', 'f2_d', 'f3_d']] /= np.abs(total_factor[['f1_d', 'f2_d', 'f3_d']]).sum(axis=0)

    # Period returns of the three long-short portfolios and the ICs vs. the forward return dx
    ret_values = total_factor.dx.values @ total_factor[['f1_d', 'f2_d', 'f3_d']].values
    df_ret.loc[date] = ret_values
    ic_values = total_factor[['dx', 'raw', 'peer quantile', 'risk neutral']].corr().values[0, 1:]
    df_ic.loc[date] = ic_values
    print(f"{date} is finished")
In [11]:
df_ret.cumsum().plot(figsize=(14, 7))  # cumulative long-short returns of the three methods
Out[11]:
In [12]:
df_ic.cumsum().plot(figsize=(14, 7))  # cumulative ICs of the three methods
Out[12]:
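As a quick numeric summary (not computed in the original notebook), the IC series can be collapsed into a mean and a naive t-statistic per method:

print(df_ic.mean())
print(df_ic.mean() / df_ic.std() * np.sqrt(len(df_ic)))  # mean IC / std * sqrt(N)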