Peter (mr.peter.lee@hotmail.com)
Last Update: 31 Dec 2015
Version: 1.0
The file (Analysis.ipynb) is write-protected to prevent accidental changes. To make changes or execute code blocks, please make a copy of the file by double-clicking it:
How to inspect the dataset quickly?
[x] completed [ ] to-do
To get an overview of the data first:
Daily holdings of all funds
In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from pandas import Series, DataFrame, Panel, datetime
from datetime import timedelta
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
%matplotlib inline
import io
import os
import random
import statsmodels.stats.api as sms
from warnings import warn
from pylab import rcParams
rcParams['figure.figsize'] = 10, 5 # Change default plot size
from ipywidgets import FloatProgress # Progress bar - f = FloatProgress(min=0, max=1); display(f); then in a loop: f.value = percentage
from IPython.display import display # Progress bar
# ****************** Program Settings ******************
Data_Folder = "data/new/"
Output_Folder = "data/output/"
Datafile_Main = "fin_mf_trades_1995_2014_fid.dta" # The main dataset - mutual fund holdings
Datafile_stocks = "compustat_finland_1990_2015_all.dta" # The dataset - stock prices / dividends
# Calculate Daily Holdings
Balanced_Panel = True # Default: True
# If True, add extra entries to the holdings data so that there is at least one observation per day:
#   a day with no transactions gets 1 (filler) entry;
#   a day with n transactions (n > 0) keeps its n entries.
Dividends_Reinvested = True # True if dividends are assumed to be fully reinvested; False otherwise.
# Note that Barber's method excludes dividends
Adjust_Price_Before_1999 = True # If True, divide 'price' before 1999 by 5.94573 (the official FIM-per-EUR conversion rate)
Vars_To_Keep = ['fund_id', 'isin', 'seq', 'acc', 'sect', 'owntype', 'legtype', 'ref_code',
'buy_sell', 'volume', 'price', 'date']
# To reduce memory usage, keep only the key variables in the data.
# Note that 'date' is actually 'trade_date'.
# ******************************************************
"data_main" is the mutual fund holdings dataset.
In [3]:
data_main = pd.read_stata(Data_Folder + Datafile_Main)
data_main.columns = [var.lower() for var in data_main.columns]
# There are several datetime variables -> remove the unused ones
data_main = data_main.drop(['date', 'tr_date', 'trans_date'], axis=1)
# Convert trade_date to datetime
data_main['date'] = pd.to_datetime(data_main.trade_date)
data_main = data_main.sort_index(axis=1)
# Generate an index for multiple transactions within the same day to preserve their order
def add_seq(grp):
"""Add the variable 'seq' to indicate the order of transactions within a day"""
grp['seq'] = np.arange(1, len(grp) + 1)
return grp
data_main = data_main.groupby(by=['fund_id', 'isin', 'date'], as_index=False).apply(add_seq)
data_main = data_main.sort_values(by=['fund_id', 'isin', 'date', 'seq', 'buy_sell'])
# data_main = data_main.ix[0:10000]
data_main.head(10)
Out[3]:
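As an aside, the groupby-apply above can be replaced by a vectorized one-liner; cumcount is standard pandas and is typically much faster on data of this size (a sketch, not used in the analysis below):
data_main['seq'] = data_main.groupby(['fund_id', 'isin', 'date']).cumcount() + 1  # same within-day ordering index, no Python-level apply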
In [4]:
def sum_stats(col):
"""Produce a table of summary statistics for a numerical variable"""
count = col.count()
mean = col.mean()
median = col.median()
stddev = col.std()
skewness = col.skew()
kurtosis = col.kurt()
quantile1 = col.quantile(0.25)
quantile2 = col.quantile(0.5)
quantile3 = col.quantile(0.75)
first_obs_index = col.first_valid_index()
last_obs_index = col.last_valid_index()
return pd.Series([count, mean, median, stddev, skewness, kurtosis, quantile1, quantile2, quantile3, first_obs_index, last_obs_index],
index=['count', 'mean', 'median', 'stddev', 'skewness', 'kurtosis', 'quantile1', 'quantile2', 'quantile3', 'first_obs_index', 'last_obs_index']
)
print("Descriptive Statistics for Numerical Variables:")
stats = data_main[['volume', 'price', 'prccd', 'ajexdi', 'cshoc', 'date']].sort_values(by=['date']).drop('date', axis=1).apply(sum_stats).T
# Fix up the dates
stats['first_obs'] = data_main.date[stats.T.ix['first_obs_index']].values
stats['last_obs'] = data_main.date[stats.T.ix['last_obs_index']].values
stats
Out[4]:
In [5]:
def_class = {
'HH': 'Household',
'FI': 'Financial institution (domestic)',
'FR': 'Foreign with own account at euroclear',
'FI': 'Foreign nominee account where most foreign holdings are registered',  # NOTE: duplicate key -- this silently overwrites 'Financial institution (domestic)' above; the intended code is unclear
'NF': 'Non-financial',
'GG': 'General government',
'NP': 'Non-for-profit',
'NA': "Not Available"
}
def_legtype = {
11 : 'NATURAL PERSON',
12 : 'ESTATE OF A DECEASED PERSON',
13 : 'TAXATION CONSORTIUM',
14 : 'PARTNERSHIP',
15 : "BANKRUPT'S ESTATE",
21 : 'LIMITED PARTNERSHIP',
22 : 'LAIVAISÄNNISTÖYHTIÖ (EI OSAKEYHTIÖ)',
30 : 'PUBLIC LIMITED COMPANY',
31 : 'LIMITED COMPANY',
32 : 'MUTUAL INSURANCE COMPANY',
33 : 'SAVINGS BANK',
34 : 'PENSION TRUST OR FUND',
35 : 'HOUSING CORPORATION',
41 : 'COOPERATIVE',
51 : 'FOUNDATION, FUND',
52 : 'NONPROFIT ASSOCIATION',
53 : 'MUTUAL INDEMNITY INSURANCE ASSOCIATION',
54 : 'ECONOMIC ASSOCIATION',
61 : 'PUBLIC AUTHORITY',
62 : 'PUBLIC UTILITY',
63 : 'PUBLIC CORPORATION',
71 : 'STATE CHURCH',
72 : 'OTHER RELIGIOUS COMMUNITY',
90 : 'OTHER JURIDICAL FORM',
'NA': "Not Available"
}
def_ref_code = {
1 : 'MARKETPLACE TRADE',
2 : 'OTHER TRADE',
3 : 'TRANSFER TO BOOKENTRIES',
4 : 'TRANSFER TO OWN ACCOUNT',
5 : 'CONVERSION',
6 : 'BEQUEST',
7 : 'INHERITANCE',
8 : 'WILL',
9 : 'DIVISION OF THE PROPERTY OF THE SPOU',
10 : 'FORCED SALE',
11 : 'TRANSFER INT. CERTIFICATE OF OWN',
12 : 'TRANSFER OF INTERNAT. CERTIFICATE',
13 : 'TRANSFER TO NOM.REG BE ACCOUNT',
14 : 'TRANSFER FROM NOM.REG BE ACCOUNT',
15 : 'TRADEBYTRADE SETTLEMENT',
16 : 'DIRECT SELLING',
17 : 'TRANSFER OF COLLATERAL',
18 : 'RETURN OF COLLATERAL',
19 : 'CANCELLATION OF RELATIONSHIP BASED O',
20 : 'ISSUE ACCOUNT TRANSACTION',
50 : 'RIGHTS ISSUE',
51 : 'BONUS ISSUE',
52 : 'SUBSCRIPTION',
53 : 'FORFEITURE',
54 : 'REDEMPTION',
55 : 'SPLIT',
56 : 'MERGER',
57 : 'COMBINATION',
58 : 'CANCELLATION OF A SECURITY',
59 : 'OTHER SUBSCRIPTION',
60 : 'SUBSCRIPTION RIGHT',
61 : 'TRANSFER OF RECORD TIME',
62 : 'SECURITY TYPE CONVERSION',
63 : 'CANCELLING OF SECURITY TYPE LOAN',
64 : 'REVERSE SPLIT',
65 : 'CANCELLATION OF SUBSCRIPTION',
71 : 'CONVERSION OF CONVERTIBLE BOND',
72 : 'SUBSCRIPTION WITH WARRANT',
73 : 'EXECUTION OF WARRANT',
75 : 'DEMERGER',
76 : 'STOCK LOAN',
77 : 'RETURN OF STOCK LOAN',
78 : 'RETURN OF WARRANTS',
96 : 'ACCOUNT TRANSFER',
97 : 'ARCHIVING',
98 : 'CORRECTION',
99 : 'OTHER',
0 : 'Not Available',
'NA': "Not Available"
}
In [6]:
def tab(col, desc=None):
"""Create a simple tabluation table for a categorical variable."""
obs = len(col)
# convert .value_counts() to a DataFrame
df = pd.DataFrame(col.value_counts()).reset_index()
df.columns = ['code', 'obs']
if desc is not None:
        df['desc'] = df.code.apply(lambda x: desc.get(x, "Unknown code"))
else:
df['desc'] = np.nan
df.sort_index(axis=1, inplace=True)
total_freq = np.sum(df.obs)
df['percent'] = df['obs'] / total_freq * 100
    # report non-missing observations as the last row
    df.loc[len(df)] = [np.nan, 'non-missing obs (percent is relative to total obs)', col.count(), col.count() / obs * 100]
return df
In [7]:
var = 'class'
dct = def_class
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[7]:
In [8]:
var = 'legtype'
dct = def_legtype
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[8]:
In [9]:
var = 'ref_code'
dct = def_ref_code
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[9]:
In [10]:
def_buy_sell = {10: "buy", 20: "sell",
                11: "unclear (treated as a buy below)",
                21: "unclear (treated as a sell below)"}
var = 'buy_sell'
dct = def_buy_sell
print("Summary statistics of variable: {}".format(var))
warn("Code 11, Code 21's definitions are unclear.")
tab(data_main[var], dct)
Out[10]:
In [11]:
var = 'owntype'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[11]:
In [12]:
var = 'sect'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[12]:
In [13]:
var = 'currency'
dct = None
print("Summary statistics of variable: {}".format(var))
print('20% of transactions are denominated in FIM.')
tab(data_main[var], dct)
Out[13]:
In [14]:
var = 'byear'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[14]:
In [15]:
var = 'gender'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[15]:
In [16]:
var = 'pcode'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[16]:
In [17]:
var = 'country'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[17]:
In [18]:
var = 'language'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[18]:
In [19]:
var = 'nom_ind'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[19]:
In [20]:
grouped = data_main.groupby('fund_id', as_index=False, axis=0)
list_of_numerical_vars = ['volume', 'price', 'prccd', 'ajexdi', 'cshoc', 'date']
list_of_categorical_vars = ['class', 'legtype', 'ref_code', 'buy_sell', 'sect', 'gender', 'pcode']
dict_of_categorical_vars = {'class': def_class,
'legtype': def_legtype,
'ref_code': def_ref_code,
'buy_sell': def_buy_sell,
'sect': None,
'gender': None,
'pcode': None}
In [21]:
for name, group in grouped:
print("********** Fund: {} **********".format(name))
print("Numerical variables:")
print(group[list_of_numerical_vars].describe().T)
print("\n")
print("Categorical variables:")
for var in list_of_categorical_vars:
print("- Variable: {}".format(var))
print(tab(group[var], dict_of_categorical_vars[var]).to_string(index=False))
print("\n")
print("\n")
print("\n")
In [22]:
# Resampling from daily to annual frequency
# Down-sampling rule: some variables are averaged over the year; others are summed.
vars_compute_average = ['price', 'prccd', 'ajexdi', 'cshoc']
vars_compute_sum = ['buy_value', 'sell_value', 'buy_volume', 'sell_volume', 'buy', 'sell']
In [23]:
ts_main = data_main  # NB: a reference, not a copy -- the columns generated below are also added to data_main (reused in later cells)
# Generate variables
ts_main['buy'] = (ts_main.buy_sell == 10) | (ts_main.buy_sell == 11)
ts_main['sell'] = (ts_main.buy_sell == 20) | (ts_main.buy_sell == 21)
ts_main['buy_volume'] = np.nan
ts_main.ix[ts_main['buy']==True, 'buy_volume'] = ts_main['volume']
ts_main['sell_volume'] = np.nan
ts_main.ix[ts_main['sell']==True, 'sell_volume'] = ts_main['volume']
ts_main['buy_value'] = np.nan
ts_main.ix[ts_main['buy']==True, 'buy_value'] = ts_main['volume'] * ts_main['price']
ts_main['sell_value'] = np.nan
ts_main.ix[ts_main['sell']==True, 'sell_value'] = ts_main['volume'] * ts_main['price']
ts_main.index = ts_main.date
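Equivalently, the masked assignments above can be written with np.where, which is a little more compact (a sketch; the notebook keeps the masked-assignment form):
ts_main['buy_volume'] = np.where(ts_main['buy'], ts_main['volume'], np.nan)
ts_main['sell_volume'] = np.where(ts_main['sell'], ts_main['volume'], np.nan)
ts_main['buy_value'] = np.where(ts_main['buy'], ts_main['volume'] * ts_main['price'], np.nan)
ts_main['sell_value'] = np.where(ts_main['sell'], ts_main['volume'] * ts_main['price'], np.nan)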
In [24]:
ts_temp_mean = ts_main[vars_compute_average].resample('A', how='mean')
ts_temp_sum = ts_main[vars_compute_sum].resample('A', how='sum')
ts_annual = pd.concat([ts_temp_mean, ts_temp_sum], axis=1)
ts_annual.columns = ['avg_price', 'avg_prccd', 'avg_ajexdi', 'avg_cshoc', 'sum_buy_value', 'sum_sell_value', 'sum_buy_vol', 'sum_sell_vol', 'sum_buys', 'sum_sells']
print('Variable definitions:')
print('sum_buys: number of buys during the year')
print('sum_buy_vol: total buy volume during the year')
print('sum_buy_value (in millions): total turnover of purchase value during the year')
ts_annual['sum_buy_value'] = ts_annual['sum_buy_value']/1000000
ts_annual['sum_sell_value'] = ts_annual['sum_sell_value']/1000000
ts_annual
Out[24]:
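Note: the how=/fill_method= keywords to .resample() used throughout this notebook were written for the pandas of late 2015 and were deprecated in later releases; under pandas >= 0.18 the equivalent spelling would be, for example:
ts_temp_mean = ts_main[vars_compute_average].resample('A').mean()
ts_temp_sum = ts_main[vars_compute_sum].resample('A').sum()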
In [25]:
ts_buys = data_main[['fund_id', 'buy', 'sell', 'buy_value', 'sell_value', 'date']].copy()
ts_buys.index = ts_buys.date
ts_buys.drop('date', axis=1, inplace=True)
In [26]:
# Group by fund_id
ts_buys_grouped = ts_buys.groupby('fund_id', as_index=False, axis=0)
i = 1
for name, group in ts_buys_grouped:
# Resample from daily
ts_temp = group.drop('fund_id', axis=1).resample('M', how='sum')
# Plot
plt.figure(i)
plt.plot(ts_temp.index, ts_temp.buy, 'b', label='number of buys')
plt.plot(ts_temp.index, ts_temp.sell, 'r', label='number of sells')
plt.legend(loc='upper left')
plt.xlabel('Time')
plt.ylabel('Freq')
plt.grid(True)
plt.title('Number of Buys/Sells Per Month for Fund {}'.format(name))
i += 1
plt.show()
In [27]:
# Group by fund_id
i = 1
for name, group in ts_buys_grouped:
# Resample from daily
ts_temp = group.drop('fund_id', axis=1).resample('M', how='sum')
# Plot
plt.figure(i)
plt.plot(ts_temp.index, ts_temp.buy_value, 'b', label='total value of buys')
plt.plot(ts_temp.index, ts_temp.sell_value, 'r', label='total value of sells')
plt.legend(loc='upper left')
plt.xlabel('Time')
plt.ylabel('Value in Reported Currency')
plt.grid(True)
plt.title('Value of Buys/Sells Per Month for Fund {}'.format(name))
i += 1
plt.show()
In [ ]:
"""
Module: Calculate First-In-First-Out (FIFO) profits from account transaction data
Author: Peter Lee (mr.peter.lee@hotmail.com)
Last update: 2016-Jan-15
This module calculates the investment profit/loss of each sell in the account transaction history using the First-In-First-Out (FIFO) method.
A number of checks are in place to improve data quality:
1.) Cash dividends are assumed to be fully reinvested by default.
2.)
"""
##
import numpy as np
import pandas as pd
import os.path
def check_missing_in_data_stocks(data_stocks):
"""
    Remove missing entries from the End-Of-Day stocks data.
Notes:
The variables in this DataFrame are:
['isin', 'date', 'ajexdi', 'div', 'prccd']
"""
missing_isin = len(data_stocks[data_stocks['isin'] == ""])
print("Number of missing isin entries in the stocks price data: {}".format(missing_isin))
print("These missing entries are dropped.")
data_stocks.drop(data_stocks.index[data_stocks['isin'] == ""], inplace=True)
print("\n")
    # Check zero dividend payments
    print("Number of zero-dividend payments: {}".format(np.sum(data_stocks['div'] == 0)))
    print("Zero-dividend payments are recoded as missing.")
data_stocks.ix[data_stocks['div'] == 0, 'div'] = np.nan
print("\n")
# Check missing prccd
print("Number of missing prccd entries: {}".format(np.sum(np.isnan(data_stocks['prccd']))))
print("They are dropped")
data_stocks.drop(data_stocks.index[np.isnan(data_stocks['prccd'])], inplace=True)
print("\n")
# Check missing ajexdi
print("Number of missing ajexdi entries: {}".format(np.sum(np.isnan(data_stocks['ajexdi']))))
print("They are dropped")
data_stocks.drop(data_stocks.index[np.isnan(data_stocks['ajexdi'])], inplace=True)
return data_stocks
def add_div_index(group):
"""
    Add a dividend index column to the group (the End-of-Day rows for one isin).
"""
# Replace NANs in "div" as ZERO
group.ix[group['div'].isnull(), 'div'] = 0
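    # divIndex compounds daily dividend yields:
    #   divIndex_t = divIndex_{t-1} * (1 + div_t / prccd_{t-1})
    # e.g. a 1.00 dividend on a stock that closed at 10.00 the day before multiplies the index by 1.1.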
group['divIndex'] = (group['div'] / group['prccd'].shift(1) + 1).cumprod()
# Set the value in the first row
group.ix[group.index[0], 'divIndex'] = 1
return group
def compute_dividend_index(data_stocks):
"""
    Calculate the dividend index to show the cumulative effect of dividend payments. The index begins at 1.
"""
data_stocks['divIndex'] = np.nan
data_stocks.sort_values(by=['isin', 'date'], inplace=True)
return data_stocks.groupby(by='isin', as_index=False).apply(add_div_index)
def add_mod_ajexdi(group):
"""
    Add a modified ajexdi column to the group.
"""
# Calculate the change factor
group['change_factor'] = group['ajexdi'] / group['ajexdi'].shift(1) /\
group['divIndex'] * group['divIndex'].shift(1)
group.ix[group.index[0], 'change_factor'] = group.ix[group.index[0], 'ajexdi']
group['modAjexdi'] = group['change_factor'].cumprod()
group.drop('change_factor', axis=1, inplace=True)
return group
def calculate_mod_ajexdi(data_stocks):
"""
Calculate the modified ajexdi from divIndex per isin.
"""
return data_stocks.groupby(by='isin', as_index=False).apply(add_mod_ajexdi)
def data_main_add_seq(grp):
"""Add the variable 'seq' to indicate the order of transactions within a day"""
# Generate an index for multiple transactions within the same day to preserve their order
grp['seq'] = np.arange(1, len(grp) + 1)
return grp
def data_main_fill_na(grp):
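    """Impute missing values within a (fund_id, isin) group: forward fill, then backfill any leading gaps."""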
lst_vars = ['modAjexdi', 'prccd', 'divIndex']
for x in lst_vars:
grp[x] = grp[x].fillna(method='ffill')
grp[x] = grp[x].fillna(method='bfill')
return grp
def calculate_fifo_profit(data_main):
"""
Correct any "over-sale" transactions.
Calculate FIFO profits.
"""
return data_main.groupby(by=['fund_id', 'isin'], as_index=False).apply(data_main_add_profit)
def data_main_add_profit(group):
"""
    Add a new column, 'profit' (the FIFO profit), while correcting any "over-sale" transactions.
"""
    # Scaffolding for manual testing (kept for reference):
    # gp = data_main.groupby(by=['fund_id', 'isin'], as_index=False)
    # temp = gp.get_group(('FI0008804422', 'FI0009000707'))
    # group = temp.ix[temp.index[:15], ['fund_id', 'isin', 'date', 'buy_sell', 'volume', 'price', 'modAjexdi']]
# Identify all buys
buys = group.ix[(group['buy_sell'] == 10) | (group['buy_sell'] == 11), ['date', 'price', 'volume', 'modAjexdi']]
# position (stock volume)
group['position'] = np.nan # Stock volumes in position
group['trans'] = np.nan # Transaction volume after correcting over-sale
group['profit'] = 0 # Fifo-profit
# Calculate positions
previous_ajexdi = None
previous_position = None
position = 0
ajexdi = 0
volume = 0
order = None
# ith_sell = 1
for i, row in group.iterrows():
ajexdi = row['modAjexdi']
volume = row['volume']
# Identify order type
if (row['buy_sell'] == 10) | (row['buy_sell'] == 11):
order = 1
elif (row['buy_sell'] == 20) | (row['buy_sell'] == 21):
order = -1
else:
order = 0
# 1st row
if previous_position is None:
# if there is a sell or no trans
group.ix[i, 'trans'] = 0
group.ix[i, 'position'] = 0
previous_position = 0
previous_ajexdi = row['modAjexdi']
# if there is a buy
if (order == 1):
group.ix[i, 'trans'] = volume
group.ix[i, 'position'] = volume
previous_position = row['volume']
continue
# >1st row
# Convert position before buy/sell
position = previous_position / ajexdi * previous_ajexdi
# Check if there is a over-sale
if (order == -1) & (volume > position):
volume = position
# Update position after buy/sell
position += volume * order
group.ix[i, 'trans'] = (volume if order >= 0 else volume * order)
group.ix[i, 'position'] = position
# Update previous_row
previous_ajexdi = row['modAjexdi']
previous_position = position
# Calculate profit if there is a sell
if (order == -1):
# print('Sell {} at index {}'.format(ith_sell, row.name))
# ith_sell += 1
sell_revenue = volume * row['price']
sell_index = row.name
buy_index = 0
purchase_cost = 0
# The correct sell volume is "volume"
# k = 1
while volume >= 1 and buy_index < sell_index:
# print('k is', k)
# print(buys)
# k += 1
for j, buy in buys.iterrows():
buy_index = buy.name
buy_volume_at_sell = buy['volume'] * buy['modAjexdi'] / row['modAjexdi']
# print('j is', j, 'Cvt_buy_vol', buy_volume_at_sell)
if volume > buy_volume_at_sell:
volume -= buy_volume_at_sell
buys.ix[j, 'volume'] = 0
purchase_cost += buy['volume'] * buy['price']
else:
offset_vol = (volume * row['modAjexdi'] / buy['modAjexdi'])
buys.ix[j, 'volume'] = buy['volume'] - offset_vol
purchase_cost += offset_vol * buy['price']
volume = 0
group.ix[i, 'profit'] = sell_revenue - purchase_cost
return group
def expand_data(data_main):
"""
    Expand the holdings dataset so that there is one End-of-Day entry for the entire holding period.
"""
data_main = data_main.groupby(by=['fund_id', 'isin'], as_index=False).apply(add_eod_rows_to_holding_period)
data_main.reset_index(drop=True, inplace=True)
return data_main
def add_eod_rows_to_holding_period(group):
# Create time series
group.index = group['date']
group.drop('date', axis=1, inplace=True)
# Resample to End-of-Day data
transform = {
'agg_volume': group['volume'].resample('D', how='sum', fill_method='ffill', label='right', closed='right'),
'num_acts': group['buy_sell'].resample('D', how='count', fill_method='ffill', label='right', closed='right'),
'avg_price': group['price'].resample('D', how='mean', fill_method='ffill', label='right', closed='right'),
'avg_ajexdi': group['modAjexdi'].resample('D', how='mean', fill_method='ffill', label='right', closed='right'),
'position': group['position'].resample('D', how='last', fill_method='ffill', label='right', closed='right'),
'trans': group['trans'].resample('D', how='sum', fill_method='ffill', label='right', closed='right'),
'profit': group['profit'].resample('D', how='sum', fill_method='ffill', label='right', closed='right')
}
# Up-sample
ts_daily = pd.DataFrame(transform)
# Deal with string variables specifically
lst_fill = ['fund_id', 'isin']
for x in lst_fill:
ts_daily[x] = group.ix[0, x]
ts_daily.reset_index(inplace=True)
ts_daily.rename(columns={'index': 'date'}, inplace=True)
return ts_daily
##
def main(
data_folder = "data/new/",
output_folder = "data/output/"
filename_transactions = "fin_mf_trades_1995_2014_fid.dta",
filename_stocks = "compustat_finland_1990_2015_all.dta",
Balanced_Panel = True,
debug = True,
debug_file = "test1.xlsx",
Account_For_Dividend = True,
Adjust_Price_Before_1999 = True):
if debug:
test_folder = "data/tests/"
        # Test 1
        test_file = debug_file
test = pd.read_excel(os.path.join(test_folder, test_file))
test = calculate_fifo_profit(test)
test.to_excel(os.path.join(test_folder, test_file.split('.')[0] + "_output.xlsx"))
test = expand_data(test)
test.to_excel(os.path.join(test_folder, test_file.split('.')[0] + "_output2.xlsx"))
# Loading data_stocks
# ===================
data_stocks = pd.read_stata(os.path.join(data_folder, filename_stocks))
data_stocks['date'] = pd.to_datetime(data_stocks['date'], format="%Y%m%d")
data_stocks = data_stocks[['isin', 'date', 'ajexdi', 'div', 'prccd']]
# Check missing data in the data_stocks
data_stocks = check_missing_in_data_stocks(data_stocks)
# Calculate cumulative dividend index
data_stocks = compute_dividend_index(data_stocks)
# Modify ajexdi to include dividend payment (the original ajexdi is assumed to be net of dividends)
if Account_For_Dividend:
data_stocks = calculate_mod_ajexdi(data_stocks)
else:
        data_stocks['modAjexdi'] = data_stocks['ajexdi']
##
# Load data_transactions
# ======================
data_main = pd.read_stata(os.path.join(data_folder, filename_transactions))
data_main.columns = [var.lower() for var in data_main.columns]
# There are several datetime variables -> remove the unused ones
data_main = data_main.drop(['date', 'tr_date', 'trans_date'], axis=1)
# Convert trade_date to datetime
data_main['date'] = pd.to_datetime(data_main.trade_date)
    data_main = data_main.sort_index(axis=1)
data_main = data_main.groupby(by=['fund_id', 'isin', 'date'], as_index=False).apply(data_main_add_seq)
    data_main = data_main.sort_values(by=['fund_id', 'isin', 'date', 'seq', 'buy_sell'])
if Adjust_Price_Before_1999:
data_main.ix[data_main['date'] < "1999-Jan-01", 'price'] = data_main['price'] / 5.94573
# Merge data_stocks with data_transactions, and impute missing prices.
data_main.drop('prccd', inplace=True, axis=1)
data_main = data_main.merge(data_stocks, how='left', on=['isin', 'date'])
data_main = data_main[['fund_id', 'isin', 'date', 'seq', 'acc', 'sect', 'owntype', 'legtype', 'ref_code', 'buy_sell', 'volume', 'price', 'prccd', 'divIndex', 'modAjexdi', 'div']]
##
# Inspect missing data
# check if there is any missing values in ajexdi and divIndex
lst_vars = ['modAjexdi', 'divIndex', 'price', 'buy_sell', 'volume']
print("Number of obs in data: {}".format(len(data_main)))
for x in lst_vars:
print("missing values in {}: {}".format(x, np.sum(np.isnan(data_main[x]))))
# Use forward fill to impute ajexdi; prccd, divIndex;
data_main = data_main.groupby(by=['fund_id', 'isin'], as_index=False).apply(data_main_fill_na)
print("Need to get rid of missing ajexdi or divIndex.")
data_main = data_main.ix[data_main['modAjexdi'].notnull()].ix[data_main['divIndex'].notnull()]
# Check for over-sales and compute holdings
    # Some funds may "over-sell" a stock, which would produce a negative position. This is prohibited: any sell that would cause a negative balance is reduced so that the balance ends at exactly zero.
data_main = data_main.sort_values(by=['fund_id', 'isin', 'date', 'seq']).reset_index(drop=True)
##
data_main = calculate_fifo_profit(data_main)
if Balanced_Panel:
# Expand the data_main - to fill in daily holdings in between transaction records.
data_main = expand_data(data_main)
data_main.to_excel(output_folder + "data_main.xlsx")
data_stocks.to_excel(output_folder + "data_stocks.xlsx")
In [ ]:
# Execute the FIFO profit calculations and obtain the output datasets
main(data_folder = Data_Folder,
output_folder = Output_Folder,
filename_transactions = Datafile_Main,
filename_stocks = Datafile_stocks,
Balanced_Panel = Balanced_Panel,
Account_For_Dividend = Dividends_Reinvested,
Adjust_Price_Before_1999 = Adjust_Price_Before_1999,
debug = False
)