Peter (mr.peter.lee@hotmail.com)
Last Update: 31 Dec 2015
Version: 1.0
The file (Analysis.ipynb) is write-protected to prevent accidental changes. To make changes or execute the code cells, please make a copy of the file first (e.g., by double-clicking it).
How to inspect the dataset quickly?
To get an overview of the data first:
Daily holdings of all funds
In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from pandas import Series, DataFrame
from datetime import timedelta
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
%matplotlib inline
import io
import os
import random
import statsmodels.stats.api as sms
from warnings import warn
from pylab import rcParams
rcParams['figure.figsize'] = 10, 5 # Change default plot size
from ipywidgets import FloatProgress # Progress bar - f = FloatProgress(min=0, max=1); display(f); then in a loop: f.value = percentage
from IPython.display import display # Progress bar
# ****************** Program Settings ******************
Data_Folder = "data/new/"
Output_Folder = "data/output/"
Datafile_Main = "fin_mf_trades_1995_2014_fid.dta" # The main dataset - mutual fund holdings
Datafile_stocks = "compustat_finland_1990_2015_all.dta" # The dataset - stock prices / dividends
# Calculate Daily Holdings
Balanced_Panel = True # Default: True
# True if extra entries are added to the holdings data so that there is at least one observation per day;
# a day with no transactions --> 1 entry for that day;
# a day with n transactions (n>0) --> n entries
Dividends_Reinvested = True # True if dividends are assumed to be fully reinvested; False otherwise.
# Note that Barber's method excludes dividends
Adjust_Price_Before_1999 = True # True if to divide 'price' before 1999 by 5.94573 (the official FIM-per-EUR conversion rate)
First_Day = "" # Default: ""
# Set to "" if you would like the first day in the holdings data to be the date of the first transaction of a particular stock
# Or you can set it to a date such as "1994-Jan-01" - the holdings record for every stock (isin) will then begin on that day (filled with zeros if no transactions have yet happened)
# Note the first obs in the test sample is 1994-Dec-19 - this day must be earlier than the first obs to make sense
Vars_To_Keep = ['fund_id', 'isin', 'seq', 'acc', 'sect', 'owntype', 'legtype', 'ref_code',
'buy_sell', 'volume', 'price', 'date']
# To reduce memory usage, select only the key variables to keep in the data.
# Note that 'date' is actually 'trade_date'
# ******************************************************
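To make the Balanced_Panel and First_Day options concrete, here is a minimal toy sketch of the expansion (hypothetical data, newer pandas resample API; not part of the pipeline):
In [ ]:
# Hypothetical example: two trades three days apart become a daily series
import pandas as pd
toy = pd.DataFrame({'volume': [100, 50]},
                   index=pd.to_datetime(['1994-12-19', '1994-12-22']))
daily = toy.resample('D').sum().fillna(0)
print(daily)
# Days without transactions appear with volume 0, so every day has >= 1 row;
# setting First_Day earlier would simply prepend zero-volume days.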
"data_main" is the mutual fund holdings dataset.
In [3]:
data_main = pd.read_stata(Data_Folder + Datafile_Main)
data_main.columns = [var.lower() for var in data_main.columns]
# There are several datetime variables -> remove the unused ones
data_main = data_main.drop(['date', 'tr_date', 'trans_date'], axis=1)
# Convert trade_date to datetime
data_main['date'] = pd.to_datetime(data_main.trade_date)
data_main = data_main.sort_index(axis=1)
# Generate an index for multiple transactions within the same day to preserve their order
def add_seq(grp):
"""Add the variable 'seq' to indicate the order of transactions within a day"""
grp['seq'] = np.arange(1, len(grp) + 1)
return grp
data_main = data_main.groupby(by=['fund_id', 'isin', 'date'], as_index=False).apply(add_seq)
data_main = data_main.sort_values(by=['fund_id', 'isin', 'date', 'seq', 'buy_sell'])
# data_main = data_main.ix[0:10000]
data_main.head(10)
Out[3]:
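A note on add_seq above: groupby().apply() with a Python function is slow on large data. A vectorized equivalent uses cumcount; a sketch, assuming the same semantics:
In [ ]:
# Equivalent, much faster alternative to add_seq:
# cumcount() numbers rows within each (fund_id, isin, date) group from 0
data_main['seq'] = data_main.groupby(['fund_id', 'isin', 'date']).cumcount() + 1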
In [4]:
def sum_stats(col):
"""Produce a table of summary statistics for a numerical variable"""
count = col.count()
mean = col.mean()
median = col.median()
stddev = col.std()
skewness = col.skew()
kurtosis = col.kurt()
quantile1 = col.quantile(0.25)
quantile2 = col.quantile(0.5)
quantile3 = col.quantile(0.75)
first_obs_index = col.first_valid_index()
last_obs_index = col.last_valid_index()
return pd.Series([count, mean, median, stddev, skewness, kurtosis, quantile1, quantile2, quantile3, first_obs_index, last_obs_index],
index=['count', 'mean', 'median', 'stddev', 'skewness', 'kurtosis', 'quantile1', 'quantile2', 'quantile3', 'first_obs_index', 'last_obs_index']
)
print("Descriptive Statistics for Numerical Variables:")
stats = data_main[['volume', 'price', 'prccd', 'ajexdi', 'cshoc', 'date']].sort_values(by=['date']).drop('date', axis=1).apply(sum_stats).T
# Fix up the dates
stats['first_obs'] = data_main.date[stats.T.ix['first_obs_index']].values
stats['last_obs'] = data_main.date[stats.T.ix['last_obs_index']].values
stats
Out[4]:
In [5]:
def_class = {
    'HH': 'Household',
    'FI': 'Financial institution (domestic)',
    'FR': 'Foreign with own account at Euroclear',
    # FIXME: duplicate key 'FI' below silently overwrites the entry above;
    # the intended code for foreign nominee accounts is unclear.
    'FI': 'Foreign nominee account where most foreign holdings are registered',
    'NF': 'Non-financial',
    'GG': 'General government',
    'NP': 'Not-for-profit',
    'NA': "Not Available"
}
def_legtype = {
11 : 'NATURAL PERSON',
12 : 'ESTATE OF A DECEASED PERSON',
13 : 'TAXATION CONSORTIUM',
14 : 'PARTNERSHIP',
15 : "BANKRUPT'S ESTATE",
21 : 'LIMITED PARTNERSHIP',
22 : 'SHIPPING PARTNERSHIP (NOT A LIMITED COMPANY)',  # 'LAIVAISÄNNISTÖYHTIÖ (EI OSAKEYHTIÖ)'
30 : 'PUBLIC LIMITED COMPANY',
31 : 'LIMITED COMPANY',
32 : 'MUTUAL INSURANCE COMPANY',
33 : 'SAVINGS BANK',
34 : 'PENSION TRUST OR FUND',
35 : 'HOUSING CORPORATION',
41 : 'COOPERATIVE',
51 : 'FOUNDATION, FUND',
52 : 'NONPROFIT ASSOCIATION',
53 : 'MUTUAL INDEMNITY INSURANCE ASSOCIATION',
54 : 'ECONOMIC ASSOCIATION',
61 : 'PUBLIC AUTHORITY',
62 : 'PUBLIC UTILITY',
63 : 'PUBLIC CORPORATION',
71 : 'STATE CHURCH',
72 : 'OTHER RELIGIOUS COMMUNITY',
90 : 'OTHER JURIDICAL FORM',
'NA': "Not Available"
}
def_ref_code = {
1 : 'MARKETPLACE TRADE',
2 : 'OTHER TRADE',
3 : 'TRANSFER TO BOOKENTRIES',
4 : 'TRANSFER TO OWN ACCOUNT',
5 : 'CONVERSION',
6 : 'BEQUEST',
7 : 'INHERITANCE',
8 : 'WILL',
9 : 'DIVISION OF THE PROPERTY OF THE SPOUSES',
10 : 'FORCED SALE',
11 : 'TRANSFER INT. CERTIFICATE OF OWN',
12 : 'TRANSFER OF INTERNAT. CERTIFICATE',
13 : 'TRANSFER TO NOM.REG BE ACCOUNT',
14 : 'TRANSFER FROM NOM.REG BE ACCOUNT',
15 : 'TRADEBYTRADE SETTLEMENT',
16 : 'DIRECT SELLING',
17 : 'TRANSFER OF COLLATERAL',
18 : 'RETURN OF COLLATERAL',
19 : 'CANCELLATION OF RELATIONSHIP BASED O',
20 : 'ISSUE ACCOUNT TRANSACTION',
50 : 'RIGHTS ISSUE',
51 : 'BONUS ISSUE',
52 : 'SUBSCRIPTION',
53 : 'FORFEITURE',
54 : 'REDEMPTION',
55 : 'SPLIT',
56 : 'MERGER',
57 : 'COMBINATION',
58 : 'CANCELLATION OF A SECURITY',
59 : 'OTHER SUBSCRIPTION',
60 : 'SUBSCRIPTION RIGHT',
61 : 'TRANSFER OF RECORD TIME',
62 : 'SECURITY TYPE CONVERSION',
63 : 'CANCELLING OF SECURITY TYPE LOAN',
64 : 'REVERSE SPLIT',
65 : 'CANCELLATION OF SUBSCRIPTION',
71 : 'CONVERSION OF CONVERTIBLE BOND',
72 : 'SUBSCRIPTION WITH WARRANT',
73 : 'EXECUTION OF WARRANT',
75 : 'DEMERGER',
76 : 'STOCK LOAN',
77 : 'RETURN OF STOCK LOAN',
78 : 'RETURN OF WARRANTS',
96 : 'ACCOUNT TRANSFER',
97 : 'ARCHIVING',
98 : 'CORRECTION',
99 : 'OTHER',
0 : 'Not Available',
'NA': "Not Available"
}
In [6]:
def tab(col, desc=None):
    """Create a simple tabulation table for a categorical variable."""
    obs = len(col)
    # Convert .value_counts() to a DataFrame
    df = pd.DataFrame(col.value_counts()).reset_index()
    df.columns = ['code', 'obs']
    if desc is not None:
        # .get(x, x) falls back to the raw code, avoiding a KeyError for unmapped codes
        df['desc'] = df.code.apply(lambda x: desc.get(x, x))
    else:
        df['desc'] = np.nan
    df.sort_index(axis=1, inplace=True)
    total_freq = np.sum(df.obs)
    df['percent'] = df['obs'] / total_freq * 100
    # Report non-missing observations as an extra row
    df.loc[len(df)] = [np.nan, 'non-missing obs <-- divided by total obs', col.count(), col.count() / obs * 100]
    return df
In [7]:
var = 'class'
dct = def_class
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[7]:
In [8]:
var = 'legtype'
dct = def_legtype
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[8]:
In [9]:
var = 'ref_code'
dct = def_ref_code
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[9]:
In [10]:
def_buy_sell = {10: "buy", 20: "sell",
                11: "buy (code 11, exact definition unclear)",
                21: "sell (code 21, exact definition unclear)"}
var = 'buy_sell'
dct = def_buy_sell
print("Summary statistics of variable: {}".format(var))
warn("Code 11, Code 21's definitions are unclear.")
tab(data_main[var], dct)
Out[10]:
In [11]:
var = 'owntype'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[11]:
In [12]:
var = 'sect'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[12]:
In [13]:
var = 'currency'
dct = None
print("Summary statistics of variable: {}".format(var))
print('About 20% of transactions are denominated in FIM.')
tab(data_main[var], dct)
Out[13]:
In [14]:
var = 'byear'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[14]:
In [15]:
var = 'gender'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[15]:
In [16]:
var = 'pcode'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[16]:
In [17]:
var = 'country'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[17]:
In [18]:
var = 'language'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[18]:
In [19]:
var = 'nom_ind'
dct = None
print("Summary statistics of variable: {}".format(var))
tab(data_main[var], dct)
Out[19]:
In [20]:
grouped = data_main.groupby('fund_id', as_index=False, axis=0)
list_of_numerical_vars = ['volume', 'price', 'prccd', 'ajexdi', 'cshoc', 'date']
list_of_categorical_vars = ['class', 'legtype', 'ref_code', 'buy_sell', 'sect', 'gender', 'pcode']
dict_of_categorical_vars = {'class': def_class,
'legtype': def_legtype,
'ref_code': def_ref_code,
'buy_sell': def_buy_sell,
'sect': None,
'gender': None,
'pcode': None}
In [21]:
for name, group in grouped:
print("********** Fund: {} **********".format(name))
print("Numerical variables:")
print(group[list_of_numerical_vars].describe().T)
print("\n")
print("Categorical variables:")
for var in list_of_categorical_vars:
print("- Variable: {}".format(var))
print(tab(group[var], dict_of_categorical_vars[var]).to_string(index=False))
print("\n")
print("\n")
print("\n")
In [22]:
# Resampling from daily to annual frequency
# When down-sampling from daily to annual, some variables are averaged and some are summed.
vars_compute_average = ['price', 'prccd', 'ajexdi', 'cshoc']
vars_compute_sum = ['buy_value', 'sell_value', 'buy_volume', 'sell_volume', 'buy', 'sell']
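In newer pandas the two resample passes can be combined into a single call with an aggregation map; a sketch, assuming ts_main is the date-indexed frame built in the next cell:
In [ ]:
# One-pass alternative to the two resample('A', how=...) calls below (a sketch)
agg_map = {x: 'mean' for x in vars_compute_average}
agg_map.update({x: 'sum' for x in vars_compute_sum})
ts_annual_alt = ts_main.resample('A').agg(agg_map)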
In [23]:
ts_main = data_main  # NB: an alias, not a copy - the new columns below also appear in data_main
# Generate variables
ts_main['buy'] = (ts_main.buy_sell == 10) | (ts_main.buy_sell == 11)
ts_main['sell'] = (ts_main.buy_sell == 20) | (ts_main.buy_sell == 21)
ts_main['buy_volume'] = np.nan
ts_main.ix[ts_main['buy']==True, 'buy_volume'] = ts_main['volume']
ts_main['sell_volume'] = np.nan
ts_main.ix[ts_main['sell']==True, 'sell_volume'] = ts_main['volume']
ts_main['buy_value'] = np.nan
ts_main.ix[ts_main['buy']==True, 'buy_value'] = ts_main['volume'] * ts_main['price']
ts_main['sell_value'] = np.nan
ts_main.ix[ts_main['sell']==True, 'sell_value'] = ts_main['volume'] * ts_main['price']
ts_main.index = ts_main.date
In [24]:
ts_temp_mean = ts_main[vars_compute_average].resample('A', how='mean')
ts_temp_sum = ts_main[vars_compute_sum].resample('A', how='sum')
ts_annual = pd.concat([ts_temp_mean, ts_temp_sum], axis=1)
ts_annual.columns = ['avg_price', 'avg_prccd', 'avg_ajexdi', 'avg_cshoc', 'sum_buy_value', 'sum_sell_value', 'sum_buy_vol', 'sum_sell_vol', 'sum_buys', 'sum_sells']
print('Variable definitions:')
print('sum_buys: number of buys during the year')
print('sum_buy_vol: total buy volume during the year')
print('sum_buy_value (in 000,000s): total turnover of purchase value during the year')
ts_annual['sum_buy_value'] = ts_annual['sum_buy_value']/1000000
ts_annual['sum_sell_value'] = ts_annual['sum_sell_value']/1000000
ts_annual
Out[24]:
In [25]:
ts_buys = data_main[['fund_id', 'buy', 'sell', 'buy_value', 'sell_value', 'date']].copy()
ts_buys.index = ts_buys.date
ts_buys.drop('date', axis=1, inplace=True)
In [26]:
# Group by fund_id
ts_buys_grouped = ts_buys.groupby('fund_id', as_index=False, axis=0)
i = 1
for name, group in ts_buys_grouped:
# Resample from daily
ts_temp = group.drop('fund_id', axis=1).resample('M', how='sum')
# Plot
plt.figure(i)
plt.plot(ts_temp.index, ts_temp.buy, 'b', label='number of buys')
plt.plot(ts_temp.index, ts_temp.sell, 'r', label='number of sells')
plt.legend(loc='upper left')
plt.xlabel('Time')
plt.ylabel('Freq')
plt.grid(True)
plt.title('Number of Buys/Sells Per Month for Fund {}'.format(name))
i += 1
plt.show()
In [27]:
# Group by fund_id
i = 1
for name, group in ts_buys_grouped:
# Resample from daily
ts_temp = group.drop('fund_id', axis=1).resample('M', how='sum')
# Plot
plt.figure(i)
plt.plot(ts_temp.index, ts_temp.buy_value, 'b', label='total value of buys')
plt.plot(ts_temp.index, ts_temp.sell_value, 'r', label='total value of sells')
plt.legend(loc='upper left')
plt.xlabel('Time')
plt.ylabel('Value in Reported Currency')
plt.grid(True)
plt.title('Value of Buys/Sells Per Month for Fund {}'.format(name))
i += 1
plt.show()
In [28]:
# load stock data
data_stocks = pd.read_stata(Data_Folder + Datafile_stocks)
# Format date
data_stocks['date'] = pd.to_datetime(data_stocks['date'], format="%Y%m%d")
# Keep only useful variables
data_stocks = data_stocks[['isin', 'date', 'ajexdi', 'div', 'prccd']]
data_stocks.info()
Dealing with missing data in the stock price data.
In [29]:
# Identify the number of missing isin
print("Number of missing isin entries in the stocks price data: {}".format(len(data_stocks[data_stocks['isin']==""])))
print("These missing entries are dropped.")
data_stocks.drop(data_stocks.index[data_stocks['isin']==""], inplace=True)
print("\n")
# Checking zero dividend payments
print("Number of zero-dividends paymetns: {}".format(np.sum(data_stocks['div']==0)))
print("zero-dividend payments are replaced as missing.")
data_stocks.ix[data_stocks['div']==0, 'div'] = np.nan
print("\n")
# Check missing prccd
print("Number of missing prccd entries: {}".format(np.sum(np.isnan(data_stocks['prccd']))))
print("They are dropped")
data_stocks.drop(data_stocks.index[np.isnan(data_stocks['prccd'])], inplace=True)
print("\n")
# Check missing ajexdi
print("Number of missing ajexdi entries: {}".format(np.sum(np.isnan(data_stocks['ajexdi']))))
print("They are dropped")
data_stocks.drop(data_stocks.index[np.isnan(data_stocks['ajexdi'])], inplace=True)
In [30]:
# display the floating progress bar
f = FloatProgress(min=0, max=1)
display(f)
data_stocks['divIndex'] = 0
data_stocks.sort_values(by=['isin', 'date'], inplace=True)
gp_temp = data_stocks.groupby(by='isin', as_index=False)
# Identify the first entry per isin
df_temp = gp_temp.first()[['isin', 'date']]
df_temp['first'] = 1
# Merge back
data_stocks = data_stocks.merge(df_temp, how="outer", on=['isin', 'date'])
data_stocks.ix[data_stocks['first']==1, 'divIndex'] = 1
data_stocks.drop('first', axis=1, inplace=True)
# Compute the rest of the divIndex
total_obs = len(data_stocks)
gp_temp = data_stocks.groupby(by='isin', as_index=False)
for name, group in gp_temp:
# Reset all variables when moving to a new stock
cumIndex = 1
previousPrice = group.ix[group.index[0], 'prccd'] # get the first row
for index, row in group.iterrows():
# print("now in stock {} row {}".format(name, index))
# print("cumIndex is {}".format(cumIndex))
# check if there is a dividend payment
if (not np.isnan(row['div'])) & (row['div']!=0):
cumIndex = cumIndex*( row['div'] / previousPrice + 1)
previousPrice = row['prccd']
else:
# There is no dividend payment
previousPrice = row['prccd']
data_stocks.ix[index, 'divIndex'] = cumIndex
f.value = index/total_obs
data_stocks.head()
Out[30]:
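The row-by-row loop above is easy to follow but slow. The same index can be computed with vectorized groupby operations; a sketch, assuming (as above) the data is sorted by isin and date, with no dividend on a stock's first observation:
In [ ]:
# Vectorized sketch of the dividend index: multiply up by (1 + div / previous close)
# on dividend days, carry the index forward otherwise
prev_close = data_stocks.groupby('isin')['prccd'].shift(1)
factor = (1 + data_stocks['div'] / prev_close).fillna(1)
data_stocks['divIndex_alt'] = factor.groupby(data_stocks['isin']).cumprod()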
In [31]:
def add_modAjexdi(grp):
    """Calculate modAjexdi (a dividend-adjusted ajexdi) per group."""
    grp['modAjexdi'] = np.nan
    for index, row in grp.iterrows():
        if index == grp.index[0]:
            grp.ix[index, 'modAjexdi'] = grp.ix[index, 'ajexdi']
        else:
            grp.ix[index, 'modAjexdi'] = grp.ix[index - 1, 'modAjexdi'] \
                / grp.ix[index - 1, 'ajexdi'] \
                * grp.ix[index, 'ajexdi'] \
                / grp.ix[index, 'firstDiffDivIndex']
    return grp
if Dividends_Reinvested:
# Adjust ajexdi to account for dividends
first_diff = lambda x: x / x.shift(1)
data_stocks['firstDiffDivIndex'] = data_stocks.groupby(by='isin', as_index=False)['divIndex'].transform(first_diff)
data_stocks = data_stocks.groupby(by='isin', as_index=False).apply(add_modAjexdi)
data_stocks.sort_values(by=['isin', 'date'], inplace=True)
# NB: modAjexdi and firstDiffDivIndex are dropped here; add 'modAjexdi' to this
# list if the dividend-adjusted index is needed downstream.
data_stocks = data_stocks[['isin', 'date', 'divIndex', 'ajexdi', 'prccd']]
data_stocks.info()
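Because divIndex starts at 1 for each stock, the recursion in add_modAjexdi telescopes: every step multiplies by ajexdi_t / ajexdi_{t-1} and divides by divIndex_t / divIndex_{t-1}, so the result collapses to a simple ratio. A sketch of the equivalent closed form:
In [ ]:
# Equivalent closed form (a sketch; relies on divIndex == 1 at each stock's first row):
# modAjexdi_t = ajexdi_0 * prod_{s<=t}( (ajexdi_s/ajexdi_{s-1}) / (divIndex_s/divIndex_{s-1}) )
#             = ajexdi_t / divIndex_t
data_stocks['modAjexdi_alt'] = data_stocks['ajexdi'] / data_stocks['divIndex']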
In [32]:
data_main = data_main[Vars_To_Keep]
print("Check if there is any missing values in the data before expanding it to Panel:")
print(data_main.isnull().sum())
if np.sum(data_main.isnull().sum())>0:
warn("Warning: There are some missing values --> check!")
In [34]:
# Expand the data
def expand_panel(temp):
"""Convert the dataset to balanced panel."""
# Keep the first row of everyday
temp_first_rows = temp.groupby(by=['fund_id', 'isin', 'date'], as_index=False).first()
gp_isin = temp_first_rows.ix[temp_first_rows.index[0], 'isin']
gp_fund_id = temp_first_rows.ix[temp_first_rows.index[0], 'fund_id']
    # Find the remaining rows of each day
temp_first_rows['flag'] = 1
temp_remaining_rows = temp.merge(temp_first_rows, on=list(temp.columns), how='outer')
temp_remaining_rows.drop(temp_remaining_rows.index[temp_remaining_rows['flag']==1], inplace=True)
temp_first_rows.drop('flag', axis=1, inplace=True)
temp_remaining_rows.drop('flag', axis=1, inplace=True)
# Create a time series
temp_first_rows.index = temp_first_rows['date']
temp_first_rows.drop('date', axis=1, inplace=True)
# Create an entry for the first day
if First_Day != "":
temp_first_rows.loc[dt.datetime.strptime(First_Day, '%Y-%b-%d')] = np.nan
temp_first_rows.sort_index(inplace=True)
# Up-sample
ts_daily = temp_first_rows.resample('D')
lst_fill = ['acc', 'sect', 'owntype', 'legtype']
for x in lst_fill:
ts_daily[x] = ts_daily[x].fillna(method='ffill')
ts_daily[x] = ts_daily[x].fillna(method='bfill')
    # Days without transactions get zero volume
    lst_fill = ['volume']
    for x in lst_fill:
        ts_daily[x] = ts_daily[x].fillna(value=0)
ts_daily.reset_index(inplace=True)
ts_daily.rename(columns={'index':'date'}, inplace=True)
ts_daily['fund_id'] = gp_fund_id
ts_daily['isin'] = gp_isin
# Merge the remaining rows back to the data
# Sort the DataFrame to preserve the order of transactions within the same day
return ts_daily.merge(temp_remaining_rows, on=list(temp.columns), how='outer').sort_values(by=['date', 'seq']).reset_index()
data_main = data_main.groupby(by=['fund_id', 'isin'], as_index=False).apply(expand_panel)
data_main.reset_index(drop=True, inplace=True)
data_main = data_main[['fund_id', 'isin', 'date', 'seq', 'acc', 'sect', 'owntype', 'legtype', 'ref_code', 'buy_sell', 'volume', 'price']]
data_main.sort_values(by=['fund_id', 'isin', 'date', 'seq'], inplace=True)
data_main.head()
Out[34]:
Merge with data_stocks and impute missing prices.
Missing price/ajexdi/divIndex values result from merging non-trading days (e.g., weekends and public holidays) with the stock price data, or are simply missing observations for some reason. I use "forward fill" to impute the missing values.
In [35]:
# Merge
data_main = data_main[['fund_id', 'isin', 'date', 'seq', 'acc', 'sect', 'owntype', 'legtype', 'ref_code', 'buy_sell', 'volume', 'price']]
data_main = data_main.merge(data_stocks, how='left', on=['isin', 'date'])
In [36]:
# Inspect missing data
print(data_main.isnull().sum())
# Use forward fill to impute ajexdi; prccd, divIndex;
gp_temp = data_main.groupby(by=['fund_id', 'isin'], as_index=False)
def fill_na(grp):
    """Forward-fill (then back-fill) stock variables within each (fund_id, isin) group."""
    lst_vars = ['ajexdi', 'prccd', 'divIndex']
    for x in lst_vars:
        grp[x] = grp[x].fillna(method='ffill')
        # Back-fill as well, so no missing values remain at the start of each series
        grp[x] = grp[x].fillna(method='bfill')
    return grp
data_main = gp_temp.apply(fill_na)
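For reference, the same imputation can be written without an explicit apply, since groupby objects support ffill and bfill directly in newer pandas; a sketch under the same grouping:
In [ ]:
# Apply-free alternative (a sketch; same grouping and columns assumed)
lst_vars = ['ajexdi', 'prccd', 'divIndex']
data_main[lst_vars] = data_main.groupby(['fund_id', 'isin'])[lst_vars].ffill()
data_main[lst_vars] = data_main.groupby(['fund_id', 'isin'])[lst_vars].bfill()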
Adjust stock purchase/sale prices before 1999
In [37]:
if Adjust_Price_Before_1999:
    # Convert pre-1999 FIM prices to EUR (1 EUR = 5.94573 FIM)
    data_main.ix[data_main['date'] < "1999-Jan-01", 'price'] = data_main['price'] / 5.94573
In [38]:
# Generate variables
data_main['buy'] = (data_main.buy_sell == 10) | (data_main.buy_sell == 11)
data_main['sell'] = (data_main.buy_sell == 20) | (data_main.buy_sell == 21)
# Create Adjusted Volume
data_main['adjSellVolume'] = 0
data_main.ix[data_main['sell']==True, 'adjSellVolume'] = data_main['volume']
data_main['adjBuyVolume'] = 0
data_main.ix[data_main['buy']==True, 'adjBuyVolume'] = data_main['volume']
data_main['cumulativeVolume'] = 0
In [39]:
# Check if there are any missing values in key variables
lst_vars = ['ajexdi', 'divIndex', 'price', 'buy_sell', 'volume']
print("Number of obs in data: {}".format(len(data_main)))
for x in lst_vars:
print("missing values in {}: {}".format(x, np.sum(np.isnan(data_main[x]))))
print("Need to get rid of missing ajexdi or divIndex (mostly the outcome of expanding the data)")
data_main = data_main.ix[data_main['ajexdi'].notnull()].ix[data_main['divIndex'].notnull()]
Some funds may "over-sell" their stocks, which would produce a negative balance. This section prevents negative balances by identifying every potential "over-sale" and lowering its sell volume so that the holding reaches exactly zero shares at the point of sale. Subsequent sales are then void.
The number of shares held in each period is stored in the variable cumulativeVolume.
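A toy illustration of the over-sale rule (hypothetical numbers, ignoring ajexdi split adjustments for clarity):
In [ ]:
# Hypothetical example: buy 100, sell 80, then try to sell 50
holdings = 0
for side, vol in [('buy', 100), ('sell', 80), ('sell', 50)]:
    if side == 'buy':
        holdings += vol
    else:
        vol = min(vol, holdings)  # cap the sale at the current balance
        holdings -= vol
    print(side, vol, '-> holdings:', holdings)
# The second sell exceeds the remaining 20 shares, so it is lowered to 20
# and the balance bottoms out at zero; any further sales would be void.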
In [40]:
# Progress bar
f = FloatProgress(min=0, max=1)
display(f)
total_obs = len(data_main)
# Check for over-sales
data_main = data_main.sort_values(by=['fund_id', 'isin', 'date', 'seq']).reset_index(drop=True)
gp_temp = data_main.groupby(by=['fund_id', 'isin'], as_index=False)
for name, group in gp_temp:
    # Reset variables when moving to a new stock
    first_row = group.index[0]
    # If the first entry is a sell before any buy --> the beginning balance would be negative
    cumVolume = 0
    previousAjexdi = group.ix[first_row, 'ajexdi']
    for index, row in group.iterrows():
        # Check if there is a buy
        if (row['buy']==True) & (row['adjBuyVolume']!=0):
            # Restate the running balance on the current split basis, then add the purchase
            cumVolume = cumVolume / row['ajexdi'] * previousAjexdi + row['adjBuyVolume']
            previousAjexdi = row['ajexdi']
        # Check if there is a sell
        if (row['sell']==True) & (row['adjSellVolume']!=0):
            cumVolume = cumVolume / row['ajexdi'] * previousAjexdi
            previousAjexdi = row['ajexdi']
            # Check if the sell volume exceeds the cumulative volume
            if (row['adjSellVolume'] > cumVolume):
                # Over-sale: cap the sell volume at the current balance
                data_main.ix[index, 'adjSellVolume'] = cumVolume
                cumVolume = 0
            else:
                cumVolume = cumVolume - row['adjSellVolume']
        f.value = index/total_obs
        data_main.ix[index, 'cumulativeVolume'] = cumVolume
In [41]:
# Progress bar
f = FloatProgress(min=0, max=1)
display(f)
total_obs = len(data_main)
data_main['profit'] = np.nan
data_main['daysSincePurchase'] = np.nan
# Note that profit is computed from sales only (not buys)
# Calculate the profit from each sale
data_main = data_main.sort_values(by=['fund_id', 'isin', 'date', 'seq']).reset_index(drop=True)
gp_temp = data_main.groupby(by=['fund_id', 'isin'], as_index=False)
for name, group in gp_temp:
    # Moving to a new stock: reset the queue of past purchases
    buyPrice = []
    buyVolume = []
    buyDate = []
    buyAjexdi = []
    for index, row in group.iterrows():
        # Record each purchase: price, volume, ajexdi and (row-index) date
        if (row['buy']==True) & (row['adjBuyVolume']>0):
            buyPrice.append(row['price'])
            buyVolume.append(row['adjBuyVolume'])
            buyAjexdi.append(row['ajexdi'])
            buyDate.append(index)
        # On each sale, match the sale volume against past purchases (first in, first out)
        if (row['sell']==True) & (row['adjSellVolume']>0):
            salePrice = row['price']
            saleVol = row['adjSellVolume']
            saleRevenue = salePrice * saleVol
            saleDate = index
            saleAjexdi = row['ajexdi']
            # Determine the purchase price -- the total purchase cost is purchaseCost
            purchaseCost = 0
            # '> 1' rather than '> 0' tolerates fractional leftovers from ajexdi conversions;
            # the any() guard is a defensive addition against an infinite loop if the queue runs dry
            while (saleVol > 1) and any(vol > 0 for vol in buyVolume):
                # Loop from the first (earliest) element of buyVolume
                for i, vol in enumerate(buyVolume):
                    if vol > 0:
                        # The ith buy's volume is > 0
                        daysSinceBuy = saleDate - buyDate[i]  # row-count difference, a proxy for days held
                        data_main.ix[index, 'daysSincePurchase'] = daysSinceBuy
                        # Restate the purchase volume on the split basis of the sale
                        convertedBuyVolume = vol * buyAjexdi[i] / saleAjexdi
                        if saleVol > convertedBuyVolume:
                            # The purchase is fully consumed by this sale
                            saleVol = saleVol - convertedBuyVolume
                            purchaseCost += vol * buyPrice[i]
                            buyVolume[i] = 0
                        else:
                            # The purchase is partially consumed
                            purchaseCost += vol * buyPrice[i] / convertedBuyVolume * saleVol
                            buyVolume[i] = vol - saleVol * saleAjexdi / buyAjexdi[i]
                            saleVol = 0
            data_main.ix[index, 'profit'] = saleRevenue - purchaseCost
        f.value = index/total_obs
data_main.ix[np.isnan(data_main['profit']), 'profit'] = 0
data_main.ix[data_main['adjSellVolume']==0, 'profit'] = 0
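The loop above matches each sale against earlier purchases first-in-first-out (FIFO). A toy illustration of the matching (hypothetical numbers, ignoring ajexdi conversions):
In [ ]:
# Hypothetical example: two buys, then one sale of 120 shares at 15.0
buys = [(100, 10.0), (50, 12.0)]  # (volume, price) queue, oldest first
sale_vol, sale_price = 120, 15.0
revenue, cost, remaining = sale_vol * sale_price, 0.0, sale_vol
for i, (vol, price) in enumerate(buys):
    take = min(vol, remaining)  # consume the oldest purchase first
    cost += take * price
    buys[i] = (vol - take, price)
    remaining -= take
    if remaining == 0:
        break
print('profit:', revenue - cost)  # 120*15 - (100*10 + 20*12) = 560.0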
In [42]:
# Output main dataset
data_main.to_excel(Output_Folder + 'data_main.xlsx')
# Holdings data
gp_temp = data_main[['fund_id', 'acc', 'isin', 'date', 'cumulativeVolume']].groupby(by=['fund_id', 'isin', 'date'])
# The last entry per (fund_id, isin, date) gives the end-of-day holdings
data_holdings = gp_temp.last().reset_index()
data_holdings.to_excel(Output_Folder + 'data_holdings.xlsx')