In [6]:
# Import modules
import pandas as pd
import pandas_datareader.data as web
import datetime as dt
import matplotlib.pyplot as plt
import os
import numpy as np
%matplotlib inline
In [7]:
# Import AAPL stock data
start = dt.datetime(2010, 1, 1)
end = dt.datetime(2017, 1, 1)
dframe = web.DataReader('AAPL', 'google', start, end)
dframe.to_csv("../data/AAPL.csv", index_label="Date")
In [8]:
# Read data from CSV and print first 5 rows
df = pd.read_csv("../data/AAPL.csv")
print(df.head())
In [9]:
#Plot High price
df['High'].plot()
plt.show()
In [10]:
# Specify date range using pandas range
start_date = '2010-01-22'
end_date = '2010-01-26'
dates = pd.date_range(start_date, end_date)
# Create empty dataFrame
df1 = pd.DataFrame(index=dates)
In [11]:
# Join two dataframe
df1 = df1.join(df)
print(df1)
In [12]:
# Above shows all NaN as AAPL stock data is read without date as index.
# We can read again with correct format
df = pd.read_csv("../data/AAPL.csv", index_col="Date", parse_dates=True)
print(df.head())
In [13]:
# Now join again and print out
df1 = None
df1 = pd.DataFrame(index=dates)
df1 = df1.join(df)
print(df1)
In [14]:
# Let's remove row with has NaN
df1 = df1.dropna()
print(df1)
In [15]:
# Note that above two operation can be done by following command
# df1 = df1.join(df, how='inner')
In [16]:
# First, let's download stock information
tickers = ['AAPL', 'SPY', 'IBM', 'GOOG', 'GLD']
for ticker in tickers:
df = web.DataReader(ticker, 'google', start, end)
df.to_csv("../data/{}.csv".format(ticker), index_label="Date")
In [17]:
# Create dataframe with wanted date range
df1 = None
df1 = pd.DataFrame(index=dates)
In [18]:
# Join all stock information
for ticker in tickers:
df_temp = pd.read_csv("../data/{}.csv".format(ticker), index_col='Date',
parse_dates=True, usecols=['Date', 'Close'],
na_values=['nan'])
# Rename
df_temp = df_temp.rename(columns={'Close': ticker})
df1 = df1.join(df_temp)
print(df1)
In [19]:
# This function provides path to cvc for specified symbol
# If symbol is not available, then read from internet, save, then provide
# path
def symbol_to_path(symbol, base_dir="../data"):
"""Return CSV path to specified symbol"""
if not os.path.exists(os.path.join(base_dir, "{}.csv".format(str(symbol)))):
start = dt.datetime(2010, 1, 1)
end = dt.datetime(2017, 1, 1)
df = web.DataReader(symbol, 'google', start, end)
df.to_csv(os.path.join(base_dir,"{}.csv".format(str(symbol))))
return os.path.join(base_dir, "{}.csv".format(str(symbol)))
In [20]:
# Test Function
# Case 1: Existing one (AAPL)
ex_path = symbol_to_path('AAPL')
print(ex_path)
# Case 2: Not existing one (HOLX)
ex_path = symbol_to_path('HOLX')
print(ex_path)
os.remove(ex_path)
In [21]:
def get_data(symbols, dates):
"""Read stock data (close) for given symbols for given period of time"""
df = pd.DataFrame(index=dates)
if 'SPY' not in symbols: # add SPY for reference, if absent
symbols.insert(0, 'SPY')
for symbol in symbols:
# TODO: Read and join data for each symbol
temp_df = pd.read_csv(symbol_to_path(symbol), index_col='Date',
parse_dates=True, usecols=['Date', 'Close'],
na_values=['nan'])
temp_df = temp_df.rename(columns={'Close': symbol})
df = df.join(temp_df, how='inner')
return df
In [22]:
# Test function
# Define a date range
dates = pd.date_range('2010-01-1', '2010-12-31')
# Choose stock symbols to read
symbols = ['GOOG', 'IBM', 'GLD']
# Get stock data
df = get_data(symbols, dates)
print(df)
In [23]:
# Plot data
def plot_data(df, title="Stock prices"):
"""Plot stock prices"""
ax = df.plot(title=title, fontsize=10)
ax.set_xlabel("Date")
ax.set_ylabel("Price")
plt.show()
In [24]:
plot_data(df)
In [25]:
def plot_selected(df, columns, start_index, end_index):
"""Plot selected symbols over selected time"""
plot_data(df.ix[start_index:endIindex, columns], title="Selected Data")
In [26]:
def normalize_data(df):
"""Normalize stock price using first row"""
return df / df.iloc[0,:]
In [27]:
# Basic notation
#nd1[row, col]
#nd1[row_start:row_end,col_start:col_end]
# By using just ":", it means all row or col
# Last row can be expressed with "-1" (Cound back from END)
# Create array
print(np.array([(2, 3, 4),(5, 6, 7)]))
In [28]:
# Create empty array
print(np.empty((5, 4)))
In [29]:
# Create array with ones
print(np.ones((5, 4)))
In [30]:
# Note that default np array is float
# To make it int
print(np.ones((5, 4), dtype=np.int_))
In [31]:
# Generate random value
print( np.random.random((5, 4)) )
In [32]:
# You can also generate using following syntax
a = np.random.rand(5, 4)
print( a )
In [33]:
# Show the shape of array
print(a.shape)
# Number of row
print(a.shape[0])
# Number of col
print(a.shape[1])
In [ ]:
# Get number of elements
print(a.size)
Array Operations
In [ ]:
print( "Sum of array elements: ", a.sum())
print( "Sum of each col: ", a.sum(axis=0))
print( "Sum of each row: ", a.sum(axis=1))
In [ ]:
# Print first 5 row to remember data then plot
print(df.head())
plot_data(df)
Global Statistics
In [ ]:
# Calculate mean
print( df.mean() )
In [ ]:
# Calculate median
print( df.median() )
In [ ]:
# Calculate std
print( df.std() )
Rolling Statistics
In [ ]:
# Bollinger Bands
# If stock price is go away more than 2 std from rolling mean, then
# attention should be paid
# Calculate rolling mean
ax = df['SPY'].plot(title="SPY rolling mean", label='SPY')
rm_SPY = df['SPY'].rolling(window=20).mean() #calculating
rm_SPY.plot(label='Rolling mean', ax=ax)
ax.set_xlabel("Date")
ax.set_ylabel("Price")
ax.legend(loc='upper left')
plt.show()
In [ ]:
def get_rolling_mean(values, window):
"""Return rolling mean of given values, using specific window size"""
return values.rolling(window=window).mean()
def get_rolling_std(values, window):
"""Return rolling std of given values, using specific window size"""
return values.rolling(window=window).std()
def get_bollinger_bands(rm, rstd):
"""Return upper and lower band of Billinger Bands"""
upper_band = rm + 2 * rstd
lower_band = rm - 2 * rstd
return upper_band, lower_band
In [ ]:
# Test function
rm_SPY = get_rolling_mean(df['SPY'], window=20)
rstd_SPY = get_rolling_std(df['SPY'], window=20)
upper_band, lower_band = get_bollinger_bands(rm_SPY, rstd_SPY)
ax = df['SPY'].plot(title="SPY rolling mean", label='SPY')
rm_SPY.plot(label='Rolling mean', ax=ax)
upper_band.plot(label='upper band', ax=ax)
lower_band.plot(label='lower band', ax=ax)
ax.set_xlabel("Date")
ax.set_ylabel("Price")
ax.legend(loc='upper left')
plt.show()
Daily Returns
In [ ]:
def compute_daily_returns(df):
daily_returns = df.copy()
daily_returns[1:] = (df[1:] / df[:-1].values) - 1
daily_returns.iloc[0, :] = 0
return daily_returns
# Below is much easier implementation
def compute_daily_returns_2(df):
daily_returns = (df / df.shift(1)) - 1
daily_returns.iloc[0] = 0
return daily_returns
In [ ]:
# Test function
df1 = df['SPY']
plot_data(compute_daily_returns_2(df1))
Cumulative Returns
In [ ]:
# cumulative_return[t] = (price[t]/price[0]) - 1
def compute_cumulative_returns(df):
cumulative_returns = (df / df.iloc[0]) - 1
cumulative_returns.iloc[0] = 0
return cumulative_returns
In [ ]:
# Test function
plot_data(compute_cumulative_returns(df1))
In [ ]:
# When there is missing data in between, we continue last known price until next known
# price is available
# Fill forward first, then fill backward
#fillna(method='ffill')
Kurtosis: The difference from normal distrubution.
- Positive Kurtosis: Fat tail
- Negative Kurtosis: Skinny tail
In [ ]:
# Plotting Histogram
# 1. First let's review stock data
dates = pd.date_range('2009-01-01', '2012-12-31')
symbols = ['SPY']
df = get_data(symbols, dates)
plot_data(df)
In [ ]:
# Compute daily returns
daily_returns = compute_daily_returns_2(df)
plot_data(daily_returns, title="Daily Returns")
In [ ]:
# Plot Histogram
daily_returns.hist(bins=20);
In [ ]:
# Get mean and std
mean = daily_returns['SPY'].mean()
print("mean=",mean)
std = daily_returns['SPY'].std()
print("std=",std)
In [ ]:
# Add information to histogram
daily_returns.hist(bins=20);
plt.axvline(mean, color='w', linestyle='dashed', linewidth=2);
plt.axvline(std, color='r', linestyle='dashed', linewidth=2);
plt.axvline(-std, color='r', linestyle='dashed', linewidth=2);
In [ ]:
# Compute kurtosis for the plot
print(daily_returns.kurtosis())
In [ ]:
# Plotting two stock information
symbols = ['SPY', 'XOM']
df = get_data(symbols, dates)
plot_data(df)
In [ ]:
daily_returns = compute_daily_returns_2(df)
plot_data(daily_returns, title="Daily Returns")
In [ ]:
# It shows in two separate chart
daily_returns.hist(bins=20);
In [ ]:
daily_returns['SPY'].hist(bins=20, label="SPY")
daily_returns['XOM'].hist(bins=20, label="XOM")
plt.legend(loc='upper right')
plt.show()
In [ ]:
# Scatter Plot
symbols = ['SPY', 'XOM', 'GLD']
df = get_data(symbols, dates)
plot_data(df)
In [ ]:
daily_returns = compute_daily_returns_2(df)
# Scatter plot (SPY vs XOM)
daily_returns.plot(kind='scatter',x='SPY',y='XOM');
beta_XOM,alpha_XOM= np.polyfit(daily_returns['SPY'],
daily_returns['XOM'], 1)
plt.plot(daily_returns['SPY'],
beta_XOM * daily_returns['SPY'] + alpha_XOM,
'-',color='r');
print("beta_XOM=", beta_XOM)
print("alpha_XOM=", alpha_XOM)
In [ ]:
# Scatter plot (SPY vs GLD)
daily_returns.plot(kind='scatter',x='SPY',y='GLD');
beta_GLD,alpha_GLD= np.polyfit(daily_returns['SPY'],
daily_returns['GLD'], 1)
plt.plot(daily_returns['SPY'],
beta_GLD * daily_returns['SPY'] + alpha_GLD,
'-',color='r');
print("beta_GLD=", beta_GLD)
print("alpha_GLD=", alpha_GLD)
In [ ]:
# Calculate correlation
print(daily_returns.corr(method='pearson'))
Example Let's calculate the value of portfolio for following example.
In [ ]:
# Get necessary data
dates = pd.date_range('2009-1-1', '2011-12-31')
symbols = ['SPY','XOM','GOOG','GLD']
df = get_data(symbols, dates)
plot_data(df)
print(df.head())
In [ ]:
# Normalize data
df_norm = normalize_data(df)
print(df_norm.head())
In [ ]:
# Provide allocation
alloc = [0.4, 0.4, 0.1, 0.1]
df_alloc = df_norm * alloc
print(df_alloc.head())
In [ ]:
# Multiply by Fund
fund = 1000000
df_fund = df_alloc * fund
print(df_fund.head())
In [ ]:
# Calculate portfolio values
port_val = df_fund.sum(axis=1)
print(port_val.head())
plot_data(port_val, title="Portfolio price")
In [ ]:
# Calculate daily returns
daily_returns = compute_daily_returns_2(port_val)
print(daily_returns.head())
# Remove first value as it is always 0
daily_returns = daily_returns[1:]
print(daily_returns.head())
In [ ]:
# Calculate major statistics
#
# - Cumulative Returns
cum_return = compute_cumulative_returns(port_val)
plot_data(cum_return, title="Cumulative Return")
# - Average Daily Return
avg_daily_return = daily_returns.mean()
print("Average Daily Return =", avg_daily_return)
# - Std Daily Return
std_daily_return = daily_returns.std()
print("Std Daily Return =",std_daily_return)
# (Display as hist graph)
ax = daily_returns.hist(bins=20);
ax.set_title("Daily Return")
plt.axvline(avg_daily_return, color='w', linestyle='dashed', linewidth=2);
plt.axvline(std_daily_return, color='r', linestyle='dashed', linewidth=2);
plt.axvline(-std_daily_return, color='r', linestyle='dashed', linewidth=2);
Sharpe Ratio => Risk adjusted return
SR also considers risk free rate of return
S = E(daily_return - daily_rf) / std(daily_return - daily_rf)
But it is safe to assume 0%
S = mean(daily_return) / std(daily_return)
But to be accurate daily_rf = root[252] (1.0 + 0.1) - 1
S = mean(daily_return - daily_rf) / std(daily_return)
But more SR can cary widely depending on how frequently you sample. SR is annual measure so you need to add adjustment value
SR_addualized = K * SR
where
k = sqrt(#samples per year)
so daily => sqrt(252), weekly => sqrt(52), monthly => sqrt(12)
Finally
SR = sqrt(252) * mean(daily_return - daily_rf) / std(daily_return)
In [ ]:
# Sharpe Ratio
sharpe_ratio = np.sqrt(252) * avg_daily_return / std_daily_return
print("Sharpe Ratio =", sharpe_ratio)
In [ ]: