Reading and Plotting Stock Information


In [6]:
# Import modules
import pandas as pd
import pandas_datareader.data as web
import datetime as dt
import matplotlib.pyplot as plt
import os
import numpy as np
%matplotlib inline

Lesson 01-01


In [7]:
# Import AAPL stock data
start = dt.datetime(2010, 1, 1)
end = dt.datetime(2017, 1, 1)
dframe = web.DataReader('AAPL', 'google', start, end)
dframe.to_csv("../data/AAPL.csv", index_label="Date")

In [8]:
# Read data from CSV and print first 5 rows
df = pd.read_csv("../data/AAPL.csv")
print(df.head())


         Date   Open   High    Low  Close     Volume
0  2010-01-04  30.49  30.64  30.34  30.57  123432050
1  2010-01-05  30.66  30.80  30.46  30.63  150476004
2  2010-01-06  30.63  30.75  30.11  30.14  138039594
3  2010-01-07  30.25  30.29  29.86  30.08  119282324
4  2010-01-08  30.04  30.29  29.87  30.28  111969081

In [9]:
#Plot High price
df['High'].plot()
plt.show()


Joining Two Stock DataFrames

Lesson 01-02


In [10]:
# Specify a date range using pandas date_range
start_date = '2010-01-22'
end_date = '2010-01-26'
dates = pd.date_range(start_date, end_date)

# Create an empty DataFrame
df1 = pd.DataFrame(index=dates)

In [11]:
# Join the two DataFrames
df1 = df1.join(df)
print(df1)


                    Date  Open  High  Low  Close  Volume
2010-01-22 00:00:00  NaN   NaN   NaN  NaN    NaN     NaN
2010-01-23 00:00:00  NaN   NaN   NaN  NaN    NaN     NaN
2010-01-24 00:00:00  NaN   NaN   NaN  NaN    NaN     NaN
2010-01-25 00:00:00  NaN   NaN   NaN  NaN    NaN     NaN
2010-01-26 00:00:00  NaN   NaN   NaN  NaN    NaN     NaN

In [12]:
# The above shows all NaN because the AAPL data was read without Date as the index.
# Read the file again with the correct format
df = pd.read_csv("../data/AAPL.csv", index_col="Date", parse_dates=True)
print(df.head())


             Open   High    Low  Close     Volume
Date                                             
2010-01-04  30.49  30.64  30.34  30.57  123432050
2010-01-05  30.66  30.80  30.46  30.63  150476004
2010-01-06  30.63  30.75  30.11  30.14  138039594
2010-01-07  30.25  30.29  29.86  30.08  119282324
2010-01-08  30.04  30.29  29.87  30.28  111969081

In [13]:
# Now join again and print out
df1 = None
df1 = pd.DataFrame(index=dates)
df1 = df1.join(df)
print(df1)


             Open   High    Low  Close       Volume
2010-01-22  29.54  29.64  28.17  28.25  220441872.0
2010-01-23    NaN    NaN    NaN    NaN          NaN
2010-01-24    NaN    NaN    NaN    NaN          NaN
2010-01-25  28.93  29.24  28.60  29.01  266424802.0
2010-01-26  29.42  30.53  28.94  29.42  466777486.0

In [14]:
# Let's remove rows that have NaN
df1 = df1.dropna()
print(df1)


             Open   High    Low  Close       Volume
2010-01-22  29.54  29.64  28.17  28.25  220441872.0
2010-01-25  28.93  29.24  28.60  29.01  266424802.0
2010-01-26  29.42  30.53  28.94  29.42  466777486.0

In [15]:
# Note that the above two operations can be done with the following command
# df1 = df1.join(df, how='inner')
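
As a quick sanity check (an extra cell, not from the original lesson), the one-step version gives the same three rows:

In [ ]:
# One-step equivalent of join + dropna
df1 = pd.DataFrame(index=dates)
df1 = df1.join(df, how='inner')
print(df1)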

Example: Create Stock Information Using Close Prices


In [16]:
# First, let's download stock information
tickers = ['AAPL', 'SPY', 'IBM', 'GOOG', 'GLD']
for ticker in tickers:
    df = web.DataReader(ticker, 'google', start, end)
    df.to_csv("../data/{}.csv".format(ticker), index_label="Date")

In [17]:
# Create a DataFrame with the desired date range
df1 = None
df1 = pd.DataFrame(index=dates)

In [18]:
# Join all stock information
for ticker in tickers:
    df_temp = pd.read_csv("../data/{}.csv".format(ticker), index_col='Date',
                         parse_dates=True, usecols=['Date', 'Close'],
                         na_values=['nan'])
    # Rename the Close column to the ticker symbol
    df_temp = df_temp.rename(columns={'Close': ticker})
    df1 = df1.join(df_temp)
print(df1)


             AAPL     SPY     IBM    GOOG     GLD
2010-01-22  28.25  109.21  125.50  274.73  107.17
2010-01-23    NaN     NaN     NaN     NaN     NaN
2010-01-24    NaN     NaN     NaN     NaN     NaN
2010-01-25  29.01  109.77  126.12  269.73  107.48
2010-01-26  29.42  109.31  125.75  270.94  107.56

Utility Functions to Read Data


In [19]:
# This function returns the CSV path for the specified symbol.
# If the CSV is not available locally, download the data, save it,
# then return the path
def symbol_to_path(symbol, base_dir="../data"):
    """Return CSV path to specified symbol"""
    if not os.path.exists(os.path.join(base_dir, "{}.csv".format(str(symbol)))):
        start = dt.datetime(2010, 1, 1)
        end = dt.datetime(2017, 1, 1)
        df = web.DataReader(symbol, 'google', start, end)
        df.to_csv(os.path.join(base_dir,"{}.csv".format(str(symbol))))
    return os.path.join(base_dir, "{}.csv".format(str(symbol)))

In [20]:
# Test Function
# Case 1: Existing one (AAPL)
ex_path = symbol_to_path('AAPL')
print(ex_path)
# Case 2: Non-existing one (HOLX)
ex_path = symbol_to_path('HOLX')
print(ex_path)
os.remove(ex_path)


../data/AAPL.csv
../data/HOLX.csv

In [21]:
def get_data(symbols, dates):
    """Read stock data (close) for given symbols for given period of time"""
    df = pd.DataFrame(index=dates)
    if 'SPY' not in symbols:  # add SPY for reference, if absent
        symbols.insert(0, 'SPY')

    for symbol in symbols:
        # Read and join data for each symbol
        temp_df = pd.read_csv(symbol_to_path(symbol), index_col='Date',
                         parse_dates=True, usecols=['Date', 'Close'],
                         na_values=['nan'])
        temp_df = temp_df.rename(columns={'Close': symbol})
        df = df.join(temp_df, how='inner')
    return df

In [22]:
# Test function
# Define a date range
dates = pd.date_range('2010-01-1', '2010-12-31')
# Choose stock symbols to read
symbols = ['GOOG', 'IBM', 'GLD']
# Get stock data
df = get_data(symbols, dates)
print(df)


               SPY    GOOG     IBM     GLD
2010-01-04  113.33  313.06  132.45  109.80
2010-01-05  113.63  311.68  130.85  109.70
2010-01-06  113.71  303.83  130.00  111.51
2010-01-07  114.19  296.75  129.55  110.82
2010-01-08  114.57  300.71  130.85  111.37
2010-01-11  114.73  300.25  129.48  112.85
2010-01-12  113.66  294.94  130.51  110.49
2010-01-13  114.62  293.25  130.23  111.54
2010-01-14  114.93  294.63  132.31  112.03
2010-01-15  113.64  289.71  131.78  110.86
2010-01-19  115.06  293.52  134.14  111.52
2010-01-20  113.89  289.91  130.25  108.94
2010-01-21  111.70  291.20  129.00  107.37
2010-01-22  109.21  274.73  125.50  107.17
2010-01-25  109.77  269.73  126.12  107.48
2010-01-26  109.31  270.94  125.75  107.56
2010-01-27  109.83  270.78  126.33  106.53
2010-01-28  108.57  266.88  123.75  106.48
2010-01-29  107.39  264.71  122.39  105.96
2010-02-01  109.06  266.24  124.67  108.35
2010-02-02  110.38  265.29  125.53  109.13
2010-02-03  109.83  270.14  125.66  108.70
2010-02-04  106.44  263.13  123.00  104.37
2010-02-05  106.66  265.38  123.52  104.68
2010-02-08  105.89  266.47  121.88  104.04
2010-02-09  107.22  267.95  123.21  105.41
2010-02-10  107.01  266.96  122.81  105.12
2010-02-11  108.13  267.93  123.73  107.13
2010-02-12  108.04  266.29  124.00  107.04
2010-02-16  109.74  270.38  125.23  109.62
...            ...     ...     ...     ...
2010-11-18  119.96  297.98  144.36  132.09
2010-11-19  120.29  295.12  145.05  132.20
2010-11-22  120.19  295.31  145.39  133.48
2010-11-23  118.45  291.21  143.18  134.41
2010-11-24  120.20  297.19  145.81  134.18
2010-11-26  118.80  294.70  143.90  133.11
2010-11-29  119.16  290.76  142.89  133.51
2010-11-30  118.49  277.58  141.46  135.42
2010-12-01  121.01  281.89  144.41  135.38
2010-12-02  122.56  285.62  145.18  135.20
2010-12-03  122.89  286.21  145.38  138.07
2010-12-06  122.76  288.89  144.99  139.11
2010-12-07  122.83  293.28  144.02  136.50
2010-12-08  123.28  294.97  144.98  134.79
2010-12-09  123.76  295.45  144.30  135.37
2010-12-10  124.48  295.81  144.82  135.41
2010-12-13  124.56  297.01  144.28  136.05
2010-12-14  124.67  297.16  145.82  136.18
2010-12-15  124.10  294.85  144.72  134.70
2010-12-16  124.82  295.56  144.55  133.81
2010-12-17  124.30  295.10  145.00  134.20
2010-12-20  124.60  297.23  144.51  135.11
2010-12-21  125.39  301.23  145.74  135.32
2010-12-22  125.78  302.44  145.95  135.05
2010-12-23  125.60  301.81  145.89  134.66
2010-12-27  125.65  300.89  145.34  135.02
2010-12-28  125.83  299.16  145.71  137.22
2010-12-29  125.92  300.20  146.52  137.71
2010-12-30  125.72  299.13  146.67  137.03
2010-12-31  125.75  296.69  146.76  138.72

[251 rows x 4 columns]

In [23]:
# Plot data
def plot_data(df, title="Stock prices"):
    """Plot stock prices"""
    ax = df.plot(title=title, fontsize=10)
    ax.set_xlabel("Date")
    ax.set_ylabel("Price")
    plt.show()

In [24]:
plot_data(df)



In [25]:
def plot_selected(df, columns, start_index, end_index):
    """Plot selected symbols over a selected time range"""
    plot_data(df.loc[start_index:end_index, columns], title="Selected Data")
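
For example, a call like the following (the column list and date range are illustrative, not from the original lesson) plots just two symbols over one month:

In [ ]:
# Plot only SPY and IBM over March 2010 (illustrative example)
plot_selected(df, ['SPY', 'IBM'], '2010-03-01', '2010-03-31')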

In [26]:
def normalize_data(df):
    """Normalize stock price using first row"""
    return df / df.iloc[0,:]

Power of NumPy

Lesson 01-03


In [27]:
# Basic notation
# nd1[row, col]
# nd1[row_start:row_end, col_start:col_end]
# Using just ":" means all rows or all columns
# The last row or column can be expressed with "-1" (count back from the END)

# Create array
print(np.array([(2, 3, 4),(5, 6, 7)]))


[[2 3 4]
 [5 6 7]]
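
A minimal sketch of the slicing notation described above, using the same small array:

In [ ]:
# Slicing examples
nd1 = np.array([(2, 3, 4), (5, 6, 7)])
print(nd1[0, 2])      # element at row 0, col 2 -> 4
print(nd1[:, 1])      # all rows, col 1 -> [3 6]
print(nd1[-1, :])     # last row -> [5 6 7]
print(nd1[0:2, 1:3])  # rows 0-1, cols 1-2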

In [28]:
# Create an uninitialized (empty) array; values are whatever happens to be in memory
print(np.empty((5, 4)))


[[  0.1    0.15   0.2    0.25]
 [  0.3    0.4    0.5    0.6 ]
 [  0.8    1.     1.5    2.  ]
 [  2.5    3.     4.     5.  ]
 [  6.     8.    10.    15.  ]]

In [29]:
# Create array with ones
print(np.ones((5, 4)))


[[ 1.  1.  1.  1.]
 [ 1.  1.  1.  1.]
 [ 1.  1.  1.  1.]
 [ 1.  1.  1.  1.]
 [ 1.  1.  1.  1.]]

In [30]:
# Note that the default dtype of a NumPy array is float
# To make it int:
print(np.ones((5, 4), dtype=np.int_))


[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]

In [31]:
# Generate random values (uniformly sampled from [0.0, 1.0))
print( np.random.random((5, 4)) )


[[ 0.64844051  0.32914323  0.21612129  0.07502843]
 [ 0.55255307  0.39136977  0.89880417  0.23269751]
 [ 0.80907612  0.75690923  0.92205924  0.44301284]
 [ 0.65137787  0.60270333  0.51494326  0.55435674]
 [ 0.02235808  0.16681464  0.63800818  0.1264242 ]]

In [32]:
# You can also generate them using the following syntax
a = np.random.rand(5, 4)
print( a )


[[ 0.17181084  0.75632324  0.46740555  0.9888577 ]
 [ 0.68663858  0.98860205  0.16793841  0.32285166]
 [ 0.81512306  0.9423592   0.66667053  0.78095287]
 [ 0.98536371  0.11735136  0.73054243  0.21015268]
 [ 0.93039176  0.95237867  0.15542317  0.71463687]]

In [33]:
# Show the shape of the array
print(a.shape)
# Number of rows
print(a.shape[0])
# Number of columns
print(a.shape[1])


(5, 4)
5
4

In [ ]:
# Get number of elements
print(a.size)


20

Array Operations


In [ ]:
print( "Sum of array elements: ", a.sum())
print( "Sum of each col: ", a.sum(axis=0))
print( "Sum of each row: ", a.sum(axis=1))

Statistical Analysis

Lesson 01-04


In [ ]:
# Print the first 5 rows to recall the data, then plot
print(df.head())
plot_data(df)

Global Statistics


In [ ]:
# Calculate mean
print( df.mean() )

In [ ]:
# Calculate median
print( df.median() )

In [ ]:
# Calculate std
print( df.std() )

Rolling Statistics


In [ ]:
# Bollinger Bands
# If the stock price moves more than 2 standard deviations away from the
# rolling mean, attention should be paid

# Calculate rolling mean
ax = df['SPY'].plot(title="SPY rolling mean", label='SPY')
rm_SPY = df['SPY'].rolling(window=20).mean()  # 20-day rolling mean
rm_SPY.plot(label='Rolling mean', ax=ax)
ax.set_xlabel("Date")
ax.set_ylabel("Price")
ax.legend(loc='upper left')
plt.show()

In [ ]:
def get_rolling_mean(values, window):
    """Return rolling mean of given values, using specific window size"""
    return values.rolling(window=window).mean()

def get_rolling_std(values, window):
    """Return rolling std of given values, using specific window size"""
    return values.rolling(window=window).std()

def get_bollinger_bands(rm, rstd):
    """Return the upper and lower Bollinger Bands"""
    upper_band = rm + 2 * rstd
    lower_band = rm - 2 * rstd
    return upper_band, lower_band

In [ ]:
# Test function
rm_SPY = get_rolling_mean(df['SPY'], window=20)
rstd_SPY = get_rolling_std(df['SPY'], window=20)
upper_band, lower_band = get_bollinger_bands(rm_SPY, rstd_SPY)

ax = df['SPY'].plot(title="SPY rolling mean", label='SPY')
rm_SPY.plot(label='Rolling mean', ax=ax)
upper_band.plot(label='upper band', ax=ax)
lower_band.plot(label='lower band', ax=ax)
ax.set_xlabel("Date")
ax.set_ylabel("Price")
ax.legend(loc='upper left')
plt.show()

Daily Returns


In [ ]:
def compute_daily_returns(df):
    """Compute daily returns: (price[t] / price[t-1]) - 1"""
    daily_returns = df.copy()
    daily_returns[1:] = (df[1:] / df[:-1].values) - 1
    daily_returns.iloc[0, :] = 0  # the first day has no previous price
    return daily_returns

# Below is a much simpler implementation
def compute_daily_returns_2(df):
    """Compute daily returns using shift()"""
    daily_returns = (df / df.shift(1)) - 1
    daily_returns.iloc[0] = 0
    return daily_returns

In [ ]:
# Test function
df1 = df['SPY']
plot_data(compute_daily_returns_2(df1))

Cumulative Returns


In [ ]:
# cumulative_return[t] = (price[t]/price[0]) - 1
def compute_cumulative_returns(df):
    cumulative_returns = (df / df.iloc[0]) - 1
    cumulative_returns.iloc[0] = 0
    return cumulative_returns

In [ ]:
# Test function
plot_data(compute_cumulative_returns(df1))

Incomplete Data

Lesson 01-05


In [ ]:
# When data is missing in the middle of a series, carry the last known price
# forward until the next known price is available.
# Fill forward first, then fill backward (to cover a gap at the start):
# fillna(method='ffill')
# fillna(method='bfill')
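
A minimal sketch of the idea on a small made-up series (the values here are illustrative):

In [ ]:
# Fill a gap in the middle, then a leading gap
s = pd.Series([np.nan, 10.0, np.nan, np.nan, 12.0, 13.0])
s = s.fillna(method='ffill')  # carry the last known value forward
s = s.fillna(method='bfill')  # fill any remaining leading NaN backward
print(s)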

Histograms and Scatter Plots

Lesson 01-06

Kurtosis: the difference from a normal distribution.

- Positive kurtosis: fat tails
- Negative kurtosis: skinny tails

In [ ]:
# Plotting Histogram
# 1. First let's review stock data
dates = pd.date_range('2009-01-01', '2012-12-31')
symbols = ['SPY']
df = get_data(symbols, dates)
plot_data(df)

In [ ]:
# Compute daily returns
daily_returns = compute_daily_returns_2(df)
plot_data(daily_returns, title="Daily Returns")

In [ ]:
# Plot Histogram
daily_returns.hist(bins=20);

In [ ]:
# Get mean and std
mean = daily_returns['SPY'].mean()
print("mean=",mean)
std = daily_returns['SPY'].std()
print("std=",std)

In [ ]:
# Add information to histogram
daily_returns.hist(bins=20);
plt.axvline(mean, color='w', linestyle='dashed', linewidth=2);
plt.axvline(std, color='r', linestyle='dashed', linewidth=2);
plt.axvline(-std, color='r', linestyle='dashed', linewidth=2);

In [ ]:
# Compute kurtosis for the plot
print(daily_returns.kurtosis())

In [ ]:
# Plotting two stocks' information
symbols = ['SPY', 'XOM']
df = get_data(symbols, dates)
plot_data(df)

In [ ]:
daily_returns = compute_daily_returns_2(df)
plot_data(daily_returns, title="Daily Returns")

In [ ]:
# This shows them in two separate charts
daily_returns.hist(bins=20);

In [ ]:
daily_returns['SPY'].hist(bins=20, label="SPY")
daily_returns['XOM'].hist(bins=20, label="XOM")
plt.legend(loc='upper right')
plt.show()

In [ ]:
# Scatter Plot
symbols = ['SPY', 'XOM', 'GLD']
df = get_data(symbols, dates)
plot_data(df)

In [ ]:
daily_returns = compute_daily_returns_2(df)
# Scatter plot (SPY vs XOM)
daily_returns.plot(kind='scatter',x='SPY',y='XOM');
beta_XOM,alpha_XOM= np.polyfit(daily_returns['SPY'],
                               daily_returns['XOM'], 1)
plt.plot(daily_returns['SPY'], 
         beta_XOM * daily_returns['SPY'] + alpha_XOM,
         '-',color='r');
print("beta_XOM=", beta_XOM)
print("alpha_XOM=", alpha_XOM)

In [ ]:
# Scatter plot (SPY vs GLD)
daily_returns.plot(kind='scatter',x='SPY',y='GLD');
beta_GLD,alpha_GLD= np.polyfit(daily_returns['SPY'],
                               daily_returns['GLD'], 1)
plt.plot(daily_returns['SPY'],
         beta_GLD * daily_returns['SPY'] + alpha_GLD,
         '-',color='r');
print("beta_GLD=", beta_GLD)
print("alpha_GLD=", alpha_GLD)

In [ ]:
# Calculate correlation
print(daily_returns.corr(method='pearson'))

Sharpe Ratio and Other Portfolio Statistics

Lesson 01-07

Example: Let's calculate the value of a portfolio with the following parameters.

  • Start Value = $1,000,000
  • Start Date = 2009-1-1
  • End Date = 2011-12-31
  • Symbols = ['SPY','XOM','GOOG','GLD']
  • Allocation = [0.4, 0.4, 0.1, 0.1]

In [ ]:
# Get necessary data
dates = pd.date_range('2009-1-1', '2011-12-31')
symbols = ['SPY','XOM','GOOG','GLD']
df = get_data(symbols, dates)
plot_data(df)
print(df.head())

In [ ]:
# Normalize data
df_norm = normalize_data(df)
print(df_norm.head())

In [ ]:
# Provide allocation
alloc = [0.4, 0.4, 0.1, 0.1]
df_alloc = df_norm * alloc
print(df_alloc.head())

In [ ]:
# Multiply by the fund's starting value
fund = 1000000
df_fund = df_alloc * fund
print(df_fund.head())

In [ ]:
# Calculate portfolio values
port_val = df_fund.sum(axis=1)
print(port_val.head())
plot_data(port_val, title="Portfolio price")

In [ ]:
# Calculate daily returns
daily_returns = compute_daily_returns_2(port_val)
print(daily_returns.head())
# Remove first value as it is always 0
daily_returns = daily_returns[1:]
print(daily_returns.head())

In [ ]:
# Calculate major statistics
#
# - Cumulative Returns
cum_return = compute_cumulative_returns(port_val)
plot_data(cum_return, title="Cumulative Return")
# - Average Daily Return
avg_daily_return = daily_returns.mean()
print("Average Daily Return =", avg_daily_return)
# - Std Daily Return
std_daily_return = daily_returns.std()
print("Std Daily Return =",std_daily_return)
# (Display as hist graph)
ax = daily_returns.hist(bins=20);
ax.set_title("Daily Return")
plt.axvline(avg_daily_return, color='w', linestyle='dashed', linewidth=2);
plt.axvline(std_daily_return, color='r', linestyle='dashed', linewidth=2);
plt.axvline(-std_daily_return, color='r', linestyle='dashed', linewidth=2);

Sharpe Ratio => risk-adjusted return

  • Lower risk is better
  • Higher return is better

The Sharpe Ratio also accounts for the risk-free rate of return:

S = E(daily_return - daily_rf) / std(daily_return - daily_rf)

Because the risk-free rate is essentially constant, subtracting it does not change the standard deviation, and it is often safe to assume it is 0%:

S = mean(daily_return) / std(daily_return)

To be more accurate, convert an annual risk-free rate (e.g. 10%) to a daily rate:

daily_rf = (1.0 + 0.1)^(1/252) - 1

S = mean(daily_return - daily_rf) / std(daily_return)

The SR can vary widely depending on how frequently you sample. The SR is an annualized measure, so you need an adjustment factor:

SR_annualized = k * SR

where

k = sqrt(#samples per year)

so daily => sqrt(252), weekly => sqrt(52), monthly => sqrt(12).

Finally, for daily samples:

SR = sqrt(252) * mean(daily_return - daily_rf) / std(daily_return)

In [ ]:
# Sharpe Ratio
sharpe_ratio = np.sqrt(252) * avg_daily_return / std_daily_return
print("Sharpe Ratio =", sharpe_ratio)

In [ ]: