YahooStocks Multiple Security Downloader


In [285]:
# Stock Downloader Yahoo In-Memory / Python 3
def YahooStocks(tickers = ["FB", "MSFT"], freq = "d", selectColumn = "AdjClose", fromDate = None, toDate = None, lagPeriods = 1):
    
    """
    NOTES:
    tickers = Defaults to ["MSFT","FB"], and uses a list as input
    freq = Defaults to "d" (Note: Can be "d"[daily],"m"[monthly],"q"[quarterly], or "v"[dividends])
    selectColumn = Defaults to "AdjClose". Can choose between "Open", "High", "Low", "Close", "Volume" or "AdjClose".
    fromDate: Defaults to 1yr lag. Must include YYYY/MM/DD or YYYY-MM-DD string as format.
    toDate: Defaults to 1yr lag. Must include YYYY/MM/DD or YYYY-MM-DD string as format.
    lagPeriods: Lag periods default to 1. If they are greater than the frame, a warning will be returned.
    
    """
    import pandas as pd
    import datetime
    # For downloader
    import urllib.request
    import io
    import urllib.error
    import numpy as np
    
    stackData = pd.DataFrame()  # Long ("stacked") frame: one block of rows per ticker
    
    tickers = sorted(tickers) # Sort up front so the merged columns come out in a stable order
    for ticker in tickers:
        
        if fromDate is None:
            # Default window: roughly one year back from today
            toDateInternal = datetime.datetime.now().date()
            fromDateInternal = toDateInternal - datetime.timedelta(days=360)

        else:
            # Accept either YYYY/MM/DD or YYYY-MM-DD strings
            fromDateInternal = datetime.datetime.strptime(fromDate.replace('-', '/'), "%Y/%m/%d").date()

            if toDate is None:
                toDateInternal = datetime.datetime.now().date()
            else:
                toDateInternal = datetime.datetime.strptime(str(toDate).replace('-', '/'), "%Y/%m/%d").date()
        
        # Downloader
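        # Yahoo's (now-retired) CSV endpoint encodes the request in the query string:
        #   s = ticker, a/b/c = from month/day/year, d/e/f = to month/day/year, g = frequency.
        # The a and d month parameters are zero-indexed (0 = January).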
        url = "http://real-chart.finance.yahoo.com/table.csv?s=" + ticker + "&a=0" + str(fromDateInternal.month) + "&b=" + \
            str(fromDateInternal.day) + "&c=" + str(fromDateInternal.year) + "&d=0" + str(toDateInternal.month) + "&e="  +  \
                str(toDateInternal.day) + "&f=" +  str(toDateInternal.year) + "&g=" + freq + "&ignore=.csv"

        #Checks if 404 error from no data in ticker
        try:
            
            # Download and read in as csv from memory
            xld = urllib.request.urlopen(url).read()
            xld = xld.decode('utf-8')
            xlds = io.StringIO(xld)
            data = pd.read_csv(xlds, delimiter=',', header=None, skiprows=1)
            xlds.close()
            
            data.columns = ["Date", "Open", "High", "Low", "Close", "Volume", "AdjClose"]

            data['Date'] = pd.to_datetime(data['Date'], format = "%Y-%m-%d")
            data[["Open", "High", "Low", "Close", "Volume", "AdjClose"]] = data[["Open", "High", "Low", "Close", "Volume", "AdjClose"]].astype(float)
            data["Security"] = ticker

            stackData = pd.concat([stackData, data])
            
            print(ticker, ": ", url)

            
        except urllib.error.HTTPError as err:
            if err.code == 404:
                print(ticker, ": No data for ticker exists in the given date range")
                continue
            else:
                raise
                
    ## Build the master date index
    # Union of all trading dates across the downloaded tickers
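    # Left-merging each ticker onto this index (below) produces one wide panel,
    # with NaN on dates where a given security has no observation.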
    uniqueDates = stackData.Date.map(lambda t: t.date()).unique()
    uniqueDates = pd.DataFrame(uniqueDates, columns = ['Date']).sort_values(by = "Date", ascending = False)
    uniqueDates['Date'] = pd.to_datetime(uniqueDates['Date'], format = "%Y-%m-%d")

    fullData = uniqueDates
    
    tickersNarrow = stackData.Security.unique() # Find usable tickers
    
    ## Loop through each ticker
    for tickerTemp in tickersNarrow:
        fullData = pd.merge(fullData, stackData[stackData.Security==tickerTemp][['Date',selectColumn]], on = "Date", how = 'left')
        fullData.rename(columns={selectColumn:(tickerTemp + "_" + selectColumn)}, inplace = True)     
        
    # Drop any rows with NaN
    fullDataClean = fullData.dropna()
    
    print()
    print( "fullData Dates range from: ", min(fullData.Date).date(), " to ", max(fullData.Date).date() )
    print( "fullDataClean Dates range from: ", min(fullDataClean.Date).date(), " to ", max(fullDataClean.Date).date() )
    print()
    print("Lengths of each security(for gauging optimal groups):")
    print(fullData.count())  # Non-null observations per column
    
    ## Create simple and log returns
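    # Rows are sorted newest-first, so pct_change(periods = -lagPeriods) compares each
    # row with the row lagPeriods positions below it, i.e. the earlier date.
    # The log version is ln(1 + simple return), so the two frames are consistent.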
    fullDataCleanLn = pd.concat([fullDataClean.Date, np.log(fullDataClean[fullDataClean.columns.difference(['Date'])].pct_change(periods = -lagPeriods) + 1)], axis=1).dropna()
    fullDataCleanSimple = pd.concat([fullDataClean.Date, fullDataClean[fullDataClean.columns.difference(['Date'])].pct_change(periods = -lagPeriods)], axis = 1).dropna()
    # Clean names of returns
    fullDataCleanSimple = fullDataCleanSimple.rename(columns={col: col.replace(selectColumn, "SimpleReturns") for col in fullDataCleanSimple.columns})
    fullDataCleanLn = fullDataCleanLn.rename(columns={col: col.replace(selectColumn, "LnReturns") for col in fullDataCleanLn.columns})

    if len(fullDataCleanLn) == 0:
        print()
        print('Warning: lagPeriods exceeds the usable sample; use a shorter lag to obtain a non-empty set of returns')
        
    ## Return all objects
    return fullData, fullDataClean, stackData, fullDataCleanSimple, fullDataCleanLn

In [299]:
## Run the Function

#     freq: Defaults to "d". Can be "d" (daily), "m" (monthly), "q" (quarterly), or "v" (dividends)
#     selectColumn: Defaults to "AdjClose". Choose from "Open", "High", "Low", "Close", "Volume", or "AdjClose"
#     fromDate: Defaults to roughly one year before today. Use a "YYYY/MM/DD" or "YYYY-MM-DD" string
#     toDate: Defaults to today. Use a "YYYY/MM/DD" or "YYYY-MM-DD" string
#     lagPeriods: Defaults to 1. If the lag exceeds the length of the frame, a warning is printed

tickers = ["GOOG", "C", "F", "MSFT"]
fullData, fullDataClean, stackData, fullDataCleanSimple, fullDataCleanLn = \
    YahooStocks(tickers, freq = "d", selectColumn = "AdjClose", fromDate = "1990/01/01", toDate = "2016/01/01", lagPeriods = 252)


C :  http://real-chart.finance.yahoo.com/table.csv?s=C&a=01&b=1&c=1990&d=01&e=1&f=2016&g=d&ignore=.csv
F :  http://real-chart.finance.yahoo.com/table.csv?s=F&a=01&b=1&c=1990&d=01&e=1&f=2016&g=d&ignore=.csv
GOOG :  http://real-chart.finance.yahoo.com/table.csv?s=GOOG&a=01&b=1&c=1990&d=01&e=1&f=2016&g=d&ignore=.csv
MSFT :  http://real-chart.finance.yahoo.com/table.csv?s=MSFT&a=01&b=1&c=1990&d=01&e=1&f=2016&g=d&ignore=.csv

fullData Dates range from:  1990-02-01  to  2016-02-01
fullDataClean Dates range from:  2004-08-19  to  2016-02-01

Lengths of each security (for gauging optimal groups):
Date             6551
C_AdjClose       6551
F_AdjClose       6551
GOOG_AdjClose    2883
MSFT_AdjClose    6551
dtype: int64

In [300]:
# print(fullData.columns)
# print(fullDataClean.columns)
# print(stackData.columns)
# print(fullDataCleanSimple.columns)
# print(fullDataCleanLn.columns)

In [301]:
fullDataClean.head()


Out[301]:
        Date  C_AdjClose  F_AdjClose  GOOG_AdjClose  MSFT_AdjClose
0 2016-02-01   42.435122   11.938327     752.000000      53.942693
1 2016-01-29   42.535019   11.809745     742.950012      54.317364
2 2016-01-28   40.347330   11.582255     730.960022      51.329861
3 2016-01-27   40.467206   11.720728     699.989990      50.501642
4 2016-01-26   40.407341   11.730619     713.039978      51.438315

In [302]:
fullDataClean.tail()


Out[302]:
           Date  C_AdjClose  F_AdjClose  GOOG_AdjClose  MSFT_AdjClose
2878 2004-08-25  379.959291   10.971897      52.947145      19.174246
2879 2004-08-24  376.840845   10.932599      52.382705      18.958492
2880 2004-08-23  376.020198   11.034773      54.645447      18.958492
2881 2004-08-20  377.087029   11.089790      54.100990      18.874975
2882 2004-08-19  373.065866   11.003335      50.119968      18.819461

Plot Multiline Data


In [303]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Nominal prices. pandas creates its own figure, so pass figsize to .plot()
# rather than calling plt.figure() beforehand (which leaves an empty figure behind)
fullDataClean.plot(x = 'Date', figsize=(20, 20))


Out[303]:
<matplotlib.axes._subplots.AxesSubplot at 0x296e5995208>

In [304]:
# Log-scaled prices: easier to view in tandem, since proportional moves
# are comparable across securities with very different price levels
fullDataClean.plot(x = 'Date', logy = True)


Out[304]:
<matplotlib.axes._subplots.AxesSubplot at 0x296eae5d588>

In [305]:
# Ln YOY Returns with 1.96 Standard Deviations
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
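# Under a normality assumption, the mean +/- 1.96 standard deviations brackets
# roughly 95% of observations, so points outside the thin red bands are outliers.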

for colName in fullDataCleanLn[fullDataCleanLn.columns.difference(['Date'])].columns:
    ts = fullDataCleanLn[["Date", colName]]
    ts.plot(kind='line', x = 'Date', linewidth = .3, figsize=(8,4))
    plt.axhline(y = np.mean(ts[colName], axis = 0), c = "red", linewidth = 2, zorder = 0)
    plt.axhline(y = np.mean(ts[colName], axis = 0) + 1.96*np.std(ts[colName], axis = 0), c = "red", linewidth = 1, zorder = 0)
    plt.axhline(y = np.mean(ts[colName], axis = 0) - 1.96*np.std(ts[colName], axis = 0), c = "red", linewidth = 1, zorder = 0)
    plt.axhline(y = 0, c = "green", linewidth = 1.5, zorder = 0)
    plt.show()


Save as Workbook


In [306]:
# Write to XLSX 
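# Note: writing .xlsx requires an Excel engine (openpyxl or xlsxwriter) to be installed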
writer = pd.ExcelWriter('MultipleSecurities.xlsx')
fullDataClean.to_excel(writer,'fullDataClean')
fullData.to_excel(writer,'fullData')
stackData.to_excel(writer,'stackData')
fullDataCleanSimple.to_excel(writer,'fullDataCleanSimple')
fullDataCleanLn.to_excel(writer,'fullDataCleanLn')
writer.save()
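
Note that Yahoo has since retired the real-chart.finance.yahoo.com endpoint, so the downloader above no longer works against the live service. Below is a minimal sketch of an equivalent fetch using the community-maintained yfinance package (an assumption, not part of the original notebook; its multi-ticker output uses a column MultiIndex keyed by price field):

In [ ]:
# Hypothetical modern replacement using yfinance (pip install yfinance)
import yfinance as yf

# One call fetches all tickers; auto_adjust=False keeps a separate "Adj Close" field
panel = yf.download(["C", "F", "GOOG", "MSFT"], start="1990-01-01", end="2016-01-01",
                    interval="1d", auto_adjust=False)
adjClose = panel["Adj Close"].dropna()  # Wide frame: one adjusted-close column per ticker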

In [ ]: