Part 5

The following parts of the finance tutorial are better suited for a separate notebook, since this might run for a while, and it's a one-time operation.


In [1]:
import bs4 as bs
import pickle
import requests

In [18]:
def save_sp500_tickers():
    resp = requests.get("https://en.wikipedia.org/wiki/List_of_S%26P_500_companies")
    soup = bs.BeautifulSoup(resp.text, "html.parser")
    table = soup.find('table', {"class":"wikitable sortable"})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text
        mapping = str.maketrans(".","-")
        ticker = ticker.translate(mapping)
        tickers.append(ticker)
        
    with open('data/sp500tickers.pkl', "wb") as f:
        pickle.dump(tickers, f)
        
    return tickers

In [19]:
save_sp500_tickers()


Out[19]:
['MMM',
 'ABT',
 'ABBV',
 'ACN',
 'ATVI',
 'AYI',
 'ADBE',
 'AAP',
 'AES',
 'AET',
 'AMG',
 'AFL',
 'A',
 'APD',
 'AKAM',
 'ALK',
 'ALB',
 'AGN',
 'LNT',
 'ALXN',
 'ALLE',
 'ADS',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AEE',
 'AAL',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'ABC',
 'AME',
 'AMGN',
 'APH',
 'APC',
 'ADI',
 'ANTM',
 'AON',
 'APA',
 'AIV',
 'AAPL',
 'AMAT',
 'ADM',
 'ARNC',
 'AJG',
 'AIZ',
 'T',
 'ADSK',
 'ADP',
 'AN',
 'AZO',
 'AVB',
 'AVY',
 'BHI',
 'BLL',
 'BAC',
 'BK',
 'BCR',
 'BAX',
 'BBT',
 'BDX',
 'BBBY',
 'BRK-B',
 'BBY',
 'BIIB',
 'BLK',
 'HRB',
 'BA',
 'BWA',
 'BXP',
 'BSX',
 'BMY',
 'AVGO',
 'BF-B',
 'CHRW',
 'CA',
 'COG',
 'CPB',
 'COF',
 'CAH',
 'HSIC',
 'KMX',
 'CCL',
 'CAT',
 'CBG',
 'CBS',
 'CELG',
 'CNC',
 'CNP',
 'CTL',
 'CERN',
 'CF',
 'SCHW',
 'CHTR',
 'CHK',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'XEC',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CTXS',
 'CLX',
 'CME',
 'CMS',
 'COH',
 'KO',
 'CTSH',
 'CL',
 'CMCSA',
 'CMA',
 'CAG',
 'CXO',
 'COP',
 'ED',
 'STZ',
 'GLW',
 'COST',
 'COTY',
 'CCI',
 'CSRA',
 'CSX',
 'CMI',
 'CVS',
 'DHI',
 'DHR',
 'DRI',
 'DVA',
 'DE',
 'DLPH',
 'DAL',
 'XRAY',
 'DVN',
 'DLR',
 'DFS',
 'DISCA',
 'DISCK',
 'DG',
 'DLTR',
 'D',
 'DOV',
 'DOW',
 'DPS',
 'DTE',
 'DD',
 'DUK',
 'DNB',
 'ETFC',
 'EMN',
 'ETN',
 'EBAY',
 'ECL',
 'EIX',
 'EW',
 'EA',
 'EMR',
 'ENDP',
 'ETR',
 'EVHC',
 'EOG',
 'EQT',
 'EFX',
 'EQIX',
 'EQR',
 'ESS',
 'EL',
 'ES',
 'EXC',
 'EXPE',
 'EXPD',
 'ESRX',
 'EXR',
 'XOM',
 'FFIV',
 'FB',
 'FAST',
 'FRT',
 'FDX',
 'FIS',
 'FITB',
 'FSLR',
 'FE',
 'FISV',
 'FLIR',
 'FLS',
 'FLR',
 'FMC',
 'FTI',
 'FL',
 'F',
 'FTV',
 'FBHS',
 'BEN',
 'FCX',
 'FTR',
 'GPS',
 'GRMN',
 'GD',
 'GE',
 'GGP',
 'GIS',
 'GM',
 'GPC',
 'GILD',
 'GPN',
 'GS',
 'GT',
 'GWW',
 'HAL',
 'HBI',
 'HOG',
 'HAR',
 'HRS',
 'HIG',
 'HAS',
 'HCA',
 'HCP',
 'HP',
 'HES',
 'HPE',
 'HOLX',
 'HD',
 'HON',
 'HRL',
 'HST',
 'HPQ',
 'HUM',
 'HBAN',
 'IDXX',
 'ITW',
 'ILMN',
 'IR',
 'INTC',
 'ICE',
 'IBM',
 'IP',
 'IPG',
 'IFF',
 'INTU',
 'ISRG',
 'IVZ',
 'IRM',
 'JEC',
 'JBHT',
 'SJM',
 'JNJ',
 'JCI',
 'JPM',
 'JNPR',
 'KSU',
 'K',
 'KEY',
 'KMB',
 'KIM',
 'KMI',
 'KLAC',
 'KSS',
 'KHC',
 'KR',
 'LB',
 'LLL',
 'LH',
 'LRCX',
 'LEG',
 'LEN',
 'LVLT',
 'LUK',
 'LLY',
 'LNC',
 'LLTC',
 'LKQ',
 'LMT',
 'L',
 'LOW',
 'LYB',
 'MTB',
 'MAC',
 'M',
 'MNK',
 'MRO',
 'MPC',
 'MAR',
 'MMC',
 'MLM',
 'MAS',
 'MA',
 'MAT',
 'MKC',
 'MCD',
 'MCK',
 'MJN',
 'MDT',
 'MRK',
 'MET',
 'MTD',
 'KORS',
 'MCHP',
 'MU',
 'MSFT',
 'MAA',
 'MHK',
 'TAP',
 'MDLZ',
 'MON',
 'MNST',
 'MCO',
 'MS',
 'MOS',
 'MSI',
 'MUR',
 'MYL',
 'NDAQ',
 'NOV',
 'NAVI',
 'NTAP',
 'NFLX',
 'NWL',
 'NFX',
 'NEM',
 'NWSA',
 'NWS',
 'NEE',
 'NLSN',
 'NKE',
 'NI',
 'NBL',
 'JWN',
 'NSC',
 'NTRS',
 'NOC',
 'NRG',
 'NUE',
 'NVDA',
 'ORLY',
 'OXY',
 'OMC',
 'OKE',
 'ORCL',
 'PCAR',
 'PH',
 'PDCO',
 'PAYX',
 'PYPL',
 'PNR',
 'PBCT',
 'PEP',
 'PKI',
 'PRGO',
 'PFE',
 'PCG',
 'PM',
 'PSX',
 'PNW',
 'PXD',
 'PBI',
 'PNC',
 'RL',
 'PPG',
 'PPL',
 'PX',
 'PCLN',
 'PFG',
 'PG',
 'PGR',
 'PLD',
 'PRU',
 'PEG',
 'PSA',
 'PHM',
 'PVH',
 'QRVO',
 'PWR',
 'QCOM',
 'DGX',
 'RRC',
 'RTN',
 'O',
 'RHT',
 'REGN',
 'RF',
 'RSG',
 'RAI',
 'RHI',
 'ROK',
 'COL',
 'ROP',
 'ROST',
 'RCL',
 'R',
 'CRM',
 'SCG',
 'SLB',
 'SNI',
 'STX',
 'SEE',
 'SRE',
 'SHW',
 'SIG',
 'SPG',
 'SWKS',
 'SLG',
 'SNA',
 'SO',
 'LUV',
 'SWN',
 'SE',
 'SPGI',
 'SWK',
 'SPLS',
 'SBUX',
 'STT',
 'SRCL',
 'SYK',
 'STI',
 'SYMC',
 'SYF',
 'SYY',
 'TROW',
 'TGT',
 'TEL',
 'TGNA',
 'TDC',
 'TSO',
 'TXN',
 'TXT',
 'COO',
 'HSY',
 'TRV',
 'TMO',
 'TIF',
 'TWX',
 'TJX',
 'TMK',
 'TSS',
 'TSCO',
 'TDG',
 'RIG',
 'TRIP',
 'FOXA',
 'FOX',
 'TSN',
 'UDR',
 'ULTA',
 'USB',
 'UA',
 'UAA',
 'UNP',
 'UAL',
 'UNH',
 'UPS',
 'URI',
 'UTX',
 'UHS',
 'UNM',
 'URBN',
 'VFC',
 'VLO',
 'VAR',
 'VTR',
 'VRSN',
 'VRSK',
 'VZ',
 'VRTX',
 'VIAB',
 'V',
 'VNO',
 'VMC',
 'WMT',
 'WBA',
 'DIS',
 'WM',
 'WAT',
 'WEC',
 'WFC',
 'HCN',
 'WDC',
 'WU',
 'WRK',
 'WY',
 'WHR',
 'WFM',
 'WMB',
 'WLTW',
 'WYN',
 'WYNN',
 'XEL',
 'XRX',
 'XLNX',
 'XL',
 'XYL',
 'YHOO',
 'YUM',
 'ZBH',
 'ZION',
 'ZTS']

Part 6

Here we are going to get the data from Yahoo! Finance.


In [12]:
import os
import datetime as dt
import pandas as pd
import pandas_datareader.data as web

In [15]:
def get_data_from_yahoo(reload_sp500=False):
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        with open('data/sp500tickers.pkl', "rb") as f:
            tickers = pickle.load(f)
    
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')
        
    start = dt.datetime(2000,1,1)
    end = dt.datetime(2016,12,31)
    
    for ticker in tickers:
        print(ticker)
        if not os.path.exists('stock_dfs/{}.csv'.format(ticker)):
            df = web.DataReader(ticker, 'yahoo', start, end)
            df.to_csv("stock_dfs/{}.csv".format(ticker))
        else:
            print("Already have {}".format(ticker))

In [20]:
get_data_from_yahoo()


MMM
Already have MMM
ABT
Already have ABT
ABBV
Already have ABBV
ACN
Already have ACN
ATVI
Already have ATVI
AYI
Already have AYI
ADBE
Already have ADBE
AAP
Already have AAP
AES
Already have AES
AET
Already have AET
AMG
Already have AMG
AFL
Already have AFL
A
Already have A
APD
Already have APD
AKAM
Already have AKAM
ALK
Already have ALK
ALB
Already have ALB
AGN
Already have AGN
LNT
Already have LNT
ALXN
Already have ALXN
ALLE
Already have ALLE
ADS
Already have ADS
ALL
Already have ALL
GOOGL
Already have GOOGL
GOOG
Already have GOOG
MO
Already have MO
AMZN
Already have AMZN
AEE
Already have AEE
AAL
Already have AAL
AEP
Already have AEP
AXP
Already have AXP
AIG
Already have AIG
AMT
Already have AMT
AWK
Already have AWK
AMP
Already have AMP
ABC
Already have ABC
AME
Already have AME
AMGN
Already have AMGN
APH
Already have APH
APC
Already have APC
ADI
Already have ADI
ANTM
Already have ANTM
AON
Already have AON
APA
Already have APA
AIV
Already have AIV
AAPL
Already have AAPL
AMAT
Already have AMAT
ADM
Already have ADM
ARNC
Already have ARNC
AJG
Already have AJG
AIZ
Already have AIZ
T
Already have T
ADSK
Already have ADSK
ADP
Already have ADP
AN
Already have AN
AZO
Already have AZO
AVB
Already have AVB
AVY
Already have AVY
BHI
Already have BHI
BLL
Already have BLL
BAC
Already have BAC
BK
Already have BK
BCR
Already have BCR
BAX
Already have BAX
BBT
Already have BBT
BDX
Already have BDX
BBBY
Already have BBBY
BRK-B
BBY
BIIB
BLK
HRB
BA
BWA
BXP
BSX
BMY
AVGO
BF-B
CHRW
CA
COG
CPB
COF
CAH
HSIC
KMX
CCL
CAT
CBG
CBS
CELG
CNC
CNP
CTL
CERN
CF
SCHW
CHTR
CHK
CVX
CMG
CB
CHD
CI
XEC
CINF
CTAS
CSCO
C
CFG
CTXS
CLX
CME
CMS
COH
KO
CTSH
CL
CMCSA
CMA
CAG
CXO
COP
ED
STZ
GLW
COST
COTY
CCI
CSRA
CSX
CMI
CVS
DHI
DHR
DRI
DVA
DE
DLPH
DAL
XRAY
DVN
DLR
DFS
DISCA
DISCK
DG
DLTR
D
DOV
DOW
DPS
DTE
DD
DUK
DNB
ETFC
EMN
ETN
EBAY
ECL
EIX
EW
EA
EMR
ENDP
ETR
EVHC
EOG
EQT
EFX
EQIX
EQR
ESS
EL
ES
EXC
EXPE
EXPD
ESRX
EXR
XOM
FFIV
FB
FAST
FRT
FDX
FIS
FITB
FSLR
FE
FISV
FLIR
FLS
FLR
FMC
FTI
FL
F
FTV
FBHS
BEN
FCX
FTR
GPS
GRMN
GD
GE
GGP
GIS
GM
GPC
GILD
GPN
GS
GT
GWW
HAL
HBI
HOG
HAR
HRS
HIG
HAS
HCA
HCP
HP
HES
HPE
HOLX
HD
HON
HRL
HST
HPQ
HUM
HBAN
IDXX
ITW
ILMN
IR
INTC
ICE
IBM
IP
IPG
IFF
INTU
ISRG
IVZ
IRM
JEC
JBHT
SJM
JNJ
JCI
JPM
JNPR
KSU
K
KEY
KMB
KIM
KMI
KLAC
KSS
KHC
KR
LB
LLL
LH
LRCX
LEG
LEN
LVLT
LUK
LLY
LNC
LLTC
LKQ
LMT
L
LOW
LYB
MTB
MAC
M
MNK
MRO
MPC
MAR
MMC
MLM
MAS
MA
MAT
MKC
MCD
MCK
MJN
MDT
MRK
MET
MTD
KORS
MCHP
MU
MSFT
MAA
MHK
TAP
MDLZ
MON
MNST
MCO
MS
MOS
MSI
MUR
MYL
NDAQ
NOV
NAVI
NTAP
NFLX
NWL
NFX
NEM
NWSA
NWS
NEE
NLSN
NKE
NI
NBL
JWN
NSC
NTRS
NOC
NRG
NUE
NVDA
ORLY
OXY
OMC
OKE
ORCL
PCAR
PH
PDCO
PAYX
PYPL
PNR
PBCT
PEP
PKI
PRGO
PFE
PCG
PM
PSX
PNW
PXD
PBI
PNC
RL
PPG
PPL
PX
PCLN
PFG
PG
PGR
PLD
PRU
PEG
PSA
PHM
PVH
QRVO
PWR
QCOM
DGX
RRC
RTN
O
RHT
REGN
RF
RSG
RAI
RHI
ROK
COL
ROP
ROST
RCL
R
CRM
SCG
SLB
SNI
STX
SEE
SRE
SHW
SIG
SPG
SWKS
SLG
SNA
SO
LUV
SWN
SE
SPGI
SWK
SPLS
SBUX
STT
SRCL
SYK
STI
SYMC
SYF
SYY
TROW
TGT
TEL
TGNA
TDC
TSO
TXN
TXT
COO
HSY
TRV
TMO
TIF
TWX
TJX
TMK
TSS
TSCO
TDG
RIG
TRIP
FOXA
FOX
TSN
UDR
ULTA
USB
UA
UAA
UNP
UAL
UNH
UPS
URI
UTX
UHS
UNM
URBN
VFC
VLO
VAR
VTR
VRSN
VRSK
VZ
VRTX
VIAB
V
VNO
VMC
WMT
WBA
DIS
WM
WAT
WEC
WFC
HCN
WDC
WU
WRK
WY
WHR
WFM
WMB
WLTW
WYN
WYNN
XEL
XRX
XLNX
XL
XYL
YHOO
YUM
ZBH
ZION
ZTS

In [25]:
def compile_data():
    with open("sp500tickers.pkl", "rb") as f:
        tickers = pickle.load(f)
        
    main_df = pd.DataFrame()
    
    for count, ticker in enumerate(tickers):
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
        df.set_index('Date', inplace=True)
        
        df.rename(columns= {'Adj Close': ticker}, inplace=True)
        df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1, inplace=True)
        
        if main_df.empty:
            main_df = df
        else:
            main_df = main_df.join(df, how='outer')
            
        if count % 10 == 0:
            print(count)
        
    print(main_df.head())
    main_df.to_csv('data/sp500_joined_closes.csv')

In [26]:
compile_data()


0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
                  MMM       ABT  ABBV  ACN      ATVI  AYI       ADBE  AAP  \
Date                                                                        
2000-01-03  31.333866  9.517434   NaN  NaN  1.259640  NaN  16.274673  NaN   
2000-01-04  30.088811  9.245507   NaN  NaN  1.221324  NaN  14.909400  NaN   
2000-01-05  30.960349  9.228512   NaN  NaN  1.226114  NaN  15.204175  NaN   
2000-01-06  33.450458  9.551425   NaN  NaN  1.202166  NaN  15.328291  NaN   
2000-01-07  34.114487  9.653397   NaN  NaN  1.235693  NaN  16.072985  NaN   

                  AES       AET ...        XEL        XRX       XLNX  \
Date                            ...                                    
2000-01-03  32.346420  6.290820 ...   8.665217  12.447396  35.497325   
2000-01-04  31.063717  6.184675 ...   8.864745  11.868447  34.690567   
2000-01-05  31.398335  6.142218 ...   9.206793  12.479560  33.931266   
2000-01-06  31.649299  6.149294 ...   9.121281  12.222249  30.988975   
2000-01-07  32.262765  6.552642 ...   9.121281  12.511724  34.500742   

                   XL  XYL        YHOO       YUM  ZBH       ZION  ZTS  
Date                                                                   
2000-01-03  32.275336  NaN  118.750000  5.302430  NaN  43.725422  NaN  
2000-01-04  30.822554  NaN  110.750000  5.195849  NaN  41.608087  NaN  
2000-01-05  31.097404  NaN  102.625000  5.222494  NaN  41.558847  NaN  
2000-01-06  31.332991  NaN   92.046875  5.178085  NaN  42.149731  NaN  
2000-01-07  33.414004  NaN  101.812500  5.062622  NaN  42.248211  NaN  

[5 rows x 505 columns]

In [ ]: