Summary

This post download data from Yahoo finance and does some basic EDA.



In [1]:

    
!conda env list
%matplotlib inline









    



# conda environments:
#
base                  *  /Users/xianshi/anaconda3
blog                     /Users/xianshi/anaconda3/envs/blog



In [2]:

    
#Importing The Data


from pandas_datareader import data
import pandas as pd

# Define the instruments to download. We would like to see Apple, Microsoft and the S&P500 index.
# tickers = ['W', 'HD', 'AMZN','LOW','LL']
tickers = ['ROIC', 'SKT', 'TCO' ,'SPG' ,'MAC']
# Define which online source one should use
data_source = 'yahoo'

# We would like all available data from 01/01/2000 until 12/31/2016.
start_date = '2016-12-01'
end_date = '2017-12-31'

# User pandas_reader.data.DataReader to load the desired data. As simple as that.
panel_data = data.DataReader(tickers, data_source, start_date, end_date)



In [3]:

    
panel_data.head()









    Out[3]:







  
    
      Attributes
      High
      Low
      ...
      Volume
      Adj Close
    
    
      Symbols
      MAC
      ROIC
      SKT
      SPG
      TCO
      MAC
      ROIC
      SKT
      SPG
      TCO
      ...
      MAC
      ROIC
      SKT
      SPG
      TCO
      MAC
      ROIC
      SKT
      SPG
      TCO
    
    
      Date
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      2016-12-01
      68.279999
      20.629999
      34.669998
      179.380005
      72.290001
      66.809998
      20.190001
      34.090000
      175.869995
      71.010002
      ...
      927500.0
      1041700.0
      745100.0
      2075900.0
      355000.0
      58.302998
      18.167982
      29.123262
      157.796829
      63.129143
    
    
      2016-12-02
      68.690002
      20.799999
      34.990002
      180.789993
      72.790001
      67.540001
      20.370001
      34.380001
      177.419998
      71.459999
      ...
      676100.0
      601200.0
      418800.0
      1273900.0
      250200.0
      58.925732
      18.436207
      29.420441
      160.063568
      63.632339
    
    
      2016-12-05
      68.839996
      20.770000
      34.990002
      180.809998
      72.709999
      67.680000
      20.549999
      34.330002
      178.029999
      71.739998
      ...
      931000.0
      648500.0
      478000.0
      1027300.0
      245000.0
      59.505215
      18.534555
      29.692146
      161.312943
      64.100204
    
    
      2016-12-06
      69.610001
      21.020000
      35.310001
      182.139999
      73.550003
      68.099998
      20.590000
      34.830002
      179.130005
      72.209999
      ...
      915600.0
      590800.0
      391100.0
      1202000.0
      269000.0
      59.721443
      18.597143
      29.946865
      160.732864
      64.877075
    
    
      2016-12-07
      70.830002
      21.350000
      35.950001
      183.759995
      75.320000
      69.169998
      20.860001
      35.310001
      180.509995
      73.660004
      ...
      810800.0
      843500.0
      628000.0
      1278400.0
      406700.0
      61.165833
      18.954781
      30.481783
      163.981232
      65.768707
    
  

5 rows × 30 columns



In [4]:

    
df = panel_data['Adj Close']



In [5]:

    
# Basic Description of the Data

df.describe()



In [6]:

    
first = df.head()
last = df.tail()
print(first)
print(last)









    



Symbols           MAC       ROIC        SKT         SPG        TCO
Date                                                              
2016-12-01  58.302998  18.167982  29.123262  157.796829  63.129143
2016-12-02  58.925732  18.436207  29.420441  160.063568  63.632339
2016-12-05  59.505215  18.534555  29.692146  161.312943  64.100204
2016-12-06  59.721443  18.597143  29.946865  160.732864  64.877075
2016-12-07  61.165833  18.954781  30.481783  163.981232  65.768707
Symbols           MAC       ROIC        SKT         SPG        TCO
Date                                                              
2017-12-22  59.747002  18.089575  22.627144  155.758743  59.732700
2017-12-26  60.232689  18.398554  23.054577  157.893692  60.799690
2017-12-27  60.370136  18.520279  23.428577  159.208237  60.966686
2017-12-28  60.489265  18.735632  23.597769  159.562500  60.911030
2017-12-29  60.186863  18.679453  23.606674  160.112564  60.706913



In [7]:

    
df.sample(6)



In [8]:

    
# A Closer Look At Your Data: Queries

df.query('MAC == ROIC')









    Out[8]:







  
    
      Symbols
      MAC
      ROIC
      SKT
      SPG
      TCO
    
    
      Date



In [9]:

    
#cleaning
print(df.columns[df.isnull().any()])









    



Index([], dtype='object', name='Symbols')



In [10]:

    
# Getting all weekdays between 01/01/2000 and 12/31/2016
all_weekdays = pd.date_range(start=start_date, end=end_date, freq='B')

# How do we align the existing prices in adj_close with our new set of dates?
# All we need to do is reindex adj_close using all_weekdays as the new index
df = df.reindex(all_weekdays)

# Reindexing will insert missing values (NaN) for the dates that were not present
# in the original set. To cope with this, we can fill the missing by replacing them
# with the latest available price for each instrument.
df = df.fillna(method='ffill')
df.isnull().head()









    Out[10]:







  
    
      Symbols
      MAC
      ROIC
      SKT
      SPG
      TCO
    
  
  
    
      2016-12-01
      False
      False
      False
      False
      False
    
    
      2016-12-02
      False
      False
      False
      False
      False
    
    
      2016-12-05
      False
      False
      False
      False
      False
    
    
      2016-12-06
      False
      False
      False
      False
      False
    
    
      2016-12-07
      False
      False
      False
      False
      False



In [11]:

    
# Define your own bins
mybins = range(int(df.MAC.min()), int(df.MAC.max()), 2)

# Cut the data with the help of the bins
df['MAC_bucket'] = pd.cut(df.MAC, bins=mybins)

# Count the number of values per bucket
df['MAC_bucket'].value_counts()









    Out[11]:





(51, 53]    62
(59, 61]    61
(57, 59]    46
(49, 51]    35
(55, 57]    26
(53, 55]    21
(47, 49]    17
Name: MAC_bucket, dtype: int64

Symbols	MAC	ROIC	SKT	SPG	TCO
count	272.000000	272.000000	272.000000	272.000000	272.000000
mean	55.118781	18.442282	24.788283	151.898158	55.258763
std	4.208408	0.656658	3.412448	7.387024	6.499111
min	47.703194	16.672773	19.759792	138.740128	42.535595
25%	51.428945	17.981809	21.999384	146.275017	50.471438
50%	55.492500	18.473031	23.172514	150.871696	54.953302
75%	59.109798	18.914228	28.274007	157.328503	60.127033
max	62.869678	20.064341	31.560102	167.898178	67.782234

Symbols	MAC	ROIC	SKT	SPG	TCO
Date
2017-03-28	56.465996	18.949013	27.943995	151.426422	57.989056
2017-03-10	56.103745	18.396067	26.769661	151.534576	58.936821
2017-04-04	56.819401	19.276661	28.149717	153.688156	59.192852
2017-05-05	55.062401	19.003618	24.801085	148.245621	54.970566
2017-07-13	51.444580	17.836662	22.929958	144.016022	54.236202
2017-03-24	56.430656	19.222054	27.866848	151.336319	58.420258

Symbols	MAC	ROIC	SKT	SPG	TCO
2016-12-01	False	False	False	False	False
2016-12-02	False	False	False	False	False
2016-12-05	False	False	False	False	False
2016-12-06	False	False	False	False	False
2016-12-07	False	False	False	False	False