In [1]:
import numpy as np
import pandas as pd
import Quandl
import os
from sqlalchemy import sqlalchemy  # from flask.ext.

Read Quandl data into a Pandas dataframe, tidy up the columns, verify data integrity and stash it in an SQL database.


In [2]:
ticker = "YAHOO/INDEX_GSPC"  # Set target ticker symbol
# In shell, prior to running notebook: export QUANDL_TOKEN="secret-key"
token = os.environ.get('QUANDL_TOKEN')  # Grab the environment variable

In [3]:
df = Quandl.get(ticker, authtoken=token, trim_start='1990-01-01')  # Using the Quandl module

In [4]:
df_devBackup = df.copy()  # For development only, copy the dataframe in case we make a mistake
# df = df_devBackup.copy()  # Restore the original dataframe

In [5]:
df  # Show dataframe


Out[5]:
Open High Low Close Volume Adj Close
Date
1990-01-02 353.399994 359.690002 351.980011 359.690002 162070000 359.690002
1990-01-03 359.690002 360.589996 357.890015 358.760010 192330000 358.760010
1990-01-04 358.760010 358.760010 352.890015 355.670013 177000000 355.670013
1990-01-05 355.670013 355.670013 351.350006 352.200012 158530000 352.200012
1990-01-08 352.200012 354.239990 350.540009 353.790009 140110000 353.790009
1990-01-09 353.829987 354.170013 349.609985 349.619995 155210000 349.619995
1990-01-10 349.619995 349.619995 344.320007 347.309998 175990000 347.309998
1990-01-11 347.309998 350.140015 347.309998 348.529999 154390000 348.529999
1990-01-12 348.529999 348.529999 339.489990 339.929993 183880000 339.929993
1990-01-15 339.929993 339.940002 336.570007 337.000000 140590000 337.000000
1990-01-16 337.000000 340.750000 333.369995 340.750000 186070000 340.750000
1990-01-17 340.769989 342.010010 336.260010 337.399994 170470000 337.399994
1990-01-18 337.399994 338.380005 333.980011 338.190002 178590000 338.190002
1990-01-19 338.190002 340.480011 338.190002 339.149994 185590000 339.149994
1990-01-22 339.140015 339.959991 330.279999 330.380005 148380000 330.380005
1990-01-23 330.380005 332.760010 328.670013 331.609985 179300000 331.609985
1990-01-24 331.609985 331.709991 324.170013 330.260010 207830000 330.260010
1990-01-25 330.260010 332.329987 325.329987 326.079987 172270000 326.079987
1990-01-26 326.089996 328.579987 321.440002 325.799988 198190000 325.799988
1990-01-29 325.799988 327.309998 321.790009 325.200012 150770000 325.200012
1990-01-30 325.200012 325.730011 319.829987 322.980011 186030000 322.980011
1990-01-31 322.980011 329.079987 322.980011 329.079987 189660000 329.079987
1990-02-01 329.079987 329.859985 327.760010 328.790009 154580000 328.790009
1990-02-02 328.790009 332.100006 328.089996 330.920013 164400000 330.920013
1990-02-05 330.920013 332.160004 330.450012 331.850006 130950000 331.850006
1990-02-06 331.850006 331.859985 328.200012 329.660004 134070000 329.660004
1990-02-07 329.660004 333.760010 326.549988 333.750000 186710000 333.750000
1990-02-08 333.750000 336.089996 332.000000 332.959991 176240000 332.959991
1990-02-09 333.019989 334.600006 332.410004 333.619995 146910000 333.619995
1990-02-12 333.619995 333.619995 329.970001 330.079987 118390000 330.079987
... ... ... ... ... ... ...
2015-12-07 2090.419922 2090.419922 2066.780029 2077.070068 4043820000 2077.070068
2015-12-08 2073.389893 2073.850098 2052.320068 2063.590088 4173570000 2063.590088
2015-12-09 2061.169922 2080.330078 2036.530029 2047.619995 4385250000 2047.619995
2015-12-10 2047.930054 2067.649902 2045.670044 2052.229980 3715150000 2052.229980
2015-12-11 2047.270020 2047.270020 2008.800049 2012.369995 4301060000 2012.369995
2015-12-14 2013.369995 2022.920044 1993.260010 2021.939941 4612440000 2021.939941
2015-12-15 2025.550049 2053.870117 2025.550049 2043.410034 4353540000 2043.410034
2015-12-16 2046.500000 2076.719971 2042.430054 2073.070068 4635450000 2073.070068
2015-12-17 2073.760010 2076.370117 2041.660034 2041.890015 4327390000 2041.890015
2015-12-18 2040.810059 2040.810059 2005.329956 2005.550049 6683070000 2005.550049
2015-12-21 2010.270020 2022.900024 2005.930054 2021.150024 3760280000 2021.150024
2015-12-22 2023.150024 2042.739990 2020.489990 2038.969971 3520860000 2038.969971
2015-12-23 2042.199951 2064.729980 2042.199951 2064.290039 3484090000 2064.290039
2015-12-24 2063.520020 2067.360107 2058.729980 2060.989990 1411860000 2060.989990
2015-12-28 2057.770020 2057.770020 2044.199951 2056.500000 2492510000 2056.500000
2015-12-29 2060.540039 2081.560059 2060.540039 2078.360107 2542000000 2078.360107
2015-12-30 2077.340088 2077.340088 2061.969971 2063.360107 2367430000 2063.360107
2015-12-31 2060.590088 2062.540039 2043.619995 2043.939941 2655330000 2043.939941
2016-01-04 2038.199951 2038.199951 1989.680054 2012.660034 4304880000 2012.660034
2016-01-05 2013.780029 2021.939941 2004.170044 2016.709961 3706620000 2016.709961
2016-01-06 2011.709961 2011.709961 1979.050049 1990.260010 4336660000 1990.260010
2016-01-07 1985.319946 1985.319946 1938.829956 1943.089966 5076590000 1943.089966
2016-01-08 1985.319946 1985.319946 1918.459961 1922.030029 4664940000 1922.030029
2016-01-11 1926.119995 1935.650024 1901.099976 1923.670044 4607290000 1923.670044
2016-01-12 1927.829956 1947.380005 1914.349976 1938.680054 4887260000 1938.680054
2016-01-13 1940.339966 1950.329956 1886.410034 1890.280029 5087030000 1890.280029
2016-01-14 1891.680054 1934.469971 1878.930054 1921.839966 5241110000 1921.839966
2016-01-15 1916.680054 1916.680054 1857.829956 1880.329956 5468460000 1880.329956
2016-01-19 1888.660034 1901.439941 1864.599976 1881.329956 4928350000 1881.329956
2016-01-20 1876.180054 1876.180054 1812.290039 1859.329956 6416070000 1859.329956

6565 rows × 6 columns


In [6]:
old_columns = list(df.columns.values)  # Get the column labels
ticker_tag = ticker.split('_')[-1] + '_'  # Use the ticker symbol as our new prefix
new_labels = [ticker_tag + i.replace(' ', '') for i in old_columns]  # Drop spaces and concatenate
new_columns = dict(zip(old_columns, new_labels))  # Create a dictionary of old and new column labels
new_columns  # Show the column label dictionary


Out[6]:
{'Adj Close': 'GSPC_AdjClose',
 'Close': 'GSPC_Close',
 'High': 'GSPC_High',
 'Low': 'GSPC_Low',
 'Open': 'GSPC_Open',
 'Volume': 'GSPC_Volume'}

In [7]:
df = df.rename(columns=new_columns)  # Rename the columns using our dictionary

In [8]:
df.tail()  # Show tail-end of dataframe


Out[8]:
GSPC_Open GSPC_High GSPC_Low GSPC_Close GSPC_Volume GSPC_AdjClose
Date
2016-01-13 1940.339966 1950.329956 1886.410034 1890.280029 5087030000 1890.280029
2016-01-14 1891.680054 1934.469971 1878.930054 1921.839966 5241110000 1921.839966
2016-01-15 1916.680054 1916.680054 1857.829956 1880.329956 5468460000 1880.329956
2016-01-19 1888.660034 1901.439941 1864.599976 1881.329956 4928350000 1881.329956
2016-01-20 1876.180054 1876.180054 1812.290039 1859.329956 6416070000 1859.329956

In [9]:
nulls = df[~df.applymap(np.isreal).all(1)]  # Search for non-real numbers by negation
nulls  # Show any rows in the dataframe with non-numeric values


Out[9]:
GSPC_Open GSPC_High GSPC_Low GSPC_Close GSPC_Volume GSPC_AdjClose
Date

In [10]:
# Add error checking for non-numeric values
if len(nulls) > 0:
    raise ValueError('Dataframe contains non-numeric values')

In [12]:
# Backup data to sql
engine = sqlalchemy.create_engine('sqlite:///data/dev.db')  
df.to_sql('gspc', engine, if_exists='replace')

df_test = pd.read_sql('gspc', engine)
df_test = df_test.set_index('Date')
df_test


Out[12]:
GSPC_Open GSPC_High GSPC_Low GSPC_Close GSPC_Volume GSPC_AdjClose
Date
1990-01-02 353.399994 359.690002 351.980011 359.690002 162070000 359.690002
1990-01-03 359.690002 360.589996 357.890015 358.760010 192330000 358.760010
1990-01-04 358.760010 358.760010 352.890015 355.670013 177000000 355.670013
1990-01-05 355.670013 355.670013 351.350006 352.200012 158530000 352.200012
1990-01-08 352.200012 354.239990 350.540009 353.790009 140110000 353.790009
1990-01-09 353.829987 354.170013 349.609985 349.619995 155210000 349.619995
1990-01-10 349.619995 349.619995 344.320007 347.309998 175990000 347.309998
1990-01-11 347.309998 350.140015 347.309998 348.529999 154390000 348.529999
1990-01-12 348.529999 348.529999 339.489990 339.929993 183880000 339.929993
1990-01-15 339.929993 339.940002 336.570007 337.000000 140590000 337.000000
1990-01-16 337.000000 340.750000 333.369995 340.750000 186070000 340.750000
1990-01-17 340.769989 342.010010 336.260010 337.399994 170470000 337.399994
1990-01-18 337.399994 338.380005 333.980011 338.190002 178590000 338.190002
1990-01-19 338.190002 340.480011 338.190002 339.149994 185590000 339.149994
1990-01-22 339.140015 339.959991 330.279999 330.380005 148380000 330.380005
1990-01-23 330.380005 332.760010 328.670013 331.609985 179300000 331.609985
1990-01-24 331.609985 331.709991 324.170013 330.260010 207830000 330.260010
1990-01-25 330.260010 332.329987 325.329987 326.079987 172270000 326.079987
1990-01-26 326.089996 328.579987 321.440002 325.799988 198190000 325.799988
1990-01-29 325.799988 327.309998 321.790009 325.200012 150770000 325.200012
1990-01-30 325.200012 325.730011 319.829987 322.980011 186030000 322.980011
1990-01-31 322.980011 329.079987 322.980011 329.079987 189660000 329.079987
1990-02-01 329.079987 329.859985 327.760010 328.790009 154580000 328.790009
1990-02-02 328.790009 332.100006 328.089996 330.920013 164400000 330.920013
1990-02-05 330.920013 332.160004 330.450012 331.850006 130950000 331.850006
1990-02-06 331.850006 331.859985 328.200012 329.660004 134070000 329.660004
1990-02-07 329.660004 333.760010 326.549988 333.750000 186710000 333.750000
1990-02-08 333.750000 336.089996 332.000000 332.959991 176240000 332.959991
1990-02-09 333.019989 334.600006 332.410004 333.619995 146910000 333.619995
1990-02-12 333.619995 333.619995 329.970001 330.079987 118390000 330.079987
... ... ... ... ... ... ...
2015-12-07 2090.419922 2090.419922 2066.780029 2077.070068 4043820000 2077.070068
2015-12-08 2073.389893 2073.850098 2052.320068 2063.590088 4173570000 2063.590088
2015-12-09 2061.169922 2080.330078 2036.530029 2047.619995 4385250000 2047.619995
2015-12-10 2047.930054 2067.649902 2045.670044 2052.229980 3715150000 2052.229980
2015-12-11 2047.270020 2047.270020 2008.800049 2012.369995 4301060000 2012.369995
2015-12-14 2013.369995 2022.920044 1993.260010 2021.939941 4612440000 2021.939941
2015-12-15 2025.550049 2053.870117 2025.550049 2043.410034 4353540000 2043.410034
2015-12-16 2046.500000 2076.719971 2042.430054 2073.070068 4635450000 2073.070068
2015-12-17 2073.760010 2076.370117 2041.660034 2041.890015 4327390000 2041.890015
2015-12-18 2040.810059 2040.810059 2005.329956 2005.550049 6683070000 2005.550049
2015-12-21 2010.270020 2022.900024 2005.930054 2021.150024 3760280000 2021.150024
2015-12-22 2023.150024 2042.739990 2020.489990 2038.969971 3520860000 2038.969971
2015-12-23 2042.199951 2064.729980 2042.199951 2064.290039 3484090000 2064.290039
2015-12-24 2063.520020 2067.360107 2058.729980 2060.989990 1411860000 2060.989990
2015-12-28 2057.770020 2057.770020 2044.199951 2056.500000 2492510000 2056.500000
2015-12-29 2060.540039 2081.560059 2060.540039 2078.360107 2542000000 2078.360107
2015-12-30 2077.340088 2077.340088 2061.969971 2063.360107 2367430000 2063.360107
2015-12-31 2060.590088 2062.540039 2043.619995 2043.939941 2655330000 2043.939941
2016-01-04 2038.199951 2038.199951 1989.680054 2012.660034 4304880000 2012.660034
2016-01-05 2013.780029 2021.939941 2004.170044 2016.709961 3706620000 2016.709961
2016-01-06 2011.709961 2011.709961 1979.050049 1990.260010 4336660000 1990.260010
2016-01-07 1985.319946 1985.319946 1938.829956 1943.089966 5076590000 1943.089966
2016-01-08 1985.319946 1985.319946 1918.459961 1922.030029 4664940000 1922.030029
2016-01-11 1926.119995 1935.650024 1901.099976 1923.670044 4607290000 1923.670044
2016-01-12 1927.829956 1947.380005 1914.349976 1938.680054 4887260000 1938.680054
2016-01-13 1940.339966 1950.329956 1886.410034 1890.280029 5087030000 1890.280029
2016-01-14 1891.680054 1934.469971 1878.930054 1921.839966 5241110000 1921.839966
2016-01-15 1916.680054 1916.680054 1857.829956 1880.329956 5468460000 1880.329956
2016-01-19 1888.660034 1901.439941 1864.599976 1881.329956 4928350000 1881.329956
2016-01-20 1876.180054 1876.180054 1812.290039 1859.329956 6416070000 1859.329956

6565 rows × 6 columns


In [13]:
# Backup data to csv
df.to_csv('data/gspc.csv')

df_test = pd.read_csv('data/gspc.csv')
df_test = df_test.set_index('Date')
df_test


Out[13]:
GSPC_Open GSPC_High GSPC_Low GSPC_Close GSPC_Volume GSPC_AdjClose
Date
1990-01-02 353.399994 359.690002 351.980011 359.690002 162070000 359.690002
1990-01-03 359.690002 360.589996 357.890015 358.760010 192330000 358.760010
1990-01-04 358.760010 358.760010 352.890015 355.670013 177000000 355.670013
1990-01-05 355.670013 355.670013 351.350006 352.200012 158530000 352.200012
1990-01-08 352.200012 354.239990 350.540009 353.790009 140110000 353.790009
1990-01-09 353.829987 354.170013 349.609985 349.619995 155210000 349.619995
1990-01-10 349.619995 349.619995 344.320007 347.309998 175990000 347.309998
1990-01-11 347.309998 350.140015 347.309998 348.529999 154390000 348.529999
1990-01-12 348.529999 348.529999 339.489990 339.929993 183880000 339.929993
1990-01-15 339.929993 339.940002 336.570007 337.000000 140590000 337.000000
1990-01-16 337.000000 340.750000 333.369995 340.750000 186070000 340.750000
1990-01-17 340.769989 342.010010 336.260010 337.399994 170470000 337.399994
1990-01-18 337.399994 338.380005 333.980011 338.190002 178590000 338.190002
1990-01-19 338.190002 340.480011 338.190002 339.149994 185590000 339.149994
1990-01-22 339.140015 339.959991 330.279999 330.380005 148380000 330.380005
1990-01-23 330.380005 332.760010 328.670013 331.609985 179300000 331.609985
1990-01-24 331.609985 331.709991 324.170013 330.260010 207830000 330.260010
1990-01-25 330.260010 332.329987 325.329987 326.079987 172270000 326.079987
1990-01-26 326.089996 328.579987 321.440002 325.799988 198190000 325.799988
1990-01-29 325.799988 327.309998 321.790009 325.200012 150770000 325.200012
1990-01-30 325.200012 325.730011 319.829987 322.980011 186030000 322.980011
1990-01-31 322.980011 329.079987 322.980011 329.079987 189660000 329.079987
1990-02-01 329.079987 329.859985 327.760010 328.790009 154580000 328.790009
1990-02-02 328.790009 332.100006 328.089996 330.920013 164400000 330.920013
1990-02-05 330.920013 332.160004 330.450012 331.850006 130950000 331.850006
1990-02-06 331.850006 331.859985 328.200012 329.660004 134070000 329.660004
1990-02-07 329.660004 333.760010 326.549988 333.750000 186710000 333.750000
1990-02-08 333.750000 336.089996 332.000000 332.959991 176240000 332.959991
1990-02-09 333.019989 334.600006 332.410004 333.619995 146910000 333.619995
1990-02-12 333.619995 333.619995 329.970001 330.079987 118390000 330.079987
... ... ... ... ... ... ...
2015-12-07 2090.419922 2090.419922 2066.780029 2077.070068 4043820000 2077.070068
2015-12-08 2073.389893 2073.850098 2052.320068 2063.590088 4173570000 2063.590088
2015-12-09 2061.169922 2080.330078 2036.530029 2047.619995 4385250000 2047.619995
2015-12-10 2047.930054 2067.649902 2045.670044 2052.229980 3715150000 2052.229980
2015-12-11 2047.270020 2047.270020 2008.800049 2012.369995 4301060000 2012.369995
2015-12-14 2013.369995 2022.920044 1993.260010 2021.939941 4612440000 2021.939941
2015-12-15 2025.550049 2053.870117 2025.550049 2043.410034 4353540000 2043.410034
2015-12-16 2046.500000 2076.719971 2042.430054 2073.070068 4635450000 2073.070068
2015-12-17 2073.760010 2076.370117 2041.660034 2041.890015 4327390000 2041.890015
2015-12-18 2040.810059 2040.810059 2005.329956 2005.550049 6683070000 2005.550049
2015-12-21 2010.270020 2022.900024 2005.930054 2021.150024 3760280000 2021.150024
2015-12-22 2023.150024 2042.739990 2020.489990 2038.969971 3520860000 2038.969971
2015-12-23 2042.199951 2064.729980 2042.199951 2064.290039 3484090000 2064.290039
2015-12-24 2063.520020 2067.360107 2058.729980 2060.989990 1411860000 2060.989990
2015-12-28 2057.770020 2057.770020 2044.199951 2056.500000 2492510000 2056.500000
2015-12-29 2060.540039 2081.560059 2060.540039 2078.360107 2542000000 2078.360107
2015-12-30 2077.340088 2077.340088 2061.969971 2063.360107 2367430000 2063.360107
2015-12-31 2060.590088 2062.540039 2043.619995 2043.939941 2655330000 2043.939941
2016-01-04 2038.199951 2038.199951 1989.680054 2012.660034 4304880000 2012.660034
2016-01-05 2013.780029 2021.939941 2004.170044 2016.709961 3706620000 2016.709961
2016-01-06 2011.709961 2011.709961 1979.050049 1990.260010 4336660000 1990.260010
2016-01-07 1985.319946 1985.319946 1938.829956 1943.089966 5076590000 1943.089966
2016-01-08 1985.319946 1985.319946 1918.459961 1922.030029 4664940000 1922.030029
2016-01-11 1926.119995 1935.650024 1901.099976 1923.670044 4607290000 1923.670044
2016-01-12 1927.829956 1947.380005 1914.349976 1938.680054 4887260000 1938.680054
2016-01-13 1940.339966 1950.329956 1886.410034 1890.280029 5087030000 1890.280029
2016-01-14 1891.680054 1934.469971 1878.930054 1921.839966 5241110000 1921.839966
2016-01-15 1916.680054 1916.680054 1857.829956 1880.329956 5468460000 1880.329956
2016-01-19 1888.660034 1901.439941 1864.599976 1881.329956 4928350000 1881.329956
2016-01-20 1876.180054 1876.180054 1812.290039 1859.329956 6416070000 1859.329956

6565 rows × 6 columns


In [14]:
def load(ticker):
    """Load data from Quandl into a dataframe, modify 
    column names and check for non-numeric values."""
    # Grab the Quandl token
    token = os.environ.get('QUANDL_TOKEN')
    if token is None:
        token = input("Enter Quandl token: ")
    # Call Quandl module, trim input by default from 1990 forward
    df = Quandl.get(ticker, authtoken=token, trim_start='1990-01-01')  
    # Get the column labels
    old_columns = list(df.columns.values)  
    # Use the ticker symbol as our new prefix
    ticker_tag = ticker.split('_')[-1] + '_'  
    # Drop spaces and concatenate
    new_labels = [ticker_tag + i.replace(' ', '') for i in old_columns]  
    # Create a dictionary of old and new column labels
    new_columns = dict(zip(old_columns, new_labels))  
    # Rename the columns using our dictionary
    df = df.rename(columns=new_columns)  
    nulls = df[~df.applymap(np.isreal).all(1)]
    # Check for non-numeric values
    if len(nulls) > 0:
        raise ValueError('Dataframe contains non-numeric values')
    row_count = len(df)
    print('%d rows loaded into dataframe.' % row_count) 
    return df

In [15]:
df = load("YAHOO/INDEX_GSPC")


6565 rows loaded into dataframe.