In [1]:
import numpy as np
import pandas as pd
import Quandl
import os
from sqlalchemy import sqlalchemy # from flask.ext.
Read Quandl data into a Pandas dataframe, tidy up the columns, verify data integrity and stash it in an SQL database.
In [2]:
ticker = "YAHOO/INDEX_GSPC" # Set target ticker symbol
# In shell, prior to running notebook: export QUANDL_TOKEN="secret-key"
token = os.environ.get('QUANDL_TOKEN') # Grab the environment variable
In [3]:
df = Quandl.get(ticker, authtoken=token, trim_start='1990-01-01') # Using the Quandl module
In [4]:
df_devBackup = df.copy() # For development only, copy the dataframe in case we make a mistake
# df = df_devBackup.copy() # Restore the original dataframe
In [5]:
df # Show dataframe
Out[5]:
Open
High
Low
Close
Volume
Adj Close
Date
1990-01-02
353.399994
359.690002
351.980011
359.690002
162070000
359.690002
1990-01-03
359.690002
360.589996
357.890015
358.760010
192330000
358.760010
1990-01-04
358.760010
358.760010
352.890015
355.670013
177000000
355.670013
1990-01-05
355.670013
355.670013
351.350006
352.200012
158530000
352.200012
1990-01-08
352.200012
354.239990
350.540009
353.790009
140110000
353.790009
1990-01-09
353.829987
354.170013
349.609985
349.619995
155210000
349.619995
1990-01-10
349.619995
349.619995
344.320007
347.309998
175990000
347.309998
1990-01-11
347.309998
350.140015
347.309998
348.529999
154390000
348.529999
1990-01-12
348.529999
348.529999
339.489990
339.929993
183880000
339.929993
1990-01-15
339.929993
339.940002
336.570007
337.000000
140590000
337.000000
1990-01-16
337.000000
340.750000
333.369995
340.750000
186070000
340.750000
1990-01-17
340.769989
342.010010
336.260010
337.399994
170470000
337.399994
1990-01-18
337.399994
338.380005
333.980011
338.190002
178590000
338.190002
1990-01-19
338.190002
340.480011
338.190002
339.149994
185590000
339.149994
1990-01-22
339.140015
339.959991
330.279999
330.380005
148380000
330.380005
1990-01-23
330.380005
332.760010
328.670013
331.609985
179300000
331.609985
1990-01-24
331.609985
331.709991
324.170013
330.260010
207830000
330.260010
1990-01-25
330.260010
332.329987
325.329987
326.079987
172270000
326.079987
1990-01-26
326.089996
328.579987
321.440002
325.799988
198190000
325.799988
1990-01-29
325.799988
327.309998
321.790009
325.200012
150770000
325.200012
1990-01-30
325.200012
325.730011
319.829987
322.980011
186030000
322.980011
1990-01-31
322.980011
329.079987
322.980011
329.079987
189660000
329.079987
1990-02-01
329.079987
329.859985
327.760010
328.790009
154580000
328.790009
1990-02-02
328.790009
332.100006
328.089996
330.920013
164400000
330.920013
1990-02-05
330.920013
332.160004
330.450012
331.850006
130950000
331.850006
1990-02-06
331.850006
331.859985
328.200012
329.660004
134070000
329.660004
1990-02-07
329.660004
333.760010
326.549988
333.750000
186710000
333.750000
1990-02-08
333.750000
336.089996
332.000000
332.959991
176240000
332.959991
1990-02-09
333.019989
334.600006
332.410004
333.619995
146910000
333.619995
1990-02-12
333.619995
333.619995
329.970001
330.079987
118390000
330.079987
...
...
...
...
...
...
...
2015-12-07
2090.419922
2090.419922
2066.780029
2077.070068
4043820000
2077.070068
2015-12-08
2073.389893
2073.850098
2052.320068
2063.590088
4173570000
2063.590088
2015-12-09
2061.169922
2080.330078
2036.530029
2047.619995
4385250000
2047.619995
2015-12-10
2047.930054
2067.649902
2045.670044
2052.229980
3715150000
2052.229980
2015-12-11
2047.270020
2047.270020
2008.800049
2012.369995
4301060000
2012.369995
2015-12-14
2013.369995
2022.920044
1993.260010
2021.939941
4612440000
2021.939941
2015-12-15
2025.550049
2053.870117
2025.550049
2043.410034
4353540000
2043.410034
2015-12-16
2046.500000
2076.719971
2042.430054
2073.070068
4635450000
2073.070068
2015-12-17
2073.760010
2076.370117
2041.660034
2041.890015
4327390000
2041.890015
2015-12-18
2040.810059
2040.810059
2005.329956
2005.550049
6683070000
2005.550049
2015-12-21
2010.270020
2022.900024
2005.930054
2021.150024
3760280000
2021.150024
2015-12-22
2023.150024
2042.739990
2020.489990
2038.969971
3520860000
2038.969971
2015-12-23
2042.199951
2064.729980
2042.199951
2064.290039
3484090000
2064.290039
2015-12-24
2063.520020
2067.360107
2058.729980
2060.989990
1411860000
2060.989990
2015-12-28
2057.770020
2057.770020
2044.199951
2056.500000
2492510000
2056.500000
2015-12-29
2060.540039
2081.560059
2060.540039
2078.360107
2542000000
2078.360107
2015-12-30
2077.340088
2077.340088
2061.969971
2063.360107
2367430000
2063.360107
2015-12-31
2060.590088
2062.540039
2043.619995
2043.939941
2655330000
2043.939941
2016-01-04
2038.199951
2038.199951
1989.680054
2012.660034
4304880000
2012.660034
2016-01-05
2013.780029
2021.939941
2004.170044
2016.709961
3706620000
2016.709961
2016-01-06
2011.709961
2011.709961
1979.050049
1990.260010
4336660000
1990.260010
2016-01-07
1985.319946
1985.319946
1938.829956
1943.089966
5076590000
1943.089966
2016-01-08
1985.319946
1985.319946
1918.459961
1922.030029
4664940000
1922.030029
2016-01-11
1926.119995
1935.650024
1901.099976
1923.670044
4607290000
1923.670044
2016-01-12
1927.829956
1947.380005
1914.349976
1938.680054
4887260000
1938.680054
2016-01-13
1940.339966
1950.329956
1886.410034
1890.280029
5087030000
1890.280029
2016-01-14
1891.680054
1934.469971
1878.930054
1921.839966
5241110000
1921.839966
2016-01-15
1916.680054
1916.680054
1857.829956
1880.329956
5468460000
1880.329956
2016-01-19
1888.660034
1901.439941
1864.599976
1881.329956
4928350000
1881.329956
2016-01-20
1876.180054
1876.180054
1812.290039
1859.329956
6416070000
1859.329956
6565 rows × 6 columns
In [6]:
old_columns = list(df.columns.values) # Get the column labels
ticker_tag = ticker.split('_')[-1] + '_' # Use the ticker symbol as our new prefix
new_labels = [ticker_tag + i.replace(' ', '') for i in old_columns] # Drop spaces and concatenate
new_columns = dict(zip(old_columns, new_labels)) # Create a dictionary of old and new column labels
new_columns # Show the column label dictionary
Out[6]:
{'Adj Close': 'GSPC_AdjClose',
'Close': 'GSPC_Close',
'High': 'GSPC_High',
'Low': 'GSPC_Low',
'Open': 'GSPC_Open',
'Volume': 'GSPC_Volume'}
In [7]:
df = df.rename(columns=new_columns) # Rename the columns using our dictionary
In [8]:
df.tail() # Show tail-end of dataframe
Out[8]:
GSPC_Open
GSPC_High
GSPC_Low
GSPC_Close
GSPC_Volume
GSPC_AdjClose
Date
2016-01-13
1940.339966
1950.329956
1886.410034
1890.280029
5087030000
1890.280029
2016-01-14
1891.680054
1934.469971
1878.930054
1921.839966
5241110000
1921.839966
2016-01-15
1916.680054
1916.680054
1857.829956
1880.329956
5468460000
1880.329956
2016-01-19
1888.660034
1901.439941
1864.599976
1881.329956
4928350000
1881.329956
2016-01-20
1876.180054
1876.180054
1812.290039
1859.329956
6416070000
1859.329956
In [9]:
nulls = df[~df.applymap(np.isreal).all(1)] # Search for non-real numbers by negation
nulls # Show any rows in the dataframe with non-numeric values
Out[9]:
GSPC_Open
GSPC_High
GSPC_Low
GSPC_Close
GSPC_Volume
GSPC_AdjClose
Date
In [10]:
# Add error checking for non-numeric values
if len(nulls) > 0:
raise ValueError('Dataframe contains non-numeric values')
In [12]:
# Backup data to sql
engine = sqlalchemy.create_engine('sqlite:///data/dev.db')
df.to_sql('gspc', engine, if_exists='replace')
df_test = pd.read_sql('gspc', engine)
df_test = df_test.set_index('Date')
df_test
Out[12]:
GSPC_Open
GSPC_High
GSPC_Low
GSPC_Close
GSPC_Volume
GSPC_AdjClose
Date
1990-01-02
353.399994
359.690002
351.980011
359.690002
162070000
359.690002
1990-01-03
359.690002
360.589996
357.890015
358.760010
192330000
358.760010
1990-01-04
358.760010
358.760010
352.890015
355.670013
177000000
355.670013
1990-01-05
355.670013
355.670013
351.350006
352.200012
158530000
352.200012
1990-01-08
352.200012
354.239990
350.540009
353.790009
140110000
353.790009
1990-01-09
353.829987
354.170013
349.609985
349.619995
155210000
349.619995
1990-01-10
349.619995
349.619995
344.320007
347.309998
175990000
347.309998
1990-01-11
347.309998
350.140015
347.309998
348.529999
154390000
348.529999
1990-01-12
348.529999
348.529999
339.489990
339.929993
183880000
339.929993
1990-01-15
339.929993
339.940002
336.570007
337.000000
140590000
337.000000
1990-01-16
337.000000
340.750000
333.369995
340.750000
186070000
340.750000
1990-01-17
340.769989
342.010010
336.260010
337.399994
170470000
337.399994
1990-01-18
337.399994
338.380005
333.980011
338.190002
178590000
338.190002
1990-01-19
338.190002
340.480011
338.190002
339.149994
185590000
339.149994
1990-01-22
339.140015
339.959991
330.279999
330.380005
148380000
330.380005
1990-01-23
330.380005
332.760010
328.670013
331.609985
179300000
331.609985
1990-01-24
331.609985
331.709991
324.170013
330.260010
207830000
330.260010
1990-01-25
330.260010
332.329987
325.329987
326.079987
172270000
326.079987
1990-01-26
326.089996
328.579987
321.440002
325.799988
198190000
325.799988
1990-01-29
325.799988
327.309998
321.790009
325.200012
150770000
325.200012
1990-01-30
325.200012
325.730011
319.829987
322.980011
186030000
322.980011
1990-01-31
322.980011
329.079987
322.980011
329.079987
189660000
329.079987
1990-02-01
329.079987
329.859985
327.760010
328.790009
154580000
328.790009
1990-02-02
328.790009
332.100006
328.089996
330.920013
164400000
330.920013
1990-02-05
330.920013
332.160004
330.450012
331.850006
130950000
331.850006
1990-02-06
331.850006
331.859985
328.200012
329.660004
134070000
329.660004
1990-02-07
329.660004
333.760010
326.549988
333.750000
186710000
333.750000
1990-02-08
333.750000
336.089996
332.000000
332.959991
176240000
332.959991
1990-02-09
333.019989
334.600006
332.410004
333.619995
146910000
333.619995
1990-02-12
333.619995
333.619995
329.970001
330.079987
118390000
330.079987
...
...
...
...
...
...
...
2015-12-07
2090.419922
2090.419922
2066.780029
2077.070068
4043820000
2077.070068
2015-12-08
2073.389893
2073.850098
2052.320068
2063.590088
4173570000
2063.590088
2015-12-09
2061.169922
2080.330078
2036.530029
2047.619995
4385250000
2047.619995
2015-12-10
2047.930054
2067.649902
2045.670044
2052.229980
3715150000
2052.229980
2015-12-11
2047.270020
2047.270020
2008.800049
2012.369995
4301060000
2012.369995
2015-12-14
2013.369995
2022.920044
1993.260010
2021.939941
4612440000
2021.939941
2015-12-15
2025.550049
2053.870117
2025.550049
2043.410034
4353540000
2043.410034
2015-12-16
2046.500000
2076.719971
2042.430054
2073.070068
4635450000
2073.070068
2015-12-17
2073.760010
2076.370117
2041.660034
2041.890015
4327390000
2041.890015
2015-12-18
2040.810059
2040.810059
2005.329956
2005.550049
6683070000
2005.550049
2015-12-21
2010.270020
2022.900024
2005.930054
2021.150024
3760280000
2021.150024
2015-12-22
2023.150024
2042.739990
2020.489990
2038.969971
3520860000
2038.969971
2015-12-23
2042.199951
2064.729980
2042.199951
2064.290039
3484090000
2064.290039
2015-12-24
2063.520020
2067.360107
2058.729980
2060.989990
1411860000
2060.989990
2015-12-28
2057.770020
2057.770020
2044.199951
2056.500000
2492510000
2056.500000
2015-12-29
2060.540039
2081.560059
2060.540039
2078.360107
2542000000
2078.360107
2015-12-30
2077.340088
2077.340088
2061.969971
2063.360107
2367430000
2063.360107
2015-12-31
2060.590088
2062.540039
2043.619995
2043.939941
2655330000
2043.939941
2016-01-04
2038.199951
2038.199951
1989.680054
2012.660034
4304880000
2012.660034
2016-01-05
2013.780029
2021.939941
2004.170044
2016.709961
3706620000
2016.709961
2016-01-06
2011.709961
2011.709961
1979.050049
1990.260010
4336660000
1990.260010
2016-01-07
1985.319946
1985.319946
1938.829956
1943.089966
5076590000
1943.089966
2016-01-08
1985.319946
1985.319946
1918.459961
1922.030029
4664940000
1922.030029
2016-01-11
1926.119995
1935.650024
1901.099976
1923.670044
4607290000
1923.670044
2016-01-12
1927.829956
1947.380005
1914.349976
1938.680054
4887260000
1938.680054
2016-01-13
1940.339966
1950.329956
1886.410034
1890.280029
5087030000
1890.280029
2016-01-14
1891.680054
1934.469971
1878.930054
1921.839966
5241110000
1921.839966
2016-01-15
1916.680054
1916.680054
1857.829956
1880.329956
5468460000
1880.329956
2016-01-19
1888.660034
1901.439941
1864.599976
1881.329956
4928350000
1881.329956
2016-01-20
1876.180054
1876.180054
1812.290039
1859.329956
6416070000
1859.329956
6565 rows × 6 columns
In [13]:
# Backup data to csv
df.to_csv('data/gspc.csv')
df_test = pd.read_csv('data/gspc.csv')
df_test = df_test.set_index('Date')
df_test
Out[13]:
GSPC_Open
GSPC_High
GSPC_Low
GSPC_Close
GSPC_Volume
GSPC_AdjClose
Date
1990-01-02
353.399994
359.690002
351.980011
359.690002
162070000
359.690002
1990-01-03
359.690002
360.589996
357.890015
358.760010
192330000
358.760010
1990-01-04
358.760010
358.760010
352.890015
355.670013
177000000
355.670013
1990-01-05
355.670013
355.670013
351.350006
352.200012
158530000
352.200012
1990-01-08
352.200012
354.239990
350.540009
353.790009
140110000
353.790009
1990-01-09
353.829987
354.170013
349.609985
349.619995
155210000
349.619995
1990-01-10
349.619995
349.619995
344.320007
347.309998
175990000
347.309998
1990-01-11
347.309998
350.140015
347.309998
348.529999
154390000
348.529999
1990-01-12
348.529999
348.529999
339.489990
339.929993
183880000
339.929993
1990-01-15
339.929993
339.940002
336.570007
337.000000
140590000
337.000000
1990-01-16
337.000000
340.750000
333.369995
340.750000
186070000
340.750000
1990-01-17
340.769989
342.010010
336.260010
337.399994
170470000
337.399994
1990-01-18
337.399994
338.380005
333.980011
338.190002
178590000
338.190002
1990-01-19
338.190002
340.480011
338.190002
339.149994
185590000
339.149994
1990-01-22
339.140015
339.959991
330.279999
330.380005
148380000
330.380005
1990-01-23
330.380005
332.760010
328.670013
331.609985
179300000
331.609985
1990-01-24
331.609985
331.709991
324.170013
330.260010
207830000
330.260010
1990-01-25
330.260010
332.329987
325.329987
326.079987
172270000
326.079987
1990-01-26
326.089996
328.579987
321.440002
325.799988
198190000
325.799988
1990-01-29
325.799988
327.309998
321.790009
325.200012
150770000
325.200012
1990-01-30
325.200012
325.730011
319.829987
322.980011
186030000
322.980011
1990-01-31
322.980011
329.079987
322.980011
329.079987
189660000
329.079987
1990-02-01
329.079987
329.859985
327.760010
328.790009
154580000
328.790009
1990-02-02
328.790009
332.100006
328.089996
330.920013
164400000
330.920013
1990-02-05
330.920013
332.160004
330.450012
331.850006
130950000
331.850006
1990-02-06
331.850006
331.859985
328.200012
329.660004
134070000
329.660004
1990-02-07
329.660004
333.760010
326.549988
333.750000
186710000
333.750000
1990-02-08
333.750000
336.089996
332.000000
332.959991
176240000
332.959991
1990-02-09
333.019989
334.600006
332.410004
333.619995
146910000
333.619995
1990-02-12
333.619995
333.619995
329.970001
330.079987
118390000
330.079987
...
...
...
...
...
...
...
2015-12-07
2090.419922
2090.419922
2066.780029
2077.070068
4043820000
2077.070068
2015-12-08
2073.389893
2073.850098
2052.320068
2063.590088
4173570000
2063.590088
2015-12-09
2061.169922
2080.330078
2036.530029
2047.619995
4385250000
2047.619995
2015-12-10
2047.930054
2067.649902
2045.670044
2052.229980
3715150000
2052.229980
2015-12-11
2047.270020
2047.270020
2008.800049
2012.369995
4301060000
2012.369995
2015-12-14
2013.369995
2022.920044
1993.260010
2021.939941
4612440000
2021.939941
2015-12-15
2025.550049
2053.870117
2025.550049
2043.410034
4353540000
2043.410034
2015-12-16
2046.500000
2076.719971
2042.430054
2073.070068
4635450000
2073.070068
2015-12-17
2073.760010
2076.370117
2041.660034
2041.890015
4327390000
2041.890015
2015-12-18
2040.810059
2040.810059
2005.329956
2005.550049
6683070000
2005.550049
2015-12-21
2010.270020
2022.900024
2005.930054
2021.150024
3760280000
2021.150024
2015-12-22
2023.150024
2042.739990
2020.489990
2038.969971
3520860000
2038.969971
2015-12-23
2042.199951
2064.729980
2042.199951
2064.290039
3484090000
2064.290039
2015-12-24
2063.520020
2067.360107
2058.729980
2060.989990
1411860000
2060.989990
2015-12-28
2057.770020
2057.770020
2044.199951
2056.500000
2492510000
2056.500000
2015-12-29
2060.540039
2081.560059
2060.540039
2078.360107
2542000000
2078.360107
2015-12-30
2077.340088
2077.340088
2061.969971
2063.360107
2367430000
2063.360107
2015-12-31
2060.590088
2062.540039
2043.619995
2043.939941
2655330000
2043.939941
2016-01-04
2038.199951
2038.199951
1989.680054
2012.660034
4304880000
2012.660034
2016-01-05
2013.780029
2021.939941
2004.170044
2016.709961
3706620000
2016.709961
2016-01-06
2011.709961
2011.709961
1979.050049
1990.260010
4336660000
1990.260010
2016-01-07
1985.319946
1985.319946
1938.829956
1943.089966
5076590000
1943.089966
2016-01-08
1985.319946
1985.319946
1918.459961
1922.030029
4664940000
1922.030029
2016-01-11
1926.119995
1935.650024
1901.099976
1923.670044
4607290000
1923.670044
2016-01-12
1927.829956
1947.380005
1914.349976
1938.680054
4887260000
1938.680054
2016-01-13
1940.339966
1950.329956
1886.410034
1890.280029
5087030000
1890.280029
2016-01-14
1891.680054
1934.469971
1878.930054
1921.839966
5241110000
1921.839966
2016-01-15
1916.680054
1916.680054
1857.829956
1880.329956
5468460000
1880.329956
2016-01-19
1888.660034
1901.439941
1864.599976
1881.329956
4928350000
1881.329956
2016-01-20
1876.180054
1876.180054
1812.290039
1859.329956
6416070000
1859.329956
6565 rows × 6 columns
In [14]:
def load(ticker):
"""Load data from Quandl into a dataframe, modify
column names and check for non-numeric values."""
# Grab the Quandl token
token = os.environ.get('QUANDL_TOKEN')
if token is None:
token = input("Enter Quandl token: ")
# Call Quandl module, trim input by default from 1990 forward
df = Quandl.get(ticker, authtoken=token, trim_start='1990-01-01')
# Get the column labels
old_columns = list(df.columns.values)
# Use the ticker symbol as our new prefix
ticker_tag = ticker.split('_')[-1] + '_'
# Drop spaces and concatenate
new_labels = [ticker_tag + i.replace(' ', '') for i in old_columns]
# Create a dictionary of old and new column labels
new_columns = dict(zip(old_columns, new_labels))
# Rename the columns using our dictionary
df = df.rename(columns=new_columns)
nulls = df[~df.applymap(np.isreal).all(1)]
# Check for non-numeric values
if len(nulls) > 0:
raise ValueError('Dataframe contains non-numeric values')
row_count = len(df)
print('%d rows loaded into dataframe.' % row_count)
return df
In [15]:
df = load("YAHOO/INDEX_GSPC")
6565 rows loaded into dataframe.
Content source: rolandpan/TwitFin_dev
Similar notebooks: