In [1]:
import os
import glob

from random import random
import numpy as np
import pandas as pd
import math

In [2]:
# --- Notebook configuration -------------------------------------------------
# Relative path of the daily GOOG price table that the load cell reads.
csv_file = os.path.join(
    'Data',
    'daily_sp500_1998-2013',
    'table_goog.csv',
)

# NOTE(review): days_window is not referenced in the visible cells; 252 is
# presumably the number of trading days in a year -- confirm before relying on it.
days_window = 252

# When True, the scaling cell standardises the columns (Change is kept raw).
scale = True

In [3]:
# Load the raw price table. The file carries no header row, so the column
# names are supplied here. 'Date' is parsed to datetimes and becomes the
# index; the 'Time' column is discarded.
data = (
    pd.read_csv(
        csv_file,
        header=None,
        names=['Date', 'Time', 'Open', 'High', 'Low', 'Close', 'Volume'],
        parse_dates=['Date'],
    )
    .set_index('Date')
    .drop('Time', axis=1)
)

# Row count, then a preview of the first rows as the cell output.
print(len(data))
data.head()


2260
Out[3]:
Open High Low Close Volume
Date
2004-08-19 100.01 104.06 95.96 100.76 20925213
2004-08-20 101.19 109.08 100.50 108.35 11224400
2004-08-23 110.76 113.48 109.56 109.95 8787658
2004-08-24 111.24 111.60 103.57 105.00 7384914
2004-08-25 104.96 108.00 103.88 105.96 4456538

In [4]:
# Clear NaN and 0 volume.
# Rows whose Volume is missing are dropped. The explicit .copy() gives this
# cell its own frame, so the column assignments below never write through a
# slice of the frame produced by the load cell.
data = data[~np.isnan(data['Volume'])].copy()

# A Volume of 0 is replaced by 1. The result is assigned back to the column:
# calling replace(..., inplace=True) on `data.Volume` acts on a Series pulled
# out of the frame, which triggers chained-assignment warnings and leaves
# `data` unchanged when pandas Copy-on-Write is active.
data['Volume'] = data['Volume'].replace(0, 1)

# Calculate change: fractional change of Close relative to the previous row.
# The first row has no predecessor, so its Change is NaN.
previous_close = data['Close'].shift()
data['Change'] = (data['Close'] - previous_close) / previous_close

# Row count, then a preview of the first rows as the cell output.
print(len(data))
data.head()


2260
Out[4]:
Open High Low Close Volume Change
Date
2004-08-19 100.01 104.06 95.96 100.76 20925213 NaN
2004-08-20 101.19 109.08 100.50 108.35 11224400 0.075328
2004-08-23 110.76 113.48 109.56 109.95 8787658 0.014767
2004-08-24 111.24 111.60 103.57 105.00 7384914 -0.045020
2004-08-25 104.96 108.00 103.88 105.96 4456538 0.009143

In [5]:
# Passed as the first argument of .expanding() when the percentile columns
# are built below, i.e. the minimum number of observations an expanding
# window must hold before a value is produced (earlier rows come out as NaN).
MinPercentileDays = 100

# Lambda form of the pctrank helper defined just below, kept for reference:
# pctrank = lambda x: pd.Series(x).rank(pct=True).iloc[-1]
def pctrank(data):
    """Return the percentile rank of the last element of ``data``.

    ``data`` is wrapped in a ``pandas.Series``, every element is ranked as a
    fraction of the number of ranked elements (``rank(pct=True)``, pandas'
    default tie handling), and the rank of the final element is returned.

    Parameters
    ----------
    data : array-like
        One-dimensional sequence of values; must not be empty.

    Returns
    -------
    float
        Fractional rank of ``data[-1]`` among all values in ``data``.
    """
    fractional_ranks = pd.Series(data).rank(pct=True)
    # Only the most recent observation's rank is of interest.
    return fractional_ranks.iloc[-1]
    

# Expanding-window percentile rank of each day's Close and Volume: every row
# is ranked against all rows up to and including itself. Windows holding fewer
# than MinPercentileDays observations yield NaN, and any row containing a NaN
# is then dropped.
data = data.assign(
    ClosePctl=data['Close'].expanding(min_periods=MinPercentileDays).apply(pctrank),
    VolumePctl=data['Volume'].expanding(min_periods=MinPercentileDays).apply(pctrank),
).dropna(axis=0)

# Row count, then a preview of the first rows as the cell output.
print(len(data))
data.head()


2161
Out[5]:
Open High Low Close Volume Change ClosePctl VolumePctl
Date
2005-01-10 194.54 198.10 191.83 194.89 7261378 0.006299 0.970000 0.540000
2005-01-11 195.54 197.71 193.18 193.66 6525067 -0.006311 0.930693 0.435644
2005-01-12 194.33 195.93 190.50 195.34 8018579 0.008675 0.970588 0.588235
2005-01-13 195.38 197.39 194.05 195.45 6351912 0.000563 0.970874 0.427184
2005-01-14 195.85 200.01 194.13 200.00 8227647 0.023280 0.990385 0.625000

In [6]:
# Scale
# Keep the unscaled Change column aside; it is written back after scaling so
# that Change stays in its original units.
Change = data['Change']
if scale:
    # Per-column mean and standard deviation computed over the whole frame.
    mean_values = data.mean(axis=0)
    std_values = data.std(axis=0)
    # Standardise column by column; the statistics are applied positionally
    # (plain arrays), in the frame's column order.
    data = (data - mean_values.values) / std_values.values
data['Change'] = Change

print(data)


                Open      High       Low     Close    Volume    Change  \
Date                                                                     
2005-01-10 -2.091139 -2.096571 -2.077672 -2.086718  0.622560  0.006299   
2005-01-11 -2.084515 -2.099147 -2.068700 -2.094865  0.434411 -0.006311   
2005-01-12 -2.092530 -2.110904 -2.086511 -2.083738  0.816046  0.008675   
2005-01-13 -2.085575 -2.101261 -2.062918 -2.083009  0.390165  0.000563   
2005-01-14 -2.082461 -2.083956 -2.062386 -2.052872  0.869468  0.023280   
2005-01-18 -2.049076 -2.050865 -2.032279 -2.028099  1.833863  0.018700   
2005-01-19 -2.024037 -2.049015 -2.045239 -2.070424  1.417181 -0.031364   
2005-01-20 -2.104122 -2.108791 -2.076542 -2.093408  0.965738 -0.017583   
2005-01-21 -2.091801 -2.114669 -2.097278 -2.126129  0.975394 -0.025480   
2005-01-24 -2.129889 -2.154497 -2.154169 -2.178720  2.226474 -0.042024   
2005-01-25 -2.172879 -2.201327 -2.180953 -2.204751  1.418492 -0.021713   
2005-01-26 -2.192288 -2.154365 -2.161945 -2.125135  1.812725  0.067883   
2005-01-27 -2.130552 -2.157602 -2.121736 -2.132487  0.403626 -0.005870   
2005-01-28 -2.120351 -2.119028 -2.114159 -2.116127  1.801131  0.013140   
2005-01-31 -2.096372 -2.108064 -2.078403 -2.081552  1.115840  0.027409   
2005-02-01 -2.094650 -2.106083 -2.085647 -2.107251  1.723317 -0.019829   
2005-02-02 -1.951769 -1.973057 -1.999049 -2.012468  6.454231  0.074613   
2005-02-03 -2.014101 -1.995713 -1.984759 -1.980608  1.983814  0.023338   
2005-02-04 -2.011783 -2.032833 -2.006093 -2.027304  2.407152 -0.033427   
2005-02-07 -2.020129 -2.041750 -2.053214 -2.081154  1.943115 -0.039880   
2005-02-08 -2.074049 -2.083890 -2.059728 -2.061482  1.619580  0.015174   
2005-02-09 -2.049937 -2.073454 -2.093423 -2.109702  2.997521 -0.036638   
2005-02-10 -2.106904 -2.135409 -2.121404 -2.131626  3.508471 -0.017292   
2005-02-11 -2.143336 -2.134748 -2.115954 -2.136263  2.025724 -0.003721   
2005-02-14 -2.168441 -2.131578 -2.149650 -2.100893  8.063398  0.028494   
2005-02-15 -2.097299 -2.085079 -2.069364 -2.084003  5.092575  0.013230   
2005-02-16 -2.089748 -2.088447 -2.061256 -2.064463  2.776890  0.015105   
2005-02-17 -2.069213 -2.085673 -2.044574 -2.067179  1.342056 -0.002068   
2005-02-18 -2.065173 -2.091684 -2.045571 -2.066317  0.685133  0.000657   
2005-02-22 -2.078156 -2.091287 -2.087243 -2.111159  2.076890 -0.034197   
...              ...       ...       ...       ...       ...       ...   
2013-06-28  2.415391  2.419550  2.457374  2.459129 -0.820791  0.003942   
2013-07-01  2.491170  2.487185  2.529219  2.502116 -0.852400  0.007365   
2013-07-02  2.517402  2.480052  2.477844  2.473502 -0.784647 -0.004867   
2013-07-03  2.448710  2.467832  2.486019  2.493770 -0.957454  0.003464   
2013-07-05  2.515679  2.509180  2.544771  2.534571 -0.862099  0.006949   
2013-07-08  2.574766  2.580910  2.609570  2.611472 -0.793102  0.013007   
2013-07-09  2.655446  2.625032  2.615552  2.618559 -0.766032  0.001183   
2013-07-10  2.606362  2.613077  2.631436  2.622798 -0.825424  0.000707   
2013-07-11  2.667966  2.676947  2.685536  2.713806 -0.630502  0.015167   
2013-07-12  2.716189  2.680976  2.730197  2.722351 -0.701162  0.001403   
2013-07-15  2.741890  2.724438  2.737641  2.740433 -0.797586  0.002964   
2013-07-16  2.763816  2.723843  2.721292  2.708706 -0.764534 -0.005186   
2013-07-17  2.721886  2.716908  2.737242  2.702215 -0.876554 -0.001067   
2013-07-18  2.707843  2.671597  2.650112  2.651147 -0.621259 -0.008400   
2013-07-19  2.490442  2.558982  2.466812  2.559145  0.517976 -0.015261   
2013-07-22  2.594969  2.623050  2.609371  2.645384 -0.567392  0.014526   
2013-07-23  2.648094  2.625362  2.626053  2.607564 -0.740467 -0.006279   
2013-07-24  2.631136  2.607529  2.633563  2.599152 -0.744381 -0.001405   
2013-07-25  2.540188  2.518691  2.536397  2.502447 -0.515781 -0.016180   
2013-07-26  2.491899  2.473380  2.509280  2.484828 -0.840593 -0.002996   
2013-07-29  2.480572  2.504886  2.501970  2.461381 -0.782231 -0.004000   
2013-07-30  2.484613  2.510501  2.501770  2.529008 -0.829579  0.011582   
2013-07-31  2.535419  2.516445  2.537128  2.504964 -0.764391 -0.004071   
2013-08-01  2.549793  2.566710  2.596145  2.609220 -0.735332  0.017723   
2013-08-02  2.603050  2.584147  2.634360  2.620745 -0.857665  0.001925   
2013-08-05  2.616895  2.575692  2.623594  2.612929 -0.931019 -0.001303   
2013-08-06  2.612721  2.603632  2.600532  2.557887 -0.885174 -0.009188   
2013-08-07  2.548799  2.529127  2.556401  2.523245 -0.905226 -0.005836   
2013-08-08  2.551449  2.512020  2.531279  2.538479 -0.878099  0.002582   
2013-08-09  2.518461  2.509906  2.562649  2.519470 -0.907462 -0.003213   

            ClosePctl  VolumePctl  
Date                               
2005-01-10   0.765869    1.055131  
2005-01-11   0.559882    0.632702  
2005-01-12   0.768952    1.250384  
2005-01-13   0.770448    0.598460  
2005-01-14   0.872694    1.399206  
2005-01-18   0.923084    2.184699  
2005-01-19   0.725330    1.733352  
2005-01-20   0.384342    1.366103  
2005-01-21  -0.144423    1.380465  
2005-01-24  -0.855797    2.322990  
2005-01-25  -1.220751    1.739601  
2005-01-26  -0.115571    2.114889  
2005-01-27  -0.246668    0.495648  
2005-01-28  -0.097188    2.057444  
2005-01-31   0.647269    1.425837  
2005-02-01   0.011695    2.001997  
2005-02-02   0.923084    2.847391  
2005-02-03   0.923084    2.190630  
2005-02-04   0.834262    2.402615  
2005-02-07   0.526744    2.134808  
2005-02-08   0.661059    1.837733  
2005-02-09  -0.246281    2.616097  
2005-02-10  -0.537379    2.651745  
2005-02-11  -0.695928    2.127341  
2005-02-14  -0.112335    2.917184  
2005-02-15   0.336149    2.787649  
2005-02-16   0.631946    2.467412  
2005-02-17   0.592974    1.323506  
2005-02-18   0.595553    0.798340  
2005-02-22  -0.417506    2.101320  
...               ...         ...  
2013-06-28   0.892548   -0.880369  
2013-07-01   0.901953   -0.983856  
2013-07-02   0.892575   -0.750073  
2013-07-03   0.897280   -1.112637  
2013-07-05   0.906671   -1.005787  
2013-07-08   0.913709   -0.792222  
2013-07-09   0.913713   -0.654848  
2013-07-10   0.913717   -0.890196  
2013-07-11   0.923084   -0.069506  
2013-07-12   0.923084   -0.371768  
2013-07-15   0.923084   -0.802009  
2013-07-16   0.916071   -0.648687  
2013-07-17   0.913738   -1.027889  
2013-07-18   0.909072   -0.035793  
2013-07-19   0.885735    2.046290  
2013-07-22   0.906751    0.152473  
2013-07-23   0.888101   -0.543472  
2013-07-24   0.885785   -0.556339  
2013-07-25   0.869491    0.300151  
2013-07-26   0.855540   -0.945451  
2013-07-29   0.848585   -0.724345  
2013-07-30   0.874216   -0.904274  
2013-07-31   0.867260   -0.642058  
2013-08-01   0.888209   -0.523746  
2013-08-02   0.895196   -0.994330  
2013-08-05   0.890563   -1.096665  
2013-08-06   0.869680   -1.042875  
2013-08-07   0.862742   -1.073390  
2013-08-08   0.867408   -1.026826  
2013-08-09   0.858157   -1.073441  

[2161 rows x 8 columns]

In [7]:
# Column-wise extremes of the frame, printed under a banner each.
min_values = data.min(axis=0)
max_values = data.max(axis=0)

print("Min ===================", min_values, sep="\n")
print("Max ===================", max_values, sep="\n")


Min ===================
Open         -2.218585
High         -2.226690
Low          -2.205676
Close        -2.220449
Volume       -1.010434
Change       -0.097671
ClosePctl    -3.372556
VolumePctl   -1.128832
dtype: float64
Max ===================
Open          2.763816
High          2.724438
Low           2.737641
Close         2.740433
Volume        8.951960
Change        0.198242
ClosePctl     0.923084
VolumePctl    2.917184
dtype: float64

In [ ]: