notebook.community

Edit and run



In [1]:

    
from imports import *
import avg_clf_train
import import_data
from theano import *
from theano import tensor as T

%matplotlib inline



In [2]:

    
'''
X = np.array([[0,0,1],
              [0,1,1],
              [1,0,1],
              [1,1,1]])

print X.shape

y = np.array([[0,0,1,1]]).T

print y.shape
'''









    Out[2]:





'\nX = np.array([[0,0,1],\n              [0,1,1],\n              [1,0,1],\n              [1,1,1]])\n\nprint X.shape\n\ny = np.array([[0,0,1,1]]).T\n\nprint y.shape\n'



In [3]:

    
'''
X = T.dmatrix()
w = T.vector()
y = T.vector()

Xw = X * w
sigmoid = 1 / (1+T.exp(-Xw))

layer1 = function([Xw], sigmoid)
'''









    Out[3]:





'\nX = T.dmatrix()\nw = T.vector()\ny = T.vector()\n\nXw = X * w\nsigmoid = 1 / (1+T.exp(-Xw))\n\nlayer1 = function([Xw], sigmoid)\n'



In [4]:

    
tickers = [filename[:-4] for filename in os.listdir('quandl_data')]

stock_df, prediction_df = import_data.import_data(tickers)
print stock_df.shape
stock_df.tail()









    



(134093, 10)






    Out[4]:






  
    
      
      Open
      High
      Low
      Close
      Volume
      50dravg
      200dravg
      OC%
      HL%
      label
    
  
  
    
      4096
      0.52
      0.53
      0.50
      0.52
      1093100
      0.7780
      0.38757
      0.000000
      0.060
      0
    
    
      4097
      0.52
      0.52
      0.50
      0.52
      226000
      0.7808
      0.38852
      0.000000
      0.040
      0
    
    
      4098
      0.51
      0.52
      0.50
      0.51
      583300
      0.7838
      0.38927
      0.000000
      0.040
      1
    
    
      4099
      0.51
      0.56
      0.50
      0.56
      475900
      0.7884
      0.39017
      0.098039
      0.120
      0
    
    
      4100
      0.57
      0.63
      0.56
      0.57
      1537100
      0.7928
      0.39107
      0.000000
      0.125
      0



In [5]:

    
y = stock_df['label'].values
y = y.reshape(y.shape[0], 1)

stock_df = stock_df.drop('label', axis=1)
X = stock_df.values

print X.shape, y.shape









    



(134093, 9) (134093, 1)



In [6]:

    
new_y = []
for i in xrange(y.shape[0]):
    if y[i] == 0:
        new_y.append(np.array([[1],[0]]))
    elif y[i] == 1:
        new_y.append(np.array([[0],[1]]))
        
y = new_y



In [7]:

    
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X









    Out[7]:





array([[ 0.0076876 ,  0.00847293,  0.00725023, ...,  0.04623293,
         0.21619189,  0.04217096],
       [ 0.00869581,  0.00930566,  0.00881316, ...,  0.04605935,
         0.1874379 ,  0.01970069],
       [ 0.00919992,  0.00918075,  0.00887828, ...,  0.04587912,
         0.18168801,  0.01536585],
       ..., 
       [ 0.00102963,  0.00104132,  0.00106408, ...,  0.00087466,
         0.19205459,  0.00763636],
       [ 0.00102963,  0.00112463,  0.00106408, ...,  0.00087706,
         0.2155907 ,  0.02290909],
       [ 0.0011557 ,  0.00127041,  0.00119438, ...,  0.00087945,
         0.19205459,  0.02386364]])



In [8]:

    
def add_bias(matrix):
    if matrix.ndim == 1:
        matrix = matrix.reshape(matrix.shape[0], 1)
    else:    
        if matrix.shape[1] == 1:
            matrix = np.insert(matrix, 0, 1, axis=0)
        else: 
            matrix = np.hstack((np.ones((matrix.shape[0], 1)), matrix))
    return matrix



In [9]:

    
def g(z):
    return np.true_divide(1, (1+np.exp(-z)))



In [40]:

    
def create_thetas(L, s):
    init_seed_value = 42
    thetas = []
    
    # skip last (output layer)
    for l in xrange(L-1):
        np.random.seed(init_seed_value+1)
        
        s_j = s[l]
        s_jplus1 = s[l+1]
        
        theta = np.random.standard_normal((s_jplus1,s_j+1))
        
        thetas.append(theta)
        
    return thetas



In [308]:

    
def J(m, K, thetas, h_thetas, y):
    summations = 0
    
    for i in xrange(m):
        for k in xrange(K):
            summations += y[i][k] * np.log(h_thetas[i][k]) + (1 - y[i][k]) * np.log(1 - h_thetas[i][k])

        left_J = np.true_divide(summations, -m)

        theta_sums = 0
        for l in xrange(L-1):
            for i in xrange(1,s[l]):
                for j in xrange(1,s[l+1]):
                    # skip bias-related weights
                    theta_sums = thetas[l][j][i]**2

        right_J = np.true_divide(lambda_reg,(2.0*m)) * theta_sums

        return left_J + right_J



In [430]:

    
def forward_propagation(L, x, thetas):
    
    # a1
    a = [x.reshape(x.shape[0], 1)]
    
    for l in xrange(L-1):
        print l
        
        a_l = a[l]
        
        # if not on first level, add bias
        if l != 0:
            a_l = add_bias(a_l)
            
        # z2, z3, z4, ...
        z = thetas[l].dot(a_l)
        
        # a2, a3, a4, ...
        a_l = g(z)
        
        # keep a_l for next iteration
        a.append(a_l)
    
    print "a shapes:", [a_l.shape for a_l in a]
    return a



In [13]:

    
X = add_bias(X)



In [39]:

    
# number of features (minus bias)
n = X.shape[1]-1

# number of output classes
K = 2

# number of layers
L = 4
# number of units in each layer
s = [n if (l < (L-1)) else K for l in xrange(L)]

# create thetas based on L and s
thetas = create_thetas(L, s)
[theta.shape for theta in thetas]









    Out[39]:





[(9, 10), (9, 10), (2, 10)]



In [15]:

    
#indices = np.random.choice(X.shape[0], 200)
#input_X = X[indices,:]
#output_y = y[indices, :]
#print input_X.shape, output_y.shape



In [418]:

    
def back_propagation(L, thetas, a, y_i, Deltas):
    
    deltas = []
    
    # output layer error
    deltas.append(a[L-1] - y_i)
    
    # indexes are weird
    for l in xrange(L-2, 0, -1):
        a_l = a[l]

        # skip bias
        theta_l_T = thetas[l][:,1:].T
        
        # elementwise multiplication
        g_prime = np.multiply(a_l, (1 - a_l))
        
        d = np.multiply(theta_l_T.dot(deltas[0]), g_prime)
        deltas.insert(0,d)
        
    for l in xrange(len(deltas)):
        a_l = a[l]
        print Deltas[l].shape, deltas[l].dot(a_l.T).shape
        # start with a1 and d2
        Deltas[l] += deltas[l].dot(a_l.T)
        
    return Deltas



In [431]:

    
lambda_reg = 100

h_thetas = []

Deltas = []
for theta in thetas:
    Deltas.append(np.zeros(theta.shape))
print "Delta shapes:", [Delta.shape for Delta in Deltas]

D = []

m = 1

# train NN via gradient descent
for i in xrange(m):
    a = forward_propagation(L, X[i], thetas)
    h_theta_i = a[L-1]
    h_thetas.append(h_theta_i)
    
    Deltas = back_propagation(L, thetas, a, y[i], Deltas)
    
#for l in xrange(L-1):
    #print np.true_divide(Deltas[l], (lambda_reg * thetas[l][:,1:])).shape
    #reg_D = np.true_divide(Deltas[l][:,1:] + (lambda_reg * thetas[l][:,1:]), m)
    #noreg_D = np.true_divide(Deltas[l][:,0] + (lambda_reg * thetas[l][:,0]), m)
    #D[l] = np.vstack(noreg_D, reg_D)
    
print "cost:", J(m, K, thetas, h_thetas, y)









    



Delta shapes: [(9, 10), (9, 10), (2, 10)]
0
1
2
a shapes: [(10, 1), (9, 1), (9, 1), (2, 1)]
(9, 10) (9, 10)
(9, 10) (9, 9)






    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-431-be77e97ffb8b> in <module>()
     18     h_thetas.append(h_theta_i)
     19 
---> 20     Deltas = back_propagation(L, thetas, a, y[i], Deltas)
     21 
     22 #for l in xrange(L-1):

<ipython-input-418-f48296d8c472> in back_propagation(L, thetas, a, y_i, Deltas)
     23         print Deltas[l].shape, deltas[l].dot(a_l.T).shape
     24         # start with a1 and d2
---> 25         Deltas[l] += deltas[l].dot(a_l.T)
     26 
     27     return Deltas

ValueError: operands could not be broadcast together with shapes (9,10) (9,9) (9,10)



In [19]:

    
print [theta.shape for theta in thetas]
thetas









    



[(9, 10), (1, 10)]






    Out[19]:





[array([[ 0.25739993, -0.90848143, -0.37850311, -0.5349156 ,  0.85807335,
         -0.41300998,  0.49818858,  2.01019925,  1.26286154, -0.43921486],
        [-0.34643789,  0.45531966, -1.66866271, -0.8620855 ,  0.49291085,
         -0.1243134 ,  1.93513629, -0.61844265, -1.04683899, -0.88961759],
        [ 0.01404054, -0.16082969,  2.23035965, -0.39911572,  0.05444456,
          0.88418182, -0.10798056,  0.55560698,  0.39490664,  0.83720502],
        [-1.40787817,  0.80784941, -0.13828364,  0.18717859, -0.38665814,
          1.65904873, -2.04706913,  1.39931699, -0.67900712,  1.52898513],
        [ 1.22121596,  1.01498852,  0.82812998,  2.26629271, -0.59495567,
         -0.58126954, -0.65589415,  0.92514885, -1.29916134,  1.01116687],
        [-0.28844018, -1.06771307, -1.0776009 , -0.79677376, -1.48604258,
          0.51412877,  0.85179086,  0.95867344, -0.62648405,  0.30793101],
        [ 0.00520568,  0.69153191,  0.44486216,  0.09027953, -1.8583429 ,
         -0.16658004,  0.11087648, -0.69477264, -0.26917557, -1.29922515],
        [-0.32110545,  0.50586874,  2.08905957, -1.01270925, -0.02397407,
         -0.96146905, -0.09256619, -0.22373208,  0.83289216,  0.97411958],
        [ 0.16281816, -0.11449202,  1.18646843,  0.17979165,  1.51644162,
         -1.63403202,  1.7819709 , -0.6177277 , -1.08639929, -0.04475333]]),
 array([[-0.56126641, -1.43341151, -0.67526193, -0.9851564 ,  0.71181553,
         -1.03376972,  0.19049115,  1.62496957,  0.87437877, -0.87397192]])]



In [20]:

    
pred_df = prediction_df[prediction_df['label'].apply(np.isnan) == True]



In [21]:

    
pred_tickers = pred_df['ticker'].unique()



In [22]:

    
pred_X = pred_df.drop(['ticker','label'], axis=1).values
print pred_X.shape
print pred_X[0]









    



(63, 9)
[  9.30000000e-01   9.50000000e-01   8.90000000e-01   9.20000000e-01
   1.23900000e+05   1.06900000e+00   8.92050000e-01  -1.07526882e-02
   6.74157303e-02]



In [23]:

    
pred_X = add_bias(pred_X)
pred_X[0]









    Out[23]:





array([  1.00000000e+00,   9.30000000e-01,   9.50000000e-01,
         8.90000000e-01,   9.20000000e-01,   1.23900000e+05,
         1.06900000e+00,   8.92050000e-01,  -1.07526882e-02,
         6.74157303e-02])



In [24]:

    
for i in xrange(pred_X.shape[0]):
    a1 = X[i,:].reshape(1,n+1)
    a2, a3 = forward_propagation(a1, thetas)
    print str(i).rjust(2), str(pred_tickers[i]).rjust(4), np.round(a3[0][0], 2)









    



 0 ABIO 0.11
 1 ACOR 0.11
 2 AERI 0.11
 3 AFFX 0.11
 4 AGEN 0.11
 5 APPY 0.11
 6 ARIA 0.11
 7 ARNA 0.11
 8 ARWR 0.11
 9 ATNM 0.11
10 AVXL 0.12
11 AXDX 0.11
12  AXN 0.11
13 BABY 0.12
14 BCRX 0.11
15 BGMD 0.11
16 BIIB 0.12
17 BLUE 0.12
18 BRKR 0.12
19 CBPO 0.12
20 CGEN 0.12
21 CLDN 0.12
22 CLDX 0.12
23 COHR 0.12
24 CPHD 0.12
25 CPRX 0.12
26 CRIS 0.12
27 CYBX 0.12
28 CYNO 0.12
29 CYTR 0.12
30 DARA 0.12
31 DSCO 0.12
32 DYAX 0.12
33 ECYT 0.12
34 ENZN 0.12
35 ETRM 0.12
36 EXAS 0.12
37 EXEL 0.12
38 FATE 0.12
39 FEIC 0.12
40 FLDM 0.12
41 GILD 0.12
42 GNCA 0.12
43 HALO 0.12
44 IART 0.12
45 IDRA 0.11
46 IDXX 0.12
47 ILMN 0.12
48 IMMU 0.12
49 IMRS 0.12
50 INCY 0.12
51  INO 0.12
52 LPCN 0.12
53 MEIP 0.12
54 MNKD 0.12
55 OREX 0.12
56 PGNX 0.12
57 RMTI 0.12
58 SGYP 0.12
59  SYN 0.12
60 THLD 0.12
61 TNXP 0.12
62 TPIV 0.12



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [25]:

    
positive_tickers = []
for i in xrange(len(pred_tickers)):
    print i, pred_tickers[i], y_predictions[i]
    if y_predictions[i] == 1:
        positive_tickers.append(pred_tickers[i])









    



0 ABIO





    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-25-424c0eed27cc> in <module>()
      1 positive_tickers = []
      2 for i in xrange(len(pred_tickers)):
----> 3     print i, pred_tickers[i], y_predictions[i]
      4     if y_predictions[i] == 1:
      5         positive_tickers.append(pred_tickers[i])

NameError: name 'y_predictions' is not defined



In [ ]:

    
for ticker in positive_tickers:
    
    past_days = 100
    
    oc = prediction_df[prediction_df['ticker'] == ticker]["OC%"][-past_days:]
    
    num_days = oc.shape[0]
    
    day_range = np.arange(num_days)
    
    plt.plot(day_range, oc, alpha=0.5)
    plt.plot(day_range, [0.05 for x in day_range], color='r')
    plt.title("{0} (previous {1} days)".format(ticker, num_days))
    plt.show()

    print "\t", ticker, "{}-day freq probability:".format(past_days), np.true_divide(np.sum(oc.values > 0.05), past_days)
    print "~"*50, "\n"



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:

    
'''
#errors = []
iterations = 1

for i in xrange(iterations):

    
#plt.plot([x for x in xrange(iterations)], errors)
#plt.show()
'''



In [ ]:

    
'''
X = np.array([[1,2,3,4],
              [1,3,4,5],
              [1,6,7,8],
              [1,5,4,3],
              [1,2,3,3]])
print X.shape

theta1 = np.array([[2,2,3,4],
                   [4,3,4,5],
                   [6,6,7,8]])

print theta1.T
print theta1.shape

print X.dot(theta1.T)
print X.dot(theta1.T).shape
'''

	Open	High	Low	Close	Volume	50dravg	200dravg	OC%	HL%	label
4096	0.52	0.53	0.50	0.52	1093100	0.7780	0.38757	0.000000	0.060	0
4097	0.52	0.52	0.50	0.52	226000	0.7808	0.38852	0.000000	0.040	0
4098	0.51	0.52	0.50	0.51	583300	0.7838	0.38927	0.000000	0.040	1
4099	0.51	0.56	0.50	0.56	475900	0.7884	0.39017	0.098039	0.120	0
4100	0.57	0.63	0.56	0.57	1537100	0.7928	0.39107	0.000000	0.125	0