In [1]:
from imports import *
import avg_clf_train
import import_data
from theano import *
from theano import tensor as T

%matplotlib inline


In [2]:
'''
X = np.array([[0,0,1],
              [0,1,1],
              [1,0,1],
              [1,1,1]])

print X.shape

y = np.array([[0,0,1,1]]).T

print y.shape
'''


Out[2]:
'\nX = np.array([[0,0,1],\n              [0,1,1],\n              [1,0,1],\n              [1,1,1]])\n\nprint X.shape\n\ny = np.array([[0,0,1,1]]).T\n\nprint y.shape\n'

In [3]:
'''
X = T.dmatrix()
w = T.vector()
y = T.vector()

Xw = X * w
sigmoid = 1 / (1+T.exp(-Xw))

layer1 = function([Xw], sigmoid)
'''


Out[3]:
'\nX = T.dmatrix()\nw = T.vector()\ny = T.vector()\n\nXw = X * w\nsigmoid = 1 / (1+T.exp(-Xw))\n\nlayer1 = function([Xw], sigmoid)\n'


In [4]:
tickers = [filename[:-4] for filename in os.listdir('quandl_data')]

stock_df, prediction_df = import_data.import_data(tickers)
print stock_df.shape
stock_df.tail()


(134093, 10)
Out[4]:
Open High Low Close Volume 50dravg 200dravg OC% HL% label
4096 0.52 0.53 0.50 0.52 1093100 0.7780 0.38757 0.000000 0.060 0
4097 0.52 0.52 0.50 0.52 226000 0.7808 0.38852 0.000000 0.040 0
4098 0.51 0.52 0.50 0.51 583300 0.7838 0.38927 0.000000 0.040 1
4099 0.51 0.56 0.50 0.56 475900 0.7884 0.39017 0.098039 0.120 0
4100 0.57 0.63 0.56 0.57 1537100 0.7928 0.39107 0.000000 0.125 0

In [5]:
y = stock_df['label'].values
y = y.reshape(y.shape[0], 1)

stock_df = stock_df.drop('label', axis=1)
X = stock_df.values

print X.shape, y.shape


(134093, 9) (134093, 1)

In [6]:
new_y = []
for i in xrange(y.shape[0]):
    if y[i] == 0:
        new_y.append(np.array([[1],[0]]))
    elif y[i] == 1:
        new_y.append(np.array([[0],[1]]))
        
y = new_y

In [7]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X


Out[7]:
array([[ 0.0076876 ,  0.00847293,  0.00725023, ...,  0.04623293,
         0.21619189,  0.04217096],
       [ 0.00869581,  0.00930566,  0.00881316, ...,  0.04605935,
         0.1874379 ,  0.01970069],
       [ 0.00919992,  0.00918075,  0.00887828, ...,  0.04587912,
         0.18168801,  0.01536585],
       ..., 
       [ 0.00102963,  0.00104132,  0.00106408, ...,  0.00087466,
         0.19205459,  0.00763636],
       [ 0.00102963,  0.00112463,  0.00106408, ...,  0.00087706,
         0.2155907 ,  0.02290909],
       [ 0.0011557 ,  0.00127041,  0.00119438, ...,  0.00087945,
         0.19205459,  0.02386364]])


In [8]:
def add_bias(matrix):
    if matrix.ndim == 1:
        matrix = matrix.reshape(matrix.shape[0], 1)
    else:    
        if matrix.shape[1] == 1:
            matrix = np.insert(matrix, 0, 1, axis=0)
        else: 
            matrix = np.hstack((np.ones((matrix.shape[0], 1)), matrix))
    return matrix

In [9]:
def g(z):
    return np.true_divide(1, (1+np.exp(-z)))

In [40]:
def create_thetas(L, s):
    init_seed_value = 42
    thetas = []
    
    # skip last (output layer)
    for l in xrange(L-1):
        np.random.seed(init_seed_value+1)
        
        s_j = s[l]
        s_jplus1 = s[l+1]
        
        theta = np.random.standard_normal((s_jplus1,s_j+1))
        
        thetas.append(theta)
        
    return thetas

In [308]:
def J(m, K, thetas, h_thetas, y):
    summations = 0
    
    for i in xrange(m):
        for k in xrange(K):
            summations += y[i][k] * np.log(h_thetas[i][k]) + (1 - y[i][k]) * np.log(1 - h_thetas[i][k])

        left_J = np.true_divide(summations, -m)

        theta_sums = 0
        for l in xrange(L-1):
            for i in xrange(1,s[l]):
                for j in xrange(1,s[l+1]):
                    # skip bias-related weights
                    theta_sums = thetas[l][j][i]**2

        right_J = np.true_divide(lambda_reg,(2.0*m)) * theta_sums

        return left_J + right_J




In [430]:
def forward_propagation(L, x, thetas):
    
    # a1
    a = [x.reshape(x.shape[0], 1)]
    
    for l in xrange(L-1):
        print l
        
        a_l = a[l]
        
        # if not on first level, add bias
        if l != 0:
            a_l = add_bias(a_l)
            
        # z2, z3, z4, ...
        z = thetas[l].dot(a_l)
        
        # a2, a3, a4, ...
        a_l = g(z)
        
        # keep a_l for next iteration
        a.append(a_l)
    
    print "a shapes:", [a_l.shape for a_l in a]
    return a


In [13]:
X = add_bias(X)

In [39]:
# number of features (minus bias)
n = X.shape[1]-1

# number of output classes
K = 2

# number of layers
L = 4
# number of units in each layer
s = [n if (l < (L-1)) else K for l in xrange(L)]

# create thetas based on L and s
thetas = create_thetas(L, s)
[theta.shape for theta in thetas]


Out[39]:
[(9, 10), (9, 10), (2, 10)]

In [15]:
#indices = np.random.choice(X.shape[0], 200)
#input_X = X[indices,:]
#output_y = y[indices, :]
#print input_X.shape, output_y.shape

In [418]:
def back_propagation(L, thetas, a, y_i, Deltas):
    
    deltas = []
    
    # output layer error
    deltas.append(a[L-1] - y_i)
    
    # indexes are weird
    for l in xrange(L-2, 0, -1):
        a_l = a[l]

        # skip bias
        theta_l_T = thetas[l][:,1:].T
        
        # elementwise multiplication
        g_prime = np.multiply(a_l, (1 - a_l))
        
        d = np.multiply(theta_l_T.dot(deltas[0]), g_prime)
        deltas.insert(0,d)
        
    for l in xrange(len(deltas)):
        a_l = a[l]
        print Deltas[l].shape, deltas[l].dot(a_l.T).shape
        # start with a1 and d2
        Deltas[l] += deltas[l].dot(a_l.T)
        
    return Deltas

In [431]:
lambda_reg = 100

h_thetas = []

Deltas = []
for theta in thetas:
    Deltas.append(np.zeros(theta.shape))
print "Delta shapes:", [Delta.shape for Delta in Deltas]

D = []

m = 1

# train NN via gradient descent
for i in xrange(m):
    a = forward_propagation(L, X[i], thetas)
    h_theta_i = a[L-1]
    h_thetas.append(h_theta_i)
    
    Deltas = back_propagation(L, thetas, a, y[i], Deltas)
    
#for l in xrange(L-1):
    #print np.true_divide(Deltas[l], (lambda_reg * thetas[l][:,1:])).shape
    #reg_D = np.true_divide(Deltas[l][:,1:] + (lambda_reg * thetas[l][:,1:]), m)
    #noreg_D = np.true_divide(Deltas[l][:,0] + (lambda_reg * thetas[l][:,0]), m)
    #D[l] = np.vstack(noreg_D, reg_D)
    
print "cost:", J(m, K, thetas, h_thetas, y)


Delta shapes: [(9, 10), (9, 10), (2, 10)]
0
1
2
a shapes: [(10, 1), (9, 1), (9, 1), (2, 1)]
(9, 10) (9, 10)
(9, 10) (9, 9)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-431-be77e97ffb8b> in <module>()
     18     h_thetas.append(h_theta_i)
     19 
---> 20     Deltas = back_propagation(L, thetas, a, y[i], Deltas)
     21 
     22 #for l in xrange(L-1):

<ipython-input-418-f48296d8c472> in back_propagation(L, thetas, a, y_i, Deltas)
     23         print Deltas[l].shape, deltas[l].dot(a_l.T).shape
     24         # start with a1 and d2
---> 25         Deltas[l] += deltas[l].dot(a_l.T)
     26 
     27     return Deltas

ValueError: operands could not be broadcast together with shapes (9,10) (9,9) (9,10) 


In [19]:
print [theta.shape for theta in thetas]
thetas


[(9, 10), (1, 10)]
Out[19]:
[array([[ 0.25739993, -0.90848143, -0.37850311, -0.5349156 ,  0.85807335,
         -0.41300998,  0.49818858,  2.01019925,  1.26286154, -0.43921486],
        [-0.34643789,  0.45531966, -1.66866271, -0.8620855 ,  0.49291085,
         -0.1243134 ,  1.93513629, -0.61844265, -1.04683899, -0.88961759],
        [ 0.01404054, -0.16082969,  2.23035965, -0.39911572,  0.05444456,
          0.88418182, -0.10798056,  0.55560698,  0.39490664,  0.83720502],
        [-1.40787817,  0.80784941, -0.13828364,  0.18717859, -0.38665814,
          1.65904873, -2.04706913,  1.39931699, -0.67900712,  1.52898513],
        [ 1.22121596,  1.01498852,  0.82812998,  2.26629271, -0.59495567,
         -0.58126954, -0.65589415,  0.92514885, -1.29916134,  1.01116687],
        [-0.28844018, -1.06771307, -1.0776009 , -0.79677376, -1.48604258,
          0.51412877,  0.85179086,  0.95867344, -0.62648405,  0.30793101],
        [ 0.00520568,  0.69153191,  0.44486216,  0.09027953, -1.8583429 ,
         -0.16658004,  0.11087648, -0.69477264, -0.26917557, -1.29922515],
        [-0.32110545,  0.50586874,  2.08905957, -1.01270925, -0.02397407,
         -0.96146905, -0.09256619, -0.22373208,  0.83289216,  0.97411958],
        [ 0.16281816, -0.11449202,  1.18646843,  0.17979165,  1.51644162,
         -1.63403202,  1.7819709 , -0.6177277 , -1.08639929, -0.04475333]]),
 array([[-0.56126641, -1.43341151, -0.67526193, -0.9851564 ,  0.71181553,
         -1.03376972,  0.19049115,  1.62496957,  0.87437877, -0.87397192]])]

In [20]:
pred_df = prediction_df[prediction_df['label'].apply(np.isnan) == True]

In [21]:
pred_tickers = pred_df['ticker'].unique()

In [22]:
pred_X = pred_df.drop(['ticker','label'], axis=1).values
print pred_X.shape
print pred_X[0]


(63, 9)
[  9.30000000e-01   9.50000000e-01   8.90000000e-01   9.20000000e-01
   1.23900000e+05   1.06900000e+00   8.92050000e-01  -1.07526882e-02
   6.74157303e-02]

In [23]:
pred_X = add_bias(pred_X)
pred_X[0]


Out[23]:
array([  1.00000000e+00,   9.30000000e-01,   9.50000000e-01,
         8.90000000e-01,   9.20000000e-01,   1.23900000e+05,
         1.06900000e+00,   8.92050000e-01,  -1.07526882e-02,
         6.74157303e-02])

In [24]:
for i in xrange(pred_X.shape[0]):
    a1 = X[i,:].reshape(1,n+1)
    a2, a3 = forward_propagation(a1, thetas)
    print str(i).rjust(2), str(pred_tickers[i]).rjust(4), np.round(a3[0][0], 2)


 0 ABIO 0.11
 1 ACOR 0.11
 2 AERI 0.11
 3 AFFX 0.11
 4 AGEN 0.11
 5 APPY 0.11
 6 ARIA 0.11
 7 ARNA 0.11
 8 ARWR 0.11
 9 ATNM 0.11
10 AVXL 0.12
11 AXDX 0.11
12  AXN 0.11
13 BABY 0.12
14 BCRX 0.11
15 BGMD 0.11
16 BIIB 0.12
17 BLUE 0.12
18 BRKR 0.12
19 CBPO 0.12
20 CGEN 0.12
21 CLDN 0.12
22 CLDX 0.12
23 COHR 0.12
24 CPHD 0.12
25 CPRX 0.12
26 CRIS 0.12
27 CYBX 0.12
28 CYNO 0.12
29 CYTR 0.12
30 DARA 0.12
31 DSCO 0.12
32 DYAX 0.12
33 ECYT 0.12
34 ENZN 0.12
35 ETRM 0.12
36 EXAS 0.12
37 EXEL 0.12
38 FATE 0.12
39 FEIC 0.12
40 FLDM 0.12
41 GILD 0.12
42 GNCA 0.12
43 HALO 0.12
44 IART 0.12
45 IDRA 0.11
46 IDXX 0.12
47 ILMN 0.12
48 IMMU 0.12
49 IMRS 0.12
50 INCY 0.12
51  INO 0.12
52 LPCN 0.12
53 MEIP 0.12
54 MNKD 0.12
55 OREX 0.12
56 PGNX 0.12
57 RMTI 0.12
58 SGYP 0.12
59  SYN 0.12
60 THLD 0.12
61 TNXP 0.12
62 TPIV 0.12

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [25]:
positive_tickers = []
for i in xrange(len(pred_tickers)):
    print i, pred_tickers[i], y_predictions[i]
    if y_predictions[i] == 1:
        positive_tickers.append(pred_tickers[i])


0 ABIO
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-25-424c0eed27cc> in <module>()
      1 positive_tickers = []
      2 for i in xrange(len(pred_tickers)):
----> 3     print i, pred_tickers[i], y_predictions[i]
      4     if y_predictions[i] == 1:
      5         positive_tickers.append(pred_tickers[i])

NameError: name 'y_predictions' is not defined

In [ ]:
for ticker in positive_tickers:
    
    past_days = 100
    
    oc = prediction_df[prediction_df['ticker'] == ticker]["OC%"][-past_days:]
    
    num_days = oc.shape[0]
    
    day_range = np.arange(num_days)
    
    plt.plot(day_range, oc, alpha=0.5)
    plt.plot(day_range, [0.05 for x in day_range], color='r')
    plt.title("{0} (previous {1} days)".format(ticker, num_days))
    plt.show()

    print "\t", ticker, "{}-day freq probability:".format(past_days), np.true_divide(np.sum(oc.values > 0.05), past_days)
    print "~"*50, "\n"

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:
'''
#errors = []
iterations = 1

for i in xrange(iterations):

    
#plt.plot([x for x in xrange(iterations)], errors)
#plt.show()
'''

In [ ]:
'''
X = np.array([[1,2,3,4],
              [1,3,4,5],
              [1,6,7,8],
              [1,5,4,3],
              [1,2,3,3]])
print X.shape

theta1 = np.array([[2,2,3,4],
                   [4,3,4,5],
                   [6,6,7,8]])

print theta1.T
print theta1.shape

print X.dot(theta1.T)
print X.dot(theta1.T).shape
'''