In [1]:
from imports import *
import avg_clf_train
import import_data
from theano import *
from theano import tensor as T

%matplotlib inline


In [2]:
'''
X = np.array([[0,0,1],
              [0,1,1],
              [1,0,1],
              [1,1,1]])

print X.shape

y = np.array([[0,0,1,1]]).T

print y.shape
'''


Out[2]:
'\nX = np.array([[0,0,1],\n              [0,1,1],\n              [1,0,1],\n              [1,1,1]])\n\nprint X.shape\n\ny = np.array([[0,0,1,1]]).T\n\nprint y.shape\n'

In [3]:
'''
X = T.dmatrix()
w = T.vector()
y = T.vector()

Xw = X * w
sigmoid = 1 / (1+T.exp(-Xw))

layer1 = function([Xw], sigmoid)
'''


Out[3]:
'\nX = T.dmatrix()\nw = T.vector()\ny = T.vector()\n\nXw = X * w\nsigmoid = 1 / (1+T.exp(-Xw))\n\nlayer1 = function([Xw], sigmoid)\n'


In [4]:
tickers = [filename[:-4] for filename in os.listdir('quandl_data')]

stock_df, prediction_df = import_data.import_data(tickers)
print stock_df.shape
stock_df.tail()


(134150, 10)
Out[4]:
Open High Low Close Volume 50dravg 200dravg OC% HL% label
4096 0.52 0.53 0.50 0.52 1093100 0.7780 0.38757 0.000000 0.060 0
4097 0.52 0.52 0.50 0.52 226000 0.7808 0.38852 0.000000 0.040 0
4098 0.51 0.52 0.50 0.51 583300 0.7838 0.38927 0.000000 0.040 1
4099 0.51 0.56 0.50 0.56 475900 0.7884 0.39017 0.098039 0.120 0
4100 0.57 0.63 0.56 0.57 1537100 0.7928 0.39107 0.000000 0.125 0

In [5]:
y = stock_df['label'].values
y = y.reshape(y.shape[0], 1)

stock_df = stock_df.drop('label', axis=1)
X = stock_df.values

print X.shape, y.shape


(134150, 9) (134150, 1)

In [6]:
new_y = []
for i in xrange(y.shape[0]):
    if y[i] == 0:
        new_y.append(np.array([[1],[0]]))
    elif y[i] == 1:
        new_y.append(np.array([[0],[1]]))
        
y = new_y

In [7]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X


Out[7]:
array([[ 0.0076876 ,  0.00847293,  0.00725023, ...,  0.04623293,
         0.21619189,  0.04217096],
       [ 0.00869581,  0.00930566,  0.00881316, ...,  0.04605935,
         0.1874379 ,  0.01970069],
       [ 0.00919992,  0.00918075,  0.00887828, ...,  0.04587912,
         0.18168801,  0.01536585],
       ..., 
       [ 0.00102963,  0.00104132,  0.00106408, ...,  0.00087466,
         0.19205459,  0.00763636],
       [ 0.00102963,  0.00112463,  0.00106408, ...,  0.00087706,
         0.2155907 ,  0.02290909],
       [ 0.0011557 ,  0.00127041,  0.00119438, ...,  0.00087945,
         0.19205459,  0.02386364]])


In [8]:
# number of layers
L = 4
# number of nodes per layer
s = [9, 9, 9, 2]

In [9]:
def create_W_and_b(L,s):
    # weight matrices
    W = []
    for l in xrange(L-1):
        np.random.seed(l)

        W.append(np.random.standard_normal((s[l+1],s[l])))
        W[l] = W[l] * 0.01

    # biases
    b = []
    for w in W:
        b.append(np.ones((w.shape[0],1)))
    
    return W, b
#W, b = create_W_and_b(L,s)
#print "W:", [w.shape for w in W]
#print W[0]
#print "b:", [b_l.shape for b_l in b]

In [10]:
def f(z):
    return np.true_divide(1, (1+np.exp(-z)))

In [11]:
def f_prime(z):
    return np.multiply(z, (1 - z))

In [12]:
# static version
'''
def forward_propagation(W,x,b):
    x = x.reshape(9,1)
    a0 = x
    z1 = W[0].dot(a0) + b[0]
    a1 = f(z1)
    z2 = W[1].dot(a1) + b[1]
    a2 = f(z2)
    z3 = W[2].dot(a2) + b[2]
    a3 = f(z3)
    
    return [z1,z2,z3], [a1, a1,a2,a3]
'''
# dynamic version
def forward_propagation(W,x_i,b):
    a0 = x_i.reshape(9,1)
    a = [a0]
    z = []

    for l in xrange(len(W)):
        
        z_l = W[l].dot(a[l]) + b[l]
        z.append(z_l)

        a_l = f(z[l])
        a.append(a_l)
    
    return z, a
#i = 399
#z, a = forward_propagation(W,X[i,:],b)
#print len(z), len(a)
#print [a_l.shape for a_l in a]
#print a[-1]
#print z

In [13]:
def back_propagation(z,a,W,y_i):
    z.insert(0,0) # 0 as a first-element placeholder to align z and a
    
    # output layer error
    d3 = np.multiply(-(y_i - a[3]), f_prime(z[3]))
    
    # hidden layers
    d2 = np.multiply(W[2].T.dot(d3), f_prime(z[2]))
    d1 = np.multiply(W[1].T.dot(d2), f_prime(z[1]))
    
    return d1, d2, d3
#back_propagation(z,a,W,y[0])

In [14]:
def J(m, lambda_reg, W, h_Wb, y):
    left_sum = 0
    for i in xrange(m):
        left_sum += np.true_divide((np.linalg.norm(h_Wb[i] - y[i])**2),2)
    cost = np.true_divide(left_sum, m) 
    
    right_sum = 0
    for w in W:
        right_sum += np.sum(np.square(w))
    reg = np.true_divide(lambda_reg, 2) * right_sum
    
    return cost + reg

In [44]:
def gradient_checking(m, lambda_reg, w, h_Wb, y):
    
    epsilon = 10**-4
    
    theta = np.ravel(w)
    theta = theta.reshape(theta.shape[0],1)
    
    grad_approx = np.zeros((theta.shape))
    
    e = np.zeros((theta.shape))
    
    theta_pos = theta
    theta_neg = theta
    
    for i in xrange(theta.shape[0]):
        e[i] = 1
    
        theta_pos[i] = theta_pos[i] + epsilon#(epsilon * e[i])
        theta_neg[i] = theta_neg[i] - epsilon#(epsilon * e[i])
    
        g = np.true_divide((J(m, lambda_reg, theta_pos, h_Wb, y) - J(m, lambda_reg, theta_neg, h_Wb, y)), (2 * epsilon))

        grad_approx[i] = g
        
    return grad_approx
#gradient_checking(m, lambda_reg, W[0], h_Wb, y)

In [43]:
W, b = create_W_and_b(L,s)

Delta_W_0 = np.zeros(W[0].shape)
Delta_b_0 = np.zeros(b[0].shape)

Delta_W_1 = np.zeros(W[1].shape)
Delta_b_1 = np.zeros(b[1].shape)

Delta_W_2 = np.zeros(W[2].shape)
Delta_b_2 = np.zeros(b[2].shape)

h_Wb = []

alpha = 0.1
lambda_reg = 10.0

costs = []

batches = 10

m = 100

for batch in xrange(batches):
    for i in xrange(m):
        z,a = forward_propagation(W,X[i,:],b)

        h_Wb.append(a[-1])

        d1,d2,d3 = back_propagation(z,a,W,y[i])

        # partial derivatives
        Delta_W_0 += d1.dot(a[0].T)
        Delta_b_0 += d1

        Delta_W_1 += d2.dot(a[1].T)
        Delta_b_1 += d2

        Delta_W_2 += d3.dot(a[2].T)
        Delta_b_2 += d3

    #print np.sum(Delta_W_0)
    #print np.sum(gradient_checking(m, lambda_reg, W[0], h_Wb[:m], y[:m]))

    W[0] -= alpha * ((np.true_divide(Delta_W_0, m) * (lambda_reg * W[0])))
    b[0] -= alpha * (np.true_divide(Delta_b_0, m))

    W[1] -= alpha * ((np.true_divide(Delta_W_1, m) * (lambda_reg * W[1])))
    b[1] -= alpha * (np.true_divide(Delta_b_1, m))

    W[2] -= alpha * ((np.true_divide(Delta_W_2, m) * (lambda_reg * W[2])))
    b[2] -= alpha * (np.true_divide(Delta_b_2, m))
    
    costs.append(J(i, lambda_reg, W, h_Wb, y))
    


plt.plot([x for x in xrange(batches)], costs)
plt.show()



In [36]:
W[2]


Out[36]:
array([[ -3.06561573e-05,  -3.98447659e-06,  -1.66187494e-04,
          1.12678620e-04,  -1.37598280e-04,  -6.38599273e-05,
          4.07134475e-05,  -9.58621574e-05,  -7.99128002e-05],
       [ -7.28754782e-03,   4.42497084e-03,   1.83514477e-02,
          3.33550253e-04,  -8.95321616e-03,   4.31840718e-03,
         -4.76790002e-03,  -1.53199198e-04,   9.41397317e-03]])

In [37]:
pred_df = prediction_df[prediction_df['label'].apply(np.isnan) == True]

In [38]:
pred_tickers = pred_df['ticker'].unique()

In [39]:
pred_X = pred_df.drop(['ticker','label'], axis=1).values
print pred_X.shape
print pred_X[0]


(63, 9)
[  9.20000000e-01   9.20000000e-01   8.70000000e-01   9.00000000e-01
   1.93900000e+05   1.07040000e+00   8.91100000e-01  -2.17391304e-02
   5.74712644e-02]

In [41]:
for i in xrange(pred_X.shape[0]):
    z,a = forward_propagation(W,pred_X[i,:],b)
    print str(i).rjust(2), str(pred_tickers[i]).rjust(4), np.round(a[-1], 2)


 0 ABIO [[ 0.43]
 [ 0.73]]
 1 ACOR [[ 0.43]
 [ 0.73]]
 2 AERI [[ 0.43]
 [ 0.73]]
 3 AFFX [[ 0.43]
 [ 0.73]]
 4 AGEN [[ 0.43]
 [ 0.73]]
 5 ARIA [[ 0.43]
 [ 0.73]]
 6 ARNA [[ 0.43]
 [ 0.73]]
 7 ARWR [[ 0.43]
 [ 0.73]]
 8 ATNM [[ 0.43]
 [ 0.73]]
 9 AVXL [[ 0.43]
 [ 0.73]]
10 AXDX [[ 0.43]
 [ 0.73]]
11  AXN [[ 0.43]
 [ 0.73]]
12 BABY [[ 0.43]
 [ 0.73]]
13 BCRX [[ 0.43]
 [ 0.73]]
14 BGMD [[ 0.43]
 [ 0.73]]
15 BIIB [[ 0.43]
 [ 0.73]]
16 BLUE [[ 0.43]
 [ 0.73]]
17 BRKR [[ 0.43]
 [ 0.73]]
18 CBMG [[ 0.43]
 [ 0.73]]
19 CBPO [[ 0.43]
 [ 0.73]]
20 CGEN [[ 0.43]
 [ 0.73]]
21 CLDN [[ 0.43]
 [ 0.73]]
22 CLDX [[ 0.43]
 [ 0.73]]
23 COHR [[ 0.43]
 [ 0.73]]
24 CPHD [[ 0.43]
 [ 0.73]]
25 CPRX [[ 0.43]
 [ 0.73]]
26 CRIS [[ 0.43]
 [ 0.73]]
27 CYBX [[ 0.43]
 [ 0.73]]
28 CYNO [[ 0.43]
 [ 0.73]]
29 CYTR [[ 0.43]
 [ 0.73]]
30 DSCO [[ 0.43]
 [ 0.73]]
31 DYAX [[ 0.43]
 [ 0.73]]
32 ECYT [[ 0.43]
 [ 0.73]]
33 ENZN [[ 0.43]
 [ 0.73]]
34 ETRM [[ 0.43]
 [ 0.73]]
35 EXAS [[ 0.43]
 [ 0.73]]
36 EXEL [[ 0.43]
 [ 0.73]]
37 FATE [[ 0.43]
 [ 0.73]]
38 FEIC [[ 0.43]
 [ 0.73]]
39 FLDM [[ 0.43]
 [ 0.73]]
40 GILD [[ 0.43]
 [ 0.73]]
41 GNCA [[ 0.43]
 [ 0.73]]
42 HALO [[ 0.43]
 [ 0.73]]
43 IART [[ 0.43]
 [ 0.73]]
44 IDRA [[ 0.43]
 [ 0.73]]
45 IDXX [[ 0.43]
 [ 0.73]]
46 ILMN [[ 0.43]
 [ 0.73]]
47 IMMU [[ 0.43]
 [ 0.73]]
48 IMRS [[ 0.43]
 [ 0.73]]
49 INCY [[ 0.43]
 [ 0.73]]
50  INO [[ 0.43]
 [ 0.73]]
51 LPCN [[ 0.43]
 [ 0.73]]
52 MEIP [[ 0.43]
 [ 0.73]]
53 MNKD [[ 0.43]
 [ 0.73]]
54 OREX [[ 0.43]
 [ 0.73]]
55 PGNX [[ 0.43]
 [ 0.73]]
56 QLTI [[ 0.43]
 [ 0.73]]
57 RMTI [[ 0.43]
 [ 0.73]]
58 SGYP [[ 0.43]
 [ 0.73]]
59  SYN [[ 0.43]
 [ 0.73]]
60 THLD [[ 0.43]
 [ 0.73]]
61 TNXP [[ 0.43]
 [ 0.73]]
62 TPIV [[ 0.43]
 [ 0.73]]

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [25]:
positive_tickers = []
for i in xrange(len(pred_tickers)):
    print i, pred_tickers[i], y_predictions[i]
    if y_predictions[i] == 1:
        positive_tickers.append(pred_tickers[i])


0 ABIO
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-25-424c0eed27cc> in <module>()
      1 positive_tickers = []
      2 for i in xrange(len(pred_tickers)):
----> 3     print i, pred_tickers[i], y_predictions[i]
      4     if y_predictions[i] == 1:
      5         positive_tickers.append(pred_tickers[i])

NameError: name 'y_predictions' is not defined

In [ ]:
for ticker in positive_tickers:
    
    past_days = 100
    
    oc = prediction_df[prediction_df['ticker'] == ticker]["OC%"][-past_days:]
    
    num_days = oc.shape[0]
    
    day_range = np.arange(num_days)
    
    plt.plot(day_range, oc, alpha=0.5)
    plt.plot(day_range, [0.05 for x in day_range], color='r')
    plt.title("{0} (previous {1} days)".format(ticker, num_days))
    plt.show()

    print "\t", ticker, "{}-day freq probability:".format(past_days), np.true_divide(np.sum(oc.values > 0.05), past_days)
    print "~"*50, "\n"

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:
'''
#errors = []
iterations = 1

for i in xrange(iterations):

    
#plt.plot([x for x in xrange(iterations)], errors)
#plt.show()
'''

In [ ]:
'''
X = np.array([[1,2,3,4],
              [1,3,4,5],
              [1,6,7,8],
              [1,5,4,3],
              [1,2,3,3]])
print X.shape

theta1 = np.array([[2,2,3,4],
                   [4,3,4,5],
                   [6,6,7,8]])

print theta1.T
print theta1.shape

print X.dot(theta1.T)
print X.dot(theta1.T).shape
'''