In [1]:
from imports import *
import avg_clf_train
import import_data

%matplotlib inline


In [2]:
check = True

if check == True:
    # check if last day's data is available
    print Quandl.get("YAHOO/HALO", authtoken='DVhizWXNTePyzzy1eHWR').tail(1)


                 Open       High  Low      Close   Volume  Adjusted Close
Date                                                                     
2015-08-18  19.809999  19.809999   19  19.030001  1101800       19.030001


In [3]:
download = False

start_tickers = ticker_list.tickers
tickers = []

print len(start_tickers), "total tickers to start\n"

if download == True:
    # download data
    for ticker in start_tickers:
        try:
            stock_df = Quandl.get("YAHOO/{}".format(ticker), authtoken='DVhizWXNTePyzzy1eHWR')
            stock_df.to_csv("quandl_data/{}.csv".format(ticker), index=False)
            tickers.append(ticker)
        except:
            print "removed:", ticker
            
elif download == False:
    tickers = [filename[:-4] for filename in os.listdir('quandl_data')]
            
print "\n", len(tickers), "available tickers:"
print tickers


97 total tickers to start


85 available tickers:
['ABIO', 'ACOR', 'ADMA', 'AERI', 'AFFX', 'AGEN', 'APPY', 'ARDM', 'ARIA', 'ARNA', 'ARWR', 'ATNM', 'AVXL', 'AXDX', 'AXGN', 'AXN', 'BABY', 'BASI', 'BCLI', 'BCRX', 'BGMD', 'BIIB', 'BLUE', 'BOTA', 'BRKR', 'CBLI', 'CBMG', 'CBMX', 'CBPO', 'CGEN', 'CLDN', 'CLDX', 'CNMD', 'COHR', 'CPHD', 'CPRX', 'CRIS', 'CUTR', 'CYBX', 'CYNO', 'CYTR', 'DARA', 'DRAD', 'DSCO', 'DYAX', 'ECTE', 'ECYT', 'ELOS', 'ENZN', 'ESMC', 'ETRM', 'EXAS', 'EXEL', 'FATE', 'FEIC', 'FLDM', 'FONR', 'GEVA', 'GILD', 'GNCA', 'HALO', 'HSKA', 'IART', 'ICCC', 'IDRA', 'IDXX', 'ILMN', 'IMMU', 'IMRS', 'INCY', 'INO', 'IRIX', 'LPCN', 'MEIP', 'MNKD', 'OREX', 'PGNX', 'QLTI', 'RMTI', 'SGYP', 'SNGX', 'SYN', 'THLD', 'TNXP', 'TPIV']


In [4]:
stock_df, prediction_df = import_data.import_data(tickers)
print stock_df.shape
stock_df.tail()


(134034, 10)
Out[4]:
Open High Low Close Volume 50dravg 200dravg OC% HL% label
4096 0.52 0.53 0.50 0.52 1093100 0.7780 0.38757 0.000000 0.060 0
4097 0.52 0.52 0.50 0.52 226000 0.7808 0.38852 0.000000 0.040 0
4098 0.51 0.52 0.50 0.51 583300 0.7838 0.38927 0.000000 0.040 1
4099 0.51 0.56 0.50 0.56 475900 0.7884 0.39017 0.098039 0.120 0
4100 0.57 0.63 0.56 0.57 1537100 0.7928 0.39107 0.000000 0.125 0


In [5]:
#stock_df[stock_df['Open'] > 5.0]

In [6]:
#prediction_df[prediction_df['Open'] > 5.0]

In [7]:
stock_df.describe()


Out[7]:
Open High Low Close Volume 50dravg 200dravg OC% HL% label
count 134034.000000 134034.000000 134034.000000 134034.000000 1.340340e+05 134034.000000 134034.000000 134034.000000 134034.000000 134034.000000
mean 22.272779 22.743264 21.790341 22.274280 1.635363e+06 21.982453 21.130649 0.000384 0.060502 0.066423
std 32.478383 33.028408 31.902986 32.470319 5.146146e+06 31.559408 28.843041 0.048146 0.067330 0.249022
min 0.020000 0.020000 0.010000 0.010000 1.001000e+05 0.013040 0.060366 -0.800000 0.000000 0.000000
25% 4.660000 4.810000 4.500000 4.650000 2.079000e+05 4.660000 4.751600 -0.019231 0.029259 0.000000
50% 11.249920 11.540000 10.960000 11.249920 4.372000e+05 11.165500 11.036425 0.000000 0.045288 0.000000
75% 28.400000 29.000000 27.750000 28.430000 1.072000e+06 28.033925 27.298400 0.017167 0.071429 0.000000
max 475.920013 480.179993 460.500000 475.980011 4.553760e+08 419.842800 376.095399 3.365482 5.238095 1.000000

In [8]:
stock_df.Volume.hist(bins=50)
plt.show()



In [9]:
for i in xrange(len(stock_df.columns)):
    print i, stock_df.columns[i], stock_df.corr()['label'].values[i]


0 Open -0.061916978716
1 High -0.0601920154086
2 Low -0.0639046497747
3 Close -0.0618858811241
4 Volume 0.00615471355362
5 50dravg -0.0586777777832
6 200dravg -0.0560666480781
7 OC% 0.00776985827436
8 HL% 0.143478463072
9 label 1.0

In [10]:
negative_df = stock_df[stock_df['label'] == 0]
positive_df = stock_df[stock_df['label'] == 1]

plt.scatter(negative_df['200dravg'], negative_df['Volume'])
plt.scatter(positive_df['200dravg'], positive_df['Volume'], color='r')
plt.show()



In [11]:
x, y, z = 'HL%', 'Low', 'Open'

fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(negative_df[x], negative_df[y], negative_df[z], alpha=0.5, color='y')
ax.scatter(positive_df[x], positive_df[y], positive_df[z], color='r')
ax.set_xlabel(x)
ax.set_ylabel(y)
ax.set_zlabel(z)
ax.view_init(azim=250)
plt.show()



In [12]:
y = stock_df['label'].values
stock_df = stock_df.drop('label', axis=1)
X = stock_df.values

print X.shape, y.shape


(134034, 9) (134034,)

In [13]:
plt.hist(y, bins=50, alpha=0.7, color='r')
plt.show()



In [14]:
y_values = np.unique(y, return_counts=True)[0]
print y_values.shape, "\n"
print y_values


(2,) 

[0 1]

In [15]:
num_of_classes = np.unique(y, return_counts=True)[1]
print num_of_classes
print "percent 1: ", np.true_divide(num_of_classes[1],np.sum(num_of_classes))


[125131   8903]
percent 1:  0.0664234447976

In [16]:
classes_to_remove = []
for i in np.where(num_of_classes == 1)[0]:
    classes_to_remove.append(y_values[i])

print len(classes_to_remove)
print classes_to_remove[:5]
print classes_to_remove[-5:]


0
[]
[]

In [17]:
print "number of labels: ", np.unique(y, return_counts=True)[0].shape[0]


number of labels:  2

In [18]:
#for i in xrange(X.shape[1]):
#    plt.scatter(X[:,i], y)
#    plt.show()

In [19]:
#for i in xrange(X.shape[1]):
#    plt.hist(X[:,i])
#    plt.show()


In [20]:
skb, learners = avg_clf_train.avg_clf_train(X, y, 4, 100.0, 0.1, stock_df)

'''
def avg_clf_train_func(X, y, k, stock_df): 

    clf_or_regr = "clf"

    t0 = time()


    ############################################################################ 

    pipeline = make_pipeline(DecisionTreeClassifier())

        # cross validation    
    cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=42)

    # tune parameters
    params = dict()

    params['decisiontreeclassifier__criterion'] = ['gini', 'entropy']
    params['decisiontreeclassifier__max_features'] = ['auto', 'sqrt', 'log2', None]
    params['decisiontreeclassifier__class_weight'] = ['auto', None]
    params['decisiontreeclassifier__random_state'] = [42]

    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)

    grid_search.fit(X, y)

    print grid_search.best_estimator_


    return skb, learners

skb, learners = avg_clf_train_func(X, y, 4, stock_df)
'''


k = 4
Open 393.82
High 371.29
Low 420.72
Close 393.49
Volume 3.86
50dravg 352.77
200dravg 329.33
OC% 9.75
HL% 2204.59

DecisionTreeClassifier(class_weight='auto', criterion='gini', max_depth=None,
            max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=42, splitter='best')
0.879247957623

confusion matrix:
     FALSE   TRUE
FALSE [23411  1541] 
TRUE  [1696  159]

             precision    recall  f1-score   support

          0       0.93      0.94      0.94     24952
          1       0.09      0.09      0.09      1855

avg / total       0.87      0.88      0.88     26807


minutes for learner to run: 0.011

Out[20]:
'\ndef avg_clf_train_func(X, y, k, stock_df): \n\n    clf_or_regr = "clf"\n\n    t0 = time()\n\n\n    ############################################################################ \n\n    pipeline = make_pipeline(DecisionTreeClassifier())\n\n        # cross validation    \n    cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=42)\n\n    # tune parameters\n    params = dict()\n\n    params[\'decisiontreeclassifier__criterion\'] = [\'gini\', \'entropy\']\n    params[\'decisiontreeclassifier__max_features\'] = [\'auto\', \'sqrt\', \'log2\', None]\n    params[\'decisiontreeclassifier__class_weight\'] = [\'auto\', None]\n    params[\'decisiontreeclassifier__random_state\'] = [42]\n\n    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)\n\n    grid_search.fit(X, y)\n\n    print grid_search.best_estimator_\n\n\n    return skb, learners\n\nskb, learners = avg_clf_train_func(X, y, 4, stock_df)\n'


In [21]:
#X_df = pd.DataFrame(X[:,:4])
#X_df = pd.DataFrame(X)
#X_df['labels'] = y
#sns.pairplot(X_df, hue='labels')
#plt.show()

In [22]:
#plt.hist(y, color='b', alpha=0.7)
#plt.hist(y_pred, color='y', alpha=0.7)
#plt.show()

In [23]:
#plt.scatter(np.arange(y.shape[0]), y, color='b', alpha=0.7)
#plt.scatter(np.arange(y_pred.shape[0]), y_pred, color='y', alpha=0.7)
#plt.show()

In [24]:
#y_pred - y

In [25]:
#np.sum(y)

In [26]:
#error_count = 0
#for i in xrange(len(y)):
#    if y_pred[i] != y[i]:
#        error_count += 1
#        
#print error_count, " / ", len(y)


In [27]:
pred_df = prediction_df[prediction_df['label'].apply(np.isnan) == True]

In [28]:
pred_tickers = pred_df['ticker'].unique()

In [29]:
pred_X = pred_df.drop(['ticker','label'], axis=1).values
print pred_X.shape
print pred_X[0]


(65, 9)
[  9.30000000e-01   9.30000000e-01   8.60000000e-01   9.10000000e-01
   1.47000000e+05   1.06760000e+00   8.93050000e-01  -2.15053763e-02
   8.13953488e-02]

In [30]:
pred_X = skb.transform(pred_X)
print pred_X.shape


(65, 4)

In [31]:
y_predictions = []
for learner in learners:
    y_pred = learner.predict(pred_X)
    print y_pred.shape
    y_predictions.append(y_pred)


(65,)

In [32]:
print len(y_predictions)
y_predictions = y_predictions[0]


1

In [33]:
#y_pred_avg = np.mean(y_predictions, axis=1)
#print y_pred_avg.shape

In [34]:
positive_tickers = []
for i in xrange(len(pred_tickers)):
    print i, pred_tickers[i], y_predictions[i]
    if y_predictions[i] == 1:
        positive_tickers.append(pred_tickers[i])


0 ABIO 0
1 ACOR 0
2 AERI 0
3 AFFX 0
4 AGEN 1
5 ARIA 0
6 ARNA 1
7 ARWR 0
8 ATNM 0
9 AVXL 0
10 AXDX 0
11 AXN 0
12 BABY 0
13 BCLI 1
14 BCRX 0
15 BGMD 0
16 BIIB 0
17 BLUE 0
18 BRKR 0
19 CBMG 0
20 CBPO 0
21 CGEN 0
22 CLDN 0
23 CLDX 0
24 CNMD 0
25 COHR 0
26 CPHD 0
27 CPRX 0
28 CRIS 0
29 CYBX 0
30 CYNO 0
31 CYTR 0
32 DSCO 1
33 DYAX 0
34 ECYT 0
35 ELOS 0
36 ENZN 0
37 ETRM 0
38 EXAS 0
39 EXEL 0
40 FATE 0
41 FEIC 0
42 FLDM 0
43 GILD 0
44 GNCA 0
45 HALO 0
46 IART 0
47 IDRA 0
48 IDXX 0
49 ILMN 0
50 IMMU 1
51 IMRS 0
52 INCY 0
53 INO 0
54 LPCN 0
55 MEIP 0
56 MNKD 0
57 OREX 0
58 PGNX 0
59 RMTI 0
60 SGYP 0
61 SYN 0
62 THLD 0
63 TNXP 0
64 TPIV 0

In [35]:
for ticker in positive_tickers:
    
    past_days = 100
    
    oc = prediction_df[prediction_df['ticker'] == ticker]["OC%"][-past_days:]
    
    num_days = oc.shape[0]
    
    day_range = np.arange(num_days)
    
    plt.plot(day_range, oc, alpha=0.5)
    plt.plot(day_range, [0.05 for x in day_range], color='r')
    plt.title("{0} (previous {1} days)".format(ticker, num_days))
    plt.show()

    print "\t", ticker, "{}-day freq probability:".format(past_days), np.true_divide(np.sum(oc.values > 0.05), past_days)
    print "~"*50, "\n"


	AGEN 100-day freq probability: 0.11
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

	ARNA 100-day freq probability: 0.02
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

	BCLI 100-day freq probability: 0.05
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

	DSCO 100-day freq probability: 0.08
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 

	IMMU 100-day freq probability: 0.06
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~