notebook.community

Edit and run



In [1]:

    
from imports import *
import avg_clf_train
import import_data

%matplotlib inline



In [2]:

    
check = True

if check == True:
    # check if last day's data is available
    print Quandl.get("YAHOO/HALO", authtoken='DVhizWXNTePyzzy1eHWR').tail(1)









    



                 Open       High  Low      Close   Volume  Adjusted Close
Date                                                                     
2015-08-18  19.809999  19.809999   19  19.030001  1101800       19.030001



In [3]:

    
download = False

start_tickers = ticker_list.tickers
tickers = []

print len(start_tickers), "total tickers to start\n"

if download == True:
    # download data
    for ticker in start_tickers:
        try:
            stock_df = Quandl.get("YAHOO/{}".format(ticker), authtoken='DVhizWXNTePyzzy1eHWR')
            stock_df.to_csv("quandl_data/{}.csv".format(ticker), index=False)
            tickers.append(ticker)
        except:
            print "removed:", ticker
            
elif download == False:
    tickers = [filename[:-4] for filename in os.listdir('quandl_data')]
            
print "\n", len(tickers), "available tickers:"
print tickers









    



97 total tickers to start


85 available tickers:
['ABIO', 'ACOR', 'ADMA', 'AERI', 'AFFX', 'AGEN', 'APPY', 'ARDM', 'ARIA', 'ARNA', 'ARWR', 'ATNM', 'AVXL', 'AXDX', 'AXGN', 'AXN', 'BABY', 'BASI', 'BCLI', 'BCRX', 'BGMD', 'BIIB', 'BLUE', 'BOTA', 'BRKR', 'CBLI', 'CBMG', 'CBMX', 'CBPO', 'CGEN', 'CLDN', 'CLDX', 'CNMD', 'COHR', 'CPHD', 'CPRX', 'CRIS', 'CUTR', 'CYBX', 'CYNO', 'CYTR', 'DARA', 'DRAD', 'DSCO', 'DYAX', 'ECTE', 'ECYT', 'ELOS', 'ENZN', 'ESMC', 'ETRM', 'EXAS', 'EXEL', 'FATE', 'FEIC', 'FLDM', 'FONR', 'GEVA', 'GILD', 'GNCA', 'HALO', 'HSKA', 'IART', 'ICCC', 'IDRA', 'IDXX', 'ILMN', 'IMMU', 'IMRS', 'INCY', 'INO', 'IRIX', 'LPCN', 'MEIP', 'MNKD', 'OREX', 'PGNX', 'QLTI', 'RMTI', 'SGYP', 'SNGX', 'SYN', 'THLD', 'TNXP', 'TPIV']



In [4]:

    
stock_df, prediction_df = import_data.import_data(tickers)
print stock_df.shape
stock_df.tail()









    



(134034, 10)






    Out[4]:






  
    
      
      Open
      High
      Low
      Close
      Volume
      50dravg
      200dravg
      OC%
      HL%
      label
    
  
  
    
      4096
      0.52
      0.53
      0.50
      0.52
      1093100
      0.7780
      0.38757
      0.000000
      0.060
      0
    
    
      4097
      0.52
      0.52
      0.50
      0.52
      226000
      0.7808
      0.38852
      0.000000
      0.040
      0
    
    
      4098
      0.51
      0.52
      0.50
      0.51
      583300
      0.7838
      0.38927
      0.000000
      0.040
      1
    
    
      4099
      0.51
      0.56
      0.50
      0.56
      475900
      0.7884
      0.39017
      0.098039
      0.120
      0
    
    
      4100
      0.57
      0.63
      0.56
      0.57
      1537100
      0.7928
      0.39107
      0.000000
      0.125
      0



In [5]:

    
#stock_df[stock_df['Open'] > 5.0]



In [6]:

    
#prediction_df[prediction_df['Open'] > 5.0]



In [7]:

    
stock_df.describe()









    Out[7]:






  
    
      
      Open
      High
      Low
      Close
      Volume
      50dravg
      200dravg
      OC%
      HL%
      label
    
  
  
    
      count
      134034.000000
      134034.000000
      134034.000000
      134034.000000
      1.340340e+05
      134034.000000
      134034.000000
      134034.000000
      134034.000000
      134034.000000
    
    
      mean
      22.272779
      22.743264
      21.790341
      22.274280
      1.635363e+06
      21.982453
      21.130649
      0.000384
      0.060502
      0.066423
    
    
      std
      32.478383
      33.028408
      31.902986
      32.470319
      5.146146e+06
      31.559408
      28.843041
      0.048146
      0.067330
      0.249022
    
    
      min
      0.020000
      0.020000
      0.010000
      0.010000
      1.001000e+05
      0.013040
      0.060366
      -0.800000
      0.000000
      0.000000
    
    
      25%
      4.660000
      4.810000
      4.500000
      4.650000
      2.079000e+05
      4.660000
      4.751600
      -0.019231
      0.029259
      0.000000
    
    
      50%
      11.249920
      11.540000
      10.960000
      11.249920
      4.372000e+05
      11.165500
      11.036425
      0.000000
      0.045288
      0.000000
    
    
      75%
      28.400000
      29.000000
      27.750000
      28.430000
      1.072000e+06
      28.033925
      27.298400
      0.017167
      0.071429
      0.000000
    
    
      max
      475.920013
      480.179993
      460.500000
      475.980011
      4.553760e+08
      419.842800
      376.095399
      3.365482
      5.238095
      1.000000



In [8]:

    
stock_df.Volume.hist(bins=50)
plt.show()



In [9]:

    
for i in xrange(len(stock_df.columns)):
    print i, stock_df.columns[i], stock_df.corr()['label'].values[i]









    



0 Open -0.061916978716
1 High -0.0601920154086
2 Low -0.0639046497747
3 Close -0.0618858811241
4 Volume 0.00615471355362
5 50dravg -0.0586777777832
6 200dravg -0.0560666480781
7 OC% 0.00776985827436
8 HL% 0.143478463072
9 label 1.0



In [10]:

    
negative_df = stock_df[stock_df['label'] == 0]
positive_df = stock_df[stock_df['label'] == 1]

plt.scatter(negative_df['200dravg'], negative_df['Volume'])
plt.scatter(positive_df['200dravg'], positive_df['Volume'], color='r')
plt.show()



In [11]:

    
x, y, z = 'HL%', 'Low', 'Open'

fig = plt.figure()
ax = fig.gca(projection='3d')
ax.scatter(negative_df[x], negative_df[y], negative_df[z], alpha=0.5, color='y')
ax.scatter(positive_df[x], positive_df[y], positive_df[z], color='r')
ax.set_xlabel(x)
ax.set_ylabel(y)
ax.set_zlabel(z)
ax.view_init(azim=250)
plt.show()



In [12]:

    
y = stock_df['label'].values
stock_df = stock_df.drop('label', axis=1)
X = stock_df.values

print X.shape, y.shape









    



(134034, 9) (134034,)



In [13]:

    
plt.hist(y, bins=50, alpha=0.7, color='r')
plt.show()



In [14]:

    
y_values = np.unique(y, return_counts=True)[0]
print y_values.shape, "\n"
print y_values









    



(2,) 

[0 1]



In [15]:

    
num_of_classes = np.unique(y, return_counts=True)[1]
print num_of_classes
print "percent 1: ", np.true_divide(num_of_classes[1],np.sum(num_of_classes))









    



[125131   8903]
percent 1:  0.0664234447976



In [16]:

    
classes_to_remove = []
for i in np.where(num_of_classes == 1)[0]:
    classes_to_remove.append(y_values[i])

print len(classes_to_remove)
print classes_to_remove[:5]
print classes_to_remove[-5:]









    



0
[]
[]



In [17]:

    
print "number of labels: ", np.unique(y, return_counts=True)[0].shape[0]









    



number of labels:  2



In [18]:

    
#for i in xrange(X.shape[1]):
#    plt.scatter(X[:,i], y)
#    plt.show()



In [19]:

    
#for i in xrange(X.shape[1]):
#    plt.hist(X[:,i])
#    plt.show()



In [20]:

    
skb, learners = avg_clf_train.avg_clf_train(X, y, 4, 100.0, 0.1, stock_df)

'''
def avg_clf_train_func(X, y, k, stock_df): 

    clf_or_regr = "clf"

    t0 = time()


    ############################################################################ 

    pipeline = make_pipeline(DecisionTreeClassifier())

        # cross validation    
    cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=42)

    # tune parameters
    params = dict()

    params['decisiontreeclassifier__criterion'] = ['gini', 'entropy']
    params['decisiontreeclassifier__max_features'] = ['auto', 'sqrt', 'log2', None]
    params['decisiontreeclassifier__class_weight'] = ['auto', None]
    params['decisiontreeclassifier__random_state'] = [42]

    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)

    grid_search.fit(X, y)

    print grid_search.best_estimator_


    return skb, learners

skb, learners = avg_clf_train_func(X, y, 4, stock_df)
'''









    



k = 4
Open 393.82
High 371.29
Low 420.72
Close 393.49
Volume 3.86
50dravg 352.77
200dravg 329.33
OC% 9.75
HL% 2204.59

DecisionTreeClassifier(class_weight='auto', criterion='gini', max_depth=None,
            max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=42, splitter='best')
0.879247957623

confusion matrix:
     FALSE   TRUE
FALSE [23411  1541] 
TRUE  [1696  159]

             precision    recall  f1-score   support

          0       0.93      0.94      0.94     24952
          1       0.09      0.09      0.09      1855

avg / total       0.87      0.88      0.88     26807


minutes for learner to run: 0.011







    Out[20]:





'\ndef avg_clf_train_func(X, y, k, stock_df): \n\n    clf_or_regr = "clf"\n\n    t0 = time()\n\n\n    ############################################################################ \n\n    pipeline = make_pipeline(DecisionTreeClassifier())\n\n        # cross validation    \n    cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=42)\n\n    # tune parameters\n    params = dict()\n\n    params[\'decisiontreeclassifier__criterion\'] = [\'gini\', \'entropy\']\n    params[\'decisiontreeclassifier__max_features\'] = [\'auto\', \'sqrt\', \'log2\', None]\n    params[\'decisiontreeclassifier__class_weight\'] = [\'auto\', None]\n    params[\'decisiontreeclassifier__random_state\'] = [42]\n\n    grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)\n\n    grid_search.fit(X, y)\n\n    print grid_search.best_estimator_\n\n\n    return skb, learners\n\nskb, learners = avg_clf_train_func(X, y, 4, stock_df)\n'



In [21]:

    
#X_df = pd.DataFrame(X[:,:4])
#X_df = pd.DataFrame(X)
#X_df['labels'] = y
#sns.pairplot(X_df, hue='labels')
#plt.show()



In [22]:

    
#plt.hist(y, color='b', alpha=0.7)
#plt.hist(y_pred, color='y', alpha=0.7)
#plt.show()



In [23]:

    
#plt.scatter(np.arange(y.shape[0]), y, color='b', alpha=0.7)
#plt.scatter(np.arange(y_pred.shape[0]), y_pred, color='y', alpha=0.7)
#plt.show()



In [24]:

    
#y_pred - y



In [25]:

    
#np.sum(y)



In [26]:

    
#error_count = 0
#for i in xrange(len(y)):
#    if y_pred[i] != y[i]:
#        error_count += 1
#        
#print error_count, " / ", len(y)



In [27]:

    
pred_df = prediction_df[prediction_df['label'].apply(np.isnan) == True]



In [28]:

    
pred_tickers = pred_df['ticker'].unique()



In [29]:

    
pred_X = pred_df.drop(['ticker','label'], axis=1).values
print pred_X.shape
print pred_X[0]









    



(65, 9)
[  9.30000000e-01   9.30000000e-01   8.60000000e-01   9.10000000e-01
   1.47000000e+05   1.06760000e+00   8.93050000e-01  -2.15053763e-02
   8.13953488e-02]



In [30]:

    
pred_X = skb.transform(pred_X)
print pred_X.shape



In [31]:

    
y_predictions = []
for learner in learners:
    y_pred = learner.predict(pred_X)
    print y_pred.shape
    y_predictions.append(y_pred)









    



(65,)



In [32]:

    
print len(y_predictions)
y_predictions = y_predictions[0]



In [33]:

    
#y_pred_avg = np.mean(y_predictions, axis=1)
#print y_pred_avg.shape



In [34]:

    
positive_tickers = []
for i in xrange(len(pred_tickers)):
    print i, pred_tickers[i], y_predictions[i]
    if y_predictions[i] == 1:
        positive_tickers.append(pred_tickers[i])









    



0 ABIO 0
1 ACOR 0
2 AERI 0
3 AFFX 0
4 AGEN 1
5 ARIA 0
6 ARNA 1
7 ARWR 0
8 ATNM 0
9 AVXL 0
10 AXDX 0
11 AXN 0
12 BABY 0
13 BCLI 1
14 BCRX 0
15 BGMD 0
16 BIIB 0
17 BLUE 0
18 BRKR 0
19 CBMG 0
20 CBPO 0
21 CGEN 0
22 CLDN 0
23 CLDX 0
24 CNMD 0
25 COHR 0
26 CPHD 0
27 CPRX 0
28 CRIS 0
29 CYBX 0
30 CYNO 0
31 CYTR 0
32 DSCO 1
33 DYAX 0
34 ECYT 0
35 ELOS 0
36 ENZN 0
37 ETRM 0
38 EXAS 0
39 EXEL 0
40 FATE 0
41 FEIC 0
42 FLDM 0
43 GILD 0
44 GNCA 0
45 HALO 0
46 IART 0
47 IDRA 0
48 IDXX 0
49 ILMN 0
50 IMMU 1
51 IMRS 0
52 INCY 0
53 INO 0
54 LPCN 0
55 MEIP 0
56 MNKD 0
57 OREX 0
58 PGNX 0
59 RMTI 0
60 SGYP 0
61 SYN 0
62 THLD 0
63 TNXP 0
64 TPIV 0



In [35]:

    
for ticker in positive_tickers:
    
    past_days = 100
    
    oc = prediction_df[prediction_df['ticker'] == ticker]["OC%"][-past_days:]
    
    num_days = oc.shape[0]
    
    day_range = np.arange(num_days)
    
    plt.plot(day_range, oc, alpha=0.5)
    plt.plot(day_range, [0.05 for x in day_range], color='r')
    plt.title("{0} (previous {1} days)".format(ticker, num_days))
    plt.show()

    print "\t", ticker, "{}-day freq probability:".format(past_days), np.true_divide(np.sum(oc.values > 0.05), past_days)
    print "~"*50, "\n"









    












    



	AGEN 100-day freq probability: 0.11
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 







    












    



	ARNA 100-day freq probability: 0.02
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 







    












    



	BCLI 100-day freq probability: 0.05
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 







    












    



	DSCO 100-day freq probability: 0.08
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 







    












    



	IMMU 100-day freq probability: 0.06
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

	Open	High	Low	Close	Volume	50dravg	200dravg	OC%	HL%	label
4096	0.52	0.53	0.50	0.52	1093100	0.7780	0.38757	0.000000	0.060	0
4097	0.52	0.52	0.50	0.52	226000	0.7808	0.38852	0.000000	0.040	0
4098	0.51	0.52	0.50	0.51	583300	0.7838	0.38927	0.000000	0.040	1
4099	0.51	0.56	0.50	0.56	475900	0.7884	0.39017	0.098039	0.120	0
4100	0.57	0.63	0.56	0.57	1537100	0.7928	0.39107	0.000000	0.125	0

	Open	High	Low	Close	Volume	50dravg	200dravg	OC%	HL%	label
count	134034.000000	134034.000000	134034.000000	134034.000000	1.340340e+05	134034.000000	134034.000000	134034.000000	134034.000000	134034.000000
mean	22.272779	22.743264	21.790341	22.274280	1.635363e+06	21.982453	21.130649	0.000384	0.060502	0.066423
std	32.478383	33.028408	31.902986	32.470319	5.146146e+06	31.559408	28.843041	0.048146	0.067330	0.249022
min	0.020000	0.020000	0.010000	0.010000	1.001000e+05	0.013040	0.060366	-0.800000	0.000000	0.000000
25%	4.660000	4.810000	4.500000	4.650000	2.079000e+05	4.660000	4.751600	-0.019231	0.029259	0.000000
50%	11.249920	11.540000	10.960000	11.249920	4.372000e+05	11.165500	11.036425	0.000000	0.045288	0.000000
75%	28.400000	29.000000	27.750000	28.430000	1.072000e+06	28.033925	27.298400	0.017167	0.071429	0.000000
max	475.920013	480.179993	460.500000	475.980011	4.553760e+08	419.842800	376.095399	3.365482	5.238095	1.000000