In [3]:
import requests
import csv
import re
import numpy as np
import pandas as pd
from scipy import stats
import datetime
import Quandl
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import RandomizedPCA 

from sklearn.preprocessing import PolynomialFeatures
 
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import SGDRegressor 
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.grid_search import GridSearchCV

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [4]:
def get_stock_data(ticker, seconds_interval, num_of_days):
    
    url = "http://www.google.com/finance/getprices?q={0}&i={1}&p={2}d&f=d,o,h,l,c,v".format(ticker, seconds_interval, num_of_days)

    # get data and convert to data frame
    stock_df = pd.read_csv(url, skiprows=[0,1,2,3,5,6])

    # rename column name
    stock_df.rename(columns={'COLUMNS=DATE':'time'}, inplace=True)

    # remove 'a' from unix timestamps
    stock_df.replace(to_replace={'time':{'a':''}}, regex=True, inplace=True)

    # get entire column and convert to ints
    time_indices = [int(x) for x in stock_df['time'].values]

    # keep track of current timestamp
    last_timestamp = time_indices[0]

    # convert unix timestamp abbreviations into full unix timestamps
    for i in range(len(time_indices)):
        if time_indices[i] < last_timestamp:
            time_indices[i] = last_timestamp + (time_indices[i] * seconds_interval)
        else:
            last_timestamp = time_indices[i]

    # convert unix timestamps to human-readable formats
    time_indices = [datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S') for x in time_indices]
    
    # keep times (i.e., not dates)
    times = [float(x[-8:-3].replace(':','.')) for x in time_indices]

    # create new column in data frame
    stock_df['time'] = times

    # keep day of month
    #dates = [int(x[:10].split('-')[2]) for x in time_indices]
    # create new column in data frame
    #stock_df['month_date'] = dates

    # get weekday as int value
    #stock_df['week_day'] = [datetime.datetime.strptime(x[:10], '%Y-%m-%d').weekday() for x in time_indices]

    # create features
    stock_df['op_cl%'] = np.true_divide((stock_df['CLOSE'] - stock_df['OPEN']), stock_df['CLOSE'])
    stock_df['lo_hi%'] = np.true_divide((stock_df['HIGH'] - stock_df['LOW']), stock_df['HIGH'])
    stock_df['vol_norm'] = np.true_divide(stock_df['VOLUME'], np.max(stock_df['VOLUME']))

    # create labels dataframe
    labels_df = stock_df.copy(deep=True)

    # remove columns
    stock_df = stock_df.drop(['CLOSE', 'OPEN', 'LOW', 'HIGH', 'VOLUME'], axis=1)

    #print stock_df.shape
    #stock_df.head()
    
    return stock_df, labels_df

In [5]:
tickers = ['ARNA',
           'ATNM',
           'AVXL',
           'AXN',
           'BLFS',
           'BOTA',
           'CBLI',
           'CPRX', 
           'DARA',
           'ECYT',
           'EDAP',
           'EXAS',
           'HALO',
           'IDRA',
           'INO',
           'LOXO',
           'LPCN',
           'MEIP',
           'MNKD',
           'OREX',
           'PGNX',
           'QLTI',
           'RMTI',
           'SGYP',
           'SNGX',
           'SPY',
           'SYN', 
           'TENX',
           'THLD',
           'TNXP',
           'TPIV']


In [6]:
# download data

for ticker in tickers:
    
    seconds_interval = 1800 # 1800: 30-minute (seems the most consistent)
    stock_df, labels_df = get_stock_data(ticker, seconds_interval, 1000)
        
    stock_df.to_csv("goog_data/{}_features.csv".format(ticker), index=False)
    labels_df.to_csv("goog_data/{}_labels.csv".format(ticker), index=False)


In [442]:
# import data

X = []
y = []

for ticker in tickers:

    stock_df = pd.read_csv("goog_data/{}_features.csv".format(ticker))
    labels_df = pd.read_csv("goog_data/{}_labels.csv".format(ticker))

    num_of_times = stock_df['time'].unique().shape[0]
    stock_df = stock_df.drop('time', axis=1)
    assert num_of_times == 14, "wrong number of times"
    #print "number of times in a day: {}".format(num_of_times)

    num_of_days = stock_df.shape[0]/num_of_times
    #print "number of days: {}".format(num_of_days)

    for i in xrange(num_of_days):

        # features
        features = stock_df.values[:num_of_times].flatten()
        features = np.expand_dims(features, axis=0)
        #assert features.shape[1] == 84, "wrong number of columns"

        # combine features into rows of X
        if X == []:
            X = features
        else: 
            X = np.vstack((X, features))

        # labels
        labels = labels_df.values[:num_of_times].flatten()
        # (last - open) / last
        label = np.true_divide((labels[-8] - labels[1]), labels[-8])
        # make class
        label = int(label*100)
        
        # make binary class
        def binarize(label):
            if label >= 5:
                label = 1
            else:
                label = 0
            return label
        
        label = binarize(label)

        if y == []:
            y = np.array([label])
        else:
            y = np.append(y, np.array(label))  

        # remove used rows (go to next day)
        stock_df = stock_df[num_of_times:]
        labels_df = labels_df[num_of_times:]

    # rotate/discard rows
    X = X[:-1]
    y = y[1:]

print "\n", "*"*10, "\nfinal shapes: ", X.shape, y.shape


********** 
final shapes:  (1447, 42) (1447,)

In [443]:
plt.hist(y, bins=20, alpha=0.7, color='r')
plt.show()



In [444]:
y_values = np.unique(y, return_counts=True)[0]
print y_values


[0 1]

In [445]:
num_of_classes = np.unique(y, return_counts=True)[1]
print num_of_classes


[1315  132]

In [446]:
classes_to_remove = []
for i in np.where(num_of_classes == 1)[0]:
    classes_to_remove.append(y_values[i])

print classes_to_remove


[]

In [447]:
# single-class removal
rows_to_remove = []
for i in xrange(len(y)):
    if y[i] in classes_to_remove:
        rows_to_remove.append(i)
    
X = np.delete(X, rows_to_remove, axis=0)
y = np.delete(y, rows_to_remove)

print X.shape, y.shape


(1447, 42) (1447,)

In [448]:
# outlier-label removal
'''
rows_to_remove = []
for i in xrange(len(y)):
    if y[i] < -10 or y[i] > 10:
        rows_to_remove.append(i)
    
X = np.delete(X, rows_to_remove, axis=0)
y = np.delete(y, rows_to_remove)

print X.shape, y.shape
'''


Out[448]:
'\nrows_to_remove = []\nfor i in xrange(len(y)):\n    if y[i] < -10 or y[i] > 10:\n        rows_to_remove.append(i)\n    \nX = np.delete(X, rows_to_remove, axis=0)\ny = np.delete(y, rows_to_remove)\n\nprint X.shape, y.shape\n'

In [449]:
print "number of labels: ", np.unique(y, return_counts=True)[0].shape[0]


number of labels:  2

In [450]:
corr_df = pd.DataFrame(X)
corr_df['label'] = y
np.max(corr_df.corr()['label'].values[:-1])


Out[450]:
0.16541045758469208

In [451]:
#for i in xrange(X.shape[1]):
#    plt.scatter(X[:,i], y)
#    plt.show()


In [452]:
threshold = 10
if np.unique(y, return_counts=True)[0].shape[0] > 2:
    threshold = 3

skb = SelectKBest()
skb = skb.fit(X,y)
counter = 0
for i in xrange(skb.scores_.shape[0]):
    if skb.scores_[i] > threshold:
        counter += 1
        print i, skb.scores_[i]
        
print "\nfeatures scoring higher than {}: ".format(threshold), counter


10 13.9160374586
13 15.9807886228
16 37.5925500144
19 21.5172920992
22 16.4046434796
28 24.330042166
31 10.8137247828
34 14.0561887843
36 15.7085924366
37 40.6482566283
40 29.7184491097

features scoring higher than 10:  11

In [453]:
skb = SelectKBest(k=counter)
skb = skb.fit(X,y)
X = skb.transform(X)
X.shape


Out[453]:
(1447, 11)


In [ ]:
#clf = GaussianNB()    # Provided to give you a starting point. Try a varity of classifiers.
#clf = DecisionTreeClassifier()
clf = SVC()

#pipeline = make_pipeline(clf)
pipeline = make_pipeline(StandardScaler(), RandomizedPCA(), clf)

# cross validation    
cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=42)

# tune parameters
params = dict()

# for PCA
params['randomizedpca__iterated_power'] = [1, 2, 3]
params['randomizedpca__n_components'] = [2, 4, 6, 8, 10]
params['randomizedpca__random_state'] = [42]
params['randomizedpca__whiten'] = [True, False]

if str(clf)[0] == 'D':
    params['decisiontreeclassifier__criterion'] = ['gini', 'entropy']
    params['decisiontreeclassifier__max_features'] = ['auto', 'sqrt', 'log2', None]
    params['decisiontreeclassifier__class_weight'] = ['auto', None]
    params['decisiontreeclassifier__random_state'] = [42]
    
if str(clf)[0] == 'S':
    # [2**x for x in np.arange(-15, 15+1, 3)]
    params['svc__C'] = [10, 100, 1000]
    params['svc__gamma'] = [0.1, 0.2, 0.5, 1.0]
    params['svc__random_state'] = [42]

grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)

grid_search.fit(X, y)

clf = grid_search.best_estimator_
print clf
print "\nscore: ", grid_search.best_score_

clf = clf.fit(X,y)
y_pred = clf.predict(X)
print "\n", classification_report(y, y_pred)

In [455]:
### labels as binary

# GaussianNB
## default (pipeline, stratifiedshufflesplit, gridsearch): 0.831379310345
## skb (k=11): 0.853448275862
## standardscaler: 0.853448275862
## default pca: 0.848965517241
## pca gridsearched: 0.880344827586 
###-> iterated_power=1, n_components=2, whiten=True

## defaults for following classifiers: 
###-> skb (k=11), pipeline [standardscaler, pca], stratifiedshufflesplit, gridsearch

# DecisionTreeClassifier
## default: 0.845517241379
## params gridsearched: 0.849655172414 
###-> class_weight='auto', criterion='gini', max_features=None

# SVC
## default: 0.910689655172
## params gridsearched: too long

In [456]:
### labels as ints, quasi-normally distributed

# GaussianNB
## default (pipeline, stratifiedshufflesplit, gridsearch): 0.0996515679443
## skb (k=10): 0.249128919861
## standardscaler: 0.249128919861
## default pca: 0.256445993031
## pca gridsearched: 0.27456445993 #-> iterated_power=1, n_components=2, whiten=True

## defaults for following classifiers: 
###-> skb (k=10), pipeline [standardscaler, pca], stratifiedshufflesplit, gridsearch

# DecisionTreeClassifier
## default: 0.150174216028
## params gridsearched: 0.15331010453 
###-> class_weight=None, criterion='gini', max_features='auto'

# SVC
## default: 0.286411149826
## params gridsearched: too long

In [457]:
#for i in xrange(X_train.shape[1]):
#    plt.scatter(X_train[:,i], y_train)
#    plt.show()

In [458]:
'''
num_rows = X_train.shape[0]
num_cols = X_train.shape[1]
rows_to_remove = set()

for i in xrange(num_cols):
    low = np.percentile(X_train[:,i], 5)
    high = np.percentile(X_train[:,i], 95)
    
    for j in xrange(num_rows):
        if X_train[j,i] == low or X_train[j,i] == high:
            rows_to_remove.add(j)

X_train = np.delete(X_train, list(rows_to_remove), axis=0)
y_train = np.delete(y_train, list(rows_to_remove))

print X_train.shape, y_train.shape
'''


Out[458]:
'\nnum_rows = X_train.shape[0]\nnum_cols = X_train.shape[1]\nrows_to_remove = set()\n\nfor i in xrange(num_cols):\n    low = np.percentile(X_train[:,i], 5)\n    high = np.percentile(X_train[:,i], 95)\n    \n    for j in xrange(num_rows):\n        if X_train[j,i] == low or X_train[j,i] == high:\n            rows_to_remove.add(j)\n\nX_train = np.delete(X_train, list(rows_to_remove), axis=0)\ny_train = np.delete(y_train, list(rows_to_remove))\n\nprint X_train.shape, y_train.shape\n'


In [459]:
'''
regr_or_clf = "clf"

if regr_or_clf == 'regr':
    
    #poly = PolynomialFeatures(degree=3)
    #poly.fit(X_train,y_train)
    #X_train = poly.transform(X_train)
    #X_test = poly.transform(X_test)

    high_score = -9999999

    best_y_pred = []

    for regr in [LinearRegression(), Ridge(), Lasso(), BayesianRidge(), SGDRegressor(), SVR(), DecisionTreeRegressor(), GradientBoostingRegressor()]:
        regr = regr.fit(X_train, y_train)
        score = regr.score(X_test, y_test)
        print score
        if score > high_score:
            high_score = score
            best_y_pred = regr.predict(X_test)
            
elif regr_or_clf == 'clf':
    
    best_f1_score = -9999999
    best_clf = []
    best_y_pred = []
    mean_y_pred = []

    for clf in [GaussianNB(), DecisionTreeClassifier(random_state=42), RandomForestClassifier(), SVC(C=100, kernel='rbf', gamma=10, random_state=42)]:
    #for clf in [SVC(C=1000, kernel='rbf', gamma=1, random_state=42)]:

        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        clf_report = classification_report(y_test, y_pred)
        clf_report = [re.sub(r"[a-z]|\n", '', x) for x in clf_report.split(" ")]
        clf_report = filter(None, clf_report)
        f1_score = float(clf_report[-2])
        print f1_score

        if f1_score > best_f1_score:
            best_f1_score = f1_score
            best_clf = clf
            best_y_pred = y_pred

        #print "\nconfusion matrix:\n", confusion_matrix(y_test, y_pred)

        #print "\nclassification report:\n", classification_report(y_test, y_pred)
    print str(best_clf)[:3]

    print classification_report(y_test, best_y_pred)
'''


Out[459]:
'\nregr_or_clf = "clf"\n\nif regr_or_clf == \'regr\':\n    \n    #poly = PolynomialFeatures(degree=3)\n    #poly.fit(X_train,y_train)\n    #X_train = poly.transform(X_train)\n    #X_test = poly.transform(X_test)\n\n    high_score = -9999999\n\n    best_y_pred = []\n\n    for regr in [LinearRegression(), Ridge(), Lasso(), BayesianRidge(), SGDRegressor(), SVR(), DecisionTreeRegressor(), GradientBoostingRegressor()]:\n        regr = regr.fit(X_train, y_train)\n        score = regr.score(X_test, y_test)\n        print score\n        if score > high_score:\n            high_score = score\n            best_y_pred = regr.predict(X_test)\n            \nelif regr_or_clf == \'clf\':\n    \n    best_f1_score = -9999999\n    best_clf = []\n    best_y_pred = []\n    mean_y_pred = []\n\n    for clf in [GaussianNB(), DecisionTreeClassifier(random_state=42), RandomForestClassifier(), SVC(C=100, kernel=\'rbf\', gamma=10, random_state=42)]:\n    #for clf in [SVC(C=1000, kernel=\'rbf\', gamma=1, random_state=42)]:\n\n        clf.fit(X_train, y_train)\n\n        y_pred = clf.predict(X_test)\n\n        clf_report = classification_report(y_test, y_pred)\n        clf_report = [re.sub(r"[a-z]|\n", \'\', x) for x in clf_report.split(" ")]\n        clf_report = filter(None, clf_report)\n        f1_score = float(clf_report[-2])\n        print f1_score\n\n        if f1_score > best_f1_score:\n            best_f1_score = f1_score\n            best_clf = clf\n            best_y_pred = y_pred\n\n        #print "\nconfusion matrix:\n", confusion_matrix(y_test, y_pred)\n\n        #print "\nclassification report:\n", classification_report(y_test, y_pred)\n    print str(best_clf)[:3]\n\n    print classification_report(y_test, best_y_pred)\n'

In [460]:
#X_df = pd.DataFrame(X)
#X_df['labels'] = y
#sns.pairplot(X, hue='labels')
#plt.show()

In [461]:
plt.hist(y, color='b', alpha=0.7)
plt.hist(y_pred, color='y', alpha=0.7)
plt.show()



In [462]:
plt.scatter(np.arange(y.shape[0]), y, color='b', alpha=0.7)
plt.scatter(np.arange(y_pred.shape[0]), y_pred, color='y', alpha=0.7)
plt.show()



In [463]:
y_pred - y


Out[463]:
array([0, 0, 0, ..., 0, 0, 0])

In [464]:
np.sum(y)


Out[464]:
132

In [465]:
error_count = 0
for i in xrange(len(y)):
    if y_pred[i] != y[i]:
        error_count += 1
        
print error_count, " / ", len(y)


0  /  1447


In [466]:
# import data

new_X = []

for ticker in tickers:

    stock_df = pd.read_csv("goog_data/{}_features.csv".format(ticker))
    labels_df = pd.read_csv("goog_data/{}_labels.csv".format(ticker))

    num_of_times = stock_df['time'].unique().shape[0]

    stock_df = stock_df.drop('time', axis=1)
    assert num_of_times == 14, "wrong number of times"
    #print "number of times in a day: {}".format(num_of_times)

    # features
    features = stock_df.values[(-1*num_of_times):].flatten()
    features = np.expand_dims(features, axis=0)

    # combine features into rows of X
    if new_X == []:
        new_X = features
    else: 
        new_X = np.vstack((new_X, features)) 

print "\n", "*"*10, "\nfinal shapes: ", new_X.shape


********** 
final shapes:  (31, 42)

In [467]:
new_pred = clf.predict(new_X)
for i in xrange(len(tickers)):
    print i, tickers[i], new_pred[i]
    
'''

0 ARNA 0
1 ATNM 0
2 AVXL 0
3 AXN 0
4 BLFS 0
5 BOTA 1
6 CBLI 0
7 CPRX 0
8 DARA 0
9 ECYT 0
10 EDAP 0
11 EXAS 0
12 HALO 0
13 IDRA 0
14 INO 0
15 LOXO 0
16 LPCN 0
17 MEIP 0
18 MNKD 0
19 OREX 0
20 PGNX 1
21 QLTI 0
22 RMTI 0
23 SGYP 0
24 SNGX 0
25 SPY 0
26 SYN 1
27 TENX 0
28 THLD 0
29 TNXP 0
30 TPIV 1
'''


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-467-613565eb8755> in <module>()
----> 1 new_pred = clf.predict(new_X)
      2 for i in xrange(len(tickers)):
      3     print i, tickers[i], new_pred[i]
      4 
      5 '''

/usr/local/lib/python2.7/site-packages/sklearn/utils/metaestimators.pyc in <lambda>(*args, **kwargs)
     35             self.get_attribute(obj)
     36         # lambda, but not partial, allows help() to work with update_wrapper
---> 37         out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
     38         # update the docstring of the returned function
     39         update_wrapper(out, self.fn)

/usr/local/lib/python2.7/site-packages/sklearn/pipeline.pyc in predict(self, X)
    177         Xt = X
    178         for name, transform in self.steps[:-1]:
--> 179             Xt = transform.transform(Xt)
    180         return self.steps[-1][-1].predict(Xt)
    181 

/usr/local/lib/python2.7/site-packages/sklearn/preprocessing/data.pyc in transform(self, X, y, copy)
    391         else:
    392             if self.with_mean:
--> 393                 X -= self.mean_
    394             if self.with_std:
    395                 X /= self.std_

ValueError: operands could not be broadcast together with shapes (31,42) (11,) (31,42)