In [556]:
import requests
import csv
import re
import numpy as np
import pandas as pd
from scipy import stats
import datetime
import Quandl
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score

from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import RandomizedPCA 

from sklearn.preprocessing import PolynomialFeatures
 
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import SGDRegressor 
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.grid_search import GridSearchCV

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [530]:
def get_stock_data(ticker, seconds_interval, num_of_days):
    
    url = "http://www.google.com/finance/getprices?q={0}&i={1}&p={2}d&f=d,o,h,l,c,v".format(ticker, seconds_interval, num_of_days)

    # get data and convert to data frame
    stock_df = pd.read_csv(url, skiprows=[0,1,2,3,5,6])

    # rename column name
    stock_df.rename(columns={'COLUMNS=DATE':'time'}, inplace=True)

    # remove 'a' from unix timestamps
    stock_df.replace(to_replace={'time':{'a':''}}, regex=True, inplace=True)

    # get entire column and convert to ints
    time_indices = [int(x) for x in stock_df['time'].values]

    # keep track of current timestamp
    last_timestamp = time_indices[0]

    # convert unix timestamp abbreviations into full unix timestamps
    for i in range(len(time_indices)):
        if time_indices[i] < last_timestamp:
            time_indices[i] = last_timestamp + (time_indices[i] * seconds_interval)
        else:
            last_timestamp = time_indices[i]

    # convert unix timestamps to human-readable formats
    time_indices = [datetime.datetime.fromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S') for x in time_indices]
    
    # keep times (i.e., not dates)
    times = [float(x[-8:-3].replace(':','.')) for x in time_indices]

    # create new column in data frame
    stock_df['time'] = times

    # keep day of month
    #dates = [int(x[:10].split('-')[2]) for x in time_indices]
    # create new column in data frame
    #stock_df['month_date'] = dates

    # get weekday as int value
    #stock_df['week_day'] = [datetime.datetime.strptime(x[:10], '%Y-%m-%d').weekday() for x in time_indices]

    # create features
    stock_df['op_cl%'] = np.true_divide((stock_df['CLOSE'] - stock_df['OPEN']), stock_df['CLOSE'])
    stock_df['lo_hi%'] = np.true_divide((stock_df['HIGH'] - stock_df['LOW']), stock_df['HIGH'])
    stock_df['vol_norm'] = np.true_divide(stock_df['VOLUME'], np.max(stock_df['VOLUME']))

    # create labels dataframe
    labels_df = stock_df.copy(deep=True)

    # remove columns
    stock_df = stock_df.drop(['CLOSE', 'OPEN', 'LOW', 'HIGH', 'VOLUME'], axis=1)

    #print stock_df.shape
    #stock_df.head()
    
    return stock_df, labels_df

In [531]:
tickers = ['ARNA',
           'ATNM',
           'AVXL',
           'AXN',
           'BLFS',
           'BOTA',
           'CBLI',
           'CPRX', 
           'DARA',
           'ECYT',
           'EDAP',
           'EXAS',
           'HALO',
           'IDRA',
           'INO',
           'LOXO',
           'LPCN',
           'MEIP',
           'MNKD',
           'OREX',
           'PGNX',
           'QLTI',
           'RMTI',
           'SGYP',
           'SNGX',
           'SPY',
           'SYN', 
           'TENX',
           'THLD',
           'TNXP',
           'TPIV']


In [532]:
# download data

for ticker in tickers:
    
    seconds_interval = 1800 # 1800: 30-minute (seems the most consistent)
    stock_df, labels_df = get_stock_data(ticker, seconds_interval, 1000)
        
    stock_df.to_csv("goog_data/{}_features.csv".format(ticker), index=False)
    labels_df.to_csv("goog_data/{}_labels.csv".format(ticker), index=False)


In [652]:
# import data

X = []
y = []

for ticker in tickers:

    stock_df = pd.read_csv("goog_data/{}_features.csv".format(ticker))
    labels_df = pd.read_csv("goog_data/{}_labels.csv".format(ticker))

    num_of_times = stock_df['time'].unique().shape[0]
    stock_df = stock_df.drop('time', axis=1)
    assert num_of_times == 14, "wrong number of times"
    #print "number of times in a day: {}".format(num_of_times)

    num_of_days = stock_df.shape[0]/num_of_times
    #print "number of days: {}".format(num_of_days)

    for i in xrange(num_of_days):

        # features
        features = stock_df.values[:num_of_times].flatten()
        features = np.expand_dims(features, axis=0)
        #assert features.shape[1] == 84, "wrong number of columns"

        # combine features into rows of X
        if X == []:
            X = features
        else: 
            X = np.vstack((X, features))

        # labels
        labels = labels_df.values[:num_of_times].flatten()
        # (last - open) / last
        label = np.true_divide((labels[-8] - labels[1]), labels[-8])
        # make class
        label = int(label*100)
        
        # make binary class
        def binarize(label):
            if label >= 5:
                label = 1
            else:
                label = 0
            return label
        
        #label = binarize(label)

        if y == []:
            y = np.array([label])
        else:
            y = np.append(y, np.array(label))  

        # remove used rows (go to next day)
        stock_df = stock_df[num_of_times:]
        labels_df = labels_df[num_of_times:]

    # rotate/discard rows
    X = X[:-1]
    y = y[1:]

print "\n", "*"*10, "\nfinal shapes: ", X.shape, y.shape


********** 
final shapes:  (1423, 42) (1423,)

In [653]:
plt.hist(y, bins=20, alpha=0.7, color='r')
plt.show()



In [654]:
y_values = np.unique(y, return_counts=True)[0]
print y_values


[-30 -28 -26 -22 -21 -17 -16 -15 -14 -13 -12 -11 -10  -9  -8  -7  -6  -5
  -4  -3  -2  -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13
  14  15  16  18  19  20  21  22  24  27  33  44  50]

In [655]:
num_of_classes = np.unique(y, return_counts=True)[1]
print num_of_classes


[  1   1   1   1   4   1   1   3   1   4   4   4   9   9  16  19  24  33
  46  82 104 156 414 144 102  83  36  28  21  15  11   5   8   5   5   5
   1   1   1   2   2   3   1   1   1   1   1   1   1]

In [656]:
classes_to_remove = []
for i in np.where(num_of_classes == 1)[0]:
    classes_to_remove.append(y_values[i])

print classes_to_remove


[-30, -28, -26, -22, -17, -16, -14, 14, 15, 16, 21, 22, 24, 27, 33, 44, 50]

In [657]:
# single-class removal
rows_to_remove = []
for i in xrange(len(y)):
    if y[i] in classes_to_remove:
        rows_to_remove.append(i)
    
X = np.delete(X, rows_to_remove, axis=0)
y = np.delete(y, rows_to_remove)

print X.shape, y.shape


(1406, 42) (1406,)

In [658]:
# outlier-label removal
'''
rows_to_remove = []
for i in xrange(len(y)):
    if y[i] < -10 or y[i] > 10:
        rows_to_remove.append(i)
    
X = np.delete(X, rows_to_remove, axis=0)
y = np.delete(y, rows_to_remove)

print X.shape, y.shape
'''


Out[658]:
'\nrows_to_remove = []\nfor i in xrange(len(y)):\n    if y[i] < -10 or y[i] > 10:\n        rows_to_remove.append(i)\n    \nX = np.delete(X, rows_to_remove, axis=0)\ny = np.delete(y, rows_to_remove)\n\nprint X.shape, y.shape\n'

In [659]:
print "number of labels: ", np.unique(y, return_counts=True)[0].shape[0]


number of labels:  32

In [660]:
corr_df = pd.DataFrame(X)
corr_df['label'] = y
np.max(corr_df.corr()['label'].values[:-1])


Out[660]:
0.095664359452292222

In [661]:
#for i in xrange(X.shape[1]):
#    plt.scatter(X[:,i], y)
#    plt.show()


In [662]:
threshold = 20
if np.unique(y, return_counts=True)[0].shape[0] > 2:
    threshold = 3

skb = SelectKBest()
skb = skb.fit(X,y)
counter = 0
for i in xrange(skb.scores_.shape[0]):
    if skb.scores_[i] > threshold:
        counter += 1
        print i, skb.scores_[i]
        
print "\nfeatures scoring higher than {}: ".format(threshold), counter


1 3.00740560307
13 3.35467847775
16 3.32084368572
19 5.06108323316
28 3.17564352806
31 3.67559019493
37 4.07442785701
40 3.96192637958

features scoring higher than 3:  8

In [663]:
skb = SelectKBest(k=counter)
skb = skb.fit(X,y)
X = skb.transform(X)
X.shape


Out[663]:
(1406, 8)


In [664]:
clf_or_regr = "regr"
pca = False

learner = []

pipeline = []

############################################################################

if clf_or_regr == "clf":
    learner = GaussianNB()
    #learner = DecisionTreeClassifier()
    #learner = RandomForestClassifier()
    #learner = SVC()
    #learner = LogisticRegression()
        
if clf_or_regr == "regr":
    learner = LinearRegression()
    #(), Ridge(), Lasso(), BayesianRidge(), SGDRegressor(), SVR(), DecisionTreeRegressor(), GradientBoostingRegressor()
        
############################################################################

if pca == False:
    pipeline = make_pipeline(learner)
elif pca == True:
    pipeline = make_pipeline(StandardScaler(), RandomizedPCA(), learner)

# cross validation    
cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=42)

# tune parameters
params = dict()

# for PCA
if pca == True:
    params['randomizedpca__iterated_power'] = [1, 2, 3]
    params['randomizedpca__n_components'] = [2, 4, 6, 8, 10]
    params['randomizedpca__random_state'] = [42]
    params['randomizedpca__whiten'] = [True, False]
    
############################################################################

if clf_or_regr == "clf":
    if str(learner)[0] == 'X':
        params['decisiontreeclassifier__criterion'] = ['gini', 'entropy']
        params['decisiontreeclassifier__max_features'] = ['auto', 'sqrt', 'log2', None]
        params['decisiontreeclassifier__class_weight'] = ['auto', None]
        params['decisiontreeclassifier__random_state'] = [42]

    if str(learner)[0] == 'X':
        # [2**x for x in np.arange(-15, 15+1, 3)]
        params['svc__C'] = [2**x for x in np.arange(-15, 15+1, 3)]
        params['svc__gamma'] = [2**x for x in np.arange(-15, 15+1, 3)]
        params['svc__random_state'] = [42]

############################################################################        
        
grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)

grid_search.fit(X, y)

learner = grid_search.best_estimator_
print learner
print "\nscore: ", grid_search.best_score_

learner = learner.fit(X,y)
y_pred = learner.predict(X)

############################################################################

if clf_or_regr == "clf":
    print "\nconfusion matrix:\n", "     FALSE   TRUE\n", "FALSE", confusion_matrix(y, y_pred)[0], "\nTRUE ", confusion_matrix(y, y_pred)[1]
    print "\n", classification_report(y, y_pred)


Pipeline(steps=[('linearregression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))])

score:  -0.0097979825141

In [665]:
#for i in xrange(X.shape[1]):
#    plt.scatter(X[:,i], y)
#    plt.show()


In [666]:
if X.shape[1] <= 5:
    X_df = pd.DataFrame(X[:,:4])
    X_df['labels'] = y
    sns.pairplot(X_df, hue='labels')
    plt.show()

In [667]:
plt.hist(y, color='b', alpha=0.7)
plt.hist(y_pred, color='y', alpha=0.7)
plt.show()



In [668]:
plt.scatter(np.arange(y.shape[0]), y, color='b', alpha=0.7)
plt.scatter(np.arange(y_pred.shape[0]), y_pred, color='y', alpha=0.7)
plt.show()



In [669]:
y_pred - y


Out[669]:
array([  0.78161612,   2.78289156,   3.87907456, ...,   0.15910416,
        -1.1897841 , -13.08724493])

In [670]:
np.sum(y)


Out[670]:
-169

In [671]:
error_count = 0
for i in xrange(len(y)):
    if y_pred[i] != y[i]:
        error_count += 1
        
print error_count, " / ", len(y)


1406  /  1406


In [672]:
# import data

new_X = []

for ticker in tickers:

    stock_df = pd.read_csv("goog_data/{}_features.csv".format(ticker))
    labels_df = pd.read_csv("goog_data/{}_labels.csv".format(ticker))

    num_of_times = stock_df['time'].unique().shape[0]

    stock_df = stock_df.drop('time', axis=1)
    assert num_of_times == 14, "wrong number of times"
    #print "number of times in a day: {}".format(num_of_times)

    # features
    features = stock_df.values[(-1*num_of_times):].flatten()
    features = np.expand_dims(features, axis=0)

    # combine features into rows of X
    if new_X == []:
        new_X = features
    else: 
        new_X = np.vstack((new_X, features)) 

print "\n", "*"*10, "\nfinal shapes: ", new_X.shape


********** 
final shapes:  (31, 42)

In [674]:
if X.shape[1] <= 5 or X.shape[1] <= counter:
    new_X = skb.transform(new_X)

new_pred = learner.predict(new_X)
for i in xrange(len(tickers)):
    print "[{}]".format(i), tickers[i], new_pred[i]


[0] ARNA -0.849653687177
[1] ATNM 0.318309018805
[2] AVXL 1.42291398684
[3] AXN -0.298392966221
[4] BLFS -0.146941617147
[5] BOTA -0.402527687756
[6] CBLI -1.39959266878
[7] CPRX -0.62921151297
[8] DARA -0.204179245471
[9] ECYT -0.670358334397
[10] EDAP -0.220096658929
[11] EXAS -0.645469659188
[12] HALO -0.610826186544
[13] IDRA -1.00745932329
[14] INO -1.05229115258
[15] LOXO -0.530890119979
[16] LPCN -0.33132492525
[17] MEIP -1.58807448338
[18] MNKD -0.826084502926
[19] OREX -0.548940620564
[20] PGNX -0.749969665619
[21] QLTI -0.364692269568
[22] RMTI -0.639507339215
[23] SGYP -0.683468167484
[24] SNGX -0.642596843544
[25] SPY -0.33497476569
[26] SYN -0.562766803252
[27] TENX -0.593800024829
[28] THLD -0.614860726467
[29] TNXP -0.834714140737
[30] TPIV -0.281729948618