In [36]:
from imports import *
import avg_clf_train
import import_data
%matplotlib inline
In [37]:
check = True
if check == True:
# check if last day's data is available
print Quandl.get("YAHOO/HALO", authtoken='DVhizWXNTePyzzy1eHWR').tail(1)
In [38]:
download = True
start_tickers = ticker_list.tickers
tickers = []
print len(start_tickers), "total tickers to start\n"
if download == True:
# download data
for ticker in start_tickers:
try:
stock_df = Quandl.get("YAHOO/{}".format(ticker), authtoken='DVhizWXNTePyzzy1eHWR')
stock_df.to_csv("quandl_data/{}.csv".format(ticker), index=False)
tickers.append(ticker)
except:
print "removed:", ticker
elif download == False:
tickers = [filename[:-4] for filename in os.listdir('quandl_data')]
print "\n", len(tickers), "available tickers:"
print tickers
In [4]:
# Build the modeling frame (stock_df) and the out-of-sample frame
# (prediction_df) from the cached per-ticker CSVs for the surviving tickers.
stock_df, prediction_df = import_data.import_data(tickers)
print stock_df.shape
stock_df.tail()
Out[4]:
In [5]:
#stock_df[stock_df['Open'] > 5.0]
In [6]:
#prediction_df[prediction_df['Open'] > 5.0]
In [7]:
stock_df.describe()
Out[7]:
In [8]:
# Distribution of daily trading volume across all rows.
stock_df['Volume'].hist(bins=50)
plt.show()
In [9]:
for i in xrange(len(stock_df.columns)):
print i, stock_df.columns[i], stock_df.corr()['label'].values[i]
In [10]:
# Scatter 200-day rolling average vs. volume, colored by class
# (default blue = label 0, red = label 1).  negative_df / positive_df are
# reused by the 3-D plot cell below, so those names are kept.
neg_mask = stock_df['label'] == 0
pos_mask = stock_df['label'] == 1
negative_df = stock_df[neg_mask]
positive_df = stock_df[pos_mask]
plt.scatter(negative_df['200dravg'], negative_df['Volume'])
plt.scatter(positive_df['200dravg'], positive_df['Volume'], color='r')
plt.show()
In [11]:
# 3-D scatter of three features: negatives in yellow, positives in red.
# Renamed x/y/z -> xcol/ycol/zcol: the old names clobbered `y`, which the
# next cell redefines as the label vector (a hidden-state hazard on re-run).
xcol, ycol, zcol = 'HL%', 'Low', 'Open'
fig = plt.figure()
# fig.gca(projection='3d') is deprecated and removed in modern matplotlib;
# add_subplot(projection=...) works on both old and new versions.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(negative_df[xcol], negative_df[ycol], negative_df[zcol], alpha=0.5, color='y')
ax.scatter(positive_df[xcol], positive_df[ycol], positive_df[zcol], color='r')
ax.set_xlabel(xcol)
ax.set_ylabel(ycol)
ax.set_zlabel(zcol)
ax.view_init(azim=250)
plt.show()
In [12]:
y = stock_df['label'].values
stock_df = stock_df.drop('label', axis=1)
X = stock_df.values
print X.shape, y.shape
In [13]:
# Class-balance histogram of the target labels.
fig, ax = plt.subplots()
ax.hist(y, bins=50, alpha=0.7, color='r')
plt.show()
In [14]:
y_values = np.unique(y, return_counts=True)[0]
print y_values.shape, "\n"
print y_values
In [15]:
# Per-class sample counts, aligned with y_values from the previous cell.
# NOTE(review): the name is misleading -- this is the count of samples in
# each class, not the number of classes; kept as-is because later cells
# reference it by this name.
num_of_classes = np.unique(y, return_counts=True)[1]
print num_of_classes
# np.true_divide forces float division (plain / truncates on ints under
# Python 2).  Assumes class "1" sits at index 1 -- TODO confirm labels are 0/1.
print "percent 1: ", np.true_divide(num_of_classes[1],np.sum(num_of_classes))
In [16]:
classes_to_remove = []
for i in np.where(num_of_classes == 1)[0]:
classes_to_remove.append(y_values[i])
print len(classes_to_remove)
print classes_to_remove[:5]
print classes_to_remove[-5:]
In [17]:
print "number of labels: ", np.unique(y, return_counts=True)[0].shape[0]
In [18]:
#for i in xrange(X.shape[1]):
# plt.scatter(X[:,i], y)
# plt.show()
In [19]:
#for i in xrange(X.shape[1]):
# plt.hist(X[:,i])
# plt.show()
In [20]:
skb, learners = avg_clf_train.avg_clf_train(X, y, 4, 100.0, 0.1, stock_df)
'''
def avg_clf_train_func(X, y, k, stock_df):
clf_or_regr = "clf"
t0 = time()
############################################################################
pipeline = make_pipeline(DecisionTreeClassifier())
# cross validation
cv = StratifiedShuffleSplit(y, test_size=0.2, random_state=42)
# tune parameters
params = dict()
params['decisiontreeclassifier__criterion'] = ['gini', 'entropy']
params['decisiontreeclassifier__max_features'] = ['auto', 'sqrt', 'log2', None]
params['decisiontreeclassifier__class_weight'] = ['auto', None]
params['decisiontreeclassifier__random_state'] = [42]
grid_search = GridSearchCV(pipeline, param_grid=params, n_jobs=1, cv=cv)
grid_search.fit(X, y)
print grid_search.best_estimator_
return skb, learners
skb, learners = avg_clf_train_func(X, y, 4, stock_df)
'''
Out[20]:
In [21]:
#X_df = pd.DataFrame(X[:,:4])
#X_df = pd.DataFrame(X)
#X_df['labels'] = y
#sns.pairplot(X_df, hue='labels')
#plt.show()
In [22]:
#plt.hist(y, color='b', alpha=0.7)
#plt.hist(y_pred, color='y', alpha=0.7)
#plt.show()
In [23]:
#plt.scatter(np.arange(y.shape[0]), y, color='b', alpha=0.7)
#plt.scatter(np.arange(y_pred.shape[0]), y_pred, color='y', alpha=0.7)
#plt.show()
In [24]:
#y_pred - y
In [25]:
#np.sum(y)
In [26]:
#error_count = 0
#for i in xrange(len(y)):
# if y_pred[i] != y[i]:
# error_count += 1
#
#print error_count, " / ", len(y)
In [27]:
pred_df = prediction_df[prediction_df['label'].apply(np.isnan) == True]
In [28]:
pred_tickers = pred_df['ticker'].unique()
In [29]:
# Feature matrix for prediction: everything except the identifier column and
# the (all-NaN) label column.
pred_X = pred_df.drop(['ticker','label'], axis=1).values
print pred_X.shape
print pred_X[0]
In [30]:
# Apply the feature selector fitted during training so prediction features
# line up with the columns the classifiers were trained on.
pred_X = skb.transform(pred_X)
print pred_X.shape
In [31]:
y_predictions = []
for learner in learners:
y_pred = learner.predict(pred_X)
print y_pred.shape
y_predictions.append(y_pred)
In [32]:
print len(y_predictions)
# NOTE(review): only the FIRST learner's predictions are used from here on;
# the others are discarded.  Presumably intentional while the averaging step
# below is commented out -- confirm.
y_predictions = y_predictions[0]
In [33]:
#y_pred_avg = np.mean(y_predictions, axis=1)
#print y_pred_avg.shape
In [34]:
positive_tickers = []
for i in xrange(len(pred_tickers)):
print i, pred_tickers[i], y_predictions[i]
if y_predictions[i] == 1:
positive_tickers.append(pred_tickers[i])
In [35]:
for ticker in positive_tickers:
past_days = 100
oc = prediction_df[prediction_df['ticker'] == ticker]["OC%"][-past_days:]
num_days = oc.shape[0]
day_range = np.arange(num_days)
plt.plot(day_range, oc, alpha=0.5)
plt.plot(day_range, [0.05 for x in day_range], color='r')
plt.title("{0} (previous {1} days)".format(ticker, num_days))
plt.show()
print "\t", ticker, "{}-day freq probability:".format(past_days), np.true_divide(np.sum(oc.values > 0.05), past_days)
print "~"*50, "\n"