The outline is based on this sentence:
For example, in a project that applies various binary classification methods to a social science dataset, you may want to focus on data munging, method selection, method evaluation, feature extraction, and presentation of analysis.
Sample outline:
Whether one trades stocks, indices, currencies, or commodities, one would like answers to questions such as: Which way is the market trending? Will tomorrow's close be higher or lower than the open?
There could be many more such questions. The first challenge is to know the trend and direction of the market. If there were a crystal ball that could predict the trend and direction of the markets in advance, it would help traders take the correct positions and make profitable trades. Predictive analytics on historical price data, using Data Mining, Machine Learning, and Artificial Intelligence, can predict in advance whether the market will close higher or lower the next day compared to its opening level.
We chose to investigate whether there is a connection between the sentiment of the news on a given day and the change in market value of Apple, Inc. on the same day.
To get the news data related to Apple, Inc., we web-scraped articles matching the term "Apple" from The Motley Fool for the last three years.
Note: the lines of code that perform the scraping are commented out. The scraped data is available in the GitHub repo as "mfool.csv".
In [78]:
#data munging and feature extraction packages
import requests
import requests_ftp
import requests_cache
import lxml
import itertools
import pandas as pd
import re
import numpy as np
import seaborn as sns
import string
from bs4 import BeautifulSoup
from collections import Counter
from matplotlib import pyplot as plt
from wordcloud import WordCloud
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = [10, 8]
#machine learning from scikit-learn
from sklearn.metrics import classification_report,confusion_matrix, precision_recall_curve, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
#deep learning from TensorFlow
#feed forward neural network
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.estimators.dnn import DNNClassifier
from tensorflow.contrib.layers import real_valued_column
#recurrent neural nets
from tensorflow.contrib.layers.python.layers.initializers import xavier_initializer
from tensorflow.contrib import rnn
1. web scraping
In [2]:
def motley_page_links(page):
    """
    Given a page number, it returns all article links.
    Input: a page number (default = 1)
    Output: a list with links on the given page
    """
    response = requests.get(
        'https://www.fool.com/search/solr.aspx?page={}&q=apple&sort=date&source=isesitbut0000001'.format(page))
    response.raise_for_status()
    html = response.text
    parsed_html = BeautifulSoup(html, 'lxml')
    div_with_links = parsed_html.find_all(name = 'dl',
                                          attrs = {'class' : 'results'})
    links = []
    for link in div_with_links[0].find_all('a', href = True):
        links.append(link['href'])
    return links

def motley_all_links(no_pages = 1):
    """
    Given a number of pages, it returns all the links
    from "no_pages"
    Input: number of pages (default = 1)
    Output: a list with links from the pages
    """
    all_links = []
    for page in range(1, (no_pages + 1)):
        all_links.extend(motley_page_links(page))
    return all_links

def motley_article_info(url):
    """
    Given an article url, it returns title, date, content
    and url of that article.
    Input: article url
    Output: a dictionary with 'title', 'date',
            'article', and 'url' as keys.
    """
    response = requests.get(url)
    response.raise_for_status()
    html = response.text
    parsed_html = BeautifulSoup(html, 'lxml')
    content = parsed_html.find_all(name = 'div',
                                   attrs = {'class' : 'full_article'})
    date = parsed_html.find_all(name = 'div', attrs = {'class' : 'publication-date'})[0].text.strip()
    title = parsed_html.find_all('h1')[0].text
    article = ' '.join([t.text for t in content[0].find_all('p')])
    return {'title' : title,
            'date' : date,
            'article' : article,
            'url' : url}

def motley_df(no_pages):
    """
    Creates a DataFrame for the scraped articles with
    article, date, title, and url as column names.
    Input: number of pages
    Output: DataFrame with 4 columns: article,
            date, title, and url.
    """
    #get all links in the specified number of pages
    links = motley_all_links(no_pages)
    #create a dataframe for each link and
    #combine them into one dataframe
    article_df = pd.DataFrame(index = [999999], columns=['article', 'date', 'title', 'url'])
    for i, link in enumerate(links):
        try:
            append_to = pd.DataFrame(motley_article_info(link), index = [i])
            article_df = article_df.append(append_to)
        except:
            #skip articles that fail to download or parse
            pass
    article_df = article_df.drop(999999)
    return article_df

#df = motley_df(1000)
#convert_to_csv(df, "mfool.csv")
2. sentiment scoring
In [4]:
motley = pd.read_csv('mfool.csv')
negative = pd.read_csv('negative-words.txt', sep = ' ', header = None)
positive = pd.read_csv('positive-words.txt', sep=' ', header=None)
def score_word(word):
    """
    returns -1 if negative meaning, +1 if positive meaning,
    else 0
    input: a word
    output: -1, 0, or +1
    """
    if word.lower() in negative.values:
        return -1
    elif word.lower() in positive.values:
        return +1
    return 0

def get_scores(article):
    """
    returns the sentiment score for a given article
    input: an article
    output: sentiment score
    """
    wordsArticle = article.split(' ')
    scores = [score_word(word) for word in wordsArticle]
    return sum(scores)
motley['sentiment'] = motley['article'].apply(get_scores)
plt.hist(motley.sentiment, bins=50)
plt.xlabel('sentiment scores')
plt.ylabel('frequency')
plt.title('Distribution of sentiment scores of articles');
# motley.to_csv('motley_with_s_scores.csv', encoding='utf-8')
In [30]:
most_positive_article = motley['article'][motley['sentiment'] == np.max(motley['sentiment'])].values[0]
wc = WordCloud().generate(most_positive_article)
plt.imshow(wc)
plt.axis('off');
In [31]:
most_negative_article = motley['article'][motley['sentiment'] == np.min(motley['sentiment'])].values[0]
wc = WordCloud().generate(most_negative_article)
plt.imshow(wc)
plt.axis('off');
3. merging data sets
Apple (WIKI/AAPL) stock data was obtained from the Quandl API at "https://www.quandl.com/api/v3/datasets/WIKI/AAPL.csv".
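For reference, below is a minimal sketch (not the original download code) of how that CSV could be fetched and saved to the path used in the next cell. It assumes the public CSV endpoint quoted above; the helper name fetch_aapl_prices is ours, and an api_key query parameter may be required depending on the Quandl account.
#minimal sketch: download the WIKI/AAPL daily prices from the Quandl CSV endpoint.
#An api_key may be required; column names are lower-cased to match format_df below.
import pandas as pd

def fetch_aapl_prices(api_key=None):
    url = 'https://www.quandl.com/api/v3/datasets/WIKI/AAPL.csv'
    if api_key is not None:
        url = url + '?api_key=' + api_key
    prices = pd.read_csv(url, parse_dates=['Date'])
    prices.columns = [c.lower() for c in prices.columns]
    return prices

#aapl_prices = fetch_aapl_prices()
#aapl_prices.to_csv('../datasets/WIKI_PRICES_AAPL.csv', index=False)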
In [19]:
path = "../datasets/"
aapl = pd.read_csv(path+'WIKI_PRICES_AAPL.csv')
fool = pd.read_csv(path+'motley_with_s_scores.csv')
In [20]:
def format_df(stock_df, news_df, word):
    """
    merges stock_df and news_df on the "date"
    column
    input: stock df, news df, word
    output: merged df
    """
    stock_df['diff'] = stock_df['close'] - stock_df['open']
    news_df['Count'] = news_df['article'].apply(lambda x: x.count(word))
    news_df.loc[news_df['Count'] <= 5, 'sentiment'] = 0
    news_df['date'] = pd.to_datetime(news_df['date'])
    news_df['date'] = news_df['date'].dt.strftime('%Y-%m-%d')
    news_df = news_df.groupby(['date'], as_index = False).sum()
    news_df = news_df[['date', 'sentiment', 'Count']]
    merged_df = pd.merge(news_df, stock_df)
    merged_df['bin_sentiment'] = pd.cut(merged_df['sentiment'], [-np.inf, -0.001, 0.001, np.inf], labels = [-1, 0, 1])
    merged_df['bin_diff'] = pd.cut(merged_df['diff'], [-np.inf, -0.001, 0.001, np.inf], labels = [-1, 0, 1])
    return merged_df
In [ ]:
merged_df = format_df(aapl, fool, 'Apple')
merged_df.head()
#merged_df.to_csv('merged_df.csv', encoding='utf-8')
In [35]:
def plot_ROC(y_test, scores, label, color):
    """
    plots ROC curve
    input: y_test, scores, label, and color
    output: ROC curve
    """
    false_pr, true_pr, _ = roc_curve(y_test, scores[:, 1])
    roc_auc = auc(false_pr, true_pr)
    plt.plot(false_pr, true_pr, lw = 3,
             label = '{}: area = {:.4f}'.format(label, roc_auc), color = color)
    plt.plot([0, 1], [0, 1], color='black', lw=1, linestyle='--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend(loc="best")
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('ROC')

def plot_PR(y_test, scores, label, color):
    """
    plots PR curve
    input: y_test, scores, label, and color
    output: Precision-Recall curve
    """
    precision, recall, _ = precision_recall_curve(y_test, scores[:, 1])
    plt.plot(recall, precision, lw = 2,
             label = '{}'.format(label), color = color)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc="best")
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('PR')

def plot_confusionmatrix(ytrue, ypred):
    """
    plots a confusion matrix heatmap and prints out the
    classification report
    input: ytrue (actual value), ypred (predicted value)
    output: confusion matrix heatmap and classification report
    """
    print(classification_report(ytrue, ypred))
    print('##################################################################')
    cnf_matrix = confusion_matrix(ytrue, ypred)
    sns.heatmap(cnf_matrix, cmap='coolwarm_r', annot = True, linewidths=.5, fmt = '.4g')
    plt.title('Confusion matrix')
    plt.xlabel('Prediction')
    plt.ylabel('Actual');
In [25]:
apple = pd.read_csv(path + 'merged_df.csv')
apple.head()
print (apple.shape)
In [32]:
apple.plot('date', 'diff');
There is extreme fluctuation between the opening and closing prices of Apple, Inc. (as expected).
Let's choose the features and the label (bin_diff) and get the dataframe ready for machine learning and deep learning.
In [62]:
aapl = apple.copy()[['date', 'sentiment', 'bin_diff']]
aapl.head()
In [63]:
plt.hist(aapl['bin_diff']);
Let's drop the observations labeled "0" and make this a binary classification problem.
In [64]:
aapl = aapl[aapl['bin_diff'] != 0]
Also, to make the models work properly, from here on we re-code the loss category from -1 to 0.
In [65]:
label = aapl['bin_diff'] == 1
label = label.astype(int)
Let's look at the features and standardize them.
In [66]:
InputDF = aapl.copy().drop('bin_diff', axis = 1)
InputDF = InputDF.set_index('date')
In [67]:
InputDF.head()
In [68]:
InputDF = InputDF.apply(lambda x:(x -x.mean())/x.std())
In [69]:
InputDF.head()
In [70]:
#the first 600 rows are used for training and the remainder for testing
test_size = 600
xtrain, xtest = InputDF.iloc[:test_size, :], InputDF.iloc[test_size:, :]
ytrain, ytest = label[:test_size], label[test_size:]
In [75]:
logreg = LogisticRegression()
logreg_model = logreg.fit(xtrain, ytrain)
logpred = logreg_model.predict(xtest)
logscores = logreg_model.predict_proba(xtest)
plot_confusionmatrix(ytest, logpred)
In [79]:
plot_ROC(ytest, logscores, 'Logistic regression', 'r')
In [80]:
plot_PR(ytest, logscores, 'Logistic regression', 'b')
In [83]:
svm = SVC(probability=True)
svm_model = svm.fit(xtrain, ytrain)
svmpred = svm_model.predict(xtest)
svmscores = svm_model.predict_proba(xtest)
plot_confusionmatrix(ytest, svmpred)
In [85]:
plot_ROC(ytest, svmscores, 'SVM', 'r')
In [86]:
plot_PR(ytest, svmscores, 'SVM', 'b')
In [87]:
rf = RandomForestClassifier()
rf_model = rf.fit(xtrain, ytrain)
rfpred = rf.predict(xtest)
rfscores = rf.predict_proba(xtest)
plot_confusionmatrix(ytest, rfpred)
In [88]:
plot_ROC(ytest, rfscores, 'Random Forest', 'r')
In [89]:
plot_PR(ytest, rfscores, 'Random Forest', 'b')
In [90]:
num_features = len(InputDF.columns)
dropout=0.2
hidden_1_size = 25
hidden_2_size = 5
num_classes = label.nunique()
NUM_EPOCHS=20
BATCH_SIZE=1
lr=0.0001
np.random.RandomState(52);
In [91]:
val = (InputDF[:-test_size].values, label[:-test_size].values)
train = (InputDF[-test_size:].values, label[-test_size:].values)
NUM_TRAIN_BATCHES = int(len(train[0])/BATCH_SIZE)
NUM_VAL_BATCHES = int(len(val[1])/BATCH_SIZE)
In [92]:
class Model():
    def __init__(self):
        global_step = tf.contrib.framework.get_or_create_global_step()
        self.input_data = tf.placeholder(dtype=tf.float32, shape=[None, num_features])
        self.target_data = tf.placeholder(dtype=tf.int32, shape=[None])
        self.dropout_prob = tf.placeholder(dtype=tf.float32, shape=[])
        with tf.variable_scope("ff"):
            droped_input = tf.nn.dropout(self.input_data, keep_prob=self.dropout_prob)
            layer_1 = tf.contrib.layers.fully_connected(
                num_outputs=hidden_1_size,
                inputs=droped_input,
            )
            layer_2 = tf.contrib.layers.fully_connected(
                num_outputs=hidden_2_size,
                inputs=layer_1,
            )
            self.logits = tf.contrib.layers.fully_connected(
                num_outputs=num_classes,
                activation_fn=None,
                inputs=layer_2,
            )
        with tf.variable_scope("loss"):
            self.losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                         labels=self.target_data)
            mask = (1 - tf.sign(1 - self.target_data))  #Don't give credit for flat days
            mask = tf.cast(mask, tf.float32)
            self.loss = tf.reduce_sum(self.losses)
        with tf.name_scope("train"):
            opt = tf.train.AdamOptimizer(lr)
            gvs = opt.compute_gradients(self.loss)
            self.train_op = opt.apply_gradients(gvs, global_step=global_step)
        with tf.name_scope("predictions"):
            self.probs = tf.nn.softmax(self.logits)
            self.predictions = tf.argmax(self.probs, 1)
            correct_pred = tf.cast(tf.equal(self.predictions, tf.cast(self.target_data, tf.int64)), tf.float64)
            self.accuracy = tf.reduce_mean(correct_pred)
In [110]:
with tf.Graph().as_default():
    model = Model()
    input_ = train[0]
    target = train[1]
    losses = []
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run([init])
        epoch_loss = 0
        for e in range(NUM_EPOCHS):
            if epoch_loss > 0 and epoch_loss < 1:
                break
            epoch_loss = 0
            for batch in range(0, NUM_TRAIN_BATCHES):
                start = batch * BATCH_SIZE
                end = start + BATCH_SIZE
                feed = {
                    model.input_data: input_[start:end],
                    model.target_data: target[start:end],
                    model.dropout_prob: 0.9
                }
                _, loss, acc = sess.run(
                    [
                        model.train_op,
                        model.loss,
                        model.accuracy,
                    ],
                    feed_dict=feed
                )
                epoch_loss += loss
            losses.append(epoch_loss)
            #print('step - {0} loss - {1} acc - {2}'.format((1+batch+NUM_TRAIN_BATCHES*e), epoch_loss, acc))
        print('################ done training ################')
        final_preds = np.array([])
        final_scores = None
        for batch in range(0, NUM_VAL_BATCHES):
            start = batch * BATCH_SIZE
            end = start + BATCH_SIZE
            feed = {
                model.input_data: val[0][start:end],
                model.target_data: val[1][start:end],
                model.dropout_prob: 1
            }
            acc, preds, probs = sess.run(
                [
                    model.accuracy,
                    model.predictions,
                    model.probs
                ],
                feed_dict=feed
            )
            #print(acc)
            final_preds = np.concatenate((final_preds, preds), axis=0)
            if final_scores is None:
                final_scores = probs
            else:
                final_scores = np.concatenate((final_scores, probs), axis=0)
        print('################ done testing ################')
        prediction_conf = final_scores[np.argmax(final_scores, 1)]
In [94]:
plt.scatter(np.linspace(0, 1, len(losses)), losses);
plt.title('Training loss by epoch')
plt.ylabel('Training loss')
plt.xlabel('epoch progression');
In [95]:
plot_confusionmatrix(ytest, final_preds)
In [97]:
plot_ROC(ytest, final_scores, 'Feed forward neural net', 'r')
In [98]:
plot_PR(ytest, final_scores, 'Feed forward neural net', 'b')
In [99]:
RNN_HIDDEN_SIZE=4
FIRST_LAYER_SIZE=50
SECOND_LAYER_SIZE=10
NUM_LAYERS=2
BATCH_SIZE=1
NUM_EPOCHS=25
lr=0.0003
NUM_TRAIN_BATCHES = int(len(train[0])/BATCH_SIZE)
NUM_VAL_BATCHES = int(len(val[1])/BATCH_SIZE)
ATTN_LENGTH=30
beta=0
np.random.RandomState(52);
In [103]:
class RNNModel():
    def __init__(self):
        global_step = tf.contrib.framework.get_or_create_global_step()
        self.input_data = tf.placeholder(dtype=tf.float32, shape=[BATCH_SIZE, num_features])
        self.target_data = tf.placeholder(dtype=tf.int32, shape=[BATCH_SIZE])
        self.dropout_prob = tf.placeholder(dtype=tf.float32, shape=[])

        def makeGRUCells():
            base_cell = rnn.GRUCell(num_units=RNN_HIDDEN_SIZE,)
            layered_cell = rnn.MultiRNNCell([base_cell] * NUM_LAYERS, state_is_tuple=False)
            attn_cell = tf.contrib.rnn.AttentionCellWrapper(cell=layered_cell, attn_length=ATTN_LENGTH, state_is_tuple=False)
            return attn_cell

        self.gru_cell = makeGRUCells()
        self.zero_state = self.gru_cell.zero_state(1, tf.float32)
        self.start_state = tf.placeholder(dtype=tf.float32, shape=[1, self.gru_cell.state_size])

        with tf.variable_scope("ff", initializer=xavier_initializer(uniform=False)):
            droped_input = tf.nn.dropout(self.input_data, keep_prob=self.dropout_prob)
            layer_1 = tf.contrib.layers.fully_connected(
                num_outputs=FIRST_LAYER_SIZE,
                inputs=droped_input,
            )
            layer_2 = tf.contrib.layers.fully_connected(
                num_outputs=RNN_HIDDEN_SIZE,
                inputs=layer_1,
            )
        split_inputs = tf.reshape(droped_input, shape=[1, BATCH_SIZE, num_features], name="reshape_l1")  # Each item in the batch is a time step, iterate through them
        split_inputs = tf.unstack(split_inputs, axis=1, name="unpack_l1")
        states = []
        outputs = []
        with tf.variable_scope("rnn", initializer=xavier_initializer(uniform=False)) as scope:
            state = self.start_state
            for i, inp in enumerate(split_inputs):
                if i > 0:
                    scope.reuse_variables()
                output, state = self.gru_cell(inp, state)
                states.append(state)
                outputs.append(output)
            self.end_state = states[-1]
        outputs = tf.stack(outputs, axis=1)  # Pack them back into a single tensor
        outputs = tf.reshape(outputs, shape=[BATCH_SIZE, RNN_HIDDEN_SIZE])
        self.logits = tf.contrib.layers.fully_connected(
            num_outputs=num_classes,
            inputs=outputs,
            activation_fn=None
        )
        with tf.variable_scope("loss"):
            self.penalties = tf.reduce_sum([beta * tf.nn.l2_loss(var) for var in tf.trainable_variables()])
            self.losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                         labels=self.target_data)
            self.loss = tf.reduce_sum(self.losses + beta * self.penalties)
        with tf.name_scope("train_step"):
            opt = tf.train.AdamOptimizer(lr)
            gvs = opt.compute_gradients(self.loss)
            self.train_op = opt.apply_gradients(gvs, global_step=global_step)
        with tf.name_scope("predictions"):
            self.probs = tf.nn.softmax(self.logits)
            self.predictions = tf.argmax(self.probs, 1)
            correct_pred = tf.cast(tf.equal(self.predictions, tf.cast(self.target_data, tf.int64)), tf.float64)
            self.accuracy = tf.reduce_mean(correct_pred)
In [105]:
with tf.Graph().as_default():
    model = RNNModel()
    input_ = train[0]
    target = train[1]
    losses = []
    with tf.Session() as sess:
        init = tf.global_variables_initializer()
        sess.run([init])
        loss = 2000
        for e in range(NUM_EPOCHS):
            state = sess.run(model.zero_state)
            epoch_loss = 0
            for batch in range(0, NUM_TRAIN_BATCHES):
                start = batch * BATCH_SIZE
                end = start + BATCH_SIZE
                feed = {
                    model.input_data: input_[start:end],
                    model.target_data: target[start:end],
                    model.dropout_prob: 0.5,
                    model.start_state: state
                }
                _, loss, acc, state = sess.run(
                    [
                        model.train_op,
                        model.loss,
                        model.accuracy,
                        model.end_state
                    ],
                    feed_dict=feed
                )
                epoch_loss += loss
            losses.append(epoch_loss)
            #print('step - {0} loss - {1} acc - {2}'.format((e), epoch_loss, acc))
        print('################ done training ################')
        final_preds = np.array([])
        final_scores = None
        for batch in range(0, NUM_VAL_BATCHES):
            start = batch * BATCH_SIZE
            end = start + BATCH_SIZE
            feed = {
                model.input_data: val[0][start:end],
                model.target_data: val[1][start:end],
                model.dropout_prob: 1,
                model.start_state: state
            }
            acc, preds, state, probs = sess.run(
                [
                    model.accuracy,
                    model.predictions,
                    model.end_state,
                    model.probs
                ],
                feed_dict=feed
            )
            #print(acc)
            assert len(preds) == BATCH_SIZE
            final_preds = np.concatenate((final_preds, preds), axis=0)
            if final_scores is None:
                final_scores = probs
            else:
                final_scores = np.concatenate((final_scores, probs), axis=0)
        print('################ done testing ################')
In [106]:
plt.scatter(np.linspace(0, 1, len(losses)), losses);
plt.title('Training loss by epoch')
plt.ylabel('Training loss')
plt.xlabel('epoch progression');
In [107]:
plot_confusionmatrix(ytest, final_preds)
In [108]:
plot_ROC(ytest, final_scores, 'Recurrent neural net', 'r')
In [109]:
plot_PR(ytest, final_scores, 'Recurrent neural net', 'b')