In [1]:
# Imports

from datetime import datetime, timedelta

from Database import db

import numpy as np
import pickle
import os

import matplotlib.pyplot as plt

from keras.optimizers import RMSprop
from keras.models import Sequential, load_model, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, concatenate, SpatialDropout1D, GRU
from keras.layers import Dense, Flatten, Embedding, LSTM, Activation, BatchNormalization, Dropout, Conv1D, MaxPooling1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, TensorBoard
import keras.backend as K


Using TensorFlow backend.

In [2]:
# Options

stocks      = ['AAPL', 'AMD', 'AMZN', 'GOOG', 'MSFT']
all_sources = ['reddit', 'reuters', 'twitter', 'seekingalpha', 'fool']

max_length  = 50
vocab_size  = None # Set by tokenizer
emb_size    = 300

model_type  = 'regression'

epochs      = 180
batch_size  = 32

In [10]:
def make_headline_to_effect_data():
    """
    Headline -> Effect
    
    Creates essentially the X, Y data for the embedding model to use
    when analyzing/encoding headlines. Returns a list of headlines and
    a list of corresponding 'effects' which represent a change in the stock price.
    """
    meta, headlines, effects = [], [], []
    
    with db() as (conn, cur):
        
        for stock in stocks:
            
            print("Fetching Stock..." + stock)
            
            ## Go through all the headlines ##
            
            cur.execute("SELECT date, source, content FROM headlines WHERE stock=? AND LENGTH(content) >= 16", [stock])
            headline_query = cur.fetchall()
            
            for (date, source, content) in headline_query:
                
                event_date = datetime.strptime(date, '%Y-%m-%d') # The date of headline
                
                add_time = lambda e, days: (e + timedelta(days=days)).strftime('%Y-%m-%d')
                
                ## Find corresponding tick data ## 
                
                cur.execute("""SELECT AVG(adjclose) FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date""", 
                            [stock, 
                             add_time(event_date, -3), 
                             add_time(event_date, 0)])
                
                before_headline_ticks = cur.fetchall()
                
                cur.execute("""SELECT AVG(adjclose) FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date""", 
                            [stock, 
                             add_time(event_date, 1), 
                             add_time(event_date, 6)])
                
                after_headline_ticks = cur.fetchall()
                
                ## Create training example ##
                
                if len(before_headline_ticks) > 0 and len(after_headline_ticks) > 0 and before_headline_ticks[0][0] != None and after_headline_ticks[0][0] != None:
                    
                    previous_tick = before_headline_ticks[-1][0]
                    result_tick = after_headline_ticks[0][0]
                    
                    if model_type == 'regression':
                        
                        # Percent Diff (+Normalization Constant)
                        effect = [(result_tick - previous_tick) / previous_tick / 0.0044]
                    
                    else:
                
                        if result_tick > previous_tick:

                            effect = [1., 0.]

                        else:

                            effect = [0., 1.]
                        
                    meta.append((source, event_date.weekday()))
                    headlines.append(content)
                    effects.append(effect)
                    
    return meta, headlines, np.array(effects)

In [4]:
def encode_sentences(meta, sentences, tokenizer=None, max_length=100, vocab_size=100):
    """
    Encoder
    
    Takes a list of headlines and converts them into vectors
    """
    ## Encoding Sentences
    
    if not tokenizer:
        
        tokenizer = Tokenizer(num_words=vocab_size, filters='', lower=False) # Already Preprocessed
    
        tokenizer.fit_on_texts(sentences)
    
    encoded_headlines = tokenizer.texts_to_sequences(sentences)
    
    padded_headlines = pad_sequences(encoded_headlines, maxlen=max_length, padding='post')
    
    ## Encoding Meta Data
    
    # OneHot(Source [reddit/twitter/reuters etc..]) + OneHot(WeekDay)
    
    meta_matrix = np.zeros((len(sentences), len(all_sources) + 7))
    index = 0
    
    for (source, weekday) in meta:
        
        meta_matrix[index, all_sources.index(source)] = 1
        meta_matrix[index, len(all_sources) + weekday] = 1
        
        index += 1
    
    return meta_matrix, padded_headlines, tokenizer

In [5]:
def split_data(X, X2, Y, ratio):
    """
    Splits X/Y to Train/Test
    """
    indexes = np.arange(X.shape[0])
    np.random.shuffle(indexes)
    
    X  = X[indexes]
    X2 = X2[indexes]
    Y  = Y[indexes]
    
    train_size = int(len(X) * ratio)
    
    trainX,  testX  = X[:train_size],  X[train_size:]
    trainX2, testX2 = X2[:train_size], X2[train_size:]
    trainY,  testY  = Y[:train_size],  Y[train_size:]
    
    return trainX, trainX2, trainY, testX, testX2, testY

In [6]:
def get_embedding_matrix(tokenizer, pretrained_file='glove.840B.300d.txt', purge=False):
    """Load Vectors from Glove File"""
    print("Loading WordVecs...")
    
    ## Load Glove File (Super Slow) ##
    
    glove_db = dict()
    
    with open(os.path.join('..', 'data', pretrained_file), 'r', encoding="utf-8") as glove:

        for line in glove:

            values = line.split(' ')
            word = values[0].replace('-', '').lower()
            coefs = np.asarray(values[1:], dtype='float32')
            glove_db[word] = coefs

    print('Loaded WordVectors...' + str(len(glove_db)))
    
    ## Set Embeddings ##
    
    embedding_matrix = np.zeros((vocab_size + 1, emb_size))
    
    for word, i in tokenizer.word_index.items():
        
        embedding_vector = glove_db.get(word)
        
        if embedding_vector is not None:
            
            embedding_matrix[i] = embedding_vector
            
        elif purge:
            
            with db() as (conn, cur):
                
                cur.execute("SELECT 1 FROM dictionary WHERE word=?", [word])
                
                if len(cur.fetchall()) == 0:
                    
                    print("Purge..." + word)

                    cur.execute("DELETE FROM headlines WHERE content LIKE ?", ["%" + word + "%"])
                    conn.commit()
            
    return embedding_matrix, glove_db

def correct_sign_acc(y_true, y_pred):
    """
    Accuracy of Being Positive or Negative
    """
    diff = K.equal(y_true > 0, y_pred > 0)
    
    return K.mean(diff, axis=-1)

def get_model(emb_matrix):
    
    ## Headline ##
    
    headline_input = Input(shape=(max_length,))
    
    emb = Embedding(vocab_size + 1, emb_size, input_length=max_length, weights=[emb_matrix], trainable=True)(headline_input)
    emb = SpatialDropout1D(.2)(emb)
    
    conv = Conv1D(filters=64, kernel_size=5, padding='same', activation='selu')(emb)
    conv = MaxPooling1D(pool_size=3)(conv)
    
    text_rnn = LSTM(200, dropout=0.3, recurrent_dropout=0.3, return_sequences=False)(conv)
    text_rnn = Activation('selu')(text_rnn)
    text_rnn = BatchNormalization()(text_rnn)
    
    # text_rnn = LSTM(300, dropout=0.3, recurrent_dropout=0.3)(text_rnn)
    # text_rnn = Activation('relu')(text_rnn)
    # text_rnn = BatchNormalization()(text_rnn)
    
    ## Source ##
    
    meta_input = Input(shape=(len(all_sources) + 7,))
    
    ## Combined ##
    
    merged = concatenate([text_rnn, meta_input])
    
    final_dense = Dense(100)(merged)
    final_dense = Activation('selu')(final_dense)
    final_dense = BatchNormalization()(final_dense)
    final_dense = Dropout(0.5)(final_dense)
    
    final_dense = Dense(100)(merged)
    final_dense = Activation('selu')(final_dense)
    final_dense = BatchNormalization()(final_dense)
    final_dense = Dropout(0.5)(final_dense)
    
    if model_type == 'regression':
        
        pred_dense = Dense(1)(final_dense)
        out = pred_dense
        
        model = Model(inputs=[headline_input, meta_input], outputs=out)
    
        model.compile(optimizer=RMSprop(lr=0.001), loss='mse', metrics=[correct_sign_acc])
    
    else:
    
        pred_dense = Dense(2)(final_dense)
        out = Activation('softmax')(pred_dense)
        
        model = Model(inputs=[headline_input, meta_input], outputs=out)
    
        model.compile(optimizer=RMSprop(lr=0.001), loss='categorical_crossentropy', metrics=['acc'])
    
    return model

In [11]:
if __name__ == "__main__":
    
    meta, headlines, effects = make_headline_to_effect_data()
    
    encoded_meta, encoded_headlines, toke = encode_sentences(meta, 
                                                             headlines, 
                                                             max_length=max_length, 
                                                             vocab_size=vocab_size)
    
    vocab_size = len(toke.word_counts)
    print("Found Words......" + str(vocab_size))
    
    emb_matrix, glove_db = get_embedding_matrix(toke)
    
    trainX, trainX2, trainY, testX, testX2, testY = split_data(encoded_headlines, encoded_meta, effects, .8)
    
    print(trainX.shape, trainX2.shape, testY.shape)


Fetching Stock...AAPL
Fetching Stock...AMD
Fetching Stock...AMZN
Fetching Stock...GOOG
Fetching Stock...MSFT
Found Words......12208
Loading WordVecs...
Loaded WordVectors...1634534
(15788, 50) (15788, 12) (3947, 1)

In [12]:
# TRAIN MODEL

if __name__ == "__main__": 
    
    ## Save Tokenizer ##
    
    with open(os.path.join('..', 'models', 'toke.pkl'), 'wb') as toke_file:
        pickle.dump(toke, toke_file, protocol=pickle.HIGHEST_PROTOCOL)
        
    ## Create Model ##
    
    model = get_model(emb_matrix)
    
    if model_type == 'regression':
        monitor_mode = 'correct_sign_acc'
    else:
        monitor_mode = 'val_acc'
    
    tensorboard = TensorBoard(log_dir="logs/{}".format(datetime.now().strftime("%Y,%m,%d-%H,%M,%S," + model_type)))
    e_stopping = EarlyStopping(monitor=monitor_mode, patience=60)
    checkpoint = ModelCheckpoint(os.path.join('..', 'models', 'media-headlines-' + model_type + '.h5'), 
                                 monitor=monitor_mode,
                                 verbose=0,
                                 save_best_only=True)
    
    ## Train ##
    
    history = model.fit([trainX, trainX2],
                        trainY,
                        epochs=epochs, 
                        batch_size=batch_size,
                        validation_data=([testX, testX2], testY),
                        verbose=0,
                        callbacks=[e_stopping, checkpoint, tensorboard])
    
    ## Display Train History ##
    
    plt.plot(np.log(history.history['loss']))
    plt.plot(np.log(history.history['val_loss']))
    plt.legend(['LogTrainLoss', 'LogTestLoss'])
    plt.show()
    
    plt.plot(history.history[monitor_mode])
    plt.plot(history.history['val_' + monitor_mode])
    plt.legend(['TrainAcc', 'TestAcc'])
    plt.show()



In [13]:
# TEST MODEL

if __name__ == "__main__":
    
    ## Load Model For Manual Testing ##
    
    import keras.metrics
    keras.metrics.correct_sign_acc = correct_sign_acc
    
    with open(os.path.join('..', 'models', 'toke.pkl'), 'rb') as toke_file:
        toke = pickle.load(toke_file)
    
    model = load_model(os.path.join('..', 'models', 'media-headlines-' + model_type + '.h5'))
    
    ## **This Test May Overlap w/Train Data** ##
    
    pretick_date = '2018-02-12'
    current_date = '2018-02-14'
    predict_date = '2018-02-15'
    stock = 'AAPL'
    
    with db() as (conn, cur):
        
        ## Select Actual Stock Values ##
        
        cur.execute("""SELECT adjclose FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date""", 
                    [stock, current_date, predict_date])
        ticks = cur.fetchall()
        
        ## Find Headlines ##
    
        cur.execute("SELECT date, source, content FROM headlines WHERE date BETWEEN ? AND ? AND stock=?", [pretick_date, current_date, stock])
        headlines = cur.fetchall()
        
        ## Process ##
        
        meta, test_sents = [], []
        
        for (date, source, content) in headlines:
            
            meta.append([source, datetime.strptime(date, '%Y-%m-%d').weekday()])
            test_sents.append(content)
            
        encoded_meta, test_encoded, _ = encode_sentences(meta, 
                                                         test_sents, 
                                                         tokenizer=toke, 
                                                         max_length=max_length,
                                                         vocab_size=vocab_size)
        
        predictions = model.predict([test_encoded, encoded_meta])
        
        ## Display ##
        
        parse = lambda num: str(round(num, 2))
        
        print("Using: " + str(test_sents))
        
        if model_type == 'regression':
            
            print("Predicting Change Coef: " +  parse(np.mean(predictions[:, 0])))
            print("Predicting Price: " +  parse(np.mean(predictions[:, 0]) * 0.0044 * ticks[0][0] + ticks[0][0]))
            
        else:
        
            print("Predicting Change Coef: " +  parse(np.mean(predictions[:, 0]) - .5))
        
        print("Actual Stock: " + parse(ticks[0][0]) + " to " + parse(ticks[-1][0]))
        print("Actual Stock Change: " + parse(ticks[-1][0] - ticks[0][0]))


Using: ['potus **COMPANY** is bringing about b back in theyre going to build a tremendous campus theyre going to bui', '**COMPANY** confirms **PRODUCT** source code leak but heres why users need not be scared via', 'your **COMPANY** watch or fitbit may know even more about your health than you think', '**COMPANY** is now one of the most successful watch manufacturers in the world', '**COMPANY** suppliers qualcomm amp broadcom to meet directly to talk b acquisition deal', '**COMPANY** gets it just right from the internet of things to the internet of life via forbes', '**COMPANY** maps transit directions come to tucson arizona', 'how **COMPANY** plans to root out bugs and revamp **PRODUCT** software', '**COMPANY** highlights portrait selfies bouncing live photos in latest **PRODUCT** tutorials', 'google embracing notch design in android update preparing for new wave of **PRODUCT** x clones', '**COMPANY** to expand **COMPANY** music student discount to new regions this month', '**COMPANY** ceo downplays special dividend at shareholder meeting', 'broadcom scales back qualcomm board challenge to simple majority', '**COMPANY** shareholders defeat two proposals at annual meeting', 'norway wealth fund ceo to firms allow independent directors on your boards', 'the former head of windows and office steven sinofsky explains **COMPANY** at scale better than most **COMPANY** pundits', 'facebooks protect feature on **PRODUCT** essentially installs spyware on **PRODUCT** and **PRODUCT**', 'i think it would be cool if **COMPANY** did something where if you tell siri youre having anxiety it responds would yo', 'not to be reckless but i still dont have a case on my **PRODUCT** x cc **COMPANY** to cover me when it breaks', 'all eyes on ceo tim cook at **COMPANY**s annual shareholders meeting where investors look for answers behind slow iphon', 'the roof of **COMPANY**s visitor center is dripping water not sure why no rain here and a woman just slipped and fe', 'voters shoot down human rights committee proposal at **COMPANY** shareholders meeting', '**COMPANY** features that were actually lies', 'weve released our mac beta try it out here **COMPANY** cybersecurity infosec', 'give the **PRODUCT** x a headphone jack with this crowdfunded case', '**COMPANY**s spaceship is going to be harder to get a tour of than area', 'amazon stock climbed percent giving the company a market value of billion and topping microsofts mar', 'new **COMPANY** webpage touts apps amp technology for augmented reality on **PRODUCT**', 'heads of us law amp spy agencies say phones by **COMPANY** rival huawei pose inherent national security risk', 'qualcomm unveils gigabit lte modem amid rumors **COMPANY**s **PRODUCT** going intelonly', 'have a mac test our our mac beta because macs arent immune to malware', 'rearfacing d sensing technology possibly slated for fall **PRODUCT** allowing **COMPANY** to press ar advantage', 'if you launch something thats really different people are either going to say thats amazing or laugh in your f', '**COMPANY** expected to ship m lcd **PRODUCT** units priced at in late', 'went to **COMPANY** park and photographed it with an **PRODUCT** x of course', '**COMPANY** in talks for first order from chinese chipmaker nikkei', 'japan display books fourth straight quarterly loss no partner yet', 'japan display books th straight quarterly loss no partner yet', 'japan display reports th straight quarterly loss', 'warren buffetts berkshirehathaway increases **COMPANY** stake by percent now its largest holding', 'bill gates cautions **COMPANY** and other tech companies about arrogance inviting government interference', 'appaloosa more than triples share stake in **COMPANY**', '**COMPANY** brings **PRODUCT** student subscriptions to malaysia for only rm a month', 'went to **COMPANY** park and photographed it with an **PRODUCT** of course', 'google embracing notch design in android update preparing for new wave of **PRODUCT** clones', 'in software development cultural shift **COMPANY** wont hold features for major annual pointzero releases', 'verizon will begin locking **PRODUCT**s to its network starting this spring', 'its time for **COMPANY** to bring this **PRODUCT** pro feature to **PRODUCT**', 'dont overlook these metrics from **COMPANY**s record first quarter', 'benefit from **COMPANY**s rumored move to ditch qualcomm for intel in **PRODUCT**s', '**COMPANY** ceo tim cook gets defensive about **PRODUCT** x demand', 'here are alltime records **COMPANY** broke in s first quarter', '**COMPANY** incs **PRODUCT** average selling price challenge', 'companies set to lose from **COMPANY**s **PRODUCT**s', 'heres why i dont buy this **COMPANY** **PRODUCT** rumor', '**COMPANY** incs awkward d touch situation']
Predicting Change Coef: 7.98
Predicting Price: 173.24
Actual Stock: 167.37 to 172.99
Actual Stock Change: 5.62

In [14]:
# TEST MODEL

if __name__ == "__main__":
     
    ## Load Model For Manual Testing ##
    
    import keras.metrics
    keras.metrics.correct_sign_acc = correct_sign_acc
     
    with open(os.path.join('..', 'models', 'toke.pkl'), 'rb') as toke_file:
        toke = pickle.load(toke_file)
    
    model = load_model(os.path.join('..', 'models', 'media-headlines-' + model_type + '.h5'))
      
    ## Fake Unique Test Data ##
    
    headlines = [
        "**COMPANY** gains a ton of stock after creating **PRODUCT**",
        "**COMPANY** loses a ton of stock after killing **PRODUCT**"
    ]
    
    test_sents, meta = [], []
    
    for headline in headlines:
    
        for source in all_sources:

            for weekday in range(7):
            
                test_sents.append(headline)
                meta.append([source, weekday])
    
    ## Process ##
    
    encoded_meta, test_encoded, _ = encode_sentences(meta, 
                                                     test_sents, 
                                                     tokenizer=toke, 
                                                     max_length=max_length, 
                                                     vocab_size=vocab_size)
    
    predictions = model.predict([test_encoded, encoded_meta])
    
    predictions = predictions.reshape((len(headlines), len(all_sources), 7))
    
    ## Display Predictions ##
    
    from matplotlib.colors import Normalize
    
    for i, headline in enumerate(headlines):
        
        plt.imshow(predictions[i], interpolation='none', cmap='PRGn', norm=Normalize(vmin=-2, vmax=2))
        plt.title(headline)
        plt.xlabel('Weekday')
        plt.ylabel('Source')
        plt.xticks(np.arange(7), list('MTWTFSS'))
        plt.yticks(np.arange(len(all_sources)), all_sources)
        plt.show()



In [ ]: