In [2]:
import os
from IPython.display import display, Image
display(Image('./se.png', width=900))
Force-directed layout of the social graph:
In [3]:
display(Image('./socialgraph.png', width=656))
Looks like a supervised learning problem:
Stories labeled relevant+popular are typically
Step 3: Dump a data file
Who shared it: a 1-hot vector over the ~1000 most frequent (frontpage) Twitter accounts, blogs, etc., e.g. https://twitter.com/valuewalk http://thereformedbroker.com/
Text of the headline (a sketch of how such a row might be assembled follows, then a real example row)
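A minimal sketch of that row format, assuming a hypothetical top_sources list and make_row helper (not the notebook's actual code):
In [ ]:
# hypothetical sketch: 1-hot encode "who shared it" and append the headline text
top_sources = ['valuewalk', 'ReformedBroker', 'howardlindzon']  # ~1000 entries in practice

def make_row(sharers, headline):
    onehot = [1 if s in sharers else 0 for s in top_sources]
    return onehot + [headline]

make_row({'ReformedBroker'}, 'what go happen 2017')
# -> [0, 1, 0, 'what go happen 2017']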
In [2]:
row = [1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,1,
'what go happen 2017']
len(row)
Out[2]:
In [6]:
display(Image('./projector.png', width=656))
Step 5: Run a feedforward NN; each observation is the 1-hot vector concatenated with averaged word embeddings
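A hedged sketch of that feature construction (the embeddings dict, token list, and make_features helper are assumptions, not the notebook's actual pipeline):
In [ ]:
import numpy as np

# hypothetical sketch: average the word2vec vectors of the headline tokens
# and concatenate them with the 1-hot source vector to form one observation
def make_features(onehot_sources, headline_tokens, embeddings, dim=300):
    vecs = [embeddings[w] for w in headline_tokens if w in embeddings]
    avg = np.mean(vecs, axis=0) if vecs else np.zeros(dim)
    return np.concatenate([np.asarray(onehot_sources, dtype=float), avg])

# NUM_FEATURES is then len(onehot_sources) + dim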
In [ ]:
# imports assumed for Keras 2.x
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l1

# function to generate the feedforward model
# NUM_FEATURES and the custom f_score metric (F-beta, beta=0.667) are defined elsewhere in the notebook
def create_model(nn_size=30, nn_reg_penalty=0.0, nn_dropout=(1.0/3.0)):
    # create model: one ReLU hidden layer with L1 penalty, dropout, sigmoid output
    model = Sequential()
    model.add(Dense(nn_size,
                    activation='relu',
                    kernel_initializer='TruncatedNormal',
                    kernel_regularizer=l1(nn_reg_penalty),
                    input_shape=(NUM_FEATURES,)
                    ))
    model.add(Dropout(nn_dropout))
    model.add(Dense(1,
                    activation='sigmoid',
                    kernel_initializer='TruncatedNormal',
                    kernel_regularizer=l1(nn_reg_penalty)
                    ))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f_score])
    print(model.summary())
    return model
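The f_score metric compiled into the model is custom; a minimal sketch of a batchwise F-beta metric in the Keras backend (beta of 0.667 matches the printed results, but this is an assumed reimplementation, not the notebook's own definition):
In [ ]:
import keras.backend as K

# assumed sketch of an F-beta metric; beta < 1 weights precision more heavily than recall
def f_score(y_true, y_pred, beta=2.0/3.0):
    y_pred = K.round(K.clip(y_pred, 0, 1))
    tp = K.sum(y_true * y_pred)
    precision = tp / (K.sum(y_pred) + K.epsilon())
    recall = tp / (K.sum(y_true) + K.epsilon())
    bb = beta ** 2
    return (1 + bb) * precision * recall / (bb * precision + recall + K.epsilon())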
In [11]:
display(Image('./grid.png', width=368))
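The grid above presumably comes from a hyperparameter sweep over layer size, L1 penalty, and dropout; a hedged sketch of one way to run such a search with the scikit-learn wrapper (the exact grid values and batch size are assumptions):
In [ ]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

# wrap create_model so sklearn can grid-search its keyword arguments
clf = KerasClassifier(build_fn=create_model, epochs=200, batch_size=256, verbose=0)
param_grid = {'nn_size': [16, 32, 64],
              'nn_reg_penalty': [0.0, 1e-4, 1e-3],
              'nn_dropout': [1.0/3.0, 0.5]}
grid = GridSearchCV(clf, param_grid, scoring='f1', cv=3)
# grid_result = grid.fit(X_train, y_train)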
In [ ]:
Pretty good result
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense_129 (Dense) (None, 64) 135872
_________________________________________________________________
dropout_65 (Dropout) (None, 64) 0
_________________________________________________________________
dense_130 (Dense) (None, 1) 65
=================================================================
Total params: 135,937.0
Trainable params: 135,937.0
Non-trainable params: 0.0
_________________________________________________________________
None
12:38:45 Starting
12:38:45 epoch 0 of 200
Train on 130321 samples, validate on 43440 samples
Epoch 1/1
130321/130321 [==============================] - 9s - loss: 0.5374 - acc: 0.9660 - f_score: 0.0077 - val_loss: 0.2127 - val_acc: 0.9724 - val_f_score: 0.0000e+00
...
13:05:05 epoch 199 of 200
Train on 130321 samples, validate on 43440 samples
Epoch 1/1
130321/130321 [==============================] - 7s - loss: 0.0774 - acc: 0.9852 - f_score: 0.6018 - val_loss: 0.0730 - val_acc: 0.9864 - val_f_score: 0.6486
Best Xval loss epoch 133, value 0.072672
NN units 64
Reg_penalty 0.00010000
Dropout 0.5000
Final Train Accuracy 0.988, Train F1 0.760, f_score 0.809 (beta 0.667)
[[126403 1266]
[ 253 2399]]
Final Xval Accuracy 0.987, Xval F1 0.725, f_score 0.767 (beta 0.667)
[[42100 438]
[ 140 762]]
Raw score 2 0.01431860
Test Accuracy 0.987, Test F1 0.731
[[42132 418]
[ 136 754]]
For the LSTM approach, each story is instead dumped as a single line of word tokens, e.g.:
1,domain_otherdomain,subsource_howardlindzon,subsource_jyarow,subsource_ReformedBroker,subsource_NickatFP,subsource_mathewi,subsource_othersubsource,subsource_LongShortTrader,subsource_DKThomp,subsource_Justin_B_Smith,source_Abnormal_Returns,what,go,happen,2017
* label
* domain encoded as a word token
* sources encoded as word tokens
* headline tokens
Dump all the tokens into one big file (actually much smaller than the 1-hot vectors) and train word2vec embeddings on them (possibly a redundant step).
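A minimal sketch of that step with the older gensim (3.x) API, assuming the dump is named tokens.txt and formatted like the comma-separated line above (label first, tokens after it); the filename and training parameters are assumptions:
In [ ]:
import numpy as np
from gensim.models import Word2Vec

# read the comma-separated token dump (label in the first column, tokens after it)
with open('tokens.txt') as f:
    sentences = [line.strip().split(',')[1:] for line in f]

# train 300-dimensional word2vec vectors on the token sequences
w2v = Word2Vec(sentences, size=300, window=5, min_count=5, workers=4)

# build the lookup dictionary and the matrix used to initialize the Embedding layer below
dictionary = {w: i + 1 for i, w in enumerate(w2v.wv.index2word)}  # index 0 reserved for padding
embedding_vector_length = 300
embedding_matrix = np.zeros((len(dictionary) + 1, embedding_vector_length))
for w, i in dictionary.items():
    embedding_matrix[i] = w2v.wv[w]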
Create an LSTM model like this:
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 120, 300) 3000300
_________________________________________________________________
lstm_1 (LSTM) (None, 128) 219648
_________________________________________________________________
dropout_1 (Dropout) (None, 128) 0
_________________________________________________________________
dense_1 (Dense) (None, 1) 129
=================================================================
Total params: 3,220,077.0
Trainable params: 3,220,077.0
Non-trainable params: 0.0
In [ ]:
# imports assumed for Keras 2.x
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.regularizers import l1

# function to generate the LSTM model
# dictionary, embedding_vector_length, embedding_matrix and MAX_LENGTH are defined elsewhere in the notebook
def create_model(lstm_size=30, lstm_reg_penalty=0.0, sigmoid_dropout=(1.0/3.0), sigmoid_reg_penalty=0.0001):
    # create model: pre-trained embeddings -> LSTM -> dropout -> sigmoid output
    model = Sequential()
    model.add(Embedding(len(dictionary) + 1,
                        embedding_vector_length,
                        weights=[embedding_matrix],
                        input_length=MAX_LENGTH,
                        trainable=True))
    # LSTM with lstm_size units
    model.add(LSTM(lstm_size,
                   kernel_regularizer=l1(lstm_reg_penalty)))
    model.add(Dropout(sigmoid_dropout))
    model.add(Dense(1,
                    activation='sigmoid',
                    kernel_initializer='TruncatedNormal',
                    kernel_regularizer=l1(sigmoid_reg_penalty)))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model
Hyperparameters:
Do a grid search; the LSTM generally overfits after very few epochs (near-perfect accuracy in training, mediocre performance on xval/test).
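Given how quickly it overfits, one standard mitigation is to stop on validation loss (a sketch, assuming padded train/validation arrays already exist; this is not what the notebook does next, its actual fix is to freeze the embeddings, below):
In [ ]:
from keras.callbacks import EarlyStopping

# stop training as soon as validation loss stops improving
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1)
# model.fit(X_train, y_train,
#           validation_data=(X_val, y_val),
#           epochs=20, batch_size=256,
#           callbacks=[early_stop])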
Train Accuracy 0.999, Train F1 0.975, f_score 0.981 (beta 0.667)
[[127430 144]
[ 30 3447]]
LSTM units 16
LSTM reg_penalty 0.00000000
Sigmoid dropout 0.5000
Sigmoid reg_penalty 0.00003000
Xval Accuracy 0.981, Xval F1 0.627, f_score 0.670 (beta 0.667)
[[42188 590]
[ 223 683]]
Instead, set the embeddings to trainable=False:
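Inside create_model above, only the trainable flag on the embedding layer changes:
In [ ]:
# freeze the pre-trained word2vec weights so the LSTM cannot overfit them
model.add(Embedding(len(dictionary) + 1,
                    embedding_vector_length,
                    weights=[embedding_matrix],
                    input_length=MAX_LENGTH,
                    trainable=False))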
Finally
Pretty good result, but not as good as the feedforward NN.
In [ ]: