In [154]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pymongo import MongoClient
import requests

from tqdm import tqdm
from collections import Counter

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from keras.preprocessing.sequence import pad_sequences
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.callbacks import ProgbarLogger, Callback

In [42]:
db = MongoClient()['sheldon']
mongo = db.producthunt

In [12]:
resp = requests.get('https://api.producthunt.com/v1/posts?days_ago=1000',
                    headers={
                        'Authorization': 'Bearer XXX'
                    })

In [ ]:
mongo.delete_many({})

In [ ]:
for i in tqdm(range(1, 1001)):
    resp = requests.get(
        'https://api.producthunt.com/v1/posts?days_ago={}'.format(i),
        headers={'Authorization': 'Bearer XXX'}
    )
    documents = []
    for post in resp.json()['posts']:
        # keep only posts from a single category (id 1)
        if post['category_id'] != 1:
            continue
        documents.append({'text': post['tagline'], '_id': post['id'], 'score': post['votes_count']})
    if documents:  # skip days with no matching posts; insert_many rejects an empty list
        mongo.insert_many(documents)

In [56]:
mongo.find().count()


Out[56]:
25455

In [77]:
scores = []
for item in tqdm(mongo.find()):
    scores.append((item['_id'], item['score'], item['text']))


25455it [00:00, 71672.64it/s]

In [75]:
sns.distplot([x[1] for x in scores if x[1] < 1000])


Out[75]:
[plot: distribution of scores below 1000]

In [121]:
scores_limit = 750
x_data, y_data = [], []
for item in mongo.find():
    x_data.append(item['text'].lower())
    y_data.append(min(item['score'], scores_limit))
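
Capping at 750 means that, once divided by `scores_limit` further down, every target lands in [0, 1], the range the model's sigmoid output can produce.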

Char RNN


In [88]:
counter = Counter()
for text in x_data:
    counter.update(text)  # count individual characters
print('Total vocab size:', len(counter))
i = sum(1 for num in counter.values() if num >= 50)
print('Symbols with at least 50 occurrences:', i)
print(counter.most_common(i))


Total vocab size: 452
Symbols with at least 50 occurrences: 56
[(' ', 161616), ('e', 108088), ('o', 81783), ('t', 77169), ('a', 76157), ('r', 69511), ('i', 65870), ('s', 65836), ('n', 61543), ('l', 39376), ('c', 34598), ('d', 33587), ('u', 31074), ('p', 28649), ('h', 27132), ('m', 26788), ('f', 22816), ('y', 21886), ('g', 20363), ('w', 16281), ('b', 15538), ('v', 11236), ('k', 10228), ('.', 5008), (',', 4748), ('-', 3678), ('x', 2574), ('&', 1887), ('j', 1531), ("'", 1503), ('z', 1428), ('0', 1113), ('q', 837), (')', 824), ('(', 818), ('1', 684), ('!', 661), ('/', 570), ('2', 439), ('3', 411), ('+', 356), ('5', 342), ('’', 243), ('6', 242), ('"', 229), ('4', 226), ('9', 209), ('$', 165), ('8', 163), ('?', 162), (':', 124), ('7', 107), ('#', 63), ('%', 61), ('️', 60), ('…', 51)]

In [181]:
# Not implemented yet.
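
The character-level branch was never filled in; as a minimal sketch (not from the notebook), the symbols counted above could be indexed much like the word pipeline below, reserving 0 for padding. The cutoff of 50 occurrences matches the count above; the 100-character cap is an assumption.

In [ ]:
# Sketch only: encode each tagline as a sequence of character indices.
# Uses `counter` from the cell above; index 0 is reserved for padding.
char_vocab = [sym for sym, num in counter.most_common() if num >= 50]
char2idx = {sym: i + 1 for i, sym in enumerate(char_vocab)}
X_char = pad_sequences(
    [[char2idx[c] for c in text if c in char2idx] for text in x_data],
    maxlen=100  # assumed cap on tagline length, in characters
)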

Word RNN


In [94]:
stopwords_list = stopwords.words('english')

In [99]:
counter = Counter()
for text in x_data:
    for word in word_tokenize(text):
        if word not in stopwords_list:
            counter[word] += 1
print('Total vocab size:', len(counter))
i = sum(1 for num in counter.values() if num >= 100)
print('Words with at least 100 occurrences:', i)
print(counter.most_common(i))


Total vocab size: 14635
Words with at least 100 occurrences: 236
[(',', 4690), ('.', 4654), ('&', 1887), ('app', 1396), ("'s", 1084), ('new', 843), (')', 824), ('(', 818), ('way', 785), ('create', 778), ('get', 700), ('share', 682), ('!', 661), ('free', 642), ('simple', 620), ('find', 616), ('best', 612), ('web', 535), ('world', 534), ('ios', 519), ('video', 487), ('friends', 481), ('mobile', 478), ('music', 473), ('one', 468), ('discover', 453), ('beautiful', 451), ('make', 434), ('people', 431), ('photos', 415), ('design', 410), ('iphone', 403), ('slack', 402), ('mac', 398), ('smart', 393), ('apps', 390), ('email', 387), ('videos', 383), ('social', 372), ('online', 359), ('time', 357), ('platform', 350), ('phone', 341), ('turn', 340), ('first', 336), ('tool', 324), ('easy', 314), ('build', 309), ('data', 308), ('watch', 308), ('live', 303), ('product', 298), ('every', 298), ('send', 296), ('news', 293), ('curated', 276), ('track', 276), ('text', 273), ('search', 270), ('content', 268), ('personal', 265), ('chat', 264), ('android', 257), ('website', 253), ('better', 252), ('google', 251), ('facebook', 249), ('made', 247), ('service', 243), ('game', 240), ('open', 237), ('twitter', 237), ('home', 236), ('using', 234), ('real', 233), ('favorite', 231), ('bot', 230), ('business', 228), ('easiest', 228), ('like', 224), ('learn', 223), ('photo', 223), ('help', 222), ('code', 221), ('apple', 221), ('team', 218), ('see', 218), ('daily', 213), ('use', 212), ('startup', 211), ('messenger', 206), ('without', 206), ('day', 205), ('easily', 203), ('+', 201), ('browser', 193), ('place', 192), ('products', 190), ('keyboard', 190), ('based', 186), ('fast', 185), ('manage', 184), ('life', 184), ('instagram', 183), ('chrome', 182), ('-', 181), ('book', 178), ('custom', 177), ('management', 177), ('tools', 177), ('real-time', 176), ('analytics', 175), ('source', 175), ('community', 174), ('add', 173), ('emoji', 172), ('gifs', 172), ('hunt', 172), ('digital', 170), ('tech', 169), ('stories', 169), ('work', 168), ('interactive', 168), ('$', 165), ('fun', 165), ('tv', 164), ('w/', 163), ('top', 162), ('?', 162), ('art', 160), ('startups', 160), ('media', 159), ('anywhere', 158), ('network', 157), ('camera', 155), ('right', 155), ('go', 155), ('site', 155), ('travel', 154), ('pre-launch', 152), ('api', 150), ('via', 150), ('delivered', 149), ('awesome', 148), ('list', 147), ('play', 147), ('around', 146), ('marketing', 145), ('developers', 145), ('great', 145), ('experience', 144), ('software', 142), ('minutes', 142), ('3d', 142), ('things', 140), ('images', 139), ('powerful', 137), ('ai', 136), ('keep', 136), ('buy', 136), ('seconds', 135), ('screen', 135), ('control', 134), ('marketplace', 133), ('sharing', 133), ('kids', 132), ('device', 132), ('messaging', 131), ('tracking', 131), ('take', 131), ('collection', 131), ('cloud', 131), ('messages', 130), ('next', 130), ('stickers', 130), ('designers', 130), ('on-demand', 130), ('page', 129), ('teams', 128), ('local', 127), ('ideas', 127), ('instantly', 125), ("n't", 124), ('editor', 123), ('explore', 123), ('love', 123), (':', 122), ('need', 120), ('money', 119), ('connect', 119), ("''", 119), ('virtual', 118), ('personalized', 118), ('emails', 117), ('ipad', 117), ('websites', 116), ('extension', 115), ('never', 115), ('projects', 115), ('assistant', 115), ('internet', 115), ('events', 115), ('visual', 113), ('imessage', 113), ('project', 113), ('anything', 113), ('devices', 112), ('store', 112), ('modern', 111), ('learning', 110), ('tinder', 110), ('resources', 110), ('save', 110), ('everything', 110), ('``', 110), 
('users', 108), ('ui', 107), ('youtube', 107), ('audio', 106), ('read', 105), ('food', 105), ('private', 103), ('cards', 103), ('powered', 103), ('animated', 103), ('color', 103), ('links', 102), ('instant', 102), ('tab', 101), ('pocket', 101), ('click', 101), ('coffee', 101), ('back', 101), ('professional', 100), ('companies', 100), ('perfect', 100)]

In [100]:
vocab_size = 236  # the number of words with at least 100 occurrences, per the count above
vocab_list = [x[0] for x in counter.most_common(vocab_size)]

In [102]:
words_num = []
for text in x_data:
    words_num.append(sum(1 for word in word_tokenize(text) if word not in stopwords_list))
sns.distplot(words_num)


Out[102]:
[plot: distribution of non-stopword word counts per tagline]

In [103]:
seq_size = 10  # long enough for most taglines, judging by the distribution above

In [140]:
word2idx = {word: i for i, word in enumerate(vocab_list)}
X = []
for text in tqdm(x_data):
    # note: index 0 (the most frequent token) doubles as the padding value below
    X.append([word2idx[word] for word in word_tokenize(text)
              if word not in stopwords_list and word in word2idx])


100%|██████████| 25455/25455 [00:03<00:00, 7100.77it/s]

In [141]:
X = pad_sequences(X, maxlen=seq_size)
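
By default `pad_sequences` left-pads with zeros and truncates from the front, which is why the matrix printed below has leading zeros. A quick check on a made-up sequence:

In [ ]:
pad_sequences([[5, 7]], maxlen=seq_size)
# -> array([[0, 0, 0, 0, 0, 0, 0, 0, 5, 7]], dtype=int32)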

In [125]:
Y = np.array(y_data) / scores_limit  # normalize capped scores into [0, 1]

In [143]:
X


Out[143]:
array([[  0,   0,   0, ...,  47, 151,   1],
       [  0,   0,   0, ...,   0,   0,  65],
       [  0,   0,   0, ...,   0,  57,  32],
       ..., 
       [  0,   0,   0, ..., 153,  30,   3],
       [  0,   0,   0, ...,   0,   0,   2],
       [  0,   0,   0, ...,   0,   0,  58]], dtype=int32)

In [161]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length, input_length=seq_size))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
# targets are normalized scores in [0, 1], so binary crossentropy acts as a
# soft regression loss; 'accuracy' is not very informative here
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_7 (Embedding)      (None, 10, 32)            7552      
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
=================================================================
Total params: 60,853.0
Trainable params: 60,853
Non-trainable params: 0.0
_________________________________________________________________
None
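
The summary's parameter counts can be verified by hand:

In [ ]:
# where the parameter counts come from
print(236 * 32)                      # Embedding: one 32-dim vector per word -> 7552
print(4 * ((32 + 100) * 100 + 100))  # LSTM: 4 gates x (input + recurrent + bias weights) -> 53200
print(100 + 1)                       # Dense: 100 weights + 1 bias -> 101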

In [155]:
class LossHistory(Callback):
    def on_train_begin(self, logs=None):
        self.loss = []
        self.acc = []

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        print('Epoch end:', logs.get('loss'))
        self.loss.append(logs.get('loss'))
        self.acc.append(logs.get('acc'))

In [ ]:
history = model.fit(X, Y, epochs=200, verbose=0, callbacks=[LossHistory()])
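
No validation data is held out above, so the callback only ever sees training loss. As a hedged aside, Keras' `fit` accepts a `validation_split` argument that would reserve a slice of the data for monitoring overfitting:

In [ ]:
# sketch only: the same call with 10% of the data held out
history = model.fit(X, Y, epochs=200, verbose=0,
                    validation_split=0.1, callbacks=[LossHistory()])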

In [163]:
plt.plot(history.history['loss'])


Out[163]:
[plot: training loss over 200 epochs]

In [166]:
sns.distplot(model.predict(X))
sns.distplot(Y)


Out[166]:
[plot: distributions of predicted vs. actual normalized scores]

Demo


In [180]:
STARTUP_TAGLINE = 'Interactive mystery and unannounced monthly deliveries'
record = [word2idx[word] for word in word_tokenize(STARTUP_TAGLINE.lower())
          if word in word2idx]
x_eval = pad_sequences([record], maxlen=seq_size)
print('"{}" will receive {} points'.format(STARTUP_TAGLINE,
                                            int(model.predict(x_eval)[0][0] * scores_limit)))


"Interactive mystery and unannounced monthly deliveries" will recieve 182 points