Bible2vec

Use Word2vec to find related scriptures.

After reading our Bible dataset we create "scripture embeddings" by taking the mean vector of the scipture's individual word embeddings. Similar scriptures are those that have a high cosine similarity between their mean vectors.


In [1]:
from gensim.models import Word2Vec
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
from os import path
import glob
import pickle


/Users/JVillella/Development/ml-playground/bible2vec/venv/lib/python3.5/site-packages/gensim/utils.py:1015: UserWarning: Pattern library is not installed, lemmatization won't be available.
  warnings.warn("Pattern library is not installed, lemmatization won't be available.")

Configure paths


In [2]:
WIKIPEDIA_W2V_PATH = '../wiki2vec/data/enwiki.model'
DATA_DIR = './data'
BIBLE_DIR = path.join(DATA_DIR, 'bible')

# Path to save bible embeddings
EMBEDDINGS_PATH = path.join(DATA_DIR, 'embeddings.pkl')

Read Bible


In [3]:
def get_scripture_id(book, chapter, verse):
    book = book.lower().replace(' ', '')
    return '%s-%d-%d' % (book, chapter, verse)

In [4]:
def read_bible():
    """Returns dictionary of { scripture_id: verse }."""
    
    pattern = path.join(BIBLE_DIR, '*')
    book_names = glob.glob(pattern)

    book_names = list(map(path.basename, book_names))
    sciptures = map(read_book, book_names)
    
    bible = {}
    for book_idx, book in enumerate(sciptures):
        for chapter_idx, chapter in enumerate(book):
            for verse_idx, verse in enumerate(chapter):
                book_name = book_names[book_idx]
                s_id = get_scripture_id(book_name, chapter_idx + 1, verse_idx + 1)
                bible[s_id] = verse
    
    return bible

def read_book(book):
    pattern = path.join(BIBLE_DIR, '%s/*.txt' % book)
    n_chapters = len(glob.glob(pattern))
    chapters = [read_chapter(book, n) for n in range(1, n_chapters+1)]
    return chapters
    
def read_chapter(book, chapter):
    filename = path.join(BIBLE_DIR, '%s/%s%d.txt' % (book, book, chapter))
    with open(filename, 'rt') as f:
        lines = f.readlines()
        lines = [re.sub(r'\d+\s', '', l.rstrip()) for l in lines]
    return lines

Setup Word2Vec Model


In [5]:
def word_vec(model, words, normalize=False):
    words = filter(lambda w: w in model.vocab, words)
    vecs = np.array([model[w] for w in words])
    vec = vecs.mean(axis=0)
    if normalize:
        vec = vec / np.linalg.norm(vec)
    return vec

In [6]:
def tokenize(line, stemmer=None, stopwords=None):
    line = re.sub(r'[^\w\s]+', ' ', line)
    line = re.sub(r'\s+', ' ', line)
    line = line.strip().lower()
    words = line.split()    

    if stemmer is not None:
        words = [stemmer.stem(w) for w in words if w not in stopwords]

    if stopwords is not None:
        words = [w for w in words if w not in stopwords]
    
    return words

Load pretrained Wikipedia word2vec model


In [7]:
model = Word2Vec.load(WIKIPEDIA_W2V_PATH)

In [8]:
stemmer = SnowballStemmer('english')
stops = stopwords.words('english')

Calculate Scripture Similarities

scripture id -> scripture text


In [9]:
scriptures = read_bible()

print('%s scriptures' % len(scriptures))


31102 scriptures

embedding idx -> embedding


In [10]:
embeddings = [word_vec(model, tokenize(verse, stemmer=stemmer, stopwords=stops), normalize=True)
                       for verse in scriptures.values()]


/Users/JVillella/Development/ml-playground/bible2vec/venv/lib/python3.5/site-packages/numpy/core/_methods.py:59: RuntimeWarning: Mean of empty slice.
  warnings.warn("Mean of empty slice.", RuntimeWarning)
/Users/JVillella/Development/ml-playground/bible2vec/venv/lib/python3.5/site-packages/numpy/core/_methods.py:70: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)

In [11]:
def nans(shape, dtype=float):
    a = np.empty(shape, dtype)
    a.fill(np.nan)
    return a

# HACK: If word_vec(...) returns nan, turn it into a nan row so we can vstack. Improve this.
embeddings = [nans(400) if np.isnan(e).any() else e for e in embeddings]
embeddings = np.vstack(embeddings)

print('Embeddings shape: ', embeddings.shape)


Embeddings shape:  (31102, 400)

scripture id -> embedding idx


In [12]:
script_ids = scriptures.keys()
embedding_idxs = range(embeddings.shape[0])

script_id_embedding_idx = dict(zip(script_ids, embedding_idxs))

embedding idx -> scripture id


In [13]:
embedding_idx_script_id = dict(zip(embedding_idxs, script_ids))

Save Data


In [14]:
with open(EMBEDDINGS_PATH, 'wb') as f:
    pickle.dump({
        'scriptures': scriptures,
        'embeddings': embeddings,
        'script_id_embedding_idx': script_id_embedding_idx,
        'embedding_idx_script_id': embedding_idx_script_id
    }, f)

Test Results


In [15]:
from IPython.display import HTML

In [16]:
def similar_scriptures(scripture_id, top_k=0, drop_first=True):
    """Assumes all embeddings have been normalized."""
    
    embedding_idx = script_id_embedding_idx[scripture_id]
    embedding = embeddings[embedding_idx]
    
    cosines = embeddings.dot(embedding)
    indexes = np.argsort(-cosines)

    if drop_first:
        indexes = indexes[1:]

    if top_k > 0:
        indexes = indexes[:top_k]
    return [(embedding_idx_script_id[index], scriptures[embedding_idx_script_id[index]], cosines[index], index)
            for index in indexes]

In [17]:
def similar_scriptures_html(scripture_id, top_k=15):   
    def row_html(s):
        row_pattern = """
            <td>{s_id}</td>
            <td>{scripture}</td>
            <td>{score}</td>
        """
        row = row_pattern.format(s_id=s[0], scripture=s[1],
                                 score=np.round(s[2], decimals=2))
        return row
    
    similar = similar_scriptures(scripture_id, top_k=top_k)
    rows = map(row_html, similar)
    rows = ['<tr>\n%s\n</tr>' % r for r in rows]
    
    columns = ['Scripture ID', 'Scripture', 'Score']
    headers = ['<th>%s</th>' % h for h in columns]
    header_row = '<tr>\n%s\n</tr>' % '\n'.join(headers)

    rows = [header_row] + rows
    table = '<table>\n{}\n</table>'.format('\n'.join(rows))
    header = '<h3>Query: (%s) %s</h3>' % (scripture_id, scriptures[scripture_id])
    return header + table

In [18]:
scripture_id = get_scripture_id('Genesis', 1, 1)

HTML(similar_scriptures_html(scripture_id))


Out[18]:

Query: (genesis-1-1) In the beginning God created the heaven and the earth.

Scripture ID Scripture Score
genesis-2-4 These are the generations of the heavens and of the earth when they were created, in the day that the LORD God made the earth and the heavens, 0.86
genesis-6-13 And God said unto Noah, The end of all flesh is come before me; for the earth is filled with violence through them; and, behold, I will destroy them with the earth. 0.81
1kings-8-27 But will God indeed dwell on the earth? behold, the heaven and heaven of heavens cannot contain thee; how much less this house that I have builded? 0.8
genesis-1-17 And God set them in the firmament of the heaven to give light upon the earth, 0.8
john-1-2 The same was in the beginning with God. 0.78
genesis-6-17 And, behold, I, even I, do bring a flood of waters upon the earth, to destroy all flesh, wherein is the breath of life, from under heaven; and every thing that is in the earth shall die. 0.78
psalms-113-6 Who humbleth himself to behold the things that are in heaven, and in the earth! 0.78
jeremiah-22-29 O earth, earth, earth, hear the word of the LORD. 0.77
psalms-68-8 The earth shook, the heavens also dropped at the presence of God: even Sinai itself was moved at the presence of God, the God of Israel. 0.77
deuteronomy-10-14 Behold, the heaven and the heaven of heavens is the LORD's thy God, the earth also, with all that therein is. 0.77
revelation-14-19 And the angel thrust in his sickle into the earth, and gathered the vine of the earth, and cast it into the great winepress of the wrath of God. 0.77
joshua-2-11 And as soon as we had heard these things, our hearts did melt, neither did there remain any more courage in any man, because of you: for the LORD your God, he is God in heaven above, and in earth beneath. 0.77
2chronicles-6-18 But will God in very deed dwell with men on the earth? behold, heaven and the heaven of heavens cannot contain thee; how much less this house which I have built! 0.77
genesis-28-12 And he dreamed, and behold a ladder set up on the earth, and the top of it reached to heaven: and behold the angels of God ascending and descending on it. 0.77
isaiah-42-5 Thus saith God the LORD, he that created the heavens, and stretched them out; he that spread forth the earth, and that which cometh out of it; he that giveth breath unto the people upon it, and spirit to them that walk therein: 0.76