In [1]:
from gensim.models import Word2Vec
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
from os import path
import glob
import pickle
Configure paths
In [2]:
WIKIPEDIA_W2V_PATH = '../wiki2vec/data/enwiki.model'
DATA_DIR = './data'
BIBLE_DIR = path.join(DATA_DIR, 'bible')
# Path to save bible embeddings
EMBEDDINGS_PATH = path.join(DATA_DIR, 'embeddings.pkl')
In [3]:
def get_scripture_id(book, chapter, verse):
    book = book.lower().replace(' ', '')
    return '%s-%d-%d' % (book, chapter, verse)
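Book names are lowercased and spaces are dropped, so IDs stay a single token:
get_scripture_id('1 Kings', 2, 3)  # -> '1kings-2-3'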
In [4]:
def read_bible():
    """Returns dictionary of { scripture_id: verse }."""
    pattern = path.join(BIBLE_DIR, '*')
    book_names = glob.glob(pattern)
    book_names = list(map(path.basename, book_names))
    scriptures = map(read_book, book_names)
    bible = {}
    for book_idx, book in enumerate(scriptures):
        for chapter_idx, chapter in enumerate(book):
            for verse_idx, verse in enumerate(chapter):
                book_name = book_names[book_idx]
                s_id = get_scripture_id(book_name, chapter_idx + 1, verse_idx + 1)
                bible[s_id] = verse
    return bible

def read_book(book):
    pattern = path.join(BIBLE_DIR, '%s/*.txt' % book)
    n_chapters = len(glob.glob(pattern))
    chapters = [read_chapter(book, n) for n in range(1, n_chapters + 1)]
    return chapters

def read_chapter(book, chapter):
    filename = path.join(BIBLE_DIR, '%s/%s%d.txt' % (book, book, chapter))
    with open(filename, 'rt') as f:
        lines = f.readlines()
    # Strip trailing whitespace and the leading verse number from each line.
    lines = [re.sub(r'^\d+\s', '', l.rstrip()) for l in lines]
    return lines
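The readers above assume one directory per book and one plain-text file per chapter; the layout below is inferred from the glob patterns, not specified anywhere else:
# Assumed layout (inferred from the glob patterns above):
#   ./data/bible/genesis/genesis1.txt
#   ./data/bible/genesis/genesis2.txt
#   ...
# with one verse per line, e.g. "1 In the beginning God created ..."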
In [5]:
def word_vec(model, words, normalize=False):
    # Drop out-of-vocabulary words, then average the remaining word vectors.
    words = [w for w in words if w in model.vocab]
    vecs = np.array([model[w] for w in words])
    vec = vecs.mean(axis=0)  # NaN if no words survived the vocabulary filter
    if normalize:
        vec = vec / np.linalg.norm(vec)
    return vec
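A verse embedding is just the mean of its word vectors; with normalize=True the result has unit length, so dot products later are cosine similarities. A minimal check, assuming both tokens are in the model's vocabulary:
v = word_vec(model, ['heaven', 'earth'], normalize=True)
print(np.linalg.norm(v))  # ~1.0: normalized, so dot products are cosines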
In [6]:
def tokenize(line, stemmer=None, stopwords=None):
    # Replace punctuation with spaces, collapse whitespace, and lowercase.
    line = re.sub(r'[^\w\s]+', ' ', line)
    line = re.sub(r'\s+', ' ', line)
    line = line.strip().lower()
    words = line.split()
    # Filter stopwords before stemming, while they still match the stopword list.
    if stopwords is not None:
        words = [w for w in words if w not in stopwords]
    if stemmer is not None:
        words = [stemmer.stem(w) for w in words]
    return words
Load pretrained Wikipedia word2vec model
In [7]:
model = Word2Vec.load(WIKIPEDIA_W2V_PATH)
In [8]:
stemmer = SnowballStemmer('english')
stops = stopwords.words('english')
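A quick sanity check of the tokenizer (exact stems depend on the NLTK Snowball version, so treat the output as illustrative):
tokenize('In the beginning God created the heaven and the earth.',
         stemmer=stemmer, stopwords=stops)
# -> ['begin', 'god', 'creat', 'heaven', 'earth']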
scripture id -> scripture text
In [9]:
scriptures = read_bible()
print('%s scriptures' % len(scriptures))
embedding idx -> embedding
In [10]:
embeddings = [word_vec(model, tokenize(verse, stemmer=stemmer, stopwords=stops), normalize=True)
              for verse in scriptures.values()]
In [11]:
def nans(shape, dtype=float):
    a = np.empty(shape, dtype)
    a.fill(np.nan)
    return a

# HACK: word_vec(...) returns a NaN scalar when a verse has no in-vocabulary
# words. Replace those with a NaN row (400 = the model's embedding
# dimensionality) so we can vstack. Improve this.
embeddings = [nans(400) if np.isnan(e).any() else e for e in embeddings]
embeddings = np.vstack(embeddings)
print('Embeddings shape: ', embeddings.shape)
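It is worth counting those placeholder rows; their NaN cosines sort to the end of the search below, so they should never surface in a top-k result:
n_empty = np.isnan(embeddings).any(axis=1).sum()
print('%d verses had no in-vocabulary words' % n_empty)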
scripture id -> embedding idx
In [12]:
# keys() and values() iterate in the same order, so this zip lines up with
# the embeddings built from scriptures.values() above.
script_ids = scriptures.keys()
embedding_idxs = range(embeddings.shape[0])
script_id_embedding_idx = dict(zip(script_ids, embedding_idxs))
embedding idx -> scripture id
In [13]:
embedding_idx_script_id = dict(zip(embedding_idxs, script_ids))
In [14]:
with open(EMBEDDINGS_PATH, 'wb') as f:
    pickle.dump({
        'scriptures': scriptures,
        'embeddings': embeddings,
        'script_id_embedding_idx': script_id_embedding_idx,
        'embedding_idx_script_id': embedding_idx_script_id
    }, f)
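For reuse elsewhere, the pickle loads back into the same four objects:
with open(EMBEDDINGS_PATH, 'rb') as f:
    data = pickle.load(f)
print(data['embeddings'].shape)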
In [15]:
from IPython.display import HTML
In [16]:
def similar_scriptures(scripture_id, top_k=0, drop_first=True):
    """Returns the scriptures nearest the query by cosine similarity.

    Assumes all embeddings have been normalized, so a dot product is a
    cosine. top_k=0 returns all matches; drop_first skips the query itself.
    """
    embedding_idx = script_id_embedding_idx[scripture_id]
    embedding = embeddings[embedding_idx]
    cosines = embeddings.dot(embedding)
    indexes = np.argsort(-cosines)
    if drop_first:
        indexes = indexes[1:]
    if top_k > 0:
        indexes = indexes[:top_k]
    return [(embedding_idx_script_id[index], scriptures[embedding_idx_script_id[index]], cosines[index], index)
            for index in indexes]
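Each result is a (scripture_id, verse, cosine, embedding_idx) tuple, so a plain-text query looks like this (the HTML version follows below):
for s_id, verse, score, _ in similar_scriptures(get_scripture_id('Genesis', 1, 1), top_k=3):
    print('%.2f %s: %s' % (score, s_id, verse))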
In [17]:
def similar_scriptures_html(scripture_id, top_k=15):
    def row_html(s):
        row_pattern = """
        <td>{s_id}</td>
        <td>{scripture}</td>
        <td>{score}</td>
        """
        row = row_pattern.format(s_id=s[0], scripture=s[1],
                                 score=np.round(s[2], decimals=2))
        return row

    similar = similar_scriptures(scripture_id, top_k=top_k)
    rows = map(row_html, similar)
    rows = ['<tr>\n%s\n</tr>' % r for r in rows]
    columns = ['Scripture ID', 'Scripture', 'Score']
    headers = ['<th>%s</th>' % h for h in columns]
    header_row = '<tr>\n%s\n</tr>' % '\n'.join(headers)
    rows = [header_row] + rows
    table = '<table>\n{}\n</table>'.format('\n'.join(rows))
    header = '<h3>Query: (%s) %s</h3>' % (scripture_id, scriptures[scripture_id])
    return header + table
In [18]:
scripture_id = get_scripture_id('Genesis', 1, 1)
HTML(similar_scriptures_html(scripture_id))
Out[18]:
[Rendered HTML table: the top-15 verses most similar to genesis-1-1, with scripture ID, text, and cosine score]