In [1]:
import gensim
In [8]:
# ======= KEYWORD RANKING ========
# ================================
model = "/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/w2v_models/GoogleNews-vectors-negative300.bin.gz"
print("loading Word2Vec model...")
model = gensim.models.Word2Vec.load_word2vec_format(model, limit=125000, binary=True)
print("loaded model!")
In [9]:
# write the trimmed 125k-word model back to disk, plus its vocabulary in vocabz.txt
model.save_word2vec_format("modelling", binary=True, fvocab="vocabz.txt")
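The trimmed model can later be read straight back from these two files instead of reloading the full GoogleNews archive; a minimal sketch reusing the file names written above.
In [ ]:
# reload the trimmed model written above (much faster than reloading the full GoogleNews file)
small_model = gensim.models.KeyedVectors.load_word2vec_format(
    "modelling", binary=True, fvocab="vocabz.txt")
print(small_model.vector_size)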
In [5]:
from __future__ import division
import gzip
import struct
import sys

FILE_NAME = "/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/w2v_models/GoogleNews-vectors-negative300.bin.gz"
MAX_VECTORS = 200000  # this script needs a lot of RAM (>2 GB for 200K vectors); to use the full 3M embeddings you would probably need to insert the vectors into some kind of database
FLOAT_SIZE = 4  # 32-bit float
vectors = dict()
# the file is gzip-compressed, so read it through gzip rather than a plain open()
with gzip.open(FILE_NAME, 'rb') as f:
    c = None
    # read the header line: "<vocab size> <vector dimension>\n"
    header = ""
    while c != "\n":
        c = f.read(1)
        header += c
    total_num_vectors, vector_len = (int(x) for x in header.split())
    num_vectors = min(MAX_VECTORS, total_num_vectors)

    print "Number of vectors: %d/%d" % (num_vectors, total_num_vectors)
    print "Vector size: %d" % vector_len

    while len(vectors) < num_vectors:
        # each entry is the word as plain text, terminated by a space
        word = ""
        while True:
            c = f.read(1)
            if c == " ":
                break
            if c == "\n":  # some .bin files separate entries with a newline; skip it
                continue
            word += c

        # the vector follows as vector_len packed 32-bit floats
        binary_vector = f.read(FLOAT_SIZE * vector_len)
        vectors[word] = [struct.unpack_from('f', binary_vector, i)[0]
                         for i in xrange(0, len(binary_vector), FLOAT_SIZE)]

        sys.stdout.write("%d%%\r" % (len(vectors) / num_vectors * 100))
        sys.stdout.flush()

import cPickle

print "\nSaving..."
with open(FILE_NAME[:-3] + "pcl", 'wb') as f:
    cPickle.dump(vectors, f, cPickle.HIGHEST_PROTOCOL)
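To confirm the pickled dict round-trips, it can be loaded back and two raw vectors compared by cosine similarity; a minimal sketch, assuming FILE_NAME from the cell above and that both query words fall within the first 200K entries.
In [ ]:
# reload the pickled {word: vector} dict and compare two raw vectors by cosine similarity
import cPickle
import numpy as np

with open(FILE_NAME[:-3] + "pcl", 'rb') as f:
    vectors = cPickle.load(f)

def cosine(a, b):
    a, b = np.asarray(a), np.asarray(b)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

print(cosine(vectors["speech"], vectors["voice"]))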
In [ ]: