saving_google_w2v_model



In [1]:
import gensim

In [8]:
# ======= KEYWORD RANKING ========
# ================================

model = "/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/w2v_models/GoogleNews-vectors-negative300.bin.gz"

print("loading Word2Vec model...")
model = gensim.models.Word2Vec.load_word2vec_format(model, limit=125000, binary=True)
print("loaded model!")


loading Word2Vec model...
loaded model!

In [9]:
model.save_word2vec_format("modelling", binary=True, fvocab="vocabz.txt")

In [5]:
from __future__ import division

import struct
import sys

FILE_NAME = "/Users/Belal/Projects/jobs/i2x_job/keyword_xtract/w2v_models/GoogleNews-vectors-negative300.bin.gz"
MAX_VECTORS = 200000 # This script takes a lot of RAM (>2GB for 200K vectors), if you want to use the full 3M embeddings then you probably need to insert the vectors into some kind of database
FLOAT_SIZE = 4 # 32bit float

vectors = dict()

with open(FILE_NAME, 'rb') as f:
    
    c = None
    
    # read the header
    header = ""
    while c != "\n":
        c = f.read(1)
        header += c

    total_num_vectors, vector_len = (int(x) for x in header.split())
    num_vectors = min(MAX_VECTORS, total_num_vectors)
    
    print "Number of vectors: %d/%d" % (num_vectors, total_num_vectors)
    print "Vector size: %d" % vector_len

    while len(vectors) < num_vectors:

        word = ""        
        while True:
            c = f.read(1)
            if c == " ":
                break
            word += c

        binary_vector = f.read(FLOAT_SIZE * vector_len)
        vectors[word] = [ struct.unpack_from('f', binary_vector, i)[0] 
                          for i in xrange(0, len(binary_vector), FLOAT_SIZE) ]
        
        sys.stdout.write("%d%%\r" % (len(vectors) / num_vectors * 100))
        sys.stdout.flush()

import cPickle

print "\nSaving..."
with open(FILE_NAME[:-3] + "pcl", 'wb') as f:
    cPickle.dump(vectors, f, cPickle.HIGHEST_PROTOCOL)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-5-0bb1d2f35159> in <module>()
     20         header += c
     21 
---> 22     total_num_vectors, vector_len = (int(x) for x in header.split())
     23     num_vectors = min(MAX_VECTORS, total_num_vectors)
     24 

<ipython-input-5-0bb1d2f35159> in <genexpr>((x,))
     20         header += c
     21 
---> 22     total_num_vectors, vector_len = (int(x) for x in header.split())
     23     num_vectors = min(MAX_VECTORS, total_num_vectors)
     24 

ValueError: invalid literal for int() with base 10: '\x1f\x8b\x08\x08\x80\xff\xa8R\x02\x03GoogleNews-vectors-negative300.bin'

In [6]:
import cPickle

print ("\nSaving...")
with open(FILE_NAME[:-3] + "pcl", 'wb') as f:
    cPickle.dump(vectors, f, cPickle.HIGHEST_PROTOCOL)


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-6-7640a1e47ddb> in <module>()
----> 1 import cPickle
      2 

ImportError: No module named 'cPickle'

In [ ]: