In [26]:
#load.py
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import gensim
import os
import re
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim.models.doc2vec import TaggedDocument

def get_doc_list(folder_name):
    doc_list = []
    file_list = [folder_name + "/" + name for name in os.listdir(folder_name) if name.endswith('txt')]
    for file in file_list:
        st = open(file, 'r').read()
        doc_list.append(st)
    print('Found %s documents under the dir %s ..... ' % (len(file_list), folder_name))
    return doc_list

def get_doc(folder_name):
    doc_list = get_doc_list(folder_name)
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()
    
    taggeddoc = []
    
    texts = []
    for index, i in enumerate(doc_list):
        # for tagged doc
        wordslist = []
        tagslist = []
        
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        
        # remove stop words from tokens
        stopped_tokens = [i  for i in tokens if not i in en_stop]
        
        # remove numbers
        number_tokens = [re.sub(r'[\d]', ' ', i) for i in stopped_tokens]
        number_tokens = ' '.join(number_tokens).split()
        
        #stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]
        #remove empty
        length_tokens = [i for i in stemmed_tokens if len(i) > 1]
        #add tokens to list
        texts.append(length_tokens)
        
        td = TaggedDocument(gensim.utils.to_unicode(str.encode(' '.join(stemmed_tokens))).split(), str(index))
        taggeddoc.append(td)
        
    return taggeddoc

In [27]:
sentence = TaggedDocument(words = [u'some', u'words', u'here'], tags = [u'SEN_1'])

print sentence

In [28]:
# doc2vectest.py
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import codecs
import gensim
#import load
#documents = load.get_doc('docs')
documents = get_doc('/Users/lipingzhang/Desktop/program/doc2vec/word_vectors_game_of_thrones-LIVE/data')
print('Data Loading finished')
print(len(documents), type(documents))

# build the model
model = gensim.models.Doc2Vec(documents, dm = 0, alpha = 0.025, size = 20, min_alpha = 0.025, min_count = 0)

# start training
for epoch in range(200):
    if epoch % 20 == 0:
        print('Now training epoch %s' & epoch)
    model.train(documents)
    # decrease the learning rate
    model.alpha -= 0.002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

# shows the similar words
print(model.most_similar('suppli'))

# shows the learnt embeeding
print(model['suppli'])

# shows the similar docs with id = 2
print(model.doc2vecs.most_similar(str(2)))


---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-28-e26114b05706> in <module>()
      7 #import load
      8 #documents = load.get_doc('docs')
----> 9 documents = get_doc('/Users/lipingzhang/Desktop/program/doc2vec/word_vectors_game_of_thrones-LIVE/data')
     10 print('Data Loading finished')
     11 print(len(documents), type(documents))

<ipython-input-26-0a1705921ded> in get_doc(folder_name)
     46 
     47         #stem tokens
---> 48         stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]
     49         #remove empty
     50         length_tokens = [i for i in stemmed_tokens if len(i) > 1]

/Users/lipingzhang/anaconda/lib/python2.7/site-packages/nltk/stem/porter.pyc in stem(self, word)
    663             return word
    664 
--> 665         stem = self._step1a(stem)
    666         stem = self._step1b(stem)
    667         stem = self._step1c(stem)

/Users/lipingzhang/anaconda/lib/python2.7/site-packages/nltk/stem/porter.pyc in _step1a(self, word)
    288         # that 'flies'->'fli' but 'dies'->'die' etc
    289         if self.mode == self.NLTK_EXTENSIONS:
--> 290             if word.endswith('ies') and len(word) == 4:
    291                 return self._replace_suffix(word, 'ies', 'ie')
    292 

/Users/lipingzhang/anaconda/lib/python2.7/encodings/utf_8.pyc in decode(input, errors)
     14 
     15 def decode(input, errors='strict'):
---> 16     return codecs.utf_8_decode(input, errors, True)
     17 
     18 class IncrementalEncoder(codecs.IncrementalEncoder):

UnicodeDecodeError: 'utf8' codec can't decode byte 0xe2 in position 5: unexpected end of data

In [29]:
model.most_similary('suppli')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-29-4061f4232c9b> in <module>()
----> 1 model.most_similary('suppli')

NameError: name 'model' is not defined

In [30]:
model['suppli']


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-30-43b2d52d1930> in <module>()
----> 1 model['suppli']

NameError: name 'model' is not defined

In [31]:
model.doc2vecs.most_similar(str(2))
model.save('save/trained.model')
model.save_word2vec_format('save/trained.word2vec')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-31-62ff271ec8d3> in <module>()
----> 1 model.doc2vecs.most_similar(str(2))
      2 model.save('save/trained.model')
      3 model.save_word2vec_format('save/trained.word2vec')

NameError: name 'model' is not defined

In [32]:
# load the word2vec
word2vec = gensim.models.Doc2Vec.load_word2vec_format('save/trained.word2vec')
print(word2vec['good'])

# load the doc2vec
model = gensim.models.Doc2Vec.load('save/trained.model')
docvecs = model.docvecs
#print(docvecs[str(3)])


---------------------------------------------------------------------------
DeprecationWarning                        Traceback (most recent call last)
<ipython-input-32-d9369bc5b212> in <module>()
      1 # load the word2vec
----> 2 word2vec = gensim.models.Doc2Vec.load_word2vec_format('save/trained.word2vec')
      3 print(word2vec['good'])
      4 
      5 # load the doc2vec

/Users/lipingzhang/anaconda/lib/python2.7/site-packages/gensim/models/word2vec.pyc in load_word2vec_format(cls, fname, fvocab, binary, encoding, unicode_errors, limit, datatype)
   1519                          limit=None, datatype=REAL):
   1520         """Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead."""
-> 1521         raise DeprecationWarning("Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.")
   1522 
   1523     def save_word2vec_format(self, fname, fvocab=None, binary=False):

DeprecationWarning: Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.

In [33]:
def plotWords():
    # get model, we can use w2v only
    w2v, d2v = useModel()

    words_np = []
    # a list of labels (words)
    words_label = []
    for word in w2v.vocab.keys():
        words_np.append(w2v[word])
        words_label.append(word)

    print('Added %s words. Shape %s' % (len(words_np), np.shape(words_np)))


    pca = decomposition.PCA(n_components = 2)
    pca.fit(words_np)
    reduced = pca.transform(words_np)

    # plt.plot(pca.explained_variance_ratio)
    for index,vec in enumerate(reduced):
        # print('%s %s' % (words_label[index], vec))
        if index <100:
            x,y = vec[0], vec[1]
            plt.scatter(x, y)
            plt.annotate(words_label[index], xy=(x,y))
    plt.show()

In [ ]: