In [1]:
import pandas as pd
import sklearn
from bs4 import BeautifulSoup
import re
import nltk.data
from nltk.corpus import stopwords

In [2]:
train = pd.read_csv( "./data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )
test = pd.read_csv( "./data/testData.tsv", header=0, delimiter="\t", quoting=3 )
unlabeled_train = pd.read_csv( "./data/unlabeledTrainData.tsv", header=0, delimiter="\t", quoting=3 )

In [3]:
# 定义与处理函数
def review_to_wordlist(review, remove_stopwords=False):
    # 去掉 html
    review_text = BeautifulSoup(review, "html5lib").get_text()
    # 去掉 none letter
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    # 转换大小写并分割
    words = letters_only.lower().split()
    # stop_words
    # 删除 stop_words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    
    return words

In [4]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [5]:
def review_to_sentences(review, tokenizer, remove_stopwords=False):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_wordlist(raw_sentence, remove_stopwords))
    
    return sentences

In [6]:
sentences = []  # Initialize an empty list of sentences

print("Parsing sentences from training set")
for review in train["review"]:
    sentences += review_to_sentences(review, tokenizer)

print("Parsing sentences from unlabeled set")
for review in unlabeled_train["review"]:
    sentences += review_to_sentences(review, tokenizer)


Parsing sentences from training set
/home/quoniammm/anaconda3/envs/tf/lib/python3.6/site-packages/bs4/__init__.py:219: UserWarning: "b'.'" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  ' Beautiful Soup.' % markup)
/home/quoniammm/anaconda3/envs/tf/lib/python3.6/site-packages/bs4/__init__.py:282: UserWarning: "http://www.happierabroad.com"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
Parsing sentences from unlabeled set
/home/quoniammm/anaconda3/envs/tf/lib/python3.6/site-packages/bs4/__init__.py:282: UserWarning: "http://www.archive.org/details/LovefromaStranger"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/home/quoniammm/anaconda3/envs/tf/lib/python3.6/site-packages/bs4/__init__.py:282: UserWarning: "http://www.loosechangeguide.com/LooseChangeGuide.html"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/home/quoniammm/anaconda3/envs/tf/lib/python3.6/site-packages/bs4/__init__.py:282: UserWarning: "http://www.msnbc.msn.com/id/4972055/site/newsweek/"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/home/quoniammm/anaconda3/envs/tf/lib/python3.6/site-packages/bs4/__init__.py:219: UserWarning: "b'..'" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.
  ' Beautiful Soup.' % markup)
/home/quoniammm/anaconda3/envs/tf/lib/python3.6/site-packages/bs4/__init__.py:282: UserWarning: "http://www.youtube.com/watch?v=a0KSqelmgN8"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
/home/quoniammm/anaconda3/envs/tf/lib/python3.6/site-packages/bs4/__init__.py:282: UserWarning: "http://jake-weird.blogspot.com/2007/08/beneath.html"" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup

In [7]:
sentences[0]


Out[7]:
['with',
 'all',
 'this',
 'stuff',
 'going',
 'down',
 'at',
 'the',
 'moment',
 'with',
 'mj',
 'i',
 've',
 'started',
 'listening',
 'to',
 'his',
 'music',
 'watching',
 'the',
 'odd',
 'documentary',
 'here',
 'and',
 'there',
 'watched',
 'the',
 'wiz',
 'and',
 'watched',
 'moonwalker',
 'again']

In [8]:
import logging
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)

num_features = 300
context = 10 # window_size
downsampling = 1e-3
min_word_count = 40
num_workers = 4

In [9]:
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
    sample=downsampling
)

model.init_sims(replace=True)
model_name = "300features_40minwords_10context"
model.save(model_name)


2017-08-30 14:37:36,861: INFO: 'pattern' package not found; tag filters are not available for English
2017-08-30 14:37:36,868: INFO: collecting all words and their counts
2017-08-30 14:37:36,869: INFO: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-08-30 14:37:36,937: INFO: PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2017-08-30 14:37:37,010: INFO: PROGRESS: at sentence #20000, processed 451892 words, keeping 24948 word types
Training model...
2017-08-30 14:37:37,077: INFO: PROGRESS: at sentence #30000, processed 671315 words, keeping 30034 word types
2017-08-30 14:37:37,148: INFO: PROGRESS: at sentence #40000, processed 897815 words, keeping 34348 word types
2017-08-30 14:37:37,216: INFO: PROGRESS: at sentence #50000, processed 1116963 words, keeping 37761 word types
2017-08-30 14:37:37,286: INFO: PROGRESS: at sentence #60000, processed 1338404 words, keeping 40723 word types
2017-08-30 14:37:37,361: INFO: PROGRESS: at sentence #70000, processed 1561580 words, keeping 43333 word types
2017-08-30 14:37:37,436: INFO: PROGRESS: at sentence #80000, processed 1780887 words, keeping 45714 word types
2017-08-30 14:37:37,526: INFO: PROGRESS: at sentence #90000, processed 2004996 words, keeping 48135 word types
2017-08-30 14:37:37,609: INFO: PROGRESS: at sentence #100000, processed 2226967 words, keeping 50207 word types
2017-08-30 14:37:37,673: INFO: PROGRESS: at sentence #110000, processed 2446581 words, keeping 52081 word types
2017-08-30 14:37:37,738: INFO: PROGRESS: at sentence #120000, processed 2668776 words, keeping 54119 word types
2017-08-30 14:37:37,802: INFO: PROGRESS: at sentence #130000, processed 2894304 words, keeping 55847 word types
2017-08-30 14:37:37,862: INFO: PROGRESS: at sentence #140000, processed 3107006 words, keeping 57346 word types
2017-08-30 14:37:37,923: INFO: PROGRESS: at sentence #150000, processed 3332628 words, keeping 59055 word types
2017-08-30 14:37:37,988: INFO: PROGRESS: at sentence #160000, processed 3555316 words, keeping 60617 word types
2017-08-30 14:37:38,048: INFO: PROGRESS: at sentence #170000, processed 3778656 words, keeping 62077 word types
2017-08-30 14:37:38,108: INFO: PROGRESS: at sentence #180000, processed 3999237 words, keeping 63496 word types
2017-08-30 14:37:38,169: INFO: PROGRESS: at sentence #190000, processed 4224450 words, keeping 64794 word types
2017-08-30 14:37:38,231: INFO: PROGRESS: at sentence #200000, processed 4448604 words, keeping 66087 word types
2017-08-30 14:37:38,298: INFO: PROGRESS: at sentence #210000, processed 4669968 words, keeping 67390 word types
2017-08-30 14:37:38,364: INFO: PROGRESS: at sentence #220000, processed 4894969 words, keeping 68697 word types
2017-08-30 14:37:38,429: INFO: PROGRESS: at sentence #230000, processed 5117546 words, keeping 69958 word types
2017-08-30 14:37:38,493: INFO: PROGRESS: at sentence #240000, processed 5345051 words, keeping 71167 word types
2017-08-30 14:37:38,552: INFO: PROGRESS: at sentence #250000, processed 5559166 words, keeping 72351 word types
2017-08-30 14:37:38,612: INFO: PROGRESS: at sentence #260000, processed 5779147 words, keeping 73478 word types
2017-08-30 14:37:38,674: INFO: PROGRESS: at sentence #270000, processed 6000436 words, keeping 74767 word types
2017-08-30 14:37:38,736: INFO: PROGRESS: at sentence #280000, processed 6226315 words, keeping 76369 word types
2017-08-30 14:37:38,800: INFO: PROGRESS: at sentence #290000, processed 6449475 words, keeping 77839 word types
2017-08-30 14:37:38,862: INFO: PROGRESS: at sentence #300000, processed 6674078 words, keeping 79171 word types
2017-08-30 14:37:38,929: INFO: PROGRESS: at sentence #310000, processed 6899392 words, keeping 80480 word types
2017-08-30 14:37:38,997: INFO: PROGRESS: at sentence #320000, processed 7124279 words, keeping 81808 word types
2017-08-30 14:37:39,058: INFO: PROGRESS: at sentence #330000, processed 7346022 words, keeping 83030 word types
2017-08-30 14:37:39,124: INFO: PROGRESS: at sentence #340000, processed 7575534 words, keeping 84280 word types
2017-08-30 14:37:39,189: INFO: PROGRESS: at sentence #350000, processed 7798804 words, keeping 85425 word types
2017-08-30 14:37:39,250: INFO: PROGRESS: at sentence #360000, processed 8019467 words, keeping 86596 word types
2017-08-30 14:37:39,318: INFO: PROGRESS: at sentence #370000, processed 8246659 words, keeping 87708 word types
2017-08-30 14:37:39,379: INFO: PROGRESS: at sentence #380000, processed 8471806 words, keeping 88878 word types
2017-08-30 14:37:39,449: INFO: PROGRESS: at sentence #390000, processed 8701556 words, keeping 89907 word types
2017-08-30 14:37:39,511: INFO: PROGRESS: at sentence #400000, processed 8924505 words, keeping 90916 word types
2017-08-30 14:37:39,573: INFO: PROGRESS: at sentence #410000, processed 9145855 words, keeping 91880 word types
2017-08-30 14:37:39,634: INFO: PROGRESS: at sentence #420000, processed 9366935 words, keeping 92912 word types
2017-08-30 14:37:39,696: INFO: PROGRESS: at sentence #430000, processed 9594472 words, keeping 93932 word types
2017-08-30 14:37:39,761: INFO: PROGRESS: at sentence #440000, processed 9821225 words, keeping 94906 word types
2017-08-30 14:37:39,820: INFO: PROGRESS: at sentence #450000, processed 10044987 words, keeping 96036 word types
2017-08-30 14:37:39,890: INFO: PROGRESS: at sentence #460000, processed 10277747 words, keeping 97088 word types
2017-08-30 14:37:39,958: INFO: PROGRESS: at sentence #470000, processed 10505672 words, keeping 97933 word types
2017-08-30 14:37:40,015: INFO: PROGRESS: at sentence #480000, processed 10726056 words, keeping 98862 word types
2017-08-30 14:37:40,081: INFO: PROGRESS: at sentence #490000, processed 10952800 words, keeping 99871 word types
2017-08-30 14:37:40,141: INFO: PROGRESS: at sentence #500000, processed 11174456 words, keeping 100765 word types
2017-08-30 14:37:40,208: INFO: PROGRESS: at sentence #510000, processed 11399731 words, keeping 101699 word types
2017-08-30 14:37:40,270: INFO: PROGRESS: at sentence #520000, processed 11623082 words, keeping 102598 word types
2017-08-30 14:37:40,337: INFO: PROGRESS: at sentence #530000, processed 11847480 words, keeping 103400 word types
2017-08-30 14:37:40,402: INFO: PROGRESS: at sentence #540000, processed 12072095 words, keeping 104265 word types
2017-08-30 14:37:40,470: INFO: PROGRESS: at sentence #550000, processed 12297646 words, keeping 105133 word types
2017-08-30 14:37:40,533: INFO: PROGRESS: at sentence #560000, processed 12518936 words, keeping 105997 word types
2017-08-30 14:37:40,599: INFO: PROGRESS: at sentence #570000, processed 12748083 words, keeping 106787 word types
2017-08-30 14:37:40,666: INFO: PROGRESS: at sentence #580000, processed 12969579 words, keeping 107665 word types
2017-08-30 14:37:40,732: INFO: PROGRESS: at sentence #590000, processed 13195104 words, keeping 108501 word types
2017-08-30 14:37:40,797: INFO: PROGRESS: at sentence #600000, processed 13417302 words, keeping 109218 word types
2017-08-30 14:37:40,861: INFO: PROGRESS: at sentence #610000, processed 13638325 words, keeping 110092 word types
2017-08-30 14:37:40,931: INFO: PROGRESS: at sentence #620000, processed 13864650 words, keeping 110837 word types
2017-08-30 14:37:40,997: INFO: PROGRESS: at sentence #630000, processed 14088936 words, keeping 111610 word types
2017-08-30 14:37:41,059: INFO: PROGRESS: at sentence #640000, processed 14309719 words, keeping 112416 word types
2017-08-30 14:37:41,124: INFO: PROGRESS: at sentence #650000, processed 14535475 words, keeping 113196 word types
2017-08-30 14:37:41,191: INFO: PROGRESS: at sentence #660000, processed 14758265 words, keeping 113945 word types
2017-08-30 14:37:41,257: INFO: PROGRESS: at sentence #670000, processed 14981658 words, keeping 114643 word types
2017-08-30 14:37:41,321: INFO: PROGRESS: at sentence #680000, processed 15206490 words, keeping 115354 word types
2017-08-30 14:37:41,386: INFO: PROGRESS: at sentence #690000, processed 15428683 words, keeping 116131 word types
2017-08-30 14:37:41,454: INFO: PROGRESS: at sentence #700000, processed 15657389 words, keeping 116943 word types
2017-08-30 14:37:41,516: INFO: PROGRESS: at sentence #710000, processed 15880378 words, keeping 117596 word types
2017-08-30 14:37:41,577: INFO: PROGRESS: at sentence #720000, processed 16105665 words, keeping 118221 word types
2017-08-30 14:37:41,638: INFO: PROGRESS: at sentence #730000, processed 16332046 words, keeping 118954 word types
2017-08-30 14:37:41,697: INFO: PROGRESS: at sentence #740000, processed 16553079 words, keeping 119668 word types
2017-08-30 14:37:41,756: INFO: PROGRESS: at sentence #750000, processed 16771406 words, keeping 120295 word types
2017-08-30 14:37:41,815: INFO: PROGRESS: at sentence #760000, processed 16990810 words, keeping 120930 word types
2017-08-30 14:37:41,878: INFO: PROGRESS: at sentence #770000, processed 17217947 words, keeping 121703 word types
2017-08-30 14:37:41,942: INFO: PROGRESS: at sentence #780000, processed 17448093 words, keeping 122402 word types
2017-08-30 14:37:42,004: INFO: PROGRESS: at sentence #790000, processed 17675169 words, keeping 123066 word types
2017-08-30 14:37:42,041: INFO: collected 123504 word types from a corpus of 17798270 raw words and 795538 sentences
2017-08-30 14:37:42,042: INFO: Loading a fresh vocabulary
2017-08-30 14:37:42,115: INFO: min_count=40 retains 16490 unique words (13% of original 123504, drops 107014)
2017-08-30 14:37:42,115: INFO: min_count=40 leaves 17239125 word corpus (96% of original 17798270, drops 559145)
2017-08-30 14:37:42,164: INFO: deleting the raw counts dictionary of 123504 items
2017-08-30 14:37:42,168: INFO: sample=0.001 downsamples 48 most-common words
2017-08-30 14:37:42,168: INFO: downsampling leaves estimated 12749798 word corpus (74.0% of prior 17239125)
2017-08-30 14:37:42,169: INFO: estimated required memory for 16490 words and 300 dimensions: 47821000 bytes
2017-08-30 14:37:42,230: INFO: resetting layer weights
2017-08-30 14:37:42,481: INFO: training model with 4 workers on 16490 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=10
2017-08-30 14:37:43,499: INFO: PROGRESS: at 0.38% examples, 241683 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:44,522: INFO: PROGRESS: at 0.83% examples, 260891 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:45,575: INFO: PROGRESS: at 1.28% examples, 264432 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:46,611: INFO: PROGRESS: at 1.73% examples, 267332 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:47,619: INFO: PROGRESS: at 2.21% examples, 273403 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:48,623: INFO: PROGRESS: at 2.67% examples, 276631 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:49,641: INFO: PROGRESS: at 3.14% examples, 278168 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:50,663: INFO: PROGRESS: at 3.60% examples, 279276 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:51,718: INFO: PROGRESS: at 3.99% examples, 273665 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:52,740: INFO: PROGRESS: at 4.46% examples, 275698 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:53,755: INFO: PROGRESS: at 4.91% examples, 276295 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:54,760: INFO: PROGRESS: at 5.37% examples, 277534 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:55,774: INFO: PROGRESS: at 5.82% examples, 277845 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:56,796: INFO: PROGRESS: at 6.28% examples, 277964 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:57,799: INFO: PROGRESS: at 6.63% examples, 274238 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:58,813: INFO: PROGRESS: at 7.08% examples, 274689 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:37:59,823: INFO: PROGRESS: at 7.52% examples, 275219 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:00,854: INFO: PROGRESS: at 7.97% examples, 275338 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:01,867: INFO: PROGRESS: at 8.40% examples, 275343 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:02,874: INFO: PROGRESS: at 8.77% examples, 273347 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:03,910: INFO: PROGRESS: at 9.19% examples, 272472 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:04,921: INFO: PROGRESS: at 9.63% examples, 272925 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:05,965: INFO: PROGRESS: at 10.09% examples, 273301 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:07,022: INFO: PROGRESS: at 10.45% examples, 270863 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:38:08,031: INFO: PROGRESS: at 10.90% examples, 271355 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:09,115: INFO: PROGRESS: at 11.29% examples, 269704 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:38:10,135: INFO: PROGRESS: at 11.73% examples, 270097 words/s, in_qsize 8, out_qsize 1
2017-08-30 14:38:11,183: INFO: PROGRESS: at 12.19% examples, 270694 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:12,188: INFO: PROGRESS: at 12.63% examples, 270936 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:13,232: INFO: PROGRESS: at 13.06% examples, 270579 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:14,239: INFO: PROGRESS: at 13.47% examples, 270337 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:15,251: INFO: PROGRESS: at 13.92% examples, 270714 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:16,260: INFO: PROGRESS: at 14.38% examples, 271318 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:17,267: INFO: PROGRESS: at 14.82% examples, 271683 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:18,288: INFO: PROGRESS: at 15.23% examples, 271143 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:19,339: INFO: PROGRESS: at 15.59% examples, 269613 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:20,398: INFO: PROGRESS: at 15.95% examples, 268109 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:21,437: INFO: PROGRESS: at 16.32% examples, 267012 words/s, in_qsize 7, out_qsize 1
2017-08-30 14:38:22,474: INFO: PROGRESS: at 16.79% examples, 267604 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:23,524: INFO: PROGRESS: at 17.19% examples, 267033 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:24,530: INFO: PROGRESS: at 17.63% examples, 267274 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:25,555: INFO: PROGRESS: at 18.05% examples, 267227 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:26,599: INFO: PROGRESS: at 18.51% examples, 267544 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:38:27,604: INFO: PROGRESS: at 18.99% examples, 268234 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:28,622: INFO: PROGRESS: at 19.45% examples, 268665 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:29,632: INFO: PROGRESS: at 19.88% examples, 268831 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:30,653: INFO: PROGRESS: at 20.31% examples, 268921 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:31,666: INFO: PROGRESS: at 20.74% examples, 268913 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:32,730: INFO: PROGRESS: at 21.12% examples, 268053 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:33,742: INFO: PROGRESS: at 21.50% examples, 267357 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:34,751: INFO: PROGRESS: at 21.94% examples, 267536 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:35,752: INFO: PROGRESS: at 22.39% examples, 267884 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:36,796: INFO: PROGRESS: at 22.84% examples, 267888 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:37,798: INFO: PROGRESS: at 23.29% examples, 268207 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:38,824: INFO: PROGRESS: at 23.73% examples, 268267 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:39,846: INFO: PROGRESS: at 24.16% examples, 268225 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:40,862: INFO: PROGRESS: at 24.59% examples, 268212 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:41,884: INFO: PROGRESS: at 25.02% examples, 268291 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:42,889: INFO: PROGRESS: at 25.44% examples, 268211 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:43,946: INFO: PROGRESS: at 25.84% examples, 267779 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:45,012: INFO: PROGRESS: at 26.16% examples, 266414 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:46,015: INFO: PROGRESS: at 26.51% examples, 265696 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:47,017: INFO: PROGRESS: at 26.78% examples, 264114 words/s, in_qsize 5, out_qsize 2
2017-08-30 14:38:48,045: INFO: PROGRESS: at 27.18% examples, 263894 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:49,062: INFO: PROGRESS: at 27.49% examples, 262875 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:50,062: INFO: PROGRESS: at 27.92% examples, 263005 words/s, in_qsize 6, out_qsize 1
2017-08-30 14:38:51,112: INFO: PROGRESS: at 28.22% examples, 261795 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:52,125: INFO: PROGRESS: at 28.60% examples, 261476 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:53,202: INFO: PROGRESS: at 29.01% examples, 261236 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:54,220: INFO: PROGRESS: at 29.39% examples, 260909 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:55,287: INFO: PROGRESS: at 29.75% examples, 260326 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:38:56,327: INFO: PROGRESS: at 30.16% examples, 260146 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:57,384: INFO: PROGRESS: at 30.61% examples, 260283 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:38:58,439: INFO: PROGRESS: at 30.97% examples, 259774 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:38:59,473: INFO: PROGRESS: at 31.37% examples, 259530 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:00,481: INFO: PROGRESS: at 31.76% examples, 259479 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:01,495: INFO: PROGRESS: at 32.15% examples, 259314 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:02,523: INFO: PROGRESS: at 32.51% examples, 258842 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:03,547: INFO: PROGRESS: at 32.98% examples, 259282 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:04,565: INFO: PROGRESS: at 33.44% examples, 259636 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:05,567: INFO: PROGRESS: at 33.80% examples, 259261 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:06,578: INFO: PROGRESS: at 34.18% examples, 259038 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:07,596: INFO: PROGRESS: at 34.57% examples, 258881 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:08,674: INFO: PROGRESS: at 35.03% examples, 259045 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:09,707: INFO: PROGRESS: at 35.49% examples, 259340 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:10,716: INFO: PROGRESS: at 35.95% examples, 259697 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:11,756: INFO: PROGRESS: at 36.37% examples, 259712 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:12,762: INFO: PROGRESS: at 36.74% examples, 259351 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:39:13,765: INFO: PROGRESS: at 37.17% examples, 259560 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:14,778: INFO: PROGRESS: at 37.59% examples, 259653 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:15,788: INFO: PROGRESS: at 38.01% examples, 259758 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:16,827: INFO: PROGRESS: at 38.47% examples, 259933 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:17,860: INFO: PROGRESS: at 38.93% examples, 260186 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:18,863: INFO: PROGRESS: at 39.37% examples, 260372 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:19,920: INFO: PROGRESS: at 39.79% examples, 260341 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:20,935: INFO: PROGRESS: at 40.20% examples, 260341 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:22,033: INFO: PROGRESS: at 40.65% examples, 260345 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:23,047: INFO: PROGRESS: at 41.08% examples, 260490 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:24,076: INFO: PROGRESS: at 41.51% examples, 260453 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:25,152: INFO: PROGRESS: at 41.90% examples, 260167 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:26,156: INFO: PROGRESS: at 42.29% examples, 259991 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:27,187: INFO: PROGRESS: at 42.65% examples, 259624 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:28,243: INFO: PROGRESS: at 43.04% examples, 259329 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:29,339: INFO: PROGRESS: at 43.32% examples, 258344 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:30,363: INFO: PROGRESS: at 43.73% examples, 258276 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:39:31,368: INFO: PROGRESS: at 44.08% examples, 257926 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:32,420: INFO: PROGRESS: at 44.45% examples, 257607 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:39:33,434: INFO: PROGRESS: at 44.87% examples, 257642 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:34,457: INFO: PROGRESS: at 45.23% examples, 257328 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:35,526: INFO: PROGRESS: at 45.67% examples, 257422 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:36,540: INFO: PROGRESS: at 46.04% examples, 257206 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:37,555: INFO: PROGRESS: at 46.50% examples, 257419 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:38,609: INFO: PROGRESS: at 46.96% examples, 257606 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:39,644: INFO: PROGRESS: at 47.42% examples, 257832 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:40,686: INFO: PROGRESS: at 47.89% examples, 258045 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:41,706: INFO: PROGRESS: at 48.32% examples, 258170 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:42,785: INFO: PROGRESS: at 48.77% examples, 258296 words/s, in_qsize 6, out_qsize 1
2017-08-30 14:39:43,805: INFO: PROGRESS: at 49.10% examples, 257833 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:44,835: INFO: PROGRESS: at 49.51% examples, 257821 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:45,849: INFO: PROGRESS: at 49.96% examples, 258080 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:46,850: INFO: PROGRESS: at 50.41% examples, 258249 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:47,860: INFO: PROGRESS: at 50.79% examples, 258105 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:48,888: INFO: PROGRESS: at 51.12% examples, 257702 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:49,927: INFO: PROGRESS: at 51.56% examples, 257841 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:50,956: INFO: PROGRESS: at 51.96% examples, 257778 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:52,032: INFO: PROGRESS: at 52.25% examples, 257072 words/s, in_qsize 5, out_qsize 2
2017-08-30 14:39:53,071: INFO: PROGRESS: at 52.67% examples, 257054 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:54,073: INFO: PROGRESS: at 52.97% examples, 256564 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:55,084: INFO: PROGRESS: at 53.37% examples, 256550 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:56,088: INFO: PROGRESS: at 53.77% examples, 256496 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:57,092: INFO: PROGRESS: at 54.21% examples, 256706 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:58,125: INFO: PROGRESS: at 54.61% examples, 256652 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:39:59,166: INFO: PROGRESS: at 55.01% examples, 256527 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:00,187: INFO: PROGRESS: at 55.41% examples, 256496 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:01,257: INFO: PROGRESS: at 55.76% examples, 256118 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:02,284: INFO: PROGRESS: at 56.23% examples, 256382 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:03,312: INFO: PROGRESS: at 56.62% examples, 256290 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:04,343: INFO: PROGRESS: at 57.01% examples, 256193 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:05,349: INFO: PROGRESS: at 57.47% examples, 256443 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:06,356: INFO: PROGRESS: at 57.83% examples, 256236 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:07,403: INFO: PROGRESS: at 58.12% examples, 255669 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:08,456: INFO: PROGRESS: at 58.51% examples, 255541 words/s, in_qsize 6, out_qsize 1
2017-08-30 14:40:09,483: INFO: PROGRESS: at 58.92% examples, 255508 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:10,526: INFO: PROGRESS: at 59.33% examples, 255447 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:11,529: INFO: PROGRESS: at 59.72% examples, 255458 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:12,540: INFO: PROGRESS: at 60.06% examples, 255165 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:13,562: INFO: PROGRESS: at 60.37% examples, 254764 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:14,637: INFO: PROGRESS: at 60.65% examples, 254142 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:15,702: INFO: PROGRESS: at 60.92% examples, 253494 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:16,714: INFO: PROGRESS: at 61.13% examples, 252709 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:17,720: INFO: PROGRESS: at 61.43% examples, 252269 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:18,728: INFO: PROGRESS: at 61.78% examples, 252061 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:19,737: INFO: PROGRESS: at 62.20% examples, 252127 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:20,746: INFO: PROGRESS: at 62.53% examples, 251832 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:21,788: INFO: PROGRESS: at 62.93% examples, 251803 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:22,840: INFO: PROGRESS: at 63.26% examples, 251446 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:23,893: INFO: PROGRESS: at 63.66% examples, 251353 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:24,908: INFO: PROGRESS: at 64.08% examples, 251411 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:25,910: INFO: PROGRESS: at 64.53% examples, 251621 words/s, in_qsize 6, out_qsize 1
2017-08-30 14:40:26,915: INFO: PROGRESS: at 64.99% examples, 251866 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:27,999: INFO: PROGRESS: at 65.45% examples, 251985 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:29,017: INFO: PROGRESS: at 65.91% examples, 252206 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:30,020: INFO: PROGRESS: at 66.35% examples, 252316 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:31,057: INFO: PROGRESS: at 66.83% examples, 252585 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:32,108: INFO: PROGRESS: at 67.28% examples, 252706 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:33,120: INFO: PROGRESS: at 67.67% examples, 252674 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:34,155: INFO: PROGRESS: at 67.99% examples, 252359 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:35,227: INFO: PROGRESS: at 68.41% examples, 252325 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:36,253: INFO: PROGRESS: at 68.76% examples, 252152 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:37,260: INFO: PROGRESS: at 69.12% examples, 252006 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:38,286: INFO: PROGRESS: at 69.51% examples, 251960 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:40:39,309: INFO: PROGRESS: at 69.98% examples, 252201 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:40,334: INFO: PROGRESS: at 70.45% examples, 252437 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:41,393: INFO: PROGRESS: at 70.92% examples, 252621 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:42,406: INFO: PROGRESS: at 71.28% examples, 252468 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:43,423: INFO: PROGRESS: at 71.74% examples, 252707 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:44,464: INFO: PROGRESS: at 72.19% examples, 252873 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:45,466: INFO: PROGRESS: at 72.66% examples, 253091 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:46,487: INFO: PROGRESS: at 73.13% examples, 253321 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:47,498: INFO: PROGRESS: at 73.57% examples, 253482 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:48,512: INFO: PROGRESS: at 74.03% examples, 253676 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:49,540: INFO: PROGRESS: at 74.49% examples, 253851 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:50,556: INFO: PROGRESS: at 74.96% examples, 254080 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:51,567: INFO: PROGRESS: at 75.42% examples, 254270 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:52,567: INFO: PROGRESS: at 75.89% examples, 254511 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:53,604: INFO: PROGRESS: at 76.35% examples, 254662 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:54,633: INFO: PROGRESS: at 76.83% examples, 254899 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:40:55,665: INFO: PROGRESS: at 77.30% examples, 255055 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:56,670: INFO: PROGRESS: at 77.76% examples, 255282 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:57,687: INFO: PROGRESS: at 78.22% examples, 255453 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:58,709: INFO: PROGRESS: at 78.69% examples, 255655 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:40:59,714: INFO: PROGRESS: at 79.16% examples, 255839 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:00,755: INFO: PROGRESS: at 79.62% examples, 256013 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:01,765: INFO: PROGRESS: at 80.09% examples, 256223 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:02,770: INFO: PROGRESS: at 80.54% examples, 256366 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:03,795: INFO: PROGRESS: at 81.02% examples, 256586 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:04,836: INFO: PROGRESS: at 81.48% examples, 256713 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:05,860: INFO: PROGRESS: at 81.97% examples, 256935 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:06,868: INFO: PROGRESS: at 82.45% examples, 257139 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:07,906: INFO: PROGRESS: at 82.91% examples, 257268 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:08,932: INFO: PROGRESS: at 83.39% examples, 257446 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:09,938: INFO: PROGRESS: at 83.87% examples, 257642 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:10,944: INFO: PROGRESS: at 84.31% examples, 257773 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:11,968: INFO: PROGRESS: at 84.78% examples, 257914 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:12,976: INFO: PROGRESS: at 85.22% examples, 258001 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:13,981: INFO: PROGRESS: at 85.61% examples, 257958 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:14,996: INFO: PROGRESS: at 86.02% examples, 257969 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:16,018: INFO: PROGRESS: at 86.44% examples, 257939 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:17,035: INFO: PROGRESS: at 86.83% examples, 257881 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:18,048: INFO: PROGRESS: at 87.17% examples, 257664 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:19,051: INFO: PROGRESS: at 87.54% examples, 257561 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:20,061: INFO: PROGRESS: at 87.96% examples, 257614 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:21,069: INFO: PROGRESS: at 88.32% examples, 257473 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:22,073: INFO: PROGRESS: at 88.66% examples, 257306 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:23,088: INFO: PROGRESS: at 89.05% examples, 257225 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:24,104: INFO: PROGRESS: at 89.42% examples, 257142 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:25,148: INFO: PROGRESS: at 89.82% examples, 257093 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:26,168: INFO: PROGRESS: at 90.22% examples, 257041 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:27,171: INFO: PROGRESS: at 90.67% examples, 257165 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:28,179: INFO: PROGRESS: at 91.08% examples, 257189 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:29,214: INFO: PROGRESS: at 91.43% examples, 257021 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:30,241: INFO: PROGRESS: at 91.69% examples, 256614 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:41:31,270: INFO: PROGRESS: at 92.06% examples, 256491 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:32,300: INFO: PROGRESS: at 92.44% examples, 256400 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:33,351: INFO: PROGRESS: at 92.85% examples, 256350 words/s, in_qsize 8, out_qsize 0
2017-08-30 14:41:34,377: INFO: PROGRESS: at 93.21% examples, 256233 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:35,415: INFO: PROGRESS: at 93.63% examples, 256228 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:36,467: INFO: PROGRESS: at 94.07% examples, 256268 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:37,472: INFO: PROGRESS: at 94.42% examples, 256147 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:38,473: INFO: PROGRESS: at 94.81% examples, 256122 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:39,475: INFO: PROGRESS: at 95.21% examples, 256096 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:40,499: INFO: PROGRESS: at 95.58% examples, 255987 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:41,514: INFO: PROGRESS: at 95.95% examples, 255887 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:42,516: INFO: PROGRESS: at 96.24% examples, 255594 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:43,536: INFO: PROGRESS: at 96.62% examples, 255520 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:44,542: INFO: PROGRESS: at 96.97% examples, 255376 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:45,547: INFO: PROGRESS: at 97.37% examples, 255378 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:46,548: INFO: PROGRESS: at 97.74% examples, 255297 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:47,549: INFO: PROGRESS: at 98.10% examples, 255189 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:48,553: INFO: PROGRESS: at 98.44% examples, 255048 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:49,560: INFO: PROGRESS: at 98.90% examples, 255165 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:50,584: INFO: PROGRESS: at 99.25% examples, 255007 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:51,610: INFO: PROGRESS: at 99.61% examples, 254904 words/s, in_qsize 7, out_qsize 0
2017-08-30 14:41:52,562: INFO: worker thread finished; awaiting finish of 3 more threads
2017-08-30 14:41:52,589: INFO: worker thread finished; awaiting finish of 2 more threads
2017-08-30 14:41:52,610: INFO: PROGRESS: at 99.99% examples, 254856 words/s, in_qsize 1, out_qsize 1
2017-08-30 14:41:52,613: INFO: worker thread finished; awaiting finish of 1 more threads
2017-08-30 14:41:52,688: INFO: worker thread finished; awaiting finish of 0 more threads
2017-08-30 14:41:52,689: INFO: training on 88991350 raw words (63750931 effective words) took 250.2s, 254804 effective words/s
2017-08-30 14:41:52,689: INFO: precomputing L2-norms of word weight vectors
2017-08-30 14:41:52,826: INFO: saving Word2Vec object under 300features_40minwords_10context, separately None
2017-08-30 14:41:52,827: INFO: not storing attribute syn0norm
2017-08-30 14:41:52,828: INFO: not storing attribute cum_table
2017-08-30 14:41:53,180: INFO: saved 300features_40minwords_10context

In [10]:
# 检查训练结果
print(model.doesnt_match("apple pepole banana orange".split()))
print(model.most_similar("sex"))
print(model.most_similar("bitch"))


2017-08-30 14:41:53,201: WARNING: vectors for words {'pepole'} are not present in the model, ignoring these words
orange
[('sexual', 0.6434853076934814), ('masturbation', 0.5880445241928101), ('lesbian', 0.5711398124694824), ('nudity', 0.5502188205718994), ('nude', 0.544196605682373), ('gratuitous', 0.5305460691452026), ('rape', 0.5145243406295776), ('incest', 0.5125550031661987), ('lovemaking', 0.5069805383682251), ('explicit', 0.5014914274215698)]
[('slut', 0.6980215907096863), ('whore', 0.6937853693962097), ('blonde', 0.6554198265075684), ('bimbo', 0.652977466583252), ('stripper', 0.6326565742492676), ('perky', 0.6208070516586304), ('housewife', 0.6149146556854248), ('maid', 0.6066554188728333), ('bitchy', 0.6030336618423462), ('mona', 0.6029400825500488)]

In [12]:
# 载入训练好的模型
from gensim.models import Word2Vec
model = Word2Vec.load("300features_40minwords_10context")


2017-08-30 14:43:35,341: INFO: loading Word2Vec object from 300features_40minwords_10context
2017-08-30 14:43:35,637: INFO: loading wv recursively from 300features_40minwords_10context.wv.* with mmap=None
2017-08-30 14:43:35,638: INFO: setting ignored attribute syn0norm to None
2017-08-30 14:43:35,639: INFO: setting ignored attribute cum_table to None
2017-08-30 14:43:35,639: INFO: loaded 300features_40minwords_10context

In [14]:
model["flower"].shape


Out[14]:
(300,)

In [19]:
# import numpy as np

# def makeFeatureVec(words, model, num_features):
#     featureVec = np.zeros((num_features,), dtype="float32")
#     nwords = 0
#     index2word_set = set(model.index2word)
    
#     for word in words:
#         if word in index2word_set:
#             nwords = nwords + 1.
#             featureVec = np.add(featureVec, model[word])
            
#     featureVec = np.divide(featureVec, nwords)
#     return featureVec

# def getAvgFeatureVecs(reviews, model, num_features):
#     counter = 0.
#     reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
#     for review in reviews:
#         if counter%5000. == 0.:
#             print("Review {} of {}".format(counter, len(reviews)))
        
#         reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        
#         counter += 1
        
#     return reviewFeatureVecs

In [20]:
# clean_train_reviews = []
# for review in train["review"]:
#     clean_train_reviews.append( review_to_wordlist( review, \
#         remove_stopwords=True ))

# trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

# print("Creating average feature vecs for test reviews")
# clean_test_reviews = []
# for review in test["review"]:
#     clean_test_reviews.append( review_to_wordlist( review, \
#         remove_stopwords=True ))

# testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )

In [21]:
# # Fit a random forest to the training data, using 100 trees
# from sklearn.ensemble import RandomForestClassifier
# forest = RandomForestClassifier( n_estimators = 100 )

# print "Fitting a random forest to labeled training data..."
# forest = forest.fit( trainDataVecs, train["sentiment"] )

# # Test & extract results 
# result = forest.predict( testDataVecs )

# # Write the test results 
# output = pd.DataFrame( data={"id":test["id"], "sentiment":result} )
# output.to_csv( "Word2Vec_AverageVectors.csv", index=False, quoting=3 )

In [ ]: