In [1]:
import numpy as np
import os
from random import shuffle
import re

In [2]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
# Send Bokeh plots to inline notebook output instead of a standalone HTML file.
output_notebook()


Loading BokehJS ...

In [3]:
import numpy as np
import pandas as pd
import re
from sklearn.preprocessing import LabelBinarizer
from tensorflow.contrib import learn
import tensorflow as tf

In [4]:
def load_files(train_files, dev_files, class_file):
    """Read the training CSVs and whitespace-tokenize the text in column 3.

    Args:
        train_files: Iterable of CSV paths, read with latin-1 encoding. The
            text is taken from the fourth column (positional index 3) of
            each file.
        dev_files: Unused; kept so existing callers keep working.
        class_file: Unused; kept so existing callers keep working.

    Returns:
        A list with one entry per CSV row, in file order. Each entry is the
        list of whitespace-separated tokens of that row's text. A row whose
        text cell is missing yields an empty token list, and an empty
        ``train_files`` yields an empty list.
    """
    x_token = []
    for path in train_files:
        frame = pd.read_csv(path, encoding='latin-1')
        # Select by position so the header names of the files do not matter.
        for cell in frame.iloc[:, 3]:
            if pd.isnull(cell):
                # Empty CSV cells are parsed as float NaN, which has no
                # .strip(); keep the row (with no tokens) instead of crashing.
                x_token.append([])
            else:
                # str() also covers a column that pandas parsed as numeric.
                x_token.append(str(cell).strip().split())
    return x_token

In [6]:
# Run configuration: which dataset, how many CSV chunks, which model.
train_index = 11
train_data = "eclipse/"
models = "TextHMCNN"

# Root directory of the selected dataset.
data_dir = "".join(["../../data/data_by_ocean/", train_data])
# Results folder for the selected model, nested under the dataset directory.
data_results = "".join([data_dir, "results/", models, '/'])

In [7]:
# CSVs numbered 0 .. train_index-1 feed training; the CSV numbered
# train_index is listed separately as test_files.
train_files = ["".join([data_dir, str(i), '.csv']) for i in range(train_index)]
test_files = [data_dir + str(train_index) + '.csv']
class_file = data_results + "class_" + str(train_index) + ".csv"

# Tokenized training sentences (one list of tokens per CSV row).
x_train = load_files(train_files, test_files, class_file)

In [10]:
import collections

# Flatten the tokenized sentences into one long list of tokens.
vocabulary = [token for tokens in x_train for token in tokens]
# (word, count) pairs for the 1000 most frequent tokens, most frequent first.
word_counts = collections.Counter(vocabulary).most_common(1000)

In [8]:
from gensim.models import Word2Vec


D:\ProgramData\Anaconda3\lib\site-packages\gensim\utils.py:855: UserWarning: detected Windows; aliasing chunkize to chunkize_serial
  warnings.warn("detected Windows; aliasing chunkize to chunkize_serial")

In [14]:
# Train word2vec embeddings on the tokenized sentences: 300-dimensional
# vectors, with min_count set to 10.
model_ted = Word2Vec(
    sentences=x_train,
    size=300,
    min_count=10,
)

In [15]:
# Keep only the frequent words the model actually has a vector for. The model
# is trained with a min_count cutoff while word_counts has no frequency floor,
# so a top word below the cutoff is absent from the model vocabulary and
# looking it up would raise KeyError.
words_top_ted = [word for word, _count in word_counts if word in model_ted.wv]
# Look the vectors up through the model's keyed vectors (`.wv`) rather than
# indexing the model object directly, which gensim deprecated.
words_top_vec_ted = model_ted.wv[words_top_ted]

In [16]:
from sklearn.manifold import TSNE
# Reduce the word vectors to two components for plotting; random_state is
# pinned to 0.
tsne = TSNE(
    n_components=2,
    random_state=0,
)
words_top_ted_tsne = tsne.fit_transform(words_top_vec_ted)

In [17]:
# One data source backs both the scatter glyphs and their text labels.
source = ColumnDataSource(data={
    "x1": words_top_ted_tsne[:, 0],
    "x2": words_top_ted_tsne[:, 1],
    "names": words_top_ted,
})

p = figure(
    title="word2vec T-SNE for most common words",
    tools="pan,wheel_zoom,reset,save",
    toolbar_location="above",
)
p.scatter(x="x1", y="x2", size=8, source=source)

# Word labels: centre-aligned on each point with a y_offset of 6.
labels = LabelSet(
    x="x1", y="x2", text="names", source=source,
    y_offset=6, text_align='center',
    text_font_size="8pt", text_color="#555555",
)
p.add_layout(labels)

show(p)



In [23]:
# Export the trained vectors with binary=True, plus a separate vocabulary file.
model_ted.wv.save_word2vec_format('eclipse.bin', fvocab='eclipse.vocab', binary=True)

In [24]:
# Export the vectors again with the default (non-binary) settings.
# NOTE(review): per the gensim docs the default output is the word2vec text
# format, not comma-separated values - confirm the .csv extension is intended.
model_ted.wv.save_word2vec_format('eclipse.csv')

In [ ]: